feat(server): 重构知识库服务和路由配置,优化LLM维基知识管理接口,增强知识检索能力

This commit is contained in:
caoxiaozhu
2026-05-15 09:33:59 +00:00
parent 7a3feb14a0
commit 6793b6f832
7 changed files with 812 additions and 123 deletions

View File

@@ -8,7 +8,7 @@ from dataclasses import dataclass
from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import Any, Literal
from typing import Any, Callable, Literal
from uuid import uuid4
from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
@@ -47,7 +47,8 @@ from app.services.system_hermes import SystemHermesService
logger = get_logger("app.services.llm_wiki")
HERMES_CANDIDATE_MODEL_TIMEOUT_SECONDS = 10
HERMES_CANDIDATE_GROUP_SIZE = 3
HERMES_CANDIDATE_GROUP_SIZE = 2
HERMES_CANDIDATE_CONTENT_LIMIT = 520
LOW_SIGNAL_DOTTED_LINE_PATTERN = re.compile(r"[..。·•]{6,}\s*[0-9]{0,3}$")
PAGE_FOOTER_PATTERN = re.compile(r"^第\s*\d+\s*页\s*共\s*\d+\s*页$")
POLICY_SUBSTANCE_KEYWORDS = (
@@ -412,6 +413,8 @@ class LlmWikiService:
current_user: CurrentUserContext,
document_ids: list[str] | None = None,
force: bool = False,
agent_run_id: str | None = None,
progress_callback: Callable[[dict[str, Any], str], None] | None = None,
) -> LlmWikiSyncRead:
self.knowledge_service.ensure_library_ready()
documents = self.knowledge_service.list_folder_documents(folder=folder)
@@ -427,6 +430,7 @@ class LlmWikiService:
self.knowledge_service.set_document_ingest_statuses(
target_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_SYNCING,
agent_run_id=agent_run_id,
)
try:
@@ -441,9 +445,27 @@ class LlmWikiService:
rule_candidate_count = 0
generated_rule_asset_ids: list[str] = []
changed_document_count = 0
skipped_document_count = 0
sync_summaries: list[str] = []
failed_document_ids: list[str] = []
total_documents = len(documents)
for entry in documents:
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 0,
},
},
f"Hermes 已开始归纳,待处理文档 {total_documents} 个。",
)
for index_value, entry in enumerate(documents, start=1):
document_id = str(entry.get("id") or "").strip()
if not document_id:
continue
@@ -451,15 +473,85 @@ class LlmWikiService:
existing = existing_by_id.get(document_id)
sync_reason = self._resolve_sync_reason(entry=entry, existing=existing, force=force)
if sync_reason == "unchanged_skipped":
skipped_document_count += 1
sync_summaries.append(f"{entry['original_name']}:未变化,跳过。")
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": index_value,
"current_document_id": document_id,
"current_document_name": entry["original_name"],
"current_stage": "skipped",
"percent": self._calculate_progress_percent(
completed_documents=changed_document_count,
skipped_documents=skipped_document_count,
total_documents=total_documents,
),
},
},
f"{entry['original_name']}》未变化,跳过本次归纳。",
)
continue
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": index_value,
"current_document_id": document_id,
"current_document_name": entry["original_name"],
"current_stage": "document_started",
"percent": self._calculate_progress_percent(
completed_documents=changed_document_count,
skipped_documents=skipped_document_count,
total_documents=total_documents,
),
},
},
f"Hermes 正在归纳《{entry['original_name']}》。",
)
changed_document_count += 1
document_payload = self._sync_single_document(
entry=entry,
folder=folder,
current_user=current_user,
sync_reason=sync_reason,
progress_callback=lambda payload, summary, *, document_id=document_id, document_name=entry["original_name"], document_index=index_value: self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": max(changed_document_count - 1, 0),
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": document_index,
"current_document_id": document_id,
"current_document_name": document_name,
**payload,
"percent": self._calculate_progress_percent(
completed_documents=max(changed_document_count - 1, 0),
skipped_documents=skipped_document_count,
total_documents=total_documents,
group_count=int(payload.get("group_count") or 0),
current_group_index=int(payload.get("current_group_index") or 0),
),
},
},
summary,
),
)
existing_by_id[document_id] = document_payload["document"]
knowledge_candidate_count += len(document_payload["knowledge_candidates"])
@@ -471,13 +563,47 @@ class LlmWikiService:
if str(item.get("generated_asset_id") or "").strip()
]
)
if document_payload["document"].get("quality_status") in {"fallback_only", "runtime_only", "failed"}:
failed_document_ids.append(document_id)
sync_summaries.append(
f"{entry['original_name']}{sync_reason},知识候选 {len(document_payload['knowledge_candidates'])} 条,"
f"规则候选 {len(document_payload['rule_candidates'])}"
f"规则候选 {len(document_payload['rule_candidates'])}"
f"归纳质量 {document_payload['document'].get('quality_status') or 'formal'}"
)
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": index_value,
"current_document_id": document_id,
"current_document_name": entry["original_name"],
"current_stage": "document_completed",
"knowledge_candidate_count": len(document_payload["knowledge_candidates"]),
"rule_candidate_count": len(document_payload["rule_candidates"]),
"quality_status": document_payload["document"].get("quality_status") or "formal",
"percent": self._calculate_progress_percent(
completed_documents=changed_document_count,
skipped_documents=skipped_document_count,
total_documents=total_documents,
),
},
},
f"{entry['original_name']}》归纳完成,质量状态为 {document_payload['document'].get('quality_status') or 'formal'}",
)
index["documents"] = list(existing_by_id.values())
self._write_json_file(self.knowledge_service.llm_wiki_index_path, index)
if failed_document_ids:
self.knowledge_service.set_document_ingest_statuses(
failed_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
sync_runs.setdefault("runs", [])
sync_runs["runs"].append(
@@ -502,6 +628,22 @@ class LlmWikiService:
generated_rule_ids = list(dict.fromkeys(generated_rule_asset_ids))
summary = "".join(sync_summaries) if sync_summaries else "未发现需要同步的知识文档。"
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"knowledge_candidate_count": knowledge_candidate_count,
"rule_candidate_count": rule_candidate_count,
"percent": 100,
},
},
summary,
)
return LlmWikiSyncRead(
ok=True,
run_id=run_id,
@@ -518,6 +660,7 @@ class LlmWikiService:
self.knowledge_service.set_document_ingest_statuses(
target_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
raise
@@ -528,6 +671,7 @@ class LlmWikiService:
folder: str,
current_user: CurrentUserContext,
sync_reason: str,
progress_callback: Callable[[dict[str, Any], str], None] | None = None,
) -> dict[str, Any]:
document_id = str(entry["id"])
document_name = str(entry["original_name"])
@@ -539,10 +683,19 @@ class LlmWikiService:
text_path.write_text(extracted_text, encoding="utf-8")
chunks = self._build_chunks(document_id=document_id, text=extracted_text)
knowledge_candidates, rule_candidates = self._extract_candidates(
self._emit_progress(
progress_callback,
{
"current_stage": "text_extracted",
"chunk_count": len(chunks),
},
f"{document_name}》文本提取完成,共形成 {len(chunks)} 个分块。",
)
knowledge_candidates, rule_candidates, extraction_stats = self._extract_candidates(
entry=entry,
chunks=chunks,
current_user=current_user,
progress_callback=progress_callback,
)
generated_candidates: list[dict[str, Any]] = []
@@ -563,8 +716,17 @@ class LlmWikiService:
"checksum": str(entry.get("sha256") or ""),
"extracted_text_path": str(text_path),
"chunk_count": len(chunks),
"candidate_chunk_count": extraction_stats.candidate_chunk_count,
"filtered_chunk_count": extraction_stats.filtered_chunk_count,
"group_count": extraction_stats.group_count,
"successful_group_count": extraction_stats.successful_group_count,
"failed_group_count": extraction_stats.failed_group_count,
"knowledge_candidate_count": len(knowledge_candidates),
"formal_knowledge_candidate_count": extraction_stats.formal_knowledge_candidate_count,
"fallback_knowledge_candidate_count": extraction_stats.fallback_knowledge_candidate_count,
"rule_candidate_count": len(generated_candidates),
"quality_status": extraction_stats.quality_status,
"quality_note": extraction_stats.quality_note,
"updated_at": datetime.now(UTC).isoformat(),
"signature": self._build_document_signature(entry),
"sync_reason": sync_reason,
@@ -593,49 +755,147 @@ class LlmWikiService:
entry: dict[str, Any],
chunks: list[dict[str, Any]],
current_user: CurrentUserContext,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
progress_callback: Callable[[dict[str, Any], str], None] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], CandidateExtractionStats]:
stats = CandidateExtractionStats(raw_chunk_count=len(chunks))
if not chunks:
return [], []
stats.quality_status = "failed"
stats.quality_note = "文档未提取到可用分块,无法形成 LLM Wiki。"
return [], [], stats
candidate_chunks = self._select_candidate_chunks(chunks)
stats.candidate_chunk_count = len(candidate_chunks)
stats.filtered_chunk_count = max(0, len(chunks) - len(candidate_chunks))
if not candidate_chunks:
stats.quality_status = "failed"
stats.quality_note = "正文条款分块为空,当前仅识别到封面、目录或低信息量内容,未形成正式归纳。"
return [], [], stats
projected_group_count = len(self._group_chunks(candidate_chunks, size=HERMES_CANDIDATE_GROUP_SIZE))
self._emit_progress(
progress_callback,
{
"current_stage": "candidate_chunks_selected",
"candidate_chunk_count": stats.candidate_chunk_count,
"filtered_chunk_count": stats.filtered_chunk_count,
"group_count": projected_group_count,
"current_group_index": 0,
"successful_group_count": 0,
"failed_group_count": 0,
},
f"{entry['original_name']}》已筛出 {stats.candidate_chunk_count} 个有效正文分块,准备分 {projected_group_count} 组归纳。",
)
knowledge_candidates: list[dict[str, Any]] = []
rule_candidates: list[dict[str, Any]] = []
seen_knowledge_keys: set[str] = set()
seen_rule_keys: set[str] = set()
for chunk_group in self._group_chunks(chunks, size=4):
payload = self._call_candidate_model(entry=entry, chunk_group=chunk_group)
for chunk_group in self._group_chunks(candidate_chunks, size=HERMES_CANDIDATE_GROUP_SIZE):
stats.group_count += 1
attempt = self._call_candidate_model(entry=entry, chunk_group=chunk_group)
if isinstance(attempt, dict):
attempt = CandidateModelAttempt(payload=attempt, source="hermes", ok=True)
if not attempt.ok:
stats.failed_group_count += 1
self._emit_progress(
progress_callback,
{
"current_stage": "extracting_candidates",
"group_count": projected_group_count,
"current_group_index": stats.group_count,
"successful_group_count": stats.successful_group_count,
"failed_group_count": stats.failed_group_count,
},
f"{entry['original_name']}》第 {stats.group_count}/{projected_group_count} 组归纳失败,继续处理下一组。",
)
continue
stats.successful_group_count += 1
batch_knowledge = self._normalize_knowledge_candidates(
raw_items=list(payload.get("knowledge_candidates") or []),
raw_items=list(attempt.payload.get("knowledge_candidates") or []),
entry=entry,
chunk_group=chunk_group,
seen_keys=seen_knowledge_keys,
extraction_mode=attempt.source,
)
batch_rules = self._normalize_rule_candidates(
raw_items=list(payload.get("rule_candidates") or []),
entry=entry,
chunk_group=chunk_group,
current_user=current_user,
seen_keys=seen_rule_keys,
)
batch_rules: list[dict[str, Any]] = []
if attempt.source == "hermes":
batch_rules = self._normalize_rule_candidates(
raw_items=list(attempt.payload.get("rule_candidates") or []),
entry=entry,
chunk_group=chunk_group,
current_user=current_user,
seen_keys=seen_rule_keys,
)
knowledge_candidates.extend(batch_knowledge)
rule_candidates.extend(batch_rules)
self._emit_progress(
progress_callback,
{
"current_stage": "extracting_candidates",
"group_count": projected_group_count,
"current_group_index": stats.group_count,
"successful_group_count": stats.successful_group_count,
"failed_group_count": stats.failed_group_count,
"knowledge_candidate_count": len(knowledge_candidates),
"rule_candidate_count": len(rule_candidates),
},
f"{entry['original_name']}》已完成第 {stats.group_count}/{projected_group_count} 组归纳。",
)
if not knowledge_candidates:
fallback = self._build_fallback_knowledge_candidate(entry=entry, chunks=chunks)
formal_knowledge_candidate_count = sum(
1 for item in knowledge_candidates if str(item.get("extraction_mode") or "hermes") == "hermes"
)
if formal_knowledge_candidate_count <= 0:
fallback = self._build_fallback_knowledge_candidate(
entry=entry,
chunks=candidate_chunks,
reason=(
"Hermes 未能从正文条款中形成正式知识候选。当前结果仅为降级兜底预览,不能视为正式归纳。"
),
)
if fallback is not None:
knowledge_candidates.append(fallback)
return knowledge_candidates[:12], rule_candidates[:12]
truncated_knowledge_candidates = knowledge_candidates[:12]
truncated_rule_candidates = rule_candidates[:12]
stats.formal_knowledge_candidate_count = sum(
1 for item in truncated_knowledge_candidates if str(item.get("extraction_mode") or "hermes") == "hermes"
)
stats.fallback_knowledge_candidate_count = max(
0,
len(truncated_knowledge_candidates) - stats.formal_knowledge_candidate_count,
)
stats.quality_status, stats.quality_note = self._resolve_quality_status(
stats=stats,
knowledge_candidates=truncated_knowledge_candidates,
)
self._emit_progress(
progress_callback,
{
"current_stage": "candidate_extraction_completed",
"group_count": projected_group_count,
"current_group_index": projected_group_count,
"successful_group_count": stats.successful_group_count,
"failed_group_count": stats.failed_group_count,
"knowledge_candidate_count": len(truncated_knowledge_candidates),
"formal_knowledge_candidate_count": stats.formal_knowledge_candidate_count,
"fallback_knowledge_candidate_count": stats.fallback_knowledge_candidate_count,
"rule_candidate_count": len(truncated_rule_candidates),
"quality_status": stats.quality_status,
},
f"{entry['original_name']}》候选提炼完成,质量状态为 {stats.quality_status}",
)
return truncated_knowledge_candidates, truncated_rule_candidates, stats
def _call_candidate_model(
self,
*,
entry: dict[str, Any],
chunk_group: list[dict[str, Any]],
) -> dict[str, Any]:
if self._candidate_model_disabled:
return {}
) -> CandidateModelAttempt:
facts = {
"document_id": entry["id"],
"document_name": entry["original_name"],
@@ -653,7 +913,7 @@ class LlmWikiService:
{
"chunk_id": item["chunk_id"],
"title": item["title"],
"content": item["content"][:900],
"content": item["content"][:HERMES_CANDIDATE_CONTENT_LIMIT],
"source_page": item.get("source_page"),
"tags": item.get("tags", []),
}
@@ -663,6 +923,8 @@ class LlmWikiService:
system_prompt = (
"你是企业财务制度知识库的 Hermes 规则形成器。"
"你只能基于提供的制度条款生成结构化知识候选和规则候选,不能自由发散。"
"封面、目录、通知、页眉页脚、密级说明、印发信息不属于知识候选,必须忽略。"
"只提炼具有执行意义、审核意义、报销约束意义的条款。"
"规则候选必须从允许模板中选 template_key严禁自创模板。"
"runtime_rule 必须严格遵守 runtime_rule_contracts 中对应模板的字段结构和允许值。"
"如果条款不适合自动规则化,可以只返回 knowledge_candidates。"
@@ -675,6 +937,7 @@ class LlmWikiService:
)
user_prompt = (
"请根据以下制度分块生成候选。"
"每组最多提炼 3 条高价值 knowledge_candidates优先保留可直接供报销审核、附件校验、审批判断使用的知识。"
"只返回 JSON 对象,不要输出解释,不要调用工具,不要追加任何其他文本。\n"
f"{json.dumps(facts, ensure_ascii=False, indent=2)}"
)
@@ -693,29 +956,44 @@ class LlmWikiService:
timeout_seconds=HERMES_CANDIDATE_MODEL_TIMEOUT_SECONDS,
)
payload = self._extract_json_payload(cli_result.response_text)
if payload:
return payload
self._candidate_model_disabled = True
if payload is not None:
return CandidateModelAttempt(payload=payload, source="hermes", ok=True)
logger.warning(
"System Hermes returned no parseable JSON for LLM Wiki doc=%s; using fallback candidates.",
"System Hermes returned no parseable JSON for LLM Wiki doc=%s chunk_group=%s.",
entry.get("id"),
",".join(item.get("chunk_id", "") for item in chunk_group),
)
return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason="system_hermes_no_json",
)
return {}
except TimeoutExpired:
self._candidate_model_disabled = True
logger.warning(
"System Hermes timed out during LLM Wiki candidate extraction doc=%s; using fallback candidates.",
"System Hermes timed out during LLM Wiki candidate extraction doc=%s chunk_group=%s.",
entry.get("id"),
",".join(item.get("chunk_id", "") for item in chunk_group),
)
return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason="system_hermes_timeout",
)
return {}
except Exception as exc:
self._candidate_model_disabled = True
logger.warning(
"System Hermes failed during LLM Wiki candidate extraction doc=%s: %s",
"System Hermes failed during LLM Wiki candidate extraction doc=%s chunk_group=%s: %s",
entry.get("id"),
",".join(item.get("chunk_id", "") for item in chunk_group),
exc,
)
return {}
return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason=str(exc) or "system_hermes_failed",
)
response_text = self.runtime_chat_service.complete(
[
@@ -727,10 +1005,129 @@ class LlmWikiService:
temperature=0.0,
)
payload = self._extract_json_payload(response_text)
if not payload:
self._candidate_model_disabled = True
return {}
return payload
if payload is None:
return CandidateModelAttempt(
payload={},
source="runtime",
ok=False,
failure_reason="runtime_no_json",
)
return CandidateModelAttempt(
payload=payload,
source="runtime",
ok=True,
failure_reason="system_hermes_unavailable",
)
def _select_candidate_chunks(self, chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
selected: list[dict[str, Any]] = []
for chunk in chunks:
if self._is_low_signal_chunk(chunk):
continue
selected.append(chunk)
return selected
def _is_low_signal_chunk(self, chunk: dict[str, Any]) -> bool:
title = str(chunk.get("title") or "").strip()
content = str(chunk.get("content") or "").strip()
page = int(chunk.get("source_page") or 0)
if not content:
return True
if self._looks_like_table_of_contents(title=title, content=content):
return True
if self._looks_like_cover_notice(title=title, content=content, source_page=page):
return True
compact_content = re.sub(r"\s+", "", content)
if len(compact_content) < 24 and not self._has_policy_substance(content):
return True
if title.startswith("附件") and len(compact_content) < 40:
return True
return False
def _is_low_signal_candidate(self, *, title: str, content: str) -> bool:
compact_content = re.sub(r"\s+", "", content)
if len(compact_content) < 24 and not self._has_policy_substance(content):
return True
if self._looks_like_table_of_contents(title=title, content=content):
return True
if self._looks_like_cover_notice(title=title, content=content, source_page=0):
return True
return False
@staticmethod
def _has_policy_substance(text: str) -> bool:
    """Report whether ``text`` contains any policy-substance keyword.

    Args:
        text: Text to scan; a falsy value is treated as the empty string.

    Returns:
        True when at least one entry of ``POLICY_SUBSTANCE_KEYWORDS`` occurs
        as a substring of the text, otherwise False.
    """
    haystack = str(text or "")
    for keyword in POLICY_SUBSTANCE_KEYWORDS:
        if keyword in haystack:
            return True
    return False
@staticmethod
def _looks_like_table_of_contents(*, title: str, content: str) -> bool:
title_text = str(title or "").strip()
content_text = str(content or "").strip()
if title_text == "目录" or content_text == "目录":
return True
lines = [line.strip() for line in content_text.splitlines() if line.strip()]
if lines and sum(1 for line in lines if LOW_SIGNAL_DOTTED_LINE_PATTERN.search(line)) >= max(2, len(lines) // 2):
return True
if LOW_SIGNAL_DOTTED_LINE_PATTERN.search(content_text) and "" in title_text:
return True
return False
def _looks_like_cover_notice(self, *, title: str, content: str, source_page: int) -> bool:
    """Heuristically detect cover pages, issuance notices and page footers.

    Args:
        title: Chunk or candidate title.
        content: Chunk or candidate content.
        source_page: Page number associated with the text; callers in this
            class pass 0 when no page is known.

    Returns:
        True when the stripped content fully matches ``PAGE_FOOTER_PATTERN``;
        when ``source_page`` is 1 and the combined title/content contains a
        cover keyword; or when ``source_page`` is at most 2, a cover keyword
        is present and none of "必须", "应当", "不得" appears. Otherwise False.
    """
    combined = f"{title}\n{content}"
    if PAGE_FOOTER_PATTERN.fullmatch(str(content or "").strip()):
        return True

    cover_markers = ("关于颁布", "特此通知", "印发", "商密", "制度〔", "有限公司文件", "通知")
    mentions_cover = any(marker in combined for marker in cover_markers)
    if source_page == 1 and mentions_cover:
        return True
    if source_page > 2 or not mentions_cover:
        return False
    # On early pages, obligation wording marks real policy text rather than a cover notice.
    return not any(word in combined for word in ("必须", "应当", "不得"))
def _resolve_quality_status(
self,
*,
stats: CandidateExtractionStats,
knowledge_candidates: list[dict[str, Any]],
) -> tuple[str, str]:
if stats.formal_knowledge_candidate_count <= 0:
runtime_count = sum(
1 for item in knowledge_candidates if str(item.get("extraction_mode") or "") == "runtime"
)
if runtime_count > 0:
return (
"runtime_only",
"当前知识候选来自运行时模型而非系统 Hermes仅供人工参考不计入正式归纳。",
)
if stats.fallback_knowledge_candidate_count > 0:
return (
"fallback_only",
"Hermes 未形成正式知识候选,当前仅保留降级兜底预览,不能作为正式知识上线。",
)
return (
"failed",
"Hermes 未能从当前文档提炼出可用知识候选,请调整文档内容或重新归纳。",
)
if stats.failed_group_count > 0:
return (
"partial_degraded",
f"Hermes 成功处理 {stats.successful_group_count}/{stats.group_count} 个分组,"
f"仍有 {stats.failed_group_count} 个分组未成功,请人工复核后再使用。",
)
if stats.filtered_chunk_count > 0:
return (
"formal",
f"已自动过滤 {stats.filtered_chunk_count} 个封面、目录或低信息量分块,当前结果来自正文条款。",
)
return ("formal", "Hermes 已基于正文条款完成正式归纳。")
def _normalize_knowledge_candidates(
self,
@@ -739,6 +1136,7 @@ class LlmWikiService:
entry: dict[str, Any],
chunk_group: list[dict[str, Any]],
seen_keys: set[str],
extraction_mode: str,
) -> list[dict[str, Any]]:
normalized: list[dict[str, Any]] = []
default_chunk_ids = [item["chunk_id"] for item in chunk_group]
@@ -750,10 +1148,17 @@ class LlmWikiService:
content = str(item.get("content") or "").strip()
if not title or not content:
continue
if self._is_low_signal_candidate(title=title, content=content):
continue
candidate_key = f"{title.casefold()}::{content[:80].casefold()}"
if candidate_key in seen_keys:
continue
seen_keys.add(candidate_key)
quality_flags: list[str] = []
fallback_reason = ""
if extraction_mode != "hermes":
quality_flags.append("non_hermes_source")
fallback_reason = "当前知识候选不是由系统 Hermes 正式提炼,不能视为正式归纳。"
normalized.append(
{
"candidate_id": f"kc_{uuid4().hex[:12]}",
@@ -775,6 +1180,9 @@ class LlmWikiService:
"status": "draft",
"created_by": "hermes",
"created_at": datetime.now(UTC).isoformat(),
"extraction_mode": extraction_mode,
"quality_flags": quality_flags,
"fallback_reason": fallback_reason,
}
)
return normalized
@@ -1218,6 +1626,7 @@ class LlmWikiService:
*,
entry: dict[str, Any],
chunks: list[dict[str, Any]],
reason: str,
) -> dict[str, Any] | None:
first_chunk = next((item for item in chunks if str(item.get("content") or "").strip()), None)
if first_chunk is None:
@@ -1244,6 +1653,9 @@ class LlmWikiService:
"status": "draft",
"created_by": "hermes",
"created_at": datetime.now(UTC).isoformat(),
"extraction_mode": "fallback",
"quality_flags": ["fallback_only", "not_formal_ingest"],
"fallback_reason": reason,
}
@staticmethod
@@ -1266,10 +1678,38 @@ class LlmWikiService:
f"- 来源文档:{document_name}",
f"- 知识条目数:{len(knowledge_candidates)}",
"",
"## 核心知识",
"",
]
quality_status = str(entry.get("quality_status") or "formal").strip() or "formal"
quality_note = str(entry.get("quality_note") or "").strip()
if quality_status != "formal":
lines.extend(
[
"## 归纳状态",
"",
f"- 质量状态:{quality_status}",
f"- 说明:{quality_note or '当前结果不是正式 Hermes 归纳。'}",
"",
]
)
elif quality_note:
lines.extend(
[
"## 归纳状态",
"",
f"- 质量状态:{quality_status}",
f"- 说明:{quality_note}",
"",
]
)
lines.extend(
[
"## 核心知识",
"",
]
)
if not knowledge_candidates:
lines.extend(
[
@@ -1624,6 +2064,10 @@ class LlmWikiService:
def _clean_line(line: str) -> str:
    """Normalize one extracted text line, blanking page-furniture lines.

    Ideographic spaces (U+3000) become ordinary spaces, surrounding
    whitespace is stripped and internal whitespace runs collapse to a
    single space.

    Args:
        line: Raw line; a falsy value is treated as the empty string.

    Returns:
        The empty string when the normalized line fully matches
        ``PAGE_FOOTER_PATTERN`` or is one of the three confidentiality
        markers listed below; otherwise the normalized line.
    """
    normalized = re.sub(r"\s+", " ", str(line or "").replace("\u3000", " ").strip())
    is_page_footer = PAGE_FOOTER_PATTERN.fullmatch(normalized) is not None
    is_secrecy_marker = normalized in ("商密【中】", "商密【高】", "商密【低】")
    return "" if is_page_footer or is_secrecy_marker else normalized
@staticmethod
@@ -1701,6 +2145,11 @@ class LlmWikiService:
return "forced_rebuild"
if existing is None:
return "initial_build"
existing_quality_status = str(existing.get("quality_status") or "").strip()
if existing_quality_status and existing_quality_status != "formal":
return f"quality_{existing_quality_status}_rebuild"
if int(existing.get("formal_knowledge_candidate_count") or 0) <= 0:
return "formal_candidate_missing_rebuild"
previous_signature = existing.get("signature")
if not isinstance(previous_signature, dict):
@@ -1723,6 +2172,35 @@ class LlmWikiService:
return "unchanged_skipped"
return ",".join(reasons)
@staticmethod
def _emit_progress(
progress_callback: Callable[[dict[str, Any], str], None] | None,
payload: dict[str, Any],
summary: str,
) -> None:
if progress_callback is None:
return
progress_callback(payload, summary)
@staticmethod
def _calculate_progress_percent(
*,
completed_documents: int,
skipped_documents: int,
total_documents: int,
group_count: int = 0,
current_group_index: int = 0,
) -> int:
if total_documents <= 0:
return 100
completed_units = completed_documents + skipped_documents
if group_count > 0 and current_group_index > 0:
completed_units += min(current_group_index, group_count) / group_count
percent = round((completed_units / total_documents) * 100)
return max(0, min(percent, 100))
@staticmethod
def _build_document_signature(entry: dict[str, Any]) -> dict[str, Any]:
return {