feat: 增强知识库功能，优化索引和RAG检索

2026-05-18 02:49:39 +00:00
parent 55e0591a5e
commit 4414ffb34c
18 changed files with 5656 additions and 4659 deletions
--- a/server/src/app/services/knowledge.py
+++ b/server/src/app/services/knowledge.py
@@ -856,7 +856,13 @@ class KnowledgeService:

            status_payload = status_map.get(document_id) or {}
            rag_status = str(status_payload.get("status") or "").strip().lower()
-            if bool(status_payload.get("query_ready")):
+            linked_run_status = self._resolve_linked_ingest_run_status(entry)
+            if (
+                linked_run_status == AgentRunStatus.FAILED.value
+                and rag_status in {"pending", "processing", "preprocessed"}
+            ):
+                desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
+            elif bool(status_payload.get("query_ready")):
                desired_status = KNOWLEDGE_INGEST_STATUS_INGESTED
            elif rag_status in {"pending", "processing", "preprocessed"}:
                desired_status = KNOWLEDGE_INGEST_STATUS_SYNCING
@@ -1007,12 +1013,22 @@ class KnowledgeService:
            probe_entry = {"ingest_status_updated_at": heartbeat_at}
            return not self._is_syncing_status_stale(probe_entry)

-        return not self._is_syncing_status_stale(entry)
-
-    def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
-        for entry in index["documents"]:
-            if entry["id"] == document_id:
-                return entry
+        return not self._is_syncing_status_stale(entry)
+
+    def _resolve_linked_ingest_run_status(self, entry: dict[str, Any]) -> str:
+        """Look up the status of the agent run linked to a document entry.
+
+        Args:
+            entry: Document index entry; its ``ingest_agent_run_id`` value
+                identifies the linked run.
+
+        Returns:
+            The stripped string form of the run's ``status``, or ``""`` when
+            the entry names no run, ``self.db`` is ``None``, the lookup
+            returns ``None``, or the stored status is falsy.
+        """
+        raw_run_id = entry.get("ingest_agent_run_id")
+        linked_run_id = str(raw_run_id or "").strip()
+        if not linked_run_id:
+            return ""
+        if self.db is None:
+            return ""
+
+        lookup = select(AgentRun).where(AgentRun.run_id == linked_run_id)
+        linked_run = self.db.scalar(lookup)
+        # NOTE(review): assumes ``status`` is stored as a plain string; an Enum
+        # member would stringify differently -- confirm against the model.
+        return "" if linked_run is None else str(linked_run.status or "").strip()
+
+    def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
+        """Return the entry in ``index["documents"]`` whose ``id`` matches.
+
+        Args:
+            index: Mapping with a ``"documents"`` list of entry dicts.
+            document_id: Identifier compared against each entry's ``"id"``.
+
+        Returns:
+            The first entry whose ``"id"`` equals ``document_id``.
+
+        Raises:
+            FileNotFoundError: If no entry matches; the id is the sole argument.
+        """
+        matching_entries = (
+            candidate for candidate in index["documents"] if candidate["id"] == document_id
+        )
+        found = next(matching_entries, None)
+        if found is not None:
+            return found
        raise FileNotFoundError(document_id)

    def _resolve_document_path(self, entry: dict[str, Any]) -> Path:
--- a/server/src/app/services/knowledge_index_tasks.py
+++ b/server/src/app/services/knowledge_index_tasks.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import threading
 from concurrent.futures import Future, ThreadPoolExecutor
 from datetime import UTC, datetime
 from time import perf_counter
@@ -18,6 +19,7 @@ from app.services.knowledge import (
 from app.services.knowledge_rag import KnowledgeRagService

 logger = get_logger("app.services.knowledge_index_tasks")
+HEARTBEAT_INTERVAL_SECONDS = 10


 class KnowledgeIndexTaskManager:
@@ -58,6 +60,15 @@ class KnowledgeIndexTaskManager:
        session_factory = get_session_factory()
        db = session_factory()
        started = perf_counter()
+        heartbeat_stop = threading.Event()
+        heartbeat_thread: threading.Thread | None = None
+        tool_call_id = ""
+        tool_request_json = {
+            "agent": AgentName.HERMES.value,
+            "folder": folder,
+            "document_ids": document_ids,
+            "force": force,
+        }

        try:
            run_service = AgentRunService(db)
@@ -84,6 +95,44 @@ class KnowledgeIndexTaskManager:
                    },
                },
            )
+            tool_call = run_service.record_tool_call(
+                run_id=agent_run_id,
+                tool_type=AgentToolType.LLM.value,
+                tool_name="lightrag.index_documents",
+                request_json=tool_request_json,
+                response_json={"phase": "indexing"},
+                status="running",
+                duration_ms=0,
+                error_message=None,
+            )
+            tool_call_id = tool_call.id
+
+            def heartbeat_worker() -> None:
+                """Publish a heartbeat for this run until ``heartbeat_stop`` is set.
+
+                Each time ``HEARTBEAT_INTERVAL_SECONDS`` elapses without a stop
+                request, a new session is opened, ``merge_route_json`` is called
+                with the current UTC timestamp, and the session is closed.
+                A failed update is logged and the loop keeps running.
+                """
+                while True:
+                    # Event.wait returns True once the stop event is set and
+                    # False when the interval simply timed out.
+                    if heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
+                        return
+                    heartbeat_db = session_factory()
+                    try:
+                        heartbeat_service = AgentRunService(heartbeat_db)
+                        heartbeat_payload = {
+                            "job_type": "knowledge_index_sync",
+                            "phase": "indexing",
+                            "heartbeat_at": datetime.now(UTC).isoformat(),
+                        }
+                        heartbeat_service.merge_route_json(agent_run_id, heartbeat_payload)
+                    except Exception:
+                        # Best effort: a missed heartbeat must not kill the worker.
+                        logger.exception(
+                            "Knowledge index heartbeat update failed run_id=%s",
+                            agent_run_id,
+                        )
+                    finally:
+                        heartbeat_db.close()
+
+            heartbeat_thread = threading.Thread(
+                target=heartbeat_worker,
+                name=f"knowledge-index-heartbeat-{agent_run_id}",
+                daemon=True,
+            )
+            heartbeat_thread.start()

            response = rag_service.index_documents(document_ids=document_ids, force=force)
            succeeded_document_ids = [
@@ -117,16 +166,11 @@ class KnowledgeIndexTaskManager:

            duration_ms = int((perf_counter() - started) * 1000)
            tool_status = "succeeded" if not failed_document_ids else "failed"
-            run_service.record_tool_call(
-                run_id=agent_run_id,
-                tool_type=AgentToolType.LLM.value,
-                tool_name="lightrag.index_documents",
-                request_json={
-                    "agent": AgentName.HERMES.value,
-                    "folder": folder,
-                    "document_ids": document_ids,
-                    "force": force,
-                },
+            heartbeat_stop.set()
+            if heartbeat_thread is not None:
+                heartbeat_thread.join(timeout=1)
+            run_service.update_tool_call(
+                tool_call_id,
                response_json=response,
                status=tool_status,
                duration_ms=duration_ms,
@@ -166,22 +210,29 @@ class KnowledgeIndexTaskManager:
                finished_at=datetime.now(UTC),
            )
        except Exception as exc:
+            heartbeat_stop.set()
+            if heartbeat_thread is not None:
+                heartbeat_thread.join(timeout=1)
            try:
-                AgentRunService(db).record_tool_call(
-                    run_id=agent_run_id,
-                    tool_type=AgentToolType.LLM.value,
-                    tool_name="lightrag.index_documents",
-                    request_json={
-                        "agent": AgentName.HERMES.value,
-                        "folder": folder,
-                        "document_ids": document_ids,
-                        "force": force,
-                    },
-                    response_json={"error": str(exc)},
-                    status="failed",
-                    duration_ms=int((perf_counter() - started) * 1000),
-                    error_message=str(exc),
-                )
+                if tool_call_id:
+                    AgentRunService(db).update_tool_call(
+                        tool_call_id,
+                        response_json={"error": str(exc)},
+                        status="failed",
+                        duration_ms=int((perf_counter() - started) * 1000),
+                        error_message=str(exc),
+                    )
+                else:
+                    AgentRunService(db).record_tool_call(
+                        run_id=agent_run_id,
+                        tool_type=AgentToolType.LLM.value,
+                        tool_name="lightrag.index_documents",
+                        request_json=tool_request_json,
+                        response_json={"error": str(exc)},
+                        status="failed",
+                        duration_ms=int((perf_counter() - started) * 1000),
+                        error_message=str(exc),
+                    )
                KnowledgeService(db=db).set_document_ingest_statuses(
                    document_ids,
                    KNOWLEDGE_INGEST_STATUS_FAILED,
@@ -210,6 +261,9 @@ class KnowledgeIndexTaskManager:
                logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
            logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
        finally:
+            heartbeat_stop.set()
+            if heartbeat_thread is not None and heartbeat_thread.is_alive():
+                heartbeat_thread.join(timeout=1)
            db.close()


--- a/server/src/app/services/knowledge_normalizer.py
+++ b/server/src/app/services/knowledge_normalizer.py
@@ -83,24 +83,23 @@ class KnowledgeNormalizationService:
            if rendered:
                normalized_tables.append(f"## {candidate.title}\n\n{rendered}")

-        parts: list[str] = []
+        appendix_parts: list[str] = []
        if section_appendix:
-            parts.append(section_appendix)
+            appendix_parts.append(section_appendix)
        if answer_clue_appendix:
-            parts.append(answer_clue_appendix)
+            appendix_parts.append(answer_clue_appendix)
        if normalized_tables:
            appendix = "\n\n".join(normalized_tables)
-            parts.append(
+            appendix_parts.append(
                "# 结构化表格补充\n\n"
                "以下表格由知识归纳阶段依据原文重新整理，供问答检索时优先理解行列关系。\n\n"
                f"{appendix}"
            )

-        if not parts:
+        if not appendix_parts:
            return normalized_text

-        parts.append(f"# 原文\n\n{normalized_text}")
-        return "\n\n".join(parts)
+        return "\n\n".join([normalized_text, *appendix_parts])

    @staticmethod
    def _extract_table_candidates(text: str) -> list[TableCandidate]:
--- a/server/src/app/services/knowledge_rag.py
+++ b/server/src/app/services/knowledge_rag.py
@@ -33,6 +33,7 @@ DEFAULT_LIGHTRAG_QUERY_MODE = "naive"
 DEFAULT_LLM_TIMEOUT_SECONDS = 180
 DEFAULT_EMBEDDING_TIMEOUT_SECONDS = 120
 MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
+MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
 MAX_QUERY_TERMS = 12
 QUERY_TERM_STOPWORDS = {
    "什么",
@@ -62,6 +63,13 @@ TABLE_OR_STANDARD_QUERY_HINTS = (
    "档位",
    "额度",
 )
+STRUCTURED_APPENDIX_LEADING_MARKERS = (
+    "# 章节导航",
+    "# 重点章节摘录",
+    "# 问答线索补充",
+    "# 结构化表格补充",
+)
+STRUCTURED_APPENDIX_LEADING_WINDOW = 220

 _runtime_lock = threading.RLock()
 _runtime_instance: _LightRagRuntime | None = None
@@ -830,7 +838,11 @@ class KnowledgeRagService:
            document_id, document_name = _parse_document_identity(file_path)
            normalized_chunk_id = chunk_id or f"path-{rank}"
            normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
-            excerpt = _build_excerpt(normalized_content, max_length=220)
+            excerpt = _build_query_focused_excerpt(
+                normalized_content,
+                query_terms=query_terms,
+                max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
+            )
            candidates.append(
                {
                    "code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
@@ -907,8 +919,12 @@ class KnowledgeRagService:
    @staticmethod
    def is_query_ready_status(status_obj: Any) -> bool:
        status_text = KnowledgeRagService._status_value(status_obj)
+        if status_text in {"failed", "error", "aborted"}:
+            return False
        if status_text == "processed":
            return True
+        if status_text in {"pending", "processing", "preprocessed"}:
+            return False

        chunks_count = getattr(status_obj, "chunks_count", None)
        if chunks_count is None and isinstance(status_obj, dict):
@@ -1168,6 +1184,35 @@ def _build_excerpt(text: str, *, max_length: int = 180) -> str:
    return f"{normalized[: max_length - 3].rstrip()}..."


+def _build_query_focused_excerpt(
+    text: str,
+    *,
+    query_terms: list[str],
+    max_length: int = 180,
+) -> str:
+    """Build an excerpt of ``text`` windowed around the earliest query-term hit.
+
+    Whitespace runs are collapsed to single spaces first. Terms are matched
+    case-insensitively. Text that already fits in ``max_length`` is returned
+    whole; otherwise a ``max_length``-character window is cut, starting about
+    one third of the window before the earliest match and shifted left when
+    the match sits near the end, so the window is always full.
+
+    Args:
+        text: Source text; ``None`` and other falsy values yield ``""``.
+        query_terms: Terms to look for; empty terms are ignored.
+        max_length: Window size in characters, excluding any ``...`` markers
+            added on a truncated side.
+
+    Returns:
+        The excerpt, prefixed and/or suffixed with ``...`` where text was cut.
+        When no term matches, the result of ``_build_excerpt`` is returned.
+    """
+    normalized = " ".join(str(text or "").split()).strip()
+    if not normalized:
+        return ""
+
+    lowered = normalized.lower()
+    match_positions: list[int] = []
+    for term in query_terms:
+        if not term:
+            continue
+        # The haystack is lowercased, so the needle has to be lowercased too;
+        # otherwise a term such as "RAG" could never match.
+        position = lowered.find(term.lower())
+        if position >= 0:
+            match_positions.append(position)
+    if not match_positions:
+        return _build_excerpt(normalized, max_length=max_length)
+    if len(normalized) <= max_length:
+        # Everything fits: cutting around the match would only lose context.
+        return normalized
+
+    start = max(0, min(match_positions) - max_length // 3)
+    # Pull the window left when the match is close to the end of the text.
+    start = min(start, len(normalized) - max_length)
+    end = start + max_length
+    snippet = normalized[start:end].strip()
+    if start > 0:
+        snippet = f"...{snippet}"
+    if end < len(normalized):
+        snippet = f"{snippet}..."
+    return snippet
+
+
 def _truncate_text(text: str, *, max_length: int) -> str:
    normalized = str(text or "").strip()
    if len(normalized) <= max_length:
@@ -1243,19 +1288,43 @@ def _score_knowledge_hit(
    score += len(matched_terms) * 8
    score += sum(1 for term in matched_terms if term in title) * 6

-    if "结构化表格补充" in content:
-        score += 18
-    if "问答线索补充" in content:
-        score += 16 if not prefers_tabular_evidence else 8
-    if "重点章节摘录" in content:
+    leading_appendix_marker = _leading_structured_appendix_marker(content)
+    if leading_appendix_marker == "# 章节导航":
+        score -= 24
+    elif leading_appendix_marker == "# 重点章节摘录":
+        score += 4 if matched_terms else -12
+    elif leading_appendix_marker == "# 问答线索补充":
+        score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
+    elif leading_appendix_marker == "# 结构化表格补充":
+        if prefers_tabular_evidence and matched_terms:
+            score += 16
+        elif matched_terms:
+            score += 6
+        else:
+            score -= 18
+
+    if prefers_tabular_evidence and matched_terms and ("|" in content or "表" in content):
        score += 10
-    if "章节导航" in content:
+    if matched_terms and any(marker in content for marker in ("：", ":")):
+        score += 10
+    if matched_terms and "\n" in content:
        score += 4
-    if prefers_tabular_evidence and ("|" in content or "表" in content or "结构化表格补充" in content):
-        score += 12
-    if not prefers_tabular_evidence and any(marker in content for marker in ("第", "条", "：", "-", "•")):
+    if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
+        score += 4
+    if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", "：", "-", "•")):
        score += 4
    if title and any(term in title for term in query_terms):
        score += 6
+    if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
+        score -= 12

    return score
+
+
+def _leading_structured_appendix_marker(content: str) -> str:
+    """Return the structured-appendix heading that opens ``content``, if any.
+
+    Only headings that start within ``STRUCTURED_APPENDIX_LEADING_WINDOW``
+    characters of the left-stripped text count. When several headings fall
+    inside that window, the one that appears earliest in the text wins,
+    independent of the order of ``STRUCTURED_APPENDIX_LEADING_MARKERS``.
+
+    Args:
+        content: Chunk text to inspect; falsy values are treated as ``""``.
+
+    Returns:
+        The matching marker string, or ``""`` when none starts in the window.
+    """
+    normalized = str(content or "").lstrip()
+    leading_marker = ""
+    leading_index = STRUCTURED_APPENDIX_LEADING_WINDOW + 1
+    for marker in STRUCTURED_APPENDIX_LEADING_MARKERS:
+        # Bound the search so long chunks are not scanned past the window; the
+        # end offset still admits a marker starting exactly at the window edge.
+        index = normalized.find(marker, 0, STRUCTURED_APPENDIX_LEADING_WINDOW + len(marker))
+        if 0 <= index < leading_index:
+            leading_marker, leading_index = marker, index
+    return leading_marker