feat: 增强知识库索引并模块化拆分设置页面

扩展知识库索引任务和 RAG 检索，使其支持增量入库和文档去重；优化本体检测和规则匹配的精度；将前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式；新增日志详情组件和知识入库日志模型；补充单元测试覆盖。
2026-05-22 23:47:28 +08:00
parent 88ff04bef8
commit 5b388d08c0
84 changed files with 10170 additions and 2599 deletions
--- a/server/src/app/services/knowledge_rag.py
+++ b/server/src/app/services/knowledge_rag.py
@@ -12,24 +12,15 @@ from sqlalchemy.orm import Session
 from app.core.config import get_settings
 from app.core.logging import get_logger
 from app.db.session import get_session_factory
+from app.services.knowledge_ingest_log import (
+    build_document_graph_summary,
+    build_ingest_document_summary,
+    build_ingest_status_summary,
+)
 from app.services.knowledge_rag_runtime import (
-    DEFAULT_EMBEDDING_TIMEOUT_SECONDS,
-    DEFAULT_LIGHTRAG_QUERY_MODE,
-    DEFAULT_LLM_TIMEOUT_SECONDS,
    KnowledgeRagError,
    RuntimeModelConfig,
    _LightRagRuntime,
-    _build_ali_rerank_request,
-    _build_azure_deployment_base,
-    _build_headers,
-    _ensure_path,
-    _extract_chat_text,
-    _extract_embedding_vectors,
-    _extract_error_message,
-    _extract_rerank_results,
-    _normalize_endpoint,
-    _parse_json_body,
-    _send_json_request,
 )
 from app.services.settings import SettingsService

@@ -76,11 +67,9 @@ STRUCTURED_APPENDIX_LEADING_MARKERS = (
    "# 结构化表格补充",
 )
 STRUCTURED_APPENDIX_LEADING_WINDOW = 220
-
-
 _runtime_lock = threading.RLock()
-_runtime_instance: _LightRagRuntime | None = None
-_runtime_signature: tuple[Any, ...] | None = None
+_runtime_instances: dict[int, _LightRagRuntime] = {}
+_runtime_signatures: dict[int, tuple[Any, ...]] = {}


 class KnowledgeRagService:
@@ -147,7 +136,11 @@ class KnowledgeRagService:
            "query": normalized_query,
            "record_count": len(hits),
            "hits": hits,
-            "references": [str(item.get("code") or "").strip() for item in hits if str(item.get("code") or "").strip()],
+            "references": [
+                str(item.get("code") or "").strip()
+                for item in hits
+                if str(item.get("code") or "").strip()
+            ],
            "raw_references": references,
            "metadata": raw.get("metadata") if isinstance(raw, dict) else {},
            "message": f"已从知识库中检索到 {len(hits)} 条相关内容。",
@@ -172,6 +165,7 @@ class KnowledgeRagService:
        )
        texts: list[str] = []
        file_paths: list[str] = []
+        document_summaries: list[dict[str, Any]] = []

        runtime = self._get_runtime()
        existing_statuses = runtime.get_document_statuses(normalized_ids)
@@ -182,12 +176,29 @@ class KnowledgeRagService:
                try:
                    runtime.delete_document(document_id)
                except Exception as exc:
-                    logger.warning("Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc)
+                    logger.warning(
+                        "Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
+                    )
            text = knowledge_service.extract_document_text(document_id)
+            raw_text = text
            if normalization_service is not None:
                text = normalization_service.build_enriched_text(text)
            texts.append(text)
-            file_paths.append(str((knowledge_service.library_root / entry["folder"] / entry["stored_name"]).resolve()))
+            file_paths.append(
+                str(
+                    (
+                        knowledge_service.library_root / entry["folder"] / entry["stored_name"]
+                    ).resolve()
+                )
+            )
+            document_summaries.append(
+                build_ingest_document_summary(
+                    document_id=document_id,
+                    entry=entry,
+                    raw_text=raw_text,
+                    indexed_text=text,
+                )
+            )

        track_id = runtime.insert_documents(
            texts=texts,
@@ -198,10 +209,32 @@ class KnowledgeRagService:
        statuses = runtime.get_document_statuses(normalized_ids)
        succeeded_document_ids: list[str] = []
        failed_documents: list[dict[str, str]] = []
+        summary_by_id = {
+            str(item.get("document_id") or "").strip(): item
+            for item in document_summaries
+            if str(item.get("document_id") or "").strip()
+        }

        for document_id in normalized_ids:
            status_obj = statuses.get(document_id)
            status_text = self._status_value(status_obj)
+            status_payload = self._serialize_status(status_obj)
+            workspace = (
+                os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
+                or DEFAULT_LIGHTRAG_WORKSPACE
+            )
+            graph_summary = build_document_graph_summary(
+                self.storage_root,
+                workspace=workspace,
+                document_id=document_id,
+            )
+            if document_id in summary_by_id:
+                summary_by_id[document_id].update(
+                    build_ingest_status_summary(
+                        status_payload=status_payload,
+                        graph_summary=graph_summary,
+                    )
+                )
            if self.is_query_ready_status(status_obj):
                succeeded_document_ids.append(document_id)
                continue
@@ -218,13 +251,18 @@ class KnowledgeRagService:
            "requested_document_ids": normalized_ids,
            "succeeded_document_ids": succeeded_document_ids,
            "failed_documents": failed_documents,
+            "document_summaries": [
+                summary_by_id.get(document_id, {}) for document_id in normalized_ids
+            ],
            "status_snapshot": {
                document_id: self._serialize_status(status_obj)
                for document_id, status_obj in statuses.items()
            },
        }

-    def get_document_status_map(self, document_ids: list[str] | None = None) -> dict[str, dict[str, Any]]:
+    def get_document_status_map(
+        self, document_ids: list[str] | None = None
+    ) -> dict[str, dict[str, Any]]:
        target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
        if not target_ids:
            return {}
@@ -248,28 +286,32 @@ class KnowledgeRagService:
            logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)

    def _get_runtime(self) -> _LightRagRuntime:
-        global _runtime_instance, _runtime_signature
-
        signature, runtime_kwargs = self._build_runtime_signature()
+        thread_id = threading.get_ident()
        with _runtime_lock:
-            if _runtime_instance is not None and _runtime_signature == signature:
-                return _runtime_instance
+            runtime = _runtime_instances.get(thread_id)
+            if runtime is not None and _runtime_signatures.get(thread_id) == signature:
+                return runtime

-            if _runtime_instance is not None:
+            if runtime is not None:
                try:
-                    _runtime_instance.finalize()
+                    runtime.finalize()
                except Exception as exc:  # pragma: no cover - best effort cleanup
                    logger.warning("Finalize previous LightRAG runtime failed: %s", exc)

-            _runtime_instance = _LightRagRuntime(**runtime_kwargs)
-            _runtime_signature = signature
-            return _runtime_instance
+            runtime = _LightRagRuntime(**runtime_kwargs)
+            _runtime_instances[thread_id] = runtime
+            _runtime_signatures[thread_id] = signature
+            return runtime

    def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
        configs = self._load_runtime_configs()
        settings = get_settings()
        working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
-        workspace = os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip() or DEFAULT_LIGHTRAG_WORKSPACE
+        workspace = (
+            os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
+            or DEFAULT_LIGHTRAG_WORKSPACE
+        )
        qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
        qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()

@@ -318,7 +360,9 @@ class KnowledgeRagService:
        try:
            settings_service = SettingsService(session)
            main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
-            embedding = self._normalize_runtime_model(settings_service.get_runtime_model_config("embedding"))
+            embedding = self._normalize_runtime_model(
+                settings_service.get_runtime_model_config("embedding")
+            )
            try:
                backup_raw = settings_service.get_runtime_model_config("backup")
                backup = self._normalize_runtime_model(backup_raw)
@@ -405,7 +449,9 @@ class KnowledgeRagService:

            document_id, document_name = _parse_document_identity(file_path)
            normalized_chunk_id = chunk_id or f"path-{rank}"
-            normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
+            normalized_content = _truncate_text(
+                content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
+            )
            excerpt = _build_query_focused_excerpt(
                normalized_content,
                query_terms=query_terms,
@@ -510,17 +556,14 @@ class KnowledgeRagService:


 def shutdown_knowledge_rag_runtime() -> None:
-    global _runtime_instance, _runtime_signature
-
    with _runtime_lock:
-        if _runtime_instance is None:
-            return
-        try:
-            _runtime_instance.finalize()
-        except Exception as exc:  # pragma: no cover - best effort cleanup
-            logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
-        _runtime_instance = None
-        _runtime_signature = None
+        for runtime in list(_runtime_instances.values()):
+            try:
+                runtime.finalize()
+            except Exception as exc:  # pragma: no cover - best effort cleanup
+                logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
+        _runtime_instances.clear()
+        _runtime_signatures.clear()


 def _parse_document_identity(file_path: str) -> tuple[str, str]:
@@ -551,9 +594,7 @@ def _build_query_focused_excerpt(

    lowered = normalized.lower()
    match_positions = [
-        lowered.find(term)
-        for term in query_terms
-        if term and lowered.find(term) >= 0
+        lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
    ]
    if not match_positions:
        return _build_excerpt(normalized, max_length=max_length)
@@ -649,7 +690,9 @@ def _score_knowledge_hit(
    elif leading_appendix_marker == "# 重点章节摘录":
        score += 4 if matched_terms else -12
    elif leading_appendix_marker == "# 问答线索补充":
-        score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
+        score += (
+            8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
+        )
    elif leading_appendix_marker == "# 结构化表格补充":
        if prefers_tabular_evidence and matched_terms:
            score += 16
@@ -666,7 +709,11 @@ def _score_knowledge_hit(
        score += 4
    if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
        score += 4
-    if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", "：", "-", "•")):
+    if (
+        not prefers_tabular_evidence
+        and matched_terms
+        and any(marker in content for marker in ("第", "条", "：", "-", "•"))
+    ):
        score += 4
    if title and any(term in title for term in query_terms):
        score += 6