feat: 增强知识库索引与设置页面模块化拆分

扩展知识库索引任务和 RAG 检索支持增量入库和文档去重，优化本体检测和规则匹配精度，前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式，新增日志详情组件和知识入库日志模型，补充单元测试覆盖。
2026-05-22 23:47:28 +08:00
parent 88ff04bef8
commit 5b388d08c0
84 changed files with 10170 additions and 2599 deletions
--- a/server/src/app/services/knowledge_index_tasks.py
+++ b/server/src/app/services/knowledge_index_tasks.py
@@ -63,6 +63,7 @@ class KnowledgeIndexTaskManager:
        heartbeat_stop = threading.Event()
        heartbeat_thread: threading.Thread | None = None
        tool_call_id = ""
+        knowledge_ingest: dict[str, Any] | None = None
        tool_request_json = {
            "agent": AgentName.HERMES.value,
            "folder": folder,
@@ -74,6 +75,10 @@ class KnowledgeIndexTaskManager:
            run_service = AgentRunService(db)
            knowledge_service = KnowledgeService(db=db)
            rag_service = KnowledgeRagService(db=db)
+            knowledge_ingest = _build_initial_knowledge_ingest_state(
+                knowledge_service,
+                document_ids=document_ids,
+            )

            run_service.merge_route_json(
                agent_run_id,
@@ -93,7 +98,18 @@ class KnowledgeIndexTaskManager:
                        "skipped_documents": 0,
                        "percent": 10 if document_ids else 100,
                    },
+                    "knowledge_ingest": knowledge_ingest,
                },
+                result_summary=_build_ingest_running_summary(
+                    knowledge_ingest,
+                    {
+                        "total_documents": len(document_ids),
+                        "completed_documents": 0,
+                        "failed_documents": 0,
+                        "skipped_documents": 0,
+                        "percent": 10 if document_ids else 100,
+                    },
+                ),
            )
            tool_call = run_service.record_tool_call(
                run_id=agent_run_id,
@@ -134,44 +150,159 @@ class KnowledgeIndexTaskManager:
            )
            heartbeat_thread.start()

-            response = rag_service.index_documents(document_ids=document_ids, force=force)
-            succeeded_document_ids = [
-                str(item).strip()
-                for item in list(response.get("succeeded_document_ids") or [])
-                if str(item).strip()
-            ]
-            failed_documents = [
-                item
-                for item in list(response.get("failed_documents") or [])
-                if isinstance(item, dict)
-            ]
+            responses: list[dict[str, Any]] = []
+            succeeded_document_ids: list[str] = []
+            failed_documents: list[dict[str, str]] = []
+            total_documents = len(document_ids)
+
+            for index, document_id in enumerate(document_ids, start=1):
+                _patch_ingest_document(
+                    knowledge_ingest,
+                    document_id,
+                    {
+                        "status": "running",
+                        "phase": "indexing",
+                        "started_at": datetime.now(UTC).isoformat(),
+                    },
+                    event=f"开始处理第 {index}/{total_documents} 个文件，正在写入 LightRAG。",
+                )
+                knowledge_ingest["current_document_id"] = document_id
+                _sync_ingest_route_json(
+                    run_service,
+                    agent_run_id,
+                    knowledge_ingest,
+                    progress=_build_ingest_progress(knowledge_ingest, total_documents),
+                )
+
+                try:
+                    response = rag_service.index_documents(document_ids=[document_id], force=force)
+                except Exception as exc:
+                    logger.exception(
+                        "Knowledge document index failed run_id=%s doc_id=%s",
+                        agent_run_id,
+                        document_id,
+                    )
+                    failed_documents.append(
+                        {
+                            "document_id": document_id,
+                            "status": "exception",
+                            "error": str(exc),
+                        }
+                    )
+                    _patch_ingest_document(
+                        knowledge_ingest,
+                        document_id,
+                        {
+                            "status": "failed",
+                            "phase": "failed",
+                            "finished_at": datetime.now(UTC).isoformat(),
+                            "error": str(exc),
+                        },
+                        event=f"归集失败：{exc}",
+                        level="error",
+                    )
+                    knowledge_service.set_document_ingest_statuses(
+                        [document_id],
+                        KNOWLEDGE_INGEST_STATUS_FAILED,
+                        agent_run_id=agent_run_id,
+                    )
+                    _refresh_ingest_graph(knowledge_ingest)
+                    _sync_ingest_route_json(
+                        run_service,
+                        agent_run_id,
+                        knowledge_ingest,
+                        progress=_build_ingest_progress(knowledge_ingest, total_documents),
+                    )
+                    continue
+
+                responses.append(response)
+                response_failed_documents = _extract_failed_documents(response, document_id)
+                document_summary = _extract_document_summary(response, document_id)
+                if response_failed_documents:
+                    failed_documents.extend(response_failed_documents)
+                    error_text = (
+                        response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
+                    )
+                    _patch_ingest_document(
+                        knowledge_ingest,
+                        document_id,
+                        {
+                            **document_summary,
+                            "status": "failed",
+                            "phase": "failed",
+                            "finished_at": datetime.now(UTC).isoformat(),
+                            "error": error_text,
+                            "track_id": str(response.get("track_id") or "").strip(),
+                        },
+                        event=f"LightRAG 索引失败：{error_text}",
+                        level="error",
+                    )
+                    knowledge_service.set_document_ingest_statuses(
+                        [document_id],
+                        KNOWLEDGE_INGEST_STATUS_FAILED,
+                        agent_run_id=agent_run_id,
+                    )
+                else:
+                    succeeded_document_ids.append(document_id)
+                    chunk_count = int(document_summary.get("chunk_count") or 0)
+                    entity_count = int(document_summary.get("entity_count") or 0)
+                    relation_count = int(document_summary.get("relation_count") or 0)
+                    _patch_ingest_document(
+                        knowledge_ingest,
+                        document_id,
+                        {
+                            **document_summary,
+                            "status": "succeeded",
+                            "phase": "indexed",
+                            "finished_at": datetime.now(UTC).isoformat(),
+                            "track_id": str(response.get("track_id") or "").strip(),
+                        },
+                        event=(
+                            "LightRAG 索引完成："
+                            f"{chunk_count} 个 chunk，{entity_count} 个实体，"
+                            f"{relation_count} 条关系。"
+                        ),
+                    )
+                    knowledge_service.set_document_ingest_statuses(
+                        [document_id],
+                        KNOWLEDGE_INGEST_STATUS_INGESTED,
+                        agent_run_id=agent_run_id,
+                    )
+                _refresh_ingest_graph(knowledge_ingest)
+                _sync_ingest_route_json(
+                    run_service,
+                    agent_run_id,
+                    knowledge_ingest,
+                    progress=_build_ingest_progress(knowledge_ingest, total_documents),
+                )
+
            failed_document_ids = [
                str(item.get("document_id") or "").strip()
                for item in failed_documents
                if str(item.get("document_id") or "").strip()
            ]

-            if succeeded_document_ids:
-                knowledge_service.set_document_ingest_statuses(
-                    succeeded_document_ids,
-                    KNOWLEDGE_INGEST_STATUS_INGESTED,
-                    agent_run_id=agent_run_id,
-                )
-            if failed_document_ids:
-                knowledge_service.set_document_ingest_statuses(
-                    failed_document_ids,
-                    KNOWLEDGE_INGEST_STATUS_FAILED,
-                    agent_run_id=agent_run_id,
-                )
-
            duration_ms = int((perf_counter() - started) * 1000)
            tool_status = "succeeded" if not failed_document_ids else "failed"
+            latest_track_id = _resolve_latest_track_id(responses)
+            knowledge_ingest["current_document_id"] = ""
+            knowledge_ingest["status"] = tool_status
+            knowledge_ingest["phase"] = "completed"
+            knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
+            knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
            heartbeat_stop.set()
            if heartbeat_thread is not None:
                heartbeat_thread.join(timeout=1)
            run_service.update_tool_call(
                tool_call_id,
-                response_json=response,
+                response_json={
+                    "track_id": latest_track_id,
+                    "requested_document_ids": document_ids,
+                    "succeeded_document_ids": succeeded_document_ids,
+                    "failed_documents": failed_documents,
+                    "documents": knowledge_ingest.get("documents", []),
+                    "responses": responses,
+                },
                status=tool_status,
                duration_ms=duration_ms,
                error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
@@ -183,14 +314,17 @@ class KnowledgeIndexTaskManager:
            summary = (
                f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
                if failed_count == 0
-                else f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引，失败 {failed_count} 个。"
+                else (
+                    f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引，"
+                    f"失败 {failed_count} 个。"
+                )
            )
            run_service.merge_route_json(
                agent_run_id,
                {
                    "job_type": "knowledge_index_sync",
                    "phase": "completed",
-                    "track_id": str(response.get("track_id") or "").strip(),
+                    "track_id": latest_track_id,
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "progress": {
                        "total_documents": total_documents,
@@ -199,6 +333,7 @@ class KnowledgeIndexTaskManager:
                        "skipped_documents": 0,
                        "percent": 100,
                    },
+                    "knowledge_ingest": knowledge_ingest,
                },
                status=(
                    AgentRunStatus.SUCCEEDED.value
@@ -234,24 +369,50 @@ class KnowledgeIndexTaskManager:
                        error_message=str(exc),
                    )
                KnowledgeService(db=db).set_document_ingest_statuses(
-                    document_ids,
+                    _resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
                    KNOWLEDGE_INGEST_STATUS_FAILED,
                    agent_run_id=agent_run_id,
                )
+                if knowledge_ingest is not None:
+                    for document_id in document_ids:
+                        document = _find_ingest_document(knowledge_ingest, document_id)
+                        if document is None or document.get("status") in {"succeeded", "failed"}:
+                            continue
+                        _patch_ingest_document(
+                            knowledge_ingest,
+                            document_id,
+                            {
+                                "status": "failed",
+                                "phase": "failed",
+                                "finished_at": datetime.now(UTC).isoformat(),
+                                "error": str(exc),
+                            },
+                            event=f"归集任务中断：{exc}",
+                            level="error",
+                        )
+                    knowledge_ingest["status"] = "failed"
+                    knowledge_ingest["phase"] = "failed"
+                    knowledge_ingest["current_document_id"] = ""
+                    knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
+                    knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
+
+                route_payload: dict[str, Any] = {
+                    "job_type": "knowledge_index_sync",
+                    "phase": "failed",
+                    "heartbeat_at": datetime.now(UTC).isoformat(),
+                    "progress": {
+                        "total_documents": len(document_ids),
+                        "completed_documents": 0,
+                        "failed_documents": len(document_ids),
+                        "skipped_documents": 0,
+                        "percent": 100,
+                    },
+                }
+                if knowledge_ingest is not None:
+                    route_payload["knowledge_ingest"] = knowledge_ingest
                AgentRunService(db).merge_route_json(
                    agent_run_id,
-                    {
-                        "job_type": "knowledge_index_sync",
-                        "phase": "failed",
-                        "heartbeat_at": datetime.now(UTC).isoformat(),
-                        "progress": {
-                            "total_documents": len(document_ids),
-                            "completed_documents": 0,
-                            "failed_documents": len(document_ids),
-                            "skipped_documents": 0,
-                            "percent": 100,
-                        },
-                    },
+                    route_payload,
                    status=AgentRunStatus.FAILED.value,
                    result_summary=str(exc),
                    error_message=str(exc),
@@ -267,4 +428,312 @@ class KnowledgeIndexTaskManager:
            db.close()


+def _build_initial_knowledge_ingest_state(
+    knowledge_service: KnowledgeService,
+    *,
+    document_ids: list[str],
+) -> dict[str, Any]:
+    """Build the initial ``knowledge_ingest`` payload for an index run.
+
+    Each requested document gets a queued entry stamped with one shared
+    timestamp. The first entry (if any) is recorded as the current document,
+    and the aggregate graph is computed from the freshly built entries.
+    """
+    queued_at = datetime.now(UTC).isoformat()
+    queued_documents: list[dict[str, Any]] = []
+    for document_id in document_ids:
+        queued_documents.append(
+            _build_initial_knowledge_ingest_document(
+                knowledge_service,
+                document_id,
+                now=queued_at,
+            )
+        )
+    first_document_id = queued_documents[0]["document_id"] if queued_documents else ""
+    state: dict[str, Any] = {
+        "schema_version": 1,
+        "status": "running",
+        "phase": "queued",
+        "started_at": queued_at,
+        "finished_at": None,
+        "current_document_id": first_document_id,
+        "documents": queued_documents,
+    }
+    state["graph"] = _build_ingest_graph({"documents": queued_documents})
+    return state
+
+
+def _build_initial_knowledge_ingest_document(
+    knowledge_service: KnowledgeService,
+    document_id: str,
+    *,
+    now: str,
+) -> dict[str, Any]:
+    """Build the queued tracking entry for one document.
+
+    Metadata (name, folder, extension, MIME type) is read best-effort from
+    ``knowledge_service.get_document_entry``; when the lookup raises or
+    returns nothing the entry is still created with empty metadata and the
+    document id as its display name.
+
+    Args:
+        knowledge_service: Service used to look up the stored document entry.
+        document_id: Identifier of the document being queued.
+        now: ISO timestamp recorded on the initial "queued" event.
+
+    Returns:
+        A dict with zeroed counters, empty collections and one initial event.
+    """
+    try:
+        # A falsy result (e.g. None) would otherwise crash on ``entry.get``.
+        entry = knowledge_service.get_document_entry(document_id) or {}
+    except Exception:
+        # Deliberate best-effort: missing metadata must not abort the run,
+        # but the failure should stay visible in the logs.
+        logger.warning(
+            "Knowledge document entry lookup failed doc_id=%s",
+            document_id,
+            exc_info=True,
+        )
+        entry = {}
+    # Fall back to the id when the stored name is absent or whitespace-only.
+    display_name = str(entry.get("original_name") or "").strip() or str(document_id).strip()
+    return {
+        "document_id": document_id,
+        "name": display_name,
+        "folder": str(entry.get("folder") or "").strip(),
+        "extension": str(entry.get("extension") or "").strip(),
+        "mime_type": str(entry.get("mime_type") or "").strip(),
+        "status": "queued",
+        "phase": "queued",
+        "started_at": None,
+        "finished_at": None,
+        "text_chars": 0,
+        "indexed_text_chars": 0,
+        "section_count": 0,
+        "sections": [],
+        "chunk_count": 0,
+        "chunk_ids": [],
+        "chunks": [],
+        "entity_count": 0,
+        "relation_count": 0,
+        "entities": [],
+        "relations": [],
+        "events": [
+            {
+                "at": now,
+                "level": "info",
+                "message": "已进入知识归集队列，等待 LightRAG 处理。",
+            }
+        ],
+    }
+
+
+def _patch_ingest_document(
+    knowledge_ingest: dict[str, Any],
+    document_id: str,
+    updates: dict[str, Any],
+    *,
+    event: str = "",
+    level: str = "info",
+) -> None:
+    """Merge ``updates`` into one tracked document and optionally log an event.
+
+    Does nothing when ``document_id`` has no entry in ``knowledge_ingest``.
+    """
+    target = _find_ingest_document(knowledge_ingest, document_id)
+    if target is not None:
+        target.update(updates)
+        # An empty message means the patch is applied without an event.
+        if event:
+            _append_ingest_event(target, event, level=level)
+
+
+def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
+    """Append a timestamped event to ``document`` and keep only the last 30."""
+    stored = document.get("events")
+    # A missing or non-list history starts over from an empty list.
+    history = stored if isinstance(stored, list) else []
+    entry = {
+        "at": datetime.now(UTC).isoformat(),
+        "level": level,
+        "message": message,
+    }
+    history.append(entry)
+    document["events"] = history[-30:]
+
+
+def _find_ingest_document(
+    knowledge_ingest: dict[str, Any],
+    document_id: str,
+) -> dict[str, Any] | None:
+    """Return the first tracked document whose id equals ``document_id``.
+
+    Non-dict entries are ignored and stored ids are stringified and stripped
+    before comparison. Returns ``None`` when nothing matches.
+    """
+    candidates = list(knowledge_ingest.get("documents") or [])
+    return next(
+        (
+            entry
+            for entry in candidates
+            if isinstance(entry, dict)
+            and str(entry.get("document_id") or "").strip() == document_id
+        ),
+        None,
+    )
+
+
+def _sync_ingest_route_json(
+    run_service: AgentRunService,
+    agent_run_id: str,
+    knowledge_ingest: dict[str, Any],
+    *,
+    progress: dict[str, int],
+) -> None:
+    """Merge the in-progress ingest state into the run's route JSON.
+
+    The payload is tagged with the ``indexing`` phase and a fresh heartbeat
+    timestamp; the result summary is derived from the same state and progress.
+    """
+    route_patch = {
+        "job_type": "knowledge_index_sync",
+        "phase": "indexing",
+        "heartbeat_at": datetime.now(UTC).isoformat(),
+        "progress": progress,
+        "knowledge_ingest": knowledge_ingest,
+    }
+    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
+    run_service.merge_route_json(
+        agent_run_id,
+        route_patch,
+        result_summary=running_summary,
+    )
+
+
+def _build_ingest_running_summary(
+    knowledge_ingest: dict[str, Any],
+    progress: dict[str, int],
+) -> str:
+    """Compose the human-readable summary shown while the ingest is running.
+
+    When the current document can be resolved, the summary names it together
+    with its position in the tracked list; otherwise a generic progress line
+    is returned.
+    """
+    total = int(progress.get("total_documents") or 0)
+    completed = int(progress.get("completed_documents") or 0)
+    failed = int(progress.get("failed_documents") or 0)
+
+    active_id = str(knowledge_ingest.get("current_document_id") or "").strip()
+    active_document = None
+    if active_id:
+        active_document = _find_ingest_document(knowledge_ingest, active_id)
+
+    if active_document is None:
+        return (
+            f"知识归纳正在运行，已完成 {completed}/{total} 个文档，"
+            f"失败 {failed} 个。"
+        )
+
+    display_name = str(active_document.get("name") or active_id).strip()
+    position = _resolve_ingest_document_index(knowledge_ingest, active_id)
+    return (
+        f"知识归纳正在处理 {position}/{total}：{display_name}。"
+        f"已完成 {completed} 个，失败 {failed} 个。"
+    )
+
+
+def _resolve_ingest_document_index(
+    knowledge_ingest: dict[str, Any],
+    document_id: str,
+) -> int:
+    """Return the 1-based position of ``document_id`` among tracked documents.
+
+    Only dict entries are counted. Returns 0 when the id is not found.
+    """
+    tracked = (
+        entry
+        for entry in list(knowledge_ingest.get("documents") or [])
+        if isinstance(entry, dict)
+    )
+    return next(
+        (
+            position
+            for position, entry in enumerate(tracked, start=1)
+            if str(entry.get("document_id") or "").strip() == document_id
+        ),
+        0,
+    )
+
+
+def _build_ingest_progress(
+    knowledge_ingest: dict[str, Any],
+    total_documents: int,
+) -> dict[str, int]:
+    """Summarise per-status document counts and an overall percentage.
+
+    While running, the percentage is kept between 10 and 95; a
+    non-positive ``total_documents`` reports 100.
+    """
+    statuses = [
+        entry.get("status")
+        for entry in list(knowledge_ingest.get("documents") or [])
+        if isinstance(entry, dict)
+    ]
+    succeeded = statuses.count("succeeded")
+    failed = statuses.count("failed")
+    skipped = statuses.count("skipped")
+    finished = succeeded + failed + skipped
+
+    percent = 100
+    if total_documents > 0:
+        scaled = 10 + int(finished * 85 / total_documents)
+        percent = min(95, max(10, scaled))
+
+    return {
+        "total_documents": total_documents,
+        "completed_documents": succeeded,
+        "failed_documents": failed,
+        "skipped_documents": skipped,
+        "percent": percent,
+    }
+
+
+def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
+    """Return a copy of the first ``document_summaries`` entry for ``document_id``.
+
+    Non-dict entries are skipped. Returns an empty dict when none matches.
+    """
+    matched = next(
+        (
+            summary
+            for summary in list(response.get("document_summaries") or [])
+            if isinstance(summary, dict)
+            and str(summary.get("document_id") or "").strip() == document_id
+        ),
+        None,
+    )
+    return {} if matched is None else dict(matched)
+
+
+def _extract_failed_documents(
+    response: dict[str, Any],
+    document_id: str,
+) -> list[dict[str, str]]:
+    """Collect normalised failure records that apply to ``document_id``.
+
+    Records naming a different document are dropped; records without an id
+    are attributed to ``document_id``. Missing status and error fields fall
+    back to defaults.
+    """
+    collected: list[dict[str, str]] = []
+    for raw in list(response.get("failed_documents") or []):
+        if isinstance(raw, dict):
+            reported_id = str(raw.get("document_id") or "").strip()
+            if not reported_id or reported_id == document_id:
+                collected.append(
+                    {
+                        "document_id": reported_id or document_id,
+                        "status": str(raw.get("status") or "failed").strip(),
+                        "error": str(raw.get("error") or "LightRAG 索引失败").strip(),
+                    }
+                )
+    return collected
+
+
+def _resolve_failed_ingest_document_ids(
+    knowledge_ingest: dict[str, Any] | None,
+    document_ids: list[str],
+) -> list[str]:
+    """List the document ids that should be marked failed after an aborted run.
+
+    Without ingest state every requested id is returned. Otherwise the result
+    holds each tracked document whose status is not ``succeeded``, followed by
+    any requested id that is not tracked at all.
+    """
+    if knowledge_ingest is None:
+        return document_ids
+
+    tracked_ids: set[str] = set()
+    failed_ids: list[str] = []
+    for entry in list(knowledge_ingest.get("documents") or []):
+        if isinstance(entry, dict):
+            entry_id = str(entry.get("document_id") or "").strip()
+            if entry_id:
+                tracked_ids.add(entry_id)
+                if entry.get("status") != "succeeded":
+                    failed_ids.append(entry_id)
+
+    untracked_ids = [
+        document_id for document_id in document_ids if document_id not in tracked_ids
+    ]
+    return failed_ids + untracked_ids
+
+
+def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
+    """Recompute the aggregate graph and store it under the ``graph`` key."""
+    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
+    knowledge_ingest["graph"] = rebuilt_graph
+
+
+def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
+    """Aggregate per-document graph data into one summary.
+
+    Counts are summed across all tracked documents. Entities and relations
+    are de-duplicated across documents and capped at 60 items each.
+    """
+    tracked = [
+        entry
+        for entry in list(knowledge_ingest.get("documents") or [])
+        if isinstance(entry, dict)
+    ]
+
+    def _total(field: str) -> int:
+        return sum(_to_int(entry.get(field)) for entry in tracked)
+
+    unique_entities = _dedupe_text_items(
+        name for entry in tracked for name in list(entry.get("entities") or [])
+    )
+    unique_relations = _dedupe_relations(
+        edge for entry in tracked for edge in list(entry.get("relations") or [])
+    )
+    return {
+        "chunk_count": _total("chunk_count"),
+        "entity_count": _total("entity_count"),
+        "relation_count": _total("relation_count"),
+        "entities": unique_entities[:60],
+        "relations": unique_relations[:60],
+    }
+
+
+def _dedupe_text_items(items: Any) -> list[str]:
+    """Return the distinct non-empty stripped strings from ``items``.
+
+    Each item is stringified and stripped; falsy items and blank results are
+    dropped. First-occurrence order is preserved.
+    """
+    cleaned = (str(item or "").strip() for item in items)
+    # dict keys keep insertion order, which de-duplicates without reordering.
+    return list(dict.fromkeys(text for text in cleaned if text))
+
+
+def _dedupe_relations(items: Any) -> list[dict[str, str]]:
+    """Return distinct ``{"source", "target", "type"}`` relations from ``items``.
+
+    Non-dict items and relations missing a source or target are dropped.
+    A relation is a duplicate when source, target and type all match an
+    earlier one; first-occurrence order is preserved.
+
+    Args:
+        items: Iterable of raw relation records.
+
+    Returns:
+        Normalised relations whose values are stripped strings.
+    """
+    deduped: list[dict[str, str]] = []
+    seen: set[tuple[str, str, str]] = set()
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        source = str(item.get("source") or "").strip()
+        target = str(item.get("target") or "").strip()
+        # Apply the default after stripping so a whitespace-only type does not
+        # end up as an empty relation type.
+        relation_type = str(item.get("type") or "").strip() or "关联"
+        key = (source, target, relation_type)
+        if not source or not target or key in seen:
+            continue
+        seen.add(key)
+        deduped.append({"source": source, "target": target, "type": relation_type})
+    return deduped
+
+
+def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
+    """Return the non-empty ``track_id`` of the latest response that has one.
+
+    Responses are scanned from last to first. Returns "" when none has one.
+    """
+    newest_first = (
+        str(response.get("track_id") or "").strip() for response in reversed(responses)
+    )
+    return next((track_id for track_id in newest_first if track_id), "")
+
+
+def _to_int(value: Any) -> int:
+    """Coerce ``value`` to an int, returning 0 when it cannot be converted.
+
+    Falsy values (``None``, ``""``, ``0``) yield 0. Non-numeric text, NaN,
+    infinities and unsupported types also yield 0 instead of raising.
+    """
+    try:
+        return int(value or 0)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError is what int() raises for a float infinity.
+        return 0
+
+
 knowledge_index_task_manager = KnowledgeIndexTaskManager()