后端新增风险规则自动生成和模板执行服务,支持从规则资产批量生成并持久化风险规则文件;知识库入库日志增强图谱查询和本地 RAG 回退,前端审计页面增加风险规则模型和流程图组件,知识入库面板拆分为图谱可视化子组件,报销创建页面增加引导式流程模型,更新知识库索引数据。
773 lines
29 KiB
Python
773 lines
29 KiB
Python
from __future__ import annotations
|
||
|
||
import threading
|
||
from concurrent.futures import Future, ThreadPoolExecutor
|
||
from datetime import UTC, datetime
|
||
from time import perf_counter
|
||
from typing import Any
|
||
|
||
from app.api.deps import CurrentUserContext
|
||
from app.core.agent_enums import AgentName, AgentRunStatus, AgentToolType
|
||
from app.core.logging import get_logger
|
||
from app.db.session import get_session_factory
|
||
from app.services.agent_runs import AgentRunService
|
||
from app.services.knowledge import (
|
||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
||
KnowledgeService,
|
||
)
|
||
from app.services.knowledge_rag import KnowledgeRagService
|
||
|
||
# Module-level logger shared by the background indexing job and its heartbeat thread.
logger = get_logger("app.services.knowledge_index_tasks")
|
||
# Seconds the heartbeat thread waits between ``heartbeat_at`` updates of a running job.
HEARTBEAT_INTERVAL_SECONDS = 10
|
||
|
||
|
||
class KnowledgeIndexTaskManager:
|
||
def __init__(self) -> None:
|
||
self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-index")
|
||
self._futures: dict[str, Future[Any]] = {}
|
||
|
||
def submit_sync(
|
||
self,
|
||
*,
|
||
agent_run_id: str,
|
||
folder: str,
|
||
current_user: CurrentUserContext,
|
||
document_ids: list[str],
|
||
force: bool,
|
||
) -> None:
|
||
future = self._executor.submit(
|
||
self._run_sync,
|
||
agent_run_id,
|
||
folder,
|
||
current_user,
|
||
[str(item).strip() for item in document_ids if str(item).strip()],
|
||
force,
|
||
)
|
||
self._futures[agent_run_id] = future
|
||
|
||
def shutdown(self) -> None:
|
||
self._executor.shutdown(wait=False, cancel_futures=True)
|
||
|
||
@staticmethod
|
||
def _run_sync(
|
||
agent_run_id: str,
|
||
folder: str,
|
||
current_user: CurrentUserContext,
|
||
document_ids: list[str],
|
||
force: bool,
|
||
) -> None:
|
||
session_factory = get_session_factory()
|
||
db = session_factory()
|
||
started = perf_counter()
|
||
heartbeat_stop = threading.Event()
|
||
heartbeat_thread: threading.Thread | None = None
|
||
tool_call_id = ""
|
||
knowledge_ingest: dict[str, Any] | None = None
|
||
tool_request_json = {
|
||
"agent": AgentName.HERMES.value,
|
||
"folder": folder,
|
||
"document_ids": document_ids,
|
||
"force": force,
|
||
}
|
||
|
||
try:
|
||
run_service = AgentRunService(db)
|
||
knowledge_service = KnowledgeService(db=db)
|
||
rag_service = KnowledgeRagService(db=db)
|
||
knowledge_ingest = _build_initial_knowledge_ingest_state(
|
||
knowledge_service,
|
||
document_ids=document_ids,
|
||
)
|
||
|
||
run_service.merge_route_json(
|
||
agent_run_id,
|
||
{
|
||
"job_type": "knowledge_index_sync",
|
||
"phase": "indexing",
|
||
"folder": folder,
|
||
"force": force,
|
||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||
"requested_document_ids": document_ids,
|
||
"requested_by_username": current_user.username,
|
||
"requested_by_name": current_user.name,
|
||
"progress": {
|
||
"total_documents": len(document_ids),
|
||
"completed_documents": 0,
|
||
"failed_documents": 0,
|
||
"skipped_documents": 0,
|
||
"percent": 10 if document_ids else 100,
|
||
},
|
||
"knowledge_ingest": knowledge_ingest,
|
||
},
|
||
result_summary=_build_ingest_running_summary(
|
||
knowledge_ingest,
|
||
{
|
||
"total_documents": len(document_ids),
|
||
"completed_documents": 0,
|
||
"failed_documents": 0,
|
||
"skipped_documents": 0,
|
||
"percent": 10 if document_ids else 100,
|
||
},
|
||
),
|
||
)
|
||
tool_call = run_service.record_tool_call(
|
||
run_id=agent_run_id,
|
||
tool_type=AgentToolType.LLM.value,
|
||
tool_name="lightrag.index_documents",
|
||
request_json=tool_request_json,
|
||
response_json={"phase": "indexing"},
|
||
status="running",
|
||
duration_ms=0,
|
||
error_message=None,
|
||
)
|
||
tool_call_id = tool_call.id
|
||
|
||
def heartbeat_worker() -> None:
|
||
while not heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
|
||
heartbeat_db = session_factory()
|
||
try:
|
||
AgentRunService(heartbeat_db).merge_route_json(
|
||
agent_run_id,
|
||
{
|
||
"job_type": "knowledge_index_sync",
|
||
"phase": "indexing",
|
||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||
},
|
||
)
|
||
except Exception:
|
||
logger.exception(
|
||
"Knowledge index heartbeat update failed run_id=%s",
|
||
agent_run_id,
|
||
)
|
||
finally:
|
||
heartbeat_db.close()
|
||
|
||
heartbeat_thread = threading.Thread(
|
||
target=heartbeat_worker,
|
||
name=f"knowledge-index-heartbeat-{agent_run_id}",
|
||
daemon=True,
|
||
)
|
||
heartbeat_thread.start()
|
||
|
||
responses: list[dict[str, Any]] = []
|
||
succeeded_document_ids: list[str] = []
|
||
failed_documents: list[dict[str, str]] = []
|
||
total_documents = len(document_ids)
|
||
|
||
for index, document_id in enumerate(document_ids, start=1):
|
||
_patch_ingest_document(
|
||
knowledge_ingest,
|
||
document_id,
|
||
{
|
||
"status": "running",
|
||
"phase": "indexing",
|
||
"started_at": datetime.now(UTC).isoformat(),
|
||
},
|
||
event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
|
||
)
|
||
knowledge_ingest["current_document_id"] = document_id
|
||
_sync_ingest_route_json(
|
||
run_service,
|
||
agent_run_id,
|
||
knowledge_ingest,
|
||
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
||
)
|
||
|
||
try:
|
||
response = rag_service.index_documents(document_ids=[document_id], force=force)
|
||
except Exception as exc:
|
||
logger.exception(
|
||
"Knowledge document index failed run_id=%s doc_id=%s",
|
||
agent_run_id,
|
||
document_id,
|
||
)
|
||
failed_documents.append(
|
||
{
|
||
"document_id": document_id,
|
||
"status": "exception",
|
||
"error": str(exc),
|
||
}
|
||
)
|
||
_patch_ingest_document(
|
||
knowledge_ingest,
|
||
document_id,
|
||
{
|
||
"status": "failed",
|
||
"phase": "failed",
|
||
"finished_at": datetime.now(UTC).isoformat(),
|
||
"error": str(exc),
|
||
},
|
||
event=f"归集失败:{exc}",
|
||
level="error",
|
||
)
|
||
knowledge_service.set_document_ingest_statuses(
|
||
[document_id],
|
||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||
agent_run_id=agent_run_id,
|
||
)
|
||
_refresh_ingest_graph(knowledge_ingest)
|
||
_sync_ingest_route_json(
|
||
run_service,
|
||
agent_run_id,
|
||
knowledge_ingest,
|
||
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
||
)
|
||
continue
|
||
|
||
responses.append(response)
|
||
response_failed_documents = _extract_failed_documents(response, document_id)
|
||
document_summary = _extract_document_summary(response, document_id)
|
||
if response_failed_documents:
|
||
failed_documents.extend(response_failed_documents)
|
||
error_text = (
|
||
response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
|
||
)
|
||
_patch_ingest_document(
|
||
knowledge_ingest,
|
||
document_id,
|
||
{
|
||
**document_summary,
|
||
"status": "failed",
|
||
"phase": "failed",
|
||
"finished_at": datetime.now(UTC).isoformat(),
|
||
"error": error_text,
|
||
"track_id": str(response.get("track_id") or "").strip(),
|
||
},
|
||
event=f"LightRAG 索引失败:{error_text}",
|
||
level="error",
|
||
)
|
||
knowledge_service.set_document_ingest_statuses(
|
||
[document_id],
|
||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||
agent_run_id=agent_run_id,
|
||
)
|
||
else:
|
||
succeeded_document_ids.append(document_id)
|
||
chunk_count = int(document_summary.get("chunk_count") or 0)
|
||
entity_count = int(document_summary.get("entity_count") or 0)
|
||
relation_count = int(document_summary.get("relation_count") or 0)
|
||
_patch_ingest_document(
|
||
knowledge_ingest,
|
||
document_id,
|
||
{
|
||
**document_summary,
|
||
"status": "succeeded",
|
||
"phase": "indexed",
|
||
"finished_at": datetime.now(UTC).isoformat(),
|
||
"track_id": str(response.get("track_id") or "").strip(),
|
||
},
|
||
event=(
|
||
"LightRAG 索引完成:"
|
||
f"{chunk_count} 个 chunk,{entity_count} 个实体,"
|
||
f"{relation_count} 条关系。"
|
||
),
|
||
)
|
||
knowledge_service.set_document_ingest_statuses(
|
||
[document_id],
|
||
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
||
agent_run_id=agent_run_id,
|
||
)
|
||
_refresh_ingest_graph(knowledge_ingest)
|
||
_sync_ingest_route_json(
|
||
run_service,
|
||
agent_run_id,
|
||
knowledge_ingest,
|
||
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
||
)
|
||
|
||
failed_document_ids = [
|
||
str(item.get("document_id") or "").strip()
|
||
for item in failed_documents
|
||
if str(item.get("document_id") or "").strip()
|
||
]
|
||
|
||
duration_ms = int((perf_counter() - started) * 1000)
|
||
tool_status = "succeeded" if not failed_document_ids else "failed"
|
||
latest_track_id = _resolve_latest_track_id(responses)
|
||
knowledge_ingest["current_document_id"] = ""
|
||
knowledge_ingest["status"] = tool_status
|
||
knowledge_ingest["phase"] = "completed"
|
||
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
|
||
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
|
||
heartbeat_stop.set()
|
||
if heartbeat_thread is not None:
|
||
heartbeat_thread.join(timeout=1)
|
||
run_service.update_tool_call(
|
||
tool_call_id,
|
||
response_json={
|
||
"track_id": latest_track_id,
|
||
"requested_document_ids": document_ids,
|
||
"succeeded_document_ids": succeeded_document_ids,
|
||
"failed_documents": failed_documents,
|
||
"documents": knowledge_ingest.get("documents", []),
|
||
"responses": responses,
|
||
},
|
||
status=tool_status,
|
||
duration_ms=duration_ms,
|
||
error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
|
||
)
|
||
|
||
completed_documents = len(succeeded_document_ids)
|
||
failed_count = len(failed_document_ids)
|
||
total_documents = len(document_ids)
|
||
summary = (
|
||
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
|
||
if failed_count == 0
|
||
else (
|
||
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
|
||
f"失败 {failed_count} 个。"
|
||
)
|
||
)
|
||
run_service.merge_route_json(
|
||
agent_run_id,
|
||
{
|
||
"job_type": "knowledge_index_sync",
|
||
"phase": "completed",
|
||
"track_id": latest_track_id,
|
||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||
"progress": {
|
||
"total_documents": total_documents,
|
||
"completed_documents": completed_documents,
|
||
"failed_documents": failed_count,
|
||
"skipped_documents": 0,
|
||
"percent": 100,
|
||
},
|
||
"knowledge_ingest": knowledge_ingest,
|
||
},
|
||
status=(
|
||
AgentRunStatus.SUCCEEDED.value
|
||
if failed_count == 0
|
||
else AgentRunStatus.FAILED.value
|
||
),
|
||
result_summary=summary,
|
||
error_message="部分文档索引失败。" if failed_count else None,
|
||
finished_at=datetime.now(UTC),
|
||
)
|
||
except Exception as exc:
|
||
heartbeat_stop.set()
|
||
if heartbeat_thread is not None:
|
||
heartbeat_thread.join(timeout=1)
|
||
try:
|
||
if tool_call_id:
|
||
AgentRunService(db).update_tool_call(
|
||
tool_call_id,
|
||
response_json={"error": str(exc)},
|
||
status="failed",
|
||
duration_ms=int((perf_counter() - started) * 1000),
|
||
error_message=str(exc),
|
||
)
|
||
else:
|
||
AgentRunService(db).record_tool_call(
|
||
run_id=agent_run_id,
|
||
tool_type=AgentToolType.LLM.value,
|
||
tool_name="lightrag.index_documents",
|
||
request_json=tool_request_json,
|
||
response_json={"error": str(exc)},
|
||
status="failed",
|
||
duration_ms=int((perf_counter() - started) * 1000),
|
||
error_message=str(exc),
|
||
)
|
||
KnowledgeService(db=db).set_document_ingest_statuses(
|
||
_resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
|
||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||
agent_run_id=agent_run_id,
|
||
)
|
||
if knowledge_ingest is not None:
|
||
for document_id in document_ids:
|
||
document = _find_ingest_document(knowledge_ingest, document_id)
|
||
if document is None or document.get("status") in {"succeeded", "failed"}:
|
||
continue
|
||
_patch_ingest_document(
|
||
knowledge_ingest,
|
||
document_id,
|
||
{
|
||
"status": "failed",
|
||
"phase": "failed",
|
||
"finished_at": datetime.now(UTC).isoformat(),
|
||
"error": str(exc),
|
||
},
|
||
event=f"归集任务中断:{exc}",
|
||
level="error",
|
||
)
|
||
knowledge_ingest["status"] = "failed"
|
||
knowledge_ingest["phase"] = "failed"
|
||
knowledge_ingest["current_document_id"] = ""
|
||
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
|
||
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
|
||
|
||
route_payload: dict[str, Any] = {
|
||
"job_type": "knowledge_index_sync",
|
||
"phase": "failed",
|
||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||
"progress": {
|
||
"total_documents": len(document_ids),
|
||
"completed_documents": 0,
|
||
"failed_documents": len(document_ids),
|
||
"skipped_documents": 0,
|
||
"percent": 100,
|
||
},
|
||
}
|
||
if knowledge_ingest is not None:
|
||
route_payload["knowledge_ingest"] = knowledge_ingest
|
||
AgentRunService(db).merge_route_json(
|
||
agent_run_id,
|
||
route_payload,
|
||
status=AgentRunStatus.FAILED.value,
|
||
result_summary=str(exc),
|
||
error_message=str(exc),
|
||
finished_at=datetime.now(UTC),
|
||
)
|
||
except Exception:
|
||
logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
|
||
logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
|
||
finally:
|
||
heartbeat_stop.set()
|
||
if heartbeat_thread is not None and heartbeat_thread.is_alive():
|
||
heartbeat_thread.join(timeout=1)
|
||
db.close()
|
||
|
||
|
||
def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Create the ``knowledge_ingest`` payload for a run whose documents are all queued.

    One document record is built per requested id, all sharing the same
    timestamp; the first document (if any) is marked as the current one and the
    aggregate graph is computed from the freshly built records.
    """
    queued_at = datetime.now(UTC).isoformat()
    queued_documents: list[dict[str, Any]] = []
    for requested_id in document_ids:
        queued_documents.append(
            _build_initial_knowledge_ingest_document(
                knowledge_service,
                requested_id,
                now=queued_at,
            )
        )

    first_document_id = queued_documents[0]["document_id"] if queued_documents else ""
    state: dict[str, Any] = {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": queued_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": queued_documents,
        "graph": _build_ingest_graph({"documents": queued_documents}),
    }
    return state
|
||
|
||
|
||
def _build_initial_knowledge_ingest_document(
|
||
knowledge_service: KnowledgeService,
|
||
document_id: str,
|
||
*,
|
||
now: str,
|
||
) -> dict[str, Any]:
|
||
try:
|
||
entry = knowledge_service.get_document_entry(document_id)
|
||
except Exception:
|
||
entry = {}
|
||
return {
|
||
"document_id": document_id,
|
||
"name": str(entry.get("original_name") or document_id).strip(),
|
||
"folder": str(entry.get("folder") or "").strip(),
|
||
"extension": str(entry.get("extension") or "").strip(),
|
||
"mime_type": str(entry.get("mime_type") or "").strip(),
|
||
"status": "queued",
|
||
"phase": "queued",
|
||
"started_at": None,
|
||
"finished_at": None,
|
||
"text_chars": 0,
|
||
"indexed_text_chars": 0,
|
||
"section_count": 0,
|
||
"sections": [],
|
||
"chunk_count": 0,
|
||
"chunk_ids": [],
|
||
"chunks": [],
|
||
"entity_count": 0,
|
||
"relation_count": 0,
|
||
"entities": [],
|
||
"entity_chunks": [],
|
||
"relations": [],
|
||
"events": [
|
||
{
|
||
"at": now,
|
||
"level": "info",
|
||
"message": "已进入知识归集队列,等待 LightRAG 处理。",
|
||
}
|
||
],
|
||
}
|
||
|
||
|
||
def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge ``updates`` into the record of ``document_id`` and optionally log an event.

    Does nothing when the ingest state holds no record for ``document_id``. A
    non-empty ``event`` is appended to the record's event list with ``level``.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)
|
||
|
||
|
||
def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
|
||
events = document.get("events")
|
||
if not isinstance(events, list):
|
||
events = []
|
||
events.append(
|
||
{
|
||
"at": datetime.now(UTC).isoformat(),
|
||
"level": level,
|
||
"message": message,
|
||
}
|
||
)
|
||
document["events"] = events[-30:]
|
||
|
||
|
||
def _find_ingest_document(
|
||
knowledge_ingest: dict[str, Any],
|
||
document_id: str,
|
||
) -> dict[str, Any] | None:
|
||
for document in list(knowledge_ingest.get("documents") or []):
|
||
if not isinstance(document, dict):
|
||
continue
|
||
if str(document.get("document_id") or "").strip() == document_id:
|
||
return document
|
||
return None
|
||
|
||
|
||
def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Write the current ingest state and progress to the run while it is indexing.

    The route JSON receives a fresh ``heartbeat_at`` and the run's result summary
    is regenerated from the same state and progress.
    """
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    route_update: dict[str, Any] = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    run_service.merge_route_json(
        agent_run_id,
        route_update,
        result_summary=running_summary,
    )
|
||
|
||
|
||
def _build_ingest_running_summary(
|
||
knowledge_ingest: dict[str, Any],
|
||
progress: dict[str, int],
|
||
) -> str:
|
||
total_documents = int(progress.get("total_documents") or 0)
|
||
completed_documents = int(progress.get("completed_documents") or 0)
|
||
failed_documents = int(progress.get("failed_documents") or 0)
|
||
current_document_id = str(knowledge_ingest.get("current_document_id") or "").strip()
|
||
current_document = (
|
||
_find_ingest_document(knowledge_ingest, current_document_id)
|
||
if current_document_id
|
||
else None
|
||
)
|
||
if current_document is not None:
|
||
name = str(current_document.get("name") or current_document_id).strip()
|
||
current_index = _resolve_ingest_document_index(knowledge_ingest, current_document_id)
|
||
return (
|
||
f"知识归纳正在处理 {current_index}/{total_documents}:{name}。"
|
||
f"已完成 {completed_documents} 个,失败 {failed_documents} 个。"
|
||
)
|
||
return (
|
||
f"知识归纳正在运行,已完成 {completed_documents}/{total_documents} 个文档,"
|
||
f"失败 {failed_documents} 个。"
|
||
)
|
||
|
||
|
||
def _resolve_ingest_document_index(
|
||
knowledge_ingest: dict[str, Any],
|
||
document_id: str,
|
||
) -> int:
|
||
documents = [
|
||
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
|
||
]
|
||
for index, document in enumerate(documents, start=1):
|
||
if str(document.get("document_id") or "").strip() == document_id:
|
||
return index
|
||
return 0
|
||
|
||
|
||
def _build_ingest_progress(
|
||
knowledge_ingest: dict[str, Any],
|
||
total_documents: int,
|
||
) -> dict[str, int]:
|
||
documents = [
|
||
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
|
||
]
|
||
completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
|
||
failed_documents = sum(1 for item in documents if item.get("status") == "failed")
|
||
skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
|
||
done_documents = completed_documents + failed_documents + skipped_documents
|
||
if total_documents <= 0:
|
||
percent = 100
|
||
else:
|
||
percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
|
||
return {
|
||
"total_documents": total_documents,
|
||
"completed_documents": completed_documents,
|
||
"failed_documents": failed_documents,
|
||
"skipped_documents": skipped_documents,
|
||
"percent": percent,
|
||
}
|
||
|
||
|
||
def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
|
||
for item in list(response.get("document_summaries") or []):
|
||
if not isinstance(item, dict):
|
||
continue
|
||
if str(item.get("document_id") or "").strip() == document_id:
|
||
return dict(item)
|
||
return {}
|
||
|
||
|
||
def _extract_failed_documents(
|
||
response: dict[str, Any],
|
||
document_id: str,
|
||
) -> list[dict[str, str]]:
|
||
failed_documents: list[dict[str, str]] = []
|
||
for item in list(response.get("failed_documents") or []):
|
||
if not isinstance(item, dict):
|
||
continue
|
||
item_document_id = str(item.get("document_id") or "").strip()
|
||
if item_document_id and item_document_id != document_id:
|
||
continue
|
||
failed_documents.append(
|
||
{
|
||
"document_id": item_document_id or document_id,
|
||
"status": str(item.get("status") or "failed").strip(),
|
||
"error": str(item.get("error") or "LightRAG 索引失败").strip(),
|
||
}
|
||
)
|
||
return failed_documents
|
||
|
||
|
||
def _resolve_failed_ingest_document_ids(
|
||
knowledge_ingest: dict[str, Any] | None,
|
||
document_ids: list[str],
|
||
) -> list[str]:
|
||
if knowledge_ingest is None:
|
||
return document_ids
|
||
failed_document_ids: list[str] = []
|
||
seen_document_ids: set[str] = set()
|
||
for document in list(knowledge_ingest.get("documents") or []):
|
||
if not isinstance(document, dict):
|
||
continue
|
||
document_id = str(document.get("document_id") or "").strip()
|
||
if not document_id:
|
||
continue
|
||
seen_document_ids.add(document_id)
|
||
if document.get("status") != "succeeded":
|
||
failed_document_ids.append(document_id)
|
||
failed_document_ids.extend(
|
||
document_id for document_id in document_ids if document_id not in seen_document_ids
|
||
)
|
||
return failed_document_ids
|
||
|
||
|
||
def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregate graph from the current documents and store it in place."""
    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
    knowledge_ingest["graph"] = rebuilt_graph
|
||
|
||
|
||
def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate all document records into one graph summary.

    Counts are summed across documents; entities and relations are de-duplicated
    across documents and capped at 60 items each.
    """
    entries = [
        entry
        for entry in (knowledge_ingest.get("documents") or [])
        if isinstance(entry, dict)
    ]

    all_entities = [
        entity for entry in entries for entity in (entry.get("entities") or [])
    ]
    all_relations = [
        relation for entry in entries for relation in (entry.get("relations") or [])
    ]
    unique_entities = _dedupe_entities(all_entities)
    unique_relations = _dedupe_relations(all_relations)

    totals = {
        counter: sum(_to_int(entry.get(counter)) for entry in entries)
        for counter in ("chunk_count", "entity_count", "relation_count")
    }
    return {
        **totals,
        "entities": unique_entities[:60],
        "relations": unique_relations[:60],
    }
|
||
|
||
|
||
def _dedupe_entities(items: Any) -> list[dict[str, Any]]:
|
||
deduped: list[dict[str, Any]] = []
|
||
seen: set[str] = set()
|
||
for item in items:
|
||
if isinstance(item, dict):
|
||
name = str(
|
||
item.get("name")
|
||
or item.get("entity")
|
||
or item.get("entity_id")
|
||
or item.get("title")
|
||
or item.get("id")
|
||
or ""
|
||
).strip()
|
||
entity = dict(item)
|
||
else:
|
||
name = str(item or "").strip()
|
||
entity = {}
|
||
if not name or name in seen:
|
||
continue
|
||
seen.add(name)
|
||
entity["name"] = name
|
||
entity["type"] = str(
|
||
entity.get("type")
|
||
or entity.get("entity_type")
|
||
or entity.get("category")
|
||
or entity.get("kind")
|
||
or "实体"
|
||
).strip()
|
||
description = str(entity.get("description") or "").strip()
|
||
descriptions = entity.get("descriptions")
|
||
if not isinstance(descriptions, list):
|
||
descriptions = [description] if description else []
|
||
entity["description"] = description
|
||
entity["descriptions"] = [
|
||
str(description_item or "").strip()
|
||
for description_item in descriptions
|
||
if str(description_item or "").strip()
|
||
][:5]
|
||
if not isinstance(entity.get("properties"), dict):
|
||
entity["properties"] = {}
|
||
deduped.append(entity)
|
||
return deduped
|
||
|
||
|
||
def _dedupe_relations(items: Any) -> list[dict[str, Any]]:
|
||
deduped: list[dict[str, Any]] = []
|
||
seen: set[tuple[str, str, str]] = set()
|
||
for item in items:
|
||
if not isinstance(item, dict):
|
||
continue
|
||
source = str(item.get("source") or "").strip()
|
||
target = str(item.get("target") or "").strip()
|
||
relation_type = str(item.get("type") or "关联").strip()
|
||
key = (source, target, relation_type)
|
||
if not source or not target or key in seen:
|
||
continue
|
||
seen.add(key)
|
||
deduped.append({**item, "source": source, "target": target, "type": relation_type})
|
||
return deduped
|
||
|
||
|
||
def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
|
||
for response in reversed(responses):
|
||
track_id = str(response.get("track_id") or "").strip()
|
||
if track_id:
|
||
return track_id
|
||
return ""
|
||
|
||
|
||
def _to_int(value: Any) -> int:
|
||
try:
|
||
return int(value or 0)
|
||
except (TypeError, ValueError):
|
||
return 0
|
||
|
||
|
||
# Shared module-level manager; constructing it creates the single-worker executor at import time.
knowledge_index_task_manager = KnowledgeIndexTaskManager()
|