Files
X-Financial/server/src/app/services/knowledge_index_tasks.py
caoxiaozhu 575f093c74 feat: 新增风险规则生成引擎与知识图谱可视化
后端新增风险规则自动生成和模板执行服务,支持从规则资产
批量生成并持久化风险规则文件;知识库入库日志增强图谱
查询和本地 RAG 回退,前端审计页面增加风险规则模型和流
程图组件,知识入库面板拆分为图谱可视化子组件,报销创
建页面增加引导式流程模型,更新知识库索引数据。
2026-05-23 19:54:42 +08:00

773 lines
29 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import threading
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import UTC, datetime
from time import perf_counter
from typing import Any
from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentName, AgentRunStatus, AgentToolType
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.agent_runs import AgentRunService
from app.services.knowledge import (
KNOWLEDGE_INGEST_STATUS_FAILED,
KNOWLEDGE_INGEST_STATUS_INGESTED,
KnowledgeService,
)
from app.services.knowledge_rag import KnowledgeRagService
# Module-level logger used by the background index task and its helpers.
logger = get_logger("app.services.knowledge_index_tasks")
# Seconds the heartbeat worker waits between "heartbeat_at" updates while a job runs.
HEARTBEAT_INTERVAL_SECONDS = 10
class KnowledgeIndexTaskManager:
def __init__(self) -> None:
self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-index")
self._futures: dict[str, Future[Any]] = {}
def submit_sync(
self,
*,
agent_run_id: str,
folder: str,
current_user: CurrentUserContext,
document_ids: list[str],
force: bool,
) -> None:
future = self._executor.submit(
self._run_sync,
agent_run_id,
folder,
current_user,
[str(item).strip() for item in document_ids if str(item).strip()],
force,
)
self._futures[agent_run_id] = future
def shutdown(self) -> None:
self._executor.shutdown(wait=False, cancel_futures=True)
@staticmethod
def _run_sync(
agent_run_id: str,
folder: str,
current_user: CurrentUserContext,
document_ids: list[str],
force: bool,
) -> None:
session_factory = get_session_factory()
db = session_factory()
started = perf_counter()
heartbeat_stop = threading.Event()
heartbeat_thread: threading.Thread | None = None
tool_call_id = ""
knowledge_ingest: dict[str, Any] | None = None
tool_request_json = {
"agent": AgentName.HERMES.value,
"folder": folder,
"document_ids": document_ids,
"force": force,
}
try:
run_service = AgentRunService(db)
knowledge_service = KnowledgeService(db=db)
rag_service = KnowledgeRagService(db=db)
knowledge_ingest = _build_initial_knowledge_ingest_state(
knowledge_service,
document_ids=document_ids,
)
run_service.merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "indexing",
"folder": folder,
"force": force,
"heartbeat_at": datetime.now(UTC).isoformat(),
"requested_document_ids": document_ids,
"requested_by_username": current_user.username,
"requested_by_name": current_user.name,
"progress": {
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 10 if document_ids else 100,
},
"knowledge_ingest": knowledge_ingest,
},
result_summary=_build_ingest_running_summary(
knowledge_ingest,
{
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 10 if document_ids else 100,
},
),
)
tool_call = run_service.record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json=tool_request_json,
response_json={"phase": "indexing"},
status="running",
duration_ms=0,
error_message=None,
)
tool_call_id = tool_call.id
def heartbeat_worker() -> None:
while not heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
heartbeat_db = session_factory()
try:
AgentRunService(heartbeat_db).merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "indexing",
"heartbeat_at": datetime.now(UTC).isoformat(),
},
)
except Exception:
logger.exception(
"Knowledge index heartbeat update failed run_id=%s",
agent_run_id,
)
finally:
heartbeat_db.close()
heartbeat_thread = threading.Thread(
target=heartbeat_worker,
name=f"knowledge-index-heartbeat-{agent_run_id}",
daemon=True,
)
heartbeat_thread.start()
responses: list[dict[str, Any]] = []
succeeded_document_ids: list[str] = []
failed_documents: list[dict[str, str]] = []
total_documents = len(document_ids)
for index, document_id in enumerate(document_ids, start=1):
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "running",
"phase": "indexing",
"started_at": datetime.now(UTC).isoformat(),
},
event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
)
knowledge_ingest["current_document_id"] = document_id
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
try:
response = rag_service.index_documents(document_ids=[document_id], force=force)
except Exception as exc:
logger.exception(
"Knowledge document index failed run_id=%s doc_id=%s",
agent_run_id,
document_id,
)
failed_documents.append(
{
"document_id": document_id,
"status": "exception",
"error": str(exc),
}
)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": str(exc),
},
event=f"归集失败:{exc}",
level="error",
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
_refresh_ingest_graph(knowledge_ingest)
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
continue
responses.append(response)
response_failed_documents = _extract_failed_documents(response, document_id)
document_summary = _extract_document_summary(response, document_id)
if response_failed_documents:
failed_documents.extend(response_failed_documents)
error_text = (
response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
**document_summary,
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": error_text,
"track_id": str(response.get("track_id") or "").strip(),
},
event=f"LightRAG 索引失败:{error_text}",
level="error",
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
else:
succeeded_document_ids.append(document_id)
chunk_count = int(document_summary.get("chunk_count") or 0)
entity_count = int(document_summary.get("entity_count") or 0)
relation_count = int(document_summary.get("relation_count") or 0)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
**document_summary,
"status": "succeeded",
"phase": "indexed",
"finished_at": datetime.now(UTC).isoformat(),
"track_id": str(response.get("track_id") or "").strip(),
},
event=(
"LightRAG 索引完成:"
f"{chunk_count} 个 chunk{entity_count} 个实体,"
f"{relation_count} 条关系。"
),
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_INGESTED,
agent_run_id=agent_run_id,
)
_refresh_ingest_graph(knowledge_ingest)
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
failed_document_ids = [
str(item.get("document_id") or "").strip()
for item in failed_documents
if str(item.get("document_id") or "").strip()
]
duration_ms = int((perf_counter() - started) * 1000)
tool_status = "succeeded" if not failed_document_ids else "failed"
latest_track_id = _resolve_latest_track_id(responses)
knowledge_ingest["current_document_id"] = ""
knowledge_ingest["status"] = tool_status
knowledge_ingest["phase"] = "completed"
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
run_service.update_tool_call(
tool_call_id,
response_json={
"track_id": latest_track_id,
"requested_document_ids": document_ids,
"succeeded_document_ids": succeeded_document_ids,
"failed_documents": failed_documents,
"documents": knowledge_ingest.get("documents", []),
"responses": responses,
},
status=tool_status,
duration_ms=duration_ms,
error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
)
completed_documents = len(succeeded_document_ids)
failed_count = len(failed_document_ids)
total_documents = len(document_ids)
summary = (
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
if failed_count == 0
else (
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
f"失败 {failed_count} 个。"
)
)
run_service.merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "completed",
"track_id": latest_track_id,
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": total_documents,
"completed_documents": completed_documents,
"failed_documents": failed_count,
"skipped_documents": 0,
"percent": 100,
},
"knowledge_ingest": knowledge_ingest,
},
status=(
AgentRunStatus.SUCCEEDED.value
if failed_count == 0
else AgentRunStatus.FAILED.value
),
result_summary=summary,
error_message="部分文档索引失败。" if failed_count else None,
finished_at=datetime.now(UTC),
)
except Exception as exc:
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
try:
if tool_call_id:
AgentRunService(db).update_tool_call(
tool_call_id,
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
else:
AgentRunService(db).record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json=tool_request_json,
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
KnowledgeService(db=db).set_document_ingest_statuses(
_resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
if knowledge_ingest is not None:
for document_id in document_ids:
document = _find_ingest_document(knowledge_ingest, document_id)
if document is None or document.get("status") in {"succeeded", "failed"}:
continue
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": str(exc),
},
event=f"归集任务中断:{exc}",
level="error",
)
knowledge_ingest["status"] = "failed"
knowledge_ingest["phase"] = "failed"
knowledge_ingest["current_document_id"] = ""
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
route_payload: dict[str, Any] = {
"job_type": "knowledge_index_sync",
"phase": "failed",
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": len(document_ids),
"skipped_documents": 0,
"percent": 100,
},
}
if knowledge_ingest is not None:
route_payload["knowledge_ingest"] = knowledge_ingest
AgentRunService(db).merge_route_json(
agent_run_id,
route_payload,
status=AgentRunStatus.FAILED.value,
result_summary=str(exc),
error_message=str(exc),
finished_at=datetime.now(UTC),
)
except Exception:
logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
finally:
heartbeat_stop.set()
if heartbeat_thread is not None and heartbeat_thread.is_alive():
heartbeat_thread.join(timeout=1)
db.close()
def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Create the initial ``knowledge_ingest`` payload for a new sync job.

    Every requested document gets a queued entry sharing one timestamp; the
    first document (if any) is marked as the current one.
    """
    queued_at = datetime.now(UTC).isoformat()
    queued_documents: list[dict[str, Any]] = []
    for requested_id in document_ids:
        queued_documents.append(
            _build_initial_knowledge_ingest_document(
                knowledge_service,
                requested_id,
                now=queued_at,
            )
        )
    first_document_id = queued_documents[0]["document_id"] if queued_documents else ""
    initial_graph = _build_ingest_graph({"documents": queued_documents})
    return {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": queued_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": queued_documents,
        "graph": initial_graph,
    }
def _build_initial_knowledge_ingest_document(
knowledge_service: KnowledgeService,
document_id: str,
*,
now: str,
) -> dict[str, Any]:
try:
entry = knowledge_service.get_document_entry(document_id)
except Exception:
entry = {}
return {
"document_id": document_id,
"name": str(entry.get("original_name") or document_id).strip(),
"folder": str(entry.get("folder") or "").strip(),
"extension": str(entry.get("extension") or "").strip(),
"mime_type": str(entry.get("mime_type") or "").strip(),
"status": "queued",
"phase": "queued",
"started_at": None,
"finished_at": None,
"text_chars": 0,
"indexed_text_chars": 0,
"section_count": 0,
"sections": [],
"chunk_count": 0,
"chunk_ids": [],
"chunks": [],
"entity_count": 0,
"relation_count": 0,
"entities": [],
"entity_chunks": [],
"relations": [],
"events": [
{
"at": now,
"level": "info",
"message": "已进入知识归集队列,等待 LightRAG 处理。",
}
],
}
def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge ``updates`` into one document record and optionally log an event.

    Does nothing when no record with ``document_id`` exists.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)
def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
events = document.get("events")
if not isinstance(events, list):
events = []
events.append(
{
"at": datetime.now(UTC).isoformat(),
"level": level,
"message": message,
}
)
document["events"] = events[-30:]
def _find_ingest_document(
knowledge_ingest: dict[str, Any],
document_id: str,
) -> dict[str, Any] | None:
for document in list(knowledge_ingest.get("documents") or []):
if not isinstance(document, dict):
continue
if str(document.get("document_id") or "").strip() == document_id:
return document
return None
def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Write the current ingest state and progress to the run's route JSON."""
    route_update = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    run_service.merge_route_json(
        agent_run_id,
        route_update,
        result_summary=running_summary,
    )
def _build_ingest_running_summary(
knowledge_ingest: dict[str, Any],
progress: dict[str, int],
) -> str:
total_documents = int(progress.get("total_documents") or 0)
completed_documents = int(progress.get("completed_documents") or 0)
failed_documents = int(progress.get("failed_documents") or 0)
current_document_id = str(knowledge_ingest.get("current_document_id") or "").strip()
current_document = (
_find_ingest_document(knowledge_ingest, current_document_id)
if current_document_id
else None
)
if current_document is not None:
name = str(current_document.get("name") or current_document_id).strip()
current_index = _resolve_ingest_document_index(knowledge_ingest, current_document_id)
return (
f"知识归纳正在处理 {current_index}/{total_documents}{name}"
f"已完成 {completed_documents} 个,失败 {failed_documents} 个。"
)
return (
f"知识归纳正在运行,已完成 {completed_documents}/{total_documents} 个文档,"
f"失败 {failed_documents} 个。"
)
def _resolve_ingest_document_index(
knowledge_ingest: dict[str, Any],
document_id: str,
) -> int:
documents = [
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
]
for index, document in enumerate(documents, start=1):
if str(document.get("document_id") or "").strip() == document_id:
return index
return 0
def _build_ingest_progress(
knowledge_ingest: dict[str, Any],
total_documents: int,
) -> dict[str, int]:
documents = [
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
]
completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
failed_documents = sum(1 for item in documents if item.get("status") == "failed")
skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
done_documents = completed_documents + failed_documents + skipped_documents
if total_documents <= 0:
percent = 100
else:
percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
return {
"total_documents": total_documents,
"completed_documents": completed_documents,
"failed_documents": failed_documents,
"skipped_documents": skipped_documents,
"percent": percent,
}
def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
for item in list(response.get("document_summaries") or []):
if not isinstance(item, dict):
continue
if str(item.get("document_id") or "").strip() == document_id:
return dict(item)
return {}
def _extract_failed_documents(
response: dict[str, Any],
document_id: str,
) -> list[dict[str, str]]:
failed_documents: list[dict[str, str]] = []
for item in list(response.get("failed_documents") or []):
if not isinstance(item, dict):
continue
item_document_id = str(item.get("document_id") or "").strip()
if item_document_id and item_document_id != document_id:
continue
failed_documents.append(
{
"document_id": item_document_id or document_id,
"status": str(item.get("status") or "failed").strip(),
"error": str(item.get("error") or "LightRAG 索引失败").strip(),
}
)
return failed_documents
def _resolve_failed_ingest_document_ids(
knowledge_ingest: dict[str, Any] | None,
document_ids: list[str],
) -> list[str]:
if knowledge_ingest is None:
return document_ids
failed_document_ids: list[str] = []
seen_document_ids: set[str] = set()
for document in list(knowledge_ingest.get("documents") or []):
if not isinstance(document, dict):
continue
document_id = str(document.get("document_id") or "").strip()
if not document_id:
continue
seen_document_ids.add(document_id)
if document.get("status") != "succeeded":
failed_document_ids.append(document_id)
failed_document_ids.extend(
document_id for document_id in document_ids if document_id not in seen_document_ids
)
return failed_document_ids
def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregate graph and store it under the ``graph`` key."""
    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
    knowledge_ingest.update(graph=rebuilt_graph)
def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate per-document graph data into one summary.

    Counts are summed over all dict document records; entities and relations
    are de-duplicated across documents and each list is capped at 60 items.
    """
    all_entities: list[Any] = []
    all_relations: list[Any] = []
    chunk_total = 0
    entity_total = 0
    relation_total = 0
    for record in list(knowledge_ingest.get("documents") or []):
        if not isinstance(record, dict):
            continue
        all_entities.extend(list(record.get("entities") or []))
        all_relations.extend(list(record.get("relations") or []))
        chunk_total += _to_int(record.get("chunk_count"))
        entity_total += _to_int(record.get("entity_count"))
        relation_total += _to_int(record.get("relation_count"))
    unique_entities = _dedupe_entities(all_entities)
    unique_relations = _dedupe_relations(all_relations)
    return {
        "chunk_count": chunk_total,
        "entity_count": entity_total,
        "relation_count": relation_total,
        "entities": unique_entities[:60],
        "relations": unique_relations[:60],
    }
def _dedupe_entities(items: Any) -> list[dict[str, Any]]:
deduped: list[dict[str, Any]] = []
seen: set[str] = set()
for item in items:
if isinstance(item, dict):
name = str(
item.get("name")
or item.get("entity")
or item.get("entity_id")
or item.get("title")
or item.get("id")
or ""
).strip()
entity = dict(item)
else:
name = str(item or "").strip()
entity = {}
if not name or name in seen:
continue
seen.add(name)
entity["name"] = name
entity["type"] = str(
entity.get("type")
or entity.get("entity_type")
or entity.get("category")
or entity.get("kind")
or "实体"
).strip()
description = str(entity.get("description") or "").strip()
descriptions = entity.get("descriptions")
if not isinstance(descriptions, list):
descriptions = [description] if description else []
entity["description"] = description
entity["descriptions"] = [
str(description_item or "").strip()
for description_item in descriptions
if str(description_item or "").strip()
][:5]
if not isinstance(entity.get("properties"), dict):
entity["properties"] = {}
deduped.append(entity)
return deduped
def _dedupe_relations(items: Any) -> list[dict[str, Any]]:
deduped: list[dict[str, Any]] = []
seen: set[tuple[str, str, str]] = set()
for item in items:
if not isinstance(item, dict):
continue
source = str(item.get("source") or "").strip()
target = str(item.get("target") or "").strip()
relation_type = str(item.get("type") or "关联").strip()
key = (source, target, relation_type)
if not source or not target or key in seen:
continue
seen.add(key)
deduped.append({**item, "source": source, "target": target, "type": relation_type})
return deduped
def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
for response in reversed(responses):
track_id = str(response.get("track_id") or "").strip()
if track_id:
return track_id
return ""
def _to_int(value: Any) -> int:
try:
return int(value or 0)
except (TypeError, ValueError):
return 0
# Module-level manager instance, created when this module is imported.
knowledge_index_task_manager = KnowledgeIndexTaskManager()