Files
X-Financial/server/src/app/services/knowledge_index_tasks.py

773 lines
29 KiB
Python
Raw Normal View History

from __future__ import annotations
import threading
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import UTC, datetime
from time import perf_counter
from typing import Any
from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentName, AgentRunStatus, AgentToolType
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.agent_runs import AgentRunService
from app.services.knowledge import (
KNOWLEDGE_INGEST_STATUS_FAILED,
KNOWLEDGE_INGEST_STATUS_INGESTED,
KnowledgeService,
)
from app.services.knowledge_rag import KnowledgeRagService
# Module-level logger used by the background index task and its heartbeat thread.
logger = get_logger("app.services.knowledge_index_tasks")
# Seconds the heartbeat thread waits between `heartbeat_at` refreshes of a running job.
HEARTBEAT_INTERVAL_SECONDS = 10
class KnowledgeIndexTaskManager:
def __init__(self) -> None:
self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-index")
self._futures: dict[str, Future[Any]] = {}
def submit_sync(
self,
*,
agent_run_id: str,
folder: str,
current_user: CurrentUserContext,
document_ids: list[str],
force: bool,
) -> None:
future = self._executor.submit(
self._run_sync,
agent_run_id,
folder,
current_user,
[str(item).strip() for item in document_ids if str(item).strip()],
force,
)
self._futures[agent_run_id] = future
def shutdown(self) -> None:
self._executor.shutdown(wait=False, cancel_futures=True)
@staticmethod
def _run_sync(
agent_run_id: str,
folder: str,
current_user: CurrentUserContext,
document_ids: list[str],
force: bool,
) -> None:
session_factory = get_session_factory()
db = session_factory()
started = perf_counter()
heartbeat_stop = threading.Event()
heartbeat_thread: threading.Thread | None = None
tool_call_id = ""
knowledge_ingest: dict[str, Any] | None = None
tool_request_json = {
"agent": AgentName.HERMES.value,
"folder": folder,
"document_ids": document_ids,
"force": force,
}
try:
run_service = AgentRunService(db)
knowledge_service = KnowledgeService(db=db)
rag_service = KnowledgeRagService(db=db)
knowledge_ingest = _build_initial_knowledge_ingest_state(
knowledge_service,
document_ids=document_ids,
)
run_service.merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "indexing",
"folder": folder,
"force": force,
"heartbeat_at": datetime.now(UTC).isoformat(),
"requested_document_ids": document_ids,
"requested_by_username": current_user.username,
"requested_by_name": current_user.name,
"progress": {
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 10 if document_ids else 100,
},
"knowledge_ingest": knowledge_ingest,
},
result_summary=_build_ingest_running_summary(
knowledge_ingest,
{
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 10 if document_ids else 100,
},
),
)
tool_call = run_service.record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json=tool_request_json,
response_json={"phase": "indexing"},
status="running",
duration_ms=0,
error_message=None,
)
tool_call_id = tool_call.id
def heartbeat_worker() -> None:
while not heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
heartbeat_db = session_factory()
try:
AgentRunService(heartbeat_db).merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "indexing",
"heartbeat_at": datetime.now(UTC).isoformat(),
},
)
except Exception:
logger.exception(
"Knowledge index heartbeat update failed run_id=%s",
agent_run_id,
)
finally:
heartbeat_db.close()
heartbeat_thread = threading.Thread(
target=heartbeat_worker,
name=f"knowledge-index-heartbeat-{agent_run_id}",
daemon=True,
)
heartbeat_thread.start()
responses: list[dict[str, Any]] = []
succeeded_document_ids: list[str] = []
failed_documents: list[dict[str, str]] = []
total_documents = len(document_ids)
for index, document_id in enumerate(document_ids, start=1):
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "running",
"phase": "indexing",
"started_at": datetime.now(UTC).isoformat(),
},
event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
)
knowledge_ingest["current_document_id"] = document_id
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
try:
response = rag_service.index_documents(document_ids=[document_id], force=force)
except Exception as exc:
logger.exception(
"Knowledge document index failed run_id=%s doc_id=%s",
agent_run_id,
document_id,
)
failed_documents.append(
{
"document_id": document_id,
"status": "exception",
"error": str(exc),
}
)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": str(exc),
},
event=f"归集失败:{exc}",
level="error",
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
_refresh_ingest_graph(knowledge_ingest)
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
continue
responses.append(response)
response_failed_documents = _extract_failed_documents(response, document_id)
document_summary = _extract_document_summary(response, document_id)
if response_failed_documents:
failed_documents.extend(response_failed_documents)
error_text = (
response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
**document_summary,
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": error_text,
"track_id": str(response.get("track_id") or "").strip(),
},
event=f"LightRAG 索引失败:{error_text}",
level="error",
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
else:
succeeded_document_ids.append(document_id)
chunk_count = int(document_summary.get("chunk_count") or 0)
entity_count = int(document_summary.get("entity_count") or 0)
relation_count = int(document_summary.get("relation_count") or 0)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
**document_summary,
"status": "succeeded",
"phase": "indexed",
"finished_at": datetime.now(UTC).isoformat(),
"track_id": str(response.get("track_id") or "").strip(),
},
event=(
"LightRAG 索引完成:"
f"{chunk_count} 个 chunk{entity_count} 个实体,"
f"{relation_count} 条关系。"
),
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_INGESTED,
agent_run_id=agent_run_id,
)
_refresh_ingest_graph(knowledge_ingest)
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
failed_document_ids = [
str(item.get("document_id") or "").strip()
for item in failed_documents
if str(item.get("document_id") or "").strip()
]
duration_ms = int((perf_counter() - started) * 1000)
tool_status = "succeeded" if not failed_document_ids else "failed"
latest_track_id = _resolve_latest_track_id(responses)
knowledge_ingest["current_document_id"] = ""
knowledge_ingest["status"] = tool_status
knowledge_ingest["phase"] = "completed"
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
run_service.update_tool_call(
tool_call_id,
response_json={
"track_id": latest_track_id,
"requested_document_ids": document_ids,
"succeeded_document_ids": succeeded_document_ids,
"failed_documents": failed_documents,
"documents": knowledge_ingest.get("documents", []),
"responses": responses,
},
status=tool_status,
duration_ms=duration_ms,
error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
)
completed_documents = len(succeeded_document_ids)
failed_count = len(failed_document_ids)
total_documents = len(document_ids)
summary = (
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
if failed_count == 0
else (
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
f"失败 {failed_count} 个。"
)
)
run_service.merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "completed",
"track_id": latest_track_id,
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": total_documents,
"completed_documents": completed_documents,
"failed_documents": failed_count,
"skipped_documents": 0,
"percent": 100,
},
"knowledge_ingest": knowledge_ingest,
},
status=(
AgentRunStatus.SUCCEEDED.value
if failed_count == 0
else AgentRunStatus.FAILED.value
),
result_summary=summary,
error_message="部分文档索引失败。" if failed_count else None,
finished_at=datetime.now(UTC),
)
except Exception as exc:
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
try:
if tool_call_id:
AgentRunService(db).update_tool_call(
tool_call_id,
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
else:
AgentRunService(db).record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json=tool_request_json,
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
KnowledgeService(db=db).set_document_ingest_statuses(
_resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
if knowledge_ingest is not None:
for document_id in document_ids:
document = _find_ingest_document(knowledge_ingest, document_id)
if document is None or document.get("status") in {"succeeded", "failed"}:
continue
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": str(exc),
},
event=f"归集任务中断:{exc}",
level="error",
)
knowledge_ingest["status"] = "failed"
knowledge_ingest["phase"] = "failed"
knowledge_ingest["current_document_id"] = ""
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
route_payload: dict[str, Any] = {
"job_type": "knowledge_index_sync",
"phase": "failed",
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": len(document_ids),
"skipped_documents": 0,
"percent": 100,
},
}
if knowledge_ingest is not None:
route_payload["knowledge_ingest"] = knowledge_ingest
AgentRunService(db).merge_route_json(
agent_run_id,
route_payload,
status=AgentRunStatus.FAILED.value,
result_summary=str(exc),
error_message=str(exc),
finished_at=datetime.now(UTC),
)
except Exception:
logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
finally:
heartbeat_stop.set()
if heartbeat_thread is not None and heartbeat_thread.is_alive():
heartbeat_thread.join(timeout=1)
db.close()
def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Return the starting ``knowledge_ingest`` state with every document queued.

    All documents share one timestamp, which is also the state's
    ``started_at``. ``current_document_id`` is the first document's id, or an
    empty string when there are no documents.
    """
    queued_at = datetime.now(UTC).isoformat()
    queued_documents: list[dict[str, Any]] = []
    for doc_id in document_ids:
        queued_documents.append(
            _build_initial_knowledge_ingest_document(knowledge_service, doc_id, now=queued_at)
        )
    first_document_id = queued_documents[0]["document_id"] if queued_documents else ""
    initial_graph = _build_ingest_graph({"documents": queued_documents})
    return {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": queued_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": queued_documents,
        "graph": initial_graph,
    }
def _build_initial_knowledge_ingest_document(
knowledge_service: KnowledgeService,
document_id: str,
*,
now: str,
) -> dict[str, Any]:
try:
entry = knowledge_service.get_document_entry(document_id)
except Exception:
entry = {}
return {
"document_id": document_id,
"name": str(entry.get("original_name") or document_id).strip(),
"folder": str(entry.get("folder") or "").strip(),
"extension": str(entry.get("extension") or "").strip(),
"mime_type": str(entry.get("mime_type") or "").strip(),
"status": "queued",
"phase": "queued",
"started_at": None,
"finished_at": None,
"text_chars": 0,
"indexed_text_chars": 0,
"section_count": 0,
"sections": [],
"chunk_count": 0,
"chunk_ids": [],
"chunks": [],
"entity_count": 0,
"relation_count": 0,
"entities": [],
"entity_chunks": [],
"relations": [],
"events": [
{
"at": now,
"level": "info",
"message": "已进入知识归集队列,等待 LightRAG 处理。",
}
],
}
def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Apply ``updates`` in place to the matching document record.

    Does nothing when no record matches ``document_id``. A non-empty ``event``
    is appended to the record's event history with the given ``level``.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)
def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
events = document.get("events")
if not isinstance(events, list):
events = []
events.append(
{
"at": datetime.now(UTC).isoformat(),
"level": level,
"message": message,
}
)
document["events"] = events[-30:]
def _find_ingest_document(
knowledge_ingest: dict[str, Any],
document_id: str,
) -> dict[str, Any] | None:
for document in list(knowledge_ingest.get("documents") or []):
if not isinstance(document, dict):
continue
if str(document.get("document_id") or "").strip() == document_id:
return document
return None
def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Write the current ingest state and progress to the run's route JSON.

    Also refreshes ``heartbeat_at`` and the run's result summary while the job
    is in the "indexing" phase.
    """
    route_update = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    run_service.merge_route_json(agent_run_id, route_update, result_summary=running_summary)
def _build_ingest_running_summary(
knowledge_ingest: dict[str, Any],
progress: dict[str, int],
) -> str:
total_documents = int(progress.get("total_documents") or 0)
completed_documents = int(progress.get("completed_documents") or 0)
failed_documents = int(progress.get("failed_documents") or 0)
current_document_id = str(knowledge_ingest.get("current_document_id") or "").strip()
current_document = (
_find_ingest_document(knowledge_ingest, current_document_id)
if current_document_id
else None
)
if current_document is not None:
name = str(current_document.get("name") or current_document_id).strip()
current_index = _resolve_ingest_document_index(knowledge_ingest, current_document_id)
return (
f"知识归纳正在处理 {current_index}/{total_documents}{name}"
f"已完成 {completed_documents} 个,失败 {failed_documents} 个。"
)
return (
f"知识归纳正在运行,已完成 {completed_documents}/{total_documents} 个文档,"
f"失败 {failed_documents} 个。"
)
def _resolve_ingest_document_index(
knowledge_ingest: dict[str, Any],
document_id: str,
) -> int:
documents = [
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
]
for index, document in enumerate(documents, start=1):
if str(document.get("document_id") or "").strip() == document_id:
return index
return 0
def _build_ingest_progress(
knowledge_ingest: dict[str, Any],
total_documents: int,
) -> dict[str, int]:
documents = [
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
]
completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
failed_documents = sum(1 for item in documents if item.get("status") == "failed")
skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
done_documents = completed_documents + failed_documents + skipped_documents
if total_documents <= 0:
percent = 100
else:
percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
return {
"total_documents": total_documents,
"completed_documents": completed_documents,
"failed_documents": failed_documents,
"skipped_documents": skipped_documents,
"percent": percent,
}
def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
for item in list(response.get("document_summaries") or []):
if not isinstance(item, dict):
continue
if str(item.get("document_id") or "").strip() == document_id:
return dict(item)
return {}
def _extract_failed_documents(
response: dict[str, Any],
document_id: str,
) -> list[dict[str, str]]:
failed_documents: list[dict[str, str]] = []
for item in list(response.get("failed_documents") or []):
if not isinstance(item, dict):
continue
item_document_id = str(item.get("document_id") or "").strip()
if item_document_id and item_document_id != document_id:
continue
failed_documents.append(
{
"document_id": item_document_id or document_id,
"status": str(item.get("status") or "failed").strip(),
"error": str(item.get("error") or "LightRAG 索引失败").strip(),
}
)
return failed_documents
def _resolve_failed_ingest_document_ids(
knowledge_ingest: dict[str, Any] | None,
document_ids: list[str],
) -> list[str]:
if knowledge_ingest is None:
return document_ids
failed_document_ids: list[str] = []
seen_document_ids: set[str] = set()
for document in list(knowledge_ingest.get("documents") or []):
if not isinstance(document, dict):
continue
document_id = str(document.get("document_id") or "").strip()
if not document_id:
continue
seen_document_ids.add(document_id)
if document.get("status") != "succeeded":
failed_document_ids.append(document_id)
failed_document_ids.extend(
document_id for document_id in document_ids if document_id not in seen_document_ids
)
return failed_document_ids
def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregate graph and store it under ``knowledge_ingest["graph"]``."""
    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
    knowledge_ingest["graph"] = rebuilt_graph
def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate per-document counts, entities and relations into one graph summary.

    Counts are summed across documents. Entities and relations are
    de-duplicated and capped at 60 items each.
    """
    tracked = [
        entry for entry in list(knowledge_ingest.get("documents") or []) if isinstance(entry, dict)
    ]
    raw_entities: list[Any] = []
    for entry in tracked:
        raw_entities.extend(list(entry.get("entities") or []))
    unique_entities = _dedupe_entities(raw_entities)
    raw_relations: list[Any] = []
    for entry in tracked:
        raw_relations.extend(list(entry.get("relations") or []))
    unique_relations = _dedupe_relations(raw_relations)
    chunk_total = 0
    entity_total = 0
    relation_total = 0
    for entry in tracked:
        chunk_total += _to_int(entry.get("chunk_count"))
        entity_total += _to_int(entry.get("entity_count"))
        relation_total += _to_int(entry.get("relation_count"))
    return {
        "chunk_count": chunk_total,
        "entity_count": entity_total,
        "relation_count": relation_total,
        "entities": unique_entities[:60],
        "relations": unique_relations[:60],
    }
def _dedupe_entities(items: Any) -> list[dict[str, Any]]:
deduped: list[dict[str, Any]] = []
seen: set[str] = set()
for item in items:
if isinstance(item, dict):
name = str(
item.get("name")
or item.get("entity")
or item.get("entity_id")
or item.get("title")
or item.get("id")
or ""
).strip()
entity = dict(item)
else:
name = str(item or "").strip()
entity = {}
if not name or name in seen:
continue
seen.add(name)
entity["name"] = name
entity["type"] = str(
entity.get("type")
or entity.get("entity_type")
or entity.get("category")
or entity.get("kind")
or "实体"
).strip()
description = str(entity.get("description") or "").strip()
descriptions = entity.get("descriptions")
if not isinstance(descriptions, list):
descriptions = [description] if description else []
entity["description"] = description
entity["descriptions"] = [
str(description_item or "").strip()
for description_item in descriptions
if str(description_item or "").strip()
][:5]
if not isinstance(entity.get("properties"), dict):
entity["properties"] = {}
deduped.append(entity)
return deduped
def _dedupe_relations(items: Any) -> list[dict[str, Any]]:
deduped: list[dict[str, Any]] = []
seen: set[tuple[str, str, str]] = set()
for item in items:
if not isinstance(item, dict):
continue
source = str(item.get("source") or "").strip()
target = str(item.get("target") or "").strip()
relation_type = str(item.get("type") or "关联").strip()
key = (source, target, relation_type)
if not source or not target or key in seen:
continue
seen.add(key)
deduped.append({**item, "source": source, "target": target, "type": relation_type})
return deduped
def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
for response in reversed(responses):
track_id = str(response.get("track_id") or "").strip()
if track_id:
return track_id
return ""
def _to_int(value: Any) -> int:
try:
return int(value or 0)
except (TypeError, ValueError):
return 0
# Module-level task manager instance, created at import time (this also creates its executor).
knowledge_index_task_manager = KnowledgeIndexTaskManager()