2026-05-17 08:38:41 +00:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-05-18 02:49:39 +00:00
|
|
|
|
import threading
|
2026-05-17 08:38:41 +00:00
|
|
|
|
from concurrent.futures import Future, ThreadPoolExecutor
|
|
|
|
|
|
from datetime import UTC, datetime
|
|
|
|
|
|
from time import perf_counter
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
from app.api.deps import CurrentUserContext
|
|
|
|
|
|
from app.core.agent_enums import AgentName, AgentRunStatus, AgentToolType
|
|
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
|
|
from app.db.session import get_session_factory
|
|
|
|
|
|
from app.services.agent_runs import AgentRunService
|
|
|
|
|
|
from app.services.knowledge import (
|
|
|
|
|
|
KNOWLEDGE_INGEST_STATUS_FAILED,
|
|
|
|
|
|
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
|
|
|
|
|
KnowledgeService,
|
|
|
|
|
|
)
|
|
|
|
|
|
from app.services.knowledge_rag import KnowledgeRagService
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger shared by the task manager and its helpers.
logger = get_logger("app.services.knowledge_index_tasks")
|
2026-05-18 02:49:39 +00:00
|
|
|
|
# Seconds the heartbeat thread waits between refreshes of the run's
# `heartbeat_at` timestamp while an indexing job is in progress.
HEARTBEAT_INTERVAL_SECONDS = 10
|
2026-05-17 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KnowledgeIndexTaskManager:
|
|
|
|
|
|
def __init__(self) -> None:
|
|
|
|
|
|
self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-index")
|
|
|
|
|
|
self._futures: dict[str, Future[Any]] = {}
|
|
|
|
|
|
|
|
|
|
|
|
def submit_sync(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
agent_run_id: str,
|
|
|
|
|
|
folder: str,
|
|
|
|
|
|
current_user: CurrentUserContext,
|
|
|
|
|
|
document_ids: list[str],
|
|
|
|
|
|
force: bool,
|
|
|
|
|
|
) -> None:
|
|
|
|
|
|
future = self._executor.submit(
|
|
|
|
|
|
self._run_sync,
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
folder,
|
|
|
|
|
|
current_user,
|
|
|
|
|
|
[str(item).strip() for item in document_ids if str(item).strip()],
|
|
|
|
|
|
force,
|
|
|
|
|
|
)
|
|
|
|
|
|
self._futures[agent_run_id] = future
|
|
|
|
|
|
|
|
|
|
|
|
def shutdown(self) -> None:
|
|
|
|
|
|
self._executor.shutdown(wait=False, cancel_futures=True)
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _run_sync(
|
|
|
|
|
|
agent_run_id: str,
|
|
|
|
|
|
folder: str,
|
|
|
|
|
|
current_user: CurrentUserContext,
|
|
|
|
|
|
document_ids: list[str],
|
|
|
|
|
|
force: bool,
|
|
|
|
|
|
) -> None:
|
|
|
|
|
|
session_factory = get_session_factory()
|
|
|
|
|
|
db = session_factory()
|
|
|
|
|
|
started = perf_counter()
|
2026-05-18 02:49:39 +00:00
|
|
|
|
heartbeat_stop = threading.Event()
|
|
|
|
|
|
heartbeat_thread: threading.Thread | None = None
|
|
|
|
|
|
tool_call_id = ""
|
2026-05-22 23:47:28 +08:00
|
|
|
|
knowledge_ingest: dict[str, Any] | None = None
|
2026-05-18 02:49:39 +00:00
|
|
|
|
tool_request_json = {
|
|
|
|
|
|
"agent": AgentName.HERMES.value,
|
|
|
|
|
|
"folder": folder,
|
|
|
|
|
|
"document_ids": document_ids,
|
|
|
|
|
|
"force": force,
|
|
|
|
|
|
}
|
2026-05-17 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
run_service = AgentRunService(db)
|
|
|
|
|
|
knowledge_service = KnowledgeService(db=db)
|
|
|
|
|
|
rag_service = KnowledgeRagService(db=db)
|
2026-05-22 23:47:28 +08:00
|
|
|
|
knowledge_ingest = _build_initial_knowledge_ingest_state(
|
|
|
|
|
|
knowledge_service,
|
|
|
|
|
|
document_ids=document_ids,
|
|
|
|
|
|
)
|
2026-05-17 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
run_service.merge_route_json(
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
"job_type": "knowledge_index_sync",
|
|
|
|
|
|
"phase": "indexing",
|
|
|
|
|
|
"folder": folder,
|
|
|
|
|
|
"force": force,
|
|
|
|
|
|
"heartbeat_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"requested_document_ids": document_ids,
|
|
|
|
|
|
"requested_by_username": current_user.username,
|
|
|
|
|
|
"requested_by_name": current_user.name,
|
|
|
|
|
|
"progress": {
|
|
|
|
|
|
"total_documents": len(document_ids),
|
|
|
|
|
|
"completed_documents": 0,
|
|
|
|
|
|
"failed_documents": 0,
|
|
|
|
|
|
"skipped_documents": 0,
|
|
|
|
|
|
"percent": 10 if document_ids else 100,
|
|
|
|
|
|
},
|
2026-05-22 23:47:28 +08:00
|
|
|
|
"knowledge_ingest": knowledge_ingest,
|
2026-05-17 08:38:41 +00:00
|
|
|
|
},
|
2026-05-22 23:47:28 +08:00
|
|
|
|
result_summary=_build_ingest_running_summary(
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
{
|
|
|
|
|
|
"total_documents": len(document_ids),
|
|
|
|
|
|
"completed_documents": 0,
|
|
|
|
|
|
"failed_documents": 0,
|
|
|
|
|
|
"skipped_documents": 0,
|
|
|
|
|
|
"percent": 10 if document_ids else 100,
|
|
|
|
|
|
},
|
|
|
|
|
|
),
|
2026-05-17 08:38:41 +00:00
|
|
|
|
)
|
2026-05-18 02:49:39 +00:00
|
|
|
|
tool_call = run_service.record_tool_call(
|
|
|
|
|
|
run_id=agent_run_id,
|
|
|
|
|
|
tool_type=AgentToolType.LLM.value,
|
|
|
|
|
|
tool_name="lightrag.index_documents",
|
|
|
|
|
|
request_json=tool_request_json,
|
|
|
|
|
|
response_json={"phase": "indexing"},
|
|
|
|
|
|
status="running",
|
|
|
|
|
|
duration_ms=0,
|
|
|
|
|
|
error_message=None,
|
|
|
|
|
|
)
|
|
|
|
|
|
tool_call_id = tool_call.id
|
|
|
|
|
|
|
|
|
|
|
|
def heartbeat_worker() -> None:
|
|
|
|
|
|
while not heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
|
|
|
|
|
|
heartbeat_db = session_factory()
|
|
|
|
|
|
try:
|
|
|
|
|
|
AgentRunService(heartbeat_db).merge_route_json(
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
"job_type": "knowledge_index_sync",
|
|
|
|
|
|
"phase": "indexing",
|
|
|
|
|
|
"heartbeat_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
},
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception(
|
|
|
|
|
|
"Knowledge index heartbeat update failed run_id=%s",
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
)
|
|
|
|
|
|
finally:
|
|
|
|
|
|
heartbeat_db.close()
|
|
|
|
|
|
|
|
|
|
|
|
heartbeat_thread = threading.Thread(
|
|
|
|
|
|
target=heartbeat_worker,
|
|
|
|
|
|
name=f"knowledge-index-heartbeat-{agent_run_id}",
|
|
|
|
|
|
daemon=True,
|
|
|
|
|
|
)
|
|
|
|
|
|
heartbeat_thread.start()
|
2026-05-17 08:38:41 +00:00
|
|
|
|
|
2026-05-22 23:47:28 +08:00
|
|
|
|
responses: list[dict[str, Any]] = []
|
|
|
|
|
|
succeeded_document_ids: list[str] = []
|
|
|
|
|
|
failed_documents: list[dict[str, str]] = []
|
|
|
|
|
|
total_documents = len(document_ids)
|
|
|
|
|
|
|
|
|
|
|
|
for index, document_id in enumerate(document_ids, start=1):
|
|
|
|
|
|
_patch_ingest_document(
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
document_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
"status": "running",
|
|
|
|
|
|
"phase": "indexing",
|
|
|
|
|
|
"started_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
},
|
|
|
|
|
|
event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
|
|
|
|
|
|
)
|
|
|
|
|
|
knowledge_ingest["current_document_id"] = document_id
|
|
|
|
|
|
_sync_ingest_route_json(
|
|
|
|
|
|
run_service,
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
response = rag_service.index_documents(document_ids=[document_id], force=force)
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.exception(
|
|
|
|
|
|
"Knowledge document index failed run_id=%s doc_id=%s",
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
document_id,
|
|
|
|
|
|
)
|
|
|
|
|
|
failed_documents.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"document_id": document_id,
|
|
|
|
|
|
"status": "exception",
|
|
|
|
|
|
"error": str(exc),
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
_patch_ingest_document(
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
document_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
"status": "failed",
|
|
|
|
|
|
"phase": "failed",
|
|
|
|
|
|
"finished_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"error": str(exc),
|
|
|
|
|
|
},
|
|
|
|
|
|
event=f"归集失败:{exc}",
|
|
|
|
|
|
level="error",
|
|
|
|
|
|
)
|
|
|
|
|
|
knowledge_service.set_document_ingest_statuses(
|
|
|
|
|
|
[document_id],
|
|
|
|
|
|
KNOWLEDGE_INGEST_STATUS_FAILED,
|
|
|
|
|
|
agent_run_id=agent_run_id,
|
|
|
|
|
|
)
|
|
|
|
|
|
_refresh_ingest_graph(knowledge_ingest)
|
|
|
|
|
|
_sync_ingest_route_json(
|
|
|
|
|
|
run_service,
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
|
|
|
|
|
)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
responses.append(response)
|
|
|
|
|
|
response_failed_documents = _extract_failed_documents(response, document_id)
|
|
|
|
|
|
document_summary = _extract_document_summary(response, document_id)
|
|
|
|
|
|
if response_failed_documents:
|
|
|
|
|
|
failed_documents.extend(response_failed_documents)
|
|
|
|
|
|
error_text = (
|
|
|
|
|
|
response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
|
|
|
|
|
|
)
|
|
|
|
|
|
_patch_ingest_document(
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
document_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
**document_summary,
|
|
|
|
|
|
"status": "failed",
|
|
|
|
|
|
"phase": "failed",
|
|
|
|
|
|
"finished_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"error": error_text,
|
|
|
|
|
|
"track_id": str(response.get("track_id") or "").strip(),
|
|
|
|
|
|
},
|
|
|
|
|
|
event=f"LightRAG 索引失败:{error_text}",
|
|
|
|
|
|
level="error",
|
|
|
|
|
|
)
|
|
|
|
|
|
knowledge_service.set_document_ingest_statuses(
|
|
|
|
|
|
[document_id],
|
|
|
|
|
|
KNOWLEDGE_INGEST_STATUS_FAILED,
|
|
|
|
|
|
agent_run_id=agent_run_id,
|
|
|
|
|
|
)
|
|
|
|
|
|
else:
|
|
|
|
|
|
succeeded_document_ids.append(document_id)
|
|
|
|
|
|
chunk_count = int(document_summary.get("chunk_count") or 0)
|
|
|
|
|
|
entity_count = int(document_summary.get("entity_count") or 0)
|
|
|
|
|
|
relation_count = int(document_summary.get("relation_count") or 0)
|
|
|
|
|
|
_patch_ingest_document(
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
document_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
**document_summary,
|
|
|
|
|
|
"status": "succeeded",
|
|
|
|
|
|
"phase": "indexed",
|
|
|
|
|
|
"finished_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"track_id": str(response.get("track_id") or "").strip(),
|
|
|
|
|
|
},
|
|
|
|
|
|
event=(
|
|
|
|
|
|
"LightRAG 索引完成:"
|
|
|
|
|
|
f"{chunk_count} 个 chunk,{entity_count} 个实体,"
|
|
|
|
|
|
f"{relation_count} 条关系。"
|
|
|
|
|
|
),
|
|
|
|
|
|
)
|
|
|
|
|
|
knowledge_service.set_document_ingest_statuses(
|
|
|
|
|
|
[document_id],
|
|
|
|
|
|
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
|
|
|
|
|
agent_run_id=agent_run_id,
|
|
|
|
|
|
)
|
|
|
|
|
|
_refresh_ingest_graph(knowledge_ingest)
|
|
|
|
|
|
_sync_ingest_route_json(
|
|
|
|
|
|
run_service,
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-05-17 08:38:41 +00:00
|
|
|
|
failed_document_ids = [
|
|
|
|
|
|
str(item.get("document_id") or "").strip()
|
|
|
|
|
|
for item in failed_documents
|
|
|
|
|
|
if str(item.get("document_id") or "").strip()
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
duration_ms = int((perf_counter() - started) * 1000)
|
|
|
|
|
|
tool_status = "succeeded" if not failed_document_ids else "failed"
|
2026-05-22 23:47:28 +08:00
|
|
|
|
latest_track_id = _resolve_latest_track_id(responses)
|
|
|
|
|
|
knowledge_ingest["current_document_id"] = ""
|
|
|
|
|
|
knowledge_ingest["status"] = tool_status
|
|
|
|
|
|
knowledge_ingest["phase"] = "completed"
|
|
|
|
|
|
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
|
|
|
|
|
|
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
|
2026-05-18 02:49:39 +00:00
|
|
|
|
heartbeat_stop.set()
|
|
|
|
|
|
if heartbeat_thread is not None:
|
|
|
|
|
|
heartbeat_thread.join(timeout=1)
|
|
|
|
|
|
run_service.update_tool_call(
|
|
|
|
|
|
tool_call_id,
|
2026-05-22 23:47:28 +08:00
|
|
|
|
response_json={
|
|
|
|
|
|
"track_id": latest_track_id,
|
|
|
|
|
|
"requested_document_ids": document_ids,
|
|
|
|
|
|
"succeeded_document_ids": succeeded_document_ids,
|
|
|
|
|
|
"failed_documents": failed_documents,
|
|
|
|
|
|
"documents": knowledge_ingest.get("documents", []),
|
|
|
|
|
|
"responses": responses,
|
|
|
|
|
|
},
|
2026-05-17 08:38:41 +00:00
|
|
|
|
status=tool_status,
|
|
|
|
|
|
duration_ms=duration_ms,
|
|
|
|
|
|
error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
completed_documents = len(succeeded_document_ids)
|
|
|
|
|
|
failed_count = len(failed_document_ids)
|
|
|
|
|
|
total_documents = len(document_ids)
|
|
|
|
|
|
summary = (
|
|
|
|
|
|
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
|
|
|
|
|
|
if failed_count == 0
|
2026-05-22 23:47:28 +08:00
|
|
|
|
else (
|
|
|
|
|
|
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
|
|
|
|
|
|
f"失败 {failed_count} 个。"
|
|
|
|
|
|
)
|
2026-05-17 08:38:41 +00:00
|
|
|
|
)
|
|
|
|
|
|
run_service.merge_route_json(
|
|
|
|
|
|
agent_run_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
"job_type": "knowledge_index_sync",
|
|
|
|
|
|
"phase": "completed",
|
2026-05-22 23:47:28 +08:00
|
|
|
|
"track_id": latest_track_id,
|
2026-05-17 08:38:41 +00:00
|
|
|
|
"heartbeat_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"progress": {
|
|
|
|
|
|
"total_documents": total_documents,
|
|
|
|
|
|
"completed_documents": completed_documents,
|
|
|
|
|
|
"failed_documents": failed_count,
|
|
|
|
|
|
"skipped_documents": 0,
|
|
|
|
|
|
"percent": 100,
|
|
|
|
|
|
},
|
2026-05-22 23:47:28 +08:00
|
|
|
|
"knowledge_ingest": knowledge_ingest,
|
2026-05-17 08:38:41 +00:00
|
|
|
|
},
|
|
|
|
|
|
status=(
|
|
|
|
|
|
AgentRunStatus.SUCCEEDED.value
|
|
|
|
|
|
if failed_count == 0
|
|
|
|
|
|
else AgentRunStatus.FAILED.value
|
|
|
|
|
|
),
|
|
|
|
|
|
result_summary=summary,
|
|
|
|
|
|
error_message="部分文档索引失败。" if failed_count else None,
|
|
|
|
|
|
finished_at=datetime.now(UTC),
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception as exc:
|
2026-05-18 02:49:39 +00:00
|
|
|
|
heartbeat_stop.set()
|
|
|
|
|
|
if heartbeat_thread is not None:
|
|
|
|
|
|
heartbeat_thread.join(timeout=1)
|
2026-05-17 08:38:41 +00:00
|
|
|
|
try:
|
2026-05-18 02:49:39 +00:00
|
|
|
|
if tool_call_id:
|
|
|
|
|
|
AgentRunService(db).update_tool_call(
|
|
|
|
|
|
tool_call_id,
|
|
|
|
|
|
response_json={"error": str(exc)},
|
|
|
|
|
|
status="failed",
|
|
|
|
|
|
duration_ms=int((perf_counter() - started) * 1000),
|
|
|
|
|
|
error_message=str(exc),
|
|
|
|
|
|
)
|
|
|
|
|
|
else:
|
|
|
|
|
|
AgentRunService(db).record_tool_call(
|
|
|
|
|
|
run_id=agent_run_id,
|
|
|
|
|
|
tool_type=AgentToolType.LLM.value,
|
|
|
|
|
|
tool_name="lightrag.index_documents",
|
|
|
|
|
|
request_json=tool_request_json,
|
|
|
|
|
|
response_json={"error": str(exc)},
|
|
|
|
|
|
status="failed",
|
|
|
|
|
|
duration_ms=int((perf_counter() - started) * 1000),
|
|
|
|
|
|
error_message=str(exc),
|
|
|
|
|
|
)
|
2026-05-17 08:38:41 +00:00
|
|
|
|
KnowledgeService(db=db).set_document_ingest_statuses(
|
2026-05-22 23:47:28 +08:00
|
|
|
|
_resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
|
2026-05-17 08:38:41 +00:00
|
|
|
|
KNOWLEDGE_INGEST_STATUS_FAILED,
|
|
|
|
|
|
agent_run_id=agent_run_id,
|
|
|
|
|
|
)
|
2026-05-22 23:47:28 +08:00
|
|
|
|
if knowledge_ingest is not None:
|
|
|
|
|
|
for document_id in document_ids:
|
|
|
|
|
|
document = _find_ingest_document(knowledge_ingest, document_id)
|
|
|
|
|
|
if document is None or document.get("status") in {"succeeded", "failed"}:
|
|
|
|
|
|
continue
|
|
|
|
|
|
_patch_ingest_document(
|
|
|
|
|
|
knowledge_ingest,
|
|
|
|
|
|
document_id,
|
|
|
|
|
|
{
|
|
|
|
|
|
"status": "failed",
|
|
|
|
|
|
"phase": "failed",
|
|
|
|
|
|
"finished_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"error": str(exc),
|
|
|
|
|
|
},
|
|
|
|
|
|
event=f"归集任务中断:{exc}",
|
|
|
|
|
|
level="error",
|
|
|
|
|
|
)
|
|
|
|
|
|
knowledge_ingest["status"] = "failed"
|
|
|
|
|
|
knowledge_ingest["phase"] = "failed"
|
|
|
|
|
|
knowledge_ingest["current_document_id"] = ""
|
|
|
|
|
|
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
|
|
|
|
|
|
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
|
|
|
|
|
|
|
|
|
|
|
|
route_payload: dict[str, Any] = {
|
|
|
|
|
|
"job_type": "knowledge_index_sync",
|
|
|
|
|
|
"phase": "failed",
|
|
|
|
|
|
"heartbeat_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"progress": {
|
|
|
|
|
|
"total_documents": len(document_ids),
|
|
|
|
|
|
"completed_documents": 0,
|
|
|
|
|
|
"failed_documents": len(document_ids),
|
|
|
|
|
|
"skipped_documents": 0,
|
|
|
|
|
|
"percent": 100,
|
|
|
|
|
|
},
|
|
|
|
|
|
}
|
|
|
|
|
|
if knowledge_ingest is not None:
|
|
|
|
|
|
route_payload["knowledge_ingest"] = knowledge_ingest
|
2026-05-17 08:38:41 +00:00
|
|
|
|
AgentRunService(db).merge_route_json(
|
|
|
|
|
|
agent_run_id,
|
2026-05-22 23:47:28 +08:00
|
|
|
|
route_payload,
|
2026-05-17 08:38:41 +00:00
|
|
|
|
status=AgentRunStatus.FAILED.value,
|
|
|
|
|
|
result_summary=str(exc),
|
|
|
|
|
|
error_message=str(exc),
|
|
|
|
|
|
finished_at=datetime.now(UTC),
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
|
|
|
|
|
|
logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
|
|
|
|
|
|
finally:
|
2026-05-18 02:49:39 +00:00
|
|
|
|
heartbeat_stop.set()
|
|
|
|
|
|
if heartbeat_thread is not None and heartbeat_thread.is_alive():
|
|
|
|
|
|
heartbeat_thread.join(timeout=1)
|
2026-05-17 08:38:41 +00:00
|
|
|
|
db.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 23:47:28 +08:00
|
|
|
|
def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Create the ingest state published when a job starts.

    Every requested document gets a ``queued`` entry stamped with one shared
    timestamp; the overall state starts as ``running`` / ``queued`` and points
    at the first document (or ``""`` when nothing was requested).
    """
    created_at = datetime.now(UTC).isoformat()
    documents: list[dict[str, Any]] = []
    for document_id in document_ids:
        documents.append(
            _build_initial_knowledge_ingest_document(
                knowledge_service,
                document_id,
                now=created_at,
            )
        )

    first_document_id = documents[0]["document_id"] if documents else ""
    state: dict[str, Any] = {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": created_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": documents,
        "graph": _build_ingest_graph({"documents": documents}),
    }
    return state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_initial_knowledge_ingest_document(
|
|
|
|
|
|
knowledge_service: KnowledgeService,
|
|
|
|
|
|
document_id: str,
|
|
|
|
|
|
*,
|
|
|
|
|
|
now: str,
|
|
|
|
|
|
) -> dict[str, Any]:
|
|
|
|
|
|
try:
|
|
|
|
|
|
entry = knowledge_service.get_document_entry(document_id)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
entry = {}
|
|
|
|
|
|
return {
|
|
|
|
|
|
"document_id": document_id,
|
|
|
|
|
|
"name": str(entry.get("original_name") or document_id).strip(),
|
|
|
|
|
|
"folder": str(entry.get("folder") or "").strip(),
|
|
|
|
|
|
"extension": str(entry.get("extension") or "").strip(),
|
|
|
|
|
|
"mime_type": str(entry.get("mime_type") or "").strip(),
|
|
|
|
|
|
"status": "queued",
|
|
|
|
|
|
"phase": "queued",
|
|
|
|
|
|
"started_at": None,
|
|
|
|
|
|
"finished_at": None,
|
|
|
|
|
|
"text_chars": 0,
|
|
|
|
|
|
"indexed_text_chars": 0,
|
|
|
|
|
|
"section_count": 0,
|
|
|
|
|
|
"sections": [],
|
|
|
|
|
|
"chunk_count": 0,
|
|
|
|
|
|
"chunk_ids": [],
|
|
|
|
|
|
"chunks": [],
|
|
|
|
|
|
"entity_count": 0,
|
|
|
|
|
|
"relation_count": 0,
|
|
|
|
|
|
"entities": [],
|
2026-05-23 19:54:42 +08:00
|
|
|
|
"entity_chunks": [],
|
2026-05-22 23:47:28 +08:00
|
|
|
|
"relations": [],
|
|
|
|
|
|
"events": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"at": now,
|
|
|
|
|
|
"level": "info",
|
|
|
|
|
|
"message": "已进入知识归集队列,等待 LightRAG 处理。",
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge ``updates`` into one document's ingest entry, in place.

    Does nothing when the document is not part of ``knowledge_ingest``. When
    ``event`` is non-empty it is appended to the entry's event log with the
    given ``level``.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
|
|
|
|
|
|
events = document.get("events")
|
|
|
|
|
|
if not isinstance(events, list):
|
|
|
|
|
|
events = []
|
|
|
|
|
|
events.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"level": level,
|
|
|
|
|
|
"message": message,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
document["events"] = events[-30:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_ingest_document(
|
|
|
|
|
|
knowledge_ingest: dict[str, Any],
|
|
|
|
|
|
document_id: str,
|
|
|
|
|
|
) -> dict[str, Any] | None:
|
|
|
|
|
|
for document in list(knowledge_ingest.get("documents") or []):
|
|
|
|
|
|
if not isinstance(document, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
if str(document.get("document_id") or "").strip() == document_id:
|
|
|
|
|
|
return document
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Publish the current ingest state and progress on the run while indexing.

    Merges an ``indexing``-phase payload with a fresh ``heartbeat_at`` into the
    run's route JSON and updates the run's result summary to the matching
    in-progress text.
    """
    payload = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    run_service.merge_route_json(agent_run_id, payload, result_summary=running_summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_ingest_running_summary(
    knowledge_ingest: dict[str, Any],
    progress: dict[str, int],
) -> str:
    """Return the in-progress summary text for a running job.

    When ``current_document_id`` names a known document, the text mentions
    that document's position and name; otherwise it only reports the
    completed / total / failed counts from ``progress``.
    """
    total = int(progress.get("total_documents") or 0)
    done = int(progress.get("completed_documents") or 0)
    failed = int(progress.get("failed_documents") or 0)

    active_id = str(knowledge_ingest.get("current_document_id") or "").strip()
    active_document = _find_ingest_document(knowledge_ingest, active_id) if active_id else None
    if active_document is None:
        return (
            f"知识归纳正在运行,已完成 {done}/{total} 个文档,"
            f"失败 {failed} 个。"
        )

    display_name = str(active_document.get("name") or active_id).strip()
    position = _resolve_ingest_document_index(knowledge_ingest, active_id)
    return (
        f"知识归纳正在处理 {position}/{total}:{display_name}。"
        f"已完成 {done} 个,失败 {failed} 个。"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_ingest_document_index(
|
|
|
|
|
|
knowledge_ingest: dict[str, Any],
|
|
|
|
|
|
document_id: str,
|
|
|
|
|
|
) -> int:
|
|
|
|
|
|
documents = [
|
|
|
|
|
|
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
|
|
|
|
|
|
]
|
|
|
|
|
|
for index, document in enumerate(documents, start=1):
|
|
|
|
|
|
if str(document.get("document_id") or "").strip() == document_id:
|
|
|
|
|
|
return index
|
|
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_ingest_progress(
|
|
|
|
|
|
knowledge_ingest: dict[str, Any],
|
|
|
|
|
|
total_documents: int,
|
|
|
|
|
|
) -> dict[str, int]:
|
|
|
|
|
|
documents = [
|
|
|
|
|
|
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
|
|
|
|
|
|
]
|
|
|
|
|
|
completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
|
|
|
|
|
|
failed_documents = sum(1 for item in documents if item.get("status") == "failed")
|
|
|
|
|
|
skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
|
|
|
|
|
|
done_documents = completed_documents + failed_documents + skipped_documents
|
|
|
|
|
|
if total_documents <= 0:
|
|
|
|
|
|
percent = 100
|
|
|
|
|
|
else:
|
|
|
|
|
|
percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
|
|
|
|
|
|
return {
|
|
|
|
|
|
"total_documents": total_documents,
|
|
|
|
|
|
"completed_documents": completed_documents,
|
|
|
|
|
|
"failed_documents": failed_documents,
|
|
|
|
|
|
"skipped_documents": skipped_documents,
|
|
|
|
|
|
"percent": percent,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
|
|
|
|
|
|
for item in list(response.get("document_summaries") or []):
|
|
|
|
|
|
if not isinstance(item, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
if str(item.get("document_id") or "").strip() == document_id:
|
|
|
|
|
|
return dict(item)
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_failed_documents(
|
|
|
|
|
|
response: dict[str, Any],
|
|
|
|
|
|
document_id: str,
|
|
|
|
|
|
) -> list[dict[str, str]]:
|
|
|
|
|
|
failed_documents: list[dict[str, str]] = []
|
|
|
|
|
|
for item in list(response.get("failed_documents") or []):
|
|
|
|
|
|
if not isinstance(item, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
item_document_id = str(item.get("document_id") or "").strip()
|
|
|
|
|
|
if item_document_id and item_document_id != document_id:
|
|
|
|
|
|
continue
|
|
|
|
|
|
failed_documents.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"document_id": item_document_id or document_id,
|
|
|
|
|
|
"status": str(item.get("status") or "failed").strip(),
|
|
|
|
|
|
"error": str(item.get("error") or "LightRAG 索引失败").strip(),
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
return failed_documents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_failed_ingest_document_ids(
|
|
|
|
|
|
knowledge_ingest: dict[str, Any] | None,
|
|
|
|
|
|
document_ids: list[str],
|
|
|
|
|
|
) -> list[str]:
|
|
|
|
|
|
if knowledge_ingest is None:
|
|
|
|
|
|
return document_ids
|
|
|
|
|
|
failed_document_ids: list[str] = []
|
|
|
|
|
|
seen_document_ids: set[str] = set()
|
|
|
|
|
|
for document in list(knowledge_ingest.get("documents") or []):
|
|
|
|
|
|
if not isinstance(document, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
document_id = str(document.get("document_id") or "").strip()
|
|
|
|
|
|
if not document_id:
|
|
|
|
|
|
continue
|
|
|
|
|
|
seen_document_ids.add(document_id)
|
|
|
|
|
|
if document.get("status") != "succeeded":
|
|
|
|
|
|
failed_document_ids.append(document_id)
|
|
|
|
|
|
failed_document_ids.extend(
|
|
|
|
|
|
document_id for document_id in document_ids if document_id not in seen_document_ids
|
|
|
|
|
|
)
|
|
|
|
|
|
return failed_document_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregate ``graph`` entry from the current documents, in place."""
    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
    knowledge_ingest["graph"] = rebuilt_graph
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate per-document graph data into one summary.

    Counts are the sums of each document's ``chunk_count`` / ``entity_count`` /
    ``relation_count``. Entities and relations are pooled across documents,
    de-duplicated, and truncated to the first 60 of each.
    """
    documents = [
        entry for entry in list(knowledge_ingest.get("documents") or []) if isinstance(entry, dict)
    ]

    pooled_entities: list[Any] = []
    for document in documents:
        pooled_entities.extend(list(document.get("entities") or []))
    entities = _dedupe_entities(pooled_entities)

    pooled_relations: list[Any] = []
    for document in documents:
        pooled_relations.extend(list(document.get("relations") or []))
    relations = _dedupe_relations(pooled_relations)

    def _total(field: str) -> int:
        # Sum one numeric counter across all documents.
        return sum(_to_int(document.get(field)) for document in documents)

    return {
        "chunk_count": _total("chunk_count"),
        "entity_count": _total("entity_count"),
        "relation_count": _total("relation_count"),
        "entities": entities[:60],
        "relations": relations[:60],
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-23 19:54:42 +08:00
|
|
|
|
def _dedupe_entities(items: Any) -> list[dict[str, Any]]:
|
|
|
|
|
|
deduped: list[dict[str, Any]] = []
|
2026-05-22 23:47:28 +08:00
|
|
|
|
seen: set[str] = set()
|
|
|
|
|
|
for item in items:
|
2026-05-23 19:54:42 +08:00
|
|
|
|
if isinstance(item, dict):
|
|
|
|
|
|
name = str(
|
|
|
|
|
|
item.get("name")
|
|
|
|
|
|
or item.get("entity")
|
|
|
|
|
|
or item.get("entity_id")
|
|
|
|
|
|
or item.get("title")
|
|
|
|
|
|
or item.get("id")
|
|
|
|
|
|
or ""
|
|
|
|
|
|
).strip()
|
|
|
|
|
|
entity = dict(item)
|
|
|
|
|
|
else:
|
|
|
|
|
|
name = str(item or "").strip()
|
|
|
|
|
|
entity = {}
|
|
|
|
|
|
if not name or name in seen:
|
2026-05-22 23:47:28 +08:00
|
|
|
|
continue
|
2026-05-23 19:54:42 +08:00
|
|
|
|
seen.add(name)
|
|
|
|
|
|
entity["name"] = name
|
|
|
|
|
|
entity["type"] = str(
|
|
|
|
|
|
entity.get("type")
|
|
|
|
|
|
or entity.get("entity_type")
|
|
|
|
|
|
or entity.get("category")
|
|
|
|
|
|
or entity.get("kind")
|
|
|
|
|
|
or "实体"
|
|
|
|
|
|
).strip()
|
|
|
|
|
|
description = str(entity.get("description") or "").strip()
|
|
|
|
|
|
descriptions = entity.get("descriptions")
|
|
|
|
|
|
if not isinstance(descriptions, list):
|
|
|
|
|
|
descriptions = [description] if description else []
|
|
|
|
|
|
entity["description"] = description
|
|
|
|
|
|
entity["descriptions"] = [
|
|
|
|
|
|
str(description_item or "").strip()
|
|
|
|
|
|
for description_item in descriptions
|
|
|
|
|
|
if str(description_item or "").strip()
|
|
|
|
|
|
][:5]
|
|
|
|
|
|
if not isinstance(entity.get("properties"), dict):
|
|
|
|
|
|
entity["properties"] = {}
|
|
|
|
|
|
deduped.append(entity)
|
2026-05-22 23:47:28 +08:00
|
|
|
|
return deduped
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-23 19:54:42 +08:00
|
|
|
|
def _dedupe_relations(items: Any) -> list[dict[str, Any]]:
|
|
|
|
|
|
deduped: list[dict[str, Any]] = []
|
2026-05-22 23:47:28 +08:00
|
|
|
|
seen: set[tuple[str, str, str]] = set()
|
|
|
|
|
|
for item in items:
|
|
|
|
|
|
if not isinstance(item, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
source = str(item.get("source") or "").strip()
|
|
|
|
|
|
target = str(item.get("target") or "").strip()
|
|
|
|
|
|
relation_type = str(item.get("type") or "关联").strip()
|
|
|
|
|
|
key = (source, target, relation_type)
|
|
|
|
|
|
if not source or not target or key in seen:
|
|
|
|
|
|
continue
|
|
|
|
|
|
seen.add(key)
|
2026-05-23 19:54:42 +08:00
|
|
|
|
deduped.append({**item, "source": source, "target": target, "type": relation_type})
|
2026-05-22 23:47:28 +08:00
|
|
|
|
return deduped
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
|
|
|
|
|
|
for response in reversed(responses):
|
|
|
|
|
|
track_id = str(response.get("track_id") or "").strip()
|
|
|
|
|
|
if track_id:
|
|
|
|
|
|
return track_id
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _to_int(value: Any) -> int:
|
|
|
|
|
|
try:
|
|
|
|
|
|
return int(value or 0)
|
|
|
|
|
|
except (TypeError, ValueError):
|
|
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-17 08:38:41 +00:00
|
|
|
|
# Module-level manager instance; constructing it here creates the
# single-worker executor when this module is imported.
knowledge_index_task_manager = KnowledgeIndexTaskManager()
|