"""Background task runner that syncs knowledge documents into LightRAG.

A single-worker thread pool executes one indexing job at a time.  While a job
runs, its progress is mirrored into the agent run's route JSON (see
``AgentRunService.merge_route_json``) and a side thread refreshes
``heartbeat_at`` every ``HEARTBEAT_INTERVAL_SECONDS`` seconds.
"""

from __future__ import annotations

import threading
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import UTC, datetime
from time import perf_counter
from typing import Any

from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentName, AgentRunStatus, AgentToolType
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.agent_runs import AgentRunService
from app.services.knowledge import (
    KNOWLEDGE_INGEST_STATUS_FAILED,
    KNOWLEDGE_INGEST_STATUS_INGESTED,
    KnowledgeService,
)
from app.services.knowledge_rag import KnowledgeRagService

logger = get_logger("app.services.knowledge_index_tasks")

# Seconds between two ``heartbeat_at`` refreshes written by the heartbeat thread.
HEARTBEAT_INTERVAL_SECONDS = 10


class KnowledgeIndexTaskManager:
    """Queue knowledge-index jobs on a single background worker thread."""

    def __init__(self) -> None:
        # max_workers=1: jobs submitted to this manager run strictly one at a time.
        self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-index")
        # In-flight futures keyed by agent run id; entries are dropped once the job ends.
        self._futures: dict[str, Future[Any]] = {}

    def submit_sync(
        self,
        *,
        agent_run_id: str,
        folder: str,
        current_user: CurrentUserContext,
        document_ids: list[str],
        force: bool,
    ) -> None:
        """Queue an indexing job for ``document_ids`` and return immediately.

        Args:
            agent_run_id: Agent run whose route JSON / tool call track the job.
            folder: Folder label recorded in the route JSON and tool request.
            current_user: Requesting user; ``username`` and ``name`` are recorded.
            document_ids: Documents to index; entries are stringified, stripped
                and blank ones are discarded before the job is queued.
            force: Forwarded to ``KnowledgeRagService.index_documents``.
        """
        future = self._executor.submit(
            self._run_sync,
            agent_run_id,
            folder,
            current_user,
            [str(item).strip() for item in document_ids if str(item).strip()],
            force,
        )
        # Register first, then attach the callback: if the job has already
        # finished, ``add_done_callback`` fires immediately and must find the entry.
        self._futures[agent_run_id] = future
        future.add_done_callback(lambda done: self._on_task_done(agent_run_id, done))

    def shutdown(self) -> None:
        """Stop accepting jobs and cancel queued ones without waiting for the running job."""
        self._executor.shutdown(wait=False, cancel_futures=True)

    def _on_task_done(self, agent_run_id: str, future: Future[Any]) -> None:
        """Forget a finished job and surface any exception that escaped ``_run_sync``."""
        # Only drop the entry if it still belongs to this future; the same run id
        # may have been resubmitted in the meantime.
        if self._futures.get(agent_run_id) is future:
            self._futures.pop(agent_run_id, None)
        if future.cancelled():
            return
        error = future.exception()
        if error is not None:
            # ``_run_sync`` handles ``Exception`` raised inside its ``try``; anything
            # reaching this point would otherwise stay hidden inside the future.
            logger.error(
                "Knowledge index task crashed run_id=%s",
                agent_run_id,
                exc_info=error,
            )

    @staticmethod
    def _run_sync(
        agent_run_id: str,
        folder: str,
        current_user: CurrentUserContext,
        document_ids: list[str],
        force: bool,
    ) -> None:
        """Index ``document_ids`` one by one and persist progress for the agent run.

        The job opens its own DB session, records a ``lightrag.index_documents``
        tool call, starts a heartbeat thread, then indexes each document with an
        individual ``index_documents`` call.  A failure of one document is
        recorded and the loop continues; any other exception marks the whole run
        as failed.  Exceptions raised inside the ``try`` are logged, not re-raised.
        """
        session_factory = get_session_factory()
        db = session_factory()
        started = perf_counter()
        heartbeat_stop = threading.Event()
        heartbeat_thread: threading.Thread | None = None
        tool_call_id = ""
        knowledge_ingest: dict[str, Any] | None = None
        tool_request_json = {
            "agent": AgentName.HERMES.value,
            "folder": folder,
            "document_ids": document_ids,
            "force": force,
        }
        try:
            run_service = AgentRunService(db)
            knowledge_service = KnowledgeService(db=db)
            rag_service = KnowledgeRagService(db=db)
            knowledge_ingest = _build_initial_knowledge_ingest_state(
                knowledge_service,
                document_ids=document_ids,
            )
            # Same snapshot feeds both the route JSON and the running summary.
            initial_progress = {
                "total_documents": len(document_ids),
                "completed_documents": 0,
                "failed_documents": 0,
                "skipped_documents": 0,
                "percent": 10 if document_ids else 100,
            }
            run_service.merge_route_json(
                agent_run_id,
                {
                    "job_type": "knowledge_index_sync",
                    "phase": "indexing",
                    "folder": folder,
                    "force": force,
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "requested_document_ids": document_ids,
                    "requested_by_username": current_user.username,
                    "requested_by_name": current_user.name,
                    "progress": initial_progress,
                    "knowledge_ingest": knowledge_ingest,
                },
                result_summary=_build_ingest_running_summary(knowledge_ingest, initial_progress),
            )
            tool_call = run_service.record_tool_call(
                run_id=agent_run_id,
                tool_type=AgentToolType.LLM.value,
                tool_name="lightrag.index_documents",
                request_json=tool_request_json,
                response_json={"phase": "indexing"},
                status="running",
                duration_ms=0,
                error_message=None,
            )
            tool_call_id = tool_call.id

            def heartbeat_worker() -> None:
                # ``wait`` returns True once the stop event is set, ending the loop.
                while not heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
                    # Separate session: the main thread keeps using ``db``.
                    heartbeat_db = session_factory()
                    try:
                        AgentRunService(heartbeat_db).merge_route_json(
                            agent_run_id,
                            {
                                "job_type": "knowledge_index_sync",
                                "phase": "indexing",
                                "heartbeat_at": datetime.now(UTC).isoformat(),
                            },
                        )
                    except Exception:
                        logger.exception(
                            "Knowledge index heartbeat update failed run_id=%s",
                            agent_run_id,
                        )
                    finally:
                        heartbeat_db.close()

            heartbeat_thread = threading.Thread(
                target=heartbeat_worker,
                name=f"knowledge-index-heartbeat-{agent_run_id}",
                daemon=True,
            )
            heartbeat_thread.start()

            responses: list[dict[str, Any]] = []
            succeeded_document_ids: list[str] = []
            failed_documents: list[dict[str, str]] = []
            total_documents = len(document_ids)
            for index, document_id in enumerate(document_ids, start=1):
                _patch_ingest_document(
                    knowledge_ingest,
                    document_id,
                    {
                        "status": "running",
                        "phase": "indexing",
                        "started_at": datetime.now(UTC).isoformat(),
                    },
                    event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
                )
                knowledge_ingest["current_document_id"] = document_id
                _sync_ingest_route_json(
                    run_service,
                    agent_run_id,
                    knowledge_ingest,
                    progress=_build_ingest_progress(knowledge_ingest, total_documents),
                )
                try:
                    response = rag_service.index_documents(document_ids=[document_id], force=force)
                except Exception as exc:
                    # One document failing must not abort the remaining ones.
                    logger.exception(
                        "Knowledge document index failed run_id=%s doc_id=%s",
                        agent_run_id,
                        document_id,
                    )
                    failed_documents.append(
                        {
                            "document_id": document_id,
                            "status": "exception",
                            "error": str(exc),
                        }
                    )
                    _patch_ingest_document(
                        knowledge_ingest,
                        document_id,
                        {
                            "status": "failed",
                            "phase": "failed",
                            "finished_at": datetime.now(UTC).isoformat(),
                            "error": str(exc),
                        },
                        event=f"归集失败:{exc}",
                        level="error",
                    )
                    knowledge_service.set_document_ingest_statuses(
                        [document_id],
                        KNOWLEDGE_INGEST_STATUS_FAILED,
                        agent_run_id=agent_run_id,
                    )
                    _refresh_ingest_graph(knowledge_ingest)
                    _sync_ingest_route_json(
                        run_service,
                        agent_run_id,
                        knowledge_ingest,
                        progress=_build_ingest_progress(knowledge_ingest, total_documents),
                    )
                    continue

                responses.append(response)
                response_failed_documents = _extract_failed_documents(response, document_id)
                document_summary = _extract_document_summary(response, document_id)
                if response_failed_documents:
                    failed_documents.extend(response_failed_documents)
                    error_text = (
                        response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
                    )
                    _patch_ingest_document(
                        knowledge_ingest,
                        document_id,
                        {
                            **document_summary,
                            "status": "failed",
                            "phase": "failed",
                            "finished_at": datetime.now(UTC).isoformat(),
                            "error": error_text,
                            "track_id": str(response.get("track_id") or "").strip(),
                        },
                        event=f"LightRAG 索引失败:{error_text}",
                        level="error",
                    )
                    knowledge_service.set_document_ingest_statuses(
                        [document_id],
                        KNOWLEDGE_INGEST_STATUS_FAILED,
                        agent_run_id=agent_run_id,
                    )
                else:
                    succeeded_document_ids.append(document_id)
                    # ``_to_int`` instead of ``int``: a malformed count in an otherwise
                    # successful response must not abort the whole run.
                    chunk_count = _to_int(document_summary.get("chunk_count"))
                    entity_count = _to_int(document_summary.get("entity_count"))
                    relation_count = _to_int(document_summary.get("relation_count"))
                    _patch_ingest_document(
                        knowledge_ingest,
                        document_id,
                        {
                            **document_summary,
                            "status": "succeeded",
                            "phase": "indexed",
                            "finished_at": datetime.now(UTC).isoformat(),
                            "track_id": str(response.get("track_id") or "").strip(),
                        },
                        event=(
                            "LightRAG 索引完成:"
                            f"{chunk_count} 个 chunk,{entity_count} 个实体,"
                            f"{relation_count} 条关系。"
                        ),
                    )
                    knowledge_service.set_document_ingest_statuses(
                        [document_id],
                        KNOWLEDGE_INGEST_STATUS_INGESTED,
                        agent_run_id=agent_run_id,
                    )
                _refresh_ingest_graph(knowledge_ingest)
                _sync_ingest_route_json(
                    run_service,
                    agent_run_id,
                    knowledge_ingest,
                    progress=_build_ingest_progress(knowledge_ingest, total_documents),
                )

            # De-duplicate (order-preserving): one document may contribute several
            # failure entries, but it must count as a single failed document.
            failed_document_ids = list(
                dict.fromkeys(
                    str(item.get("document_id") or "").strip()
                    for item in failed_documents
                    if str(item.get("document_id") or "").strip()
                )
            )
            duration_ms = int((perf_counter() - started) * 1000)
            tool_status = "succeeded" if not failed_document_ids else "failed"
            latest_track_id = _resolve_latest_track_id(responses)
            knowledge_ingest["current_document_id"] = ""
            knowledge_ingest["status"] = tool_status
            knowledge_ingest["phase"] = "completed"
            knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
            knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
            # Stop the heartbeat before writing the final state.
            # NOTE(review): the join is bounded to 1s, so a heartbeat write that is
            # still in flight could presumably land after the final merge — confirm.
            heartbeat_stop.set()
            if heartbeat_thread is not None:
                heartbeat_thread.join(timeout=1)
            run_service.update_tool_call(
                tool_call_id,
                response_json={
                    "track_id": latest_track_id,
                    "requested_document_ids": document_ids,
                    "succeeded_document_ids": succeeded_document_ids,
                    "failed_documents": failed_documents,
                    "documents": knowledge_ingest.get("documents", []),
                    "responses": responses,
                },
                status=tool_status,
                duration_ms=duration_ms,
                error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
            )
            completed_documents = len(succeeded_document_ids)
            failed_count = len(failed_document_ids)
            total_documents = len(document_ids)
            summary = (
                f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
                if failed_count == 0
                else (
                    f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
                    f"失败 {failed_count} 个。"
                )
            )
            run_service.merge_route_json(
                agent_run_id,
                {
                    "job_type": "knowledge_index_sync",
                    "phase": "completed",
                    "track_id": latest_track_id,
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "progress": {
                        "total_documents": total_documents,
                        "completed_documents": completed_documents,
                        "failed_documents": failed_count,
                        "skipped_documents": 0,
                        "percent": 100,
                    },
                    "knowledge_ingest": knowledge_ingest,
                },
                status=(
                    AgentRunStatus.SUCCEEDED.value
                    if failed_count == 0
                    else AgentRunStatus.FAILED.value
                ),
                result_summary=summary,
                error_message="部分文档索引失败。" if failed_count else None,
                finished_at=datetime.now(UTC),
            )
        except Exception as exc:
            # Log the root cause first so it is recorded even if finalization fails.
            logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
            heartbeat_stop.set()
            if heartbeat_thread is not None:
                heartbeat_thread.join(timeout=1)
            try:
                if tool_call_id:
                    AgentRunService(db).update_tool_call(
                        tool_call_id,
                        response_json={"error": str(exc)},
                        status="failed",
                        duration_ms=int((perf_counter() - started) * 1000),
                        error_message=str(exc),
                    )
                else:
                    # The failure happened before the tool call was recorded.
                    AgentRunService(db).record_tool_call(
                        run_id=agent_run_id,
                        tool_type=AgentToolType.LLM.value,
                        tool_name="lightrag.index_documents",
                        request_json=tool_request_json,
                        response_json={"error": str(exc)},
                        status="failed",
                        duration_ms=int((perf_counter() - started) * 1000),
                        error_message=str(exc),
                    )
                # Every document that did not succeed is treated as failed.
                failed_ids = _resolve_failed_ingest_document_ids(knowledge_ingest, document_ids)
                KnowledgeService(db=db).set_document_ingest_statuses(
                    failed_ids,
                    KNOWLEDGE_INGEST_STATUS_FAILED,
                    agent_run_id=agent_run_id,
                )
                if knowledge_ingest is not None:
                    for document_id in document_ids:
                        document = _find_ingest_document(knowledge_ingest, document_id)
                        if document is None or document.get("status") in {"succeeded", "failed"}:
                            continue
                        _patch_ingest_document(
                            knowledge_ingest,
                            document_id,
                            {
                                "status": "failed",
                                "phase": "failed",
                                "finished_at": datetime.now(UTC).isoformat(),
                                "error": str(exc),
                            },
                            event=f"归集任务中断:{exc}",
                            level="error",
                        )
                    knowledge_ingest["status"] = "failed"
                    knowledge_ingest["phase"] = "failed"
                    knowledge_ingest["current_document_id"] = ""
                    knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
                    knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
                # Derive the counts from the same id list that was just marked failed,
                # so documents indexed before the interruption stay "completed".
                total_count = len(document_ids)
                failed_total = min(len(failed_ids), total_count)
                route_payload: dict[str, Any] = {
                    "job_type": "knowledge_index_sync",
                    "phase": "failed",
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "progress": {
                        "total_documents": total_count,
                        "completed_documents": total_count - failed_total,
                        "failed_documents": failed_total,
                        "skipped_documents": 0,
                        "percent": 100,
                    },
                }
                if knowledge_ingest is not None:
                    route_payload["knowledge_ingest"] = knowledge_ingest
                AgentRunService(db).merge_route_json(
                    agent_run_id,
                    route_payload,
                    status=AgentRunStatus.FAILED.value,
                    result_summary=str(exc),
                    error_message=str(exc),
                    finished_at=datetime.now(UTC),
                )
            except Exception:
                logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
        finally:
            heartbeat_stop.set()
            if heartbeat_thread is not None and heartbeat_thread.is_alive():
                heartbeat_thread.join(timeout=1)
            db.close()


def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Return the initial ``knowledge_ingest`` payload with every document queued."""
    now = datetime.now(UTC).isoformat()
    documents = [
        _build_initial_knowledge_ingest_document(knowledge_service, document_id, now=now)
        for document_id in document_ids
    ]
    return {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": now,
        "finished_at": None,
        "current_document_id": documents[0]["document_id"] if documents else "",
        "documents": documents,
        "graph": _build_ingest_graph({"documents": documents}),
    }


def _build_initial_knowledge_ingest_document(
    knowledge_service: KnowledgeService,
    document_id: str,
    *,
    now: str,
) -> dict[str, Any]:
    """Return the queued-state record for one document.

    Metadata lookup is best effort: if ``get_document_entry`` raises or returns
    nothing, the record falls back to the document id and empty fields.
    """
    try:
        entry = knowledge_service.get_document_entry(document_id) or {}
    except Exception:
        entry = {}
    return {
        "document_id": document_id,
        "name": str(entry.get("original_name") or document_id).strip(),
        "folder": str(entry.get("folder") or "").strip(),
        "extension": str(entry.get("extension") or "").strip(),
        "mime_type": str(entry.get("mime_type") or "").strip(),
        "status": "queued",
        "phase": "queued",
        "started_at": None,
        "finished_at": None,
        "text_chars": 0,
        "indexed_text_chars": 0,
        "section_count": 0,
        "sections": [],
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "entity_chunks": [],
        "relations": [],
        "events": [
            {
                "at": now,
                "level": "info",
                "message": "已进入知识归集队列,等待 LightRAG 处理。",
            }
        ],
    }


def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge ``updates`` into the document's record and optionally append an event.

    Does nothing when ``document_id`` has no record in ``knowledge_ingest``.
    """
    document = _find_ingest_document(knowledge_ingest, document_id)
    if document is None:
        return
    document.update(updates)
    if event:
        _append_ingest_event(document, event, level=level)


def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
    """Append a timestamped event to ``document``, keeping only the latest 30."""
    events = document.get("events")
    if not isinstance(events, list):
        events = []
    events.append(
        {
            "at": datetime.now(UTC).isoformat(),
            "level": level,
            "message": message,
        }
    )
    document["events"] = events[-30:]


def _find_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
) -> dict[str, Any] | None:
    """Return the first document record whose id equals ``document_id``, else ``None``."""
    for document in list(knowledge_ingest.get("documents") or []):
        if not isinstance(document, dict):
            continue
        if str(document.get("document_id") or "").strip() == document_id:
            return document
    return None


def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Write the current ingest state, progress and running summary to the run."""
    run_service.merge_route_json(
        agent_run_id,
        {
            "job_type": "knowledge_index_sync",
            "phase": "indexing",
            "heartbeat_at": datetime.now(UTC).isoformat(),
            "progress": progress,
            "knowledge_ingest": knowledge_ingest,
        },
        result_summary=_build_ingest_running_summary(knowledge_ingest, progress),
    )


def _build_ingest_running_summary(
    knowledge_ingest: dict[str, Any],
    progress: dict[str, int],
) -> str:
    """Return the human-readable status line shown while the job is running."""
    total_documents = int(progress.get("total_documents") or 0)
    completed_documents = int(progress.get("completed_documents") or 0)
    failed_documents = int(progress.get("failed_documents") or 0)
    current_document_id = str(knowledge_ingest.get("current_document_id") or "").strip()
    current_document = (
        _find_ingest_document(knowledge_ingest, current_document_id)
        if current_document_id
        else None
    )
    if current_document is not None:
        name = str(current_document.get("name") or current_document_id).strip()
        current_index = _resolve_ingest_document_index(knowledge_ingest, current_document_id)
        return (
            f"知识归纳正在处理 {current_index}/{total_documents}:{name}。"
            f"已完成 {completed_documents} 个,失败 {failed_documents} 个。"
        )
    return (
        f"知识归纳正在运行,已完成 {completed_documents}/{total_documents} 个文档,"
        f"失败 {failed_documents} 个。"
    )


def _resolve_ingest_document_index(
    knowledge_ingest: dict[str, Any],
    document_id: str,
) -> int:
    """Return the 1-based position of ``document_id`` among the records, or 0 if absent."""
    documents = [
        item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
    ]
    for index, document in enumerate(documents, start=1):
        if str(document.get("document_id") or "").strip() == document_id:
            return index
    return 0


def _build_ingest_progress(
    knowledge_ingest: dict[str, Any],
    total_documents: int,
) -> dict[str, int]:
    """Count documents per terminal status and derive a running percentage.

    While running, the percentage stays within 10..95; it is 100 only when
    ``total_documents`` is not positive.
    """
    documents = [
        item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
    ]
    completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
    failed_documents = sum(1 for item in documents if item.get("status") == "failed")
    skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
    done_documents = completed_documents + failed_documents + skipped_documents
    if total_documents <= 0:
        percent = 100
    else:
        percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
    return {
        "total_documents": total_documents,
        "completed_documents": completed_documents,
        "failed_documents": failed_documents,
        "skipped_documents": skipped_documents,
        "percent": percent,
    }


def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
    """Return a copy of the ``document_summaries`` entry for ``document_id``, or ``{}``."""
    for item in list(response.get("document_summaries") or []):
        if not isinstance(item, dict):
            continue
        if str(item.get("document_id") or "").strip() == document_id:
            return dict(item)
    return {}


def _extract_failed_documents(
    response: dict[str, Any],
    document_id: str,
) -> list[dict[str, str]]:
    """Return normalized ``failed_documents`` entries that apply to ``document_id``.

    Entries carrying a different document id are skipped; entries without an id
    are attributed to ``document_id``.
    """
    failed_documents: list[dict[str, str]] = []
    for item in list(response.get("failed_documents") or []):
        if not isinstance(item, dict):
            continue
        item_document_id = str(item.get("document_id") or "").strip()
        if item_document_id and item_document_id != document_id:
            continue
        failed_documents.append(
            {
                "document_id": item_document_id or document_id,
                "status": str(item.get("status") or "failed").strip(),
                "error": str(item.get("error") or "LightRAG 索引失败").strip(),
            }
        )
    return failed_documents


def _resolve_failed_ingest_document_ids(
    knowledge_ingest: dict[str, Any] | None,
    document_ids: list[str],
) -> list[str]:
    """Return the ids to mark as failed when the whole job is interrupted.

    Without ingest state every requested id is returned.  Otherwise the result
    is every recorded document whose status is not ``"succeeded"``, followed by
    requested ids that have no record at all.
    """
    if knowledge_ingest is None:
        return document_ids
    failed_document_ids: list[str] = []
    seen_document_ids: set[str] = set()
    for document in list(knowledge_ingest.get("documents") or []):
        if not isinstance(document, dict):
            continue
        document_id = str(document.get("document_id") or "").strip()
        if not document_id:
            continue
        seen_document_ids.add(document_id)
        if document.get("status") != "succeeded":
            failed_document_ids.append(document_id)
    failed_document_ids.extend(
        document_id for document_id in document_ids if document_id not in seen_document_ids
    )
    return failed_document_ids


def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute ``knowledge_ingest["graph"]`` from the current document records."""
    knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)


def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate per-document counts and de-duplicated entities/relations.

    Counts are summed over all documents; the entity and relation lists are
    each truncated to the first 60 unique items.
    """
    documents = [
        item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
    ]
    entities = _dedupe_entities(
        entity for document in documents for entity in list(document.get("entities") or [])
    )
    relations = _dedupe_relations(
        relation for document in documents for relation in list(document.get("relations") or [])
    )
    return {
        "chunk_count": sum(_to_int(document.get("chunk_count")) for document in documents),
        "entity_count": sum(_to_int(document.get("entity_count")) for document in documents),
        "relation_count": sum(_to_int(document.get("relation_count")) for document in documents),
        "entities": entities[:60],
        "relations": relations[:60],
    }


def _dedupe_entities(items: Any) -> list[dict[str, Any]]:
    """Normalize entities and keep the first occurrence of each name.

    Dict items take their name from the first non-empty of ``name``, ``entity``,
    ``entity_id``, ``title`` or ``id``; any other item is used as the name
    itself.  Each result has ``name``, ``type``, ``description``, ``descriptions``
    (at most 5 non-empty strings) and a dict ``properties``.
    """
    deduped: list[dict[str, Any]] = []
    seen: set[str] = set()
    for item in items:
        if isinstance(item, dict):
            name = str(
                item.get("name")
                or item.get("entity")
                or item.get("entity_id")
                or item.get("title")
                or item.get("id")
                or ""
            ).strip()
            entity = dict(item)
        else:
            name = str(item or "").strip()
            entity = {}
        if not name or name in seen:
            continue
        seen.add(name)
        entity["name"] = name
        entity["type"] = str(
            entity.get("type")
            or entity.get("entity_type")
            or entity.get("category")
            or entity.get("kind")
            or "实体"
        ).strip()
        description = str(entity.get("description") or "").strip()
        descriptions = entity.get("descriptions")
        if not isinstance(descriptions, list):
            descriptions = [description] if description else []
        entity["description"] = description
        entity["descriptions"] = [
            str(description_item or "").strip()
            for description_item in descriptions
            if str(description_item or "").strip()
        ][:5]
        if not isinstance(entity.get("properties"), dict):
            entity["properties"] = {}
        deduped.append(entity)
    return deduped


def _dedupe_relations(items: Any) -> list[dict[str, Any]]:
    """Keep the first relation per ``(source, target, type)``; drop ones missing an endpoint."""
    deduped: list[dict[str, Any]] = []
    seen: set[tuple[str, str, str]] = set()
    for item in items:
        if not isinstance(item, dict):
            continue
        source = str(item.get("source") or "").strip()
        target = str(item.get("target") or "").strip()
        relation_type = str(item.get("type") or "关联").strip()
        key = (source, target, relation_type)
        if not source or not target or key in seen:
            continue
        seen.add(key)
        deduped.append({**item, "source": source, "target": target, "type": relation_type})
    return deduped


def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
    """Return the non-empty ``track_id`` of the most recent response, or ``""``."""
    for response in reversed(responses):
        track_id = str(response.get("track_id") or "").strip()
        if track_id:
            return track_id
    return ""


def _to_int(value: Any) -> int:
    """Convert ``value`` to ``int``, returning 0 for falsy or unconvertible input."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


knowledge_index_task_manager = KnowledgeIndexTaskManager()