X-Financial/server/src/app/services/knowledge_index_tasks.py

from __future__ import annotations

import threading
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import UTC, datetime
from time import perf_counter
from typing import Any

from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentName, AgentRunStatus, AgentToolType
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.agent_runs import AgentRunService
from app.services.knowledge import (
    KNOWLEDGE_INGEST_STATUS_FAILED,
    KNOWLEDGE_INGEST_STATUS_INGESTED,
    KnowledgeService,
)
from app.services.knowledge_rag import KnowledgeRagService

# Module logger; used for heartbeat, per-document and job-finalization failures.
logger = get_logger("app.services.knowledge_index_tasks")
# Seconds the heartbeat thread waits between "heartbeat_at" writes while a job runs.
HEARTBEAT_INTERVAL_SECONDS = 10


class KnowledgeIndexTaskManager:
    def __init__(self) -> None:
        self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-index")
        self._futures: dict[str, Future[Any]] = {}

    def submit_sync(
        self,
        *,
        agent_run_id: str,
        folder: str,
        current_user: CurrentUserContext,
        document_ids: list[str],
        force: bool,
    ) -> None:
        future = self._executor.submit(
            self._run_sync,
            agent_run_id,
            folder,
            current_user,
            [str(item).strip() for item in document_ids if str(item).strip()],
            force,
        )
        self._futures[agent_run_id] = future

    def shutdown(self) -> None:
        self._executor.shutdown(wait=False, cancel_futures=True)

    @staticmethod
    def _run_sync(
        agent_run_id: str,
        folder: str,
        current_user: CurrentUserContext,
        document_ids: list[str],
        force: bool,
    ) -> None:
        session_factory = get_session_factory()
        db = session_factory()
        started = perf_counter()
        heartbeat_stop = threading.Event()
        heartbeat_thread: threading.Thread | None = None
        tool_call_id = ""
        knowledge_ingest: dict[str, Any] | None = None
        tool_request_json = {
            "agent": AgentName.HERMES.value,
            "folder": folder,
            "document_ids": document_ids,
            "force": force,
        }

        try:
            run_service = AgentRunService(db)
            knowledge_service = KnowledgeService(db=db)
            rag_service = KnowledgeRagService(db=db)
            knowledge_ingest = _build_initial_knowledge_ingest_state(
                knowledge_service,
                document_ids=document_ids,
            )

            run_service.merge_route_json(
                agent_run_id,
                {
                    "job_type": "knowledge_index_sync",
                    "phase": "indexing",
                    "folder": folder,
                    "force": force,
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "requested_document_ids": document_ids,
                    "requested_by_username": current_user.username,
                    "requested_by_name": current_user.name,
                    "progress": {
                        "total_documents": len(document_ids),
                        "completed_documents": 0,
                        "failed_documents": 0,
                        "skipped_documents": 0,
                        "percent": 10 if document_ids else 100,
                    },
                    "knowledge_ingest": knowledge_ingest,
                },
                result_summary=_build_ingest_running_summary(
                    knowledge_ingest,
                    {
                        "total_documents": len(document_ids),
                        "completed_documents": 0,
                        "failed_documents": 0,
                        "skipped_documents": 0,
                        "percent": 10 if document_ids else 100,
                    },
                ),
            )
            tool_call = run_service.record_tool_call(
                run_id=agent_run_id,
                tool_type=AgentToolType.LLM.value,
                tool_name="lightrag.index_documents",
                request_json=tool_request_json,
                response_json={"phase": "indexing"},
                status="running",
                duration_ms=0,
                error_message=None,
            )
            tool_call_id = tool_call.id

            def heartbeat_worker() -> None:
                while not heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
                    heartbeat_db = session_factory()
                    try:
                        AgentRunService(heartbeat_db).merge_route_json(
                            agent_run_id,
                            {
                                "job_type": "knowledge_index_sync",
                                "phase": "indexing",
                                "heartbeat_at": datetime.now(UTC).isoformat(),
                            },
                        )
                    except Exception:
                        logger.exception(
                            "Knowledge index heartbeat update failed run_id=%s",
                            agent_run_id,
                        )
                    finally:
                        heartbeat_db.close()

            heartbeat_thread = threading.Thread(
                target=heartbeat_worker,
                name=f"knowledge-index-heartbeat-{agent_run_id}",
                daemon=True,
            )
            heartbeat_thread.start()

            responses: list[dict[str, Any]] = []
            succeeded_document_ids: list[str] = []
            failed_documents: list[dict[str, str]] = []
            total_documents = len(document_ids)

            for index, document_id in enumerate(document_ids, start=1):
                _patch_ingest_document(
                    knowledge_ingest,
                    document_id,
                    {
                        "status": "running",
                        "phase": "indexing",
                        "started_at": datetime.now(UTC).isoformat(),
                    },
                    event=f"开始处理第 {index}/{total_documents} 个文件，正在写入 LightRAG。",
                )
                knowledge_ingest["current_document_id"] = document_id
                _sync_ingest_route_json(
                    run_service,
                    agent_run_id,
                    knowledge_ingest,
                    progress=_build_ingest_progress(knowledge_ingest, total_documents),
                )

                try:
                    response = rag_service.index_documents(document_ids=[document_id], force=force)
                except Exception as exc:
                    logger.exception(
                        "Knowledge document index failed run_id=%s doc_id=%s",
                        agent_run_id,
                        document_id,
                    )
                    failed_documents.append(
                        {
                            "document_id": document_id,
                            "status": "exception",
                            "error": str(exc),
                        }
                    )
                    _patch_ingest_document(
                        knowledge_ingest,
                        document_id,
                        {
                            "status": "failed",
                            "phase": "failed",
                            "finished_at": datetime.now(UTC).isoformat(),
                            "error": str(exc),
                        },
                        event=f"归集失败：{exc}",
                        level="error",
                    )
                    knowledge_service.set_document_ingest_statuses(
                        [document_id],
                        KNOWLEDGE_INGEST_STATUS_FAILED,
                        agent_run_id=agent_run_id,
                    )
                    _refresh_ingest_graph(knowledge_ingest)
                    _sync_ingest_route_json(
                        run_service,
                        agent_run_id,
                        knowledge_ingest,
                        progress=_build_ingest_progress(knowledge_ingest, total_documents),
                    )
                    continue

                responses.append(response)
                response_failed_documents = _extract_failed_documents(response, document_id)
                document_summary = _extract_document_summary(response, document_id)
                if response_failed_documents:
                    failed_documents.extend(response_failed_documents)
                    error_text = (
                        response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
                    )
                    _patch_ingest_document(
                        knowledge_ingest,
                        document_id,
                        {
                            **document_summary,
                            "status": "failed",
                            "phase": "failed",
                            "finished_at": datetime.now(UTC).isoformat(),
                            "error": error_text,
                            "track_id": str(response.get("track_id") or "").strip(),
                        },
                        event=f"LightRAG 索引失败：{error_text}",
                        level="error",
                    )
                    knowledge_service.set_document_ingest_statuses(
                        [document_id],
                        KNOWLEDGE_INGEST_STATUS_FAILED,
                        agent_run_id=agent_run_id,
                    )
                else:
                    succeeded_document_ids.append(document_id)
                    chunk_count = int(document_summary.get("chunk_count") or 0)
                    entity_count = int(document_summary.get("entity_count") or 0)
                    relation_count = int(document_summary.get("relation_count") or 0)
                    _patch_ingest_document(
                        knowledge_ingest,
                        document_id,
                        {
                            **document_summary,
                            "status": "succeeded",
                            "phase": "indexed",
                            "finished_at": datetime.now(UTC).isoformat(),
                            "track_id": str(response.get("track_id") or "").strip(),
                        },
                        event=(
                            "LightRAG 索引完成："
                            f"{chunk_count} 个 chunk，{entity_count} 个实体，"
                            f"{relation_count} 条关系。"
                        ),
                    )
                    knowledge_service.set_document_ingest_statuses(
                        [document_id],
                        KNOWLEDGE_INGEST_STATUS_INGESTED,
                        agent_run_id=agent_run_id,
                    )
                _refresh_ingest_graph(knowledge_ingest)
                _sync_ingest_route_json(
                    run_service,
                    agent_run_id,
                    knowledge_ingest,
                    progress=_build_ingest_progress(knowledge_ingest, total_documents),
                )

            failed_document_ids = [
                str(item.get("document_id") or "").strip()
                for item in failed_documents
                if str(item.get("document_id") or "").strip()
            ]

            duration_ms = int((perf_counter() - started) * 1000)
            tool_status = "succeeded" if not failed_document_ids else "failed"
            latest_track_id = _resolve_latest_track_id(responses)
            knowledge_ingest["current_document_id"] = ""
            knowledge_ingest["status"] = tool_status
            knowledge_ingest["phase"] = "completed"
            knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
            knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
            heartbeat_stop.set()
            if heartbeat_thread is not None:
                heartbeat_thread.join(timeout=1)
            run_service.update_tool_call(
                tool_call_id,
                response_json={
                    "track_id": latest_track_id,
                    "requested_document_ids": document_ids,
                    "succeeded_document_ids": succeeded_document_ids,
                    "failed_documents": failed_documents,
                    "documents": knowledge_ingest.get("documents", []),
                    "responses": responses,
                },
                status=tool_status,
                duration_ms=duration_ms,
                error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
            )

            completed_documents = len(succeeded_document_ids)
            failed_count = len(failed_document_ids)
            total_documents = len(document_ids)
            summary = (
                f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
                if failed_count == 0
                else (
                    f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引，"
                    f"失败 {failed_count} 个。"
                )
            )
            run_service.merge_route_json(
                agent_run_id,
                {
                    "job_type": "knowledge_index_sync",
                    "phase": "completed",
                    "track_id": latest_track_id,
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "progress": {
                        "total_documents": total_documents,
                        "completed_documents": completed_documents,
                        "failed_documents": failed_count,
                        "skipped_documents": 0,
                        "percent": 100,
                    },
                    "knowledge_ingest": knowledge_ingest,
                },
                status=(
                    AgentRunStatus.SUCCEEDED.value
                    if failed_count == 0
                    else AgentRunStatus.FAILED.value
                ),
                result_summary=summary,
                error_message="部分文档索引失败。" if failed_count else None,
                finished_at=datetime.now(UTC),
            )
        except Exception as exc:
            heartbeat_stop.set()
            if heartbeat_thread is not None:
                heartbeat_thread.join(timeout=1)
            try:
                if tool_call_id:
                    AgentRunService(db).update_tool_call(
                        tool_call_id,
                        response_json={"error": str(exc)},
                        status="failed",
                        duration_ms=int((perf_counter() - started) * 1000),
                        error_message=str(exc),
                    )
                else:
                    AgentRunService(db).record_tool_call(
                        run_id=agent_run_id,
                        tool_type=AgentToolType.LLM.value,
                        tool_name="lightrag.index_documents",
                        request_json=tool_request_json,
                        response_json={"error": str(exc)},
                        status="failed",
                        duration_ms=int((perf_counter() - started) * 1000),
                        error_message=str(exc),
                    )
                KnowledgeService(db=db).set_document_ingest_statuses(
                    _resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
                    KNOWLEDGE_INGEST_STATUS_FAILED,
                    agent_run_id=agent_run_id,
                )
                if knowledge_ingest is not None:
                    for document_id in document_ids:
                        document = _find_ingest_document(knowledge_ingest, document_id)
                        if document is None or document.get("status") in {"succeeded", "failed"}:
                            continue
                        _patch_ingest_document(
                            knowledge_ingest,
                            document_id,
                            {
                                "status": "failed",
                                "phase": "failed",
                                "finished_at": datetime.now(UTC).isoformat(),
                                "error": str(exc),
                            },
                            event=f"归集任务中断：{exc}",
                            level="error",
                        )
                    knowledge_ingest["status"] = "failed"
                    knowledge_ingest["phase"] = "failed"
                    knowledge_ingest["current_document_id"] = ""
                    knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
                    knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)

                route_payload: dict[str, Any] = {
                    "job_type": "knowledge_index_sync",
                    "phase": "failed",
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                    "progress": {
                        "total_documents": len(document_ids),
                        "completed_documents": 0,
                        "failed_documents": len(document_ids),
                        "skipped_documents": 0,
                        "percent": 100,
                    },
                }
                if knowledge_ingest is not None:
                    route_payload["knowledge_ingest"] = knowledge_ingest
                AgentRunService(db).merge_route_json(
                    agent_run_id,
                    route_payload,
                    status=AgentRunStatus.FAILED.value,
                    result_summary=str(exc),
                    error_message=str(exc),
                    finished_at=datetime.now(UTC),
                )
            except Exception:
                logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
            logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
        finally:
            heartbeat_stop.set()
            if heartbeat_thread is not None and heartbeat_thread.is_alive():
                heartbeat_thread.join(timeout=1)
            db.close()


def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Create the starting ``knowledge_ingest`` payload for a job.

    Every requested document gets a queued entry stamped with one shared
    timestamp. The first entry, if any, is recorded as the current document,
    and the aggregate graph is computed from the freshly built entries.
    """
    queued_at = datetime.now(UTC).isoformat()
    queued_documents: list[dict[str, Any]] = []
    for document_id in document_ids:
        queued_documents.append(
            _build_initial_knowledge_ingest_document(
                knowledge_service,
                document_id,
                now=queued_at,
            )
        )

    first_document_id = queued_documents[0]["document_id"] if queued_documents else ""
    state: dict[str, Any] = {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": queued_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": queued_documents,
        "graph": _build_ingest_graph({"documents": queued_documents}),
    }
    return state


def _build_initial_knowledge_ingest_document(
    knowledge_service: KnowledgeService,
    document_id: str,
    *,
    now: str,
) -> dict[str, Any]:
    try:
        entry = knowledge_service.get_document_entry(document_id)
    except Exception:
        entry = {}
    return {
        "document_id": document_id,
        "name": str(entry.get("original_name") or document_id).strip(),
        "folder": str(entry.get("folder") or "").strip(),
        "extension": str(entry.get("extension") or "").strip(),
        "mime_type": str(entry.get("mime_type") or "").strip(),
        "status": "queued",
        "phase": "queued",
        "started_at": None,
        "finished_at": None,
        "text_chars": 0,
        "indexed_text_chars": 0,
        "section_count": 0,
        "sections": [],
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "entity_chunks": [],
        "relations": [],
        "events": [
            {
                "at": now,
                "level": "info",
                "message": "已进入知识归集队列，等待 LightRAG 处理。",
            }
        ],
    }


def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge ``updates`` into the ingest entry for ``document_id`` in place.

    Does nothing when no entry matches. A non-empty ``event`` is additionally
    appended to the entry's event log at the given ``level``.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)


def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
    events = document.get("events")
    if not isinstance(events, list):
        events = []
    events.append(
        {
            "at": datetime.now(UTC).isoformat(),
            "level": level,
            "message": message,
        }
    )
    document["events"] = events[-30:]


def _find_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
) -> dict[str, Any] | None:
    for document in list(knowledge_ingest.get("documents") or []):
        if not isinstance(document, dict):
            continue
        if str(document.get("document_id") or "").strip() == document_id:
            return document
    return None


def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Write the current ingest state and progress to the run while indexing.

    Merges an "indexing"-phase payload with a fresh heartbeat timestamp into
    the run's route JSON and updates the running summary text.
    """
    route_update = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    run_service.merge_route_json(
        agent_run_id,
        route_update,
        result_summary=running_summary,
    )


def _build_ingest_running_summary(
    knowledge_ingest: dict[str, Any],
    progress: dict[str, int],
) -> str:
    total_documents = int(progress.get("total_documents") or 0)
    completed_documents = int(progress.get("completed_documents") or 0)
    failed_documents = int(progress.get("failed_documents") or 0)
    current_document_id = str(knowledge_ingest.get("current_document_id") or "").strip()
    current_document = (
        _find_ingest_document(knowledge_ingest, current_document_id)
        if current_document_id
        else None
    )
    if current_document is not None:
        name = str(current_document.get("name") or current_document_id).strip()
        current_index = _resolve_ingest_document_index(knowledge_ingest, current_document_id)
        return (
            f"知识归纳正在处理 {current_index}/{total_documents}：{name}。"
            f"已完成 {completed_documents} 个，失败 {failed_documents} 个。"
        )
    return (
        f"知识归纳正在运行，已完成 {completed_documents}/{total_documents} 个文档，"
        f"失败 {failed_documents} 个。"
    )


def _resolve_ingest_document_index(
    knowledge_ingest: dict[str, Any],
    document_id: str,
) -> int:
    documents = [
        item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
    ]
    for index, document in enumerate(documents, start=1):
        if str(document.get("document_id") or "").strip() == document_id:
            return index
    return 0


def _build_ingest_progress(
    knowledge_ingest: dict[str, Any],
    total_documents: int,
) -> dict[str, int]:
    documents = [
        item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
    ]
    completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
    failed_documents = sum(1 for item in documents if item.get("status") == "failed")
    skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
    done_documents = completed_documents + failed_documents + skipped_documents
    if total_documents <= 0:
        percent = 100
    else:
        percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
    return {
        "total_documents": total_documents,
        "completed_documents": completed_documents,
        "failed_documents": failed_documents,
        "skipped_documents": skipped_documents,
        "percent": percent,
    }


def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
    for item in list(response.get("document_summaries") or []):
        if not isinstance(item, dict):
            continue
        if str(item.get("document_id") or "").strip() == document_id:
            return dict(item)
    return {}


def _extract_failed_documents(
    response: dict[str, Any],
    document_id: str,
) -> list[dict[str, str]]:
    failed_documents: list[dict[str, str]] = []
    for item in list(response.get("failed_documents") or []):
        if not isinstance(item, dict):
            continue
        item_document_id = str(item.get("document_id") or "").strip()
        if item_document_id and item_document_id != document_id:
            continue
        failed_documents.append(
            {
                "document_id": item_document_id or document_id,
                "status": str(item.get("status") or "failed").strip(),
                "error": str(item.get("error") or "LightRAG 索引失败").strip(),
            }
        )
    return failed_documents


def _resolve_failed_ingest_document_ids(
    knowledge_ingest: dict[str, Any] | None,
    document_ids: list[str],
) -> list[str]:
    if knowledge_ingest is None:
        return document_ids
    failed_document_ids: list[str] = []
    seen_document_ids: set[str] = set()
    for document in list(knowledge_ingest.get("documents") or []):
        if not isinstance(document, dict):
            continue
        document_id = str(document.get("document_id") or "").strip()
        if not document_id:
            continue
        seen_document_ids.add(document_id)
        if document.get("status") != "succeeded":
            failed_document_ids.append(document_id)
    failed_document_ids.extend(
        document_id for document_id in document_ids if document_id not in seen_document_ids
    )
    return failed_document_ids


def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregate graph and store it under ``"graph"`` in place."""
    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
    knowledge_ingest["graph"] = rebuilt_graph


def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate per-document counts, entities and relations into one graph.

    Counts are summed across all dict entries. Entities and relations are
    de-duplicated across documents and each list is capped at 60 items.
    """
    entries = [
        entry for entry in knowledge_ingest.get("documents") or [] if isinstance(entry, dict)
    ]

    def gather(field: str) -> Any:
        # Lazily chain one list-valued field across every document.
        for entry in entries:
            yield from entry.get(field) or []

    def total(field: str) -> int:
        return sum(_to_int(entry.get(field)) for entry in entries)

    unique_entities = _dedupe_entities(gather("entities"))
    unique_relations = _dedupe_relations(gather("relations"))
    return {
        "chunk_count": total("chunk_count"),
        "entity_count": total("entity_count"),
        "relation_count": total("relation_count"),
        "entities": unique_entities[:60],
        "relations": unique_relations[:60],
    }


def _dedupe_entities(items: Any) -> list[dict[str, Any]]:
    deduped: list[dict[str, Any]] = []
    seen: set[str] = set()
    for item in items:
        if isinstance(item, dict):
            name = str(
                item.get("name")
                or item.get("entity")
                or item.get("entity_id")
                or item.get("title")
                or item.get("id")
                or ""
            ).strip()
            entity = dict(item)
        else:
            name = str(item or "").strip()
            entity = {}
        if not name or name in seen:
            continue
        seen.add(name)
        entity["name"] = name
        entity["type"] = str(
            entity.get("type")
            or entity.get("entity_type")
            or entity.get("category")
            or entity.get("kind")
            or "实体"
        ).strip()
        description = str(entity.get("description") or "").strip()
        descriptions = entity.get("descriptions")
        if not isinstance(descriptions, list):
            descriptions = [description] if description else []
        entity["description"] = description
        entity["descriptions"] = [
            str(description_item or "").strip()
            for description_item in descriptions
            if str(description_item or "").strip()
        ][:5]
        if not isinstance(entity.get("properties"), dict):
            entity["properties"] = {}
        deduped.append(entity)
    return deduped


def _dedupe_relations(items: Any) -> list[dict[str, Any]]:
    deduped: list[dict[str, Any]] = []
    seen: set[tuple[str, str, str]] = set()
    for item in items:
        if not isinstance(item, dict):
            continue
        source = str(item.get("source") or "").strip()
        target = str(item.get("target") or "").strip()
        relation_type = str(item.get("type") or "关联").strip()
        key = (source, target, relation_type)
        if not source or not target or key in seen:
            continue
        seen.add(key)
        deduped.append({**item, "source": source, "target": target, "type": relation_type})
    return deduped


def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
    for response in reversed(responses):
        track_id = str(response.get("track_id") or "").strip()
        if track_id:
            return track_id
    return ""


def _to_int(value: Any) -> int:
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


# Shared module-level manager; constructing it creates the single-worker executor at import time.
knowledge_index_task_manager = KnowledgeIndexTaskManager()