from __future__ import annotations from dataclasses import dataclass, field from datetime import UTC, datetime, timedelta from sqlalchemy import select from sqlalchemy.orm import Session from app.api.deps import CurrentUserContext from app.core.agent_enums import AgentName, AgentPermissionLevel, AgentRunSource, AgentRunStatus from app.models.agent_asset import AgentAsset from app.services.agent_runs import AgentRunService from app.services.knowledge import ( KNOWLEDGE_INGEST_STATUS_FAILED, KNOWLEDGE_INGEST_STATUS_SYNCING, KnowledgeService, ) from app.services.knowledge_index_tasks import knowledge_index_task_manager ALL_KNOWLEDGE_FOLDERS_LABEL = "全部知识库" @dataclass(slots=True) class KnowledgeSyncDispatchResult: ok: bool = True agent_run_id: str = "" folder: str = "" document_ids: list[str] = field(default_factory=list) queued_at: datetime = field(default_factory=lambda: datetime.now(UTC)) status: str = AgentRunStatus.SUCCEEDED.value summary: str = "" reused: bool = False class KnowledgeSyncDispatchService: def __init__(self, db: Session) -> None: self.db = db self.run_service = AgentRunService(db) self.knowledge_service = KnowledgeService(db=db) def queue_sync( self, *, current_user: CurrentUserContext, folder: str | None = None, document_ids: list[str] | None = None, source: str = AgentRunSource.USER_MESSAGE.value, force: bool = False, changed_only: bool = True, ) -> KnowledgeSyncDispatchResult: normalized_folder = str(folder or "").strip() or None folder_label = normalized_folder or ALL_KNOWLEDGE_FOLDERS_LABEL normalized_requested_ids = [ str(item).strip() for item in document_ids or [] if str(item).strip() ] all_documents = self.knowledge_service.list_documents_for_ingest( folder=normalized_folder, document_ids=normalized_requested_ids, changed_only=False, ) target_documents = self.knowledge_service.list_documents_for_ingest( folder=normalized_folder, document_ids=normalized_requested_ids, changed_only=(False if force else changed_only), ) target_document_ids = [ str(item.get("id") or "").strip() for item in target_documents if str(item.get("id") or "").strip() ] if not all_documents: return KnowledgeSyncDispatchResult( folder=folder_label, document_ids=[], status=AgentRunStatus.SUCCEEDED.value, summary="当前目录暂无可归纳的知识文档。", ) if not target_document_ids: return KnowledgeSyncDispatchResult( folder=folder_label, document_ids=[], status=AgentRunStatus.SUCCEEDED.value, summary="当前目录没有需要增量归纳的文档。", ) active_run = self._find_active_run( folder=folder_label, requested_document_ids=target_document_ids, ) if active_run is not None: active_document_ids = [ str(item).strip() for item in list(active_run.route_json.get("requested_document_ids") or target_document_ids) if str(item).strip() ] return KnowledgeSyncDispatchResult( agent_run_id=active_run.run_id, folder=folder_label, document_ids=active_document_ids, queued_at=active_run.started_at, status=active_run.status, summary="已有知识归纳任务正在执行,系统已复用当前任务。", reused=True, ) task_asset = self.db.scalar( select(AgentAsset).where(AgentAsset.code == "task.hermes.knowledge_index_sync") ) run = self.run_service.create_run( agent=AgentName.HERMES.value, source=source, user_id=current_user.username, task_id=task_asset.id if task_asset is not None else None, permission_level=AgentPermissionLevel.READ.value, status=AgentRunStatus.RUNNING.value, result_summary="知识归纳任务已入队,等待后台执行。", route_json={ "job_type": "knowledge_index_sync", "phase": "queued", "folder": folder_label, "force": force, "changed_only": (False if force else changed_only), "requested_document_ids": target_document_ids, "requested_by_username": current_user.username, "requested_by_name": current_user.name, "progress": { "total_documents": len(target_document_ids), "completed_documents": 0, "failed_documents": 0, "skipped_documents": 0, "percent": 0, }, }, ) try: self.knowledge_service.set_document_ingest_statuses( target_document_ids, status_code=KNOWLEDGE_INGEST_STATUS_SYNCING, agent_run_id=run.run_id, ) knowledge_index_task_manager.submit_sync( agent_run_id=run.run_id, folder=folder_label, current_user=current_user, document_ids=target_document_ids, force=force, ) return KnowledgeSyncDispatchResult( agent_run_id=run.run_id, folder=folder_label, document_ids=target_document_ids, queued_at=run.started_at, status=run.status, summary="知识归纳任务已进入后台执行,可在日志管理中查看进度。", ) except Exception as exc: self.run_service.update_run( run.run_id, status=AgentRunStatus.FAILED.value, error_message=str(exc), result_summary=str(exc), finished_at=datetime.now(UTC), ) self.knowledge_service.set_document_ingest_statuses( target_document_ids, status_code=KNOWLEDGE_INGEST_STATUS_FAILED, agent_run_id=run.run_id, ) raise def _find_active_run( self, *, folder: str, requested_document_ids: list[str], ): requested_set = {str(item).strip() for item in requested_document_ids if str(item).strip()} for item in self.run_service.list_runs( agent=AgentName.HERMES.value, status=AgentRunStatus.RUNNING.value, limit=100, ): if str(item.route_json.get("job_type") or "").strip() != "knowledge_index_sync": continue heartbeat_raw = str(item.route_json.get("heartbeat_at") or "").strip() heartbeat_at = None if heartbeat_raw: try: heartbeat_at = datetime.fromisoformat(heartbeat_raw) except ValueError: heartbeat_at = None last_seen_at = heartbeat_at or item.started_at if last_seen_at.tzinfo is None: last_seen_at = last_seen_at.replace(tzinfo=UTC) if datetime.now(UTC) - last_seen_at > timedelta(minutes=30): stale_document_ids = [ str(document_id).strip() for document_id in list(item.route_json.get("requested_document_ids") or []) if str(document_id).strip() ] if stale_document_ids: self.knowledge_service.set_document_ingest_statuses( stale_document_ids, status_code=KNOWLEDGE_INGEST_STATUS_FAILED, agent_run_id=item.run_id, ) self.run_service.merge_route_json( item.run_id, { "phase": "stale_failed", "heartbeat_at": datetime.now(UTC).isoformat(), }, status=AgentRunStatus.FAILED.value, result_summary="知识归纳任务长时间无心跳,系统已自动标记失败。", error_message="Knowledge index heartbeat timed out.", finished_at=datetime.now(UTC), ) continue active_ids = { str(document_id).strip() for document_id in list(item.route_json.get("requested_document_ids") or []) if str(document_id).strip() } active_folder = str(item.route_json.get("folder") or "").strip() if active_folder == ALL_KNOWLEDGE_FOLDERS_LABEL: if not requested_set or active_ids & requested_set: return item continue if active_folder == folder: if not requested_set or not active_ids or active_ids & requested_set: return item return None