"""Service layer for creating, updating, listing and summarizing agent runs."""

from __future__ import annotations

import json
import uuid
from datetime import UTC, datetime, timedelta
from typing import Any

from sqlalchemy.orm import Session

from app.core.config import get_settings
from app.core.agent_enums import AgentName, AgentPermissionLevel, AgentRunStatus
from app.core.logging import get_logger
from app.models.agent_run import AgentRun, AgentToolCall, SemanticParseLog
from app.repositories.agent_run import AgentRunRepository
from app.schemas.agent_run import (
    AgentRunRead,
    AgentRunStatsRead,
    AgentToolCallRead,
    SemanticParseRead,
)
from app.services.agent_foundation import AgentFoundationService
from app.services.knowledge_ingest_log import enrich_knowledge_ingest_route_json

logger = get_logger("app.services.agent_runs")

# A running knowledge-sync run whose last heartbeat (or, failing that, its
# start time) is older than this is marked as failed by the reconciler.
KNOWLEDGE_SYNC_HEARTBEAT_TIMEOUT = timedelta(minutes=30)

# ``route_json["job_type"]`` values the stale-run reconciler applies to.
KNOWLEDGE_SYNC_JOB_TYPES = {"knowledge_index_sync", "llm_wiki_sync"}

# (key in the light list row, key written into the compact ``route_json``).
LIST_ROUTE_FIELDS = (
    ("route_job_type", "job_type"),
    ("route_task_type", "task_type"),
    ("route_task_code", "task_code"),
    ("route_task_name", "task_name"),
    ("route_task_title", "task_title"),
    ("route_asset_name", "asset_name"),
    ("route_selected_agent", "selected_agent"),
    ("route_phase", "phase"),
    ("route_stage", "stage"),
    ("route_report_type", "report_type"),
    ("route_snapshot_key", "snapshot_key"),
    ("route_folder", "folder"),
    ("route_heartbeat_at", "heartbeat_at"),
)

# (key in the light list row, key written into the compact ``ontology_json``).
LIST_ONTOLOGY_FIELDS = (
    ("ontology_scenario", "scenario"),
    ("ontology_intent", "intent"),
    ("ontology_parse_strategy", "parse_strategy"),
)

# Keys of ``route_progress`` that are copied into list items.
LIST_PROGRESS_FIELDS = {
    "percent",
    "total_documents",
    "completed_documents",
    "failed_documents",
    "skipped_documents",
    "current_stage",
}


class AgentRunService:
    """Read/write operations on agent runs, tool calls and semantic parse logs.

    All persistence goes through ``AgentRunRepository``. Every public method
    first calls ``_ensure_ready()``.
    """

    def __init__(self, db: Session) -> None:
        """Bind the service to *db* and build its repository."""
        self.db = db
        self.repository = AgentRunRepository(db)

    def list_runs(
        self,
        *,
        agent: str | None = None,
        status: str | None = None,
        source: str | None = None,
        limit: int = 20,
    ) -> list[AgentRunRead]:
        """Return compact run records matching the optional filters.

        Items are built from the repository's light rows: ``route_json`` and
        ``ontology_json`` contain only the whitelisted list fields, tool calls
        carry empty request/response payloads, and ``semantic_parse`` is
        always ``None``. Stale knowledge-sync runs are reconciled first.
        """
        self._ensure_ready()
        self._reconcile_stale_knowledge_index_runs()
        rows = self.repository.list_light(
            agent=agent,
            status=status,
            source=source,
            limit=limit,
        )
        if not rows:
            # Nothing matched: skip the tool-call lookup for an empty id list.
            return []
        tool_calls_by_run_id = self._group_light_tool_calls(
            self.repository.list_light_tool_calls([str(item["run_id"]) for item in rows])
        )
        return [
            self._serialize_run_list_item(
                item,
                tool_calls_by_run_id.get(str(item["run_id"]), []),
            )
            for item in rows
        ]

    def get_run(self, run_id: str) -> AgentRunRead | None:
        """Return the full record for *run_id*, or ``None`` if it is unknown.

        The run is reconciled for staleness first, and its ``route_json`` is
        passed through ``enrich_knowledge_ingest_route_json``.
        """
        self._ensure_ready()
        self._reconcile_stale_knowledge_index_runs(target_run_id=run_id)
        run = self.repository.get_by_run_id(run_id)
        if run is None:
            return None
        return self._serialize_run(run, enrich_knowledge_ingest=True)

    def summarize_runs(
        self,
        *,
        agent: str | None = None,
        status: str | None = None,
        source: str | None = None,
        limit: int = 200,
    ) -> AgentRunStatsRead:
        """Aggregate counters over at most *limit* runs matching the filters.

        Counts runs per agent and per status, tool calls per status, LLM tool
        calls (``tool_type == "llm"``), rule-fallback parses and model
        guardrail hits. ``recent_errors`` holds the first ten collected run
        and tool-call error entries, in repository order.
        """
        self._ensure_ready()
        self._reconcile_stale_knowledge_index_runs()
        runs = self.repository.list(agent=agent, status=status, source=source, limit=limit)

        agents: dict[str, int] = {}
        statuses: dict[str, int] = {}
        tool_statuses: dict[str, int] = {}
        tool_call_count = 0
        failed_tool_call_count = 0
        llm_call_count = 0
        failed_llm_call_count = 0
        model_fallback_count = 0
        model_guardrail_count = 0
        recent_errors: list[dict[str, Any]] = []

        for run in runs:
            agents[run.agent] = agents.get(run.agent, 0) + 1
            statuses[run.status] = statuses.get(run.status, 0) + 1

            ontology_json = run.ontology_json or {}
            if ontology_json.get("parse_strategy") == "rule_fallback":
                model_fallback_count += 1
            model_summary = ontology_json.get("model_invocation_summary")
            if isinstance(model_summary, dict) and model_summary.get("model_guardrail_reason"):
                model_guardrail_count += 1

            if run.status == AgentRunStatus.FAILED.value and run.error_message:
                recent_errors.append(
                    {
                        "run_id": run.run_id,
                        "agent": run.agent,
                        "stage": (run.route_json or {}).get("stage"),
                        "message": run.error_message,
                    }
                )

            for tool_call in run.tool_calls:
                tool_call_count += 1
                tool_statuses[tool_call.status] = tool_statuses.get(tool_call.status, 0) + 1
                failed = tool_call.status == "failed"
                if failed:
                    failed_tool_call_count += 1
                if tool_call.tool_type == "llm":
                    llm_call_count += 1
                    if failed:
                        failed_llm_call_count += 1
                # Any tool call carrying an error message is reported,
                # whatever its status.
                if tool_call.error_message:
                    recent_errors.append(
                        {
                            "run_id": run.run_id,
                            "agent": run.agent,
                            "tool_name": tool_call.tool_name,
                            "tool_type": tool_call.tool_type,
                            "message": tool_call.error_message,
                        }
                    )

        return AgentRunStatsRead(
            window_limit=limit,
            total_runs=len(runs),
            succeeded_runs=statuses.get(AgentRunStatus.SUCCEEDED.value, 0),
            blocked_runs=statuses.get(AgentRunStatus.BLOCKED.value, 0),
            failed_runs=statuses.get(AgentRunStatus.FAILED.value, 0),
            tool_call_count=tool_call_count,
            failed_tool_call_count=failed_tool_call_count,
            llm_call_count=llm_call_count,
            failed_llm_call_count=failed_llm_call_count,
            model_fallback_count=model_fallback_count,
            model_guardrail_count=model_guardrail_count,
            agents=agents,
            statuses=statuses,
            tool_statuses=tool_statuses,
            recent_errors=recent_errors[:10],
        )

    def create_run(
        self,
        *,
        agent: str,
        source: str,
        user_id: str | None = None,
        task_id: str | None = None,
        ontology_json: dict[str, Any] | None = None,
        route_json: dict[str, Any] | None = None,
        permission_level: str = AgentPermissionLevel.READ.value,
        status: str = AgentRunStatus.RUNNING.value,
        result_summary: str | None = None,
        error_message: str | None = None,
        started_at: datetime | None = None,
        finished_at: datetime | None = None,
    ) -> AgentRunRead:
        """Persist a new run and return it.

        The ``run_id`` is ``"run_"`` followed by 16 hex characters of a random
        UUID. Missing JSON payloads are stored as ``{}`` and a missing
        ``started_at`` defaults to the current UTC time.
        """
        self._ensure_ready()
        run = AgentRun(
            run_id=f"run_{uuid.uuid4().hex[:16]}",
            agent=agent,
            source=source,
            user_id=user_id,
            task_id=task_id,
            ontology_json=ontology_json or {},
            route_json=route_json or {},
            permission_level=permission_level,
            status=status,
            result_summary=result_summary,
            error_message=error_message,
            started_at=started_at or datetime.now(UTC),
            finished_at=finished_at,
        )
        created = self.repository.create_run(run)
        logger.info("Created agent run id=%s run_id=%s", created.id, created.run_id)
        return self._serialize_run(created)

    def update_run(
        self,
        run_id: str,
        *,
        agent: str | None = None,
        ontology_json: dict[str, Any] | None = None,
        route_json: dict[str, Any] | None = None,
        permission_level: str | None = None,
        status: str | None = None,
        result_summary: str | None = None,
        error_message: str | None = None,
        finished_at: datetime | None = None,
    ) -> AgentRunRead:
        """Overwrite the fields of *run_id* that are passed as non-``None``.

        ``route_json`` and ``ontology_json`` replace the stored value
        wholesale; use ``merge_route_json`` to patch individual route keys.

        Raises:
            LookupError: if no run with *run_id* exists.
        """
        self._ensure_ready()
        run = self.repository.get_by_run_id(run_id)
        if run is None:
            raise LookupError("Run not found")

        if agent is not None:
            run.agent = agent
        if ontology_json is not None:
            run.ontology_json = ontology_json
        if route_json is not None:
            run.route_json = route_json
        if permission_level is not None:
            run.permission_level = permission_level
        if status is not None:
            run.status = status
        if result_summary is not None:
            run.result_summary = result_summary
        if error_message is not None:
            run.error_message = error_message
        if finished_at is not None:
            run.finished_at = finished_at

        updated = self.repository.save_run(run)
        logger.info("Updated agent run run_id=%s status=%s", updated.run_id, updated.status)
        return self._serialize_run(updated)

    def merge_route_json(
        self,
        run_id: str,
        route_patch: dict[str, Any],
        *,
        status: str | None = None,
        result_summary: str | None = None,
        error_message: str | None = None,
        finished_at: datetime | None = None,
    ) -> AgentRunRead:
        """Shallow-merge *route_patch* into the run's ``route_json``.

        The remaining keyword arguments are applied only when not ``None``.

        Raises:
            LookupError: if no run with *run_id* exists.
        """
        self._ensure_ready()
        run = self.repository.get_by_run_id(run_id)
        if run is None:
            raise LookupError("Run not found")

        # Assign a fresh dict instead of mutating the stored one in place.
        route_json = dict(run.route_json or {})
        route_json.update(route_patch or {})
        run.route_json = route_json

        if status is not None:
            run.status = status
        if result_summary is not None:
            run.result_summary = result_summary
        if error_message is not None:
            run.error_message = error_message
        if finished_at is not None:
            run.finished_at = finished_at

        updated = self.repository.save_run(run)
        logger.info(
            "Merged route_json for agent run run_id=%s status=%s",
            updated.run_id,
            updated.status,
        )
        return self._serialize_run(updated)

    def record_tool_call(
        self,
        *,
        run_id: str,
        tool_type: str,
        tool_name: str,
        request_json: dict[str, Any] | None = None,
        response_json: dict[str, Any] | None = None,
        status: str,
        duration_ms: int = 0,
        error_message: str | None = None,
    ) -> AgentToolCallRead:
        """Persist a tool call for *run_id*; missing payloads are stored as ``{}``."""
        self._ensure_ready()
        tool_call = AgentToolCall(
            run_id=run_id,
            tool_type=tool_type,
            tool_name=tool_name,
            request_json=request_json or {},
            response_json=response_json or {},
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )
        created = self.repository.create_tool_call(tool_call)
        logger.info("Recorded tool call run_id=%s tool=%s", run_id, tool_name)
        return AgentToolCallRead.model_validate(created)

    def update_tool_call(
        self,
        tool_call_id: str,
        *,
        request_json: dict[str, Any] | None = None,
        response_json: dict[str, Any] | None = None,
        status: str | None = None,
        duration_ms: int | None = None,
        error_message: str | None = None,
    ) -> AgentToolCallRead:
        """Update a stored tool call.

        ``request_json``, ``response_json``, ``status`` and ``duration_ms`` are
        applied only when not ``None``. ``error_message`` is always written,
        so omitting it clears any previously stored message.

        Raises:
            LookupError: if no tool call with *tool_call_id* exists.
        """
        self._ensure_ready()
        tool_call = self.repository.get_tool_call(tool_call_id)
        if tool_call is None:
            raise LookupError("Tool call not found")

        if request_json is not None:
            tool_call.request_json = request_json
        if response_json is not None:
            tool_call.response_json = response_json
        if status is not None:
            tool_call.status = status
        if duration_ms is not None:
            tool_call.duration_ms = duration_ms
        # NOTE(review): unconditional, unlike update_run -- presumably so that a
        # later successful update clears an earlier error; confirm with callers.
        tool_call.error_message = error_message

        updated = self.repository.save_tool_call(tool_call)
        logger.info("Updated tool call id=%s status=%s", updated.id, updated.status)
        return AgentToolCallRead.model_validate(updated)

    def record_semantic_parse(
        self,
        *,
        run_id: str,
        user_id: str | None,
        raw_query: str,
        scenario: str,
        intent: str,
        entities_json: list[Any] | None = None,
        time_range_json: dict[str, Any] | None = None,
        metrics_json: list[Any] | None = None,
        constraints_json: list[Any] | None = None,
        risk_flags_json: list[Any] | None = None,
        permission_json: dict[str, Any] | None = None,
        confidence: float = 0.0,
    ) -> SemanticParseRead:
        """Persist a semantic parse log for *run_id*.

        Missing list payloads are stored as ``[]`` and missing dict payloads
        as ``{}``.
        """
        self._ensure_ready()
        semantic_parse = SemanticParseLog(
            run_id=run_id,
            user_id=user_id,
            raw_query=raw_query,
            scenario=scenario,
            intent=intent,
            entities_json=entities_json or [],
            time_range_json=time_range_json or {},
            metrics_json=metrics_json or [],
            constraints_json=constraints_json or [],
            risk_flags_json=risk_flags_json or [],
            permission_json=permission_json or {},
            confidence=confidence,
        )
        created = self.repository.create_semantic_parse(semantic_parse)
        logger.info(
            "Recorded semantic parse run_id=%s scenario=%s intent=%s", run_id, scenario, intent
        )
        return SemanticParseRead.model_validate(created)

    def _ensure_ready(self) -> None:
        """Call ``AgentFoundationService.ensure_foundation_ready`` on this session."""
        AgentFoundationService(self.db).ensure_foundation_ready()

    def _reconcile_stale_knowledge_index_runs(self, *, target_run_id: str | None = None) -> None:
        """Mark running knowledge-sync runs without a recent heartbeat as failed.

        Looks at up to 200 running HERMES runs whose ``route_json["job_type"]``
        is in ``KNOWLEDGE_SYNC_JOB_TYPES``. A run is stale when its
        ``heartbeat_at`` (falling back to ``started_at``) is older than
        ``KNOWLEDGE_SYNC_HEARTBEAT_TIMEOUT``. Stale runs are saved as FAILED
        and the documents listed under ``requested_document_ids`` get the
        failed ingest status.

        Args:
            target_run_id: when given, only that run is considered.
        """
        runs = self.repository.list(
            agent=AgentName.HERMES.value,
            status=AgentRunStatus.RUNNING.value,
            limit=200,
        )
        now = datetime.now(UTC)
        for run in runs:
            if target_run_id is not None and run.run_id != target_run_id:
                continue
            route_json = dict(run.route_json or {})
            if str(route_json.get("job_type") or "").strip() not in KNOWLEDGE_SYNC_JOB_TYPES:
                continue

            heartbeat_at = self._parse_heartbeat_time(
                str(route_json.get("heartbeat_at") or "").strip()
            )
            last_seen_at = heartbeat_at or run.started_at
            if last_seen_at is None:
                # Neither a heartbeat nor a start time: staleness cannot be
                # judged, and one incomplete row must not break every
                # list/get/summarize call that triggers reconciliation.
                continue
            if last_seen_at.tzinfo is None:
                # Naive timestamps are treated as UTC so they can be compared
                # with the aware ``now``.
                last_seen_at = last_seen_at.replace(tzinfo=UTC)
            if now - last_seen_at <= KNOWLEDGE_SYNC_HEARTBEAT_TIMEOUT:
                continue

            stale_document_ids = [
                document_id
                for document_id in (
                    str(item).strip() for item in route_json.get("requested_document_ids") or []
                )
                if document_id
            ]
            if stale_document_ids:
                # Imported lazily -- presumably to avoid an import cycle with
                # app.services.knowledge; verify before hoisting.
                from app.services.knowledge import (
                    KNOWLEDGE_INGEST_STATUS_FAILED,
                    KnowledgeService,
                )

                KnowledgeService(db=self.db).set_document_ingest_statuses(
                    stale_document_ids,
                    KNOWLEDGE_INGEST_STATUS_FAILED,
                    agent_run_id=run.run_id,
                )

            route_json.update(
                {
                    "phase": "stale_failed",
                    "heartbeat_at": now.isoformat(),
                }
            )
            run.route_json = route_json
            run.status = AgentRunStatus.FAILED.value
            run.result_summary = "知识归纳任务长时间无心跳,系统已自动标记失败。"
            run.error_message = "Knowledge index heartbeat timed out."
            run.finished_at = now
            self.repository.save_run(run)
            logger.warning("Marked stale knowledge index run as failed run_id=%s", run.run_id)

    @staticmethod
    def _parse_heartbeat_time(raw_value: str) -> datetime | None:
        """Parse an ISO-8601 timestamp; return ``None`` when blank or invalid."""
        normalized = str(raw_value or "").strip()
        if not normalized:
            return None
        try:
            return datetime.fromisoformat(normalized)
        except ValueError:
            return None

    def _serialize_run(
        self,
        run: AgentRun,
        *,
        enrich_knowledge_ingest: bool = False,
    ) -> AgentRunRead:
        """Convert an ``AgentRun`` into its full read schema.

        ``semantic_parse`` is taken from the first entry of
        ``run.semantic_parse_logs``, if any. With *enrich_knowledge_ingest*, a
        copy of ``route_json`` is passed through
        ``enrich_knowledge_ingest_route_json`` using the configured storage
        root.
        """
        semantic_parse = run.semantic_parse_logs[0] if run.semantic_parse_logs else None
        route_json = run.route_json
        if enrich_knowledge_ingest:
            route_json = enrich_knowledge_ingest_route_json(
                dict(run.route_json or {}),
                storage_root=get_settings().resolved_storage_root_dir,
            )
        return AgentRunRead(
            id=run.id,
            run_id=run.run_id,
            agent=run.agent,
            source=run.source,
            user_id=run.user_id,
            task_id=run.task_id,
            ontology_json=run.ontology_json,
            route_json=route_json,
            permission_level=run.permission_level,
            status=run.status,
            result_summary=run.result_summary,
            error_message=run.error_message,
            started_at=run.started_at,
            finished_at=run.finished_at,
            tool_calls=[AgentToolCallRead.model_validate(item) for item in run.tool_calls],
            semantic_parse=SemanticParseRead.model_validate(semantic_parse)
            if semantic_parse
            else None,
        )

    def _serialize_run_list_item(
        self,
        row: dict[str, Any],
        tool_calls: list[dict[str, Any]],
    ) -> AgentRunRead:
        """Build a compact ``AgentRunRead`` from a light row and its tool-call rows."""
        return AgentRunRead(
            id=str(row["id"]),
            run_id=str(row["run_id"]),
            agent=str(row["agent"]),
            source=str(row["source"]),
            user_id=row.get("user_id"),
            task_id=row.get("task_id"),
            ontology_json=self._build_list_ontology_json(row),
            route_json=self._build_list_route_json(row),
            permission_level=str(row["permission_level"]),
            status=str(row["status"]),
            result_summary=row.get("result_summary"),
            error_message=row.get("error_message"),
            started_at=row["started_at"],
            finished_at=row.get("finished_at"),
            tool_calls=[self._serialize_light_tool_call(item) for item in tool_calls],
            semantic_parse=None,
        )

    def _build_list_route_json(self, row: dict[str, Any]) -> dict[str, Any]:
        """Assemble the compact ``route_json`` for a list item.

        Copies the ``LIST_ROUTE_FIELDS`` columns that hold a non-blank scalar,
        plus a ``progress`` dict restricted to scalar ``LIST_PROGRESS_FIELDS``
        entries of ``route_progress`` (omitted when empty).
        """
        payload: dict[str, Any] = {}
        for source_key, target_key in LIST_ROUTE_FIELDS:
            self._set_if_present(payload, target_key, row.get(source_key))

        progress = self._coerce_json_object(row.get("route_progress"))
        compact_progress = {
            key: value
            for key, value in progress.items()
            if key in LIST_PROGRESS_FIELDS and self._is_scalar_json_value(value)
        }
        if compact_progress:
            payload["progress"] = compact_progress
        return payload

    def _build_list_ontology_json(self, row: dict[str, Any]) -> dict[str, Any]:
        """Assemble the compact ``ontology_json`` from ``LIST_ONTOLOGY_FIELDS``."""
        payload: dict[str, Any] = {}
        for source_key, target_key in LIST_ONTOLOGY_FIELDS:
            self._set_if_present(payload, target_key, row.get(source_key))
        return payload

    def _serialize_light_tool_call(self, row: dict[str, Any]) -> AgentToolCallRead:
        """Build a tool-call read schema from a light row, with empty payloads."""
        return AgentToolCallRead(
            id=str(row["id"]),
            run_id=str(row["run_id"]),
            tool_type=str(row["tool_type"]),
            tool_name=str(row["tool_name"]),
            request_json={},
            response_json={},
            status=str(row["status"]),
            duration_ms=int(row.get("duration_ms") or 0),
            error_message=row.get("error_message"),
            created_at=row["created_at"],
        )

    @staticmethod
    def _group_light_tool_calls(
        tool_calls: list[dict[str, Any]],
    ) -> dict[str, list[dict[str, Any]]]:
        """Group tool-call rows by ``run_id``; a missing id groups under ``""``."""
        grouped: dict[str, list[dict[str, Any]]] = {}
        for tool_call in tool_calls:
            grouped.setdefault(str(tool_call.get("run_id") or ""), []).append(tool_call)
        return grouped

    @staticmethod
    def _coerce_json_object(value: Any) -> dict[str, Any]:
        """Return *value* as a dict, decoding a JSON-object string if needed.

        Anything that is neither a dict nor a string decoding to a JSON object
        yields ``{}``.
        """
        if isinstance(value, dict):
            return value
        if isinstance(value, str):
            normalized = value.strip()
            # Cheap shape check before attempting to decode.
            if normalized.startswith("{") and normalized.endswith("}"):
                try:
                    loaded = json.loads(normalized)
                except json.JSONDecodeError:
                    return {}
                return loaded if isinstance(loaded, dict) else {}
        return {}

    @staticmethod
    def _set_if_present(payload: dict[str, Any], key: str, value: Any) -> None:
        """Store *value* under *key* unless it is ``None``, blank or non-scalar."""
        if value is None:
            return
        if isinstance(value, str) and not value.strip():
            return
        if not AgentRunService._is_scalar_json_value(value):
            return
        payload[key] = value

    @staticmethod
    def _is_scalar_json_value(value: Any) -> bool:
        """Return whether *value* is ``None`` or a ``str``/``int``/``float``/``bool``."""
        return value is None or isinstance(value, str | int | float | bool)