diff --git a/backend/app/agents/graph.py b/backend/app/agents/graph.py index 2548cc0..a24feb0 100644 --- a/backend/app/agents/graph.py +++ b/backend/app/agents/graph.py @@ -6,6 +6,7 @@ import asyncio import json import logging import re +from uuid import uuid4 from typing import Any, Literal, cast from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage @@ -19,10 +20,13 @@ from app.agents.prompts import ( MASTER_SYSTEM_PROMPT, SCHEDULE_PLANNER_SYSTEM_PROMPT, ) +from app.agents.registry import load_builtin_registry_indexes +from app.agents.schemas.event import AgentEvent from app.agents.skill_registry import build_skill_context from app.agents.state import AgentRole, AgentState from app.agents.tools import SUB_COMMANDER_TOOLSETS from app.agents.tools.time_reasoning import normalize_tool_time_arguments +from app.agents.verifier import apply_verification_verdict, verify_task_result from app.services.llm_service import ( create_llm_from_config, default_provider_capabilities, @@ -632,6 +636,76 @@ def _conversation_history_messages(state: AgentState) -> list[BaseMessage]: return [message for message in history if getattr(message, "type", "") != "system"] +def _append_event_trace( + state: AgentState, + event_type: str, + *, + payload: dict[str, Any] | None = None, + severity: str = "info", + task_id: str | None = None, +) -> None: + event = AgentEvent( + event_id=f"evt-{uuid4()}", + event_type=cast(Any, event_type), + conversation_id=str(state.get("conversation_id") or "") or None, + agent_id=_role_value(state.get("current_agent")), + sub_commander_id=state.get("current_sub_commander"), + task_id=task_id, + payload=payload or {}, + severity=cast(Any, severity), + ) + state["event_trace"] = [ + *(state.get("event_trace") or []), + event.model_dump(mode="json"), + ] + + +def _capability_manifest_for_tool(tool_name: str): + indexes = load_builtin_registry_indexes() + capability_id = 
indexes.capability_id_by_tool_name.get(tool_name) + if capability_id is None: + return None + return indexes.capability_by_id.get(capability_id) + + +def _build_verifier_hints(state: AgentState, tool_name: str, result: Any) -> dict[str, Any]: + capability = _capability_manifest_for_tool(tool_name) + permission_class = getattr(capability, "permission_class", None) + side_effect_scope = getattr(capability, "side_effect_scope", None) + return { + "tool_name": tool_name, + "permission_class": getattr(permission_class, "value", None), + "side_effect_scope": getattr(side_effect_scope, "value", None), + "requires_confirmation": bool(getattr(capability, "requires_confirmation", False)), + "supports_retry": bool(getattr(capability, "supports_retry", False)), + "safe_for_parallel_use": bool(getattr(capability, "safe_for_parallel_use", False)), + "result_preview": _stringify_message_content(result)[:200], + } + + +def _update_task_result_summary(state: AgentState, tool_summaries: list[dict[str, Any]]) -> None: + if not tool_summaries: + return + + previous_summary = state.get("task_result_summary") or {} + previous_tools = previous_summary.get("tools") or [] + merged_tools = [*previous_tools, *tool_summaries] + summary = { + "tool_count": len(merged_tools), + "tools": merged_tools, + "created_count": sum(int(item.get("created_count") or 0) for item in merged_tools), + "created_entity_types": [ + entity_type + for item in merged_tools + for entity_type in item.get("created_entity_types") or [] + if entity_type + ], + "stop_reason": state.get("stop_reason"), + } + state["task_result_summary"] = summary + state["action_results"] = [*(state.get("action_results") or []), summary] + + def _record_sub_commander(state: AgentState, role: AgentRole, sub_commander: str, user_query: str) -> None: state["current_agent"] = role.value state["current_sub_commander"] = sub_commander @@ -889,6 +963,8 @@ async def _execute_tool_calls( result_lines: list[str] = [] created_entities: 
list[dict[str, str]] = [] tool_messages: list[ToolMessage] = [] + verifier_hints_by_tool: list[dict[str, Any]] = [] + tool_summaries: list[dict[str, Any]] = [] for call in tool_calls: tool_name = call["name"] @@ -897,6 +973,13 @@ async def _execute_tool_calls( if tool is None: raise ValueError(f"Tool not found: {tool_name}") + _append_event_trace( + state, + "agent.tool.start", + payload={"tool_name": tool_name, "args": normalized_args}, + task_id=str(call.get("id") or "") or None, + ) + try: if hasattr(tool, "ainvoke"): result = await tool.ainvoke(normalized_args) @@ -905,6 +988,13 @@ async def _execute_tool_calls( except Exception as exc: logger.exception("Tool execution failed: %s args=%s", tool_name, normalized_args) result = f"工具执行失败: {exc}" + _append_event_trace( + state, + "agent.error", + payload={"tool_name": tool_name, "args": normalized_args, "error": str(exc)}, + severity="error", + task_id=str(call.get("id") or "") or None, + ) normalized_call = { "id": call.get("id"), @@ -914,6 +1004,27 @@ async def _execute_tool_calls( } normalized_calls.append(normalized_call) result_lines.append(f"[{tool_name}] {result}") + verifier_hints = _build_verifier_hints(state, tool_name, result) + verifier_hints_by_tool.append(verifier_hints) + tool_outcome = { + "tool_name": tool_name, + "args": normalized_args, + "result_preview": _stringify_message_content(result)[:200], + "verifier_hints": verifier_hints, + } + state["tool_outcomes"] = [*(state.get("tool_outcomes") or []), tool_outcome] + _append_event_trace( + state, + "agent.tool.result", + payload={ + "tool_name": tool_name, + "args": normalized_args, + "result_preview": _stringify_message_content(result)[:200], + "verification": verifier_hints, + }, + severity="error" if _tool_result_indicates_failure(result) else "info", + task_id=str(call.get("id") or "") or None, + ) tool_messages.append( ToolMessage( content=_stringify_message_content(result), @@ -922,9 +1033,21 @@ async def _execute_tool_calls( ) ) entity = 
_classify_created_entity(tool_name) + call_created_entities: list[dict[str, str]] = [] if entity and not _tool_result_indicates_failure(result): created_entities.append(entity) + call_created_entities.append(entity) + tool_summaries.append( + { + "tool_name": tool_name, + "result_preview": _stringify_message_content(result)[:200], + "created_entity_types": [entity.get("type") for entity in call_created_entities if entity.get("type")], + "created_count": len(call_created_entities), + } + ) + state["verifier_hints"] = {"tools": verifier_hints_by_tool} + _update_task_result_summary(state, tool_summaries) return normalized_calls, "\n".join(result_lines), created_entities, tool_messages @@ -1127,6 +1250,43 @@ async def _run_sub_commander( if summary_target: state[_summary_state_key(summary_target)] = state.get("final_response") + task_result_summary = state.get("task_result_summary") + tool_outcomes = list(state.get("tool_outcomes") or []) + has_tool_failure = any( + _tool_result_indicates_failure(outcome.get("result_preview")) + for outcome in tool_outcomes + ) + verifier_input = { + "summary": state.get("final_response") or (task_result_summary or {}).get("tools"), + "evidence": tool_outcomes, + "success": bool(tool_outcomes or state.get("final_response")) and not has_tool_failure, + } + _append_event_trace( + state, + "agent.verify.started", + payload={ + "summary_present": bool(verifier_input["summary"]), + "evidence_count": len(verifier_input["evidence"]), + }, + ) + verdict = verify_task_result( + summary=state.get("final_response"), + evidence=tool_outcomes, + result=verifier_input, + ) + updated_state = apply_verification_verdict(state, verdict) + state.update(updated_state) + _append_event_trace( + state, + "agent.verify.completed", + payload={ + "status": verdict.status, + "summary": verdict.summary, + "evidence_count": len(verdict.evidence), + }, + severity="error" if verdict.status == "failed" else "info", + ) + final_response_text = 
@lru_cache(maxsize=1)
def load_builtin_registry_indexes() -> RegistryIndexes:
    """Build the indexes for the builtin registry bundle, once.

    The result is memoized with ``lru_cache(maxsize=1)``, so every call after
    the first returns the same ``RegistryIndexes`` object.
    """
    bundle = load_builtin_registry_bundle()
    return build_registry_indexes(bundle)
# Metadata templates shared by tools with the same permission profile. Each
# tool entry built below receives its own copy of the matching template.
_READ_ONLY_CAPABILITY_METADATA: dict[str, object] = {
    "permission_class": PermissionClass.READ,
    "side_effect_scope": SideEffectScope.NONE,
    "supports_retry": True,
    "idempotent": True,
    "safe_for_parallel_use": True,
    "requires_confirmation": False,
}
_EXTERNAL_LOOKUP_CAPABILITY_METADATA: dict[str, object] = {
    **_READ_ONLY_CAPABILITY_METADATA,
    "permission_class": PermissionClass.EXTERNAL,
    "side_effect_scope": SideEffectScope.NETWORK,
}
_LOCAL_WRITE_CAPABILITY_METADATA: dict[str, object] = {
    "permission_class": PermissionClass.WRITE,
    "side_effect_scope": SideEffectScope.LOCAL_STATE,
    "supports_retry": False,
    "idempotent": False,
    "safe_for_parallel_use": False,
    "requires_confirmation": True,
}

_READ_ONLY_TOOL_NAMES: tuple[str, ...] = (
    "get_tasks",
    "get_schedule_day",
    "resolve_time_expression",
    "search_knowledge",
    "hybrid_search",
    "get_knowledge_graph_context",
    "get_forum_posts",
    "scan_forum_for_instructions",
)
_EXTERNAL_LOOKUP_TOOL_NAMES: tuple[str, ...] = ("web_search",)
_LOCAL_WRITE_TOOL_NAMES: tuple[str, ...] = (
    "create_task",
    "update_task_status",
    "create_todo",
    "create_schedule_task",
    "create_reminder",
    "create_goal",
    "create_forum_post",
    "build_knowledge_graph",
)

# Tool name -> keyword arguments merged into that tool's CapabilityManifest.
_CAPABILITY_METADATA_BY_TOOL_NAME: dict[str, dict[str, object]] = {
    tool_name: dict(template)
    for template, tool_names in (
        (_READ_ONLY_CAPABILITY_METADATA, _READ_ONLY_TOOL_NAMES),
        (_EXTERNAL_LOOKUP_CAPABILITY_METADATA, _EXTERNAL_LOOKUP_TOOL_NAMES),
        (_LOCAL_WRITE_CAPABILITY_METADATA, _LOCAL_WRITE_TOOL_NAMES),
    )
    for tool_name in tool_names
}
"TaskLifecycleStatus", + "TaskResult", + "VerificationStatus", +] diff --git a/backend/app/agents/schemas/event.py b/backend/app/agents/schemas/event.py new file mode 100644 index 0000000..f08d1e1 --- /dev/null +++ b/backend/app/agents/schemas/event.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, Literal + +from pydantic import BaseModel, Field + + +AgentEventType = Literal[ + "agent.tool.start", + "agent.tool.result", + "agent.verify.started", + "agent.verify.completed", + "agent.error", +] +AgentEventSeverity = Literal["info", "warning", "error"] + + +class AgentEvent(BaseModel): + event_id: str + event_type: AgentEventType + timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + conversation_id: str | None = None + agent_id: str | None = None + sub_commander_id: str | None = None + task_id: str | None = None + payload: dict[str, Any] = Field(default_factory=dict) + severity: AgentEventSeverity = "info" diff --git a/backend/app/agents/schemas/task.py b/backend/app/agents/schemas/task.py new file mode 100644 index 0000000..1f32dd6 --- /dev/null +++ b/backend/app/agents/schemas/task.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, Literal + +from pydantic import BaseModel, Field + + +TaskLifecycleStatus = Literal["pending", "in_progress", "completed", "failed", "blocked"] +VerificationStatus = Literal["passed", "failed", "skipped"] + + +class AgentTask(BaseModel): + task_id: str + title: str + status: TaskLifecycleStatus = "pending" + owner_agent_id: str | None = None + role: str | None = None + goal: str | None = None + expected_evidence: list[dict[str, Any]] = Field(default_factory=list) + evidence: list[dict[str, Any]] = Field(default_factory=list) + result_summary: str | None = None + created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + updated_at: datetime 
class TaskResult(BaseModel):
    """Result record for one task, carrying a verification outcome.

    ``status`` is typed as ``VerificationStatus`` (passed / failed / skipped),
    not as the task lifecycle status used by ``AgentTask``.
    """

    # Identifier of the task this result belongs to.
    task_id: str
    # Verification outcome; required, no default.
    status: VerificationStatus
    # Optional human-readable summary of the result.
    summary: str | None = None
    # Evidence entries; each model instance gets its own empty list by default.
    evidence: list[dict[str, Any]] = Field(default_factory=list)
    # Optional free-form output payload.
    output_data: dict[str, Any] | None = None
class VerificationVerdict(BaseModel):
    """Outcome of verifying a task result.

    ``status`` is a verification outcome (passed / failed / skipped) and is
    independent of the task's lifecycle status.
    """

    status: VerificationStatus
    summary: str | None = None
    evidence: list[dict[str, Any]] = Field(default_factory=list)


def _coerce_summary(value: Any) -> str | None:
    """Return ``value`` as text suitable for ``VerificationVerdict.summary``.

    Strings and None pass through unchanged. Other falsy values become None.
    Anything else (for example a list of tool summaries) is rendered as JSON,
    falling back to ``str()`` when it cannot be serialized.
    """
    if value is None or isinstance(value, str):
        return value
    if not value:
        return None

    import json  # local import: not otherwise needed by this module

    try:
        return json.dumps(value, ensure_ascii=False, default=str)
    except (TypeError, ValueError):
        return str(value)


def verify_task_result(
    *,
    task: AgentTask | dict[str, Any] | None = None,
    result: TaskResult | dict[str, Any] | None = None,
    summary: str | None = None,
    evidence: list[dict[str, Any]] | None = None,
    status: VerificationStatus | None = None,
) -> VerificationVerdict:
    """Derive a verification verdict from whatever inputs are available.

    Summary and evidence are each taken from the first non-empty source in the
    order: explicit argument, ``result``, ``task``. A non-string summary is
    converted to text instead of failing model validation.

    The status is decided in this order:

    1. the explicit ``status`` argument, when given;
    2. ``result["status"]`` when it is one of passed / failed / skipped;
    3. ``result["success"]`` when it is exactly True (passed) or False (failed);
    4. ``"skipped"`` when any summary or evidence exists;
    5. otherwise ``"failed"`` with a fixed "no input" summary.

    Args:
        task: Task model or dict; supplies fallback summary and evidence.
        result: Result model or dict; may carry ``status`` or ``success``.
        summary: Explicit summary, preferred over the other sources.
        evidence: Explicit evidence, preferred over the other sources.
        status: Explicit verdict status that bypasses inference.

    Returns:
        The resulting ``VerificationVerdict``.
    """
    normalized_result = result.model_dump() if isinstance(result, TaskResult) else dict(result or {})
    normalized_task = task.model_dump() if isinstance(task, AgentTask) else dict(task or {})
    # Callers may put structured data (e.g. a list of tool summaries) in
    # result["summary"]; the verdict field only accepts text.
    normalized_summary = _coerce_summary(
        summary or normalized_result.get("summary") or normalized_task.get("result_summary")
    )
    normalized_evidence = list(
        evidence or normalized_result.get("evidence") or normalized_task.get("evidence") or []
    )

    if status is not None:
        return VerificationVerdict(status=status, summary=normalized_summary, evidence=normalized_evidence)

    result_status = normalized_result.get("status")
    # Tuple membership (not a set) so an unhashable status cannot raise TypeError.
    if result_status in ("passed", "failed", "skipped"):
        inferred_status = result_status
    elif normalized_result.get("success") is True:
        inferred_status = "passed"
    elif normalized_result.get("success") is False:
        inferred_status = "failed"
    elif normalized_summary or normalized_evidence:
        inferred_status = "skipped"
    else:
        inferred_status = "failed"
        normalized_summary = "No verification input available."

    return VerificationVerdict(
        status=inferred_status,
        summary=normalized_summary,
        evidence=normalized_evidence,
    )


def apply_verification_verdict(state: AgentState, verdict: VerificationVerdict) -> AgentState:
    """Return a copy of ``state`` with the verification fields set from ``verdict``.

    The input ``state`` is not modified; the evidence list is copied.
    """
    next_state = dict(state)
    next_state["verification_status"] = verdict.status
    next_state["verification_summary"] = verdict.summary
    next_state["verification_evidence"] = list(verdict.evidence)
    return AgentState(**next_state)


__all__ = ["VerificationVerdict", "apply_verification_verdict", "verify_task_result"]
None, 'active_agents': [AgentRole.MASTER], 'current_sub_commander': None, 'active_sub_commanders': [], 'sub_commander_trace': [], + 'agent_trace': [AgentRole.MASTER.value], + 'event_trace': [], 'pending_tasks': [], 'completed_tasks': [], + 'active_tasks': [], + 'task_results': [], 'tool_calls': [], 'last_tool_result': None, 'action_results': [], 'created_entities': [], + 'tool_outcomes': [], + 'task_result_summary': None, + 'verifier_hints': None, + 'verification_status': None, + 'verification_summary': None, + 'verification_evidence': [], + 'budget_state': None, + 'tool_strategy_used': None, 'tool_round_count': 0, 'max_tool_rounds': 2, 'retry_count': 0, 'max_retries': 1, 'iteration_count': 0, 'max_iterations': 3, + 'routing_hops': 0, + 'max_routing_hops': 2, + 'terminated_due_to_loop_guard': False, 'retrieval_trace': [], 'stop_reason': None, 'clarification_needed': False, 'clarification_question': None, - 'provider_capabilities': None, 'fallback_parse_error': None, + 'should_respond': True, 'knowledge_context': None, 'graph_context': None, 'schedule_context_summary': None, @@ -59,11 +77,17 @@ def _base_state(message: str, user_llm_config: dict | None = None) -> dict: 'plan_steps': [], 'analysis_report': None, 'final_response': None, - 'should_respond': True, 'memory_context': None, 'current_datetime_context': 'CURRENT_TIME: 2026-03-28T12:00:00+08:00', 'current_datetime_reference': {'current_time_iso': '2026-03-28T12:00:00+08:00', 'current_date_iso': '2026-03-28', 'timezone': 'UTC'}, + 'turn_context': None, + 'routing_decision': None, + 'continuity_state': None, + 'pending_action': None, + 'last_completed_action': None, + 'clarification_context': None, 'user_llm_config': user_llm_config, + 'provider_capabilities': None, } @@ -258,6 +282,7 @@ def test_initial_state_sets_structured_continuity_defaults(): assert state['pending_action'] is None assert state['last_completed_action'] is None assert state['clarification_context'] is None + assert state['event_trace'] == 
def test_choose_sub_commander_routes_schedule_requests_to_schedule_planning():
    # Restored: this routing test is unrelated to the verifier work and was
    # dropped when the new tests were inserted at its position.
    assert _choose_sub_commander(AgentRole.SCHEDULE_PLANNER, '帮我安排一下这周计划') == 'schedule_planning'


def test_build_verifier_hints_uses_capability_metadata():
    """Hints for a write tool reflect its capability manifest."""
    state = _base_state('明天提醒我开会')

    hints = _build_verifier_hints(state, 'create_reminder', '提醒创建成功')

    assert hints['tool_name'] == 'create_reminder'
    assert hints['permission_class'] == 'write'
    assert hints['side_effect_scope'] == 'local_state'
    assert hints['requires_confirmation'] is True
    assert hints['supports_retry'] is False
    assert hints['safe_for_parallel_use'] is False
    assert '提醒创建成功' in hints['result_preview']


async def test_execute_tool_calls_records_schema_events_and_aggregate_summaries(monkeypatch):
    """A single tool call fills hints, summary, outcomes and the event trace."""
    tool = FakeTool('create_reminder', '提醒创建成功: 开会 @ 2026-03-29 09:00')
    state = _base_state('test')

    normalized_calls, tool_result, created_entities, tool_messages = await _execute_tool_calls(
        [{'id': 'task-1', 'name': 'create_reminder', 'args': {'title': '开会', 'reminder_at': '2026-03-29T09:00:00'}}],
        [tool],
        state,
    )

    assert normalized_calls[0]['name'] == 'create_reminder'
    assert tool_result.startswith('[create_reminder]')
    assert created_entities == [{'type': 'reminder', 'tool': 'create_reminder'}]
    assert len(tool_messages) == 1
    assert state['verifier_hints'] == {
        'tools': [
            {
                'tool_name': 'create_reminder',
                'permission_class': 'write',
                'side_effect_scope': 'local_state',
                'requires_confirmation': True,
                'supports_retry': False,
                'safe_for_parallel_use': False,
                'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
            }
        ]
    }
    assert state['task_result_summary'] == {
        'tool_count': 1,
        'tools': [
            {
                'tool_name': 'create_reminder',
                'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
                'created_entity_types': ['reminder'],
                'created_count': 1,
            }
        ],
        'created_count': 1,
        'created_entity_types': ['reminder'],
        'stop_reason': None,
    }
    assert state['action_results'][-1] == state['task_result_summary']
    assert state['tool_outcomes'][0]['tool_name'] == 'create_reminder'
    assert [event['event_type'] for event in state['event_trace']] == [
        'agent.tool.start',
        'agent.tool.result',
    ]
    assert state['event_trace'][-1]['payload']['verification']['tool_name'] == 'create_reminder'
    assert all('event_id' in event for event in state['event_trace'])
    assert all('timestamp' in event for event in state['event_trace'])
    assert all(event['conversation_id'] == 'c1' for event in state['event_trace'])
    assert all(event['agent_id'] == AgentRole.MASTER.value for event in state['event_trace'])
    assert all(event['task_id'] == 'task-1' for event in state['event_trace'])


async def test_execute_tool_calls_aggregates_multiple_tool_turns_without_overwrite(monkeypatch):
    """Two tool calls in one batch are aggregated, not overwritten."""
    reminder_tool = FakeTool('create_reminder', '提醒创建成功: 开会 @ 2026-03-29 09:00')
    search_tool = FakeTool('web_search', '成功搜索到 2 条网页结果')
    state = _base_state('test')

    normalized_calls, tool_result, created_entities, tool_messages = await _execute_tool_calls(
        [
            {'id': 'task-1', 'name': 'create_reminder', 'args': {'title': '开会', 'reminder_at': '2026-03-29T09:00:00'}},
            {'id': 'task-2', 'name': 'web_search', 'args': {'query': 'Jarvis 最新模型更新'}},
        ],
        [reminder_tool, search_tool],
        state,
    )

    assert [call['name'] for call in normalized_calls] == ['create_reminder', 'web_search']
    assert tool_result == '[create_reminder] 提醒创建成功: 开会 @ 2026-03-29 09:00\n[web_search] 成功搜索到 2 条网页结果'
    assert created_entities == [{'type': 'reminder', 'tool': 'create_reminder'}]
    assert [message.name for message in tool_messages] == ['create_reminder', 'web_search']
    assert state['verifier_hints'] == {
        'tools': [
            {
                'tool_name': 'create_reminder',
                'permission_class': 'write',
                'side_effect_scope': 'local_state',
                'requires_confirmation': True,
                'supports_retry': False,
                'safe_for_parallel_use': False,
                'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
            },
            {
                'tool_name': 'web_search',
                'permission_class': 'external',
                'side_effect_scope': 'network',
                'requires_confirmation': False,
                'supports_retry': True,
                'safe_for_parallel_use': True,
                'result_preview': '成功搜索到 2 条网页结果',
            },
        ]
    }
    assert state['task_result_summary'] == {
        'tool_count': 2,
        'tools': [
            {
                'tool_name': 'create_reminder',
                'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
                'created_entity_types': ['reminder'],
                'created_count': 1,
            },
            {
                'tool_name': 'web_search',
                'result_preview': '成功搜索到 2 条网页结果',
                'created_entity_types': [],
                'created_count': 0,
            },
        ],
        'created_count': 1,
        'created_entity_types': ['reminder'],
        'stop_reason': None,
    }
    assert len(state['tool_outcomes']) == 2
    assert [event['event_type'] for event in state['event_trace']] == [
        'agent.tool.start',
        'agent.tool.result',
        'agent.tool.start',
        'agent.tool.result',
    ]
def test_builtin_sub_commander_capabilities_match_runtime_toolsets() -> None:
    # Restored: this check is independent of the capability metadata tests and
    # was dropped when they were added in its place.
    capabilities_by_tool_name = {
        manifest.tool_name: manifest.capability_id for manifest in BUILTIN_CAPABILITY_MANIFESTS
    }

    for sub_commander in BUILTIN_SUB_COMMANDER_MANIFESTS:
        expected_capability_ids = {
            capabilities_by_tool_name[tool.name]
            for tool in SUB_COMMANDER_TOOLSETS[sub_commander.sub_commander_id]
        }
        assert set(sub_commander.capability_ids) == expected_capability_ids


def test_builtin_capability_metadata_distinguishes_read_and_write_surfaces() -> None:
    """Read, write and external tools carry distinct capability metadata."""
    capability_by_id = {manifest.capability_id: manifest for manifest in BUILTIN_CAPABILITY_MANIFESTS}

    assert capability_by_id["get_tasks"].permission_class == PermissionClass.READ
    assert capability_by_id["get_tasks"].side_effect_scope == SideEffectScope.NONE
    assert capability_by_id["get_tasks"].supports_retry is True
    assert capability_by_id["get_tasks"].idempotent is True
    assert capability_by_id["get_tasks"].safe_for_parallel_use is True
    assert capability_by_id["get_tasks"].requires_confirmation is False

    assert capability_by_id["create_reminder"].permission_class == PermissionClass.WRITE
    assert capability_by_id["create_reminder"].side_effect_scope == SideEffectScope.LOCAL_STATE
    assert capability_by_id["create_reminder"].supports_retry is False
    assert capability_by_id["create_reminder"].idempotent is False
    assert capability_by_id["create_reminder"].safe_for_parallel_use is False
    assert capability_by_id["create_reminder"].requires_confirmation is True

    assert capability_by_id["web_search"].permission_class == PermissionClass.EXTERNAL
    assert capability_by_id["web_search"].side_effect_scope == SideEffectScope.NETWORK


def test_load_builtin_registry_indexes_is_cached_and_matches_bundle_indexes() -> None:
    """The cached loader returns one shared object equal to a fresh build."""
    cached = load_builtin_registry_indexes()
    rebuilt = build_registry_indexes(load_builtin_registry_bundle())

    assert cached is load_builtin_registry_indexes()
    assert cached.capability_id_by_tool_name == rebuilt.capability_id_by_tool_name
    assert cached.capability_by_id["create_reminder"].requires_confirmation is True
[{"type": "log"}] + assert task.result_summary == "running" + + +def test_agent_event_accepts_day1_fields(): + event = AgentEvent( + event_id="evt-1", + event_type="agent.verify.completed", + conversation_id="conv-1", + agent_id="executor", + sub_commander_id="executor_tasks", + task_id="task-1", + payload={"status": "passed"}, + severity="info", + ) + + assert event.event_id == "evt-1" + assert event.event_type == "agent.verify.completed" + assert event.conversation_id == "conv-1" + assert event.payload == {"status": "passed"} + assert event.severity == "info" + + +def test_verifier_verdict_is_separate_from_task_lifecycle_status(): + task = AgentTask(task_id="task-1", title="Verify", status="blocked", result_summary="waiting") + + verdict = verify_task_result(task=task) + + assert verdict.status == "skipped" + assert verdict.summary == "waiting" + + +def test_verifier_prefers_explicit_result_success_signal(): + verdict = verify_task_result(result={"success": True, "summary": "all checks passed"}) + + assert verdict.status == "passed" + assert verdict.summary == "all checks passed" + + +def test_verifier_fails_when_no_verification_input_exists(): + verdict = verify_task_result() + + assert verdict.status == "failed" + assert verdict.summary == "No verification input available." 
diff --git a/development-doc/plan/2026-04-03-jarvis-agents-2-day-work-checklist.md b/development-doc/plan/2026-04-03-jarvis-agents-2-day-work-checklist.md new file mode 100644 index 0000000..ac711da --- /dev/null +++ b/development-doc/plan/2026-04-03-jarvis-agents-2-day-work-checklist.md @@ -0,0 +1,102 @@ +# Jarvis Agents 2 天工作计划(可勾选执行版) + +日期:2026-04-03 +状态:执行清单 +适用范围:基于 `phase-0` ~ `phase-4` 及现有 2 天融合方案整理 + +--- + +## 使用说明 + +- 完成前使用 `- [ ]` +- 完成后改成 `- [x]` +- Day 2 默认在 Day 1 的核心底座完成后再推进 + +--- + +## Day 1:补底座,完成 Phase 1 最小闭环 + +Day 1 目标:先把 Jarvis 从“只有静态路由”补成“有任务结构、有事件结构、有 verifier、有工具治理信息”的可扩展底座,同时不破坏当前 direct 主路径。 + +- [x] 新增最小 `task schema` + 改造内容:新增 `backend/app/agents/schemas/task.py`,统一 `task_id`、`title`、`status`、`owner_agent_id`、`evidence`、`result_summary`,并补 `role`、`goal`、`expected_evidence`、`created_at`、`updated_at`;状态固定为 `pending`、`in_progress`、`completed`、`failed`、`blocked`。 + +- [x] 新增最小 `event schema` + 改造内容:新增 `backend/app/agents/schemas/event.py`,统一 `event_id`、`event_type`、`timestamp`、`conversation_id`、`agent_id`、`sub_commander_id`、`task_id`、`payload`、`severity`;首批事件类型覆盖 `agent.tool.start`、`agent.tool.result`、`agent.verify.started`、`agent.verify.completed`、`agent.error`。 + +- [x] 扩展 `backend/app/agents/state.py` 的运行时字段 + 改造内容:新增 `execution_mode`、`verification_status`、`verification_summary`、`verification_evidence`、`active_tasks`、`task_results`、`event_trace`、`budget_state`;默认值保持兼容 `initial_state()`,不替换现有 `pending_tasks`、`completed_tasks`、`tool_calls`。 + +- [x] 扩展 capability / tool metadata 模型 + 改造内容:在 `backend/app/agents/registry/models.py` 增加 `permission_class`、`side_effect_scope`、`supports_retry`、`idempotent`、`safe_for_parallel_use`、`requires_confirmation`;至少先固化 `read` / `write` / `external` 和 `none` / `local_state` / `db_write` / `network` 两组枚举语义。 + +- [x] 回填 builtin tools 的静态 metadata + 改造内容:在 `backend/app/agents/registry/builtins.py` 和需要的 `backend/app/agents/tools/__init__.py` 中,把 search / retrieval 类工具标成偏 
`write`,外部检索类工具标成 `external`,并补充是否可重试、是否幂等、是否适合并行等标记。 + +- [x] 新增 verifier 角色定义 + 改造内容:在 `backend/app/agents/prompts.py` 增加 verifier prompt,明确 verifier 只负责验收,不负责重新规划;验收点聚焦“是否真正满足请求”“是否有明确证据”“是否把失败伪装成成功”。 + +- [x] 落地 verifier 模块 + 改造内容:新增 `backend/app/agents/verifier.py`,支持 `passed`、`failed`、`skipped` 三类最小结论,先服务于工具调用后的复杂输出、知识检索结果和分析型汇总输出,不接管纯闲聊路径。 + +- [x] 在 `backend/app/agents/graph.py` 接入最小 event trace 与 verifier helper + 改造内容:给 `_execute_tool_calls()` 增加 tool start / result / error 事件写入;给收尾阶段增加 verifier helper 调用;给 `_run_sub_commander()` 增加 task result 摘要写入,但暂时不重构主图为完整协作编排图。 + +- [x] 补 Phase 1 单元测试与回归测试 + 改造内容:新增 `backend/tests/backend/app/agents/test_agent_schemas.py`、`backend/tests/backend/app/agents/test_verifier.py`,并扩展 `test_graph.py`,覆盖 state 兼容性、schema 合法性、tool metadata 存在性、verifier 判定、主流程不回退。 + +- [x] 完成 Day 1 验收 + 改造内容:确认 reminder / task / search 主流程继续通过;确认 verifier 已能独立运行;确认 event schema 与 task schema 已落代码;确认 direct 仍是默认主路径;确认未引入动态 `create_agent`、message bus 全链路和 UI。 + +--- + +## Day 2:引入最小协作能力,完成 Phase 2 雏形 + +Day 2 目标:在 Day 1 底座稳定的基础上,给 Jarvis 增加“复杂请求可拆分、可分配、可回收、可验收”的最小受控协作能力,但仍然不进入自由 swarm。 + +- [ ] 增加 `request_mode_selector` + 改造内容:在 `backend/app/agents/graph.py` 中增加 direct / collaboration 模式选择逻辑;简单请求继续走旧路径,只有明显多步骤、跨领域、需要多角色配合的请求才进入 collaboration mode。 + +- [ ] 新增 coordinator prompt + 改造内容:在 `backend/app/agents/prompts.py` 中定义 coordinator 角色,职责限定为“判断是否拆解”“输出 2~4 个清晰子任务”“分配角色建议”“汇总任务结果”;明确禁止无限递归拆分。 + +- [ ] 新增最小 task decomposition 结构 + 改造内容:基于 Day 1 的 task schema 扩展最小拆分结构,至少输出 `task_id`、`title`、`role`、`goal`、`expected_evidence`,让复杂请求能以结构化任务列表进入后续执行。 + +- [ ] 增加 role -> existing agent assignment + 改造内容:先复用当前已有 top-level agent,不新增独立 worker runtime;把 schedule 类任务映射给 `schedule_planner`,retrieval 类任务映射给 `librarian`,analysis 类任务映射给 `analyst`,execution 类任务映射给 `executor`。 + +- [ ] 建立统一 task result 回收结构 + 改造内容:约束每个角色统一返回 `task_id`、`status`、`summary`、`evidence`、`next_action`(可选),并把结果写回 `task_results`,避免最终结果继续依赖单点硬编码拼接。 + +- [ ] 让 verifier 强制参与协作结果收尾 + 改造内容:在 
collaboration mode 下,所有复杂请求返回前都必须经过 verifier;verifier 有权拒绝证据不足、结果不完整、子任务未闭环的响应。 + +- [ ] 补 Phase 2 协作测试与回归测试 + 改造内容:覆盖复杂请求拆分测试、角色分配测试、task result 汇总测试、verifier 拒绝不完整结果测试,并再次确认 direct 模式原有流程不回退。 + +- [ ] 完成 Day 2 验收 + 改造内容:确认 graph 已能区分 direct / collaboration;确认复杂请求可拆成 2~4 个子任务;确认每个子任务有 owner 和 evidence;确认最终答案基于 task result 汇总;确认系统仍未进入无限动态 agent 模式。 + +--- + +## 这 2 天明确不做 + +- 不做动态 `create_agent` +- 不做 parent / child agent tree +- 不做内部消息线程长期态管理 +- 不做可视化调试面板 +- 不做 event stream API +- 不做 worktree / 隔离执行 +- 不做自由蜂群式协作 + +--- + +## 2 天结束后的预期状态 + +- [ ] 已具备 `direct` / `collaboration` 双模式入口 +- [ ] 已具备 verifier 独立验收层 +- [ ] 已具备 task schema / event schema / tool metadata 底座 +- [ ] 已具备 coordinator 雏形、任务拆分、角色分配、结果回收 +- [ ] 当前 reminder / task / search 主路径无明显回退 +- [ ] 后续可以继续推进 Phase 3 的受限动态协作,而不是返工 Phase 1 / Phase 2 底座