feat: add Jarvis agent verification foundation

Add Day 1 agent runtime foundations with task and event schemas, verifier support, capability metadata, graph event tracing, and regression coverage while preserving the direct execution path.
This commit is contained in:
2026-04-03 15:18:08 +08:00
parent 4972b4e6b1
commit aa0ef0fbea
14 changed files with 867 additions and 17 deletions

View File

@@ -6,6 +6,7 @@ import asyncio
import json
import logging
import re
from uuid import uuid4
from typing import Any, Literal, cast
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
@@ -19,10 +20,13 @@ from app.agents.prompts import (
MASTER_SYSTEM_PROMPT,
SCHEDULE_PLANNER_SYSTEM_PROMPT,
)
from app.agents.registry import load_builtin_registry_indexes
from app.agents.schemas.event import AgentEvent
from app.agents.skill_registry import build_skill_context
from app.agents.state import AgentRole, AgentState
from app.agents.tools import SUB_COMMANDER_TOOLSETS
from app.agents.tools.time_reasoning import normalize_tool_time_arguments
from app.agents.verifier import apply_verification_verdict, verify_task_result
from app.services.llm_service import (
create_llm_from_config,
default_provider_capabilities,
@@ -632,6 +636,76 @@ def _conversation_history_messages(state: AgentState) -> list[BaseMessage]:
return [message for message in history if getattr(message, "type", "") != "system"]
def _append_event_trace(
state: AgentState,
event_type: str,
*,
payload: dict[str, Any] | None = None,
severity: str = "info",
task_id: str | None = None,
) -> None:
event = AgentEvent(
event_id=f"evt-{uuid4()}",
event_type=cast(Any, event_type),
conversation_id=str(state.get("conversation_id") or "") or None,
agent_id=_role_value(state.get("current_agent")),
sub_commander_id=state.get("current_sub_commander"),
task_id=task_id,
payload=payload or {},
severity=cast(Any, severity),
)
state["event_trace"] = [
*(state.get("event_trace") or []),
event.model_dump(mode="json"),
]
def _capability_manifest_for_tool(tool_name: str):
indexes = load_builtin_registry_indexes()
capability_id = indexes.capability_id_by_tool_name.get(tool_name)
if capability_id is None:
return None
return indexes.capability_by_id.get(capability_id)
def _build_verifier_hints(state: AgentState, tool_name: str, result: Any) -> dict[str, Any]:
capability = _capability_manifest_for_tool(tool_name)
permission_class = getattr(capability, "permission_class", None)
side_effect_scope = getattr(capability, "side_effect_scope", None)
return {
"tool_name": tool_name,
"permission_class": getattr(permission_class, "value", None),
"side_effect_scope": getattr(side_effect_scope, "value", None),
"requires_confirmation": bool(getattr(capability, "requires_confirmation", False)),
"supports_retry": bool(getattr(capability, "supports_retry", False)),
"safe_for_parallel_use": bool(getattr(capability, "safe_for_parallel_use", False)),
"result_preview": _stringify_message_content(result)[:200],
}
def _update_task_result_summary(state: AgentState, tool_summaries: list[dict[str, Any]]) -> None:
if not tool_summaries:
return
previous_summary = state.get("task_result_summary") or {}
previous_tools = previous_summary.get("tools") or []
merged_tools = [*previous_tools, *tool_summaries]
summary = {
"tool_count": len(merged_tools),
"tools": merged_tools,
"created_count": sum(int(item.get("created_count") or 0) for item in merged_tools),
"created_entity_types": [
entity_type
for item in merged_tools
for entity_type in item.get("created_entity_types") or []
if entity_type
],
"stop_reason": state.get("stop_reason"),
}
state["task_result_summary"] = summary
state["action_results"] = [*(state.get("action_results") or []), summary]
def _record_sub_commander(state: AgentState, role: AgentRole, sub_commander: str, user_query: str) -> None:
state["current_agent"] = role.value
state["current_sub_commander"] = sub_commander
@@ -889,6 +963,8 @@ async def _execute_tool_calls(
result_lines: list[str] = []
created_entities: list[dict[str, str]] = []
tool_messages: list[ToolMessage] = []
verifier_hints_by_tool: list[dict[str, Any]] = []
tool_summaries: list[dict[str, Any]] = []
for call in tool_calls:
tool_name = call["name"]
@@ -897,6 +973,13 @@ async def _execute_tool_calls(
if tool is None:
raise ValueError(f"Tool not found: {tool_name}")
_append_event_trace(
state,
"agent.tool.start",
payload={"tool_name": tool_name, "args": normalized_args},
task_id=str(call.get("id") or "") or None,
)
try:
if hasattr(tool, "ainvoke"):
result = await tool.ainvoke(normalized_args)
@@ -905,6 +988,13 @@ async def _execute_tool_calls(
except Exception as exc:
logger.exception("Tool execution failed: %s args=%s", tool_name, normalized_args)
result = f"工具执行失败: {exc}"
_append_event_trace(
state,
"agent.error",
payload={"tool_name": tool_name, "args": normalized_args, "error": str(exc)},
severity="error",
task_id=str(call.get("id") or "") or None,
)
normalized_call = {
"id": call.get("id"),
@@ -914,6 +1004,27 @@ async def _execute_tool_calls(
}
normalized_calls.append(normalized_call)
result_lines.append(f"[{tool_name}] {result}")
verifier_hints = _build_verifier_hints(state, tool_name, result)
verifier_hints_by_tool.append(verifier_hints)
tool_outcome = {
"tool_name": tool_name,
"args": normalized_args,
"result_preview": _stringify_message_content(result)[:200],
"verifier_hints": verifier_hints,
}
state["tool_outcomes"] = [*(state.get("tool_outcomes") or []), tool_outcome]
_append_event_trace(
state,
"agent.tool.result",
payload={
"tool_name": tool_name,
"args": normalized_args,
"result_preview": _stringify_message_content(result)[:200],
"verification": verifier_hints,
},
severity="error" if _tool_result_indicates_failure(result) else "info",
task_id=str(call.get("id") or "") or None,
)
tool_messages.append(
ToolMessage(
content=_stringify_message_content(result),
@@ -922,9 +1033,21 @@ async def _execute_tool_calls(
)
)
entity = _classify_created_entity(tool_name)
call_created_entities: list[dict[str, str]] = []
if entity and not _tool_result_indicates_failure(result):
created_entities.append(entity)
call_created_entities.append(entity)
tool_summaries.append(
{
"tool_name": tool_name,
"result_preview": _stringify_message_content(result)[:200],
"created_entity_types": [entity.get("type") for entity in call_created_entities if entity.get("type")],
"created_count": len(call_created_entities),
}
)
state["verifier_hints"] = {"tools": verifier_hints_by_tool}
_update_task_result_summary(state, tool_summaries)
return normalized_calls, "\n".join(result_lines), created_entities, tool_messages
@@ -1127,6 +1250,43 @@ async def _run_sub_commander(
if summary_target:
state[_summary_state_key(summary_target)] = state.get("final_response")
task_result_summary = state.get("task_result_summary")
tool_outcomes = list(state.get("tool_outcomes") or [])
has_tool_failure = any(
_tool_result_indicates_failure(outcome.get("result_preview"))
for outcome in tool_outcomes
)
verifier_input = {
"summary": state.get("final_response") or (task_result_summary or {}).get("tools"),
"evidence": tool_outcomes,
"success": bool(tool_outcomes or state.get("final_response")) and not has_tool_failure,
}
_append_event_trace(
state,
"agent.verify.started",
payload={
"summary_present": bool(verifier_input["summary"]),
"evidence_count": len(verifier_input["evidence"]),
},
)
verdict = verify_task_result(
summary=state.get("final_response"),
evidence=tool_outcomes,
result=verifier_input,
)
updated_state = apply_verification_verdict(state, verdict)
state.update(updated_state)
_append_event_trace(
state,
"agent.verify.completed",
payload={
"status": verdict.status,
"summary": verdict.summary,
"evidence_count": len(verdict.evidence),
},
severity="error" if verdict.status == "failed" else "info",
)
final_response_text = state.get("final_response")
if not state.get("clarification_needed") and final_response_text:
_clear_clarification_context(state)
@@ -1355,6 +1515,7 @@ def get_agent_graph(callbacks: list | None = None):
__all__ = [
"_build_verifier_hints",
"_choose_sub_commander",
"_parse_json_action",
"_route_agent_from_user_query",

View File

@@ -324,6 +324,19 @@ ANALYST_INSIGHTS_PROMPT = f"""{JARVIS_PERSONA_PROMPT}
"""
VERIFIER_PROMPT = f"""{JARVIS_PERSONA_PROMPT}
你是 Jarvis 的验证官,负责对执行结果做最小但明确的核验。
## 你的职责:
- 只输出 passed、failed、skipped 三种验证结论之一
- 用一句话总结验证判断
- 如有证据,保留关键证据点
- 当信息不足以证明成功或失败时,优先判定为 skipped
- 不重写执行方案,不扩展无关建议
"""
JSON_ACTION_FALLBACK_PROMPT = """你当前运行在 JSON action fallback 模式。
你的输出必须满足以下规则:

View File

@@ -1,11 +1,19 @@
"""Registry manifest models and validation helpers."""
from functools import lru_cache
from app.agents.registry.indexes import RegistryIndexes, build_registry_indexes
from app.agents.registry.loader import RegistryBundle, load_builtin_registry_bundle
@lru_cache(maxsize=1)
def load_builtin_registry_indexes() -> RegistryIndexes:
return build_registry_indexes(load_builtin_registry_bundle())
__all__ = [
"RegistryBundle",
"RegistryIndexes",
"build_registry_indexes",
"load_builtin_registry_bundle",
"load_builtin_registry_indexes",
]

View File

@@ -2,6 +2,8 @@ from app.agents.prompts import SUB_COMMANDER_PROMPTS_BY_KEY
from app.agents.registry.models import (
AgentManifest,
CapabilityManifest,
PermissionClass,
SideEffectScope,
SpecialistTemplateManifest,
SubCommanderManifest,
)
@@ -89,10 +91,150 @@ _capability_tool_names = tuple(
)
)
_CAPABILITY_METADATA_BY_TOOL_NAME: dict[str, dict[str, object]] = {
"get_tasks": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"get_schedule_day": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"resolve_time_expression": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"search_knowledge": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"hybrid_search": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"get_knowledge_graph_context": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"get_forum_posts": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"scan_forum_for_instructions": {
"permission_class": PermissionClass.READ,
"side_effect_scope": SideEffectScope.NONE,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"web_search": {
"permission_class": PermissionClass.EXTERNAL,
"side_effect_scope": SideEffectScope.NETWORK,
"supports_retry": True,
"idempotent": True,
"safe_for_parallel_use": True,
"requires_confirmation": False,
},
"create_task": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"update_task_status": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"create_todo": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"create_schedule_task": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"create_reminder": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"create_goal": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"create_forum_post": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
"build_knowledge_graph": {
"permission_class": PermissionClass.WRITE,
"side_effect_scope": SideEffectScope.LOCAL_STATE,
"supports_retry": False,
"idempotent": False,
"safe_for_parallel_use": False,
"requires_confirmation": True,
},
}
BUILTIN_CAPABILITY_MANIFESTS: tuple[CapabilityManifest, ...] = tuple(
CapabilityManifest(
capability_id=tool_name,
tool_name=tool_name,
**dict(_CAPABILITY_METADATA_BY_TOOL_NAME.get(tool_name, {})),
)
for tool_name in _capability_tool_names
)

View File

@@ -1,6 +1,21 @@
from enum import Enum
from pydantic import BaseModel
class PermissionClass(str, Enum):
READ = "read"
WRITE = "write"
EXTERNAL = "external"
class SideEffectScope(str, Enum):
NONE = "none"
LOCAL_STATE = "local_state"
DB_WRITE = "db_write"
NETWORK = "network"
class AgentManifest(BaseModel):
agent_id: str
display_name: str
@@ -23,6 +38,12 @@ class SubCommanderManifest(BaseModel):
class CapabilityManifest(BaseModel):
capability_id: str
tool_name: str
permission_class: PermissionClass = PermissionClass.READ
side_effect_scope: SideEffectScope = SideEffectScope.NONE
supports_retry: bool = False
idempotent: bool = False
safe_for_parallel_use: bool = False
requires_confirmation: bool = False
class SpecialistTemplateManifest(BaseModel):

View File

@@ -0,0 +1,10 @@
from app.agents.schemas.event import AgentEvent
from app.agents.schemas.task import AgentTask, TaskResult, TaskLifecycleStatus, VerificationStatus
__all__ = [
"AgentEvent",
"AgentTask",
"TaskLifecycleStatus",
"TaskResult",
"VerificationStatus",
]

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any, Literal
from pydantic import BaseModel, Field
AgentEventType = Literal[
"agent.tool.start",
"agent.tool.result",
"agent.verify.started",
"agent.verify.completed",
"agent.error",
]
AgentEventSeverity = Literal["info", "warning", "error"]
class AgentEvent(BaseModel):
event_id: str
event_type: AgentEventType
timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
conversation_id: str | None = None
agent_id: str | None = None
sub_commander_id: str | None = None
task_id: str | None = None
payload: dict[str, Any] = Field(default_factory=dict)
severity: AgentEventSeverity = "info"

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any, Literal
from pydantic import BaseModel, Field
TaskLifecycleStatus = Literal["pending", "in_progress", "completed", "failed", "blocked"]
VerificationStatus = Literal["passed", "failed", "skipped"]
class AgentTask(BaseModel):
task_id: str
title: str
status: TaskLifecycleStatus = "pending"
owner_agent_id: str | None = None
role: str | None = None
goal: str | None = None
expected_evidence: list[dict[str, Any]] = Field(default_factory=list)
evidence: list[dict[str, Any]] = Field(default_factory=list)
result_summary: str | None = None
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
class TaskResult(BaseModel):
task_id: str
status: VerificationStatus
summary: str | None = None
evidence: list[dict[str, Any]] = Field(default_factory=list)
output_data: dict[str, Any] | None = None

View File

@@ -1,7 +1,9 @@
from dataclasses import dataclass
from enum import Enum
from typing import Annotated, Any, TypedDict
from typing import Annotated, Any, Literal, TypedDict
from app.agents.schemas.event import AgentEvent
from app.agents.schemas.task import AgentTask, TaskResult, VerificationStatus
from langchain_core.messages import BaseMessage
from langgraph.graph.message import add_messages
@@ -27,6 +29,7 @@ class AgentState(TypedDict):
user_id: str
conversation_id: str
execution_mode: Literal["direct", "delegated", "verified"]
current_agent: str | None
next_step: str | None
active_agents: list[AgentRole]
@@ -34,14 +37,24 @@ class AgentState(TypedDict):
active_sub_commanders: list[str]
sub_commander_trace: list[dict[str, Any]]
agent_trace: list[str]
event_trace: list[AgentEvent | dict[str, Any]]
pending_tasks: list[dict[str, Any]]
completed_tasks: list[dict[str, Any]]
active_tasks: list[AgentTask | dict[str, Any]]
task_results: list[TaskResult | dict[str, Any]]
tool_calls: list[dict[str, Any]]
last_tool_result: str | None
action_results: list[dict[str, Any]]
created_entities: list[dict[str, Any]]
tool_outcomes: list[dict[str, Any]]
task_result_summary: dict[str, Any] | None
verifier_hints: dict[str, Any] | None
verification_status: VerificationStatus | None
verification_summary: str | None
verification_evidence: list[dict[str, Any]]
budget_state: dict[str, Any] | None
tool_strategy_used: str | None
tool_round_count: int
@@ -89,6 +102,7 @@ def initial_state(user_id: str, conversation_id: str) -> AgentState:
messages=[],
user_id=user_id,
conversation_id=conversation_id,
execution_mode="direct",
current_agent=AgentRole.MASTER.value,
next_step=None,
active_agents=[AgentRole.MASTER],
@@ -96,13 +110,22 @@ def initial_state(user_id: str, conversation_id: str) -> AgentState:
active_sub_commanders=[],
sub_commander_trace=[],
agent_trace=[AgentRole.MASTER.value],
event_trace=[],
pending_tasks=[],
completed_tasks=[],
active_tasks=[],
task_results=[],
tool_calls=[],
last_tool_result=None,
action_results=[],
created_entities=[],
tool_outcomes=[],
task_result_summary=None,
verifier_hints=None,
verification_status=None,
verification_summary=None,
verification_evidence=[],
budget_state=None,
tool_strategy_used=None,
tool_round_count=0,
max_tool_rounds=2,

View File

@@ -0,0 +1,60 @@
from __future__ import annotations
from typing import Any
from pydantic import BaseModel, Field
from app.agents.schemas.task import AgentTask, TaskResult, VerificationStatus
from app.agents.state import AgentState
class VerificationVerdict(BaseModel):
status: VerificationStatus
summary: str | None = None
evidence: list[dict[str, Any]] = Field(default_factory=list)
def verify_task_result(
*,
task: AgentTask | dict[str, Any] | None = None,
result: TaskResult | dict[str, Any] | None = None,
summary: str | None = None,
evidence: list[dict[str, Any]] | None = None,
status: VerificationStatus | None = None,
) -> VerificationVerdict:
normalized_result = result.model_dump() if isinstance(result, TaskResult) else dict(result or {})
normalized_task = task.model_dump() if isinstance(task, AgentTask) else dict(task or {})
normalized_summary = summary or normalized_result.get("summary") or normalized_task.get("result_summary")
normalized_evidence = list(evidence or normalized_result.get("evidence") or normalized_task.get("evidence") or [])
if status is not None:
return VerificationVerdict(status=status, summary=normalized_summary, evidence=normalized_evidence)
if normalized_result.get("status") in {"passed", "failed", "skipped"}:
inferred_status = normalized_result["status"]
elif normalized_result.get("success") is True:
inferred_status = "passed"
elif normalized_result.get("success") is False:
inferred_status = "failed"
elif normalized_summary or normalized_evidence:
inferred_status = "skipped"
else:
inferred_status = "failed"
normalized_summary = "No verification input available."
return VerificationVerdict(
status=inferred_status,
summary=normalized_summary,
evidence=normalized_evidence,
)
def apply_verification_verdict(state: AgentState, verdict: VerificationVerdict) -> AgentState:
next_state = dict(state)
next_state["verification_status"] = verdict.status
next_state["verification_summary"] = verdict.summary
next_state["verification_evidence"] = list(verdict.evidence)
return AgentState(**next_state)
__all__ = ["VerificationVerdict", "apply_verification_verdict", "verify_task_result"]

View File

@@ -8,6 +8,7 @@ import app.agents.graph as graph_module
from langchain_core.messages import AIMessage, HumanMessage
from app.agents.graph import (
_build_verifier_hints,
_choose_sub_commander,
_execute_tool_calls,
_parse_json_action,
@@ -29,29 +30,46 @@ def _base_state(message: str, user_llm_config: dict | None = None) -> dict:
'messages': [HumanMessage(content=message)],
'user_id': 'u1',
'conversation_id': 'c1',
'current_agent': AgentRole.MASTER,
'execution_mode': 'direct',
'current_agent': AgentRole.MASTER.value,
'next_step': None,
'active_agents': [AgentRole.MASTER],
'current_sub_commander': None,
'active_sub_commanders': [],
'sub_commander_trace': [],
'agent_trace': [AgentRole.MASTER.value],
'event_trace': [],
'pending_tasks': [],
'completed_tasks': [],
'active_tasks': [],
'task_results': [],
'tool_calls': [],
'last_tool_result': None,
'action_results': [],
'created_entities': [],
'tool_outcomes': [],
'task_result_summary': None,
'verifier_hints': None,
'verification_status': None,
'verification_summary': None,
'verification_evidence': [],
'budget_state': None,
'tool_strategy_used': None,
'tool_round_count': 0,
'max_tool_rounds': 2,
'retry_count': 0,
'max_retries': 1,
'iteration_count': 0,
'max_iterations': 3,
'routing_hops': 0,
'max_routing_hops': 2,
'terminated_due_to_loop_guard': False,
'retrieval_trace': [],
'stop_reason': None,
'clarification_needed': False,
'clarification_question': None,
'provider_capabilities': None,
'fallback_parse_error': None,
'should_respond': True,
'knowledge_context': None,
'graph_context': None,
'schedule_context_summary': None,
@@ -59,11 +77,17 @@ def _base_state(message: str, user_llm_config: dict | None = None) -> dict:
'plan_steps': [],
'analysis_report': None,
'final_response': None,
'should_respond': True,
'memory_context': None,
'current_datetime_context': 'CURRENT_TIME: 2026-03-28T12:00:00+08:00',
'current_datetime_reference': {'current_time_iso': '2026-03-28T12:00:00+08:00', 'current_date_iso': '2026-03-28', 'timezone': 'UTC'},
'turn_context': None,
'routing_decision': None,
'continuity_state': None,
'pending_action': None,
'last_completed_action': None,
'clarification_context': None,
'user_llm_config': user_llm_config,
'provider_capabilities': None,
}
@@ -258,6 +282,7 @@ def test_initial_state_sets_structured_continuity_defaults():
assert state['pending_action'] is None
assert state['last_completed_action'] is None
assert state['clarification_context'] is None
assert state['event_trace'] == []
assert state['tool_outcomes'] == []
@@ -322,6 +347,7 @@ async def test_planner_node_clears_next_step_after_consuming_routed_turn(monkeyp
assert result['final_response'] is not None
async def test_master_node_returns_stable_reply_for_simple_greeting(monkeypatch):
monkeypatch.setattr('app.agents.graph._get_llm_for_state', lambda state: FailIfCalledLLM())
state = {
@@ -1062,8 +1088,147 @@ async def test_master_node_returns_stable_reply_for_capability_question(monkeypa
assert getattr(result['messages'][-1], 'content', '') == result['final_response']
def test_choose_sub_commander_routes_schedule_requests_to_schedule_planning():
assert _choose_sub_commander(AgentRole.SCHEDULE_PLANNER, '帮我安排一下这周计划') == 'schedule_planning'
def test_build_verifier_hints_uses_capability_metadata():
state = _base_state('明天提醒我开会')
hints = _build_verifier_hints(state, 'create_reminder', '提醒创建成功')
assert hints['tool_name'] == 'create_reminder'
assert hints['permission_class'] == 'write'
assert hints['side_effect_scope'] == 'local_state'
assert hints['requires_confirmation'] is True
assert hints['supports_retry'] is False
assert hints['safe_for_parallel_use'] is False
assert '提醒创建成功' in hints['result_preview']
async def test_execute_tool_calls_records_schema_events_and_aggregate_summaries(monkeypatch):
tool = FakeTool('create_reminder', '提醒创建成功: 开会 @ 2026-03-29 09:00')
state = _base_state('test')
normalized_calls, tool_result, created_entities, tool_messages = await _execute_tool_calls(
[{'id': 'task-1', 'name': 'create_reminder', 'args': {'title': '开会', 'reminder_at': '2026-03-29T09:00:00'}}],
[tool],
state,
)
assert normalized_calls[0]['name'] == 'create_reminder'
assert tool_result.startswith('[create_reminder]')
assert created_entities == [{'type': 'reminder', 'tool': 'create_reminder'}]
assert len(tool_messages) == 1
assert state['verifier_hints'] == {
'tools': [
{
'tool_name': 'create_reminder',
'permission_class': 'write',
'side_effect_scope': 'local_state',
'requires_confirmation': True,
'supports_retry': False,
'safe_for_parallel_use': False,
'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
}
]
}
assert state['task_result_summary']['tool_count'] == 1
assert state['task_result_summary']['created_entity_types'] == ['reminder']
assert state['tool_outcomes'][0]['tool_name'] == 'create_reminder'
assert state['event_trace'][0]['event_type'] == 'agent.tool.start'
assert state['event_trace'][-1]['event_type'] == 'agent.tool.result'
assert state['event_trace'][-1]['payload']['verification']['tool_name'] == 'create_reminder'
assert state['task_result_summary'] == {
'tool_count': 1,
'tools': [
{
'tool_name': 'create_reminder',
'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
'created_entity_types': ['reminder'],
'created_count': 1,
}
],
'created_count': 1,
'created_entity_types': ['reminder'],
'stop_reason': None,
}
assert state['action_results'][-1] == state['task_result_summary']
assert state['tool_outcomes'][0]['tool_name'] == 'create_reminder'
assert [event['event_type'] for event in state['event_trace']] == [
'agent.tool.start',
'agent.tool.result',
]
assert all('event_id' in event for event in state['event_trace'])
assert all('timestamp' in event for event in state['event_trace'])
assert all(event['conversation_id'] == 'c1' for event in state['event_trace'])
assert all(event['agent_id'] == AgentRole.MASTER.value for event in state['event_trace'])
assert all(event['task_id'] == 'task-1' for event in state['event_trace'])
async def test_execute_tool_calls_aggregates_multiple_tool_turns_without_overwrite(monkeypatch):
reminder_tool = FakeTool('create_reminder', '提醒创建成功: 开会 @ 2026-03-29 09:00')
search_tool = FakeTool('web_search', '成功搜索到 2 条网页结果')
state = _base_state('test')
normalized_calls, tool_result, created_entities, tool_messages = await _execute_tool_calls(
[
{'id': 'task-1', 'name': 'create_reminder', 'args': {'title': '开会', 'reminder_at': '2026-03-29T09:00:00'}},
{'id': 'task-2', 'name': 'web_search', 'args': {'query': 'Jarvis 最新模型更新'}},
],
[reminder_tool, search_tool],
state,
)
assert [call['name'] for call in normalized_calls] == ['create_reminder', 'web_search']
assert tool_result == '[create_reminder] 提醒创建成功: 开会 @ 2026-03-29 09:00\n[web_search] 成功搜索到 2 条网页结果'
assert created_entities == [{'type': 'reminder', 'tool': 'create_reminder'}]
assert [message.name for message in tool_messages] == ['create_reminder', 'web_search']
assert state['verifier_hints'] == {
'tools': [
{
'tool_name': 'create_reminder',
'permission_class': 'write',
'side_effect_scope': 'local_state',
'requires_confirmation': True,
'supports_retry': False,
'safe_for_parallel_use': False,
'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
},
{
'tool_name': 'web_search',
'permission_class': 'external',
'side_effect_scope': 'network',
'requires_confirmation': False,
'supports_retry': True,
'safe_for_parallel_use': True,
'result_preview': '成功搜索到 2 条网页结果',
},
]
}
assert state['task_result_summary'] == {
'tool_count': 2,
'tools': [
{
'tool_name': 'create_reminder',
'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
'created_entity_types': ['reminder'],
'created_count': 1,
},
{
'tool_name': 'web_search',
'result_preview': '成功搜索到 2 条网页结果',
'created_entity_types': [],
'created_count': 0,
},
],
'created_count': 1,
'created_entity_types': ['reminder'],
'stop_reason': None,
}
assert len(state['tool_outcomes']) == 2
assert [event['event_type'] for event in state['event_trace']] == [
'agent.tool.start',
'agent.tool.result',
'agent.tool.start',
'agent.tool.result',
]
def test_choose_sub_commander_routes_focus_requests_to_schedule_analysis():

View File

@@ -5,11 +5,13 @@ from app.agents.prompts import (
SUB_COMMANDER_PROMPTS_BY_KEY,
TOP_LEVEL_SYSTEM_PROMPTS_BY_KEY,
)
from app.agents.registry import build_registry_indexes, load_builtin_registry_bundle
from app.agents.registry import build_registry_indexes, load_builtin_registry_bundle, load_builtin_registry_indexes
from app.agents.registry.indexes import summarize_registry_indexes
from app.agents.registry.models import (
AgentManifest,
CapabilityManifest,
PermissionClass,
SideEffectScope,
SpecialistTemplateManifest,
SubCommanderManifest,
)
@@ -251,17 +253,34 @@ def test_builtin_capabilities_reference_actual_runtime_tool_names() -> None:
assert manifest_tool_names == expected_tool_names
def test_builtin_sub_commander_capabilities_match_runtime_toolsets() -> None:
capabilities_by_tool_name = {
manifest.tool_name: manifest.capability_id for manifest in BUILTIN_CAPABILITY_MANIFESTS
}
def test_builtin_capability_metadata_distinguishes_read_and_write_surfaces() -> None:
capability_by_id = {manifest.capability_id: manifest for manifest in BUILTIN_CAPABILITY_MANIFESTS}
for sub_commander in BUILTIN_SUB_COMMANDER_MANIFESTS:
expected_capability_ids = {
capabilities_by_tool_name[tool.name]
for tool in SUB_COMMANDER_TOOLSETS[sub_commander.sub_commander_id]
}
assert set(sub_commander.capability_ids) == expected_capability_ids
assert capability_by_id["get_tasks"].permission_class == PermissionClass.READ
assert capability_by_id["get_tasks"].side_effect_scope == SideEffectScope.NONE
assert capability_by_id["get_tasks"].supports_retry is True
assert capability_by_id["get_tasks"].idempotent is True
assert capability_by_id["get_tasks"].safe_for_parallel_use is True
assert capability_by_id["get_tasks"].requires_confirmation is False
assert capability_by_id["create_reminder"].permission_class == PermissionClass.WRITE
assert capability_by_id["create_reminder"].side_effect_scope == SideEffectScope.LOCAL_STATE
assert capability_by_id["create_reminder"].supports_retry is False
assert capability_by_id["create_reminder"].idempotent is False
assert capability_by_id["create_reminder"].safe_for_parallel_use is False
assert capability_by_id["create_reminder"].requires_confirmation is True
assert capability_by_id["web_search"].permission_class == PermissionClass.EXTERNAL
assert capability_by_id["web_search"].side_effect_scope == SideEffectScope.NETWORK
def test_load_builtin_registry_indexes_is_cached_and_matches_bundle_indexes() -> None:
cached = load_builtin_registry_indexes()
rebuilt = build_registry_indexes(load_builtin_registry_bundle())
assert cached is load_builtin_registry_indexes()
assert cached.capability_id_by_tool_name == rebuilt.capability_id_by_tool_name
assert cached.capability_by_id["create_reminder"].requires_confirmation is True
def test_builtin_manifests_form_a_valid_registry_bundle() -> None:

View File

@@ -0,0 +1,66 @@
from app.agents.schemas.event import AgentEvent
from app.agents.schemas.task import AgentTask
from app.agents.verifier import verify_task_result
def test_agent_task_accepts_day1_fields():
task = AgentTask(
task_id="task-1",
title="Verify foundation",
status="in_progress",
owner_agent_id="executor",
role="verifier",
goal="check output",
expected_evidence=[{"type": "assertion"}],
evidence=[{"type": "log"}],
result_summary="running",
)
assert task.task_id == "task-1"
assert task.owner_agent_id == "executor"
assert task.status == "in_progress"
assert task.expected_evidence == [{"type": "assertion"}]
assert task.evidence == [{"type": "log"}]
assert task.result_summary == "running"
def test_agent_event_accepts_day1_fields():
event = AgentEvent(
event_id="evt-1",
event_type="agent.verify.completed",
conversation_id="conv-1",
agent_id="executor",
sub_commander_id="executor_tasks",
task_id="task-1",
payload={"status": "passed"},
severity="info",
)
assert event.event_id == "evt-1"
assert event.event_type == "agent.verify.completed"
assert event.conversation_id == "conv-1"
assert event.payload == {"status": "passed"}
assert event.severity == "info"
def test_verifier_verdict_is_separate_from_task_lifecycle_status():
task = AgentTask(task_id="task-1", title="Verify", status="blocked", result_summary="waiting")
verdict = verify_task_result(task=task)
assert verdict.status == "skipped"
assert verdict.summary == "waiting"
def test_verifier_prefers_explicit_result_success_signal():
verdict = verify_task_result(result={"success": True, "summary": "all checks passed"})
assert verdict.status == "passed"
assert verdict.summary == "all checks passed"
def test_verifier_fails_when_no_verification_input_exists():
verdict = verify_task_result()
assert verdict.status == "failed"
assert verdict.summary == "No verification input available."

View File

@@ -0,0 +1,102 @@
# Jarvis Agents 2 天工作计划(可勾选执行版)
日期2026-04-03
状态:执行清单
适用范围:基于 `phase-0` ~ `phase-4` 及现有 2 天融合方案整理
---
## 使用说明
- 完成前使用 `- [ ]`
- 完成后改成 `- [x]`
- Day 2 默认依赖 Day 1 的核心底座完成后再推进
---
## Day 1补底座完成 Phase 1 最小闭环
Day 1 目标:先把 Jarvis 从“只有静态路由”补成“有任务结构、有事件结构、有 verifier、有工具治理信息”的可扩展底座同时不破坏当前 direct 主路径。
- [x] 新增最小 `task schema`
改造内容:新增 `backend/app/agents/schemas/task.py`,统一 `task_id``title``status``owner_agent_id``evidence``result_summary`,并补 `role``goal``expected_evidence``created_at``updated_at`;状态固定为 `pending``in_progress``completed``failed``blocked`
- [x] 新增最小 `event schema`
改造内容:新增 `backend/app/agents/schemas/event.py`,统一 `event_id``event_type``timestamp``conversation_id``agent_id``sub_commander_id``task_id``payload``severity`;首批事件类型覆盖 `agent.tool.start``agent.tool.result``agent.verify.started``agent.verify.completed``agent.error`
- [x] 扩展 `backend/app/agents/state.py` 的运行时字段
改造内容:新增 `execution_mode``verification_status``verification_summary``verification_evidence``active_tasks``task_results``event_trace``budget_state`;默认值保持兼容 `initial_state()`,不替换现有 `pending_tasks``completed_tasks``tool_calls`
- [x] 扩展 capability / tool metadata 模型
改造内容:在 `backend/app/agents/registry/models.py` 增加 `permission_class``side_effect_scope``supports_retry``idempotent``safe_for_parallel_use``requires_confirmation`;至少先固化 `read` / `write` / `external``none` / `local_state` / `db_write` / `network` 两组枚举语义。
- [x] 回填 builtin tools 的静态 metadata
改造内容:在 `backend/app/agents/registry/builtins.py` 和需要的 `backend/app/agents/tools/__init__.py` 中,把 search / retrieval 类工具标成偏 `read`create / update 类工具标成偏 `write`,外部检索类工具标成 `external`,并补充是否可重试、是否幂等、是否适合并行等标记。
- [x] 新增 verifier 角色定义
改造内容:在 `backend/app/agents/prompts.py` 增加 verifier prompt明确 verifier 只负责验收,不负责重新规划;验收点聚焦“是否真正满足请求”“是否有明确证据”“是否把失败伪装成成功”。
- [x] 落地 verifier 模块
改造内容:新增 `backend/app/agents/verifier.py`,支持 `passed``failed``skipped` 三类最小结论,先服务于工具调用后的复杂输出、知识检索结果和分析型汇总输出,不接管纯闲聊路径。
- [x]`backend/app/agents/graph.py` 接入最小 event trace 与 verifier helper
改造内容:给 `_execute_tool_calls()` 增加 tool start / result / error 事件写入;给收尾阶段增加 verifier helper 调用;给 `_run_sub_commander()` 增加 task result 摘要写入,但暂时不重构主图为完整协作编排图。
- [x] 补 Phase 1 单元测试与回归测试
改造内容:新增 `backend/tests/backend/app/agents/test_agent_schemas.py``backend/tests/backend/app/agents/test_verifier.py`,并扩展 `test_graph.py`,覆盖 state 兼容性、schema 合法性、tool metadata 存在性、verifier 判定、主流程不回退。
- [x] 完成 Day 1 验收
改造内容:确认 reminder / task / search 主流程继续通过;确认 verifier 已能独立运行;确认 event schema 与 task schema 已落代码;确认 direct 仍是默认主路径;确认未引入动态 `create_agent`、message bus 全链路和 UI。
---
## Day 2引入最小协作能力完成 Phase 2 雏形
Day 2 目标:在 Day 1 底座稳定的基础上,给 Jarvis 增加“复杂请求可拆分、可分配、可回收、可验收”的最小受控协作能力,但仍然不进入自由 swarm。
- [ ] 增加 `request_mode_selector`
改造内容:在 `backend/app/agents/graph.py` 中增加 direct / collaboration 模式选择逻辑;简单请求继续走旧路径,只有明显多步骤、跨领域、需要多角色配合的请求才进入 collaboration mode。
- [ ] 新增 coordinator prompt
改造内容:在 `backend/app/agents/prompts.py` 中定义 coordinator 角色,职责限定为“判断是否拆解”“输出 2~4 个清晰子任务”“分配角色建议”“汇总任务结果”;明确禁止无限递归拆分。
- [ ] 新增最小 task decomposition 结构
改造内容:基于 Day 1 的 task schema 扩展最小拆分结构,至少输出 `task_id``title``role``goal``expected_evidence`,让复杂请求能以结构化任务列表进入后续执行。
- [ ] 增加 role -> existing agent assignment
改造内容:先复用当前已有 top-level agent不新增独立 worker runtime把 schedule 类任务映射给 `schedule_planner`retrieval 类任务映射给 `librarian`analysis 类任务映射给 `analyst`execution 类任务映射给 `executor`
- [ ] 建立统一 task result 回收结构
改造内容:约束每个角色统一返回 `task_id``status``summary``evidence``next_action`(可选),并把结果写回 `task_results`,避免最终结果继续依赖单点硬编码拼接。
- [ ] 让 verifier 强制参与协作结果收尾
改造内容:在 collaboration mode 下,所有复杂请求返回前都必须经过 verifierverifier 有权拒绝证据不足、结果不完整、子任务未闭环的响应。
- [ ] 补 Phase 2 协作测试与回归测试
改造内容覆盖复杂请求拆分测试、角色分配测试、task result 汇总测试、verifier 拒绝不完整结果测试,并再次确认 direct 模式原有流程不回退。
- [ ] 完成 Day 2 验收
改造内容:确认 graph 已能区分 direct / collaboration确认复杂请求可拆成 2~4 个子任务;确认每个子任务有 owner 和 evidence确认最终答案基于 task result 汇总;确认系统仍未进入无限动态 agent 模式。
---
## 这 2 天明确不做
- 不做动态 `create_agent`
- 不做 parent / child agent tree
- 不做内部消息线程长期态管理
- 不做可视化调试面板
- 不做 event stream API
- 不做 worktree / 隔离执行
- 不做自由蜂群式协作
---
## 2 天结束后的预期状态
- [ ] 已具备 `direct` / `collaboration` 双模式入口
- [ ] 已具备 verifier 独立验收层
- [ ] 已具备 task schema / event schema / tool metadata 底座
- [ ] 已具备 coordinator 雏形、任务拆分、角色分配、结果回收
- [ ] 当前 reminder / task / search 主路径无明显回退
- [ ] 后续可以继续推进 Phase 3 的受限动态协作,而不是返工 Phase 1 / Phase 2 底座