feat: add Jarvis agent verification foundation
Add Day 1 agent runtime foundations with task and event schemas, verifier support, capability metadata, graph event tracing, and regression coverage while preserving the direct execution path.
This commit is contained in:
@@ -6,6 +6,7 @@ import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from uuid import uuid4
|
||||
from typing import Any, Literal, cast
|
||||
|
||||
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
|
||||
@@ -19,10 +20,13 @@ from app.agents.prompts import (
|
||||
MASTER_SYSTEM_PROMPT,
|
||||
SCHEDULE_PLANNER_SYSTEM_PROMPT,
|
||||
)
|
||||
from app.agents.registry import load_builtin_registry_indexes
|
||||
from app.agents.schemas.event import AgentEvent
|
||||
from app.agents.skill_registry import build_skill_context
|
||||
from app.agents.state import AgentRole, AgentState
|
||||
from app.agents.tools import SUB_COMMANDER_TOOLSETS
|
||||
from app.agents.tools.time_reasoning import normalize_tool_time_arguments
|
||||
from app.agents.verifier import apply_verification_verdict, verify_task_result
|
||||
from app.services.llm_service import (
|
||||
create_llm_from_config,
|
||||
default_provider_capabilities,
|
||||
@@ -632,6 +636,76 @@ def _conversation_history_messages(state: AgentState) -> list[BaseMessage]:
|
||||
return [message for message in history if getattr(message, "type", "") != "system"]
|
||||
|
||||
|
||||
def _append_event_trace(
    state: AgentState,
    event_type: str,
    *,
    payload: dict[str, Any] | None = None,
    severity: str = "info",
    task_id: str | None = None,
) -> None:
    """Record one structured event on ``state["event_trace"]``.

    An ``AgentEvent`` is built from the current state (conversation id,
    current agent, current sub-commander) plus the supplied arguments, dumped
    in JSON mode, and stored as the last element of a *new* list assigned to
    ``state["event_trace"]``; the previous list object is not mutated.

    Args:
        state: Agent state that is read for context and updated in place.
        event_type: Event name; checking it is left to the ``AgentEvent`` model.
        payload: Optional event details; a falsy value becomes an empty dict.
        severity: Severity label forwarded to the model.
        task_id: Optional identifier of the task/tool call the event belongs to.
    """
    event_fields: dict[str, Any] = {
        "event_id": f"evt-{uuid4()}",
        # cast() only quiets the static checker for the Literal-typed fields.
        "event_type": cast(Any, event_type),
        # A missing or empty conversation id is stored as None, not "".
        "conversation_id": str(state.get("conversation_id") or "") or None,
        "agent_id": _role_value(state.get("current_agent")),
        "sub_commander_id": state.get("current_sub_commander"),
        "task_id": task_id,
        "payload": payload or {},
        "severity": cast(Any, severity),
    }
    recorded = AgentEvent(**event_fields)

    trace = list(state.get("event_trace") or [])
    trace.append(recorded.model_dump(mode="json"))
    state["event_trace"] = trace
|
||||
|
||||
|
||||
def _capability_manifest_for_tool(tool_name: str):
    """Look up the capability manifest registered for ``tool_name``.

    Returns ``None`` when the tool name has no capability id in the builtin
    registry indexes, or when that id has no entry in ``capability_by_id``.
    """
    registry = load_builtin_registry_indexes()
    resolved_id = registry.capability_id_by_tool_name.get(tool_name)
    return None if resolved_id is None else registry.capability_by_id.get(resolved_id)
|
||||
|
||||
|
||||
def _build_verifier_hints(state: AgentState, tool_name: str, result: Any) -> dict[str, Any]:
    """Summarise a tool's capability metadata and result for the verifier.

    Args:
        state: Accepted for signature parity; it is not read here.
        tool_name: Name used to look up the capability manifest.
        result: Raw tool result; only its first 200 stringified characters
            are kept as ``result_preview``.

    Returns:
        A dict with the tool name, the ``.value`` of the manifest's permission
        class and side-effect scope (``None`` when unavailable), three boolean
        flags (``False`` when the manifest is missing) and the result preview.
    """
    manifest = _capability_manifest_for_tool(tool_name)

    def _enum_value(attribute: str) -> Any:
        # getattr with defaults tolerates both a missing manifest and a
        # missing attribute.
        return getattr(getattr(manifest, attribute, None), "value", None)

    def _flag(attribute: str) -> bool:
        return bool(getattr(manifest, attribute, False))

    hints: dict[str, Any] = {"tool_name": tool_name}
    hints["permission_class"] = _enum_value("permission_class")
    hints["side_effect_scope"] = _enum_value("side_effect_scope")
    hints["requires_confirmation"] = _flag("requires_confirmation")
    hints["supports_retry"] = _flag("supports_retry")
    hints["safe_for_parallel_use"] = _flag("safe_for_parallel_use")
    hints["result_preview"] = _stringify_message_content(result)[:200]
    return hints
|
||||
|
||||
|
||||
def _update_task_result_summary(state: AgentState, tool_summaries: list[dict[str, Any]]) -> None:
|
||||
if not tool_summaries:
|
||||
return
|
||||
|
||||
previous_summary = state.get("task_result_summary") or {}
|
||||
previous_tools = previous_summary.get("tools") or []
|
||||
merged_tools = [*previous_tools, *tool_summaries]
|
||||
summary = {
|
||||
"tool_count": len(merged_tools),
|
||||
"tools": merged_tools,
|
||||
"created_count": sum(int(item.get("created_count") or 0) for item in merged_tools),
|
||||
"created_entity_types": [
|
||||
entity_type
|
||||
for item in merged_tools
|
||||
for entity_type in item.get("created_entity_types") or []
|
||||
if entity_type
|
||||
],
|
||||
"stop_reason": state.get("stop_reason"),
|
||||
}
|
||||
state["task_result_summary"] = summary
|
||||
state["action_results"] = [*(state.get("action_results") or []), summary]
|
||||
|
||||
|
||||
def _record_sub_commander(state: AgentState, role: AgentRole, sub_commander: str, user_query: str) -> None:
|
||||
state["current_agent"] = role.value
|
||||
state["current_sub_commander"] = sub_commander
|
||||
@@ -889,6 +963,8 @@ async def _execute_tool_calls(
|
||||
result_lines: list[str] = []
|
||||
created_entities: list[dict[str, str]] = []
|
||||
tool_messages: list[ToolMessage] = []
|
||||
verifier_hints_by_tool: list[dict[str, Any]] = []
|
||||
tool_summaries: list[dict[str, Any]] = []
|
||||
|
||||
for call in tool_calls:
|
||||
tool_name = call["name"]
|
||||
@@ -897,6 +973,13 @@ async def _execute_tool_calls(
|
||||
if tool is None:
|
||||
raise ValueError(f"Tool not found: {tool_name}")
|
||||
|
||||
_append_event_trace(
|
||||
state,
|
||||
"agent.tool.start",
|
||||
payload={"tool_name": tool_name, "args": normalized_args},
|
||||
task_id=str(call.get("id") or "") or None,
|
||||
)
|
||||
|
||||
try:
|
||||
if hasattr(tool, "ainvoke"):
|
||||
result = await tool.ainvoke(normalized_args)
|
||||
@@ -905,6 +988,13 @@ async def _execute_tool_calls(
|
||||
except Exception as exc:
|
||||
logger.exception("Tool execution failed: %s args=%s", tool_name, normalized_args)
|
||||
result = f"工具执行失败: {exc}"
|
||||
_append_event_trace(
|
||||
state,
|
||||
"agent.error",
|
||||
payload={"tool_name": tool_name, "args": normalized_args, "error": str(exc)},
|
||||
severity="error",
|
||||
task_id=str(call.get("id") or "") or None,
|
||||
)
|
||||
|
||||
normalized_call = {
|
||||
"id": call.get("id"),
|
||||
@@ -914,6 +1004,27 @@ async def _execute_tool_calls(
|
||||
}
|
||||
normalized_calls.append(normalized_call)
|
||||
result_lines.append(f"[{tool_name}] {result}")
|
||||
verifier_hints = _build_verifier_hints(state, tool_name, result)
|
||||
verifier_hints_by_tool.append(verifier_hints)
|
||||
tool_outcome = {
|
||||
"tool_name": tool_name,
|
||||
"args": normalized_args,
|
||||
"result_preview": _stringify_message_content(result)[:200],
|
||||
"verifier_hints": verifier_hints,
|
||||
}
|
||||
state["tool_outcomes"] = [*(state.get("tool_outcomes") or []), tool_outcome]
|
||||
_append_event_trace(
|
||||
state,
|
||||
"agent.tool.result",
|
||||
payload={
|
||||
"tool_name": tool_name,
|
||||
"args": normalized_args,
|
||||
"result_preview": _stringify_message_content(result)[:200],
|
||||
"verification": verifier_hints,
|
||||
},
|
||||
severity="error" if _tool_result_indicates_failure(result) else "info",
|
||||
task_id=str(call.get("id") or "") or None,
|
||||
)
|
||||
tool_messages.append(
|
||||
ToolMessage(
|
||||
content=_stringify_message_content(result),
|
||||
@@ -922,9 +1033,21 @@ async def _execute_tool_calls(
|
||||
)
|
||||
)
|
||||
entity = _classify_created_entity(tool_name)
|
||||
call_created_entities: list[dict[str, str]] = []
|
||||
if entity and not _tool_result_indicates_failure(result):
|
||||
created_entities.append(entity)
|
||||
call_created_entities.append(entity)
|
||||
tool_summaries.append(
|
||||
{
|
||||
"tool_name": tool_name,
|
||||
"result_preview": _stringify_message_content(result)[:200],
|
||||
"created_entity_types": [entity.get("type") for entity in call_created_entities if entity.get("type")],
|
||||
"created_count": len(call_created_entities),
|
||||
}
|
||||
)
|
||||
|
||||
state["verifier_hints"] = {"tools": verifier_hints_by_tool}
|
||||
_update_task_result_summary(state, tool_summaries)
|
||||
return normalized_calls, "\n".join(result_lines), created_entities, tool_messages
|
||||
|
||||
|
||||
@@ -1127,6 +1250,43 @@ async def _run_sub_commander(
|
||||
if summary_target:
|
||||
state[_summary_state_key(summary_target)] = state.get("final_response")
|
||||
|
||||
task_result_summary = state.get("task_result_summary")
|
||||
tool_outcomes = list(state.get("tool_outcomes") or [])
|
||||
has_tool_failure = any(
|
||||
_tool_result_indicates_failure(outcome.get("result_preview"))
|
||||
for outcome in tool_outcomes
|
||||
)
|
||||
verifier_input = {
|
||||
"summary": state.get("final_response") or (task_result_summary or {}).get("tools"),
|
||||
"evidence": tool_outcomes,
|
||||
"success": bool(tool_outcomes or state.get("final_response")) and not has_tool_failure,
|
||||
}
|
||||
_append_event_trace(
|
||||
state,
|
||||
"agent.verify.started",
|
||||
payload={
|
||||
"summary_present": bool(verifier_input["summary"]),
|
||||
"evidence_count": len(verifier_input["evidence"]),
|
||||
},
|
||||
)
|
||||
verdict = verify_task_result(
|
||||
summary=state.get("final_response"),
|
||||
evidence=tool_outcomes,
|
||||
result=verifier_input,
|
||||
)
|
||||
updated_state = apply_verification_verdict(state, verdict)
|
||||
state.update(updated_state)
|
||||
_append_event_trace(
|
||||
state,
|
||||
"agent.verify.completed",
|
||||
payload={
|
||||
"status": verdict.status,
|
||||
"summary": verdict.summary,
|
||||
"evidence_count": len(verdict.evidence),
|
||||
},
|
||||
severity="error" if verdict.status == "failed" else "info",
|
||||
)
|
||||
|
||||
final_response_text = state.get("final_response")
|
||||
if not state.get("clarification_needed") and final_response_text:
|
||||
_clear_clarification_context(state)
|
||||
@@ -1355,6 +1515,7 @@ def get_agent_graph(callbacks: list | None = None):
|
||||
|
||||
|
||||
__all__ = [
|
||||
"_build_verifier_hints",
|
||||
"_choose_sub_commander",
|
||||
"_parse_json_action",
|
||||
"_route_agent_from_user_query",
|
||||
|
||||
@@ -324,6 +324,19 @@ ANALYST_INSIGHTS_PROMPT = f"""{JARVIS_PERSONA_PROMPT}
|
||||
"""
|
||||
|
||||
|
||||
VERIFIER_PROMPT = f"""{JARVIS_PERSONA_PROMPT}
|
||||
|
||||
你是 Jarvis 的验证官,负责对执行结果做最小但明确的核验。
|
||||
|
||||
## 你的职责:
|
||||
- 只输出 passed、failed、skipped 三种验证结论之一
|
||||
- 用一句话总结验证判断
|
||||
- 如有证据,保留关键证据点
|
||||
- 当信息不足以证明成功或失败时,优先判定为 skipped
|
||||
- 不重写执行方案,不扩展无关建议
|
||||
"""
|
||||
|
||||
|
||||
JSON_ACTION_FALLBACK_PROMPT = """你当前运行在 JSON action fallback 模式。
|
||||
|
||||
你的输出必须满足以下规则:
|
||||
|
||||
@@ -1,11 +1,19 @@
|
||||
"""Registry manifest models and validation helpers."""
|
||||
|
||||
from functools import lru_cache
|
||||
|
||||
from app.agents.registry.indexes import RegistryIndexes, build_registry_indexes
|
||||
from app.agents.registry.loader import RegistryBundle, load_builtin_registry_bundle
|
||||
|
||||
@lru_cache(maxsize=1)
def load_builtin_registry_indexes() -> RegistryIndexes:
    """Build the lookup indexes for the builtin registry bundle.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    indexes are built on the first call and the same object is returned on
    later calls.
    """
    builtin_bundle = load_builtin_registry_bundle()
    return build_registry_indexes(builtin_bundle)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"RegistryBundle",
|
||||
"RegistryIndexes",
|
||||
"build_registry_indexes",
|
||||
"load_builtin_registry_bundle",
|
||||
"load_builtin_registry_indexes",
|
||||
]
|
||||
|
||||
@@ -2,6 +2,8 @@ from app.agents.prompts import SUB_COMMANDER_PROMPTS_BY_KEY
|
||||
from app.agents.registry.models import (
|
||||
AgentManifest,
|
||||
CapabilityManifest,
|
||||
PermissionClass,
|
||||
SideEffectScope,
|
||||
SpecialistTemplateManifest,
|
||||
SubCommanderManifest,
|
||||
)
|
||||
@@ -89,10 +91,150 @@ _capability_tool_names = tuple(
|
||||
)
|
||||
)
|
||||
|
||||
# Capability metadata is expressed as three shared profiles instead of one
# hand-copied dict per tool, so a profile can only be changed in one place.

# Local reads: no side effects, retryable, parallel-safe, no confirmation.
_READ_ONLY_CAPABILITY_PROFILE: dict[str, object] = {
    "permission_class": PermissionClass.READ,
    "side_effect_scope": SideEffectScope.NONE,
    "supports_retry": True,
    "idempotent": True,
    "safe_for_parallel_use": True,
    "requires_confirmation": False,
}

# Lookups that leave the process (network) but do not modify anything.
_EXTERNAL_LOOKUP_CAPABILITY_PROFILE: dict[str, object] = {
    "permission_class": PermissionClass.EXTERNAL,
    "side_effect_scope": SideEffectScope.NETWORK,
    "supports_retry": True,
    "idempotent": True,
    "safe_for_parallel_use": True,
    "requires_confirmation": False,
}

# State-changing tools: not retried, not parallelised, confirmation required.
_LOCAL_WRITE_CAPABILITY_PROFILE: dict[str, object] = {
    "permission_class": PermissionClass.WRITE,
    "side_effect_scope": SideEffectScope.LOCAL_STATE,
    "supports_retry": False,
    "idempotent": False,
    "safe_for_parallel_use": False,
    "requires_confirmation": True,
}

# (profile, tool names) pairs; the order here fixes the key order of the
# resulting mapping.
_CAPABILITY_PROFILE_ASSIGNMENTS: tuple[tuple[dict[str, object], tuple[str, ...]], ...] = (
    (
        _READ_ONLY_CAPABILITY_PROFILE,
        (
            "get_tasks",
            "get_schedule_day",
            "resolve_time_expression",
            "search_knowledge",
            "hybrid_search",
            "get_knowledge_graph_context",
            "get_forum_posts",
            "scan_forum_for_instructions",
        ),
    ),
    (
        _EXTERNAL_LOOKUP_CAPABILITY_PROFILE,
        ("web_search",),
    ),
    (
        _LOCAL_WRITE_CAPABILITY_PROFILE,
        (
            "create_task",
            "update_task_status",
            "create_todo",
            "create_schedule_task",
            "create_reminder",
            "create_goal",
            "create_forum_post",
            "build_knowledge_graph",
        ),
    ),
)

# Every tool gets its own copy of the profile dict, so editing one entry can
# never leak into another tool that shares the same profile.
_CAPABILITY_METADATA_BY_TOOL_NAME: dict[str, dict[str, object]] = {
    tool_name: dict(profile)
    for profile, tool_names in _CAPABILITY_PROFILE_ASSIGNMENTS
    for tool_name in tool_names
}
|
||||
|
||||
# One CapabilityManifest per name in _capability_tool_names; the tool name
# doubles as the capability id.
# NOTE(review): a tool with no entry in _CAPABILITY_METADATA_BY_TOOL_NAME gets
# an empty metadata dict here and therefore the CapabilityManifest field
# defaults - confirm that is acceptable for any state-changing tool that is
# added without metadata.
BUILTIN_CAPABILITY_MANIFESTS: tuple[CapabilityManifest, ...] = tuple(
    CapabilityManifest(
        capability_id=tool_name,
        tool_name=tool_name,
        # dict(...) copies the entry so the manifest never shares the table's dict.
        **dict(_CAPABILITY_METADATA_BY_TOOL_NAME.get(tool_name, {})),
    )
    for tool_name in _capability_tool_names
)
|
||||
|
||||
@@ -1,6 +1,21 @@
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class PermissionClass(str, Enum):
    """Permission category declared for a capability.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    READ = "read"
    WRITE = "write"
    EXTERNAL = "external"
|
||||
|
||||
|
||||
class SideEffectScope(str, Enum):
    """Where a capability's side effects are declared to land.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    NONE = "none"
    LOCAL_STATE = "local_state"
    DB_WRITE = "db_write"
    NETWORK = "network"
|
||||
|
||||
|
||||
class AgentManifest(BaseModel):
|
||||
agent_id: str
|
||||
display_name: str
|
||||
@@ -23,6 +38,12 @@ class SubCommanderManifest(BaseModel):
|
||||
class CapabilityManifest(BaseModel):
    """Registry entry describing one tool-backed capability.

    Only ``capability_id`` and ``tool_name`` are required. Every metadata
    field has a default, so a manifest created without metadata reads as a
    READ capability with no side effects and with all boolean flags off.
    """

    capability_id: str
    tool_name: str
    # Safety/behaviour metadata; the defaults below apply when omitted.
    permission_class: PermissionClass = PermissionClass.READ
    side_effect_scope: SideEffectScope = SideEffectScope.NONE
    supports_retry: bool = False
    idempotent: bool = False
    safe_for_parallel_use: bool = False
    requires_confirmation: bool = False
|
||||
|
||||
|
||||
class SpecialistTemplateManifest(BaseModel):
|
||||
|
||||
10
backend/app/agents/schemas/__init__.py
Normal file
10
backend/app/agents/schemas/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from app.agents.schemas.event import AgentEvent
|
||||
from app.agents.schemas.task import AgentTask, TaskResult, TaskLifecycleStatus, VerificationStatus
|
||||
|
||||
__all__ = [
|
||||
"AgentEvent",
|
||||
"AgentTask",
|
||||
"TaskLifecycleStatus",
|
||||
"TaskResult",
|
||||
"VerificationStatus",
|
||||
]
|
||||
28
backend/app/agents/schemas/event.py
Normal file
28
backend/app/agents/schemas/event.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
AgentEventType = Literal[
|
||||
"agent.tool.start",
|
||||
"agent.tool.result",
|
||||
"agent.verify.started",
|
||||
"agent.verify.completed",
|
||||
"agent.error",
|
||||
]
|
||||
AgentEventSeverity = Literal["info", "warning", "error"]
|
||||
|
||||
|
||||
class AgentEvent(BaseModel):
    """Structured record of a single agent runtime event.

    ``event_type`` and ``severity`` are restricted to the ``AgentEventType``
    and ``AgentEventSeverity`` literals declared in this module. All context
    identifiers are optional.
    """

    event_id: str
    event_type: AgentEventType
    # Timezone-aware UTC time taken when the event object is created.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    conversation_id: str | None = None
    agent_id: str | None = None
    sub_commander_id: str | None = None
    task_id: str | None = None
    # Free-form event details; each instance gets its own empty dict by default.
    payload: dict[str, Any] = Field(default_factory=dict)
    severity: AgentEventSeverity = "info"
|
||||
32
backend/app/agents/schemas/task.py
Normal file
32
backend/app/agents/schemas/task.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
TaskLifecycleStatus = Literal["pending", "in_progress", "completed", "failed", "blocked"]
|
||||
VerificationStatus = Literal["passed", "failed", "skipped"]
|
||||
|
||||
|
||||
class AgentTask(BaseModel):
    """Schema for a unit of agent work and its lifecycle status.

    Only ``task_id`` and ``title`` are required; ``status`` starts as
    ``"pending"`` and must be one of the ``TaskLifecycleStatus`` literals.
    """

    task_id: str
    title: str
    status: TaskLifecycleStatus = "pending"
    owner_agent_id: str | None = None
    role: str | None = None
    goal: str | None = None
    # Both evidence lists default to a fresh empty list per instance.
    expected_evidence: list[dict[str, Any]] = Field(default_factory=list)
    evidence: list[dict[str, Any]] = Field(default_factory=list)
    result_summary: str | None = None
    # Each timestamp has its own factory call, so the two defaults are taken
    # separately and are not guaranteed to be equal on a new task.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class TaskResult(BaseModel):
    """Outcome reported for a task.

    ``status`` is required and must be one of the ``VerificationStatus``
    literals ("passed", "failed", "skipped").
    """

    task_id: str
    status: VerificationStatus
    summary: str | None = None
    # Defaults to a fresh empty list per instance.
    evidence: list[dict[str, Any]] = Field(default_factory=list)
    output_data: dict[str, Any] | None = None
|
||||
@@ -1,7 +1,9 @@
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Annotated, Any, TypedDict
|
||||
from typing import Annotated, Any, Literal, TypedDict
|
||||
|
||||
from app.agents.schemas.event import AgentEvent
|
||||
from app.agents.schemas.task import AgentTask, TaskResult, VerificationStatus
|
||||
from langchain_core.messages import BaseMessage
|
||||
from langgraph.graph.message import add_messages
|
||||
|
||||
@@ -27,6 +29,7 @@ class AgentState(TypedDict):
|
||||
user_id: str
|
||||
conversation_id: str
|
||||
|
||||
execution_mode: Literal["direct", "delegated", "verified"]
|
||||
current_agent: str | None
|
||||
next_step: str | None
|
||||
active_agents: list[AgentRole]
|
||||
@@ -34,14 +37,24 @@ class AgentState(TypedDict):
|
||||
active_sub_commanders: list[str]
|
||||
sub_commander_trace: list[dict[str, Any]]
|
||||
agent_trace: list[str]
|
||||
event_trace: list[AgentEvent | dict[str, Any]]
|
||||
|
||||
pending_tasks: list[dict[str, Any]]
|
||||
completed_tasks: list[dict[str, Any]]
|
||||
active_tasks: list[AgentTask | dict[str, Any]]
|
||||
task_results: list[TaskResult | dict[str, Any]]
|
||||
tool_calls: list[dict[str, Any]]
|
||||
last_tool_result: str | None
|
||||
action_results: list[dict[str, Any]]
|
||||
created_entities: list[dict[str, Any]]
|
||||
tool_outcomes: list[dict[str, Any]]
|
||||
task_result_summary: dict[str, Any] | None
|
||||
verifier_hints: dict[str, Any] | None
|
||||
|
||||
verification_status: VerificationStatus | None
|
||||
verification_summary: str | None
|
||||
verification_evidence: list[dict[str, Any]]
|
||||
budget_state: dict[str, Any] | None
|
||||
|
||||
tool_strategy_used: str | None
|
||||
tool_round_count: int
|
||||
@@ -89,6 +102,7 @@ def initial_state(user_id: str, conversation_id: str) -> AgentState:
|
||||
messages=[],
|
||||
user_id=user_id,
|
||||
conversation_id=conversation_id,
|
||||
execution_mode="direct",
|
||||
current_agent=AgentRole.MASTER.value,
|
||||
next_step=None,
|
||||
active_agents=[AgentRole.MASTER],
|
||||
@@ -96,13 +110,22 @@ def initial_state(user_id: str, conversation_id: str) -> AgentState:
|
||||
active_sub_commanders=[],
|
||||
sub_commander_trace=[],
|
||||
agent_trace=[AgentRole.MASTER.value],
|
||||
event_trace=[],
|
||||
pending_tasks=[],
|
||||
completed_tasks=[],
|
||||
active_tasks=[],
|
||||
task_results=[],
|
||||
tool_calls=[],
|
||||
last_tool_result=None,
|
||||
action_results=[],
|
||||
created_entities=[],
|
||||
tool_outcomes=[],
|
||||
task_result_summary=None,
|
||||
verifier_hints=None,
|
||||
verification_status=None,
|
||||
verification_summary=None,
|
||||
verification_evidence=[],
|
||||
budget_state=None,
|
||||
tool_strategy_used=None,
|
||||
tool_round_count=0,
|
||||
max_tool_rounds=2,
|
||||
|
||||
60
backend/app/agents/verifier.py
Normal file
60
backend/app/agents/verifier.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.agents.schemas.task import AgentTask, TaskResult, VerificationStatus
|
||||
from app.agents.state import AgentState
|
||||
|
||||
|
||||
class VerificationVerdict(BaseModel):
    """Result of verifying a task: a status, an optional summary and evidence."""

    status: VerificationStatus
    summary: str | None = None
    # Defaults to a fresh empty list per instance.
    evidence: list[dict[str, Any]] = Field(default_factory=list)
|
||||
|
||||
|
||||
def verify_task_result(
    *,
    task: AgentTask | dict[str, Any] | None = None,
    result: TaskResult | dict[str, Any] | None = None,
    summary: str | None = None,
    evidence: list[dict[str, Any]] | None = None,
    status: VerificationStatus | None = None,
) -> VerificationVerdict:
    """Derive a verification verdict from whatever inputs are available.

    Summary and evidence are each taken from the first truthy source among
    the explicit argument, ``result`` and ``task``. The status is decided in
    this order:

    1. an explicit ``status`` argument;
    2. ``result["status"]`` when it is "passed", "failed" or "skipped";
    3. ``result["success"]`` when it is exactly ``True`` or ``False``;
    4. "skipped" when there is a summary or evidence but no outcome signal;
    5. otherwise "failed" with the summary "No verification input available.".

    Args:
        task: Optional task (model or plain dict) providing fallback
            ``result_summary`` and ``evidence``.
        result: Optional result (model or plain dict) providing ``status``,
            ``success``, ``summary`` and ``evidence``.
        summary: Explicit summary; takes precedence when truthy.
        evidence: Explicit evidence list; takes precedence when truthy.
        status: Explicit status that bypasses inference.

    Returns:
        The resulting ``VerificationVerdict``.
    """
    normalized_result = result.model_dump() if isinstance(result, TaskResult) else dict(result or {})
    normalized_task = task.model_dump() if isinstance(task, AgentTask) else dict(task or {})

    raw_summary = summary or normalized_result.get("summary") or normalized_task.get("result_summary")
    # Plain-dict results may carry a non-string summary (for example a list of
    # per-tool summaries). The verdict's summary field is ``str | None``, so
    # such values are coerced here instead of failing model validation.
    if raw_summary is None or isinstance(raw_summary, str):
        normalized_summary = raw_summary
    else:
        normalized_summary = str(raw_summary) if raw_summary else None

    normalized_evidence = list(evidence or normalized_result.get("evidence") or normalized_task.get("evidence") or [])

    if status is not None:
        return VerificationVerdict(status=status, summary=normalized_summary, evidence=normalized_evidence)

    # Tuple membership compares with ==, so an unhashable status value in a
    # plain-dict result is ignored rather than raising TypeError.
    if normalized_result.get("status") in ("passed", "failed", "skipped"):
        inferred_status = normalized_result["status"]
    elif normalized_result.get("success") is True:
        inferred_status = "passed"
    elif normalized_result.get("success") is False:
        inferred_status = "failed"
    elif normalized_summary or normalized_evidence:
        # Something was reported, but nothing says whether it succeeded.
        inferred_status = "skipped"
    else:
        inferred_status = "failed"
        normalized_summary = "No verification input available."

    return VerificationVerdict(
        status=inferred_status,
        summary=normalized_summary,
        evidence=normalized_evidence,
    )
|
||||
|
||||
|
||||
def apply_verification_verdict(state: AgentState, verdict: VerificationVerdict) -> AgentState:
    """Return a copy of ``state`` carrying the verdict's verification fields.

    The input ``state`` is not modified. The copy is shallow, and
    ``verification_evidence`` receives a new list built from the verdict's
    evidence.
    """
    merged = {
        **state,
        "verification_status": verdict.status,
        "verification_summary": verdict.summary,
        "verification_evidence": list(verdict.evidence),
    }
    return AgentState(**merged)
|
||||
|
||||
|
||||
__all__ = ["VerificationVerdict", "apply_verification_verdict", "verify_task_result"]
|
||||
@@ -8,6 +8,7 @@ import app.agents.graph as graph_module
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
|
||||
from app.agents.graph import (
|
||||
_build_verifier_hints,
|
||||
_choose_sub_commander,
|
||||
_execute_tool_calls,
|
||||
_parse_json_action,
|
||||
@@ -29,29 +30,46 @@ def _base_state(message: str, user_llm_config: dict | None = None) -> dict:
|
||||
'messages': [HumanMessage(content=message)],
|
||||
'user_id': 'u1',
|
||||
'conversation_id': 'c1',
|
||||
'current_agent': AgentRole.MASTER,
|
||||
'execution_mode': 'direct',
|
||||
'current_agent': AgentRole.MASTER.value,
|
||||
'next_step': None,
|
||||
'active_agents': [AgentRole.MASTER],
|
||||
'current_sub_commander': None,
|
||||
'active_sub_commanders': [],
|
||||
'sub_commander_trace': [],
|
||||
'agent_trace': [AgentRole.MASTER.value],
|
||||
'event_trace': [],
|
||||
'pending_tasks': [],
|
||||
'completed_tasks': [],
|
||||
'active_tasks': [],
|
||||
'task_results': [],
|
||||
'tool_calls': [],
|
||||
'last_tool_result': None,
|
||||
'action_results': [],
|
||||
'created_entities': [],
|
||||
'tool_outcomes': [],
|
||||
'task_result_summary': None,
|
||||
'verifier_hints': None,
|
||||
'verification_status': None,
|
||||
'verification_summary': None,
|
||||
'verification_evidence': [],
|
||||
'budget_state': None,
|
||||
'tool_strategy_used': None,
|
||||
'tool_round_count': 0,
|
||||
'max_tool_rounds': 2,
|
||||
'retry_count': 0,
|
||||
'max_retries': 1,
|
||||
'iteration_count': 0,
|
||||
'max_iterations': 3,
|
||||
'routing_hops': 0,
|
||||
'max_routing_hops': 2,
|
||||
'terminated_due_to_loop_guard': False,
|
||||
'retrieval_trace': [],
|
||||
'stop_reason': None,
|
||||
'clarification_needed': False,
|
||||
'clarification_question': None,
|
||||
'provider_capabilities': None,
|
||||
'fallback_parse_error': None,
|
||||
'should_respond': True,
|
||||
'knowledge_context': None,
|
||||
'graph_context': None,
|
||||
'schedule_context_summary': None,
|
||||
@@ -59,11 +77,17 @@ def _base_state(message: str, user_llm_config: dict | None = None) -> dict:
|
||||
'plan_steps': [],
|
||||
'analysis_report': None,
|
||||
'final_response': None,
|
||||
'should_respond': True,
|
||||
'memory_context': None,
|
||||
'current_datetime_context': 'CURRENT_TIME: 2026-03-28T12:00:00+08:00',
|
||||
'current_datetime_reference': {'current_time_iso': '2026-03-28T12:00:00+08:00', 'current_date_iso': '2026-03-28', 'timezone': 'UTC'},
|
||||
'turn_context': None,
|
||||
'routing_decision': None,
|
||||
'continuity_state': None,
|
||||
'pending_action': None,
|
||||
'last_completed_action': None,
|
||||
'clarification_context': None,
|
||||
'user_llm_config': user_llm_config,
|
||||
'provider_capabilities': None,
|
||||
}
|
||||
|
||||
|
||||
@@ -258,6 +282,7 @@ def test_initial_state_sets_structured_continuity_defaults():
|
||||
assert state['pending_action'] is None
|
||||
assert state['last_completed_action'] is None
|
||||
assert state['clarification_context'] is None
|
||||
assert state['event_trace'] == []
|
||||
assert state['tool_outcomes'] == []
|
||||
|
||||
|
||||
@@ -322,6 +347,7 @@ async def test_planner_node_clears_next_step_after_consuming_routed_turn(monkeyp
|
||||
assert result['final_response'] is not None
|
||||
|
||||
|
||||
async def test_master_node_returns_stable_reply_for_simple_greeting(monkeypatch):
|
||||
monkeypatch.setattr('app.agents.graph._get_llm_for_state', lambda state: FailIfCalledLLM())
|
||||
|
||||
state = {
|
||||
@@ -1062,8 +1088,147 @@ async def test_master_node_returns_stable_reply_for_capability_question(monkeypa
|
||||
assert getattr(result['messages'][-1], 'content', '') == result['final_response']
|
||||
|
||||
|
||||
def test_choose_sub_commander_routes_schedule_requests_to_schedule_planning():
|
||||
assert _choose_sub_commander(AgentRole.SCHEDULE_PLANNER, '帮我安排一下这周计划') == 'schedule_planning'
|
||||
def test_build_verifier_hints_uses_capability_metadata():
    """Hints for ``create_reminder`` carry the expected capability metadata."""
    state = _base_state('明天提醒我开会')

    hints = _build_verifier_hints(state, 'create_reminder', '提醒创建成功')

    assert hints['tool_name'] == 'create_reminder'
    assert hints['permission_class'] == 'write'
    assert hints['side_effect_scope'] == 'local_state'
    assert hints['requires_confirmation'] is True
    assert hints['supports_retry'] is False
    assert hints['safe_for_parallel_use'] is False
    # The preview is checked by containment, not equality.
    assert '提醒创建成功' in hints['result_preview']
|
||||
|
||||
|
||||
async def test_execute_tool_calls_records_schema_events_and_aggregate_summaries(monkeypatch):
    """A single successful tool call populates hints, summaries and the event trace."""
    tool = FakeTool('create_reminder', '提醒创建成功: 开会 @ 2026-03-29 09:00')
    state = _base_state('test')

    normalized_calls, tool_result, created_entities, tool_messages = await _execute_tool_calls(
        [{'id': 'task-1', 'name': 'create_reminder', 'args': {'title': '开会', 'reminder_at': '2026-03-29T09:00:00'}}],
        [tool],
        state,
    )

    # Direct return values.
    assert normalized_calls[0]['name'] == 'create_reminder'
    assert tool_result.startswith('[create_reminder]')
    assert created_entities == [{'type': 'reminder', 'tool': 'create_reminder'}]
    assert len(tool_messages) == 1

    # Verifier hints recorded on the state.
    assert state['verifier_hints'] == {
        'tools': [
            {
                'tool_name': 'create_reminder',
                'permission_class': 'write',
                'side_effect_scope': 'local_state',
                'requires_confirmation': True,
                'supports_retry': False,
                'safe_for_parallel_use': False,
                'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
            }
        ]
    }

    # Aggregate summary; the full-equality check covers tool_count and
    # created_entity_types, so they are not asserted separately.
    assert state['task_result_summary'] == {
        'tool_count': 1,
        'tools': [
            {
                'tool_name': 'create_reminder',
                'result_preview': '提醒创建成功: 开会 @ 2026-03-29 09:00',
                'created_entity_types': ['reminder'],
                'created_count': 1,
            }
        ],
        'created_count': 1,
        'created_entity_types': ['reminder'],
        'stop_reason': None,
    }
    assert state['action_results'][-1] == state['task_result_summary']
    assert state['tool_outcomes'][0]['tool_name'] == 'create_reminder'

    # Event trace: exactly a start event followed by a result event.
    assert [event['event_type'] for event in state['event_trace']] == [
        'agent.tool.start',
        'agent.tool.result',
    ]
    assert state['event_trace'][-1]['payload']['verification']['tool_name'] == 'create_reminder'
    assert all('event_id' in event for event in state['event_trace'])
    assert all('timestamp' in event for event in state['event_trace'])
    assert all(event['conversation_id'] == 'c1' for event in state['event_trace'])
    assert all(event['agent_id'] == AgentRole.MASTER.value for event in state['event_trace'])
    assert all(event['task_id'] == 'task-1' for event in state['event_trace'])
|
||||
|
||||
|
||||
async def test_execute_tool_calls_aggregates_multiple_tool_turns_without_overwrite(monkeypatch):
    """Run a reminder call and a search call in one batch and check nothing is overwritten.

    Verifier hints, the task result summary, tool outcomes and the event trace
    must each carry one entry per executed tool, in call order.
    """
    reminder_preview = '提醒创建成功: 开会 @ 2026-03-29 09:00'
    search_preview = '成功搜索到 2 条网页结果'
    available_tools = [
        FakeTool('create_reminder', reminder_preview),
        FakeTool('web_search', search_preview),
    ]
    state = _base_state('test')
    requested_calls = [
        {'id': 'task-1', 'name': 'create_reminder', 'args': {'title': '开会', 'reminder_at': '2026-03-29T09:00:00'}},
        {'id': 'task-2', 'name': 'web_search', 'args': {'query': 'Jarvis 最新模型更新'}},
    ]

    normalized_calls, tool_result, created_entities, tool_messages = await _execute_tool_calls(
        requested_calls,
        available_tools,
        state,
    )

    tool_names_in_order = ['create_reminder', 'web_search']

    # Governance metadata expected for each tool, alongside its result preview.
    expected_hints = {
        'tools': [
            {
                'tool_name': 'create_reminder',
                'permission_class': 'write',
                'side_effect_scope': 'local_state',
                'requires_confirmation': True,
                'supports_retry': False,
                'safe_for_parallel_use': False,
                'result_preview': reminder_preview,
            },
            {
                'tool_name': 'web_search',
                'permission_class': 'external',
                'side_effect_scope': 'network',
                'requires_confirmation': False,
                'supports_retry': True,
                'safe_for_parallel_use': True,
                'result_preview': search_preview,
            },
        ]
    }

    # Per-tool summaries plus the totals aggregated across the whole batch.
    expected_summary = {
        'tool_count': 2,
        'tools': [
            {
                'tool_name': 'create_reminder',
                'result_preview': reminder_preview,
                'created_entity_types': ['reminder'],
                'created_count': 1,
            },
            {
                'tool_name': 'web_search',
                'result_preview': search_preview,
                'created_entity_types': [],
                'created_count': 0,
            },
        ],
        'created_count': 1,
        'created_entity_types': ['reminder'],
        'stop_reason': None,
    }

    assert [call['name'] for call in normalized_calls] == tool_names_in_order
    assert tool_result == '[create_reminder] 提醒创建成功: 开会 @ 2026-03-29 09:00\n[web_search] 成功搜索到 2 条网页结果'
    assert created_entities == [{'type': 'reminder', 'tool': 'create_reminder'}]
    assert [message.name for message in tool_messages] == tool_names_in_order
    assert state['verifier_hints'] == expected_hints
    assert state['task_result_summary'] == expected_summary
    assert len(state['tool_outcomes']) == 2

    # One start/result pair per tool, emitted in execution order.
    recorded_event_types = [event['event_type'] for event in state['event_trace']]
    assert recorded_event_types == ['agent.tool.start', 'agent.tool.result'] * 2
|
||||
|
||||
|
||||
def test_choose_sub_commander_routes_focus_requests_to_schedule_analysis():
|
||||
|
||||
@@ -5,11 +5,13 @@ from app.agents.prompts import (
|
||||
SUB_COMMANDER_PROMPTS_BY_KEY,
|
||||
TOP_LEVEL_SYSTEM_PROMPTS_BY_KEY,
|
||||
)
|
||||
from app.agents.registry import build_registry_indexes, load_builtin_registry_bundle
|
||||
from app.agents.registry import build_registry_indexes, load_builtin_registry_bundle, load_builtin_registry_indexes
|
||||
from app.agents.registry.indexes import summarize_registry_indexes
|
||||
from app.agents.registry.models import (
|
||||
AgentManifest,
|
||||
CapabilityManifest,
|
||||
PermissionClass,
|
||||
SideEffectScope,
|
||||
SpecialistTemplateManifest,
|
||||
SubCommanderManifest,
|
||||
)
|
||||
@@ -251,17 +253,34 @@ def test_builtin_capabilities_reference_actual_runtime_tool_names() -> None:
|
||||
assert manifest_tool_names == expected_tool_names
|
||||
|
||||
|
||||
def test_builtin_sub_commander_capabilities_match_runtime_toolsets() -> None:
    """Each builtin sub-commander must declare exactly the capabilities of its runtime toolset."""
    # NOTE(review): the diff view renders this test on the removed side of the hunk.
    # It is kept here so the manifest/toolset consistency check is not lost -
    # confirm whether it should be retained before merging.
    capabilities_by_tool_name = {
        manifest.tool_name: manifest.capability_id for manifest in BUILTIN_CAPABILITY_MANIFESTS
    }

    for sub_commander in BUILTIN_SUB_COMMANDER_MANIFESTS:
        expected_capability_ids = {
            capabilities_by_tool_name[tool.name]
            for tool in SUB_COMMANDER_TOOLSETS[sub_commander.sub_commander_id]
        }
        assert set(sub_commander.capability_ids) == expected_capability_ids


def test_builtin_capability_metadata_distinguishes_read_and_write_surfaces() -> None:
    """Builtin capability manifests must carry distinct governance metadata per surface.

    Checks one representative read tool (``get_tasks``), one write tool
    (``create_reminder``) and one external tool (``web_search``).
    """
    capability_by_id = {manifest.capability_id: manifest for manifest in BUILTIN_CAPABILITY_MANIFESTS}

    get_tasks = capability_by_id["get_tasks"]
    assert get_tasks.permission_class == PermissionClass.READ
    assert get_tasks.side_effect_scope == SideEffectScope.NONE
    assert get_tasks.supports_retry is True
    assert get_tasks.idempotent is True
    assert get_tasks.safe_for_parallel_use is True
    assert get_tasks.requires_confirmation is False

    create_reminder = capability_by_id["create_reminder"]
    assert create_reminder.permission_class == PermissionClass.WRITE
    assert create_reminder.side_effect_scope == SideEffectScope.LOCAL_STATE
    assert create_reminder.supports_retry is False
    assert create_reminder.idempotent is False
    assert create_reminder.safe_for_parallel_use is False
    assert create_reminder.requires_confirmation is True

    web_search = capability_by_id["web_search"]
    assert web_search.permission_class == PermissionClass.EXTERNAL
    assert web_search.side_effect_scope == SideEffectScope.NETWORK
    # Pin the remaining governance flags too, matching the values the graph-level
    # verifier-hint expectations use for web_search, so the external surface is
    # checked as thoroughly as the read and write ones.
    # NOTE(review): `idempotent` is left unasserted for web_search - its intended
    # value is not established anywhere visible; confirm and add.
    assert web_search.supports_retry is True
    assert web_search.safe_for_parallel_use is True
    assert web_search.requires_confirmation is False
|
||||
|
||||
|
||||
def test_load_builtin_registry_indexes_is_cached_and_matches_bundle_indexes() -> None:
    """Repeated loads return the same object, and it agrees with freshly built indexes."""
    first_load = load_builtin_registry_indexes()
    freshly_built = build_registry_indexes(load_builtin_registry_bundle())
    second_load = load_builtin_registry_indexes()

    # Identity, not equality: the loader must hand back the very same instance.
    assert first_load is second_load
    assert first_load.capability_id_by_tool_name == freshly_built.capability_id_by_tool_name

    reminder_capability = first_load.capability_by_id["create_reminder"]
    assert reminder_capability.requires_confirmation is True
|
||||
|
||||
|
||||
def test_builtin_manifests_form_a_valid_registry_bundle() -> None:
|
||||
|
||||
66
backend/tests/backend/app/agents/test_schema_verifier.py
Normal file
66
backend/tests/backend/app/agents/test_schema_verifier.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from app.agents.schemas.event import AgentEvent
|
||||
from app.agents.schemas.task import AgentTask
|
||||
from app.agents.verifier import verify_task_result
|
||||
|
||||
|
||||
def test_agent_task_accepts_day1_fields():
    """AgentTask accepts the Day 1 field set and exposes the values it was given."""
    task = AgentTask(
        task_id="task-1",
        title="Verify foundation",
        status="in_progress",
        owner_agent_id="executor",
        role="verifier",
        goal="check output",
        expected_evidence=[{"type": "assertion"}],
        evidence=[{"type": "log"}],
        result_summary="running",
    )

    # (attribute name, expected value) pairs, checked in a fixed order.
    expectations = (
        ("task_id", "task-1"),
        ("owner_agent_id", "executor"),
        ("status", "in_progress"),
        ("expected_evidence", [{"type": "assertion"}]),
        ("evidence", [{"type": "log"}]),
        ("result_summary", "running"),
    )
    for attribute, expected in expectations:
        assert getattr(task, attribute) == expected
|
||||
|
||||
|
||||
def test_agent_event_accepts_day1_fields():
    """AgentEvent accepts the Day 1 field set and exposes the values it was given."""
    event = AgentEvent(
        event_id="evt-1",
        event_type="agent.verify.completed",
        conversation_id="conv-1",
        agent_id="executor",
        sub_commander_id="executor_tasks",
        task_id="task-1",
        payload={"status": "passed"},
        severity="info",
    )

    # (attribute name, expected value) pairs, checked in a fixed order.
    expectations = (
        ("event_id", "evt-1"),
        ("event_type", "agent.verify.completed"),
        ("conversation_id", "conv-1"),
        ("payload", {"status": "passed"}),
        ("severity", "info"),
    )
    for attribute, expected in expectations:
        assert getattr(event, attribute) == expected
|
||||
|
||||
|
||||
def test_verifier_verdict_is_separate_from_task_lifecycle_status():
    """A blocked task yields a 'skipped' verdict that carries the task's result summary."""
    blocked_task = AgentTask(
        task_id="task-1",
        title="Verify",
        status="blocked",
        result_summary="waiting",
    )

    outcome = verify_task_result(task=blocked_task)

    # The verdict status is not the task's lifecycle status ("blocked").
    assert outcome.status == "skipped"
    assert outcome.summary == "waiting"
|
||||
|
||||
|
||||
def test_verifier_prefers_explicit_result_success_signal():
    """A result dict with ``success: True`` produces a 'passed' verdict with its summary."""
    explicit_result = {"success": True, "summary": "all checks passed"}

    outcome = verify_task_result(result=explicit_result)

    assert outcome.status == "passed"
    assert outcome.summary == "all checks passed"
|
||||
|
||||
|
||||
def test_verifier_fails_when_no_verification_input_exists():
    """Calling the verifier with no arguments yields a 'failed' verdict with a fixed message."""
    outcome = verify_task_result()

    assert outcome.status == "failed"
    assert outcome.summary == "No verification input available."
|
||||
@@ -0,0 +1,102 @@
|
||||
# Jarvis Agents 2 天工作计划(可勾选执行版)
|
||||
|
||||
日期:2026-04-03
|
||||
状态:执行清单
|
||||
适用范围:基于 `phase-0` ~ `phase-4` 及现有 2 天融合方案整理
|
||||
|
||||
---
|
||||
|
||||
## 使用说明
|
||||
|
||||
- 完成前使用 `- [ ]`
|
||||
- 完成后改成 `- [x]`
|
||||
- Day 2 默认在 Day 1 的核心底座完成之后再推进
|
||||
|
||||
---
|
||||
|
||||
## Day 1:补底座,完成 Phase 1 最小闭环
|
||||
|
||||
Day 1 目标:先把 Jarvis 从“只有静态路由”补成“有任务结构、有事件结构、有 verifier、有工具治理信息”的可扩展底座,同时不破坏当前 direct 主路径。
|
||||
|
||||
- [x] 新增最小 `task schema`
|
||||
改造内容:新增 `backend/app/agents/schemas/task.py`,统一 `task_id`、`title`、`status`、`owner_agent_id`、`evidence`、`result_summary`,并补 `role`、`goal`、`expected_evidence`、`created_at`、`updated_at`;状态固定为 `pending`、`in_progress`、`completed`、`failed`、`blocked`。
|
||||
|
||||
- [x] 新增最小 `event schema`
|
||||
改造内容:新增 `backend/app/agents/schemas/event.py`,统一 `event_id`、`event_type`、`timestamp`、`conversation_id`、`agent_id`、`sub_commander_id`、`task_id`、`payload`、`severity`;首批事件类型覆盖 `agent.tool.start`、`agent.tool.result`、`agent.verify.started`、`agent.verify.completed`、`agent.error`。
|
||||
|
||||
- [x] 扩展 `backend/app/agents/state.py` 的运行时字段
|
||||
改造内容:新增 `execution_mode`、`verification_status`、`verification_summary`、`verification_evidence`、`active_tasks`、`task_results`、`event_trace`、`budget_state`;默认值保持兼容 `initial_state()`,不替换现有 `pending_tasks`、`completed_tasks`、`tool_calls`。
|
||||
|
||||
- [x] 扩展 capability / tool metadata 模型
|
||||
改造内容:在 `backend/app/agents/registry/models.py` 增加 `permission_class`、`side_effect_scope`、`supports_retry`、`idempotent`、`safe_for_parallel_use`、`requires_confirmation`;至少先固化 `read` / `write` / `external` 和 `none` / `local_state` / `db_write` / `network` 两组枚举语义。
|
||||
|
||||
- [x] 回填 builtin tools 的静态 metadata
|
||||
改造内容:在 `backend/app/agents/registry/builtins.py` 和需要的 `backend/app/agents/tools/__init__.py` 中,把 search / retrieval 类工具标成偏 `read`,create / update 类工具标成偏 `write`,外部检索类工具标成 `external`,并补充是否可重试、是否幂等、是否适合并行等标记。
|
||||
|
||||
- [x] 新增 verifier 角色定义
|
||||
改造内容:在 `backend/app/agents/prompts.py` 增加 verifier prompt,明确 verifier 只负责验收,不负责重新规划;验收点聚焦“是否真正满足请求”“是否有明确证据”“是否把失败伪装成成功”。
|
||||
|
||||
- [x] 落地 verifier 模块
|
||||
改造内容:新增 `backend/app/agents/verifier.py`,支持 `passed`、`failed`、`skipped` 三类最小结论,先服务于工具调用后的复杂输出、知识检索结果和分析型汇总输出,不接管纯闲聊路径。
|
||||
|
||||
- [x] 在 `backend/app/agents/graph.py` 接入最小 event trace 与 verifier helper
|
||||
改造内容:给 `_execute_tool_calls()` 增加 tool start / result / error 事件写入;给收尾阶段增加 verifier helper 调用;给 `_run_sub_commander()` 增加 task result 摘要写入,但暂时不重构主图为完整协作编排图。
|
||||
|
||||
- [x] 补 Phase 1 单元测试与回归测试
|
||||
改造内容:新增 `backend/tests/backend/app/agents/test_schema_verifier.py`(合并覆盖 task / event schema 与 verifier 测试),并扩展 `test_graph.py`,覆盖 state 兼容性、schema 合法性、tool metadata 存在性、verifier 判定、主流程不回退。
|
||||
|
||||
- [x] 完成 Day 1 验收
|
||||
改造内容:确认 reminder / task / search 主流程继续通过;确认 verifier 已能独立运行;确认 event schema 与 task schema 已落代码;确认 direct 仍是默认主路径;确认未引入动态 `create_agent`、message bus 全链路和 UI。
|
||||
|
||||
---
|
||||
|
||||
## Day 2:引入最小协作能力,完成 Phase 2 雏形
|
||||
|
||||
Day 2 目标:在 Day 1 底座稳定的基础上,给 Jarvis 增加“复杂请求可拆分、可分配、可回收、可验收”的最小受控协作能力,但仍然不进入自由 swarm。
|
||||
|
||||
- [ ] 增加 `request_mode_selector`
|
||||
改造内容:在 `backend/app/agents/graph.py` 中增加 direct / collaboration 模式选择逻辑;简单请求继续走旧路径,只有明显多步骤、跨领域、需要多角色配合的请求才进入 collaboration mode。
|
||||
|
||||
- [ ] 新增 coordinator prompt
|
||||
改造内容:在 `backend/app/agents/prompts.py` 中定义 coordinator 角色,职责限定为“判断是否拆解”“输出 2~4 个清晰子任务”“分配角色建议”“汇总任务结果”;明确禁止无限递归拆分。
|
||||
|
||||
- [ ] 新增最小 task decomposition 结构
|
||||
改造内容:基于 Day 1 的 task schema 扩展最小拆分结构,至少输出 `task_id`、`title`、`role`、`goal`、`expected_evidence`,让复杂请求能以结构化任务列表进入后续执行。
|
||||
|
||||
- [ ] 增加 role -> existing agent assignment
|
||||
改造内容:先复用当前已有 top-level agent,不新增独立 worker runtime;把 schedule 类任务映射给 `schedule_planner`,retrieval 类任务映射给 `librarian`,analysis 类任务映射给 `analyst`,execution 类任务映射给 `executor`。
|
||||
|
||||
- [ ] 建立统一 task result 回收结构
|
||||
改造内容:约束每个角色统一返回 `task_id`、`status`、`summary`、`evidence`、`next_action`(可选),并把结果写回 `task_results`,避免最终结果继续依赖单点硬编码拼接。
|
||||
|
||||
- [ ] 让 verifier 强制参与协作结果收尾
|
||||
改造内容:在 collaboration mode 下,所有复杂请求返回前都必须经过 verifier;verifier 有权拒绝证据不足、结果不完整、子任务未闭环的响应。
|
||||
|
||||
- [ ] 补 Phase 2 协作测试与回归测试
|
||||
改造内容:覆盖复杂请求拆分测试、角色分配测试、task result 汇总测试、verifier 拒绝不完整结果测试,并再次确认 direct 模式原有流程不回退。
|
||||
|
||||
- [ ] 完成 Day 2 验收
|
||||
改造内容:确认 graph 已能区分 direct / collaboration;确认复杂请求可拆成 2~4 个子任务;确认每个子任务有 owner 和 evidence;确认最终答案基于 task result 汇总;确认系统仍未进入无限动态 agent 模式。
|
||||
|
||||
---
|
||||
|
||||
## 这 2 天明确不做
|
||||
|
||||
- 不做动态 `create_agent`
|
||||
- 不做 parent / child agent tree
|
||||
- 不做内部消息线程长期态管理
|
||||
- 不做可视化调试面板
|
||||
- 不做 event stream API
|
||||
- 不做 worktree / 隔离执行
|
||||
- 不做自由蜂群式协作
|
||||
|
||||
---
|
||||
|
||||
## 2 天结束后的预期状态
|
||||
|
||||
- [ ] 已具备 `direct` / `collaboration` 双模式入口
|
||||
- [ ] 已具备 verifier 独立验收层
|
||||
- [ ] 已具备 task schema / event schema / tool metadata 底座
|
||||
- [ ] 已具备 coordinator 雏形、任务拆分、角色分配、结果回收
|
||||
- [ ] 当前 reminder / task / search 主路径无明显回退
|
||||
- [ ] 后续可以继续推进 Phase 3 的受限动态协作,而不是返工 Phase 1 / Phase 2 底座
|
||||
Reference in New Issue
Block a user