fix: harden L3 runtime continuity and tool execution

Align the L3 graph, agent service, and sync tool shims on one canonical continuity contract so that clarification resumes and persisted snapshots behave consistently. Add targeted regression tests and hardening notes covering system-message coalescing, async bridge usage, and continuity rehydration.
2026-04-03 13:14:59 +08:00
parent b3f9b5e715
commit 4972b4e6b1
18 changed files with 4755 additions and 735 deletions
--- a/backend/app/agents/graph.py
+++ b/backend/app/agents/graph.py
--- a/backend/app/agents/state.py
+++ b/backend/app/agents/state.py
@@ -1,6 +1,6 @@
-from dataclasses import dataclass, field
-from typing import TypedDict, Annotated, Sequence
+from dataclasses import dataclass
 from enum import Enum
+from typing import Annotated, Any, TypedDict

 from langchain_core.messages import BaseMessage
 from langgraph.graph.message import add_messages
@@ -23,40 +23,65 @@ class ConversationTurn:


 class AgentState(TypedDict):
-    # Core message history with add_messages reducer
    messages: Annotated[list[BaseMessage], add_messages]
-    
-    # Session identifiers
    user_id: str
    conversation_id: str

-    # Agent routing state
    current_agent: str | None
-    next_step: str | None  # For explicit graph routing
-    
-    # Traceability
+    next_step: str | None
+    active_agents: list[AgentRole]
+    current_sub_commander: str | None
+    active_sub_commanders: list[str]
+    sub_commander_trace: list[dict[str, Any]]
    agent_trace: list[str]
-    
-    # Task & Entity Tracking (Business Logic)
-    pending_tasks: list[dict]
-    completed_tasks: list[dict]
-    created_entities: list[dict]

-    # Context summaries (for long-term or cross-agent context)
+    pending_tasks: list[dict[str, Any]]
+    completed_tasks: list[dict[str, Any]]
+    tool_calls: list[dict[str, Any]]
+    last_tool_result: str | None
+    action_results: list[dict[str, Any]]
+    created_entities: list[dict[str, Any]]
+    tool_outcomes: list[dict[str, Any]]
+
+    tool_strategy_used: str | None
+    tool_round_count: int
+    max_tool_rounds: int
+    retry_count: int
+    max_retries: int
+    iteration_count: int
+    max_iterations: int
+    routing_hops: int
+    max_routing_hops: int
+    terminated_due_to_loop_guard: bool
+    retrieval_trace: list[dict[str, Any]]
+    stop_reason: str | None
+
+    clarification_needed: bool
+    clarification_question: str | None
+    fallback_parse_error: str | None
+    should_respond: bool
+
    knowledge_context: str | None
+    graph_context: str | None
    schedule_context_summary: str | None
+    plan: str | None
+    plan_steps: list[dict[str, Any]]
    analysis_report: str | None
-
-    # Output control
    final_response: str | None
-    
-    # Memory & Environment
+
    memory_context: str | None
    current_datetime_context: str | None
-    
-    # Configuration
-    user_llm_config: dict | None
-    provider_capabilities: dict | None
+    current_datetime_reference: dict[str, str] | None
+
+    turn_context: dict[str, Any] | None
+    routing_decision: dict[str, Any] | None
+    continuity_state: dict[str, Any] | None
+    pending_action: dict[str, Any] | None
+    last_completed_action: dict[str, Any] | None
+    clarification_context: dict[str, Any] | None
+
+    user_llm_config: dict[str, Any] | None
+    provider_capabilities: dict[str, Any] | None


 def initial_state(user_id: str, conversation_id: str) -> AgentState:
@@ -66,16 +91,50 @@ def initial_state(user_id: str, conversation_id: str) -> AgentState:
        conversation_id=conversation_id,
        current_agent=AgentRole.MASTER.value,
        next_step=None,
+        active_agents=[AgentRole.MASTER],
+        current_sub_commander=None,
+        active_sub_commanders=[],
+        sub_commander_trace=[],
        agent_trace=[AgentRole.MASTER.value],
        pending_tasks=[],
        completed_tasks=[],
+        tool_calls=[],
+        last_tool_result=None,
+        action_results=[],
        created_entities=[],
+        tool_outcomes=[],
+        tool_strategy_used=None,
+        tool_round_count=0,
+        max_tool_rounds=2,
+        retry_count=0,
+        max_retries=1,
+        iteration_count=0,
+        max_iterations=3,
+        routing_hops=0,
+        max_routing_hops=2,
+        terminated_due_to_loop_guard=False,
+        retrieval_trace=[],
+        stop_reason=None,
+        clarification_needed=False,
+        clarification_question=None,
+        fallback_parse_error=None,
+        should_respond=True,
        knowledge_context=None,
+        graph_context=None,
        schedule_context_summary=None,
+        plan=None,
+        plan_steps=[],
        analysis_report=None,
        final_response=None,
        memory_context=None,
        current_datetime_context=None,
+        current_datetime_reference=None,
+        turn_context=None,
+        routing_decision=None,
+        continuity_state=None,
+        pending_action=None,
+        last_completed_action=None,
+        clarification_context=None,
        user_llm_config=None,
        provider_capabilities=None,
    )
--- a/backend/app/agents/tools/async_bridge.py
+++ b/backend/app/agents/tools/async_bridge.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any
+
+_executor = ThreadPoolExecutor(max_workers=4)
+
+
+def run_async(coro: Any, timeout: int = 30):  # Run `coro` to completion from synchronous code and return its result.
+    try:
+        asyncio.get_running_loop()  # Probe only: raises RuntimeError when this thread has no running event loop.
+    except RuntimeError:
+        return asyncio.run(coro)  # No running loop here: drive the coroutine directly; `timeout` is not applied on this path.
+    return _executor.submit(asyncio.run, coro).result(timeout=timeout)  # Loop already running: run on a pool thread and block the caller for at most `timeout` seconds.
+
+
+__all__ = ["run_async"]
--- a/backend/app/agents/tools/forum.py
+++ b/backend/app/agents/tools/forum.py
@@ -4,19 +4,12 @@ from langchain_core.tools import tool
 from app.database import async_session
 from app.models.forum import ForumPost, ForumReply
 from app.agents.context import get_current_user
+from app.agents.tools.async_bridge import run_async
 from sqlalchemy import select
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
-
-_executor = ThreadPoolExecutor(max_workers=4)


 def _run_async(coro, timeout: int = 30):
-    try:
-        asyncio.get_running_loop()
-    except RuntimeError:
-        return asyncio.run(coro)
-    return _executor.submit(asyncio.run, coro).result(timeout=timeout)
+    return run_async(coro, timeout=timeout)


@tool
--- a/backend/app/agents/tools/schedule.py
+++ b/backend/app/agents/tools/schedule.py
@@ -2,8 +2,6 @@

 from __future__ import annotations

-import asyncio
-from concurrent.futures import ThreadPoolExecutor
 from datetime import date, datetime
 from zoneinfo import ZoneInfo

@@ -11,21 +9,16 @@ from langchain_core.tools import tool
 from sqlalchemy import select

 from app.agents.context import get_current_user
+from app.agents.tools.async_bridge import run_async
 from app.database import async_session
 from app.models.goal import Goal, GoalStatus
 from app.models.reminder import Reminder
 from app.models.task import Task, TaskPriority, TaskStatus
 from app.models.todo import DailyTodo, TodoSource

-_executor = ThreadPoolExecutor(max_workers=4)
-

 def _run_async(coro, timeout: int = 30):
-    try:
-        asyncio.get_running_loop()
-    except RuntimeError:
-        return asyncio.run(coro)
-    return _executor.submit(asyncio.run, coro).result(timeout=timeout)
+    return run_async(coro, timeout=timeout)


 def _parse_date(value: str | None) -> date:
--- a/backend/app/agents/tools/search.py
+++ b/backend/app/agents/tools/search.py
@@ -5,25 +5,16 @@ Agent 工具集 - 知识库 & 图谱相关
 由于 LangChain 工具系统是同步的，内部用 run_in_executor 处理 async 逻辑。
 """

-from concurrent.futures import ThreadPoolExecutor
-import asyncio
-
 from langchain_core.tools import tool

 from app.agents.context import get_current_user
+from app.agents.tools.async_bridge import run_async
 from app.database import async_session

-_executor = ThreadPoolExecutor(max_workers=4)
-

 def _run_async(coro, timeout: int = 30):
    """在同步上下文中运行 async 代码"""
-    try:
-        loop = asyncio.get_running_loop()
-        future = loop.run_in_executor(_executor, lambda: asyncio.run(coro))
-        return future.result(timeout=timeout)
-    except RuntimeError:
-        return asyncio.run(coro)
+    return run_async(coro, timeout=timeout)


@tool
--- a/backend/app/agents/tools/task.py
+++ b/backend/app/agents/tools/task.py
@@ -8,21 +8,13 @@ from langchain_core.tools import tool
 from sqlalchemy import select

 from app.agents.context import get_current_user
+from app.agents.tools.async_bridge import run_async
 from app.database import async_session
 from app.models.task import Task, TaskPriority, TaskStatus

-import asyncio
-from concurrent.futures import ThreadPoolExecutor
-
-_executor = ThreadPoolExecutor(max_workers=4)
-

 def _run_async(coro, timeout: int = 30):
-    try:
-        asyncio.get_running_loop()
-    except RuntimeError:
-        return asyncio.run(coro)
-    return _executor.submit(asyncio.run, coro).result(timeout=timeout)
+    return run_async(coro, timeout=timeout)


 def _normalize_title(title: str | None, content: str | None) -> str:
--- a/backend/app/agents/tools/time_reasoning.py
+++ b/backend/app/agents/tools/time_reasoning.py
@@ -241,6 +241,10 @@ def normalize_tool_time_arguments(tool_name: str, args: dict, current_datetime_c
        if raw_value and not _is_iso_datetime(raw_value):
            payload = resolve_time_expression_data(raw_value, current_datetime_context=current_datetime_context, prefer="datetime")
            normalized["reminder_at"] = payload["resolved_datetime"]
+        raw_date = normalized.get("date")
+        if isinstance(raw_date, str) and raw_date.strip() and not _is_iso_date(raw_date):
+            payload = resolve_time_expression_data(raw_date, current_datetime_context=current_datetime_context, prefer="date")
+            normalized["date"] = payload["resolved_date"]
        return normalized

    if tool_name in {"create_schedule_task", "create_task"}: