# Files
# X-Financial/server/src/app/services/steward_planner_extraction.py
#
# 579 lines
# 26 KiB
# Python
# Raw Normal View History

from __future__ import annotations
import re
from datetime import UTC, date, datetime
from typing import Any
from app.schemas.steward import (
StewardAttachmentGroup,
StewardAttachmentInput,
StewardConfirmationAction,
StewardPlanRequest,
StewardTask,
StewardThinkingEvent,
)
from app.services.application_fact_resolver import ApplicationFactResolver
from app.services.ontology_field_registry import normalize_ontology_form_values
from app.services.steward_constants import BUSINESS_CANONICAL_FIELDS
from app.services.steward_planner_shared import (
BUSINESS_FIELD_LABELS,
CITY_NAMES,
EXPENSE_TYPE_LABELS,
PlannedTaskDraft,
REIMBURSEMENT_PATTERN,
TRANSPORT_MODE_LABELS,
)
# Mixin bundling the steward planner's extraction helpers: intent heuristics on
# the raw message, ontology-field extraction, attachment grouping, confirmation
# actions and the human-readable "thinking event" summaries.
class StewardPlannerExtractionMixin:
def _has_multiple_financial_demands(self, message: str) -> bool:
    """Return True when the message seems to carry more than one financial request.

    Three signals are tried in order:
    1. ``_extract_task_drafts`` yields more than one draft;
    2. the whitespace-stripped message looks like an application AND mentions
       a reimbursement;
    3. a connector word is present AND ``REIMBURSEMENT_PATTERN`` matches more
       than once.
    """
    if len(self._extract_task_drafts(message)) > 1:
        return True

    squeezed = re.sub(r"\s+", "", message)
    if not squeezed:
        return False

    wants_application = (
        self._looks_like_application(squeezed)
        or self._looks_like_future_travel_application(squeezed)
    )
    mentions_reimbursement = self._find_first_reimbursement_index(squeezed) >= 0
    if wants_application and mentions_reimbursement:
        return True

    has_connector = re.search(r"并且|同时|另外|还有|还要|以及|再", squeezed) is not None
    reimbursement_hits = sum(1 for _ in REIMBURSEMENT_PATTERN.finditer(squeezed))
    return has_connector and reimbursement_hits > 1
@staticmethod
def _find_first_reimbursement_index(message: str) -> int:
candidates = [message.find(item) for item in ("我要报销", "还需要报销", "需要报销", "报销")]
positives = [item for item in candidates if item >= 0]
return min(positives) if positives else -1
@staticmethod
def _looks_like_application(text: str) -> bool:
compact = re.sub(r"\s+", "", text)
return bool(compact) and "申请" in compact and bool(re.search(r"出差|差旅|费用|交通|住宿|采购|会务|会议", compact))
@staticmethod
def _looks_like_future_travel_application(text: str) -> bool:
compact = re.sub(r"\s+", "", text)
if not compact or "报销" in compact:
return False
business_signal = re.search(
r"出差|差旅|客户现场|项目|部署|实施|支撑|支持|协助|拜访|调研|培训|会议|驻场|上线|验收",
compact,
)
route_signal = re.search(
fr"(?:去|到|赴|前往)({'|'.join(CITY_NAMES)})",
compact,
)
time_signal = re.search(
r"明天|后天|下周|下月|近期|月底|\d{1,2}月\d{1,2}(?:日|号)?|"
r"\d{4}[-/年]\d{1,2}[-/月]\d{1,2}(?:日)?|[0-9一二两三四五六七八九十]+天",
compact,
)
planned_route_signal = re.search(
r"(?:去|到|赴|前往).{0,24}(?:出差|差旅|客户|现场|项目|部署|实施|支撑|支持|协助|拜访|调研|培训|会议|驻场|上线|验收)|"
r"(?:出差|差旅).{0,24}(?:[0-9一二两三四五六七八九十]+天|客户|现场|项目|部署|实施|支撑|支持|协助|拜访|调研|培训|会议|驻场|上线|验收)",
compact,
)
return bool((business_signal or route_signal) and (time_signal or planned_route_signal))
def _looks_like_ambiguous_travel_flow(
self,
text: str,
base_date: date,
request: StewardPlanRequest,
) -> bool:
compact = re.sub(r"\s+", "", text)
if not compact or request.attachments:
return False
if re.search(r"申请|报销|草稿|提交|审批|保存|发起|创建", compact):
return False
if not re.search(r"出差|差旅|客户现场|项目|部署|实施|支撑|支持|协助|拜访|调研|培训|会议|驻场|上线|验收", compact):
return False
if not self._extract_time_range(compact, base_date):
return False
if not self._extract_location(compact):
return False
return not self._is_future_or_current_time_range(compact, base_date)
def _is_future_or_current_time_range(self, segment: str, base_date: date) -> bool:
normalized = self._extract_time_range(segment, base_date)
if not normalized:
return False
try:
parsed = date.fromisoformat(normalized)
except ValueError:
return False
return parsed >= base_date
def _build_task(
    self,
    draft: PlannedTaskDraft,
    base_date: date,
    request: StewardPlanRequest,
) -> StewardTask:
    """Turn a planned draft into a ``StewardTask`` awaiting user confirmation.

    Extracts ontology fields from the draft segment, works out which required
    fields are missing, and derives the id, assigned agent, title, summary and
    confidence from the draft's task type and the extracted fields.
    """
    fields = self._extract_ontology_fields(draft.segment, draft.task_type, base_date, request)
    missing_fields = self._resolve_missing_fields(draft.task_type, fields)

    if draft.task_type == "expense_application":
        id_tag, agent, title_prefix = "app", "application_assistant", "费用申请"
    else:
        id_tag, agent, title_prefix = "reim", "reimbursement_assistant", "费用报销"

    return StewardTask(
        task_id=f"task_{id_tag}_{draft.index:03d}",
        task_type=draft.task_type,  # type: ignore[arg-type]
        assigned_agent=agent,  # type: ignore[arg-type]
        title=self._build_task_title(title_prefix, fields, draft.index),
        summary=self._build_task_summary(draft.segment, fields),
        status="needs_confirmation",
        confidence=self._resolve_task_confidence(draft.segment, fields, draft.task_type),
        ontology_fields=fields,
        missing_fields=missing_fields,
        confirmation_required=True,
    )
def _build_fallback_task(
    self,
    message: str,
    base_date: date,
    request: StewardPlanRequest,
) -> StewardTask:
    """Build a single low-confidence task from the whole message.

    The task is a reimbursement when the message mentions "报销" or the request
    carries attachments, otherwise an expense application. Confidence is
    capped at 0.58.
    """
    if "报销" in message or request.attachments:
        fallback_type = "reimbursement"
    else:
        fallback_type = "expense_application"

    whole_message_draft = PlannedTaskDraft(task_type=fallback_type, segment=message, index=1)
    built = self._build_task(whole_message_draft, base_date, request)
    capped_confidence = min(built.confidence, 0.58)
    return built.model_copy(update={"confidence": capped_confidence})
def _extract_ontology_fields(
    self,
    segment: str,
    task_type: str,
    base_date: date,
    request: StewardPlanRequest,
) -> dict[str, str]:
    """Collect canonical business fields for one task segment.

    Values already present in ``request.context_json["review_form_values"]``
    (after normalization) take precedence; values extracted from ``segment``
    only fill fields that are still empty. When the request has attachments,
    their names are stored under ``"attachments"``. Only keys listed in
    ``BUSINESS_CANONICAL_FIELDS`` with a truthy value are returned.
    """
    context_values = normalize_ontology_form_values(request.context_json.get("review_form_values"))
    fields: dict[str, str] = {
        key: value
        for key, value in context_values.items()
        if key in BUSINESS_CANONICAL_FIELDS and str(value or "").strip()
    }

    extracted = (
        ("expense_type", self._infer_expense_type(segment, task_type)),
        ("time_range", self._extract_time_range(segment, base_date)),
        ("location", self._extract_location(segment)),
        ("reason", self._extract_reason(segment, task_type)),
        ("transport_mode", self._extract_transport_mode(segment)),
    )
    for key, value in extracted:
        # Context-provided values win over values parsed from the message.
        if value and not fields.get(key):
            fields[key] = value

    if request.attachments:
        # Separate the names; joining with an empty string fused the file
        # names into one unreadable token.
        fields["attachments"] = "、".join(item.name for item in request.attachments if item.name)

    return {key: value for key, value in fields.items() if key in BUSINESS_CANONICAL_FIELDS and value}
@staticmethod
def _infer_expense_type(segment: str, task_type: str) -> str:
    """Delegate expense-type inference to ``ApplicationFactResolver.infer_expense_type``."""
    resolver = ApplicationFactResolver
    return resolver.infer_expense_type(segment, task_type)
def _extract_time_range(self, segment: str, base_date: date) -> str:
    """Delegate time-range extraction to ``ApplicationFactResolver.extract_time_range``."""
    resolver = ApplicationFactResolver
    return resolver.extract_time_range(segment, base_date)
@staticmethod
def _safe_date(year: int, month: int, day: int) -> str:
    """Delegate date formatting to ``ApplicationFactResolver.safe_date``."""
    resolver = ApplicationFactResolver
    return resolver.safe_date(year, month, day)
@staticmethod
def _extract_location(segment: str) -> str:
    """Delegate location extraction to ``ApplicationFactResolver.extract_location``."""
    resolver = ApplicationFactResolver
    return resolver.extract_location(segment)
@staticmethod
def _extract_reason(segment: str, task_type: str) -> str:
    """Delegate reason extraction to ``ApplicationFactResolver.extract_reason``."""
    resolver = ApplicationFactResolver
    return resolver.extract_reason(segment, task_type)
@staticmethod
def _extract_transport_mode(segment: str) -> str:
    """Delegate transport-mode extraction to ``ApplicationFactResolver.extract_transport_mode``."""
    resolver = ApplicationFactResolver
    return resolver.extract_transport_mode(segment)
@staticmethod
def _resolve_missing_fields(task_type: str, fields: dict[str, str]) -> list[str]:
required = ["expense_type", "time_range", "reason"]
if task_type == "expense_application":
required.append("location")
if fields.get("expense_type") in {"travel", "transport"}:
required.append("transport_mode")
return [key for key in required if not str(fields.get(key) or "").strip()]
@staticmethod
def _resolve_task_confidence(segment: str, fields: dict[str, str], task_type: str) -> float:
compact = re.sub(r"\s+", "", segment)
if task_type == "expense_application":
intent_score = 1.0 if (
"申请" in compact or StewardPlannerExtractionMixin._looks_like_future_travel_application(compact)
) else 0.45
else:
intent_score = 1.0 if "报销" in compact else 0.45
time_score = 1.0 if fields.get("time_range") else 0.0
location_score = 1.0 if fields.get("location") else 0.2
scene_score = 1.0 if fields.get("expense_type") and fields["expense_type"] != "other" else 0.35
confidence = min(1.0, 0.35 * intent_score + 0.25 * time_score + 0.2 * location_score + 0.2 * scene_score)
return round(max(0.45, confidence), 2)
def _build_attachment_groups(
self,
attachments: list[StewardAttachmentInput],
tasks: list[StewardTask],
) -> list[StewardAttachmentGroup]:
if not attachments:
return []
classified = [(item, self._classify_attachment(item)) for item in attachments if item.name]
travel_related = [item.name for item, scene in classified if scene in {"travel", "transport"}]
excluded = [item.name for item, scene in classified if scene not in {"travel", "transport"}]
target_task = self._resolve_attachment_target_task(tasks)
groups: list[StewardAttachmentGroup] = []
if travel_related:
confidence = 0.72 + min(0.18, len(travel_related) * 0.04)
groups.append(
StewardAttachmentGroup(
group_id="ag_travel_001",
target_task_id=target_task.task_id if target_task else None,
scene="travel",
scene_label="差旅相关费用",
attachment_names=travel_related,
excluded_attachment_names=excluded,
confidence=round(confidence, 2),
rationale="附件名称或 OCR 摘要中包含差旅、交通、住宿、火车、机票等线索。",
confirmation_required=True,
)
)
elif excluded:
groups.append(
StewardAttachmentGroup(
group_id="ag_other_001",
target_task_id=None,
scene="other",
scene_label="待人工确认费用",
attachment_names=excluded,
excluded_attachment_names=[],
confidence=0.5,
rationale="当前附件缺少可稳定归属到申请或报销任务的差旅线索。",
confirmation_required=True,
)
)
return groups
@staticmethod
def _resolve_attachment_target_task(tasks: list[StewardTask]) -> StewardTask | None:
reimbursement_tasks = [item for item in tasks if item.task_type == "reimbursement"]
for task in reimbursement_tasks:
if task.ontology_fields.get("expense_type") == "travel":
return task
return reimbursement_tasks[0] if reimbursement_tasks else None
@staticmethod
def _classify_attachment(attachment: StewardAttachmentInput) -> str:
text = " ".join(
[
attachment.name,
attachment.media_type,
attachment.ocr_summary,
" ".join(f"{key}:{value}" for key, value in attachment.ocr_fields.items()),
]
)
compact = re.sub(r"\s+", "", text).lower()
if re.search(r"招待|接待|餐饮|宴请|客户|meal|entertainment", compact):
return "entertainment"
if re.search(r"酒店|住宿|差旅|出差|高铁|火车|动车|机票|航班|train|flight|hotel|travel", compact):
return "travel"
if re.search(r"出租车|的士|网约车|打车|交通|taxi|transport", compact):
return "transport"
return "other"
def _build_confirmation_actions(
self,
tasks: list[StewardTask],
attachment_groups: list[StewardAttachmentGroup],
) -> list[StewardConfirmationAction]:
actions: list[StewardConfirmationAction] = []
for task in tasks:
if task.task_type == "expense_application":
action_type = "confirm_create_application"
label = "确认创建申请单"
else:
action_type = "confirm_create_reimbursement_draft"
label = "确认创建报销草稿"
actions.append(
StewardConfirmationAction(
confirmation_id=f"confirm_{task.task_id}",
action_type=action_type,
label=label,
description=f"确认后把“{task.title}”交给{self._agent_label(task.assigned_agent)}继续核对。",
target_task_id=task.task_id,
payload={
"task_id": task.task_id,
"task_type": task.task_type,
"assigned_agent": task.assigned_agent,
"ontology_fields": task.ontology_fields,
},
)
)
for group in attachment_groups:
actions.append(
StewardConfirmationAction(
confirmation_id=f"confirm_{group.group_id}",
action_type="confirm_attachment_group",
label="确认附件归集",
description=f"确认后将 {len(group.attachment_names)} 份附件按“{group.scene_label}”归集。",
target_task_id=group.target_task_id,
attachment_group_id=group.group_id,
payload={
"attachment_group_id": group.group_id,
"target_task_id": group.target_task_id,
"attachment_names": group.attachment_names,
"excluded_attachment_names": group.excluded_attachment_names,
},
)
)
return actions
@staticmethod
def _agent_label(assigned_agent: str) -> str:
return "申请助手" if assigned_agent == "application_assistant" else "报销助手"
def _build_thinking_events(
    self,
    tasks: list[StewardTask],
    attachment_groups: list[StewardAttachmentGroup],
    attachments: list[StewardAttachmentInput],
) -> list[StewardThinkingEvent]:
    """Build the ordered list of thinking events describing the planning run.

    Always emits the entry, task-split and ontology-mapping events, then the
    business-gap event when any task misses fields, the attachment-correlation
    event when attachments were supplied, and finally the delegation-gate event.
    """
    application_count = sum(1 for item in tasks if item.task_type == "expense_application")
    reimbursement_count = sum(1 for item in tasks if item.task_type == "reimbursement")
    task_intent_summary = self._summarize_task_intents(tasks)
    ontology_summary = self._summarize_ontology_coverage(tasks)
    delegation_summary = self._summarize_delegation_targets(tasks)

    events = [
        StewardThinkingEvent(
            event_id="intent_agent_entry",
            stage="intent_agent",
            title="意图识别智能体接管",
            content=(
                "检测到复合财务话术,当前不是单一助手会话;"
                f"已进入小财管家编排模式,候选任务共 {len(tasks)} 个。"
            ),
        ),
        StewardThinkingEvent(
            event_id="intent_task_split",
            stage="task_split",
            # Both counts carry the unit; the reimbursement count previously
            # ended the title as a bare number.
            title=f"拆分申请 {application_count} 个、报销 {reimbursement_count} 个",
            content=task_intent_summary,
        ),
        StewardThinkingEvent(
            event_id="intent_ontology_mapping",
            stage="ontology_mapping",
            title="核对业务要素",
            content=ontology_summary,
        ),
    ]

    gap_event = self._build_business_gap_thinking_event(tasks)
    if gap_event:
        events.append(gap_event)

    if attachments:
        events.append(
            StewardThinkingEvent(
                event_id="intent_attachment_correlation",
                stage="attachment_correlation",
                title="关联附件与任务线索",
                content=self._summarize_attachment_correlation(attachment_groups, len(attachments)),
            )
        )

    events.append(
        StewardThinkingEvent(
            event_id="intent_delegation_gate",
            stage="delegation_gate",
            title="生成确认点并准备分派",
            content=f"{delegation_summary} 创建单据、生成草稿、绑定附件和提交审批都会等待用户确认。",
        )
    )
    return events
@staticmethod
def _summarize_task_intents(tasks: list[StewardTask]) -> str:
if not tasks:
return "当前输入尚未形成稳定任务,先保留为待确认财务事项。"
parts = []
for task in tasks:
task_label = "申请" if task.task_type == "expense_application" else "报销"
fields = task.ontology_fields
anchors = []
if fields.get("time_range"):
anchors.append(fields["time_range"])
if fields.get("location"):
anchors.append(fields["location"])
if fields.get("expense_type"):
anchors.append(StewardPlannerExtractionMixin._format_business_field_value("expense_type", fields["expense_type"]))
anchor_text = "".join(anchors) if anchors else "待补充关键字段"
parts.append(f"{task_label}{task.title}{anchor_text}")
return "".join(parts)
@staticmethod
def _summarize_ontology_coverage(tasks: list[StewardTask]) -> str:
mapped_labels = []
missing_labels = []
for task in tasks:
mapped_labels.extend(StewardPlannerExtractionMixin._business_field_label(key) for key in task.ontology_fields.keys())
missing_labels.extend(StewardPlannerExtractionMixin._business_field_label(key) for key in task.missing_fields)
mapped = "".join(dict.fromkeys(label for label in mapped_labels if label)) or "暂无稳定业务要素"
missing = ";还缺少:" + "".join(dict.fromkeys(label for label in missing_labels if label)) if missing_labels else ""
return f"已把用户输入归一为业务要素:{mapped}{missing}。后续执行仍会先让用户确认。"
@staticmethod
def _build_business_gap_thinking_event(tasks: list[StewardTask]) -> StewardThinkingEvent | None:
gap_lines = []
for task in tasks:
if not task.missing_fields:
continue
missing_labels = [
StewardPlannerExtractionMixin._business_field_label(key)
for key in task.missing_fields
if key
]
if not missing_labels:
continue
if task.task_type == "expense_application" and "transport_mode" in task.missing_fields:
gap_lines.append(
(
f"{task.title}已识别到{StewardPlannerExtractionMixin._summarize_known_business_points(task)}"
"但用户没有说明出行方式;出行方式会影响交通费用测算,进入申请单核对后需要先追问火车、飞机或轮船。"
)
)
else:
gap_lines.append(
(
f"{task.title}还缺少{''.join(dict.fromkeys(missing_labels))}"
"需要在对应步骤里继续向用户确认,不能直接执行入库或提交。"
)
)
if not gap_lines:
return None
return StewardThinkingEvent(
event_id="intent_business_gap_check",
stage="business_gap_check",
title="判断待补充信息",
content="".join(gap_lines),
)
@staticmethod
def _summarize_known_business_points(task: StewardTask) -> str:
parts = []
for key in ("time_range", "location", "reason", "expense_type"):
value = str(task.ontology_fields.get(key) or "").strip()
if value:
parts.append(
f"{StewardPlannerExtractionMixin._business_field_label(key)}"
f"{StewardPlannerExtractionMixin._format_business_field_value(key, value)}"
)
return "".join(parts) or "部分业务要素"
@staticmethod
def _business_field_label(key: str) -> str:
    """Look up the display label for a field key in ``BUSINESS_FIELD_LABELS``.

    The key is stringified and stripped first; unknown keys are returned as-is
    (in their stripped form).
    """
    normalized_key = str(key or "").strip()
    return BUSINESS_FIELD_LABELS.get(normalized_key, normalized_key)
@staticmethod
def _format_business_field_value(key: str, value: str) -> str:
cleaned = str(value or "").strip()
if key == "expense_type":
return EXPENSE_TYPE_LABELS.get(cleaned, cleaned)
if key == "transport_mode":
return TRANSPORT_MODE_LABELS.get(cleaned, cleaned)
return cleaned
@staticmethod
def _summarize_attachment_correlation(
attachment_groups: list[StewardAttachmentGroup],
total_attachment_count: int,
) -> str:
grouped_names = []
excluded_names = []
for group in attachment_groups:
grouped_names.extend(group.attachment_names)
excluded_names.extend(group.excluded_attachment_names)
grouped_text = "".join(grouped_names) if grouped_names else "暂无可稳定归集附件"
excluded_text = ";排除或单独确认:" + "".join(excluded_names) if excluded_names else ""
return f"已核对 {total_attachment_count} 份附件,建议归集:{grouped_text}{excluded_text}"
@staticmethod
def _summarize_delegation_targets(tasks: list[StewardTask]) -> str:
application_count = sum(1 for item in tasks if item.assigned_agent == "application_assistant")
reimbursement_count = sum(1 for item in tasks if item.assigned_agent == "reimbursement_assistant")
parts = []
if application_count:
parts.append(f"{application_count} 个申请任务交给申请助手")
if reimbursement_count:
parts.append(f"{reimbursement_count} 个报销任务交给报销助手")
return "".join(parts) + "" if parts else "尚无可分派任务。"
@staticmethod
def _build_summary(tasks: list[StewardTask], attachment_groups: list[StewardAttachmentGroup]) -> str:
parts = [f"我识别到 {len(tasks)} 个待处理任务"]
if attachment_groups:
grouped = sum(len(item.attachment_names) for item in attachment_groups)
parts.append(f"并形成 {grouped} 份附件的归集建议")
parts.append(",请确认后我再分派给对应助手执行。")
return "".join(parts)
@staticmethod
def _build_task_title(prefix: str, fields: dict[str, str], index: int) -> str:
location = fields.get("location", "")
time_range = fields.get("time_range", "")
expense_type = fields.get("expense_type", "")
subject = location or {"travel": "差旅", "transport": "交通", "entertainment": "招待"}.get(expense_type, "")
if subject and time_range:
return f"{prefix} {time_range} {subject}"
if subject:
return f"{prefix} {subject}"
return f"{prefix} {index}"
@staticmethod
def _build_task_summary(segment: str, fields: dict[str, str]) -> str:
field_parts = []
for key, label in (
("time_range", "时间"),
("location", "地点"),
("expense_type", "费用类型"),
("reason", "事由"),
("transport_mode", "交通方式"),
):
value = fields.get(key)
if value:
field_parts.append(f"{label}{value}")
return "".join(field_parts) or segment
@staticmethod
def _resolve_base_date(client_now_iso: str | None, context_json: dict[str, Any]) -> date:
raw_value = client_now_iso or str(context_json.get("client_now_iso") or "").strip()
if raw_value:
try:
parsed = datetime.fromisoformat(raw_value.replace("Z", "+00:00"))
return parsed.date()
except ValueError:
pass
return datetime.now(UTC).date()
@staticmethod
def _clean_text(value: Any) -> str:
return re.sub(r"\s+", " ", str(value or "")).strip()