feat: 完善差旅票据行程提取与费用明细回填逻辑

增强文档智能识别的票据场景关键词和字段提取能力,优化
会话关联草稿报销单的解析路径,修复费用明细合并和票据
去重边界问题,前端改进报销创建和审批详情交互,补充单
元测试覆盖。
This commit is contained in:
caoxiaozhu
2026-05-21 14:24:51 +08:00
parent b183b0bd5e
commit f28d7e6d16
24 changed files with 1565 additions and 433 deletions

View File

@@ -180,7 +180,9 @@ SLOT_LABELS = {
"attachments": "票据附件",
}
DATE_TEXT_PATTERN = re.compile(r"(\d{4}[年/-]\d{1,2}[月/-]\d{1,2}日?)")
DATE_TEXT_PATTERN = re.compile(
r"(\d{4}[年/-]\d{1,2}[月/-]\d{1,2}日?(?:\s*[T ]?\s*(?:[01]?\d|2[0-3])[:][0-5]\d)?)"
)
AMOUNT_TEXT_PATTERN = re.compile(
r"(\d+(?:\.\d+)?)\s*(?:万元|万员|万圆|万园|万块|万元整|元整|块钱|块|元|员|圆|园|万)"
)
@@ -238,10 +240,14 @@ LEADING_REASON_TIME_PATTERNS = (
re.compile(
r"^\s*(?:识别事项(?:有)?[:]\s*)?"
r"(?:业务发生(?:时间|日期)|费用发生(?:时间|日期)|发生(?:时间|日期)|报销(?:时间|日期)|时间)[:]?\s*"
r"(?:19|20)\d{2}[-/年.]\d{1,2}[-/月.]\d{1,2}日?\s*[,。;;、]?\s*"
r"(?:19|20)\d{2}[-/年.]\d{1,2}[-/月.]\d{1,2}日?"
r"(?:\s*(?:至|到|~||—|-)\s*(?:19|20)\d{2}[-/年.]\d{1,2}[-/月.]\d{1,2}日?)?"
r"\s*[,。;;、]?\s*"
),
re.compile(
r"^\s*(?:19|20)\d{2}[-/年.]\d{1,2}[-/月.]\d{1,2}日?\s*[,。;;、]\s*"
r"^\s*(?:19|20)\d{2}[-/年.]\d{1,2}[-/月.]\d{1,2}日?"
r"(?:\s*(?:至|到|~||—|-)\s*(?:19|20)\d{2}[-/年.]\d{1,2}[-/月.]\d{1,2}日?)?"
r"\s*[,。;;、]\s*"
),
)
AMOUNT_UNIT_ALIASES = {
@@ -1936,6 +1942,7 @@ class UserAgentService:
can_proceed=can_proceed,
claim_groups=claim_groups,
draft_payload=draft_payload,
missing_slot_keys=missing_slot_keys,
)
edit_fields = self._build_review_edit_fields(
payload,
@@ -3200,7 +3207,9 @@ class UserAgentService:
can_proceed: bool,
claim_groups: list[UserAgentReviewClaimGroup],
draft_payload: UserAgentDraftPayload | None,
missing_slot_keys: set[str] | None = None,
) -> list[UserAgentReviewAction]:
missing_slot_keys = set(missing_slot_keys or set())
if self._is_review_association_choice_pending(payload):
claim_no = str(payload.tool_payload.get("association_candidate_claim_no") or "").strip()
link_label = f"关联到草稿 {claim_no}" if claim_no else "关联到现有草稿"
@@ -3212,9 +3221,13 @@ class UserAgentService:
emphasis="secondary",
),
UserAgentReviewAction(
label="修改识别信息",
label="选择报销类型" if "expense_type" in missing_slot_keys else "修改识别信息",
action_type="edit_review",
description="打开结构化模板,按已识别字段逐项修改。",
description=(
"先选择本次报销类型,后续票据会作为当前单据的补充继续核对。"
if "expense_type" in missing_slot_keys
else "打开结构化模板,按已识别字段逐项修改。"
),
emphasis="secondary",
),
UserAgentReviewAction(
@@ -3235,6 +3248,23 @@ class UserAgentService:
),
]
review_action = str(payload.context_json.get("review_action") or "").strip()
if "expense_type" in missing_slot_keys and not review_action:
return [
UserAgentReviewAction(
label="取消",
action_type="cancel_review",
description="放弃当前识别结果,并退出本次核对流程。",
emphasis="secondary",
),
UserAgentReviewAction(
label="选择报销类型",
action_type="edit_review",
description="先选择本次报销类型,后续票据会作为当前单据的补充继续核对。",
emphasis="primary",
),
]
primary_action = UserAgentReviewAction(
label="继续下一步" if can_proceed else "保存为草稿",
action_type="next_step" if can_proceed else "save_draft",
@@ -3258,9 +3288,13 @@ class UserAgentService:
emphasis="secondary",
),
UserAgentReviewAction(
label="修改识别信息",
label="选择报销类型" if "expense_type" in missing_slot_keys else "修改识别信息",
action_type="edit_review",
description="打开结构化模板,按已识别字段逐项修改。",
description=(
"先选择本次报销类型,后续票据会作为当前单据的补充继续核对。"
if "expense_type" in missing_slot_keys
else "打开结构化模板,按已识别字段逐项修改。"
),
emphasis="secondary",
),
]
@@ -3429,6 +3463,15 @@ class UserAgentService:
)
missing_labels = list(dict.fromkeys(missing_labels))
expense_type_slot = next((item for item in slot_cards if item.key == "expense_type"), None)
if expense_type_slot is not None and not str(expense_type_slot.value or "").strip():
return (
f"{self._build_review_intent_summary(payload, slot_cards=slot_cards, claim_groups=[])} "
"我已经先保留了当前识别出的时间、地点和事由,但还不能确定这张单据应该走哪类报销流程。"
"请先点击“选择报销类型”,在差旅费、交通费、住宿费等选项中选定;"
"选定后,后续上传的票据都会作为这张单据的补充继续核对,不会重新改判报销类型。"
)
review_payload = UserAgentReviewPayload(
intent_summary="",
body_message="",
@@ -4168,7 +4211,10 @@ class UserAgentService:
if labeled_match:
return labeled_match.group("value").strip()
city_match = re.search(r"去(?P<city>[\u4e00-\u9fa5]{2,8})(?:出差|拜访|参会|见客户|客户现场)", payload.message)
city_match = re.search(
r"去(?P<city>[\u4e00-\u9fa5]{2,8}?)(?:出差|拜访|参会|见客户|客户现场|支撑|支持|部署|实施|处理|协助)",
payload.message,
)
if city_match:
return city_match.group("city").strip()
if "客户现场" in payload.message.replace(" ", ""):
@@ -4210,9 +4256,9 @@ class UserAgentService:
def _build_time_slot(self, payload: UserAgentRequest) -> dict[str, str | float]:
review_form_values = self._resolve_review_form_values(payload)
edited_value = str(
review_form_values.get("occurred_date")
or review_form_values.get("time_range")
review_form_values.get("time_range")
or review_form_values.get("business_time")
or review_form_values.get("occurred_date")
or ""
).strip()
if edited_value:
@@ -4808,6 +4854,7 @@ class UserAgentService:
def _extract_document_fields(self, item: dict[str, object]) -> dict[str, str]:
raw_fields = item.get("document_fields")
normalized_fields: dict[str, str] = {}
document_type = str(item.get("document_type") or "").strip().lower()
if isinstance(raw_fields, list):
for field in raw_fields:
if not isinstance(field, dict):
@@ -4819,6 +4866,12 @@ class UserAgentService:
continue
normalized_label = self._normalize_document_field_label(key=key, label=label)
display_label = normalized_label or label
display_label = self._resolve_document_time_display_label(
document_type=document_type,
key=key,
label=label,
normalized_label=display_label,
)
normalized_value = self._normalize_document_field_value(
label=display_label,
value=value,
@@ -4834,13 +4887,49 @@ class UserAgentService:
normalized_fields["金额"] = amount_value
date_match = DATE_TEXT_PATTERN.search(text)
if date_match and "时间" not in normalized_fields:
normalized_fields["时间"] = date_match.group(1)
time_label = self._resolve_document_time_display_label(
document_type=document_type,
key="date",
label="日期",
normalized_label="时间",
)
normalized_fields[time_label] = date_match.group(1)
merchant = self._extract_document_merchant_name_from_text(text) if self._is_hotel_document_item(item) else ""
if merchant and "商户/酒店" not in normalized_fields:
normalized_fields["商户/酒店"] = merchant
return normalized_fields
@staticmethod
def _resolve_document_time_display_label(
    *,
    document_type: str,
    key: str,
    label: str,
    normalized_label: str,
) -> str:
    """Pick the display label for a document's time field.

    The generic label "时间" is replaced by a document-type-specific label
    (for example "列车出发时间" for ``train_ticket``) when the field's key
    or its raw label marks it as a date/time field. In every other case
    ``normalized_label`` is returned unchanged.

    Args:
        document_type: Document type identifier; compared after stripping
            surrounding whitespace and lower-casing.
        key: Raw field key; compared lower-cased with underscores removed.
        label: Raw field label; checked for date/time tokens after spaces
            are removed.
        normalized_label: Label already chosen for the field.

    Returns:
        The type-specific time label, or ``normalized_label`` as given.
    """
    # Only the generic time label is ever specialised.
    if normalized_label != "时间":
        return normalized_label

    type_token = str(document_type or "").strip().lower()
    specific_label = {
        "train_ticket": "列车出发时间",
        "flight_itinerary": "起飞日期",
        "taxi_receipt": "乘车时间",
        "transport_receipt": "乘车时间",
        "parking_toll_receipt": "通行日期",
    }.get(type_token)

    key_token = str(key or "").strip().lower().replace("_", "")
    label_text = str(label or "").replace(" ", "")

    key_is_temporal = key_token in {"date", "time", "issuedat", "issuedate", "invoicedate"}
    # The two longer tokens already contain "日期" / "时间"; the list is
    # kept as in the original.
    label_is_temporal = any(
        token in label_text for token in ("日期", "时间", "开票日期", "发生时间")
    )

    if specific_label is not None and (key_is_temporal or label_is_temporal):
        return specific_label
    return normalized_label
@staticmethod
def _normalize_document_field_label(*, key: str, label: str) -> str:
compact_key = str(key or "").strip().lower().replace("_", "")
@@ -4873,7 +4962,7 @@ class UserAgentService:
return ""
if normalized_label == "金额":
return self._extract_amount_text_from_value(raw_value) or raw_value
if normalized_label == "时间":
if normalized_label in {"时间", "出发日期", "列车出发时间", "起飞日期", "乘车时间", "通行日期"}:
match = DATE_TEXT_PATTERN.search(raw_value)
return match.group(1) if match else raw_value
return raw_value