feat: 完善差旅票据行程提取与费用明细回填逻辑

增强文档智能识别的票据场景关键词和字段提取能力，优化会话关联草稿报销单的解析路径，修复费用明细合并和票据去重边界问题，前端改进报销创建和审批详情交互，补充单元测试覆盖。
2026-05-21 14:24:51 +08:00
parent b183b0bd5e
commit f28d7e6d16
24 changed files with 1565 additions and 433 deletions
--- a/server/src/app/services/document_intelligence.py
+++ b/server/src/app/services/document_intelligence.py
@@ -184,6 +184,7 @@ AMOUNT_PATTERNS = (
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
 )
 DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
+# Matches a clock time "H:MM" / "HH:MM" (hour 0-23 in group 1, minute 00-59 in
+# group 2) written with an ASCII or full-width colon; the lookarounds reject
+# matches that are glued to neighbouring digits.
+TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
 INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[：:\s]*([A-Za-z0-9-]{6,24})")
 INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[：:\s]*([A-Za-z0-9-]{6,24})")
 TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[：:\s]*([A-Za-z0-9]{2,12})")
@@ -192,6 +193,58 @@ MERCHANT_PATTERNS = (
    re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[：:\s]*([A-Za-z0-9\u4e00-\u9fa5（）()·&\\-]{2,40})"),
    re.compile(r"([A-Za-z0-9\u4e00-\u9fa5（）()·&\\-]{2,40}(?:酒店|宾馆|饭店|酒楼|餐厅|航空|铁路|滴滴出行|停车场|服务区))"),
 )
+# Field keys (compared after strip + lower-casing) that
+# `_apply_document_type_field_labels` treats as date/time fields when deciding
+# whether a field should receive the document-type specific date label.
+# The invoice keys ("issued_at", "issue_date", "invoice_date") are listed here
+# too, but that function returns those fields unchanged before consulting
+# this set.
+DATE_FIELD_KEYS = {
+    "date",
+    "time",
+    "issued_at",
+    "invoice_date",
+    "issue_date",
+    "travel_date",
+    "trip_date",
+    "journey_date",
+    "departure_date",
+    "departure_time",
+    "depart_date",
+    "depart_time",
+    "boarding_date",
+    "boarding_time",
+    "train_date",
+    "train_time",
+    "train_departure_time",
+    "scheduled_departure_time",
+    "flight_date",
+    "flight_time",
+    "ride_date",
+    "ride_time",
+    "pickup_time",
+    "start_time",
+}
+# Maps a document type to the display label given to its date field by
+# `_apply_document_type_field_labels`; document types that are not listed keep
+# their field labels unchanged. `_extract_date` also uses membership in this
+# mapping to switch from "first date in the text" to scored trip-date selection.
+TRIP_DATE_LABEL_BY_DOCUMENT_TYPE = {
+    "train_ticket": "列车出发时间",
+    "flight_itinerary": "起飞日期",
+    "taxi_receipt": "乘车时间",
+    "transport_receipt": "乘车时间",
+    "parking_toll_receipt": "通行日期",
+}
+# Substrings searched for in a field label (spaces removed) by
+# `_apply_document_type_field_labels` to recognise date/time fields.
+# NOTE(review): the tokens are matched with `in`, so the generic "日期" and
+# "时间" entries already cover every longer token below; any label containing
+# either of them is treated as a date field. Confirm this breadth is intended.
+TRIP_DATE_FIELD_LABEL_TOKENS = (
+    "日期",
+    "时间",
+    "开票日期",
+    "发生时间",
+    "行程日期",
+    "出发日期",
+    "出发时间",
+    "列车出发时间",
+    "发车日期",
+    "发车时间",
+    "开车时间",
+    "乘车日期",
+    "乘车时间",
+    "起飞日期",
+    "航班日期",
+    "上车时间",
+    "用车时间",
+)


 class DocumentIntelligenceService:
@@ -212,7 +265,10 @@ class DocumentIntelligenceService:
        compact = re.sub(r"\s+", "", raw_text).lower()
        rule_match = _match_document_rule(compact)
        base_rule = rule_match.rule or DEFAULT_RULE
-        fields = tuple(_extract_document_fields(raw_text))
+        fields = _apply_document_type_field_labels(
+            tuple(_extract_document_fields(raw_text, base_rule.document_type)),
+            base_rule.document_type,
+        )
        rule_insight = DocumentInsight(
            document_type=base_rule.document_type,
            document_type_label=base_rule.document_type_label,
@@ -275,7 +331,10 @@ class DocumentIntelligenceService:
            for item in parsed.evidence
            if str(item or "").strip()
        ][:4]
-        normalized_fields = _normalize_llm_document_fields(parsed.fields)
+        normalized_fields = _apply_document_type_field_labels(
+            tuple(_normalize_llm_document_fields(parsed.fields)),
+            normalized_type,
+        )

        return LlmDocumentClassification(
            document_type=normalized_type,
@@ -312,7 +371,10 @@ class DocumentIntelligenceService:
                scene_code=rule_insight.scene_code,
                scene_label=rule_insight.scene_label,
                expense_type=rule_insight.expense_type,
-                fields=merged_fields,
+                fields=_apply_document_type_field_labels(
+                    merged_fields,
+                    rule_insight.document_type,
+                ),
                classification_source=rule_insight.classification_source,
                classification_confidence=rule_insight.classification_confidence,
                evidence=rule_insight.evidence,
@@ -337,7 +399,10 @@ class DocumentIntelligenceService:
                scene_code=rule_insight.scene_code,
                scene_label=rule_insight.scene_label,
                expense_type=rule_insight.expense_type,
-                fields=merged_fields,
+                fields=_apply_document_type_field_labels(
+                    merged_fields,
+                    rule_insight.document_type,
+                ),
                classification_source=rule_insight.classification_source,
                classification_confidence=rule_insight.classification_confidence,
                evidence=rule_insight.evidence,
@@ -354,7 +419,7 @@ class DocumentIntelligenceService:
            scene_code=rule.scene_code if parsed.scene_code == "other" else parsed.scene_code,
            scene_label=rule.scene_label if parsed.scene_label == "其他票据" else parsed.scene_label,
            expense_type=rule.expense_type if parsed.expense_type == "other" else parsed.expense_type,
-            fields=merged_fields,
+            fields=_apply_document_type_field_labels(merged_fields, rule.document_type),
            classification_source=source,
            classification_confidence=max(parsed.confidence, rule_insight.classification_confidence),
            evidence=tuple(parsed.evidence or rule_insight.evidence),
@@ -464,8 +529,49 @@ def _normalize_llm_document_field_key(key: str, label: str) -> str:
        token in compact_label for token in ("金额", "价税合计", "合计", "总额", "总计", "票价", "支付金额", "实付金额", "实收金额")
    ):
        return "amount"
-    if compact_key in {"date", "time", "issued_at", "invoice_date"} or any(
-        token in compact_label for token in ("日期", "时间", "开票日期", "发生时间")
+    if compact_key in {
+        "travel_date",
+        "trip_date",
+        "journey_date",
+        "departure_date",
+        "departure_time",
+        "depart_date",
+        "depart_time",
+        "boarding_date",
+        "boarding_time",
+        "train_date",
+        "train_time",
+        "train_departure_time",
+        "scheduled_departure_time",
+        "flight_date",
+        "flight_time",
+        "ride_date",
+        "ride_time",
+        "pickup_time",
+        "start_time",
+    } or any(
+        token in compact_label
+        for token in (
+            "行程日期",
+            "出发日期",
+            "出发时间",
+            "列车出发时间",
+            "发车日期",
+            "发车时间",
+            "开车时间",
+            "乘车日期",
+            "乘车时间",
+            "起飞日期",
+            "航班日期",
+            "上车时间",
+            "用车时间",
+        )
+    ):
+        return "trip_date"
+    if compact_key in {"issued_at", "issue_date", "invoice_date"} or "开票日期" in compact_label:
+        return "invoice_date"
+    if compact_key in {"date", "time"} or any(
+        token in compact_label for token in ("日期", "时间", "发生时间")
    ):
        return "date"
    if compact_key in {"merchant_name", "merchant", "seller_name", "vendor_name"} or any(
@@ -504,7 +610,7 @@ def _normalize_llm_document_field_value(key: str, value: str) -> str:
            return ""
        text_value = format(candidate.quantize(Decimal("0.01")), "f").rstrip("0").rstrip(".")
        return f"{text_value}元"
-    if key == "date":
+    if key in {"date", "time", "invoice_date", "trip_date"}:
        return _extract_date(raw_value) or _clean_field_value(raw_value)
    if key == "route":
        return _extract_route(raw_value) or _clean_field_value(
@@ -517,6 +623,8 @@ def _llm_document_field_label(key: str) -> str:
    return {
        "amount": "金额",
        "date": "日期",
+        "invoice_date": "开票日期",
+        "trip_date": "行程日期",
        "merchant_name": "商户",
        "invoice_number": "票据号码",
        "invoice_code": "发票代码",
@@ -525,6 +633,35 @@ def _llm_document_field_label(key: str) -> str:
    }.get(key, key)


+def _apply_document_type_field_labels(
+    fields: tuple[DocumentField, ...],
+    document_type: str,
+) -> tuple[DocumentField, ...]:
+    """Give date fields the label that belongs to ``document_type``.
+
+    When the (stripped, lower-cased) document type has no entry in
+    ``TRIP_DATE_LABEL_BY_DOCUMENT_TYPE`` the input is returned untouched.
+    Otherwise every field recognised as a date field, by key or by label
+    token, is rebuilt with the mapped label; its key and value are kept.
+    Invoice-date fields are never relabelled.
+    """
+    type_key = str(document_type or "").strip().lower()
+    trip_label = TRIP_DATE_LABEL_BY_DOCUMENT_TYPE.get(type_key)
+    if not trip_label:
+        return fields
+
+    def _relabel(item: DocumentField) -> DocumentField:
+        key = str(item.key or "").strip().lower()
+        label = str(item.label or "").replace(" ", "")
+        # Invoice dates keep their own label even on trip documents.
+        if key in {"issued_at", "issue_date", "invoice_date"}:
+            return item
+        if "开票日期" in label or "发票日期" in label:
+            return item
+        looks_like_date = key in DATE_FIELD_KEYS or any(
+            token in label for token in TRIP_DATE_FIELD_LABEL_TOKENS
+        )
+        if not looks_like_date:
+            return item
+        return DocumentField(key=item.key, label=trip_label, value=item.value)
+
+    return tuple(_relabel(item) for item in fields)
+
+
 def _merge_document_fields(
    base_fields: tuple[DocumentField, ...],
    override_fields: tuple[DocumentField, ...],
@@ -540,13 +677,13 @@ def _merge_document_fields(
    return tuple(merged[key] for key in order if key in merged)


-def _extract_document_fields(text: str) -> list[DocumentField]:
+def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
    fields: list[DocumentField] = []
    amount = _extract_amount(text)
    if amount:
        fields.append(DocumentField(key="amount", label="金额", value=amount))

-    date_value = _extract_date(text)
+    date_value = _extract_date(text, document_type=document_type)
    if date_value:
        fields.append(DocumentField(key="date", label="日期", value=date_value))

@@ -594,10 +731,33 @@ def _extract_amount(text: str) -> str:
    return f"{text_value}元"


-def _extract_date(text: str) -> str:
-    match = DATE_PATTERN.search(text)
-    if not match:
+def _extract_date(text: str, *, document_type: str = "") -> str:
+    """Return the most relevant date found in ``text``, or "" if there is none.
+
+    For document types listed in ``TRIP_DATE_LABEL_BY_DOCUMENT_TYPE`` every
+    date outside an invoice-date context is scored with
+    ``_score_trip_date_context`` and the best-scoring one wins (the earliest
+    match on ties); "" is returned when only invoice dates are present.
+    For every other document type the first date in the text is used.
+    Values are formatted by ``_format_date_match_with_time``.
+    """
+    matches = list(DATE_PATTERN.finditer(text))
+    if not matches:
        return ""
+
+    normalized_type = str(document_type or "").strip().lower()
+    if normalized_type not in TRIP_DATE_LABEL_BY_DOCUMENT_TYPE:
+        return _format_date_match_with_time(text, matches[0])
+
+    best_value = ""
+    best_score = None
+    for match in matches:
+        # Invoice dates are never eligible as the trip date, so skip them
+        # before doing any formatting or scoring work.
+        if _is_invoice_date_context(text, match):
+            continue
+        value = _format_date_match_with_time(text, match)
+        if not value:
+            continue
+        score = _score_trip_date_context(text, match, value, False)
+        # Strict ">" keeps the earliest match when scores tie.
+        if best_score is None or score > best_score:
+            best_score, best_value = score, value
+    return best_value
+
+
+def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
    raw_value = str(match.group(1) or "").strip()
    normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "")
    normalized = normalized.replace("/", "-").replace(".", "-")
@@ -605,7 +765,60 @@ def _extract_date(text: str) -> str:
    if len(parts) != 3:
        return raw_value
    year, month, day = parts
-    return f"{year.zfill(4)}-{month.zfill(2)}-{day.zfill(2)}"
+    date_value = f"{year.zfill(4)}-{month.zfill(2)}-{day.zfill(2)}"
+    surrounding = str(text or "")[max(0, match.start() - 18): match.end() + 24]
+    time_match = TIME_PATTERN.search(surrounding)
+    if time_match:
+        hour = str(time_match.group(1) or "").zfill(2)
+        minute = str(time_match.group(2) or "").zfill(2)
+        return f"{date_value} {hour}:{minute}"
+    return date_value
+
+
+def _is_invoice_date_context(text: str, match: re.Match[str]) -> bool:
+    """Tell whether a matched date sits next to an invoice-issuing marker.
+
+    The check looks at the text from 12 characters before the match to
+    8 characters after it, with spaces removed.
+    """
+    invoice_markers = ("开票日期", "发票日期", "开票时间", "开票")
+    begin = max(0, match.start() - 12)
+    finish = match.end() + 8
+    nearby = str(text or "")[begin:finish].replace(" ", "")
+    for marker in invoice_markers:
+        if marker in nearby:
+            return True
+    return False
+
+
+def _score_trip_date_context(
+    text: str,
+    match: re.Match[str],
+    value: str,
+    invoice_context: bool,
+) -> int:
+    """Score how strongly the text around a matched date suggests a trip date.
+
+    Args:
+        text: Full text the date was found in.
+        match: Regex match of the date inside ``text``.
+        value: Formatted date value; a contained colon means a clock time
+            was attached to it.
+        invoice_context: Whether the date sits in an invoice-date context.
+
+    Returns:
+        The sum of: -20 for an invoice context, +8 for an attached time,
+        +6 for a trip-date label, +3 for train/seat wording, +2 for a
+        trip-number-like token and +2 for a route ("A至B", "A->B", ...),
+        all looked up within 32 characters on either side of the date.
+    """
+    source = str(text or "")
+    date_start, date_end = match.start(), match.end()
+    before = source[max(0, date_start - 32): date_start]
+    after = source[date_end: date_end + 32]
+    compact = f"{before}{source[date_start:date_end]}{after}".replace(" ", "")
+    # The candidate's own "YYYY-MM-DD" text satisfies the route pattern
+    # (digits, "-", digits), which used to grant the route bonus to every
+    # hyphenated date. Search for routes in the surrounding text only; the
+    # newline keeps the two sides from fusing into one endpoint.
+    route_context = f"{before}\n{after}".replace(" ", "")
+
+    score = -20 if invoice_context else 0
+    if ":" in value or "：" in value:
+        score += 8
+    if any(
+        token in compact
+        for token in (
+            "行程日期",
+            "出发日期",
+            "出发时间",
+            "列车出发时间",
+            "发车日期",
+            "发车时间",
+            "开车时间",
+            "乘车日期",
+            "乘车时间",
+            "起飞日期",
+            "起飞时间",
+            "航班日期",
+            "上车时间",
+            "用车时间",
+        )
+    ):
+        score += 6
+    if any(token in compact for token in ("车次", "检票", "二等座", "一等座", "商务座", "软卧", "硬卧")):
+        score += 3
+    if re.search(r"[A-Z]\d{1,4}", compact):
+        score += 2
+    if re.search(r"[\u4e00-\u9fa5A-Za-z0-9（）()·]{2,20}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5A-Za-z0-9（）()·]{2,20}", route_context):
+        score += 2
+    return score


 def _extract_merchant(text: str) -> str: