fix(reimbursement): harden assistant draft and claim cleanup

2026-05-21 23:52:34 +08:00
parent e701fa01da
commit 2908dda024
9 changed files with 1060 additions and 398 deletions
--- a/server/src/app/services/expense_amounts.py
+++ b/server/src/app/services/expense_amounts.py
@@ -0,0 +1,206 @@
+from __future__ import annotations
+
+import re
+from decimal import Decimal, InvalidOperation
+from typing import Any
+
+# Compiled patterns that parse_document_amount_value tries in this order:
+# a labelled amount, then a currency-sign prefix, then a "元" suffix.
+# Group 1 captures the number; either "," or "." may introduce one or two decimals.
+# NOTE(review): because "," is accepted as a decimal separator, a value such as
+# "¥1,234.56" is captured as "1,23" — confirm grouped thousands never reach here.
+DOCUMENT_AMOUNT_PATTERNS = (
+    re.compile(
+        r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
+        r"[：:\s￥¥人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
+    ),
+    re.compile(r"[￥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
+    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
+)
+
+# Field keys treated as amount fields by resolve_document_field_amount.
+# Keys are compared after stripping, lower-casing and removing underscores.
+DOCUMENT_AMOUNT_FIELD_KEYS = {
+    "amount",
+    "totalamount",
+    "paymentamount",
+    "paidamount",
+    "actualamount",
+}
+# Substrings that mark a field as an amount field when they occur anywhere in
+# its label (the label is compared with spaces removed).
+DOCUMENT_AMOUNT_LABEL_TOKENS = (
+    "金额",
+    "价税合计",
+    "合计",
+    "总额",
+    "总计",
+    "票价",
+    "支付金额",
+    "实付金额",
+    "实收金额",
+)
+# Uncompiled pattern strings scanned with re.finditer by extract_amount_candidates.
+# Same three shapes as above (label, currency sign, "元" suffix), but the
+# integer part is limited to six digits.
+DOCUMENT_TEXT_AMOUNT_PATTERNS = (
+    r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)[：:\s￥¥人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
+    r"[￥¥]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
+    r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
+)
+
+
+def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
+    """Resolve one amount for *document* from its fields, falling back to its text.
+
+    The ``summary`` and ``text`` entries are stripped and joined with a space to
+    form the searchable text. The structured field amount wins unless it looks
+    like the year of a date found in that text, in which case the amount
+    extracted from the text is returned instead.
+
+    Returns:
+        The chosen amount, or ``None`` when neither source yields one.
+    """
+    pieces = [str(document.get(name) or "").strip() for name in ("summary", "text")]
+    combined = " ".join(pieces).strip()
+    from_fields = resolve_document_field_amount(document)
+    from_text = resolve_document_text_amount(combined)
+
+    if from_fields is None:
+        return from_text
+    # A field value that reads like a date's year in the text is not trusted.
+    if is_date_like_amount_candidate(from_fields, combined):
+        return from_text
+    return from_fields
+
+
+def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
+    """Return the first positive amount parsed from an amount-like document field.
+
+    Entries of ``document["document_fields"]`` that are not dicts are skipped.
+    A field counts as an amount field when its normalized key is listed in
+    ``DOCUMENT_AMOUNT_FIELD_KEYS`` or its label (spaces removed) contains one of
+    ``DOCUMENT_AMOUNT_LABEL_TOKENS``.
+
+    Returns:
+        The first parsed amount in field order, or ``None`` if no field yields one.
+    """
+    for entry in document.get("document_fields") or []:
+        if not isinstance(entry, dict):
+            continue
+        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
+        compact_label = str(entry.get("label") or "").replace(" ", "")
+        key_matches = normalized_key in DOCUMENT_AMOUNT_FIELD_KEYS
+        if not key_matches and not any(
+            token in compact_label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
+        ):
+            continue
+
+        value_text = str(entry.get("value") or "")
+        # Try the labelled/currency patterns first, then a bare number.
+        parsed = parse_document_amount_value(value_text)
+        if not parsed:
+            parsed = parse_plain_document_amount_value(value_text)
+        if parsed is not None:
+            return parsed
+
+    return None
+
+
+def resolve_document_text_amount(text: str) -> Decimal | None:
+    """Return the largest amount candidate in *text* that is not date-like.
+
+    Candidates come from ``extract_amount_candidates``; any candidate that
+    ``is_date_like_amount_candidate`` flags against the same text is ignored.
+
+    Returns:
+        The maximum remaining candidate, or ``None`` when none remain.
+    """
+    usable = (
+        value
+        for value in extract_amount_candidates(text)
+        if not is_date_like_amount_candidate(value, text)
+    )
+    return max(usable, default=None)
+
+
+def parse_document_amount_value(value: str) -> Decimal | None:
+    """Parse a positive amount out of *value* using ``DOCUMENT_AMOUNT_PATTERNS``.
+
+    Patterns are tried in order. For each match the captured number has ","
+    replaced by "." and is rounded to two decimal places; a pattern whose
+    capture cannot be converted, or is not greater than zero, is skipped.
+
+    Returns:
+        The first positive amount found, or ``None`` for empty input or no match.
+    """
+    cleaned = str(value or "").strip()
+    if not cleaned:
+        return None
+    for regex in DOCUMENT_AMOUNT_PATTERNS:
+        found = regex.search(cleaned)
+        if found is None:
+            continue
+        digits = str(found.group(1) or "").replace(",", ".").strip()
+        try:
+            parsed = Decimal(digits).quantize(Decimal("0.01"))
+        except (InvalidOperation, ValueError):
+            continue
+        if parsed > Decimal("0.00"):
+            return parsed
+    return None
+
+
+def parse_plain_document_amount_value(value: str) -> Decimal | None:
+    """Parse *value* when it is nothing but a bare number.
+
+    After stripping, the whole string must be one to six digits optionally
+    followed by "." or "," and one or two decimals. A "," is read as the
+    decimal separator and the result is rounded to two decimal places.
+
+    Returns:
+        The amount when it is greater than zero, otherwise ``None``.
+    """
+    candidate = str(value or "").strip()
+    if re.fullmatch(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?", candidate) is None:
+        return None
+    try:
+        parsed = Decimal(candidate.replace(",", ".")).quantize(Decimal("0.01"))
+    except (InvalidOperation, ValueError):
+        return None
+    if parsed > Decimal("0.00"):
+        return parsed
+    return None
+
+
+def is_probable_year_amount(amount: Decimal | None) -> bool:
+    """Return True when *amount* is a whole number from 1900 to 2099 inclusive.
+
+    The value is rounded to two decimal places before the check. ``None`` and
+    values that cannot be converted or quantized yield ``False``.
+    """
+    if amount is None:
+        return False
+    try:
+        normalized = Decimal(amount).quantize(Decimal("0.01"))
+    except (InvalidOperation, ValueError):
+        return False
+    return (
+        normalized == normalized.to_integral_value()
+        and Decimal("1900") <= normalized <= Decimal("2099")
+    )
+
+
+def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
+    """Return True when *amount* looks like the year of a date written in *text*.
+
+    The amount must pass ``is_probable_year_amount``, and *text* must contain
+    that year (not preceded by another digit) followed by "年", "-", "/" or "."
+    and a month number from 1 to 12.
+
+    Args:
+        amount: Candidate amount; ``None`` is never date-like.
+        text: Text to search; ``None`` or empty text never matches.
+
+    Returns:
+        Whether the year/month shape was found in *text*.
+    """
+    if not is_probable_year_amount(amount):
+        return False
+    year = str(int(Decimal(amount)))
+    # The month must be a real month (1-12) with no further digit after it.
+    # Accepting any digits here made a genuine amount written with decimals,
+    # e.g. "2000.00", look like "year.month" and discarded it as a date.
+    pattern = re.compile(
+        rf"(?<!\d){re.escape(year)}\s*(?:年|[-/.])\s*(?:0?[1-9]|1[0-2])(?!\d)"
+    )
+    return bool(pattern.search(str(text or "")))
+
+
+def format_decimal_amount(amount: Decimal | None) -> str:
+    """Render *amount* as fixed-point text with exactly two decimal places.
+
+    Returns:
+        The formatted number, or an empty string when *amount* is ``None``.
+    """
+    if amount is None:
+        return ""
+    return f"{Decimal(amount).quantize(Decimal('0.01')):f}"
+
+
+def extract_amount_candidates(text: str) -> list[Decimal]:
+    """Collect distinct amount candidates from *text* in order of discovery.
+
+    Every pattern in ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` is scanned in turn. Only
+    when none of them produced a candidate is the text scanned for bare
+    decimals (one to six digits, a dot, one or two decimals). Captures have ","
+    replaced by "." and are rounded to two decimal places. Captures that cannot
+    be converted, repeat an earlier value, or are flagged by
+    ``is_amount_match_date_fragment`` are skipped.
+    """
+    collected: list[Decimal] = []
+    known: set[Decimal] = set()
+
+    def consume(matches) -> None:
+        """Add the group-1 capture of each match unless it is rejected."""
+        for found in matches:
+            token = str(found.group(1) or "").replace(",", ".").strip()
+            if not token:
+                continue
+            try:
+                value = Decimal(token).quantize(Decimal("0.01"))
+            except (InvalidOperation, ValueError):
+                continue
+            if is_amount_match_date_fragment(value, text, found.start(1), found.end(1)):
+                continue
+            if value not in known:
+                known.add(value)
+                collected.append(value)
+
+    for regex in DOCUMENT_TEXT_AMOUNT_PATTERNS:
+        consume(re.finditer(regex, text, flags=re.IGNORECASE))
+
+    # Fall back to bare decimals only when the labelled patterns found nothing.
+    if not collected:
+        consume(re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", text))
+    return collected
+
+
+def is_amount_match_date_fragment(
+    amount: Decimal,
+    text: str,
+    start: int,
+    end: int,
+) -> bool:
+    """Tell whether the match at ``text[start:end]`` is the year part of a date.
+
+    Only amounts accepted by ``is_probable_year_amount`` are considered, and a
+    negative *start* or *end* means "no position", which is never a date.
+    The match is a date fragment when the ten characters after it begin with
+    "年", "-", "/" or "." and a number, or when the eight characters before it
+    end with a one- or two-digit number and one of those separators.
+    """
+    if min(start, end) < 0:
+        return False
+    if not is_probable_year_amount(amount):
+        return False
+
+    haystack = str(text or "")
+    leading = haystack[max(0, start - 8):start]
+    trailing = haystack[end:end + 10]
+    return bool(
+        re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", trailing)
+        or re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", leading)
+    )