fix(reimbursement): harden assistant draft and claim cleanup

This commit is contained in:
caoxiaozhu
2026-05-21 23:52:34 +08:00
parent e701fa01da
commit 2908dda024
9 changed files with 1060 additions and 398 deletions

View File

@@ -0,0 +1,206 @@
from __future__ import annotations
import re
from decimal import Decimal, InvalidOperation
from typing import Any
# Compiled patterns for pulling an amount out of a single value string.
# parse_document_amount_value tries them in this order and reads capture
# group 1, which is the numeric part: digits with an optional 1-2 digit
# fraction after "." or ",".
#   1. an amount label (total, order amount, paid amount, fare, room fee, ...)
#      followed by optional separators / currency markers, then the number
#   2. a yen/yuan sign followed by the number
#   3. a number followed by the yuan character
# NOTE(review): the yen sign appears twice inside the character classes as
# rendered here; presumably one of them was the full-width form (U+FFE5), and
# a full-width colon may have been lost the same way - confirm the encoding.
DOCUMENT_AMOUNT_PATTERNS = (
    re.compile(
        r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
        r"[:\s¥¥人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
    ),
    re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)
# Field keys that mark a structured document field as carrying an amount.
# Keys are compared after being stripped, lower-cased and having underscores
# removed, so "total_amount" and "totalAmount" both match "totalamount".
DOCUMENT_AMOUNT_FIELD_KEYS = {
    "amount",
    "totalamount",
    "paymentamount",
    "paidamount",
    "actualamount",
}
# Substrings that mark a field as an amount field when they occur anywhere in
# the field's label (the label is compared with spaces removed). This is a
# containment test, not an exact match.
DOCUMENT_AMOUNT_LABEL_TOKENS = (
    "金额",
    "价税合计",
    "合计",
    "总额",
    "总计",
    "票价",
    "支付金额",
    "实付金额",
    "实收金额",
)
# Uncompiled regex strings that extract_amount_candidates scans over free
# text. Same three shapes as DOCUMENT_AMOUNT_PATTERNS (label + number,
# currency sign + number, number + yuan character), but the integer part of
# capture group 1 is capped at 6 digits.
# NOTE(review): the yen sign appears twice inside the character classes as
# rendered here; presumably one was the full-width form (U+FFE5) - confirm.
DOCUMENT_TEXT_AMOUNT_PATTERNS = (
    r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)[:\s¥¥人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"[¥¥]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
)
def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
    """Pick the amount for one document, preferring structured fields.

    The document's ``summary`` and ``text`` values are stripped and joined
    with a space to form the free text. The amount found in the structured
    fields wins unless it looks like a year that the free text uses as part
    of a date; in that case, or when no field amount exists, the amount
    extracted from the free text is returned instead.

    Args:
        document: Mapping read via ``summary``, ``text`` and
            ``document_fields``.

    Returns:
        The resolved amount, or ``None`` when nothing usable was found.
    """
    summary = str(document.get("summary") or "").strip()
    body = str(document.get("text") or "").strip()
    combined = f"{summary} {body}".strip()

    from_fields = resolve_document_field_amount(document)
    from_text = resolve_document_text_amount(combined)

    # A missing field amount and a year-like field amount both fall back to
    # whatever the free text yielded.
    if from_fields is None or is_date_like_amount_candidate(from_fields, combined):
        return from_text
    return from_fields
def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the first parseable amount among the document's amount fields.

    Entries of ``document["document_fields"]`` that are not dicts are
    skipped. A field counts as an amount field when its normalized key
    (stripped, lower-cased, underscores removed) is in
    ``DOCUMENT_AMOUNT_FIELD_KEYS`` or its label (spaces removed) contains one
    of ``DOCUMENT_AMOUNT_LABEL_TOKENS``.

    Args:
        document: Mapping whose ``document_fields`` entry is iterated.

    Returns:
        The first positive amount parsed from an amount field's ``value``,
        or ``None`` when no field yields one.
    """
    entries = list(document.get("document_fields") or [])
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        if normalized_key not in DOCUMENT_AMOUNT_FIELD_KEYS and not any(
            token in compact_label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
        ):
            continue

        value_text = str(entry.get("value") or "")
        # Decorated values ("¥12.50", "12.50元") are tried first; a bare
        # number is the fallback.
        parsed = parse_document_amount_value(value_text)
        if not parsed:
            parsed = parse_plain_document_amount_value(value_text)
        if parsed is not None:
            return parsed
    return None
def resolve_document_text_amount(text: str) -> Decimal | None:
    """Return the largest amount found in free text, ignoring date-like years.

    Args:
        text: Free text to scan for amount candidates.

    Returns:
        The maximum candidate that is not classified as a date-like year
        for this text, or ``None`` when no candidate remains.
    """
    largest: Decimal | None = None
    for value in extract_amount_candidates(text):
        if is_date_like_amount_candidate(value, text):
            continue
        if largest is None or value > largest:
            largest = value
    return largest
def parse_document_amount_value(value: str) -> Decimal | None:
    """Parse a positive amount out of a labelled or currency-marked value.

    The value is matched against ``DOCUMENT_AMOUNT_PATTERNS`` in order, and
    the first pattern that yields a positive number wins. A comma followed by
    one or two digits is treated as a decimal separator.

    Thousands separators are removed before matching. Without that step the
    patterns capture at most two digits after a comma, so "¥1,234.56" was
    read as 1.23 and "1,234元" as 234.

    Args:
        value: Raw field value; ``None`` or an empty string yields ``None``.

    Returns:
        The amount quantized to two decimal places, or ``None`` when no
        pattern produces a positive number.
    """
    raw_value = str(value or "").strip()
    if not raw_value:
        return None

    # Collapse grouped digits ("1,234" / "12,345,678") into plain digits.
    # A group needs exactly three digits after each comma, so a decimal comma
    # ("12,5" / "12,50") is never touched.
    normalized = re.sub(
        r"(?<![\d.,])\d{1,3}(?:,\d{3})+(?![\d,])",
        lambda grouped: grouped.group(0).replace(",", ""),
        raw_value,
    )

    for pattern in DOCUMENT_AMOUNT_PATTERNS:
        match = pattern.search(normalized)
        if not match:
            continue
        numeric = str(match.group(1) or "").replace(",", ".").strip()
        try:
            amount = Decimal(numeric).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            continue
        # Zero is not a usable amount; keep trying the remaining patterns.
        if amount > Decimal("0.00"):
            return amount
    return None
def parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Parse a value that is nothing but a number into a positive amount.

    Two shapes are accepted:

    * up to six digits with an optional one or two digit fraction after
      "." or "," (a comma here is a decimal separator: "12,5" -> 12.50);
    * one thousands group such as "1,234" or "1,234.56". This stays within
      the same six-digit limit and was previously rejected outright.

    Args:
        value: Raw field value; ``None`` or an empty string yields ``None``.

    Returns:
        The amount quantized to two decimal places, or ``None`` when the
        value has any other shape or is not greater than zero.
    """
    raw_value = str(value or "").strip()

    # The grouped form needs exactly three digits after the comma, so it can
    # never overlap with the decimal-comma form below.
    if re.fullmatch(r"[0-9]{1,3},[0-9]{3}(?:\.[0-9]{1,2})?", raw_value):
        numeric = raw_value.replace(",", "")
    elif re.fullmatch(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?", raw_value):
        numeric = raw_value.replace(",", ".")
    else:
        return None

    try:
        amount = Decimal(numeric).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return None
    return amount if amount > Decimal("0.00") else None
def is_probable_year_amount(amount: Decimal | None) -> bool:
    """Tell whether an amount could just as well be a calendar year.

    Args:
        amount: Value to inspect; ``None`` is never a year.

    Returns:
        ``True`` when the value, rounded to two decimals, is a whole number
        between 1900 and 2099 inclusive. Values that cannot be converted or
        quantized yield ``False``.
    """
    if amount is None:
        return False
    try:
        rounded = Decimal(amount).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return False
    # Anything with a fractional part is a real amount, not a year.
    if rounded != rounded.to_integral_value():
        return False
    return Decimal("1900") <= rounded <= Decimal("2099")
def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
    """Tell whether a year-valued amount appears in *text* as part of a date.

    The amount must first pass ``is_probable_year_amount``. The text is then
    searched for that year followed by a year marker or a "-", "/" or "."
    separator and a month number.

    The month must be a real month (1-12) and must not run on into further
    digits. Accepting any one or two digits made an ordinary amount printed
    as "2024.00" count as a date, so it was thrown away.

    NOTE(review): this is a whole-text check. If the same year value occurs
    both as a date and as a genuine amount, the amount is still classified
    as date-like.

    Args:
        amount: Candidate amount; ``None`` is never date-like.
        text: Text to search; ``None`` or empty text never matches.

    Returns:
        ``True`` when the year is found in a year-month position.
    """
    if not is_probable_year_amount(amount):
        return False
    year = str(int(Decimal(amount or 0)))
    pattern = re.compile(
        rf"(?<!\d){re.escape(year)}\s*(?:年|[-/.])\s*(?:1[0-2]|0?[1-9])(?!\d)"
    )
    return bool(pattern.search(str(text or "")))
def format_decimal_amount(amount: Decimal | None) -> str:
    """Render an amount as fixed-point text with two decimal places.

    Args:
        amount: Value to render, or ``None``.

    Returns:
        An empty string for ``None``; otherwise the value quantized to two
        decimals and formatted without exponent notation.
    """
    if amount is None:
        return ""
    two_places = Decimal(amount).quantize(Decimal("0.01"))
    return f"{two_places:f}"
def extract_amount_candidates(text: str) -> list[Decimal]:
    """Collect distinct amount candidates from free text, in discovery order.

    Every pattern in ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` is scanned over the
    text. Only when none of them yields a candidate is a looser fallback
    used, which accepts any standalone number with a decimal point.
    Candidates that ``is_amount_match_date_fragment`` identifies as the year
    part of a date are dropped, and duplicates are reported once.

    Two input problems are handled before scanning:

    * ``None`` (or any falsy value) is treated as empty text instead of
      raising inside ``re.finditer``; the sibling helpers coerce the same
      way.
    * A single thousands separator is removed ("1,234.56" -> "1234.56").
      The patterns only allow two digits after a comma, so grouped numbers
      used to be truncated or split ("1,234.56元" produced 234.56). Only one
      group is collapsed because the patterns cap the integer part at six
      digits.

    Args:
        text: Free text to scan.

    Returns:
        Candidates quantized to two decimals; empty when nothing matches.
    """
    source = str(text or "")
    source = re.sub(r"(?<![\d.,])(\d{1,3}),(\d{3})(?![\d,])", r"\1\2", source)

    values: list[Decimal] = []
    seen: set[Decimal] = set()

    def append_candidate(
        raw: str,
        *,
        source_text: str = "",
        start: int = -1,
        end: int = -1,
    ) -> None:
        """Convert one captured number and record it if it is new and usable."""
        compact = str(raw or "").replace(",", ".").strip()
        if not compact:
            return
        try:
            candidate = Decimal(compact).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            return
        # Offsets refer to source_text, which is the normalized text.
        if is_amount_match_date_fragment(candidate, source_text, start, end):
            return
        if candidate in seen:
            return
        seen.add(candidate)
        values.append(candidate)

    for pattern in DOCUMENT_TEXT_AMOUNT_PATTERNS:
        for match in re.finditer(pattern, source, flags=re.IGNORECASE):
            append_candidate(
                match.group(1),
                source_text=source,
                start=match.start(1),
                end=match.end(1),
            )
    if values:
        return values

    # Fallback: no labelled or currency-marked number was found, so accept
    # bare numbers that carry a decimal fraction.
    for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", source):
        append_candidate(
            match.group(1),
            source_text=source,
            start=match.start(1),
            end=match.end(1),
        )
    return values
def is_amount_match_date_fragment(
    amount: Decimal,
    text: str,
    start: int,
    end: int,
) -> bool:
    """Tell whether a matched year-like number sits inside a date in *text*.

    Args:
        amount: The number that was matched.
        text: The text the match came from.
        start: Offset of the match start in *text*; negative means unknown.
        end: Offset just past the match in *text*; negative means unknown.

    Returns:
        ``True`` when the amount is a probable year and is either directly
        followed by a year marker or "-", "/", "." separator plus one or two
        digits, or directly preceded by one or two digits plus such a
        marker. Up to 8 characters before and 10 after the match are
        examined. Unknown offsets and non-year amounts give ``False``.
    """
    if start < 0 or end < 0:
        return False
    if not is_probable_year_amount(amount):
        return False

    haystack = str(text or "")
    leading = haystack[max(0, start - 8):start]
    trailing = haystack[end:end + 10]

    followed_by_date_part = re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", trailing)
    if followed_by_date_part:
        return True
    preceded_by_date_part = re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", leading)
    return preceded_by_date_part is not None