from __future__ import annotations

import re
from decimal import Decimal, InvalidOperation
from typing import Any

# Patterns tried, in order, against a single field value.  Each captures the
# numeric part in group 1.  The separator class accepts both the ASCII and the
# full-width colon (U+FF1A) and both yen signs (U+00A5, U+FFE5); the escapes
# are spelled out so the full-width forms survive text normalisation.
DOCUMENT_AMOUNT_PATTERNS = (
    re.compile(
        r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
        r"[:\uff1a\s¥\uffe5人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
    ),
    re.compile(r"[¥\uffe5]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)

# Field keys (lower-cased, underscores removed) that denote an amount.
DOCUMENT_AMOUNT_FIELD_KEYS = {
    "amount",
    "totalamount",
    "paymentamount",
    "paidamount",
    "actualamount",
}

# Substrings of a field label (spaces removed) that denote an amount.
DOCUMENT_AMOUNT_LABEL_TOKENS = (
    "金额",
    "价税合计",
    "合计",
    "总额",
    "总计",
    "票价",
    "支付金额",
    "实付金额",
    "实收金额",
)

# Patterns scanned over free text; the integer part is capped at six digits.
DOCUMENT_TEXT_AMOUNT_PATTERNS = (
    r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)[:\uff1a\s¥\uffe5人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"[¥\uffe5]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
)


def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the amount for *document*, preferring structured fields.

    The ``summary`` and ``text`` entries are joined into one string.  The
    amount found in ``document_fields`` wins unless it looks like the year of
    a date appearing in that string, in which case the text-derived amount
    (possibly ``None``) is returned instead.
    """
    summary = str(document.get("summary") or "").strip()
    body = str(document.get("text") or "").strip()
    text = " ".join([summary, body]).strip()

    field_amount = resolve_document_field_amount(document)
    text_amount = resolve_document_text_amount(text)

    if field_amount is None:
        return text_amount
    # A field value such as "2024" is distrusted when the text holds a date
    # starting with the same year.
    if is_date_like_amount_candidate(field_amount, text):
        return text_amount
    return field_amount


def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the first parseable amount among ``document["document_fields"]``.

    A field counts as an amount field when its normalised ``key`` is in
    ``DOCUMENT_AMOUNT_FIELD_KEYS`` or its ``label`` contains one of
    ``DOCUMENT_AMOUNT_LABEL_TOKENS``.  Non-dict entries are ignored.
    """
    for field in document.get("document_fields") or []:
        if not isinstance(field, dict):
            continue
        key = str(field.get("key") or "").strip().lower().replace("_", "")
        label = str(field.get("label") or "").replace(" ", "")
        is_amount_field = key in DOCUMENT_AMOUNT_FIELD_KEYS or any(
            token in label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
        )
        if not is_amount_field:
            continue
        raw_value = str(field.get("value") or "")
        # Decorated values ("¥12.50", "12元") first, then a bare number.
        value = parse_document_amount_value(raw_value)
        if value is None:
            value = parse_plain_document_amount_value(raw_value)
        if value is not None:
            return value
    return None


def resolve_document_text_amount(text: str) -> Decimal | None:
    """Return the largest non-date-like amount candidate found in *text*."""
    candidates = [
        candidate
        for candidate in extract_amount_candidates(text)
        if not is_date_like_amount_candidate(candidate, text)
    ]
    if not candidates:
        return None
    return max(candidates)


def parse_document_amount_value(value: str) -> Decimal | None:
    """Parse a labelled or currency-marked amount out of *value*.

    Tries each of ``DOCUMENT_AMOUNT_PATTERNS`` in order and returns the first
    strictly positive match, rounded to two places; ``None`` otherwise.
    A comma in the captured number is treated as a decimal separator.
    """
    raw_value = str(value or "").strip()
    if not raw_value:
        return None
    for pattern in DOCUMENT_AMOUNT_PATTERNS:
        match = pattern.search(raw_value)
        if not match:
            continue
        numeric = str(match.group(1) or "").replace(",", ".").strip()
        try:
            amount = Decimal(numeric).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            continue
        if amount > Decimal("0.00"):
            return amount
    return None


def parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Parse *value* when it is nothing but a number (1-6 integer digits).

    Returns the value rounded to two places, or ``None`` when the string has
    any other shape or the number is not strictly positive.
    """
    raw_value = str(value or "").strip()
    if not re.fullmatch(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?", raw_value):
        return None
    try:
        amount = Decimal(raw_value.replace(",", ".")).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return None
    return amount if amount > Decimal("0.00") else None


def is_probable_year_amount(amount: Decimal | None) -> bool:
    """Return True when *amount* is a whole number between 1900 and 2099."""
    if amount is None:
        return False
    try:
        normalized = Decimal(amount).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return False
    return (
        normalized == normalized.to_integral_value()
        and Decimal("1900") <= normalized <= Decimal("2099")
    )


def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
    """Return True when *amount* looks like the year of a date in *text*."""
    if not is_probable_year_amount(amount):
        return False
    year = str(int(Decimal(amount or 0)))
    # NOTE(review): the tail of this pattern and the return statement were
    # lost in the copy of the file under review.  Reconstructed to mirror the
    # "year followed by 年 or a date separator and 1-2 digits" test used in
    # is_amount_match_date_fragment -- confirm against version control.
    pattern = re.compile(rf"(?<!\d){year}\s*(?:年|[-/.])\s*\d{{1,2}}")
    return bool(pattern.search(str(text or "")))


# NOTE(review): this function's header was lost in the copy of the file under
# review; only the body survived.  The name and parameter annotation are
# reconstructed -- confirm the real name against callers / version control.
def format_document_amount(amount: Decimal | None) -> str:
    """Format *amount* with exactly two decimals; ``""`` for ``None``."""
    if amount is None:
        return ""
    normalized = Decimal(amount).quantize(Decimal("0.01"))
    return format(normalized, "f")


def extract_amount_candidates(text: str) -> list[Decimal]:
    """Collect distinct amount candidates from *text*, in discovery order.

    Every pattern in ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` is scanned; matches
    that sit inside a date are dropped.  If any candidate is found that list
    is returned; otherwise a fallback scan over bare numbers is used.
    """
    values: list[Decimal] = []
    seen: set[Decimal] = set()

    def append_candidate(
        raw: str,
        *,
        source_text: str = "",
        start: int = -1,
        end: int = -1,
    ) -> None:
        """Normalise *raw* and record it unless invalid, date-like or seen."""
        compact = str(raw or "").replace(",", ".").strip()
        if not compact:
            return
        try:
            candidate = Decimal(compact).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            return
        if is_amount_match_date_fragment(candidate, source_text, start, end):
            return
        if candidate in seen:
            return
        seen.add(candidate)
        values.append(candidate)

    for pattern in DOCUMENT_TEXT_AMOUNT_PATTERNS:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            append_candidate(
                match.group(1),
                source_text=text,
                start=match.start(1),
                end=match.end(1),
            )
    if values:
        return values

    # NOTE(review): the fallback pattern and loop body were lost in the copy
    # of the file under review.  Reconstructed as "any stand-alone number of
    # the same shape as the labelled patterns" -- confirm against version
    # control.
    for match in re.finditer(
        r"(?<!\d)([0-9]{1,6}(?:[.,][0-9]{1,2})?)(?!\d)", text
    ):
        append_candidate(
            match.group(1),
            source_text=text,
            start=match.start(1),
            end=match.end(1),
        )
    return values


# NOTE(review): the header below was lost in the copy of the file under
# review; the name and argument order come from the call in
# extract_amount_candidates and the parameter names from the surviving body.
def is_amount_match_date_fragment(
    amount: Decimal | None,
    text: str,
    start: int,
    end: int,
) -> bool:
    """Return True when the match at ``text[start:end]`` is part of a date.

    Only year-like amounts are considered.  The match is a date fragment when
    it is directly followed by 年 or one of ``- / .`` and one or two digits,
    or directly preceded by one or two digits and such a separator.
    """
    if start < 0 or end < 0 or not is_probable_year_amount(amount):
        return False
    before = str(text or "")[max(0, start - 8):start]
    after = str(text or "")[end:end + 10]
    if re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", after):
        return True
    if re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", before):
        return True
    return False