207 lines
6.3 KiB
Python
207 lines
6.3 KiB
Python
from __future__ import annotations
|
||
|
||
import re
|
||
from decimal import Decimal, InvalidOperation
|
||
from typing import Any
|
||
|
||
# Compiled patterns that parse_document_amount_value() tries, in order, against
# a single value string: a labelled amount, a currency-sign prefix, and a "元"
# suffix. Group 1 is always the numeric part; "," and "." are both accepted
# before the (at most two) fractional digits.
#
# The separator/currency character classes accept the half-width AND the
# full-width forms: full-width colon (U+FF1A) and full-width yen sign (U+FFE5).
# The full-width forms are written as \u escapes so the look-alike characters
# cannot be silently collapsed into their half-width twins by Unicode
# normalisation of this source file.
DOCUMENT_AMOUNT_PATTERNS = (
    re.compile(
        r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
        r"[:\uFF1A\s¥\uFFE5人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
    ),
    re.compile(r"[¥\uFFE5]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)

# Field keys recognised as amount fields. resolve_document_field_amount()
# compares against the key after lower-casing it and removing underscores.
DOCUMENT_AMOUNT_FIELD_KEYS = {
    "amount",
    "totalamount",
    "paymentamount",
    "paidamount",
    "actualamount",
}

# Substrings that mark a field label (with spaces removed) as an amount field.
DOCUMENT_AMOUNT_LABEL_TOKENS = (
    "金额",
    "价税合计",
    "合计",
    "总额",
    "总计",
    "票价",
    "支付金额",
    "实付金额",
    "实收金额",
)

# Uncompiled pattern strings that extract_amount_candidates() scans free text
# with (it passes flags to re.finditer, so these must stay plain strings).
# Same three shapes as DOCUMENT_AMOUNT_PATTERNS, but the integer part is capped
# at six digits. Half-width and full-width colon / yen sign are both accepted.
DOCUMENT_TEXT_AMOUNT_PATTERNS = (
    (
        r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)"
        r"[:\uFF1A\s¥\uFFE5人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)"
    ),
    r"[¥\uFFE5]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
)
|
||
|
||
|
||
def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
    """Resolve the amount of one document, preferring structured fields.

    The document's "summary" and "text" entries are joined into one string
    that serves both as the free-text source and as the context for the
    date check.

    Args:
        document: Mapping read via ``.get`` for "summary", "text" and
            (indirectly) "document_fields".

    Returns:
        The field-derived amount when one exists and does not look like the
        year of a date found in the text; otherwise the text-derived amount,
        which may be ``None``.
    """
    summary_part = str(document.get("summary") or "").strip()
    body_part = str(document.get("text") or "").strip()
    combined_text = f"{summary_part} {body_part}".strip()

    from_fields = resolve_document_field_amount(document)
    from_text = resolve_document_text_amount(combined_text)

    if from_fields is None:
        return from_text

    # A field value that is really the year of a date in the text is not
    # trusted; fall back to whatever the free text yields.
    looks_like_year = is_date_like_amount_candidate(from_fields, combined_text)
    return from_text if looks_like_year else from_fields
|
||
|
||
|
||
def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the first parseable amount among the document's amount fields.

    A field counts as an amount field when its key (lower-cased, underscores
    removed) is in ``DOCUMENT_AMOUNT_FIELD_KEYS`` or when its label (spaces
    removed) contains any of ``DOCUMENT_AMOUNT_LABEL_TOKENS``. Entries of
    "document_fields" that are not dicts are ignored.

    Args:
        document: Mapping whose "document_fields" entry, if present, is
            iterated for field dicts with "key", "label" and "value".

    Returns:
        The amount parsed from the first amount field whose value parses,
        or ``None`` when no field yields one.
    """
    for entry in document.get("document_fields") or []:
        if not isinstance(entry, dict):
            continue

        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        if normalized_key not in DOCUMENT_AMOUNT_FIELD_KEYS and not any(
            token in compact_label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
        ):
            continue

        field_text = str(entry.get("value") or "")
        # Try the labelled/currency-marked forms first, then a bare number.
        parsed = parse_document_amount_value(field_text)
        if not parsed:
            parsed = parse_plain_document_amount_value(field_text)
        if parsed is not None:
            return parsed

    return None
|
||
|
||
|
||
def resolve_document_text_amount(text: str) -> Decimal | None:
    """Return the largest amount candidate in ``text`` that is not date-like.

    Candidates come from ``extract_amount_candidates``; any candidate that
    ``is_date_like_amount_candidate`` flags against the same text is dropped.

    Returns:
        The maximum remaining candidate, or ``None`` when none remain.
    """
    usable = (
        amount
        for amount in extract_amount_candidates(text)
        if not is_date_like_amount_candidate(amount, text)
    )
    return max(usable, default=None)
|
||
|
||
|
||
# A number written with "," thousands separators, e.g. "1,234" or
# "12,345,678.90" (only the integer part is matched). The look-arounds keep
# the match from starting or ending in the middle of a longer number, and the
# mandatory three-digit groups leave decimal-comma values such as "12,5" alone.
_THOUSANDS_GROUPED_NUMBER_RE = re.compile(
    r"(?<![0-9.,])[1-9][0-9]{0,2}(?:,[0-9]{3})+(?![0-9])"
)


def parse_document_amount_value(value: str) -> Decimal | None:
    """Parse a labelled or currency-marked amount out of a field value.

    Each pattern in ``DOCUMENT_AMOUNT_PATTERNS`` is tried in order; the first
    one whose captured number converts to a strictly positive value wins.

    Thousands separators are removed before matching. Without that step the
    patterns read the first "," as a decimal separator, so "¥1,234.56" was
    parsed as 1.23 and "1,200元" as 200. A "," followed by one or two digits
    is still treated as a decimal separator.

    Args:
        value: Raw field value; falsy values are treated as an empty string.

    Returns:
        The amount quantized to two decimal places, or ``None`` when the
        value is empty, no pattern matches, or every match is non-positive
        or not convertible.
    """
    raw_value = str(value or "").strip()
    if not raw_value:
        return None

    normalized_value = _THOUSANDS_GROUPED_NUMBER_RE.sub(
        lambda grouped: grouped.group(0).replace(",", ""),
        raw_value,
    )

    for pattern in DOCUMENT_AMOUNT_PATTERNS:
        match = pattern.search(normalized_value)
        if not match:
            continue
        # Any comma left at this point is a decimal comma.
        numeric = str(match.group(1) or "").replace(",", ".").strip()
        try:
            amount = Decimal(numeric).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            # E.g. too many digits for the decimal context; try the next pattern.
            continue
        if amount > Decimal("0.00"):
            return amount
    return None
|
||
|
||
|
||
# Bare number: up to six integer digits, optional "." or "," decimal separator
# with one or two fractional digits.
_PLAIN_AMOUNT_RE = re.compile(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?")
# Bare number with a single "," thousands group (so still at most six integer
# digits); here only "." can be the decimal separator. The leading digit must
# be non-zero so "0,123" is not read as 123.
_GROUPED_PLAIN_AMOUNT_RE = re.compile(r"[1-9][0-9]{0,2},[0-9]{3}(?:\.[0-9]{1,2})?")


def parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Parse a value that consists of nothing but a number.

    Accepts "128", "99.9", "12,5" (decimal comma) and, additionally,
    thousands-grouped values such as "1,234.56" or "12,000", which were
    previously rejected. Surrounding whitespace is ignored; any other
    character makes the value unparseable.

    Args:
        value: Raw field value; falsy values are treated as an empty string.

    Returns:
        The amount quantized to two decimal places, or ``None`` when the
        value is not a bare number or is not strictly positive.
    """
    raw_value = str(value or "").strip()

    if _GROUPED_PLAIN_AMOUNT_RE.fullmatch(raw_value):
        # The comma is a thousands separator: drop it.
        numeric = raw_value.replace(",", "")
    elif _PLAIN_AMOUNT_RE.fullmatch(raw_value):
        # The comma, if any, is a decimal separator.
        numeric = raw_value.replace(",", ".")
    else:
        return None

    try:
        amount = Decimal(numeric).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return None
    return amount if amount > Decimal("0.00") else None
|
||
|
||
|
||
def is_probable_year_amount(amount: Decimal | None) -> bool:
    """Tell whether ``amount`` could be a calendar year rather than money.

    The value is quantized to two decimal places; it is year-like when the
    result is a whole number between 1900 and 2099 inclusive.

    Returns:
        ``False`` for ``None`` and for values that cannot be converted or
        quantized; otherwise the result of the whole-number and range check.
    """
    if amount is None:
        return False

    try:
        cents = Decimal(amount).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return False

    # Keep the equality test first: it is safe for any Decimal, and it
    # short-circuits before the ordering comparison is evaluated.
    is_whole = cents == cents.to_integral_value()
    return is_whole and Decimal("1900") <= cents <= Decimal("2099")
|
||
|
||
|
||
def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
    """Tell whether ``amount`` is probably the year of a date found in ``text``.

    The amount must be year-like (see ``is_probable_year_amount``) and the
    text must contain that year, not preceded by another digit, followed by
    "年", "-", "/" or "." and then a month number.

    The month must be a real month (1-12, optional leading zero) that is not
    followed by a further digit. Previously any one or two digits were
    accepted, so an amount printed as "2023.00" matched its own fractional
    part and was thrown away as a date.

    Args:
        amount: Candidate amount, or ``None``.
        text: Text to search; falsy values are treated as an empty string.

    Returns:
        ``True`` when the year-plus-month shape is found, else ``False``.
    """
    if not is_probable_year_amount(amount):
        return False

    year = str(int(Decimal(amount or 0)))
    pattern = re.compile(
        rf"(?<!\d){re.escape(year)}\s*(?:年|[-/.])\s*(?:0?[1-9]|1[0-2])(?!\d)"
    )
    return bool(pattern.search(str(text or "")))
|
||
|
||
|
||
def format_decimal_amount(amount: Decimal | None) -> str:
    """Render ``amount`` as fixed-point text with exactly two decimal places.

    Returns:
        An empty string for ``None``; otherwise the value quantized to
        two decimals and formatted without exponent notation.
    """
    if amount is None:
        return ""
    return f"{Decimal(amount).quantize(Decimal('0.01')):f}"
|
||
|
||
|
||
def extract_amount_candidates(text: str) -> list[Decimal]:
    """Collect distinct amount candidates from free text, in order found.

    First every pattern in ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` is scanned over
    the text (case-insensitively). Only when that yields nothing is the text
    scanned for bare decimals such as "12.50". A captured number is skipped
    when it is empty, cannot be converted, or sits inside a date according
    to ``is_amount_match_date_fragment``.

    Returns:
        Candidates quantized to two decimal places, without duplicates,
        ordered by first appearance; possibly empty.
    """
    # dict keys double as an insertion-ordered set of the accepted values.
    collected: dict[Decimal, None] = {}

    def consider(match: re.Match[str]) -> None:
        literal = str(match.group(1) or "").replace(",", ".").strip()
        if not literal:
            return
        try:
            value = Decimal(literal).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            return
        if is_amount_match_date_fragment(value, text, match.start(1), match.end(1)):
            return
        collected.setdefault(value)

    for pattern in DOCUMENT_TEXT_AMOUNT_PATTERNS:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            consider(match)

    if not collected:
        # Fallback: nothing labelled or currency-marked, take bare decimals.
        for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", text):
            consider(match)

    return list(collected)
|
||
|
||
|
||
def is_amount_match_date_fragment(
    amount: Decimal,
    text: str,
    start: int,
    end: int,
) -> bool:
    """Tell whether the number at ``text[start:end]`` is part of a date.

    Only year-like amounts (see ``is_probable_year_amount``) with
    non-negative positions are examined. The number counts as a date
    fragment when the up-to-10 characters after it begin with "年", "-",
    "/" or "." followed by one or two digits, or when the up-to-8
    characters before it end with one or two digits followed by one of
    those separators.

    Args:
        amount: The parsed value of the matched number.
        text: The text the number was matched in.
        start: Start offset of the number in ``text``; negative disables the check.
        end: End offset of the number in ``text``; negative disables the check.

    Returns:
        ``True`` when a neighbouring date part is found, else ``False``.
    """
    if start < 0 or end < 0:
        return False
    if not is_probable_year_amount(amount):
        return False

    haystack = str(text or "")
    trailing = haystack[end:end + 10]
    leading = haystack[max(0, start - 8):start]

    return (
        re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", trailing) is not None
        or re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", leading) is not None
    )
|