207 lines
6.3 KiB
Python
207 lines
6.3 KiB
Python
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import re
|
|||
|
|
from decimal import Decimal, InvalidOperation
|
|||
|
|
from typing import Any
|
|||
|
|
|
|||
|
|
# Compiled patterns that parse_document_amount_value tries in order against a
# single field value; the numeric part is always capture group 1.
# Both "." and "," are accepted before the 1-2 fractional digits.
# NOTE(review): because "," is read as a decimal separator, a thousands-grouped
# value such as "1,234.56" would capture only "1,23" — confirm this is intended.
DOCUMENT_AMOUNT_PATTERNS = (
    # An amount label keyword, then any run of colons / whitespace / currency
    # markers / filler characters, then the number.
    re.compile(
        r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
        r"[::\s¥¥人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
    ),
    # A yen/yuan sign followed by the number.
    re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    # The number followed by the "元" unit.
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)
|
|||
|
|
|
|||
|
|
# Field keys that mark a structured field as carrying an amount.
# resolve_document_field_amount compares against these after stripping,
# lower-casing and removing underscores from the field's "key", so entries
# here must be lower-case and underscore-free.
DOCUMENT_AMOUNT_FIELD_KEYS = {
    "amount",
    "totalamount",
    "paymentamount",
    "paidamount",
    "actualamount",
}
|
|||
|
|
# Substrings that mark a structured field as carrying an amount when found
# anywhere in the field's "label" (resolve_document_field_amount removes
# spaces from the label before the substring test).
DOCUMENT_AMOUNT_LABEL_TOKENS = (
    "金额",
    "价税合计",
    "合计",
    "总额",
    "总计",
    "票价",
    "支付金额",
    "实付金额",
    "实收金额",
)
|
|||
|
|
# Pattern strings scanned over free text by extract_amount_candidates; the
# numeric part is capture group 1 and its integer part is limited to 1-6 digits.
# These are deliberately plain strings, not compiled patterns: the consumer
# calls re.finditer(pattern, text, flags=re.IGNORECASE), and passing flags
# together with an already-compiled pattern raises ValueError.
DOCUMENT_TEXT_AMOUNT_PATTERNS = (
    # Amount label keyword, optional separators / currency markers, number.
    r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)[::\s¥¥人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    # Yen/yuan sign followed by the number.
    r"[¥¥]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    # Number followed by the "元" unit.
    r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
    """Resolve one amount for a document from its fields and its text.

    The document's ``summary`` and ``text`` values are stripped and joined
    with a space to form the free text. The structured-field amount wins
    unless it looks like the year of a date found in that text, in which
    case the text-derived amount is used instead.

    Args:
        document: Mapping read via ``summary``, ``text`` and (through
            ``resolve_document_field_amount``) ``document_fields``.

    Returns:
        The chosen amount, or ``None`` when neither source yields one.
    """
    pieces = [str(document.get(name) or "").strip() for name in ("summary", "text")]
    combined_text = " ".join(pieces).strip()

    from_fields = resolve_document_field_amount(document)
    from_text = resolve_document_text_amount(combined_text)

    # No structured amount, or one that is really a date's year: use the text.
    if from_fields is None or is_date_like_amount_candidate(from_fields, combined_text):
        return from_text
    return from_fields
|
|||
|
|
|
|||
|
|
|
|||
|
|
def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the first parseable amount among the document's amount fields.

    Entries of ``document["document_fields"]`` that are not dicts are skipped.
    A field counts as an amount field when its normalized ``key`` (stripped,
    lower-cased, underscores removed) is in ``DOCUMENT_AMOUNT_FIELD_KEYS`` or
    its ``label`` (spaces removed) contains any of
    ``DOCUMENT_AMOUNT_LABEL_TOKENS``.

    Args:
        document: Mapping whose ``document_fields`` entry is iterated.

    Returns:
        The first positive amount parsed from an amount field's ``value``,
        or ``None`` when no field yields one.
    """
    entries = document.get("document_fields") or []
    for entry in list(entries):
        if not isinstance(entry, dict):
            continue

        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        key_matches = normalized_key in DOCUMENT_AMOUNT_FIELD_KEYS
        if not key_matches and not any(
            token in compact_label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
        ):
            continue

        raw = str(entry.get("value") or "")
        # Try the labelled/currency-marked patterns first, then a bare number.
        parsed = parse_document_amount_value(raw)
        if not parsed:
            parsed = parse_plain_document_amount_value(raw)
        if parsed is not None:
            return parsed

    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def resolve_document_text_amount(text: str) -> Decimal | None:
    """Return the largest non-date-like amount candidate found in ``text``.

    Candidates come from ``extract_amount_candidates``; those that
    ``is_date_like_amount_candidate`` flags against the same text are dropped.

    Args:
        text: Free text to scan.

    Returns:
        The maximum remaining candidate, or ``None`` when none remain.
    """
    usable = (
        amount
        for amount in extract_amount_candidates(text)
        if not is_date_like_amount_candidate(amount, text)
    )
    return max(usable, default=None)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_document_amount_value(value: str) -> Decimal | None:
    """Parse an amount from a value using ``DOCUMENT_AMOUNT_PATTERNS``.

    Patterns are searched in order. For each hit, group 1 has ``,`` replaced
    by ``.`` and is converted to a ``Decimal`` quantized to two places. The
    first strictly positive result is returned; hits that fail to convert or
    are not positive fall through to the next pattern.

    Args:
        value: Raw value; falsy values are treated as the empty string.

    Returns:
        The parsed amount, or ``None`` when no pattern yields a positive one.
    """
    cleaned = str(value or "").strip()
    if not cleaned:
        return None

    cent = Decimal("0.01")
    for matcher in DOCUMENT_AMOUNT_PATTERNS:
        hit = matcher.search(cleaned)
        if hit is None:
            continue
        digits = str(hit.group(1) or "").replace(",", ".").strip()
        try:
            parsed = Decimal(digits).quantize(cent)
        except (InvalidOperation, ValueError):
            continue
        if parsed > Decimal("0.00"):
            return parsed

    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Parse a value that is nothing but a bare number.

    After stripping, the whole value must be 1-6 digits optionally followed
    by ``.`` or ``,`` and 1-2 more digits; a ``,`` is read as a decimal point.

    Args:
        value: Raw value; falsy values are treated as the empty string.

    Returns:
        The amount quantized to two places when it is strictly positive,
        otherwise ``None``.
    """
    candidate = str(value or "").strip()
    if re.fullmatch(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?", candidate) is None:
        return None

    try:
        parsed = Decimal(candidate.replace(",", ".")).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return None

    if parsed > Decimal("0.00"):
        return parsed
    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def is_probable_year_amount(amount: Decimal | None) -> bool:
    """Tell whether an amount looks like a calendar year.

    The amount is quantized to two decimal places; it qualifies when the
    result is a whole number between 1900 and 2099 inclusive.

    Args:
        amount: Value to test; ``None`` never qualifies.

    Returns:
        ``True`` for whole values in 1900-2099, ``False`` otherwise,
        including when the value cannot be converted or quantized.
    """
    if amount is None:
        return False

    try:
        rounded = Decimal(amount).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return False

    # Anything with a fractional part left after rounding is not a year.
    if rounded != rounded.to_integral_value():
        return False
    return Decimal("1900") <= rounded <= Decimal("2099")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
    """Tell whether a year-like amount appears in ``text`` as a date's year.

    Only amounts accepted by ``is_probable_year_amount`` are considered. The
    text must contain the year, not preceded by a digit, followed by optional
    whitespace, then ``年`` or one of ``-``, ``/``, ``.``, optional whitespace
    and 1-2 digits.

    Args:
        amount: Candidate amount.
        text: Text to search; falsy values are treated as the empty string.

    Returns:
        ``True`` when such a date-shaped occurrence exists, else ``False``.
    """
    if not is_probable_year_amount(amount):
        return False

    year_digits = re.escape(str(int(Decimal(amount or 0))))
    date_shape = rf"(?<!\d){year_digits}\s*(?:年|[-/.])\s*\d{{1,2}}"
    return re.search(date_shape, str(text or "")) is not None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def format_decimal_amount(amount: Decimal | None) -> str:
    """Render an amount as fixed-point text with two decimal places.

    Args:
        amount: Value to format.

    Returns:
        The empty string for ``None``; otherwise the value quantized to two
        places and formatted without exponent notation.
    """
    if amount is None:
        return ""
    two_places = Decimal(amount).quantize(Decimal("0.01"))
    return f"{two_places:f}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_amount_candidates(text: str) -> list[Decimal]:
    """Collect distinct amount candidates from free text, in discovery order.

    Every pattern in ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` is scanned
    (case-insensitively) over the text. Only if those produce nothing is a
    fallback scan run for bare decimals (1-6 digits, a dot, 1-2 digits, not
    adjacent to other digits).

    Each captured number has ``,`` replaced by ``.`` and is quantized to two
    places. It is dropped when it cannot be converted, when
    ``is_amount_match_date_fragment`` flags it at its position, or when an
    equal value was already collected.

    Args:
        text: Text to scan.

    Returns:
        The de-duplicated candidates; empty when nothing matched.
    """
    found: list[Decimal] = []
    already_seen: set[Decimal] = set()

    def collect(hits) -> None:
        # Normalize each hit's group 1 and keep it if it survives the filters.
        for hit in hits:
            span_start, span_end = hit.start(1), hit.end(1)
            normalized = str(hit.group(1) or "").replace(",", ".").strip()
            if not normalized:
                continue
            try:
                amount = Decimal(normalized).quantize(Decimal("0.01"))
            except (InvalidOperation, ValueError):
                continue
            if is_amount_match_date_fragment(amount, text, span_start, span_end):
                continue
            if amount in already_seen:
                continue
            already_seen.add(amount)
            found.append(amount)

    for labelled_pattern in DOCUMENT_TEXT_AMOUNT_PATTERNS:
        collect(re.finditer(labelled_pattern, text, flags=re.IGNORECASE))

    # The bare-decimal scan is only a fallback for text with no marked amount.
    if not found:
        collect(re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", text))

    return found
|
|||
|
|
|
|||
|
|
|
|||
|
|
def is_amount_match_date_fragment(
    amount: Decimal,
    text: str,
    start: int,
    end: int,
) -> bool:
    """Tell whether a year-like match at ``text[start:end]`` sits inside a date.

    Only amounts accepted by ``is_probable_year_amount`` with non-negative
    offsets are examined. The match counts as a date fragment when the 10
    characters after it begin with optional whitespace, ``年`` or one of
    ``-``, ``/``, ``.``, optional whitespace and 1-2 digits, or when the 8
    characters before it end with 1-2 digits, such a separator and optional
    whitespace.

    Args:
        amount: The parsed candidate amount.
        text: Text the match came from; falsy values are treated as empty.
        start: Start offset of the match; negative disables the check.
        end: End offset of the match; negative disables the check.

    Returns:
        ``True`` when the surrounding text makes the match part of a date.
    """
    if start < 0 or end < 0:
        return False
    if not is_probable_year_amount(amount):
        return False

    haystack = str(text or "")
    trailing = haystack[end:end + 10]
    if re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", trailing):
        return True

    leading = haystack[max(0, start - 8):start]
    return re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", leading) is not None
|