Files
X-Financial/server/src/app/services/expense_amounts.py

207 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from decimal import Decimal, InvalidOperation
from typing import Any
# Compiled patterns that parse_document_amount_value tries in order against a
# single value string. In every pattern, group 1 is the numeric part: one or
# more digits, optionally followed by "." or "," and one or two digits.
#   1. an amount label (价税合计, 合计金额, 实付金额, 票价, 房费, ...), then any run
#      of filler characters (colon, whitespace, currency sign, 人民币, 为, 是),
#      then the number
#   2. a currency sign followed by the number
#   3. the number followed by 元
# NOTE(review): the character classes list two yen-like signs that render
# identically here; presumably one is the fullwidth form - confirm against the
# repository copy before editing these literals.
DOCUMENT_AMOUNT_PATTERNS = (
re.compile(
r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
r"[:\s¥¥人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
),
re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)
# Structured-field keys that identify an amount field.
# resolve_document_field_amount strips, lowercases and removes underscores from
# a field's key before looking it up here, so "total_amount" and "Total_Amount"
# both match "totalamount".
DOCUMENT_AMOUNT_FIELD_KEYS = {
"amount",
"totalamount",
"paymentamount",
"paidamount",
"actualamount",
}
# Substrings that mark a structured field's label as an amount label.
# resolve_document_field_amount removes spaces from the label and accepts the
# field when any of these tokens occurs anywhere inside it.
DOCUMENT_AMOUNT_LABEL_TOKENS = (
"金额",
"价税合计",
"合计",
"总额",
"总计",
"票价",
"支付金额",
"实付金额",
"实收金额",
)
# Regex sources (plain strings, not compiled) that extract_amount_candidates
# scans over free text with re.finditer and re.IGNORECASE. Group 1 is the
# number: 1-6 digits, optionally followed by "." or "," and 1-2 digits.
#   1. an amount label, optional filler characters, then the number
#   2. a currency sign followed by the number
#   3. the number followed by 元
# NOTE(review): as in DOCUMENT_AMOUNT_PATTERNS, the two yen-like signs render
# identically here; presumably one is the fullwidth form - confirm.
DOCUMENT_TEXT_AMOUNT_PATTERNS = (
r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)[:\s¥¥人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
r"[¥¥]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
)
def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
    """Resolve a document's amount, preferring structured fields over text.

    The ``summary`` and ``text`` entries of *document* are stripped and joined
    with a single space to form the search text.

    Returns:
        The amount found in the structured fields, unless that value looks
        like the year of a date present in the search text; in that case, or
        when no field amount exists, the amount derived from the search text
        (which may be ``None``).
    """
    summary_part = str(document.get("summary") or "").strip()
    body_part = str(document.get("text") or "").strip()
    combined_text = f"{summary_part} {body_part}".strip()

    from_fields = resolve_document_field_amount(document)
    from_text = resolve_document_text_amount(combined_text)

    if from_fields is None:
        return from_text
    # A field value such as 2024 may really be the year of a date in the text.
    if is_date_like_amount_candidate(from_fields, combined_text):
        return from_text
    return from_fields
def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the first amount parsed from an amount-like structured field.

    Entries of ``document["document_fields"]`` are visited in order. Entries
    that are not dicts are skipped. A field counts as an amount field when its
    normalized key (stripped, lowercased, underscores removed) is in
    ``DOCUMENT_AMOUNT_FIELD_KEYS`` or its label (spaces removed) contains one
    of ``DOCUMENT_AMOUNT_LABEL_TOKENS``.

    Returns:
        The first successfully parsed amount, or ``None`` when no amount field
        yields one.
    """
    entries = list(document.get("document_fields") or [])
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")

        if normalized_key not in DOCUMENT_AMOUNT_FIELD_KEYS and not any(
            token in compact_label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
        ):
            continue

        text_value = str(entry.get("value") or "")
        # Try the labelled / currency-marked forms first, then a bare number.
        parsed = parse_document_amount_value(text_value)
        if not parsed:
            parsed = parse_plain_document_amount_value(text_value)
        if parsed is not None:
            return parsed
    return None
def resolve_document_text_amount(text: str) -> Decimal | None:
    """Return the largest amount candidate in *text* that is not date-like.

    Candidates come from ``extract_amount_candidates``; any candidate that
    ``is_date_like_amount_candidate`` flags against the same text is ignored.

    Returns:
        The maximum remaining candidate, or ``None`` when none remain.
    """
    usable = (
        amount
        for amount in extract_amount_candidates(text)
        if not is_date_like_amount_candidate(amount, text)
    )
    return max(usable, default=None)
def parse_document_amount_value(value: str) -> Decimal | None:
    """Parse a labelled or currency-marked amount out of a value string.

    The patterns in ``DOCUMENT_AMOUNT_PATTERNS`` are tried in order; the first
    one whose captured number parses to a positive amount wins.

    Thousands separators are removed before matching. Without this step
    "¥1,234.56" would be captured as "1,23" and returned as 1.23, because the
    patterns accept "," as a decimal mark followed by at most two digits.

    Args:
        value: Raw value text; ``None`` or other falsy input is treated as empty.

    Returns:
        The amount quantized to two decimal places, or ``None`` when the value
        is empty, matches no pattern, or only yields non-positive numbers.
    """
    raw_value = str(value or "").strip()
    if not raw_value:
        return None

    # Ungroup well-formed thousands groups only: 1-3 leading digits followed by
    # one or more ",ddd" groups. A comma followed by one or two digits is left
    # untouched so "12,50" still reads as 12.50.
    raw_value = re.sub(
        r"(?<![0-9.,])[0-9]{1,3}(?:,[0-9]{3})+(?![0-9])",
        lambda grouped: grouped.group(0).replace(",", ""),
        raw_value,
    )

    for pattern in DOCUMENT_AMOUNT_PATTERNS:
        match = pattern.search(raw_value)
        if match is None:
            continue
        numeric = match.group(1).replace(",", ".").strip()
        try:
            amount = Decimal(numeric).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            continue
        if amount > Decimal("0.00"):
            return amount
    return None
def parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Parse a value that consists of nothing but a number.

    Accepted forms are 1-6 integer digits with an optional "." or "," decimal
    mark and 1-2 fraction digits ("12", "12.5", "12,50"). Well-formed
    thousands grouping ("1,234" or "1,234.56") is also accepted; the grouping
    commas are removed first and the 6-integer-digit limit still applies.

    Args:
        value: Raw value text; ``None`` or other falsy input is treated as empty.

    Returns:
        The amount quantized to two decimal places, or ``None`` when the value
        is not a bare number in an accepted form or is not positive.
    """
    raw_value = str(value or "").strip()

    # Three digits after a comma cannot be a fraction (at most two fraction
    # digits are allowed), so such commas are thousands separators.
    grouped = re.fullmatch(r"([0-9]{1,3}(?:,[0-9]{3})+)(\.[0-9]{1,2})?", raw_value)
    if grouped is not None:
        raw_value = grouped.group(1).replace(",", "") + (grouped.group(2) or "")

    if not re.fullmatch(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?", raw_value):
        return None
    try:
        amount = Decimal(raw_value.replace(",", ".")).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return None
    return amount if amount > Decimal("0.00") else None
def is_probable_year_amount(amount: Decimal | None) -> bool:
    """Tell whether *amount* is a whole number that could be a calendar year.

    The value is quantized to two decimal places first; it qualifies when the
    result has no fractional part and lies in the inclusive range 1900-2099.

    Returns:
        ``False`` for ``None``, for values that cannot be converted or
        quantized, and for anything outside the range described above.
    """
    if amount is None:
        return False
    try:
        cents = Decimal(amount).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return False
    if cents != cents.to_integral_value():
        return False
    return Decimal("1900") <= cents <= Decimal("2099")
def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
    """Tell whether *amount* is really the year of a date written in *text*.

    The amount must first pass ``is_probable_year_amount``. It is date-like
    when *text* contains that number, not preceded by another digit, followed
    by "年", "-", "/" or "." and then a month number from 1 to 12 (with or
    without a leading zero).

    The month is restricted to 1-12 on purpose. Accepting any one or two
    digits made a whole amount printed with decimals, such as "2024.00" or
    "2000.00", match as year + "." + "00" and be discarded as a date.

    Args:
        amount: Candidate amount; ``None`` is never date-like.
        text: Text to search; ``None`` or other falsy input is treated as empty.

    Returns:
        ``True`` when the amount appears in *text* as the year of a date.
    """
    if not is_probable_year_amount(amount):
        return False
    year = str(int(Decimal(amount or 0)))
    pattern = re.compile(
        rf"(?<!\d){re.escape(year)}\s*(?:年|[-/.])\s*(?:0?[1-9]|1[0-2])"
    )
    return bool(pattern.search(str(text or "")))
def format_decimal_amount(amount: Decimal | None) -> str:
    """Render *amount* as a fixed-point string with exactly two decimals.

    Returns:
        An empty string for ``None``; otherwise the value quantized to two
        decimal places and formatted without exponent notation.
    """
    if amount is not None:
        return f"{Decimal(amount).quantize(Decimal('0.01')):f}"
    return ""
def extract_amount_candidates(text: str) -> list[Decimal]:
    """Collect distinct amount candidates from free text, in discovery order.

    Every pattern in ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` is scanned over the
    text. Only when none of them produces a candidate does a fallback scan
    pick up bare decimals such as "12.50". Matches that
    ``is_amount_match_date_fragment`` recognises as part of a date are
    skipped, and duplicates are dropped.

    Thousands separators are removed before scanning. Without this step
    "¥1,234.56" would be captured as "1,23" and reported as 1.23, because the
    patterns accept "," as a decimal mark followed by at most two digits.

    Args:
        text: Text to scan.

    Returns:
        Candidates quantized to two decimal places; empty when none are found.
    """
    # Ungroup well-formed thousands groups only (1-3 digits plus ",ddd"
    # groups); "12,50" is left alone and still reads as 12.50. All match
    # offsets below refer to this normalized text.
    normalized_text = re.sub(
        r"(?<![0-9.,])[0-9]{1,3}(?:,[0-9]{3})+(?![0-9])",
        lambda grouped: grouped.group(0).replace(",", ""),
        text,
    )

    values: list[Decimal] = []
    seen: set[Decimal] = set()

    def append_candidate(
        raw: str,
        *,
        source_text: str = "",
        start: int = -1,
        end: int = -1,
    ) -> None:
        """Parse *raw* and record it unless it is invalid, a date part, or a duplicate."""
        compact = str(raw or "").replace(",", ".").strip()
        if not compact:
            return
        try:
            candidate = Decimal(compact).quantize(Decimal("0.01"))
        except (InvalidOperation, ValueError):
            return
        if is_amount_match_date_fragment(candidate, source_text, start, end):
            return
        if candidate in seen:
            return
        seen.add(candidate)
        values.append(candidate)

    for pattern in DOCUMENT_TEXT_AMOUNT_PATTERNS:
        for match in re.finditer(pattern, normalized_text, flags=re.IGNORECASE):
            append_candidate(
                match.group(1),
                source_text=normalized_text,
                start=match.start(1),
                end=match.end(1),
            )
    if values:
        return values

    # Fallback: no labelled or currency-marked amount, so accept bare decimals.
    for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", normalized_text):
        append_candidate(
            match.group(1),
            source_text=normalized_text,
            start=match.start(1),
            end=match.end(1),
        )
    return values
def is_amount_match_date_fragment(
    amount: Decimal,
    text: str,
    start: int,
    end: int,
) -> bool:
    """Tell whether the match at ``text[start:end]`` is the year part of a date.

    Only amounts accepted by ``is_probable_year_amount`` are considered. The
    match is a date fragment when the text right after it continues with "年",
    "-", "/" or "." and one or two digits, or when the up-to-eight characters
    before it end with one or two digits and one of those separators.

    Returns:
        ``False`` when either offset is negative or the amount is not
        year-like; otherwise whether the surrounding text looks like a date.
    """
    if start < 0 or end < 0:
        return False
    if not is_probable_year_amount(amount):
        return False

    haystack = str(text or "")
    leading = haystack[max(0, start - 8):start]
    trailing = haystack[end:end + 10]

    followed_by_date_part = re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", trailing) is not None
    preceded_by_date_part = re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", leading) is not None
    return followed_by_date_part or preceded_by_date_part