# X-Financial/server/src/app/services/expense_amounts.py

from __future__ import annotations

import re
from decimal import Decimal, InvalidOperation
from typing import Any

# Compiled patterns that parse_document_amount_value tries, in order, against a
# single field value.  Each one captures the numeric part in group 1: a run of
# ASCII digits optionally followed by "." or "," and one or two more digits.
# NOTE(review): because the optional fraction accepts a comma, a value written
# with a thousands separator such as "1,234.56" is captured as "1,23" --
# confirm that inputs never use thousands separators.
DOCUMENT_AMOUNT_PATTERNS = (
    # 1) A Chinese amount label (e.g. 价税合计 "total incl. tax", 合计金额
    #    "total amount", 票价 "ticket price", 房费 "room charge"), then any mix
    #    of colons, whitespace, currency markers and filler characters, then
    #    the number.
    re.compile(
        r"(?:价税合计|合计金额|费用合计|总费用|费用总计|订单(?:总)?金额|支付(?:金额)?|实付(?:金额)?|实收(?:金额)?|总(?:额|计|价)|票价|金额|车费|消费金额|房费|住宿费)"
        r"[：:\s￥¥人民币为是]*([0-9]+(?:[.,][0-9]{1,2})?)"
    ),
    # 2) A yen/yuan sign (full-width or half-width) directly before the number.
    re.compile(r"[￥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    # 3) A number directly followed by the unit 元 (yuan).
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)

# Field keys treated as amount fields by resolve_document_field_amount.  The
# lookup is done on the key after it has been stripped, lower-cased and had its
# underscores removed, so every entry here is lower-case without underscores.
DOCUMENT_AMOUNT_FIELD_KEYS = {
    "amount",
    "totalamount",
    "paymentamount",
    "paidamount",
    "actualamount",
}
# Substrings that mark a field label as an amount label.  A field qualifies
# when any token occurs anywhere in its label (spaces removed), so short tokens
# such as 金额 ("amount") and 合计 ("total") also cover the longer labels that
# contain them.
DOCUMENT_AMOUNT_LABEL_TOKENS = (
    "金额",
    "价税合计",
    "合计",
    "总额",
    "总计",
    "票价",
    "支付金额",
    "实付金额",
    "实收金额",
)
# Uncompiled pattern strings that extract_amount_candidates scans free text
# with via re.finditer.  They mirror the three shapes above (label + number,
# currency sign + number, number + 元) but cap the integer part at six digits.
DOCUMENT_TEXT_AMOUNT_PATTERNS = (
    r"(?:金额|价税合计|合计|小写|实收金额|支付金额|订单金额|总额|总计|总费用|费用总计|票价|房费|住宿费|餐费)[：:\s￥¥人民币为是]*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"[￥¥]\s*([0-9]{1,6}(?:[.,][0-9]{1,2})?)",
    r"([0-9]{1,6}(?:[.,][0-9]{1,2})?)\s*元",
)


def resolve_document_item_amount(document: dict[str, Any]) -> Decimal | None:
    """Pick the best amount for a document from its fields and its free text.

    The structured field amount wins unless it looks like the year of a date
    that appears in the document text; in that case, or when no field amount
    exists, the amount extracted from the text is returned instead.

    Args:
        document: Mapping that may carry ``summary``, ``text`` and
            ``document_fields`` entries.

    Returns:
        The resolved amount, or ``None`` when nothing usable was found.
    """
    summary_part = str(document.get("summary") or "").strip()
    body_part = str(document.get("text") or "").strip()
    combined_text = f"{summary_part} {body_part}".strip()

    from_fields = resolve_document_field_amount(document)
    from_text = resolve_document_text_amount(combined_text)

    # Only trust the structured value when it is present and is not just the
    # year component of a date mentioned in the text.
    trust_fields = from_fields is not None and not is_date_like_amount_candidate(
        from_fields, combined_text
    )
    return from_fields if trust_fields else from_text


def resolve_document_field_amount(document: dict[str, Any]) -> Decimal | None:
    """Return the first parseable amount among the document's amount fields.

    A field counts as an amount field when its normalized key (stripped,
    lower-cased, underscores removed) is in ``DOCUMENT_AMOUNT_FIELD_KEYS`` or
    when its label (spaces removed) contains any of
    ``DOCUMENT_AMOUNT_LABEL_TOKENS``.

    Args:
        document: Mapping whose ``document_fields`` entry, if present, is an
            iterable of field dicts with ``key``, ``label`` and ``value``.

    Returns:
        The first positive amount parsed from a matching field, else ``None``.
    """
    for entry in document.get("document_fields") or []:
        # Anything that is not a field mapping is ignored.
        if not isinstance(entry, dict):
            continue

        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        if normalized_key not in DOCUMENT_AMOUNT_FIELD_KEYS and not any(
            token in compact_label for token in DOCUMENT_AMOUNT_LABEL_TOKENS
        ):
            continue

        field_text = str(entry.get("value") or "")
        # Try the labelled/currency-marked forms first, then a bare number.
        parsed = parse_document_amount_value(field_text)
        if not parsed:
            parsed = parse_plain_document_amount_value(field_text)
        if parsed is not None:
            return parsed

    return None


def resolve_document_text_amount(text: str) -> Decimal | None:
    """Return the largest amount found in *text* that is not date-like.

    Args:
        text: Free text to scan for amounts.

    Returns:
        The maximum surviving candidate, or ``None`` when every candidate was
        rejected or none was found.
    """
    largest: Decimal | None = None
    for amount in extract_amount_candidates(text):
        # Skip values that are really the year of a date in the text.
        if is_date_like_amount_candidate(amount, text):
            continue
        if largest is None or amount > largest:
            largest = amount
    return largest


def parse_document_amount_value(value: str) -> Decimal | None:
    """Parse a labelled or currency-marked amount out of a field value.

    The patterns in ``DOCUMENT_AMOUNT_PATTERNS`` are tried in order; the first
    one whose captured number converts to a strictly positive amount wins.

    Args:
        value: Raw field value; falsy values are treated as empty.

    Returns:
        The amount rounded to two decimal places, or ``None`` if no pattern
        yields a positive number.
    """
    cleaned = str(value or "").strip()
    if not cleaned:
        return None

    zero = Decimal("0.00")
    cents = Decimal("0.01")
    # Lazily search each pattern and keep only those that actually hit.
    hits = (
        hit
        for hit in (matcher.search(cleaned) for matcher in DOCUMENT_AMOUNT_PATTERNS)
        if hit
    )
    for hit in hits:
        # A comma is read as a decimal separator.
        digits = str(hit.group(1) or "").replace(",", ".").strip()
        try:
            parsed = Decimal(digits).quantize(cents)
        except (InvalidOperation, ValueError):
            continue
        if parsed > zero:
            return parsed
    return None


def parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Parse a value that consists of nothing but a bare number.

    The stripped value must be one to six ASCII digits, optionally followed by
    "." or "," and one or two more digits; a comma is read as a decimal
    separator.

    Args:
        value: Raw field value; falsy values are treated as empty.

    Returns:
        The amount rounded to two decimal places when it is strictly positive,
        otherwise ``None``.
    """
    candidate = str(value or "").strip()
    if re.fullmatch(r"[0-9]{1,6}(?:[.,][0-9]{1,2})?", candidate) is None:
        return None

    try:
        parsed = Decimal(candidate.replace(",", ".")).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return None

    if parsed > Decimal("0.00"):
        return parsed
    return None


def is_probable_year_amount(amount: Decimal | None) -> bool:
    """Tell whether *amount* could be a calendar year rather than money.

    Args:
        amount: Candidate amount; ``None`` is never a year.

    Returns:
        ``True`` when the value, rounded to two decimal places, is a whole
        number between 1900 and 2099 inclusive.
    """
    if amount is None:
        return False
    try:
        normalized = Decimal(amount).quantize(Decimal("0.01"))
    except (InvalidOperation, ValueError):
        return False
    return (
        normalized == normalized.to_integral_value()
        and Decimal("1900") <= normalized <= Decimal("2099")
    )


def is_date_like_amount_candidate(amount: Decimal | None, text: str) -> bool:
    """Tell whether *amount* is just the year of a date written in *text*.

    The amount must look like a year (see ``is_probable_year_amount``) and the
    text must contain that year followed by ``年``, ``-``, ``/`` or ``.`` and a
    valid month number (1-12, optionally zero-padded).

    Args:
        amount: Candidate amount to test.
        text: Text the amount was found in or is checked against; falsy values
            are treated as empty.

    Returns:
        ``True`` when the amount should be discarded as part of a date.
    """
    if not is_probable_year_amount(amount):
        return False

    # Derive the year from the same rounded value the year test used, so the
    # two can never disagree (plain truncation would turn 2023.996 into 2023).
    year = str(int(Decimal(amount).quantize(Decimal("0.01"))))

    # Require a real month after the separator and no further digit after it.
    # Accepting any one or two digits would read the decimals of a genuine
    # amount such as "2024.00" or "2024.50" as a month and discard the amount.
    pattern = re.compile(
        rf"(?<!\d){re.escape(year)}\s*(?:年|[-/.])\s*(?:1[0-2]|0?[1-9])(?!\d)"
    )
    return bool(pattern.search(str(text or "")))


def format_decimal_amount(amount: Decimal | None) -> str:
    """Render *amount* as fixed-point text with exactly two decimals.

    Args:
        amount: Value to format, or ``None``.

    Returns:
        An empty string for ``None``; otherwise the value rounded to two
        decimal places in plain (non-scientific) notation.
    """
    if amount is not None:
        rounded = Decimal(amount).quantize(Decimal("0.01"))
        return f"{rounded:f}"
    return ""


def extract_amount_candidates(text: str) -> list[Decimal]:
    """Collect the distinct amounts that appear in *text*, in first-seen order.

    The labelled, currency-sign and ``元``-suffixed patterns in
    ``DOCUMENT_TEXT_AMOUNT_PATTERNS`` are scanned first.  Only when they yield
    nothing does the function fall back to any bare number that has a decimal
    point.  Matches that sit inside a date are skipped.

    Args:
        text: Free text to scan.

    Returns:
        Amounts rounded to two decimal places, without duplicates.
    """
    # A dict keeps insertion order and de-duplicates by value in one structure.
    found: dict[Decimal, None] = {}
    cents = Decimal("0.01")

    def absorb(matches) -> None:
        """Convert each regex match to an amount and record it if acceptable."""
        for hit in matches:
            # A comma is read as a decimal separator.
            token = str(hit.group(1) or "").replace(",", ".").strip()
            if not token:
                continue
            try:
                parsed = Decimal(token).quantize(cents)
            except (InvalidOperation, ValueError):
                continue
            # Drop year-like numbers that are adjacent to other date parts.
            if is_amount_match_date_fragment(parsed, text, hit.start(1), hit.end(1)):
                continue
            found.setdefault(parsed, None)

    for expression in DOCUMENT_TEXT_AMOUNT_PATTERNS:
        absorb(re.finditer(expression, text, flags=re.IGNORECASE))

    if found:
        return list(found)

    # Fallback: bare decimals with up to six integer digits.
    absorb(re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", text))
    return list(found)


def is_amount_match_date_fragment(
    amount: Decimal,
    text: str,
    start: int,
    end: int,
) -> bool:
    """Tell whether a year-like match at ``text[start:end]`` is part of a date.

    Args:
        amount: The parsed value of the match.
        text: Text the match was found in; falsy values are treated as empty.
        start: Start offset of the match; a negative value disables the check.
        end: End offset of the match; a negative value disables the check.

    Returns:
        ``True`` when the amount looks like a year and is either directly
        followed by a date separator plus one or two digits, or directly
        preceded by one or two digits plus a date separator.
    """
    if start < 0 or end < 0:
        return False
    if not is_probable_year_amount(amount):
        return False

    haystack = str(text or "")
    # Only a small window on each side of the match is inspected.
    trailing = haystack[end:end + 10]
    leading = haystack[max(0, start - 8):start]

    followed_by_date_part = re.match(r"\s*(?:年|[-/.])\s*\d{1,2}", trailing) is not None
    if followed_by_date_part:
        return True
    preceded_by_date_part = re.search(r"\d{1,2}\s*(?:年|[-/.])\s*$", leading) is not None
    return preceded_by_date_part