from __future__ import annotations import json import re import shutil import uuid from collections import defaultdict from datetime import UTC, date, datetime, timedelta from decimal import Decimal, InvalidOperation from pathlib import Path from types import SimpleNamespace from typing import Any from sqlalchemy import func, or_, select from sqlalchemy import inspect as sqlalchemy_inspect from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session, selectinload from app.api.deps import CurrentUserContext from app.core.agent_enums import AgentAssetDomain, AgentAssetStatus, AgentAssetType from app.models.agent_asset import AgentAsset from app.models.employee import Employee from app.models.financial_record import ExpenseClaim, ExpenseClaimItem from app.schemas.ontology import OntologyEntity, OntologyParseResult from app.schemas.reimbursement import ( ExpenseClaimItemCreate, ExpenseClaimItemUpdate, ExpenseClaimUpdate, TravelReimbursementCalculatorRequest, ) from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY from app.services.agent_foundation import AgentFoundationService from app.services.audit import AuditLogService from app.services.document_intelligence import build_document_insight from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage from app.services.expense_claim_constants import ( EXPENSE_TYPE_LABELS, MAX_DRAFT_CLAIMS_PER_USER, EDITABLE_CLAIM_STATUSES, SYSTEM_GENERATED_ITEM_TYPES, TRAVEL_DETAIL_ITEM_TYPES, TRAVEL_ALLOWANCE_TRIGGER_ITEM_TYPES, DOCUMENT_TYPE_ITEM_TYPE_MAP, DOCUMENT_TYPE_SCENE_MAP, DOCUMENT_FACT_ITEM_TYPES, ROUTE_DESCRIPTION_ITEM_TYPES, DOCUMENT_TRIP_DATE_LABELS, DOCUMENT_TRIP_DATE_REQUIREMENT_LABELS, DOCUMENT_TRIP_DATE_KEYS, DOCUMENT_GENERIC_DATE_KEYS, DOCUMENT_INVOICE_DATE_KEYS, DOCUMENT_TRIP_DATE_LABEL_TOKENS, DOCUMENT_GENERIC_DATE_LABEL_TOKENS, DOCUMENT_INVOICE_DATE_LABEL_TOKENS, DOCUMENT_ROUTE_FORMAT_PATTERN, DOCUMENT_ROUTE_TEXT_PATTERN, DOCUMENT_ROUTE_ORIGIN_LABELS, DOCUMENT_ROUTE_DESTINATION_LABELS, GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES, LOCATION_REQUIRED_EXPENSE_TYPES, EXPENSE_SCENE_KEYWORDS, EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES, DOCUMENT_SCENE_LABELS, DOCUMENT_ASSOCIATION_REVIEW_ACTIONS, PERSISTENT_EXPENSE_REVIEW_ACTIONS, RETURN_REASON_OPTIONS, MAX_CLAIM_NO_RETRY_ATTEMPTS, DOCUMENT_DATE_PATTERN, SYSTEM_GENERATED_REASON_PREFIXES, LEADING_REASON_TIME_PATTERNS, AI_REVIEW_LOOKBACK_DAYS, AI_REVIEW_REPEAT_RISK_WARNING_COUNT, AI_REVIEW_REPEAT_RISK_BLOCK_COUNT, TRAVEL_REVIEW_RELEVANT_EXPENSE_TYPES, TRAVEL_REVIEW_LONG_DISTANCE_DOCUMENT_TYPES, TRAVEL_POLICY_CITY_TIERS, TRAVEL_POLICY_CITY_MATCH_ORDER, TRAVEL_POLICY_BAND_LABELS, TRAVEL_POLICY_HOTEL_LIMITS, TRAVEL_POLICY_ALLOWED_TRANSPORT_LEVELS, TRAVEL_POLICY_ROUTE_EXCEPTION_KEYWORDS, TRAVEL_POLICY_STANDARD_EXCEPTION_KEYWORDS, TRAVEL_POLICY_FLIGHT_CLASS_PATTERNS, TRAVEL_POLICY_TRAIN_CLASS_PATTERNS, TRAVEL_POLICY_HOTEL_NIGHT_PATTERN, ) from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin from app.services.expense_amounts import ( extract_amount_candidates, format_decimal_amount, is_amount_match_date_fragment, is_date_like_amount_candidate, is_probable_year_amount, parse_document_amount_value, parse_plain_document_amount_value, resolve_document_field_amount, resolve_document_item_amount, resolve_document_text_amount, ) from app.services.expense_rule_runtime import ( DEFAULT_SCENE_RULE_ASSET_CODE, ExpenseRuleRuntimeService, RuntimeTravelPolicy, build_default_expense_rule_catalog, resolve_document_type_label, ) from app.services.ocr import OcrService class ExpenseClaimDocumentParsingMixin: def _resolve_document_route_value(self, document: dict[str, Any]) -> str: route = self._resolve_document_fact_field( document, keys={"route", "trip_route"}, labels={"行程", "路线"}, ) if route: return route origin = self._resolve_document_fact_field( document, keys={ "origin", "from", "from_city", "departure", "departure_city", "start", "start_location", "start_address", "pickup_location", "pickup_address", "boarding_station", }, labels=DOCUMENT_ROUTE_ORIGIN_LABELS, ) destination = self._resolve_document_fact_field( document, keys={ "destination", "to", "to_city", "arrival", "arrival_city", "end", "end_location", "end_address", "dropoff_location", "dropoff_address", "alighting_station", }, labels=DOCUMENT_ROUTE_DESTINATION_LABELS, ) if origin and destination: return f"{origin}-{destination}" text = " ".join( [ str(document.get("summary") or "").strip(), str(document.get("text") or "").strip(), ] ).strip() text_route = self._extract_document_route_from_text(text) if text_route: return text_route text_origin = self._extract_document_labeled_text_value(text, DOCUMENT_ROUTE_ORIGIN_LABELS) text_destination = self._extract_document_labeled_text_value(text, DOCUMENT_ROUTE_DESTINATION_LABELS) if text_origin and text_destination: return f"{text_origin}-{text_destination}" return "" @staticmethod def _resolve_document_fact_field( document: dict[str, Any], *, keys: set[str], labels: set[str], ) -> str: raw_fields = document.get("document_fields") if not isinstance(raw_fields, list): raw_fields = document.get("fields") if not isinstance(raw_fields, list): return "" normalized_keys = {str(key or "").strip().lower().replace("_", "") for key in keys} for field in raw_fields: if not isinstance(field, dict): continue field_key = str(field.get("key") or "").strip().lower().replace("_", "") label = str(field.get("label") or "").replace(" ", "") value = str(field.get("value") or "").strip() if not value: continue if field_key in normalized_keys or any(token in label for token in labels): return value return "" @staticmethod def _format_document_route(route: str) -> str: normalized = ( str(route or "") .strip() .replace("->", "-") .replace("→", "-") .replace("—", "-") .replace("–", "-") .replace("至", "-") .replace("到", "-") ) if "-" not in normalized: return str(route or "").strip() origin, destination = [part.strip() for part in normalized.split("-", 1)] origin = origin.removeprefix("从").strip() destination = destination.removeprefix("至").removeprefix("到").strip() if not origin or not destination or origin == destination: return str(route or "").strip() return f"{origin}-{destination}" @staticmethod def _extract_document_route_from_text(text: str) -> str: for match in DOCUMENT_ROUTE_TEXT_PATTERN.finditer(str(text or "")): origin = str(match.group(1) or "").strip() destination = str(match.group(2) or "").strip() if not origin or not destination or origin == destination: continue if origin.isdigit() and destination.isdigit(): continue if DOCUMENT_DATE_PATTERN.search(f"{origin}-{destination}"): continue return f"{origin}-{destination}" return "" @staticmethod def _extract_document_labeled_text_value(text: str, labels: set[str]) -> str: for label in sorted(labels, key=len, reverse=True): pattern = re.compile( rf"{re.escape(label)}[::\s]*" r"([A-Za-z0-9\u4e00-\u9fa5()()·\-路街道号弄区县市省园桥站机场中心]{2,50})" ) match = pattern.search(str(text or "")) if match: return str(match.group(1) or "").strip() return "" def _resolve_document_stay_range(self, document: dict[str, Any]) -> str: check_in = self._resolve_document_fact_field( document, keys={"check_in", "checkin", "arrival_date", "start_date"}, labels={"入住", "入住日期", "到店", "开始日期"}, ) check_out = self._resolve_document_fact_field( document, keys={"check_out", "checkout", "departure_date", "end_date"}, labels={"离店", "退房", "离店日期", "结束日期"}, ) if check_in and check_out: return f"{check_in}至{check_out}" nights = self._resolve_document_fact_field( document, keys={"nights", "night_count", "room_nights"}, labels={"间夜", "晚数", "入住天数"}, ) if nights: return f"{nights}晚" return "" def _resolve_document_item_amount(self, document: dict[str, Any]) -> Decimal | None: return resolve_document_item_amount(document) def _resolve_document_field_amount(self, document: dict[str, Any]) -> Decimal | None: return resolve_document_field_amount(document) def _resolve_document_text_amount(self, text: str) -> Decimal | None: return resolve_document_text_amount(text) def _parse_document_amount_value(self, value: str) -> Decimal | None: return parse_document_amount_value(value) @staticmethod def _parse_plain_document_amount_value(value: str) -> Decimal | None: return parse_plain_document_amount_value(value) @staticmethod def _is_probable_year_amount(amount: Decimal | None) -> bool: return is_probable_year_amount(amount) @classmethod def _is_date_like_amount_candidate(cls, amount: Decimal | None, text: str) -> bool: return is_date_like_amount_candidate(amount, text) @staticmethod def _format_decimal_amount(amount: Decimal | None) -> str: return format_decimal_amount(amount) def _resolve_document_item_date(self, document: dict[str, Any], *, fallback: date) -> date: return self._resolve_document_item_date_candidate(document) or fallback def _resolve_document_item_date_candidate(self, document: dict[str, Any]) -> date | None: document_type = str(document.get("document_type") or "").strip().lower() if document_type in DOCUMENT_TRIP_DATE_LABELS: parsed = self._resolve_document_date_from_fields( document, keys=DOCUMENT_TRIP_DATE_KEYS, labels=DOCUMENT_TRIP_DATE_LABEL_TOKENS, ) if parsed is not None: return parsed parsed = self._resolve_document_date_from_fields( document, keys=DOCUMENT_GENERIC_DATE_KEYS, labels=DOCUMENT_GENERIC_DATE_LABEL_TOKENS, excluded_labels=DOCUMENT_INVOICE_DATE_LABEL_TOKENS, ) if parsed is not None: return parsed parsed = self._parse_document_date( " ".join( [ str(document.get("summary") or "").strip(), str(document.get("text") or "").strip(), ] ).strip() ) if parsed is not None: return parsed return None for field in list(document.get("document_fields") or []): if not isinstance(field, dict): continue key = str(field.get("key") or "").strip().lower().replace("_", "") label = str(field.get("label") or "").replace(" ", "") value = str(field.get("value") or "").strip() if not value: continue if key in {"date", "time", "issuedat", "issuedate", "invoicedate"} or any( token in label for token in ("日期", "时间", "开票日期", "发生时间") ): parsed = self._parse_document_date(value) if parsed is not None: return parsed parsed = self._parse_document_date( " ".join( [ str(document.get("summary") or "").strip(), str(document.get("text") or "").strip(), ] ).strip() ) return parsed def _resolve_document_date_from_fields( self, document: dict[str, Any], *, keys: set[str], labels: tuple[str, ...], excluded_labels: tuple[str, ...] = (), ) -> date | None: for field in list(document.get("document_fields") or []): if not isinstance(field, dict): continue key = str(field.get("key") or "").strip().lower().replace("_", "") label = str(field.get("label") or "").replace(" ", "") if excluded_labels and any(token in label for token in excluded_labels): continue if key not in keys and not any(token in label for token in labels): continue parsed = self._parse_document_date(str(field.get("value") or "")) if parsed is not None: return parsed return None @staticmethod def _parse_document_date(value: str) -> date | None: match = DOCUMENT_DATE_PATTERN.search(str(value or "")) if not match: return None raw_value = str(match.group(1) or "").strip() normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "") normalized = normalized.replace("/", "-").replace(".", "-") parts = [part for part in normalized.split("-") if part] if len(parts) != 3: return None try: return date(int(parts[0]), int(parts[1]), int(parts[2])) except ValueError: return None