397 lines
15 KiB
Python
397 lines
15 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
import shutil
|
||
import uuid
|
||
from collections import defaultdict
|
||
from datetime import UTC, date, datetime, timedelta
|
||
from decimal import Decimal, InvalidOperation
|
||
from pathlib import Path
|
||
from types import SimpleNamespace
|
||
from typing import Any
|
||
|
||
from sqlalchemy import func, or_, select
|
||
from sqlalchemy import inspect as sqlalchemy_inspect
|
||
from sqlalchemy.exc import IntegrityError
|
||
from sqlalchemy.orm import Session, selectinload
|
||
|
||
from app.api.deps import CurrentUserContext
|
||
from app.core.agent_enums import AgentAssetDomain, AgentAssetStatus, AgentAssetType
|
||
from app.models.agent_asset import AgentAsset
|
||
from app.models.employee import Employee
|
||
from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
|
||
from app.schemas.ontology import OntologyEntity, OntologyParseResult
|
||
from app.schemas.reimbursement import (
|
||
ExpenseClaimItemCreate,
|
||
ExpenseClaimItemUpdate,
|
||
ExpenseClaimUpdate,
|
||
TravelReimbursementCalculatorRequest,
|
||
)
|
||
from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
|
||
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
|
||
from app.services.agent_foundation import AgentFoundationService
|
||
from app.services.audit import AuditLogService
|
||
from app.services.document_intelligence import build_document_insight
|
||
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
|
||
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
|
||
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
|
||
from app.services.expense_claim_constants import (
|
||
EXPENSE_TYPE_LABELS,
|
||
MAX_DRAFT_CLAIMS_PER_USER,
|
||
EDITABLE_CLAIM_STATUSES,
|
||
SYSTEM_GENERATED_ITEM_TYPES,
|
||
TRAVEL_DETAIL_ITEM_TYPES,
|
||
TRAVEL_ALLOWANCE_TRIGGER_ITEM_TYPES,
|
||
DOCUMENT_TYPE_ITEM_TYPE_MAP,
|
||
DOCUMENT_TYPE_SCENE_MAP,
|
||
DOCUMENT_FACT_ITEM_TYPES,
|
||
ROUTE_DESCRIPTION_ITEM_TYPES,
|
||
DOCUMENT_TRIP_DATE_LABELS,
|
||
DOCUMENT_TRIP_DATE_REQUIREMENT_LABELS,
|
||
DOCUMENT_TRIP_DATE_KEYS,
|
||
DOCUMENT_GENERIC_DATE_KEYS,
|
||
DOCUMENT_INVOICE_DATE_KEYS,
|
||
DOCUMENT_TRIP_DATE_LABEL_TOKENS,
|
||
DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
|
||
DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
|
||
DOCUMENT_ROUTE_FORMAT_PATTERN,
|
||
DOCUMENT_ROUTE_TEXT_PATTERN,
|
||
DOCUMENT_ROUTE_ORIGIN_LABELS,
|
||
DOCUMENT_ROUTE_DESTINATION_LABELS,
|
||
GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES,
|
||
LOCATION_REQUIRED_EXPENSE_TYPES,
|
||
EXPENSE_SCENE_KEYWORDS,
|
||
EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES,
|
||
DOCUMENT_SCENE_LABELS,
|
||
DOCUMENT_ASSOCIATION_REVIEW_ACTIONS,
|
||
PERSISTENT_EXPENSE_REVIEW_ACTIONS,
|
||
RETURN_REASON_OPTIONS,
|
||
MAX_CLAIM_NO_RETRY_ATTEMPTS,
|
||
DOCUMENT_DATE_PATTERN,
|
||
SYSTEM_GENERATED_REASON_PREFIXES,
|
||
LEADING_REASON_TIME_PATTERNS,
|
||
AI_REVIEW_LOOKBACK_DAYS,
|
||
AI_REVIEW_REPEAT_RISK_WARNING_COUNT,
|
||
AI_REVIEW_REPEAT_RISK_BLOCK_COUNT,
|
||
TRAVEL_REVIEW_RELEVANT_EXPENSE_TYPES,
|
||
TRAVEL_REVIEW_LONG_DISTANCE_DOCUMENT_TYPES,
|
||
TRAVEL_POLICY_CITY_TIERS,
|
||
TRAVEL_POLICY_CITY_MATCH_ORDER,
|
||
TRAVEL_POLICY_BAND_LABELS,
|
||
TRAVEL_POLICY_HOTEL_LIMITS,
|
||
TRAVEL_POLICY_ALLOWED_TRANSPORT_LEVELS,
|
||
TRAVEL_POLICY_ROUTE_EXCEPTION_KEYWORDS,
|
||
TRAVEL_POLICY_STANDARD_EXCEPTION_KEYWORDS,
|
||
TRAVEL_POLICY_FLIGHT_CLASS_PATTERNS,
|
||
TRAVEL_POLICY_TRAIN_CLASS_PATTERNS,
|
||
TRAVEL_POLICY_HOTEL_NIGHT_PATTERN,
|
||
)
|
||
from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin
|
||
from app.services.expense_amounts import (
|
||
extract_amount_candidates,
|
||
format_decimal_amount,
|
||
is_amount_match_date_fragment,
|
||
is_date_like_amount_candidate,
|
||
is_probable_year_amount,
|
||
parse_document_amount_value,
|
||
parse_plain_document_amount_value,
|
||
resolve_document_field_amount,
|
||
resolve_document_item_amount,
|
||
resolve_document_text_amount,
|
||
)
|
||
from app.services.expense_rule_runtime import (
|
||
DEFAULT_SCENE_RULE_ASSET_CODE,
|
||
ExpenseRuleRuntimeService,
|
||
RuntimeTravelPolicy,
|
||
build_default_expense_rule_catalog,
|
||
resolve_document_type_label,
|
||
)
|
||
from app.services.ocr import OcrService
|
||
|
||
|
||
class ExpenseClaimDocumentParsingMixin:
|
||
def _resolve_document_route_value(self, document: dict[str, Any]) -> str:
|
||
route = self._resolve_document_fact_field(
|
||
document,
|
||
keys={"route", "trip_route"},
|
||
labels={"行程", "路线"},
|
||
)
|
||
if route:
|
||
return route
|
||
|
||
origin = self._resolve_document_fact_field(
|
||
document,
|
||
keys={
|
||
"origin",
|
||
"from",
|
||
"from_city",
|
||
"departure",
|
||
"departure_city",
|
||
"start",
|
||
"start_location",
|
||
"start_address",
|
||
"pickup_location",
|
||
"pickup_address",
|
||
"boarding_station",
|
||
},
|
||
labels=DOCUMENT_ROUTE_ORIGIN_LABELS,
|
||
)
|
||
destination = self._resolve_document_fact_field(
|
||
document,
|
||
keys={
|
||
"destination",
|
||
"to",
|
||
"to_city",
|
||
"arrival",
|
||
"arrival_city",
|
||
"end",
|
||
"end_location",
|
||
"end_address",
|
||
"dropoff_location",
|
||
"dropoff_address",
|
||
"alighting_station",
|
||
},
|
||
labels=DOCUMENT_ROUTE_DESTINATION_LABELS,
|
||
)
|
||
if origin and destination:
|
||
return f"{origin}-{destination}"
|
||
|
||
text = " ".join(
|
||
[
|
||
str(document.get("summary") or "").strip(),
|
||
str(document.get("text") or "").strip(),
|
||
]
|
||
).strip()
|
||
text_route = self._extract_document_route_from_text(text)
|
||
if text_route:
|
||
return text_route
|
||
|
||
text_origin = self._extract_document_labeled_text_value(text, DOCUMENT_ROUTE_ORIGIN_LABELS)
|
||
text_destination = self._extract_document_labeled_text_value(text, DOCUMENT_ROUTE_DESTINATION_LABELS)
|
||
if text_origin and text_destination:
|
||
return f"{text_origin}-{text_destination}"
|
||
return ""
|
||
|
||
@staticmethod
|
||
def _resolve_document_fact_field(
|
||
document: dict[str, Any],
|
||
*,
|
||
keys: set[str],
|
||
labels: set[str],
|
||
) -> str:
|
||
raw_fields = document.get("document_fields")
|
||
if not isinstance(raw_fields, list):
|
||
raw_fields = document.get("fields")
|
||
if not isinstance(raw_fields, list):
|
||
return ""
|
||
|
||
normalized_keys = {str(key or "").strip().lower().replace("_", "") for key in keys}
|
||
for field in raw_fields:
|
||
if not isinstance(field, dict):
|
||
continue
|
||
field_key = str(field.get("key") or "").strip().lower().replace("_", "")
|
||
label = str(field.get("label") or "").replace(" ", "")
|
||
value = str(field.get("value") or "").strip()
|
||
if not value:
|
||
continue
|
||
if field_key in normalized_keys or any(token in label for token in labels):
|
||
return value
|
||
return ""
|
||
|
||
@staticmethod
|
||
def _format_document_route(route: str) -> str:
|
||
normalized = (
|
||
str(route or "")
|
||
.strip()
|
||
.replace("->", "-")
|
||
.replace("→", "-")
|
||
.replace("—", "-")
|
||
.replace("–", "-")
|
||
.replace("至", "-")
|
||
.replace("到", "-")
|
||
)
|
||
if "-" not in normalized:
|
||
return str(route or "").strip()
|
||
origin, destination = [part.strip() for part in normalized.split("-", 1)]
|
||
origin = origin.removeprefix("从").strip()
|
||
destination = destination.removeprefix("至").removeprefix("到").strip()
|
||
if not origin or not destination or origin == destination:
|
||
return str(route or "").strip()
|
||
return f"{origin}-{destination}"
|
||
|
||
@staticmethod
def _extract_document_route_from_text(text: str) -> str:
    """Return the first usable ``origin-destination`` pair found in *text*.

    Every match of ``DOCUMENT_ROUTE_TEXT_PATTERN`` is inspected in order;
    groups 1 and 2 are taken as origin and destination. A match is skipped
    when either side is empty, when both sides are equal, when both sides
    are purely digits, or when the joined pair matches
    ``DOCUMENT_DATE_PATTERN``. Returns ``""`` when no match survives.
    """
    haystack = str(text or "")
    for hit in DOCUMENT_ROUTE_TEXT_PATTERN.finditer(haystack):
        start_point = str(hit.group(1) or "").strip()
        end_point = str(hit.group(2) or "").strip()
        if not (start_point and end_point) or start_point == end_point:
            continue
        if start_point.isdigit() and end_point.isdigit():
            continue
        candidate = f"{start_point}-{end_point}"
        if DOCUMENT_DATE_PATTERN.search(candidate):
            continue
        return candidate
    return ""
|
||
|
||
@staticmethod
|
||
def _extract_document_labeled_text_value(text: str, labels: set[str]) -> str:
|
||
for label in sorted(labels, key=len, reverse=True):
|
||
pattern = re.compile(
|
||
rf"{re.escape(label)}[::\s]*"
|
||
r"([A-Za-z0-9\u4e00-\u9fa5()()·\-路街道号弄区县市省园桥站机场中心]{2,50})"
|
||
)
|
||
match = pattern.search(str(text or ""))
|
||
if match:
|
||
return str(match.group(1) or "").strip()
|
||
return ""
|
||
|
||
def _resolve_document_stay_range(self, document: dict[str, Any]) -> str:
|
||
check_in = self._resolve_document_fact_field(
|
||
document,
|
||
keys={"check_in", "checkin", "arrival_date", "start_date"},
|
||
labels={"入住", "入住日期", "到店", "开始日期"},
|
||
)
|
||
check_out = self._resolve_document_fact_field(
|
||
document,
|
||
keys={"check_out", "checkout", "departure_date", "end_date"},
|
||
labels={"离店", "退房", "离店日期", "结束日期"},
|
||
)
|
||
if check_in and check_out:
|
||
return f"{check_in}至{check_out}"
|
||
nights = self._resolve_document_fact_field(
|
||
document,
|
||
keys={"nights", "night_count", "room_nights"},
|
||
labels={"间夜", "晚数", "入住天数"},
|
||
)
|
||
if nights:
|
||
return f"{nights}晚"
|
||
return ""
|
||
|
||
def _resolve_document_item_amount(self, document: dict[str, Any]) -> Decimal | None:
    """Delegate to the module-level ``resolve_document_item_amount``.

    Kept as a method so the amount lookup stays reachable through ``self``;
    the argument and the result are passed through unchanged.
    """
    return resolve_document_item_amount(document)
|
||
|
||
def _resolve_document_field_amount(self, document: dict[str, Any]) -> Decimal | None:
    """Delegate to the module-level ``resolve_document_field_amount``.

    The argument and the result are passed through unchanged.
    """
    return resolve_document_field_amount(document)
|
||
|
||
def _resolve_document_text_amount(self, text: str) -> Decimal | None:
    """Delegate to the module-level ``resolve_document_text_amount``.

    The argument and the result are passed through unchanged.
    """
    return resolve_document_text_amount(text)
|
||
|
||
def _parse_document_amount_value(self, value: str) -> Decimal | None:
    """Delegate to the module-level ``parse_document_amount_value``.

    The argument and the result are passed through unchanged.
    """
    return parse_document_amount_value(value)
|
||
|
||
@staticmethod
def _parse_plain_document_amount_value(value: str) -> Decimal | None:
    """Delegate to the module-level ``parse_plain_document_amount_value``.

    The argument and the result are passed through unchanged.
    """
    return parse_plain_document_amount_value(value)
|
||
|
||
@staticmethod
def _is_probable_year_amount(amount: Decimal | None) -> bool:
    """Delegate to the module-level ``is_probable_year_amount``.

    The argument and the result are passed through unchanged.
    """
    return is_probable_year_amount(amount)
|
||
|
||
@classmethod
def _is_date_like_amount_candidate(cls, amount: Decimal | None, text: str) -> bool:
    """Delegate to the module-level ``is_date_like_amount_candidate``.

    ``cls`` is not used; both arguments and the result are passed through
    unchanged.
    """
    return is_date_like_amount_candidate(amount, text)
|
||
|
||
@staticmethod
def _format_decimal_amount(amount: Decimal | None) -> str:
    """Delegate to the module-level ``format_decimal_amount``.

    The argument and the result are passed through unchanged.
    """
    return format_decimal_amount(amount)
|
||
|
||
def _resolve_document_item_date(self, document: dict[str, Any], *, fallback: date) -> date:
|
||
return self._resolve_document_item_date_candidate(document) or fallback
|
||
|
||
def _resolve_document_item_date_candidate(self, document: dict[str, Any]) -> date | None:
    """Return the best date found for *document*, or ``None``.

    Sources are tried in order and the first parsed date wins:

    1. when the lowercased ``document_type`` is in
       ``DOCUMENT_TRIP_DATE_LABELS``, fields matching the trip-date keys or
       label tokens;
    2. fields matching the generic date keys or label tokens, skipping any
       field whose label contains an invoice-date token;
    3. the first date found in the free text, which is the ``summary`` and
       ``text`` values joined by a space.

    An earlier version of this method carried a second field scan after the
    final ``return``; it could never run and has been removed.
    """
    document_type = str(document.get("document_type") or "").strip().lower()
    if document_type in DOCUMENT_TRIP_DATE_LABELS:
        trip_date = self._resolve_document_date_from_fields(
            document,
            keys=DOCUMENT_TRIP_DATE_KEYS,
            labels=DOCUMENT_TRIP_DATE_LABEL_TOKENS,
        )
        if trip_date is not None:
            return trip_date

    generic_date = self._resolve_document_date_from_fields(
        document,
        keys=DOCUMENT_GENERIC_DATE_KEYS,
        labels=DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
        excluded_labels=DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
    )
    if generic_date is not None:
        return generic_date

    free_text = " ".join(
        [
            str(document.get("summary") or "").strip(),
            str(document.get("text") or "").strip(),
        ]
    ).strip()
    # The parser already returns None when the text holds no date.
    return self._parse_document_date(free_text)
|
||
|
||
def _resolve_document_date_from_fields(
|
||
self,
|
||
document: dict[str, Any],
|
||
*,
|
||
keys: set[str],
|
||
labels: tuple[str, ...],
|
||
excluded_labels: tuple[str, ...] = (),
|
||
) -> date | None:
|
||
for field in list(document.get("document_fields") or []):
|
||
if not isinstance(field, dict):
|
||
continue
|
||
key = str(field.get("key") or "").strip().lower().replace("_", "")
|
||
label = str(field.get("label") or "").replace(" ", "")
|
||
if excluded_labels and any(token in label for token in excluded_labels):
|
||
continue
|
||
if key not in keys and not any(token in label for token in labels):
|
||
continue
|
||
parsed = self._parse_document_date(str(field.get("value") or ""))
|
||
if parsed is not None:
|
||
return parsed
|
||
return None
|
||
|
||
@staticmethod
def _parse_document_date(value: str) -> date | None:
    """Parse the first date that ``DOCUMENT_DATE_PATTERN`` finds in *value*.

    Group 1 of the match is taken as the date text. ``年``, ``月``, ``/``
    and ``.`` are treated as ``-`` and ``日`` is dropped, after which the
    text must split into exactly three non-empty parts read as year, month
    and day. Returns ``None`` when nothing matches, when the part count is
    not three, or when the parts do not form a valid calendar date.
    """
    found = DOCUMENT_DATE_PATTERN.search(str(value or ""))
    if not found:
        return None

    captured = str(found.group(1) or "").strip()
    separator_table = str.maketrans({"年": "-", "月": "-", "日": None, "/": "-", ".": "-"})
    pieces = [piece for piece in captured.translate(separator_table).split("-") if piece]
    if len(pieces) != 3:
        return None

    year_text, month_text, day_text = pieces
    try:
        return date(int(year_text), int(month_text), int(day_text))
    except ValueError:
        # Non-numeric parts and impossible dates both land here.
        return None
|