refactor(server): split oversized backend services
This commit is contained in:
396
server/src/app/services/expense_claim_document_parsing.py
Normal file
396
server/src/app/services/expense_claim_document_parsing.py
Normal file
@@ -0,0 +1,396 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from datetime import UTC, date, datetime, timedelta
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import func, or_, select
|
||||
from sqlalchemy import inspect as sqlalchemy_inspect
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import Session, selectinload
|
||||
|
||||
from app.api.deps import CurrentUserContext
|
||||
from app.core.agent_enums import AgentAssetDomain, AgentAssetStatus, AgentAssetType
|
||||
from app.models.agent_asset import AgentAsset
|
||||
from app.models.employee import Employee
|
||||
from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
|
||||
from app.schemas.ontology import OntologyEntity, OntologyParseResult
|
||||
from app.schemas.reimbursement import (
|
||||
ExpenseClaimItemCreate,
|
||||
ExpenseClaimItemUpdate,
|
||||
ExpenseClaimUpdate,
|
||||
TravelReimbursementCalculatorRequest,
|
||||
)
|
||||
from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
|
||||
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
|
||||
from app.services.agent_foundation import AgentFoundationService
|
||||
from app.services.audit import AuditLogService
|
||||
from app.services.document_intelligence import build_document_insight
|
||||
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
|
||||
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
|
||||
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
|
||||
from app.services.expense_claim_constants import (
|
||||
EXPENSE_TYPE_LABELS,
|
||||
MAX_DRAFT_CLAIMS_PER_USER,
|
||||
EDITABLE_CLAIM_STATUSES,
|
||||
SYSTEM_GENERATED_ITEM_TYPES,
|
||||
TRAVEL_DETAIL_ITEM_TYPES,
|
||||
TRAVEL_ALLOWANCE_TRIGGER_ITEM_TYPES,
|
||||
DOCUMENT_TYPE_ITEM_TYPE_MAP,
|
||||
DOCUMENT_TYPE_SCENE_MAP,
|
||||
DOCUMENT_FACT_ITEM_TYPES,
|
||||
ROUTE_DESCRIPTION_ITEM_TYPES,
|
||||
DOCUMENT_TRIP_DATE_LABELS,
|
||||
DOCUMENT_TRIP_DATE_REQUIREMENT_LABELS,
|
||||
DOCUMENT_TRIP_DATE_KEYS,
|
||||
DOCUMENT_GENERIC_DATE_KEYS,
|
||||
DOCUMENT_INVOICE_DATE_KEYS,
|
||||
DOCUMENT_TRIP_DATE_LABEL_TOKENS,
|
||||
DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
|
||||
DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
|
||||
DOCUMENT_ROUTE_FORMAT_PATTERN,
|
||||
DOCUMENT_ROUTE_TEXT_PATTERN,
|
||||
DOCUMENT_ROUTE_ORIGIN_LABELS,
|
||||
DOCUMENT_ROUTE_DESTINATION_LABELS,
|
||||
GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES,
|
||||
LOCATION_REQUIRED_EXPENSE_TYPES,
|
||||
EXPENSE_SCENE_KEYWORDS,
|
||||
EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES,
|
||||
DOCUMENT_SCENE_LABELS,
|
||||
DOCUMENT_ASSOCIATION_REVIEW_ACTIONS,
|
||||
PERSISTENT_EXPENSE_REVIEW_ACTIONS,
|
||||
RETURN_REASON_OPTIONS,
|
||||
MAX_CLAIM_NO_RETRY_ATTEMPTS,
|
||||
DOCUMENT_DATE_PATTERN,
|
||||
SYSTEM_GENERATED_REASON_PREFIXES,
|
||||
LEADING_REASON_TIME_PATTERNS,
|
||||
AI_REVIEW_LOOKBACK_DAYS,
|
||||
AI_REVIEW_REPEAT_RISK_WARNING_COUNT,
|
||||
AI_REVIEW_REPEAT_RISK_BLOCK_COUNT,
|
||||
TRAVEL_REVIEW_RELEVANT_EXPENSE_TYPES,
|
||||
TRAVEL_REVIEW_LONG_DISTANCE_DOCUMENT_TYPES,
|
||||
TRAVEL_POLICY_CITY_TIERS,
|
||||
TRAVEL_POLICY_CITY_MATCH_ORDER,
|
||||
TRAVEL_POLICY_BAND_LABELS,
|
||||
TRAVEL_POLICY_HOTEL_LIMITS,
|
||||
TRAVEL_POLICY_ALLOWED_TRANSPORT_LEVELS,
|
||||
TRAVEL_POLICY_ROUTE_EXCEPTION_KEYWORDS,
|
||||
TRAVEL_POLICY_STANDARD_EXCEPTION_KEYWORDS,
|
||||
TRAVEL_POLICY_FLIGHT_CLASS_PATTERNS,
|
||||
TRAVEL_POLICY_TRAIN_CLASS_PATTERNS,
|
||||
TRAVEL_POLICY_HOTEL_NIGHT_PATTERN,
|
||||
)
|
||||
from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin
|
||||
from app.services.expense_amounts import (
|
||||
extract_amount_candidates,
|
||||
format_decimal_amount,
|
||||
is_amount_match_date_fragment,
|
||||
is_date_like_amount_candidate,
|
||||
is_probable_year_amount,
|
||||
parse_document_amount_value,
|
||||
parse_plain_document_amount_value,
|
||||
resolve_document_field_amount,
|
||||
resolve_document_item_amount,
|
||||
resolve_document_text_amount,
|
||||
)
|
||||
from app.services.expense_rule_runtime import (
|
||||
DEFAULT_SCENE_RULE_ASSET_CODE,
|
||||
ExpenseRuleRuntimeService,
|
||||
RuntimeTravelPolicy,
|
||||
build_default_expense_rule_catalog,
|
||||
resolve_document_type_label,
|
||||
)
|
||||
from app.services.ocr import OcrService
|
||||
|
||||
|
||||
class ExpenseClaimDocumentParsingMixin:
|
||||
def _resolve_document_route_value(self, document: dict[str, Any]) -> str:
|
||||
route = self._resolve_document_fact_field(
|
||||
document,
|
||||
keys={"route", "trip_route"},
|
||||
labels={"行程", "路线"},
|
||||
)
|
||||
if route:
|
||||
return route
|
||||
|
||||
origin = self._resolve_document_fact_field(
|
||||
document,
|
||||
keys={
|
||||
"origin",
|
||||
"from",
|
||||
"from_city",
|
||||
"departure",
|
||||
"departure_city",
|
||||
"start",
|
||||
"start_location",
|
||||
"start_address",
|
||||
"pickup_location",
|
||||
"pickup_address",
|
||||
"boarding_station",
|
||||
},
|
||||
labels=DOCUMENT_ROUTE_ORIGIN_LABELS,
|
||||
)
|
||||
destination = self._resolve_document_fact_field(
|
||||
document,
|
||||
keys={
|
||||
"destination",
|
||||
"to",
|
||||
"to_city",
|
||||
"arrival",
|
||||
"arrival_city",
|
||||
"end",
|
||||
"end_location",
|
||||
"end_address",
|
||||
"dropoff_location",
|
||||
"dropoff_address",
|
||||
"alighting_station",
|
||||
},
|
||||
labels=DOCUMENT_ROUTE_DESTINATION_LABELS,
|
||||
)
|
||||
if origin and destination:
|
||||
return f"{origin}-{destination}"
|
||||
|
||||
text = " ".join(
|
||||
[
|
||||
str(document.get("summary") or "").strip(),
|
||||
str(document.get("text") or "").strip(),
|
||||
]
|
||||
).strip()
|
||||
text_route = self._extract_document_route_from_text(text)
|
||||
if text_route:
|
||||
return text_route
|
||||
|
||||
text_origin = self._extract_document_labeled_text_value(text, DOCUMENT_ROUTE_ORIGIN_LABELS)
|
||||
text_destination = self._extract_document_labeled_text_value(text, DOCUMENT_ROUTE_DESTINATION_LABELS)
|
||||
if text_origin and text_destination:
|
||||
return f"{text_origin}-{text_destination}"
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _resolve_document_fact_field(
|
||||
document: dict[str, Any],
|
||||
*,
|
||||
keys: set[str],
|
||||
labels: set[str],
|
||||
) -> str:
|
||||
raw_fields = document.get("document_fields")
|
||||
if not isinstance(raw_fields, list):
|
||||
raw_fields = document.get("fields")
|
||||
if not isinstance(raw_fields, list):
|
||||
return ""
|
||||
|
||||
normalized_keys = {str(key or "").strip().lower().replace("_", "") for key in keys}
|
||||
for field in raw_fields:
|
||||
if not isinstance(field, dict):
|
||||
continue
|
||||
field_key = str(field.get("key") or "").strip().lower().replace("_", "")
|
||||
label = str(field.get("label") or "").replace(" ", "")
|
||||
value = str(field.get("value") or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
if field_key in normalized_keys or any(token in label for token in labels):
|
||||
return value
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _format_document_route(route: str) -> str:
|
||||
normalized = (
|
||||
str(route or "")
|
||||
.strip()
|
||||
.replace("->", "-")
|
||||
.replace("→", "-")
|
||||
.replace("—", "-")
|
||||
.replace("–", "-")
|
||||
.replace("至", "-")
|
||||
.replace("到", "-")
|
||||
)
|
||||
if "-" not in normalized:
|
||||
return str(route or "").strip()
|
||||
origin, destination = [part.strip() for part in normalized.split("-", 1)]
|
||||
origin = origin.removeprefix("从").strip()
|
||||
destination = destination.removeprefix("至").removeprefix("到").strip()
|
||||
if not origin or not destination or origin == destination:
|
||||
return str(route or "").strip()
|
||||
return f"{origin}-{destination}"
|
||||
|
||||
@staticmethod
|
||||
def _extract_document_route_from_text(text: str) -> str:
|
||||
for match in DOCUMENT_ROUTE_TEXT_PATTERN.finditer(str(text or "")):
|
||||
origin = str(match.group(1) or "").strip()
|
||||
destination = str(match.group(2) or "").strip()
|
||||
if not origin or not destination or origin == destination:
|
||||
continue
|
||||
if origin.isdigit() and destination.isdigit():
|
||||
continue
|
||||
if DOCUMENT_DATE_PATTERN.search(f"{origin}-{destination}"):
|
||||
continue
|
||||
return f"{origin}-{destination}"
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _extract_document_labeled_text_value(text: str, labels: set[str]) -> str:
|
||||
for label in sorted(labels, key=len, reverse=True):
|
||||
pattern = re.compile(
|
||||
rf"{re.escape(label)}[::\s]*"
|
||||
r"([A-Za-z0-9\u4e00-\u9fa5()()·\-路街道号弄区县市省园桥站机场中心]{2,50})"
|
||||
)
|
||||
match = pattern.search(str(text or ""))
|
||||
if match:
|
||||
return str(match.group(1) or "").strip()
|
||||
return ""
|
||||
|
||||
def _resolve_document_stay_range(self, document: dict[str, Any]) -> str:
|
||||
check_in = self._resolve_document_fact_field(
|
||||
document,
|
||||
keys={"check_in", "checkin", "arrival_date", "start_date"},
|
||||
labels={"入住", "入住日期", "到店", "开始日期"},
|
||||
)
|
||||
check_out = self._resolve_document_fact_field(
|
||||
document,
|
||||
keys={"check_out", "checkout", "departure_date", "end_date"},
|
||||
labels={"离店", "退房", "离店日期", "结束日期"},
|
||||
)
|
||||
if check_in and check_out:
|
||||
return f"{check_in}至{check_out}"
|
||||
nights = self._resolve_document_fact_field(
|
||||
document,
|
||||
keys={"nights", "night_count", "room_nights"},
|
||||
labels={"间夜", "晚数", "入住天数"},
|
||||
)
|
||||
if nights:
|
||||
return f"{nights}晚"
|
||||
return ""
|
||||
|
||||
def _resolve_document_item_amount(self, document: dict[str, Any]) -> Decimal | None:
|
||||
return resolve_document_item_amount(document)
|
||||
|
||||
def _resolve_document_field_amount(self, document: dict[str, Any]) -> Decimal | None:
|
||||
return resolve_document_field_amount(document)
|
||||
|
||||
def _resolve_document_text_amount(self, text: str) -> Decimal | None:
|
||||
return resolve_document_text_amount(text)
|
||||
|
||||
def _parse_document_amount_value(self, value: str) -> Decimal | None:
|
||||
return parse_document_amount_value(value)
|
||||
|
||||
@staticmethod
|
||||
def _parse_plain_document_amount_value(value: str) -> Decimal | None:
|
||||
return parse_plain_document_amount_value(value)
|
||||
|
||||
@staticmethod
|
||||
def _is_probable_year_amount(amount: Decimal | None) -> bool:
|
||||
return is_probable_year_amount(amount)
|
||||
|
||||
@classmethod
|
||||
def _is_date_like_amount_candidate(cls, amount: Decimal | None, text: str) -> bool:
|
||||
return is_date_like_amount_candidate(amount, text)
|
||||
|
||||
@staticmethod
|
||||
def _format_decimal_amount(amount: Decimal | None) -> str:
|
||||
return format_decimal_amount(amount)
|
||||
|
||||
def _resolve_document_item_date(self, document: dict[str, Any], *, fallback: date) -> date:
|
||||
return self._resolve_document_item_date_candidate(document) or fallback
|
||||
|
||||
def _resolve_document_item_date_candidate(self, document: dict[str, Any]) -> date | None:
|
||||
document_type = str(document.get("document_type") or "").strip().lower()
|
||||
if document_type in DOCUMENT_TRIP_DATE_LABELS:
|
||||
parsed = self._resolve_document_date_from_fields(
|
||||
document,
|
||||
keys=DOCUMENT_TRIP_DATE_KEYS,
|
||||
labels=DOCUMENT_TRIP_DATE_LABEL_TOKENS,
|
||||
)
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
|
||||
parsed = self._resolve_document_date_from_fields(
|
||||
document,
|
||||
keys=DOCUMENT_GENERIC_DATE_KEYS,
|
||||
labels=DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
|
||||
excluded_labels=DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
|
||||
)
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
|
||||
parsed = self._parse_document_date(
|
||||
" ".join(
|
||||
[
|
||||
str(document.get("summary") or "").strip(),
|
||||
str(document.get("text") or "").strip(),
|
||||
]
|
||||
).strip()
|
||||
)
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
|
||||
return None
|
||||
|
||||
for field in list(document.get("document_fields") or []):
|
||||
if not isinstance(field, dict):
|
||||
continue
|
||||
key = str(field.get("key") or "").strip().lower().replace("_", "")
|
||||
label = str(field.get("label") or "").replace(" ", "")
|
||||
value = str(field.get("value") or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
if key in {"date", "time", "issuedat", "issuedate", "invoicedate"} or any(
|
||||
token in label for token in ("日期", "时间", "开票日期", "发生时间")
|
||||
):
|
||||
parsed = self._parse_document_date(value)
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
|
||||
parsed = self._parse_document_date(
|
||||
" ".join(
|
||||
[
|
||||
str(document.get("summary") or "").strip(),
|
||||
str(document.get("text") or "").strip(),
|
||||
]
|
||||
).strip()
|
||||
)
|
||||
return parsed
|
||||
|
||||
def _resolve_document_date_from_fields(
|
||||
self,
|
||||
document: dict[str, Any],
|
||||
*,
|
||||
keys: set[str],
|
||||
labels: tuple[str, ...],
|
||||
excluded_labels: tuple[str, ...] = (),
|
||||
) -> date | None:
|
||||
for field in list(document.get("document_fields") or []):
|
||||
if not isinstance(field, dict):
|
||||
continue
|
||||
key = str(field.get("key") or "").strip().lower().replace("_", "")
|
||||
label = str(field.get("label") or "").replace(" ", "")
|
||||
if excluded_labels and any(token in label for token in excluded_labels):
|
||||
continue
|
||||
if key not in keys and not any(token in label for token in labels):
|
||||
continue
|
||||
parsed = self._parse_document_date(str(field.get("value") or ""))
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _parse_document_date(value: str) -> date | None:
|
||||
match = DOCUMENT_DATE_PATTERN.search(str(value or ""))
|
||||
if not match:
|
||||
return None
|
||||
raw_value = str(match.group(1) or "").strip()
|
||||
normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "")
|
||||
normalized = normalized.replace("/", "-").replace(".", "-")
|
||||
parts = [part for part in normalized.split("-") if part]
|
||||
if len(parts) != 3:
|
||||
return None
|
||||
try:
|
||||
return date(int(parts[0]), int(parts[1]), int(parts[2]))
|
||||
except ValueError:
|
||||
return None
|
||||
Reference in New Issue
Block a user