- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识 - receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本 - document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配 - receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
762 lines
30 KiB
Python
762 lines
30 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import shutil
|
|
import uuid
|
|
from collections import defaultdict
|
|
from datetime import UTC, date, datetime, timedelta
|
|
from decimal import Decimal, InvalidOperation
|
|
from pathlib import Path
|
|
from types import SimpleNamespace
|
|
from typing import Any
|
|
|
|
from sqlalchemy import func, or_, select
|
|
from sqlalchemy import inspect as sqlalchemy_inspect
|
|
from sqlalchemy.exc import IntegrityError
|
|
from sqlalchemy.orm import Session, selectinload
|
|
|
|
from app.api.deps import CurrentUserContext
|
|
from app.core.agent_enums import AgentAssetDomain, AgentAssetStatus, AgentAssetType
|
|
from app.models.agent_asset import AgentAsset
|
|
from app.models.employee import Employee
|
|
from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
|
|
from app.schemas.ontology import OntologyEntity, OntologyParseResult
|
|
from app.schemas.reimbursement import (
|
|
ExpenseClaimItemCreate,
|
|
ExpenseClaimItemUpdate,
|
|
ExpenseClaimUpdate,
|
|
TravelReimbursementCalculatorRequest,
|
|
)
|
|
from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
|
|
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
|
|
from app.services.agent_foundation import AgentFoundationService
|
|
from app.services.audit import AuditLogService
|
|
from app.services.document_preview import DocumentPreviewAssets
|
|
from app.services.document_intelligence import build_document_insight
|
|
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
|
|
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
|
|
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
|
|
from app.services.expense_claim_constants import (
|
|
EXPENSE_TYPE_LABELS,
|
|
MAX_DRAFT_CLAIMS_PER_USER,
|
|
EDITABLE_CLAIM_STATUSES,
|
|
SYSTEM_GENERATED_ITEM_TYPES,
|
|
TRAVEL_DETAIL_ITEM_TYPES,
|
|
TRAVEL_ALLOWANCE_TRIGGER_ITEM_TYPES,
|
|
DOCUMENT_TYPE_ITEM_TYPE_MAP,
|
|
DOCUMENT_TYPE_SCENE_MAP,
|
|
DOCUMENT_FACT_ITEM_TYPES,
|
|
ROUTE_DESCRIPTION_ITEM_TYPES,
|
|
DOCUMENT_TRIP_DATE_LABELS,
|
|
DOCUMENT_TRIP_DATE_REQUIREMENT_LABELS,
|
|
DOCUMENT_TRIP_DATE_KEYS,
|
|
DOCUMENT_GENERIC_DATE_KEYS,
|
|
DOCUMENT_INVOICE_DATE_KEYS,
|
|
DOCUMENT_TRIP_DATE_LABEL_TOKENS,
|
|
DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
|
|
DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
|
|
DOCUMENT_ROUTE_FORMAT_PATTERN,
|
|
DOCUMENT_ROUTE_TEXT_PATTERN,
|
|
DOCUMENT_ROUTE_ORIGIN_LABELS,
|
|
DOCUMENT_ROUTE_DESTINATION_LABELS,
|
|
GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES,
|
|
LOCATION_REQUIRED_EXPENSE_TYPES,
|
|
EXPENSE_SCENE_KEYWORDS,
|
|
EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES,
|
|
DOCUMENT_SCENE_LABELS,
|
|
DOCUMENT_ASSOCIATION_REVIEW_ACTIONS,
|
|
PERSISTENT_EXPENSE_REVIEW_ACTIONS,
|
|
RETURN_REASON_OPTIONS,
|
|
MAX_CLAIM_NO_RETRY_ATTEMPTS,
|
|
DOCUMENT_DATE_PATTERN,
|
|
SYSTEM_GENERATED_REASON_PREFIXES,
|
|
LEADING_REASON_TIME_PATTERNS,
|
|
AI_REVIEW_LOOKBACK_DAYS,
|
|
AI_REVIEW_REPEAT_RISK_WARNING_COUNT,
|
|
AI_REVIEW_REPEAT_RISK_BLOCK_COUNT,
|
|
TRAVEL_REVIEW_RELEVANT_EXPENSE_TYPES,
|
|
TRAVEL_REVIEW_LONG_DISTANCE_DOCUMENT_TYPES,
|
|
TRAVEL_POLICY_CITY_TIERS,
|
|
TRAVEL_POLICY_CITY_MATCH_ORDER,
|
|
TRAVEL_POLICY_BAND_LABELS,
|
|
TRAVEL_POLICY_HOTEL_LIMITS,
|
|
TRAVEL_POLICY_ALLOWED_TRANSPORT_LEVELS,
|
|
TRAVEL_POLICY_ROUTE_EXCEPTION_KEYWORDS,
|
|
TRAVEL_POLICY_STANDARD_EXCEPTION_KEYWORDS,
|
|
TRAVEL_POLICY_FLIGHT_CLASS_PATTERNS,
|
|
TRAVEL_POLICY_TRAIN_CLASS_PATTERNS,
|
|
TRAVEL_POLICY_HOTEL_NIGHT_PATTERN,
|
|
)
|
|
from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin
|
|
from app.services.expense_amounts import (
|
|
extract_amount_candidates,
|
|
format_decimal_amount,
|
|
is_amount_match_date_fragment,
|
|
is_date_like_amount_candidate,
|
|
is_probable_year_amount,
|
|
parse_document_amount_value,
|
|
parse_plain_document_amount_value,
|
|
resolve_document_field_amount,
|
|
resolve_document_item_amount,
|
|
resolve_document_text_amount,
|
|
)
|
|
from app.services.expense_rule_runtime import (
|
|
DEFAULT_SCENE_RULE_ASSET_CODE,
|
|
ExpenseRuleRuntimeService,
|
|
RuntimeTravelPolicy,
|
|
build_default_expense_rule_catalog,
|
|
resolve_document_type_label,
|
|
)
|
|
from app.services.ocr import OcrService
|
|
from app.services.receipt_folder import ReceiptFolderService
|
|
|
|
|
|
class ExpenseClaimAttachmentOperationsMixin:
|
|
def upload_claim_item_attachment(
|
|
self,
|
|
*,
|
|
claim_id: str,
|
|
item_id: str,
|
|
filename: str,
|
|
content: bytes,
|
|
media_type: str | None,
|
|
current_user: CurrentUserContext,
|
|
source_receipt_id: str = "",
|
|
) -> dict[str, Any] | None:
|
|
claim, item = self._get_claim_item_or_raise(
|
|
claim_id=claim_id,
|
|
item_id=item_id,
|
|
current_user=current_user,
|
|
)
|
|
if claim is None:
|
|
return None
|
|
|
|
self._ensure_draft_claim(claim)
|
|
self._ensure_mutable_claim_item(item)
|
|
normalized_name = self._attachment_storage.normalize_filename(filename)
|
|
if not content:
|
|
raise ValueError("上传文件不能为空。")
|
|
|
|
before_json = self._serialize_claim(claim)
|
|
attachment_dir = self._attachment_storage.build_item_dir(claim.id, item.id)
|
|
shutil.rmtree(attachment_dir, ignore_errors=True)
|
|
attachment_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
file_path = attachment_dir / normalized_name
|
|
file_path.write_bytes(content)
|
|
resolved_media_type = self._attachment_presentation.resolve_media_type(
|
|
normalized_name,
|
|
fallback=media_type,
|
|
)
|
|
|
|
attachment_analysis = self._build_fallback_attachment_analysis(
|
|
media_type=media_type,
|
|
item=item,
|
|
)
|
|
source_receipt_document = self._resolve_source_receipt_document(
|
|
source_receipt_id=source_receipt_id,
|
|
current_user=current_user,
|
|
fallback_filename=normalized_name,
|
|
fallback_media_type=resolved_media_type,
|
|
)
|
|
ocr_document = None
|
|
document_info = None
|
|
requirement_check = None
|
|
ocr_status = "empty"
|
|
ocr_error = ""
|
|
upload_ocr_document = None
|
|
try:
|
|
ocr_result = OcrService(self.db).recognize_files(
|
|
[(normalized_name, content, media_type or "application/octet-stream")]
|
|
)
|
|
documents = list(ocr_result.documents or [])
|
|
if documents:
|
|
upload_ocr_document = documents[0]
|
|
except Exception as exc: # pragma: no cover - fallback path depends on OCR runtime
|
|
ocr_error = str(exc)
|
|
|
|
ocr_document = self._choose_attachment_ocr_document(
|
|
source_receipt_document=source_receipt_document,
|
|
upload_ocr_document=upload_ocr_document,
|
|
)
|
|
if ocr_document is not None:
|
|
ocr_status = "recognized"
|
|
ocr_error = ""
|
|
document_info = self._build_attachment_document_info(ocr_document)
|
|
self._backfill_item_type_from_attachment(
|
|
item=item,
|
|
document_info=document_info,
|
|
)
|
|
self._backfill_item_amount_from_attachment(
|
|
item=item,
|
|
document=ocr_document,
|
|
document_info=document_info,
|
|
)
|
|
self._backfill_item_date_from_attachment(
|
|
item=item,
|
|
document=ocr_document,
|
|
document_info=document_info,
|
|
)
|
|
self._backfill_item_reason_from_attachment(
|
|
item=item,
|
|
document=ocr_document,
|
|
document_info=document_info,
|
|
)
|
|
requirement_check = self._build_attachment_requirement_check(
|
|
item=item,
|
|
document_info=document_info,
|
|
)
|
|
attachment_analysis = self._build_attachment_analysis(
|
|
document=ocr_document,
|
|
item=item,
|
|
claim=claim,
|
|
document_info=document_info,
|
|
requirement_check=requirement_check,
|
|
)
|
|
elif ocr_error:
|
|
ocr_status = "failed"
|
|
attachment_analysis = self._build_failed_ocr_attachment_analysis(
|
|
media_type=media_type,
|
|
error_message=ocr_error,
|
|
item=item,
|
|
)
|
|
|
|
item.invoice_id = self._attachment_storage.to_storage_key(file_path)
|
|
preview_meta = self._attachment_presentation.build_preview_meta(
|
|
file_path=file_path,
|
|
media_type=resolved_media_type,
|
|
ocr_document=ocr_document,
|
|
)
|
|
meta = {
|
|
"file_name": normalized_name,
|
|
"storage_key": item.invoice_id,
|
|
"media_type": resolved_media_type,
|
|
"size_bytes": len(content),
|
|
"uploaded_at": datetime.now(UTC).isoformat(),
|
|
"previewable": bool(preview_meta["previewable"]),
|
|
"preview_kind": str(preview_meta["preview_kind"]),
|
|
"preview_storage_key": str(preview_meta["preview_storage_key"]),
|
|
"preview_media_type": str(preview_meta["preview_media_type"]),
|
|
"preview_file_name": str(preview_meta["preview_file_name"]),
|
|
"preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
|
|
"analysis": attachment_analysis,
|
|
"document_info": document_info,
|
|
"requirement_check": requirement_check,
|
|
"ocr_status": ocr_status,
|
|
"ocr_error": ocr_error,
|
|
"ocr_text": str(getattr(ocr_document, "text", "") or ""),
|
|
"ocr_summary": str(getattr(ocr_document, "summary", "") or ""),
|
|
"ocr_avg_score": float(getattr(ocr_document, "avg_score", 0.0) or 0.0),
|
|
"ocr_line_count": int(getattr(ocr_document, "line_count", 0) or 0),
|
|
"ocr_classification_source": str(getattr(ocr_document, "classification_source", "") or ""),
|
|
"ocr_classification_confidence": float(getattr(ocr_document, "classification_confidence", 0.0) or 0.0),
|
|
"ocr_classification_evidence": [
|
|
str(item)
|
|
for item in getattr(ocr_document, "classification_evidence", []) or []
|
|
if str(item).strip()
|
|
],
|
|
"ocr_warnings": [str(item) for item in getattr(ocr_document, "warnings", []) or []],
|
|
"source_receipt_id": str(source_receipt_id or "").strip(),
|
|
}
|
|
self._attachment_storage.write_meta(file_path, meta)
|
|
ReceiptFolderService().save_linked_attachment(
|
|
file_path=file_path,
|
|
media_type=resolved_media_type,
|
|
document=ocr_document,
|
|
current_user=current_user,
|
|
claim_id=claim.id,
|
|
claim_no=claim.claim_no,
|
|
item_id=item.id,
|
|
source_receipt_id=source_receipt_id,
|
|
)
|
|
|
|
self._sync_claim_from_items(claim)
|
|
self._refresh_claim_pre_review_flags(claim, is_application_claim=False)
|
|
self.db.commit()
|
|
self.db.refresh(claim)
|
|
|
|
self.audit_service.log_action(
|
|
actor=current_user.name or current_user.username,
|
|
action="expense_claim.attachment_upload",
|
|
resource_type="expense_claim",
|
|
resource_id=claim.id,
|
|
before_json=before_json,
|
|
after_json=self._serialize_claim(claim),
|
|
)
|
|
|
|
return {
|
|
"message": f"{normalized_name} 已上传并关联到当前费用明细。",
|
|
"claim_id": claim.id,
|
|
"item_id": item.id,
|
|
"invoice_id": item.invoice_id,
|
|
"item_date": item.item_date.isoformat() if item.item_date else None,
|
|
"item_type": item.item_type,
|
|
"item_reason": item.item_reason,
|
|
"item_location": item.item_location,
|
|
"item_note": item.item_note,
|
|
"item_amount": item.item_amount,
|
|
"claim_amount": claim.amount,
|
|
"claim_risk_flags": list(claim.risk_flags_json or []),
|
|
"attachment": self._build_attachment_payload(item),
|
|
}
|
|
|
|
def _resolve_source_receipt_document(
|
|
self,
|
|
*,
|
|
source_receipt_id: str,
|
|
current_user: CurrentUserContext,
|
|
fallback_filename: str,
|
|
fallback_media_type: str,
|
|
) -> SimpleNamespace | None:
|
|
normalized_receipt_id = str(source_receipt_id or "").strip()
|
|
if not normalized_receipt_id:
|
|
return None
|
|
|
|
try:
|
|
receipt = ReceiptFolderService().get_receipt(normalized_receipt_id, current_user)
|
|
except FileNotFoundError:
|
|
return None
|
|
|
|
raw_meta = receipt.raw_meta if isinstance(receipt.raw_meta, dict) else {}
|
|
fields = self._normalize_receipt_document_fields(
|
|
[field.model_dump() for field in list(receipt.fields or [])]
|
|
)
|
|
if not fields:
|
|
fields = self._normalize_receipt_document_fields(raw_meta.get("document_fields"))
|
|
|
|
document = SimpleNamespace(
|
|
filename=str(receipt.file_name or fallback_filename or "").strip(),
|
|
media_type=str(receipt.media_type or fallback_media_type or "application/octet-stream").strip(),
|
|
engine=str(receipt.engine or raw_meta.get("engine") or ""),
|
|
model=str(receipt.model or raw_meta.get("model") or ""),
|
|
text=str(receipt.ocr_text or raw_meta.get("ocr_text") or ""),
|
|
summary=str(receipt.summary or raw_meta.get("summary") or ""),
|
|
avg_score=float(receipt.avg_score or raw_meta.get("ocr_avg_score") or 0.0),
|
|
line_count=int(receipt.line_count or raw_meta.get("ocr_line_count") or 0),
|
|
page_count=max(1, int(receipt.page_count or raw_meta.get("page_count") or 1)),
|
|
document_type=str(receipt.document_type or raw_meta.get("document_type") or "other").strip(),
|
|
document_type_label=str(
|
|
receipt.document_type_label or raw_meta.get("document_type_label") or "其他单据"
|
|
).strip(),
|
|
scene_code=str(receipt.scene_code or raw_meta.get("scene_code") or "other").strip(),
|
|
scene_label=str(receipt.scene_label or raw_meta.get("scene_label") or "其他票据").strip(),
|
|
classification_source=str(raw_meta.get("ocr_classification_source") or "receipt_folder"),
|
|
classification_confidence=float(
|
|
receipt.classification_confidence
|
|
or raw_meta.get("ocr_classification_confidence")
|
|
or 0.0
|
|
),
|
|
classification_evidence=[
|
|
str(value)
|
|
for value in list(
|
|
receipt.classification_evidence
|
|
or raw_meta.get("ocr_classification_evidence")
|
|
or []
|
|
)
|
|
if str(value).strip()
|
|
],
|
|
document_fields=fields,
|
|
preview_kind=str(raw_meta.get("preview_kind") or ""),
|
|
preview_data_url="",
|
|
warnings=[
|
|
str(value)
|
|
for value in list(receipt.warnings or raw_meta.get("ocr_warnings") or [])
|
|
if str(value).strip()
|
|
],
|
|
)
|
|
return document if self._attachment_ocr_signal_score(document) > 0 else None
|
|
|
|
@staticmethod
|
|
def _normalize_receipt_document_fields(raw_fields: Any) -> list[dict[str, str]]:
|
|
fields: list[dict[str, str]] = []
|
|
for field in list(raw_fields or []):
|
|
if isinstance(field, dict):
|
|
key = str(field.get("key") or "").strip()
|
|
label = str(field.get("label") or "").strip()
|
|
value = str(field.get("value") or "").strip()
|
|
else:
|
|
key = str(getattr(field, "key", "") or "").strip()
|
|
label = str(getattr(field, "label", "") or "").strip()
|
|
value = str(getattr(field, "value", "") or "").strip()
|
|
if label and value:
|
|
fields.append({"key": key, "label": label, "value": value})
|
|
return fields
|
|
|
|
@classmethod
|
|
def _choose_attachment_ocr_document(
|
|
cls,
|
|
*,
|
|
source_receipt_document: Any | None,
|
|
upload_ocr_document: Any | None,
|
|
) -> Any | None:
|
|
source_score = cls._attachment_ocr_signal_score(source_receipt_document)
|
|
upload_score = cls._attachment_ocr_signal_score(upload_ocr_document)
|
|
if source_score <= 0:
|
|
return upload_ocr_document if upload_score > 0 else None
|
|
if upload_score <= 0:
|
|
return source_receipt_document
|
|
|
|
source_type = cls._attachment_document_type(source_receipt_document)
|
|
upload_type = cls._attachment_document_type(upload_ocr_document)
|
|
if source_type not in {"", "other"} and upload_type in {"", "other"}:
|
|
return source_receipt_document
|
|
if (
|
|
source_type == upload_type
|
|
and cls._attachment_document_field_count(source_receipt_document)
|
|
> cls._attachment_document_field_count(upload_ocr_document)
|
|
):
|
|
return source_receipt_document
|
|
if source_score > upload_score + 2:
|
|
return source_receipt_document
|
|
return upload_ocr_document
|
|
|
|
@classmethod
|
|
def _attachment_ocr_signal_score(cls, document: Any | None) -> int:
|
|
if document is None:
|
|
return 0
|
|
score = 0
|
|
document_type = cls._attachment_document_type(document)
|
|
if document_type not in {"", "other"}:
|
|
score += 4
|
|
score += min(3, cls._attachment_document_field_count(document))
|
|
if str(getattr(document, "text", "") or "").strip():
|
|
score += 2
|
|
if str(getattr(document, "summary", "") or "").strip():
|
|
score += 1
|
|
if int(getattr(document, "line_count", 0) or 0) > 0:
|
|
score += 1
|
|
return score
|
|
|
|
@staticmethod
|
|
def _attachment_document_type(document: Any | None) -> str:
|
|
return str(getattr(document, "document_type", "") or "").strip().lower()
|
|
|
|
@staticmethod
|
|
def _attachment_document_field_count(document: Any | None) -> int:
|
|
if document is None:
|
|
return 0
|
|
return len(list(getattr(document, "document_fields", []) or []))
|
|
|
|
def get_claim_item_attachment_meta(
|
|
self,
|
|
*,
|
|
claim_id: str,
|
|
item_id: str,
|
|
current_user: CurrentUserContext,
|
|
) -> dict[str, Any] | None:
|
|
claim, item = self._get_claim_item_or_raise(
|
|
claim_id=claim_id,
|
|
item_id=item_id,
|
|
current_user=current_user,
|
|
)
|
|
if claim is None:
|
|
return None
|
|
|
|
return self._build_attachment_payload(item)
|
|
|
|
def get_claim_item_attachment_content(
|
|
self,
|
|
*,
|
|
claim_id: str,
|
|
item_id: str,
|
|
current_user: CurrentUserContext,
|
|
) -> tuple[Path, str, str] | None:
|
|
claim, item = self._get_claim_item_or_raise(
|
|
claim_id=claim_id,
|
|
item_id=item_id,
|
|
current_user=current_user,
|
|
)
|
|
if claim is None:
|
|
return None
|
|
|
|
return self._resolve_item_attachment_content(item)
|
|
|
|
def get_claim_item_attachment_preview_content(
|
|
self,
|
|
*,
|
|
claim_id: str,
|
|
item_id: str,
|
|
current_user: CurrentUserContext,
|
|
) -> tuple[Path, str, str] | None:
|
|
claim, item = self._get_claim_item_or_raise(
|
|
claim_id=claim_id,
|
|
item_id=item_id,
|
|
current_user=current_user,
|
|
)
|
|
if claim is None:
|
|
return None
|
|
|
|
return self._resolve_item_attachment_preview_content(item)
|
|
|
|
def delete_claim_item_attachment(
|
|
self,
|
|
*,
|
|
claim_id: str,
|
|
item_id: str,
|
|
current_user: CurrentUserContext,
|
|
) -> dict[str, Any] | None:
|
|
claim, item = self._get_claim_item_or_raise(
|
|
claim_id=claim_id,
|
|
item_id=item_id,
|
|
current_user=current_user,
|
|
)
|
|
if claim is None:
|
|
return None
|
|
|
|
self._ensure_draft_claim(claim)
|
|
self._ensure_mutable_claim_item(item)
|
|
before_json = self._serialize_claim(claim)
|
|
previous_invoice_id = str(item.invoice_id or "").strip()
|
|
previous_name = self._attachment_presentation.resolve_display_name(item.invoice_id)
|
|
self._attachment_storage.delete_item_files(item)
|
|
item.invoice_id = None
|
|
claim.risk_flags_json = self._remove_deleted_attachment_risk_flags(
|
|
claim.risk_flags_json,
|
|
item_id=item.id,
|
|
invoice_id=previous_invoice_id,
|
|
)
|
|
|
|
self._sync_claim_from_items(claim)
|
|
self._refresh_claim_pre_review_flags(claim, is_application_claim=False)
|
|
self.db.commit()
|
|
self.db.refresh(claim)
|
|
|
|
self.audit_service.log_action(
|
|
actor=current_user.name or current_user.username,
|
|
action="expense_claim.attachment_delete",
|
|
resource_type="expense_claim",
|
|
resource_id=claim.id,
|
|
before_json=before_json,
|
|
after_json=self._serialize_claim(claim),
|
|
)
|
|
|
|
return {
|
|
"message": f"{previous_name or '附件'} 已删除。",
|
|
"claim_id": claim.id,
|
|
"item_id": item.id,
|
|
"invoice_id": item.invoice_id,
|
|
"claim_risk_flags": list(claim.risk_flags_json or []),
|
|
"attachment": None,
|
|
}
|
|
|
|
@staticmethod
|
|
def _remove_deleted_attachment_risk_flags(
|
|
risk_flags: Any,
|
|
*,
|
|
item_id: str | None,
|
|
invoice_id: str | None,
|
|
) -> list[Any]:
|
|
normalized_item_id = str(item_id or "").strip()
|
|
normalized_invoice_id = str(invoice_id or "").strip()
|
|
cleaned_flags: list[Any] = []
|
|
for flag in list(risk_flags or []):
|
|
if not isinstance(flag, dict):
|
|
cleaned_flags.append(flag)
|
|
continue
|
|
|
|
source = str(flag.get("source") or "").strip()
|
|
if source != "attachment_analysis":
|
|
cleaned_flags.append(flag)
|
|
continue
|
|
|
|
flag_item_id = str(flag.get("item_id") or flag.get("itemId") or "").strip()
|
|
flag_invoice_id = str(flag.get("invoice_id") or flag.get("invoiceId") or "").strip()
|
|
matches_deleted_item = bool(normalized_item_id and flag_item_id == normalized_item_id)
|
|
matches_deleted_invoice = bool(normalized_invoice_id and flag_invoice_id == normalized_invoice_id)
|
|
if matches_deleted_item or matches_deleted_invoice:
|
|
continue
|
|
|
|
cleaned_flags.append(flag)
|
|
return cleaned_flags
|
|
|
|
def _get_claim_item_or_raise(
|
|
self,
|
|
*,
|
|
claim_id: str,
|
|
item_id: str,
|
|
current_user: CurrentUserContext,
|
|
) -> tuple[ExpenseClaim | None, ExpenseClaimItem]:
|
|
claim = self.get_claim(claim_id, current_user)
|
|
if claim is None:
|
|
return None, None # type: ignore[return-value]
|
|
|
|
item = next((entry for entry in claim.items if entry.id == item_id), None)
|
|
if item is None:
|
|
raise LookupError("Item not found")
|
|
return claim, item
|
|
|
|
def _resolve_item_attachment_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
|
|
file_path = self._attachment_storage.resolve_item_path(item)
|
|
if file_path is None or not file_path.exists():
|
|
raise FileNotFoundError("Attachment not found")
|
|
|
|
metadata = self._attachment_storage.read_meta(file_path)
|
|
filename = str(metadata.get("file_name") or file_path.name)
|
|
media_type = self._attachment_presentation.resolve_media_type(
|
|
filename,
|
|
fallback=str(metadata.get("media_type") or ""),
|
|
)
|
|
return file_path, media_type, filename
|
|
|
|
def _repair_pdf_text_layer_metadata_if_needed(
|
|
self,
|
|
*,
|
|
file_path: Path,
|
|
metadata: dict[str, Any],
|
|
item: ExpenseClaimItem | None = None,
|
|
) -> dict[str, Any]:
|
|
if not metadata:
|
|
return metadata
|
|
|
|
media_type = str(metadata.get("media_type") or self._attachment_presentation.resolve_media_type(file_path.name)).strip()
|
|
if media_type != "application/pdf":
|
|
return metadata
|
|
|
|
ocr_text = str(metadata.get("ocr_text") or "")
|
|
ocr_summary = str(metadata.get("ocr_summary") or "")
|
|
if OcrService._placeholder_ratio(f"{ocr_summary}\n{ocr_text}") < 0.12:
|
|
return metadata
|
|
|
|
text_layer = OcrService(self.db)._extract_pdf_text_layer(file_path)
|
|
repaired_text, used_text_layer = OcrService._choose_document_text(
|
|
ocr_text=ocr_text,
|
|
text_layer=text_layer,
|
|
)
|
|
if not used_text_layer or not repaired_text:
|
|
return metadata
|
|
|
|
repaired_summary = OcrService._summarize_text(repaired_text)
|
|
document = SimpleNamespace(
|
|
filename=str(metadata.get("file_name") or file_path.name),
|
|
text=repaired_text,
|
|
summary=repaired_summary,
|
|
avg_score=float(metadata.get("ocr_avg_score") or 0.0),
|
|
line_count=int(metadata.get("ocr_line_count") or 0),
|
|
document_type="",
|
|
document_type_label="",
|
|
scene_code="",
|
|
scene_label="",
|
|
document_fields=[],
|
|
warnings=[str(value) for value in list(metadata.get("ocr_warnings") or []) if str(value).strip()],
|
|
)
|
|
document_info = self._build_attachment_document_info(document)
|
|
document.document_type = document_info.get("document_type", "")
|
|
document.document_type_label = document_info.get("document_type_label", "")
|
|
document.scene_code = document_info.get("scene_code", "")
|
|
document.scene_label = document_info.get("scene_label", "")
|
|
document.document_fields = list(document_info.get("fields") or [])
|
|
|
|
metadata["ocr_text"] = repaired_text
|
|
metadata["ocr_summary"] = repaired_summary
|
|
metadata["document_info"] = document_info
|
|
metadata["previewable"] = True
|
|
metadata["preview_kind"] = "pdf"
|
|
metadata["preview_storage_key"] = str(
|
|
metadata.get("storage_key") or self._attachment_storage.to_storage_key(file_path)
|
|
)
|
|
metadata["preview_media_type"] = "application/pdf"
|
|
metadata["preview_file_name"] = str(metadata.get("file_name") or file_path.name)
|
|
|
|
if item is not None:
|
|
requirement_check = self._build_attachment_requirement_check(
|
|
item=item,
|
|
document_info=document_info,
|
|
)
|
|
metadata["requirement_check"] = requirement_check
|
|
metadata["analysis"] = self._build_attachment_analysis(
|
|
document=document,
|
|
item=item,
|
|
claim=getattr(item, "claim", None),
|
|
document_info=document_info,
|
|
requirement_check=requirement_check,
|
|
)
|
|
|
|
self._attachment_storage.write_meta(file_path, metadata)
|
|
return metadata
|
|
|
|
def _refresh_pdf_attachment_preview_meta_if_needed(
|
|
self,
|
|
*,
|
|
file_path: Path,
|
|
metadata: dict[str, Any],
|
|
) -> dict[str, Any]:
|
|
if not metadata:
|
|
return metadata
|
|
|
|
media_type = str(
|
|
metadata.get("media_type")
|
|
or self._attachment_presentation.resolve_media_type(file_path.name)
|
|
).strip()
|
|
if media_type != "application/pdf":
|
|
return metadata
|
|
|
|
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
|
|
preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None
|
|
if (
|
|
preview_path is not None
|
|
and preview_path.exists()
|
|
and str(metadata.get("preview_kind") or "").strip() == "image"
|
|
and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
|
|
and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
|
|
):
|
|
return metadata
|
|
|
|
preview_name = str(metadata.get("preview_file_name") or "").strip()
|
|
if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
|
|
preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
|
|
preview_path = file_path.parent / preview_name
|
|
|
|
try:
|
|
DocumentPreviewAssets.render_pdf_first_page(
|
|
pdf_path=file_path,
|
|
preview_path=preview_path,
|
|
timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
|
|
)
|
|
except Exception:
|
|
return metadata
|
|
|
|
metadata.update(
|
|
{
|
|
"previewable": True,
|
|
"preview_kind": "image",
|
|
"preview_storage_key": self._attachment_storage.to_storage_key(preview_path),
|
|
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
|
|
"preview_file_name": preview_path.name,
|
|
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
|
|
}
|
|
)
|
|
self._attachment_storage.write_meta(file_path, metadata)
|
|
return metadata
|
|
|
|
def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
|
|
file_path, media_type, filename = self._resolve_item_attachment_content(item)
|
|
metadata = self._attachment_storage.read_meta(file_path)
|
|
metadata = self._repair_pdf_text_layer_metadata_if_needed(
|
|
file_path=file_path,
|
|
metadata=metadata,
|
|
item=item,
|
|
)
|
|
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
|
|
file_path=file_path,
|
|
metadata=metadata,
|
|
)
|
|
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
|
|
preview_file_name = str(metadata.get("preview_file_name") or "").strip()
|
|
preview_media_type = str(metadata.get("preview_media_type") or "").strip()
|
|
|
|
if preview_storage_key:
|
|
preview_path = self._attachment_storage.resolve_path(preview_storage_key)
|
|
if preview_path is not None and preview_path.exists():
|
|
resolved_name = preview_file_name or preview_path.name
|
|
resolved_media_type = self._attachment_presentation.resolve_media_type(
|
|
resolved_name,
|
|
fallback=preview_media_type,
|
|
)
|
|
return preview_path, resolved_media_type, resolved_name
|
|
|
|
if self._attachment_presentation.is_previewable_media_type(media_type, filename):
|
|
return file_path, media_type, filename
|
|
|
|
raise FileNotFoundError("Attachment preview not found")
|