Files
X-Financial/server/src/app/services/expense_claim_attachment_operations.py
caoxiaozhu 84a8998e59 feat(server): 票据文件夹资产缓存与文档预览统一生成
- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00

762 lines
30 KiB
Python

from __future__ import annotations
import json
import re
import shutil
import uuid
from collections import defaultdict
from datetime import UTC, date, datetime, timedelta
from decimal import Decimal, InvalidOperation
from pathlib import Path
from types import SimpleNamespace
from typing import Any
from sqlalchemy import func, or_, select
from sqlalchemy import inspect as sqlalchemy_inspect
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, selectinload
from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentAssetDomain, AgentAssetStatus, AgentAssetType
from app.models.agent_asset import AgentAsset
from app.models.employee import Employee
from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
from app.schemas.ontology import OntologyEntity, OntologyParseResult
from app.schemas.reimbursement import (
ExpenseClaimItemCreate,
ExpenseClaimItemUpdate,
ExpenseClaimUpdate,
TravelReimbursementCalculatorRequest,
)
from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
from app.services.agent_foundation import AgentFoundationService
from app.services.audit import AuditLogService
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import build_document_insight
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.expense_claim_constants import (
EXPENSE_TYPE_LABELS,
MAX_DRAFT_CLAIMS_PER_USER,
EDITABLE_CLAIM_STATUSES,
SYSTEM_GENERATED_ITEM_TYPES,
TRAVEL_DETAIL_ITEM_TYPES,
TRAVEL_ALLOWANCE_TRIGGER_ITEM_TYPES,
DOCUMENT_TYPE_ITEM_TYPE_MAP,
DOCUMENT_TYPE_SCENE_MAP,
DOCUMENT_FACT_ITEM_TYPES,
ROUTE_DESCRIPTION_ITEM_TYPES,
DOCUMENT_TRIP_DATE_LABELS,
DOCUMENT_TRIP_DATE_REQUIREMENT_LABELS,
DOCUMENT_TRIP_DATE_KEYS,
DOCUMENT_GENERIC_DATE_KEYS,
DOCUMENT_INVOICE_DATE_KEYS,
DOCUMENT_TRIP_DATE_LABEL_TOKENS,
DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
DOCUMENT_ROUTE_FORMAT_PATTERN,
DOCUMENT_ROUTE_TEXT_PATTERN,
DOCUMENT_ROUTE_ORIGIN_LABELS,
DOCUMENT_ROUTE_DESTINATION_LABELS,
GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES,
LOCATION_REQUIRED_EXPENSE_TYPES,
EXPENSE_SCENE_KEYWORDS,
EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES,
DOCUMENT_SCENE_LABELS,
DOCUMENT_ASSOCIATION_REVIEW_ACTIONS,
PERSISTENT_EXPENSE_REVIEW_ACTIONS,
RETURN_REASON_OPTIONS,
MAX_CLAIM_NO_RETRY_ATTEMPTS,
DOCUMENT_DATE_PATTERN,
SYSTEM_GENERATED_REASON_PREFIXES,
LEADING_REASON_TIME_PATTERNS,
AI_REVIEW_LOOKBACK_DAYS,
AI_REVIEW_REPEAT_RISK_WARNING_COUNT,
AI_REVIEW_REPEAT_RISK_BLOCK_COUNT,
TRAVEL_REVIEW_RELEVANT_EXPENSE_TYPES,
TRAVEL_REVIEW_LONG_DISTANCE_DOCUMENT_TYPES,
TRAVEL_POLICY_CITY_TIERS,
TRAVEL_POLICY_CITY_MATCH_ORDER,
TRAVEL_POLICY_BAND_LABELS,
TRAVEL_POLICY_HOTEL_LIMITS,
TRAVEL_POLICY_ALLOWED_TRANSPORT_LEVELS,
TRAVEL_POLICY_ROUTE_EXCEPTION_KEYWORDS,
TRAVEL_POLICY_STANDARD_EXCEPTION_KEYWORDS,
TRAVEL_POLICY_FLIGHT_CLASS_PATTERNS,
TRAVEL_POLICY_TRAIN_CLASS_PATTERNS,
TRAVEL_POLICY_HOTEL_NIGHT_PATTERN,
)
from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin
from app.services.expense_amounts import (
extract_amount_candidates,
format_decimal_amount,
is_amount_match_date_fragment,
is_date_like_amount_candidate,
is_probable_year_amount,
parse_document_amount_value,
parse_plain_document_amount_value,
resolve_document_field_amount,
resolve_document_item_amount,
resolve_document_text_amount,
)
from app.services.expense_rule_runtime import (
DEFAULT_SCENE_RULE_ASSET_CODE,
ExpenseRuleRuntimeService,
RuntimeTravelPolicy,
build_default_expense_rule_catalog,
resolve_document_type_label,
)
from app.services.ocr import OcrService
from app.services.receipt_folder import ReceiptFolderService
class ExpenseClaimAttachmentOperationsMixin:
def upload_claim_item_attachment(
    self,
    *,
    claim_id: str,
    item_id: str,
    filename: str,
    content: bytes,
    media_type: str | None,
    current_user: CurrentUserContext,
    source_receipt_id: str = "",
) -> dict[str, Any] | None:
    """Store an uploaded file as the attachment of one expense-claim item.

    The file is written into the item's attachment directory (replacing the
    directory's previous contents), run through OCR, and the chosen OCR
    document is handed to the ``_backfill_item_*_from_attachment`` helpers
    and to the analysis builders. A metadata record is written next to the
    file, the attachment is registered with ``ReceiptFolderService``, the
    claim is re-synced and committed, and an audit entry is logged.

    Args:
        claim_id: Identifier of the claim that owns the item.
        item_id: Identifier of the item receiving the attachment.
        filename: Client-supplied file name; normalized by the storage helper.
        content: Raw file bytes; must not be empty.
        media_type: Client-declared media type, used as a fallback and for OCR.
        current_user: Caller context used for access checks and auditing.
        source_receipt_id: Optional receipt-folder id whose stored recognition
            result may be preferred over the fresh OCR result.

    Returns:
        ``None`` when the claim cannot be resolved for ``current_user``;
        otherwise a dict with a message, the claim/item ids, the item's
        current field values, the claim amount and risk flags, and the
        attachment payload.

    Raises:
        ValueError: If ``content`` is empty.
        LookupError: If the claim exists but contains no item ``item_id``.
    """
    claim, item = self._get_claim_item_or_raise(
        claim_id=claim_id,
        item_id=item_id,
        current_user=current_user,
    )
    if claim is None:
        return None
    # Guards defined outside this block; presumably they raise when the claim
    # or item may not be modified — confirm against their definitions.
    self._ensure_draft_claim(claim)
    self._ensure_mutable_claim_item(item)
    normalized_name = self._attachment_storage.normalize_filename(filename)
    if not content:
        raise ValueError("上传文件不能为空。")
    # Snapshot taken before any mutation so the audit log can show a diff.
    before_json = self._serialize_claim(claim)
    attachment_dir = self._attachment_storage.build_item_dir(claim.id, item.id)
    # Drop whatever was stored in the item's directory, then write the new file.
    # NOTE(review): the old files are removed before OCR and the DB commit
    # succeed; a later failure would leave the item without its previous
    # attachment — confirm this is acceptable.
    shutil.rmtree(attachment_dir, ignore_errors=True)
    attachment_dir.mkdir(parents=True, exist_ok=True)
    file_path = attachment_dir / normalized_name
    file_path.write_bytes(content)
    resolved_media_type = self._attachment_presentation.resolve_media_type(
        normalized_name,
        fallback=media_type,
    )
    # Default analysis; replaced below when a document is recognized or OCR fails.
    attachment_analysis = self._build_fallback_attachment_analysis(
        media_type=media_type,
        item=item,
    )
    source_receipt_document = self._resolve_source_receipt_document(
        source_receipt_id=source_receipt_id,
        current_user=current_user,
        fallback_filename=normalized_name,
        fallback_media_type=resolved_media_type,
    )
    ocr_document = None
    document_info = None
    requirement_check = None
    ocr_status = "empty"
    ocr_error = ""
    upload_ocr_document = None
    # OCR is best-effort: a failure is captured as text and handled further down.
    try:
        ocr_result = OcrService(self.db).recognize_files(
            [(normalized_name, content, media_type or "application/octet-stream")]
        )
        documents = list(ocr_result.documents or [])
        if documents:
            # Only the first recognized document is used.
            upload_ocr_document = documents[0]
    except Exception as exc: # pragma: no cover - fallback path depends on OCR runtime
        ocr_error = str(exc)
    # Pick between the stored receipt recognition and the fresh OCR result.
    ocr_document = self._choose_attachment_ocr_document(
        source_receipt_document=source_receipt_document,
        upload_ocr_document=upload_ocr_document,
    )
    if ocr_document is not None:
        # A usable document supersedes any error raised by the upload OCR call.
        ocr_status = "recognized"
        ocr_error = ""
        document_info = self._build_attachment_document_info(ocr_document)
        self._backfill_item_type_from_attachment(
            item=item,
            document_info=document_info,
        )
        self._backfill_item_amount_from_attachment(
            item=item,
            document=ocr_document,
            document_info=document_info,
        )
        self._backfill_item_date_from_attachment(
            item=item,
            document=ocr_document,
            document_info=document_info,
        )
        self._backfill_item_reason_from_attachment(
            item=item,
            document=ocr_document,
            document_info=document_info,
        )
        # Built after the backfill calls, so it sees the item as they left it.
        requirement_check = self._build_attachment_requirement_check(
            item=item,
            document_info=document_info,
        )
        attachment_analysis = self._build_attachment_analysis(
            document=ocr_document,
            item=item,
            claim=claim,
            document_info=document_info,
            requirement_check=requirement_check,
        )
    elif ocr_error:
        # No document at all and OCR raised; without an error the status stays "empty".
        ocr_status = "failed"
        attachment_analysis = self._build_failed_ocr_attachment_analysis(
            media_type=media_type,
            error_message=ocr_error,
            item=item,
        )
    # The item's invoice_id holds the storage key of the uploaded file.
    item.invoice_id = self._attachment_storage.to_storage_key(file_path)
    preview_meta = self._attachment_presentation.build_preview_meta(
        file_path=file_path,
        media_type=resolved_media_type,
        ocr_document=ocr_document,
    )
    # Metadata record persisted next to the file; the getattr defaults keep it
    # well-formed when no OCR document was chosen (ocr_document is None).
    meta = {
        "file_name": normalized_name,
        "storage_key": item.invoice_id,
        "media_type": resolved_media_type,
        "size_bytes": len(content),
        "uploaded_at": datetime.now(UTC).isoformat(),
        "previewable": bool(preview_meta["previewable"]),
        "preview_kind": str(preview_meta["preview_kind"]),
        "preview_storage_key": str(preview_meta["preview_storage_key"]),
        "preview_media_type": str(preview_meta["preview_media_type"]),
        "preview_file_name": str(preview_meta["preview_file_name"]),
        "preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
        "analysis": attachment_analysis,
        "document_info": document_info,
        "requirement_check": requirement_check,
        "ocr_status": ocr_status,
        "ocr_error": ocr_error,
        "ocr_text": str(getattr(ocr_document, "text", "") or ""),
        "ocr_summary": str(getattr(ocr_document, "summary", "") or ""),
        "ocr_avg_score": float(getattr(ocr_document, "avg_score", 0.0) or 0.0),
        "ocr_line_count": int(getattr(ocr_document, "line_count", 0) or 0),
        "ocr_classification_source": str(getattr(ocr_document, "classification_source", "") or ""),
        "ocr_classification_confidence": float(getattr(ocr_document, "classification_confidence", 0.0) or 0.0),
        # ``item`` in the comprehensions below is scoped to the comprehension
        # and does not rebind the claim item used by the rest of the method.
        "ocr_classification_evidence": [
            str(item)
            for item in getattr(ocr_document, "classification_evidence", []) or []
            if str(item).strip()
        ],
        "ocr_warnings": [str(item) for item in getattr(ocr_document, "warnings", []) or []],
        "source_receipt_id": str(source_receipt_id or "").strip(),
    }
    self._attachment_storage.write_meta(file_path, meta)
    # Register the stored file with the receipt folder, linked to this claim item.
    ReceiptFolderService().save_linked_attachment(
        file_path=file_path,
        media_type=resolved_media_type,
        document=ocr_document,
        current_user=current_user,
        claim_id=claim.id,
        claim_no=claim.claim_no,
        item_id=item.id,
        source_receipt_id=source_receipt_id,
    )
    self._sync_claim_from_items(claim)
    self._refresh_claim_pre_review_flags(claim, is_application_claim=False)
    self.db.commit()
    self.db.refresh(claim)
    # Audit entry is written after the commit, with before/after snapshots.
    self.audit_service.log_action(
        actor=current_user.name or current_user.username,
        action="expense_claim.attachment_upload",
        resource_type="expense_claim",
        resource_id=claim.id,
        before_json=before_json,
        after_json=self._serialize_claim(claim),
    )
    return {
        "message": f"{normalized_name} 已上传并关联到当前费用明细。",
        "claim_id": claim.id,
        "item_id": item.id,
        "invoice_id": item.invoice_id,
        "item_date": item.item_date.isoformat() if item.item_date else None,
        "item_type": item.item_type,
        "item_reason": item.item_reason,
        "item_location": item.item_location,
        "item_note": item.item_note,
        "item_amount": item.item_amount,
        "claim_amount": claim.amount,
        "claim_risk_flags": list(claim.risk_flags_json or []),
        "attachment": self._build_attachment_payload(item),
    }
def _resolve_source_receipt_document(
self,
*,
source_receipt_id: str,
current_user: CurrentUserContext,
fallback_filename: str,
fallback_media_type: str,
) -> SimpleNamespace | None:
normalized_receipt_id = str(source_receipt_id or "").strip()
if not normalized_receipt_id:
return None
try:
receipt = ReceiptFolderService().get_receipt(normalized_receipt_id, current_user)
except FileNotFoundError:
return None
raw_meta = receipt.raw_meta if isinstance(receipt.raw_meta, dict) else {}
fields = self._normalize_receipt_document_fields(
[field.model_dump() for field in list(receipt.fields or [])]
)
if not fields:
fields = self._normalize_receipt_document_fields(raw_meta.get("document_fields"))
document = SimpleNamespace(
filename=str(receipt.file_name or fallback_filename or "").strip(),
media_type=str(receipt.media_type or fallback_media_type or "application/octet-stream").strip(),
engine=str(receipt.engine or raw_meta.get("engine") or ""),
model=str(receipt.model or raw_meta.get("model") or ""),
text=str(receipt.ocr_text or raw_meta.get("ocr_text") or ""),
summary=str(receipt.summary or raw_meta.get("summary") or ""),
avg_score=float(receipt.avg_score or raw_meta.get("ocr_avg_score") or 0.0),
line_count=int(receipt.line_count or raw_meta.get("ocr_line_count") or 0),
page_count=max(1, int(receipt.page_count or raw_meta.get("page_count") or 1)),
document_type=str(receipt.document_type or raw_meta.get("document_type") or "other").strip(),
document_type_label=str(
receipt.document_type_label or raw_meta.get("document_type_label") or "其他单据"
).strip(),
scene_code=str(receipt.scene_code or raw_meta.get("scene_code") or "other").strip(),
scene_label=str(receipt.scene_label or raw_meta.get("scene_label") or "其他票据").strip(),
classification_source=str(raw_meta.get("ocr_classification_source") or "receipt_folder"),
classification_confidence=float(
receipt.classification_confidence
or raw_meta.get("ocr_classification_confidence")
or 0.0
),
classification_evidence=[
str(value)
for value in list(
receipt.classification_evidence
or raw_meta.get("ocr_classification_evidence")
or []
)
if str(value).strip()
],
document_fields=fields,
preview_kind=str(raw_meta.get("preview_kind") or ""),
preview_data_url="",
warnings=[
str(value)
for value in list(receipt.warnings or raw_meta.get("ocr_warnings") or [])
if str(value).strip()
],
)
return document if self._attachment_ocr_signal_score(document) > 0 else None
@staticmethod
def _normalize_receipt_document_fields(raw_fields: Any) -> list[dict[str, str]]:
fields: list[dict[str, str]] = []
for field in list(raw_fields or []):
if isinstance(field, dict):
key = str(field.get("key") or "").strip()
label = str(field.get("label") or "").strip()
value = str(field.get("value") or "").strip()
else:
key = str(getattr(field, "key", "") or "").strip()
label = str(getattr(field, "label", "") or "").strip()
value = str(getattr(field, "value", "") or "").strip()
if label and value:
fields.append({"key": key, "label": label, "value": value})
return fields
@classmethod
def _choose_attachment_ocr_document(
cls,
*,
source_receipt_document: Any | None,
upload_ocr_document: Any | None,
) -> Any | None:
source_score = cls._attachment_ocr_signal_score(source_receipt_document)
upload_score = cls._attachment_ocr_signal_score(upload_ocr_document)
if source_score <= 0:
return upload_ocr_document if upload_score > 0 else None
if upload_score <= 0:
return source_receipt_document
source_type = cls._attachment_document_type(source_receipt_document)
upload_type = cls._attachment_document_type(upload_ocr_document)
if source_type not in {"", "other"} and upload_type in {"", "other"}:
return source_receipt_document
if (
source_type == upload_type
and cls._attachment_document_field_count(source_receipt_document)
> cls._attachment_document_field_count(upload_ocr_document)
):
return source_receipt_document
if source_score > upload_score + 2:
return source_receipt_document
return upload_ocr_document
@classmethod
def _attachment_ocr_signal_score(cls, document: Any | None) -> int:
if document is None:
return 0
score = 0
document_type = cls._attachment_document_type(document)
if document_type not in {"", "other"}:
score += 4
score += min(3, cls._attachment_document_field_count(document))
if str(getattr(document, "text", "") or "").strip():
score += 2
if str(getattr(document, "summary", "") or "").strip():
score += 1
if int(getattr(document, "line_count", 0) or 0) > 0:
score += 1
return score
@staticmethod
def _attachment_document_type(document: Any | None) -> str:
return str(getattr(document, "document_type", "") or "").strip().lower()
@staticmethod
def _attachment_document_field_count(document: Any | None) -> int:
if document is None:
return 0
return len(list(getattr(document, "document_fields", []) or []))
def get_claim_item_attachment_meta(
self,
*,
claim_id: str,
item_id: str,
current_user: CurrentUserContext,
) -> dict[str, Any] | None:
claim, item = self._get_claim_item_or_raise(
claim_id=claim_id,
item_id=item_id,
current_user=current_user,
)
if claim is None:
return None
return self._build_attachment_payload(item)
def get_claim_item_attachment_content(
self,
*,
claim_id: str,
item_id: str,
current_user: CurrentUserContext,
) -> tuple[Path, str, str] | None:
claim, item = self._get_claim_item_or_raise(
claim_id=claim_id,
item_id=item_id,
current_user=current_user,
)
if claim is None:
return None
return self._resolve_item_attachment_content(item)
def get_claim_item_attachment_preview_content(
self,
*,
claim_id: str,
item_id: str,
current_user: CurrentUserContext,
) -> tuple[Path, str, str] | None:
claim, item = self._get_claim_item_or_raise(
claim_id=claim_id,
item_id=item_id,
current_user=current_user,
)
if claim is None:
return None
return self._resolve_item_attachment_preview_content(item)
def delete_claim_item_attachment(
self,
*,
claim_id: str,
item_id: str,
current_user: CurrentUserContext,
) -> dict[str, Any] | None:
claim, item = self._get_claim_item_or_raise(
claim_id=claim_id,
item_id=item_id,
current_user=current_user,
)
if claim is None:
return None
self._ensure_draft_claim(claim)
self._ensure_mutable_claim_item(item)
before_json = self._serialize_claim(claim)
previous_invoice_id = str(item.invoice_id or "").strip()
previous_name = self._attachment_presentation.resolve_display_name(item.invoice_id)
self._attachment_storage.delete_item_files(item)
item.invoice_id = None
claim.risk_flags_json = self._remove_deleted_attachment_risk_flags(
claim.risk_flags_json,
item_id=item.id,
invoice_id=previous_invoice_id,
)
self._sync_claim_from_items(claim)
self._refresh_claim_pre_review_flags(claim, is_application_claim=False)
self.db.commit()
self.db.refresh(claim)
self.audit_service.log_action(
actor=current_user.name or current_user.username,
action="expense_claim.attachment_delete",
resource_type="expense_claim",
resource_id=claim.id,
before_json=before_json,
after_json=self._serialize_claim(claim),
)
return {
"message": f"{previous_name or '附件'} 已删除。",
"claim_id": claim.id,
"item_id": item.id,
"invoice_id": item.invoice_id,
"claim_risk_flags": list(claim.risk_flags_json or []),
"attachment": None,
}
@staticmethod
def _remove_deleted_attachment_risk_flags(
risk_flags: Any,
*,
item_id: str | None,
invoice_id: str | None,
) -> list[Any]:
normalized_item_id = str(item_id or "").strip()
normalized_invoice_id = str(invoice_id or "").strip()
cleaned_flags: list[Any] = []
for flag in list(risk_flags or []):
if not isinstance(flag, dict):
cleaned_flags.append(flag)
continue
source = str(flag.get("source") or "").strip()
if source != "attachment_analysis":
cleaned_flags.append(flag)
continue
flag_item_id = str(flag.get("item_id") or flag.get("itemId") or "").strip()
flag_invoice_id = str(flag.get("invoice_id") or flag.get("invoiceId") or "").strip()
matches_deleted_item = bool(normalized_item_id and flag_item_id == normalized_item_id)
matches_deleted_invoice = bool(normalized_invoice_id and flag_invoice_id == normalized_invoice_id)
if matches_deleted_item or matches_deleted_invoice:
continue
cleaned_flags.append(flag)
return cleaned_flags
def _get_claim_item_or_raise(
self,
*,
claim_id: str,
item_id: str,
current_user: CurrentUserContext,
) -> tuple[ExpenseClaim | None, ExpenseClaimItem]:
claim = self.get_claim(claim_id, current_user)
if claim is None:
return None, None # type: ignore[return-value]
item = next((entry for entry in claim.items if entry.id == item_id), None)
if item is None:
raise LookupError("Item not found")
return claim, item
def _resolve_item_attachment_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
file_path = self._attachment_storage.resolve_item_path(item)
if file_path is None or not file_path.exists():
raise FileNotFoundError("Attachment not found")
metadata = self._attachment_storage.read_meta(file_path)
filename = str(metadata.get("file_name") or file_path.name)
media_type = self._attachment_presentation.resolve_media_type(
filename,
fallback=str(metadata.get("media_type") or ""),
)
return file_path, media_type, filename
def _repair_pdf_text_layer_metadata_if_needed(
self,
*,
file_path: Path,
metadata: dict[str, Any],
item: ExpenseClaimItem | None = None,
) -> dict[str, Any]:
if not metadata:
return metadata
media_type = str(metadata.get("media_type") or self._attachment_presentation.resolve_media_type(file_path.name)).strip()
if media_type != "application/pdf":
return metadata
ocr_text = str(metadata.get("ocr_text") or "")
ocr_summary = str(metadata.get("ocr_summary") or "")
if OcrService._placeholder_ratio(f"{ocr_summary}\n{ocr_text}") < 0.12:
return metadata
text_layer = OcrService(self.db)._extract_pdf_text_layer(file_path)
repaired_text, used_text_layer = OcrService._choose_document_text(
ocr_text=ocr_text,
text_layer=text_layer,
)
if not used_text_layer or not repaired_text:
return metadata
repaired_summary = OcrService._summarize_text(repaired_text)
document = SimpleNamespace(
filename=str(metadata.get("file_name") or file_path.name),
text=repaired_text,
summary=repaired_summary,
avg_score=float(metadata.get("ocr_avg_score") or 0.0),
line_count=int(metadata.get("ocr_line_count") or 0),
document_type="",
document_type_label="",
scene_code="",
scene_label="",
document_fields=[],
warnings=[str(value) for value in list(metadata.get("ocr_warnings") or []) if str(value).strip()],
)
document_info = self._build_attachment_document_info(document)
document.document_type = document_info.get("document_type", "")
document.document_type_label = document_info.get("document_type_label", "")
document.scene_code = document_info.get("scene_code", "")
document.scene_label = document_info.get("scene_label", "")
document.document_fields = list(document_info.get("fields") or [])
metadata["ocr_text"] = repaired_text
metadata["ocr_summary"] = repaired_summary
metadata["document_info"] = document_info
metadata["previewable"] = True
metadata["preview_kind"] = "pdf"
metadata["preview_storage_key"] = str(
metadata.get("storage_key") or self._attachment_storage.to_storage_key(file_path)
)
metadata["preview_media_type"] = "application/pdf"
metadata["preview_file_name"] = str(metadata.get("file_name") or file_path.name)
if item is not None:
requirement_check = self._build_attachment_requirement_check(
item=item,
document_info=document_info,
)
metadata["requirement_check"] = requirement_check
metadata["analysis"] = self._build_attachment_analysis(
document=document,
item=item,
claim=getattr(item, "claim", None),
document_info=document_info,
requirement_check=requirement_check,
)
self._attachment_storage.write_meta(file_path, metadata)
return metadata
def _refresh_pdf_attachment_preview_meta_if_needed(
self,
*,
file_path: Path,
metadata: dict[str, Any],
) -> dict[str, Any]:
if not metadata:
return metadata
media_type = str(
metadata.get("media_type")
or self._attachment_presentation.resolve_media_type(file_path.name)
).strip()
if media_type != "application/pdf":
return metadata
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None
if (
preview_path is not None
and preview_path.exists()
and str(metadata.get("preview_kind") or "").strip() == "image"
and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
):
return metadata
preview_name = str(metadata.get("preview_file_name") or "").strip()
if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
preview_path = file_path.parent / preview_name
try:
DocumentPreviewAssets.render_pdf_first_page(
pdf_path=file_path,
preview_path=preview_path,
timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
)
except Exception:
return metadata
metadata.update(
{
"previewable": True,
"preview_kind": "image",
"preview_storage_key": self._attachment_storage.to_storage_key(preview_path),
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
"preview_file_name": preview_path.name,
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
}
)
self._attachment_storage.write_meta(file_path, metadata)
return metadata
def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
file_path, media_type, filename = self._resolve_item_attachment_content(item)
metadata = self._attachment_storage.read_meta(file_path)
metadata = self._repair_pdf_text_layer_metadata_if_needed(
file_path=file_path,
metadata=metadata,
item=item,
)
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
file_path=file_path,
metadata=metadata,
)
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
preview_file_name = str(metadata.get("preview_file_name") or "").strip()
preview_media_type = str(metadata.get("preview_media_type") or "").strip()
if preview_storage_key:
preview_path = self._attachment_storage.resolve_path(preview_storage_key)
if preview_path is not None and preview_path.exists():
resolved_name = preview_file_name or preview_path.name
resolved_media_type = self._attachment_presentation.resolve_media_type(
resolved_name,
fallback=preview_media_type,
)
return preview_path, resolved_media_type, resolved_name
if self._attachment_presentation.is_previewable_media_type(media_type, filename):
return file_path, media_type, filename
raise FileNotFoundError("Attachment preview not found")