feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
caoxiaozhu
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions

View File

@@ -92,7 +92,7 @@ def preview_receipt(receipt_id: str, current_user: CurrentUser) -> FileResponse:
file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user) file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user)
except FileNotFoundError as exc: except FileNotFoundError as exc:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc
return FileResponse(file_path, media_type=media_type, filename=file_name) return FileResponse(file_path, media_type=media_type, filename=file_name, headers={"Cache-Control": "no-store"})
@router.get( @router.get(

View File

@@ -25,11 +25,15 @@ AMOUNT_PATTERNS = (
re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"), re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"), re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
) )
DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)") DATE_PATTERN = re.compile(
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
)
TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)") TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)")
INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[:\s]*([A-Za-z0-9-]{6,24})") INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[:\s]*([A-Za-z0-9-]{6,24})")
INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[:\s]*([A-Za-z0-9-]{6,24})") INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[:\s]*([A-Za-z0-9-]{6,24})")
TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[:\s]*([A-Za-z0-9]{2,12})") TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[:\s]*([A-Za-z0-9]{2,12})")
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Za-z0-9])([GCDZKTLYS]\d{1,5})(?![A-Za-z0-9])", re.IGNORECASE)
ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})") ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})")
MERCHANT_PATTERNS = ( MERCHANT_PATTERNS = (
re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[:\s]*([A-Za-z0-9\u4e00-\u9fa5()·&\\-]{2,40})"), re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[:\s]*([A-Za-z0-9\u4e00-\u9fa5()·&\\-]{2,40})"),
@@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
best_score = score best_score = score
if best_score <= 0: if best_score <= 0:
train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket")
if train_rule and _looks_like_train_ticket(compact_text):
return RuleMatch(
rule=train_rule,
confidence=0.82,
evidence=("车次", "12306"),
score=3.8,
)
return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0) return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)
confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12) confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12)
@@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
) )
def _looks_like_train_ticket(compact_text: str) -> bool:
text = str(compact_text or "").lower()
if not re.search(r"[gcdzktlys]\d{1,5}", text, flags=re.IGNORECASE):
return False
if "12306" in text or "95306" in text:
return True
if re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—||-)[\u4e00-\u9fa5]{2,12}", text):
return True
return "wuhan" in text and "shanghai" in text
def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None: def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
if not response_text: if not response_text:
return None return None
@@ -521,33 +544,48 @@ def _merge_document_fields(
def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]: def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
fields: list[DocumentField] = [] fields: list[DocumentField] = []
normalized_type = str(document_type or "").strip().lower()
def append_field(key: str, label: str, value: str) -> None:
cleaned = _clean_field_value(value)
if not cleaned:
return
if any(field.key == key for field in fields if field.key):
return
fields.append(DocumentField(key=key, label=label, value=cleaned))
amount = _extract_amount(text) amount = _extract_amount(text)
if amount: if amount:
fields.append(DocumentField(key="amount", label="金额", value=amount)) append_field("amount", "金额", amount)
date_value = _extract_date(text, document_type=document_type) date_value = _extract_date(text, document_type=document_type)
if date_value: if date_value:
fields.append(DocumentField(key="date", label="日期", value=date_value)) append_field("date", "日期", date_value)
merchant = _extract_merchant(text) merchant = _extract_merchant(text)
if merchant: if merchant:
fields.append(DocumentField(key="merchant_name", label="商户", value=merchant)) append_field("merchant_name", "商户", merchant)
invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text) invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text)
if invoice_number: if invoice_number:
fields.append(DocumentField(key="invoice_number", label="票据号码", value=invoice_number)) append_field("invoice_number", "票据号码", invoice_number)
invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text) invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text)
if invoice_code: if invoice_code:
fields.append(DocumentField(key="invoice_code", label="发票代码", value=invoice_code)) append_field("invoice_code", "发票代码", invoice_code)
trip_no = _extract_pattern(TRIP_NO_PATTERN, text) trip_no = _extract_pattern(TRIP_NO_PATTERN, text)
if not trip_no and normalized_type == "train_ticket":
trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text)
if trip_no: if trip_no:
fields.append(DocumentField(key="trip_no", label="车次/航班", value=trip_no)) append_field("trip_no", "车次/航班", trip_no.upper())
route = _extract_route(text) route = _extract_route(text)
if route: if route:
fields.append(DocumentField(key="route", label="行程", value=route)) append_field("route", "行程", route)
if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields):
append_field("amount", "金额", _extract_loose_decimal_amount(text))
return fields return fields
@@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
raw_value = str(match.group(1) or "").strip() raw_value = str(match.group(1) or "").strip()
normalized = raw_value.replace("", "-").replace("", "-").replace("", "") normalized = raw_value.replace("", "-").replace("", "-").replace("", "")
normalized = normalized.replace("/", "-").replace(".", "-") normalized = normalized.replace("/", "-").replace(".", "-")
normalized = re.sub(r"\s+", "-", normalized)
parts = [part for part in normalized.split("-") if part] parts = [part for part in normalized.split("-") if part]
if len(parts) != 3: if len(parts) != 3:
return raw_value return raw_value
@@ -703,6 +742,23 @@ def _extract_route(text: str) -> str:
return f"{start}-{end}" return f"{start}-{end}"
def _extract_loose_decimal_amount(text: str) -> str:
best_value: Decimal | None = None
for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", str(text or "")):
try:
candidate = Decimal(match.group(1)).quantize(Decimal("0.01"))
except InvalidOperation:
continue
if candidate <= Decimal("0.00"):
continue
if best_value is None or candidate > best_value:
best_value = candidate
if best_value is None:
return ""
text_value = format(best_value, "f").rstrip("0").rstrip(".")
return f"{text_value}"
def _extract_pattern(pattern: re.Pattern[str], text: str) -> str: def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
match = pattern.search(text) match = pattern.search(text)
if not match: if not match:

View File

@@ -0,0 +1,98 @@
from __future__ import annotations
import base64
import binascii
import mimetypes
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
class DocumentPreviewAssets:
    """Shared helpers for decoding and generating document preview files."""

    # Identifier stored next to a generated PDF preview so callers can tell
    # which renderer/settings produced it and regenerate when it differs.
    PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data"
    PDF_PREVIEW_MEDIA_TYPE = "image/png"
    PDF_PREVIEW_SUFFIX = ".png"

    @staticmethod
    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
        """Decode a base64 ``data:`` URL.

        Args:
            payload: A string of the form ``data:<media type>;base64,<body>``.

        Returns:
            ``(media_type, content)``, or ``None`` when the payload is not a
            base64 data URL or its body is not valid base64.
        """
        normalized = str(payload or "").strip()
        matched = re.match(
            r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$",
            normalized,
            flags=re.DOTALL,
        )
        if not matched:
            return None
        try:
            content = base64.b64decode(matched.group("body"), validate=True)
        except (binascii.Error, ValueError):
            return None
        return matched.group("media"), content

    @classmethod
    def renderer_id_for_source(cls, media_type: str | None) -> str:
        """Return the PDF renderer id for PDF sources, otherwise ``""``."""
        if str(media_type or "").strip() == "application/pdf":
            return cls.PDF_RENDERER_ID
        return ""

    @classmethod
    def write_data_url_preview(
        cls,
        *,
        preview_dir: Path,
        preview_name_stem: str,
        preview_data_url: str,
    ) -> tuple[Path, str, str] | None:
        """Write the content of a data URL into *preview_dir*.

        Args:
            preview_dir: Directory that receives the preview file.
            preview_name_stem: File name without extension, e.g. ``"preview"``
                or ``"invoice.preview"``. Directory components are dropped.
            preview_data_url: Base64 data URL holding the preview content.

        Returns:
            ``(preview_path, media_type, preview_file_name)``, or ``None`` when
            the data URL cannot be decoded (nothing is written in that case).
        """
        decoded = cls.decode_data_url(preview_data_url)
        if decoded is None:
            return None

        preview_media_type, preview_content = decoded
        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
        # Use ``.name`` rather than ``.stem``: a stem such as "invoice.preview"
        # must keep its ".preview" marker. ``.stem`` would reduce it to
        # "invoice", so the preview could be written as "invoice.png" and
        # overwrite an original attachment of the same name.
        preview_name = f"{Path(preview_name_stem).name}{suffix}"
        preview_path = preview_dir / preview_name
        preview_path.write_bytes(preview_content)
        return preview_path, preview_media_type, preview_name

    @classmethod
    def render_pdf_first_page(
        cls,
        *,
        pdf_path: Path,
        preview_path: Path,
        timeout_seconds: int | float,
    ) -> Path:
        """Render the first page of a PDF to a PNG file using ``pdftoppm``.

        The page is rendered into a temporary directory created next to
        *preview_path* and then copied to *preview_path*.

        Args:
            pdf_path: Source PDF.
            preview_path: Destination PNG; its parent directory is created.
            timeout_seconds: Timeout passed to ``subprocess.run``.

        Returns:
            *preview_path*.

        Raises:
            RuntimeError: If ``pdftoppm`` exits with a non-zero status or
                produces no image. Errors raised by ``subprocess.run`` itself
                (for example a timeout) propagate unchanged.
        """
        preview_path.parent.mkdir(parents=True, exist_ok=True)
        with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir:
            prefix = Path(temp_dir) / "page"
            completed = subprocess.run(
                [
                    "pdftoppm",
                    "-png",
                    "-r",
                    "160",
                    # Only the first page is kept, so do not rasterise the rest.
                    "-f",
                    "1",
                    "-l",
                    "1",
                    str(pdf_path),
                    str(prefix),
                ],
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                check=False,
            )
            if completed.returncode != 0:
                detail = (completed.stderr or completed.stdout or "").strip()
                raise RuntimeError(detail or "pdftoppm failed to render PDF preview.")

            # pdftoppm appends "-<page number>" (possibly zero-padded) to the prefix.
            pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key)
            if not pages:
                raise RuntimeError("pdftoppm did not generate a preview image.")
            shutil.copyfile(pages[0], preview_path)
        return preview_path

    @staticmethod
    def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
        """Sort key ordering rendered pages by their numeric suffix."""
        suffix = path.stem.rsplit("-", 1)[-1]
        try:
            return int(suffix), path.name
        except ValueError:
            return 0, path.name

View File

@@ -336,7 +336,27 @@ class ExpenseClaimAttachmentAnalysisMixin:
@staticmethod @staticmethod
def _has_date_like_text(text: str) -> bool: def _has_date_like_text(text: str) -> bool:
return bool(re.search(r"(20\d{2}[年/\-.]\d{1,2}[月/\-.]\d{1,2}日?)", text)) return bool(re.search(r"(20\d{2}(?:[年/\-.]|\s+)\d{1,2}(?:[月/\-.]|\s+)\d{1,2}日?)", text))
@staticmethod
def _has_document_date_field(document_info: dict[str, Any]) -> bool:
    """Tell whether *document_info* carries a non-empty date-like field.

    A field counts as a date field when its key (lower-cased, underscores
    removed) is one of the trip/generic/invoice date keys, or when its label
    (spaces removed) contains one of the corresponding label tokens. Entries
    that are not dicts or whose value is blank are ignored.
    """
    known_date_keys = DOCUMENT_TRIP_DATE_KEYS | DOCUMENT_GENERIC_DATE_KEYS | DOCUMENT_INVOICE_DATE_KEYS
    label_markers = (
        *DOCUMENT_TRIP_DATE_LABEL_TOKENS,
        *DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
        *DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
    )

    def is_date_entry(entry: Any) -> bool:
        if not isinstance(entry, dict):
            return False
        if not str(entry.get("value") or "").strip():
            return False
        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        if normalized_key in known_date_keys:
            return True
        return any(marker in compact_label for marker in label_markers)

    return any(is_date_entry(entry) for entry in list(document_info.get("fields") or []))
@staticmethod @staticmethod
def _normalize_match_text(text: str) -> str: def _normalize_match_text(text: str) -> str:
@@ -538,6 +558,12 @@ class ExpenseClaimAttachmentAnalysisMixin:
recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据" recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据"
requirement_matches = bool(requirement_check.get("matches")) requirement_matches = bool(requirement_check.get("matches"))
mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high" mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high"
document_fields = [
field
for field in list(document_info.get("fields") or [])
if isinstance(field, dict) and str(field.get("value") or "").strip()
]
has_readable_content = bool(line_count > 0 or compact_text or document_fields)
has_ticket_keyword = any( has_ticket_keyword = any(
keyword in compact_text keyword in compact_text
@@ -556,15 +582,18 @@ class ExpenseClaimAttachmentAnalysisMixin:
) )
) )
amount_candidates = self._extract_amount_candidates(text) amount_candidates = self._extract_amount_candidates(text)
field_amount = self._resolve_document_field_amount({"document_fields": document_fields})
if field_amount is not None and field_amount not in amount_candidates:
amount_candidates.insert(0, field_amount)
item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01")) item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01"))
has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates) has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates)
has_date_text = self._has_date_like_text(text) has_date_text = self._has_date_like_text(text) or self._has_document_date_field(document_info)
amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount
points: list[str] = [] points: list[str] = []
if warnings: if warnings:
points.append(f"识别提示:{warnings[0]}") points.append(f"识别提示:{warnings[0]}")
if line_count == 0 or not compact_text: if not has_readable_content:
points.append("附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。") points.append("附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。")
if recognized_document_type == "other" and not has_ticket_keyword: if recognized_document_type == "other" and not has_ticket_keyword:
points.append("票据类型:未识别到发票、票据、电子行程单等关键字,暂无法判断票据类型。") points.append("票据类型:未识别到发票、票据、电子行程单等关键字,暂无法判断票据类型。")
@@ -617,8 +646,7 @@ class ExpenseClaimAttachmentAnalysisMixin:
headline = "AI提示住宿金额超出报销标准" headline = "AI提示住宿金额超出报销标准"
summary = "当前住宿票据金额超过规则中心差旅住宿标准,已作为风险项保留在单据中;如需按特殊情况提交,请补充超标原因。" summary = "当前住宿票据金额超过规则中心差旅住宿标准,已作为风险项保留在单据中;如需按特殊情况提交,请补充超标原因。"
elif ( elif (
line_count == 0 not has_readable_content
or not compact_text
or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2) or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2)
or (not requirement_matches and mismatch_severity == "high") or (not requirement_matches and mismatch_severity == "high")
or (purpose_mismatch_point and amount_mismatch) or (purpose_mismatch_point and amount_mismatch)

View File

@@ -119,6 +119,13 @@ class ExpenseClaimAttachmentDocumentMixin:
metadata=metadata, metadata=metadata,
item=item, item=item,
) )
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
file_path=file_path,
metadata=metadata,
)
if self._attachment_metadata_needs_analysis_refresh(metadata):
self._refresh_item_attachment_analysis(item)
metadata = self._attachment_storage.read_meta(file_path)
uploaded_at_value = metadata.get("uploaded_at") uploaded_at_value = metadata.get("uploaded_at")
uploaded_at = None uploaded_at = None
if isinstance(uploaded_at_value, str) and uploaded_at_value.strip(): if isinstance(uploaded_at_value, str) and uploaded_at_value.strip():
@@ -157,6 +164,68 @@ class ExpenseClaimAttachmentDocumentMixin:
"requirement_check": requirement_check, "requirement_check": requirement_check,
} }
@classmethod
def _attachment_metadata_needs_analysis_refresh(cls, metadata: dict[str, Any]) -> bool:
    """Decide whether the stored attachment analysis should be recomputed.

    * No analysis dict stored: refresh if the metadata has any OCR signal.
    * Analysis says no readable text was found: refresh if the metadata now
      has a readable signal.
    * Analysis says a departure time / invoice date was missing: refresh if
      the metadata now has a date field.
    * Otherwise no refresh is needed.
    """
    analysis = metadata.get("analysis")
    if not isinstance(analysis, dict):
        return cls._attachment_metadata_has_ocr_signal(metadata)

    # Keep only non-blank analysis points, normalised to stripped strings.
    points: list[str] = []
    for raw_point in list(analysis.get("points") or []):
        point_text = str(raw_point or "").strip()
        if point_text:
            points.append(point_text)

    if any("未识别到有效文字" in point_text for point_text in points):
        return cls._attachment_metadata_has_readable_signal(metadata)

    missing_date_markers = ("未识别到列车出发时间", "未识别到开票日期")
    for point_text in points:
        if any(marker in point_text for marker in missing_date_markers):
            return cls._attachment_metadata_has_date_field(metadata)
    return False
@classmethod
def _attachment_metadata_has_ocr_signal(cls, metadata: dict[str, Any]) -> bool:
    """Return True when the metadata holds any OCR output.

    Checked in order: non-blank ``ocr_text``, non-blank ``ocr_summary``,
    a positive ``ocr_line_count``, or at least one populated document field.
    """
    if str(metadata.get("ocr_text") or "").strip():
        return True
    if str(metadata.get("ocr_summary") or "").strip():
        return True
    if int(metadata.get("ocr_line_count") or 0) > 0:
        return True
    return bool(cls._attachment_metadata_document_fields(metadata))
@classmethod
def _attachment_metadata_has_readable_signal(cls, metadata: dict[str, Any]) -> bool:
    """Return True when the metadata shows that readable content exists.

    Readable content means non-blank ``ocr_text`` or ``ocr_summary``, a
    positive ``ocr_line_count``, or at least one populated document field.
    """
    for text_key in ("ocr_text", "ocr_summary"):
        if str(metadata.get(text_key) or "").strip():
            return True
    line_count = int(metadata.get("ocr_line_count") or 0)
    return line_count > 0 or bool(cls._attachment_metadata_document_fields(metadata))
@staticmethod
def _attachment_metadata_document_fields(metadata: dict[str, Any]) -> list[dict[str, Any]]:
document_info = metadata.get("document_info")
if not isinstance(document_info, dict):
return []
return [
field
for field in list(document_info.get("fields") or [])
if isinstance(field, dict) and str(field.get("value") or "").strip()
]
@classmethod
def _attachment_metadata_has_date_field(cls, metadata: dict[str, Any]) -> bool:
    """Return True when a populated document field looks like a date.

    A field matches when its key (lower-cased, underscores removed) is one of
    the known date keys, or its label (spaces removed) contains a date-related
    token.
    """
    date_keys = {"date", "tripdate", "departuredate", "invoicedate"}
    label_tokens = ("日期", "时间", "出发")
    for entry in cls._attachment_metadata_document_fields(metadata):
        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        matches_key = normalized_key in date_keys
        if matches_key or any(token in compact_label for token in label_tokens):
            return True
    return False
def _build_attachment_document_info(self, document: Any) -> dict[str, Any]: def _build_attachment_document_info(self, document: Any) -> dict[str, Any]:
insight = build_document_insight( insight = build_document_insight(
filename=str(getattr(document, "filename", "") or ""), filename=str(getattr(document, "filename", "") or ""),

View File

@@ -32,6 +32,7 @@ from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
from app.services.agent_foundation import AgentFoundationService from app.services.agent_foundation import AgentFoundationService
from app.services.audit import AuditLogService from app.services.audit import AuditLogService
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import build_document_insight from app.services.document_intelligence import build_document_insight
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
@@ -238,6 +239,7 @@ class ExpenseClaimAttachmentOperationsMixin:
"preview_storage_key": str(preview_meta["preview_storage_key"]), "preview_storage_key": str(preview_meta["preview_storage_key"]),
"preview_media_type": str(preview_meta["preview_media_type"]), "preview_media_type": str(preview_meta["preview_media_type"]),
"preview_file_name": str(preview_meta["preview_file_name"]), "preview_file_name": str(preview_meta["preview_file_name"]),
"preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
"analysis": attachment_analysis, "analysis": attachment_analysis,
"document_info": document_info, "document_info": document_info,
"requirement_check": requirement_check, "requirement_check": requirement_check,
@@ -673,6 +675,60 @@ class ExpenseClaimAttachmentOperationsMixin:
self._attachment_storage.write_meta(file_path, metadata) self._attachment_storage.write_meta(file_path, metadata)
return metadata return metadata
def _refresh_pdf_attachment_preview_meta_if_needed(
self,
*,
file_path: Path,
metadata: dict[str, Any],
) -> dict[str, Any]:
if not metadata:
return metadata
media_type = str(
metadata.get("media_type")
or self._attachment_presentation.resolve_media_type(file_path.name)
).strip()
if media_type != "application/pdf":
return metadata
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None
if (
preview_path is not None
and preview_path.exists()
and str(metadata.get("preview_kind") or "").strip() == "image"
and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
):
return metadata
preview_name = str(metadata.get("preview_file_name") or "").strip()
if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
preview_path = file_path.parent / preview_name
try:
DocumentPreviewAssets.render_pdf_first_page(
pdf_path=file_path,
preview_path=preview_path,
timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
)
except Exception:
return metadata
metadata.update(
{
"previewable": True,
"preview_kind": "image",
"preview_storage_key": self._attachment_storage.to_storage_key(preview_path),
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
"preview_file_name": preview_path.name,
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
}
)
self._attachment_storage.write_meta(file_path, metadata)
return metadata
def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]: def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
file_path, media_type, filename = self._resolve_item_attachment_content(item) file_path, media_type, filename = self._resolve_item_attachment_content(item)
metadata = self._attachment_storage.read_meta(file_path) metadata = self._attachment_storage.read_meta(file_path)
@@ -681,6 +737,10 @@ class ExpenseClaimAttachmentOperationsMixin:
metadata=metadata, metadata=metadata,
item=item, item=item,
) )
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
file_path=file_path,
metadata=metadata,
)
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip() preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
preview_file_name = str(metadata.get("preview_file_name") or "").strip() preview_file_name = str(metadata.get("preview_file_name") or "").strip()
preview_media_type = str(metadata.get("preview_media_type") or "").strip() preview_media_type = str(metadata.get("preview_media_type") or "").strip()

View File

@@ -1,13 +1,11 @@
from __future__ import annotations from __future__ import annotations
import base64
import binascii
import mimetypes import mimetypes
import re
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from urllib.parse import quote from urllib.parse import quote
from app.services.document_preview import DocumentPreviewAssets
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
@@ -42,6 +40,7 @@ class ExpenseClaimAttachmentPresentation:
"preview_storage_key": self.storage.to_storage_key(preview_path), "preview_storage_key": self.storage.to_storage_key(preview_path),
"preview_media_type": preview_media_type, "preview_media_type": preview_media_type,
"preview_file_name": preview_file_name, "preview_file_name": preview_file_name,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
} }
if preview_kind: if preview_kind:
@@ -51,6 +50,7 @@ class ExpenseClaimAttachmentPresentation:
"preview_storage_key": storage_key, "preview_storage_key": storage_key,
"preview_media_type": media_type, "preview_media_type": media_type,
"preview_file_name": filename, "preview_file_name": filename,
"preview_rendered_with": "",
} }
return { return {
@@ -59,6 +59,7 @@ class ExpenseClaimAttachmentPresentation:
"preview_storage_key": "", "preview_storage_key": "",
"preview_media_type": "", "preview_media_type": "",
"preview_file_name": "", "preview_file_name": "",
"preview_rendered_with": "",
} }
@staticmethod @staticmethod
@@ -72,15 +73,7 @@ class ExpenseClaimAttachmentPresentation:
@staticmethod @staticmethod
def decode_data_url(payload: str) -> tuple[str, bytes] | None: def decode_data_url(payload: str) -> tuple[str, bytes] | None:
normalized = str(payload or "").strip() return DocumentPreviewAssets.decode_data_url(payload)
matched = re.match(r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$", normalized, flags=re.DOTALL)
if not matched:
return None
try:
content = base64.b64decode(matched.group("body"), validate=True)
except (binascii.Error, ValueError):
return None
return matched.group("media"), content
def _write_preview_asset_from_data_url( def _write_preview_asset_from_data_url(
self, self,
@@ -89,16 +82,11 @@ class ExpenseClaimAttachmentPresentation:
original_filename: str, original_filename: str,
preview_data_url: str, preview_data_url: str,
) -> tuple[Path, str, str] | None: ) -> tuple[Path, str, str] | None:
decoded = self.decode_data_url(preview_data_url) return DocumentPreviewAssets.write_data_url_preview(
if decoded is None: preview_dir=attachment_dir,
return None preview_name_stem=f"{Path(original_filename).stem}.preview",
preview_data_url=preview_data_url,
preview_media_type, preview_content = decoded )
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
preview_name = f"{Path(original_filename).stem}.preview{suffix}"
preview_path = attachment_dir / preview_name
preview_path.write_bytes(preview_content)
return preview_path, preview_media_type, preview_name
@staticmethod @staticmethod
def build_preview_client_path(claim_id: str, item_id: str) -> str: def build_preview_client_path(claim_id: str, item_id: str) -> str:

View File

@@ -537,7 +537,7 @@ class OcrService:
if page_summary: if page_summary:
aggregated.summary_fragments.append(page_summary) aggregated.summary_fragments.append(page_summary)
page_text = str(payload.get("text", "") or "").strip() page_text = self._resolve_worker_document_text(payload)
if page_text: if page_text:
aggregated.text_fragments.append(page_text) aggregated.text_fragments.append(page_text)
@@ -626,6 +626,22 @@ class OcrService:
return descriptor.text_layer return descriptor.text_layer
return "" return ""
@staticmethod
def _resolve_worker_document_text(payload: dict) -> str:
for key in ("text", "ocr_text", "raw_text", "full_text"):
value = str(payload.get(key, "") or "").strip()
if value:
return value
lines = payload.get("lines", [])
if not isinstance(lines, list):
return ""
return "\n".join(
str(item.get("text", "") or "").strip()
for item in lines
if isinstance(item, dict) and str(item.get("text", "") or "").strip()
).strip()
@staticmethod @staticmethod
def _build_lines( def _build_lines(
items: list[dict], items: list[dict],

View File

@@ -12,7 +12,7 @@ from uuid import uuid4
from app.api.deps import CurrentUserContext from app.api.deps import CurrentUserContext
from app.core.config import get_settings from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.schemas.receipt_folder import ( from app.schemas.receipt_folder import (
ReceiptFolderDeleteResponse, ReceiptFolderDeleteResponse,
ReceiptFolderDetailRead, ReceiptFolderDetailRead,
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
ReceiptFolderItemRead, ReceiptFolderItemRead,
ReceiptFolderUpdate, ReceiptFolderUpdate,
) )
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import build_document_insight
from app.services.ocr import SUPPORTED_SUFFIXES from app.services.ocr import SUPPORTED_SUFFIXES
RECEIPT_DATE_PATTERN = re.compile( RECEIPT_DATE_PATTERN = re.compile(
r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)" r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
) )
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)") RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)")
TRAIN_INVOICE_DATE_PATTERN = re.compile( TRAIN_INVOICE_DATE_PATTERN = re.compile(
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:]?\s*([0-9]{1,2}\s*车?)") TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:]?\s*([0-9]{1,2}\s*车?)")
TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE) TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE) TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)") TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
class ReceiptFolderStorageMixin: class ReceiptFolderStorageMixin:
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
document: Any | None, document: Any | None,
) -> dict[str, Any]: ) -> dict[str, Any]:
preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip() preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url) preview_asset = DocumentPreviewAssets.write_data_url_preview(
if decoded is not None: preview_dir=receipt_dir,
preview_media_type, preview_content = decoded preview_name_stem="preview",
suffix = mimetypes.guess_extension(preview_media_type) or ".bin" preview_data_url=preview_data_url,
preview_name = f"preview{suffix}" )
preview_path = receipt_dir / preview_name if preview_asset is not None:
preview_path.write_bytes(preview_content) _, preview_media_type, preview_name = preview_asset
return { return {
"previewable": True, "previewable": True,
"preview_kind": "image", "preview_kind": "image",
"preview_file_name": preview_name, "preview_file_name": preview_name,
"preview_media_type": preview_media_type, "preview_media_type": preview_media_type,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
} }
if self._is_previewable(media_type): if self._is_previewable(media_type):
return { return {
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
"preview_kind": "image" if media_type.startswith("image/") else "pdf", "preview_kind": "image" if media_type.startswith("image/") else "pdf",
"preview_file_name": source_path.name, "preview_file_name": source_path.name,
"preview_media_type": media_type, "preview_media_type": media_type,
"preview_rendered_with": "",
} }
return { return {
"previewable": False, "previewable": False,
"preview_kind": "", "preview_kind": "",
"preview_file_name": "", "preview_file_name": "",
"preview_media_type": "", "preview_media_type": "",
"preview_rendered_with": "",
} }
def _refresh_pdf_preview_asset_if_needed(
self,
*,
receipt_dir: Path,
meta: dict[str, Any],
) -> dict[str, Any]:
source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
if not source_name:
return meta
source_path = self._assert_child(receipt_dir / source_name)
source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
if source_media_type != "application/pdf" or not source_path.exists():
return meta
preview_name = str(meta.get("preview_file_name") or "").strip()
preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
if (
preview_path is not None
and preview_path.exists()
and str(meta.get("preview_kind") or "").strip() == "image"
and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
):
return meta
if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
preview_path = self._assert_child(receipt_dir / preview_name)
try:
DocumentPreviewAssets.render_pdf_first_page(
pdf_path=source_path,
preview_path=preview_path,
timeout_seconds=get_settings().ocr_timeout_seconds,
)
except Exception:
return meta
meta.update(
{
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_path.name,
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
}
)
self._write_meta(receipt_dir, meta)
return meta
@staticmethod @staticmethod
def _is_previewable(media_type: str) -> bool: def _is_previewable(media_type: str) -> bool:
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf" return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead: def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
receipt_id = str(meta.get("id") or "").strip() receipt_id = str(meta.get("id") or "").strip()
status_value = str(meta.get("status") or "unlinked").strip() or "unlinked" status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
identity = self._resolve_receipt_document_identity(meta)
return ReceiptFolderItemRead( return ReceiptFolderItemRead(
id=receipt_id, id=receipt_id,
file_name=str(meta.get("file_name") or ""), file_name=str(meta.get("file_name") or ""),
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
size_bytes=int(meta.get("size_bytes") or 0), size_bytes=int(meta.get("size_bytes") or 0),
status=status_value, status=status_value,
status_label="已关联" if status_value == "linked" else "未关联", status_label="已关联" if status_value == "linked" else "未关联",
document_type=str(meta.get("document_type") or "other"), document_type=identity["document_type"],
document_type_label=str(meta.get("document_type_label") or "其他单据"), document_type_label=identity["document_type_label"],
scene_code=str(meta.get("scene_code") or "other"), scene_code=identity["scene_code"],
scene_label=str(meta.get("scene_label") or "其他票据"), scene_label=identity["scene_label"],
summary=str(meta.get("summary") or ""), summary=str(meta.get("summary") or ""),
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")), amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
document_date=self._resolve_receipt_document_date(meta), document_date=self._resolve_receipt_document_date(meta),
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()], warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
) )
def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
document_type = str(meta.get("document_type") or "other").strip() or "other"
document_type_label = str(meta.get("document_type_label") or "其他单据").strip() or "其他单据"
scene_code = str(meta.get("scene_code") or "other").strip() or "other"
scene_label = str(meta.get("scene_label") or "其他票据").strip() or "其他票据"
if document_type not in {"", "other"} and document_type_label != "其他单据":
return {
"document_type": document_type,
"document_type_label": document_type_label,
"scene_code": scene_code,
"scene_label": scene_label,
}
insight = build_document_insight(
filename=str(meta.get("file_name") or ""),
summary=str(meta.get("summary") or ""),
text=self._receipt_text(meta),
)
if insight.document_type in {"", "other"}:
return {
"document_type": document_type,
"document_type_label": document_type_label,
"scene_code": scene_code,
"scene_label": scene_label,
}
return {
"document_type": insight.document_type,
"document_type_label": insight.document_type_label,
"scene_code": insight.scene_code,
"scene_label": insight.scene_label,
}
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]: def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
fields = [ fields = [
ReceiptFolderFieldRead( ReceiptFolderFieldRead(
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
if str(document_type or "").strip().lower() == "train_ticket": if str(document_type or "").strip().lower() == "train_ticket":
return True return True
compact = "".join([document_type_label, scene_label, text]).replace(" ", "") compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")) if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
return True
lower_compact = compact.lower()
return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
"12306" in compact
or "95306" in compact
or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—||-)[\u4e00-\u9fa5]{2,12}", compact)
or ("wuhan" in lower_compact and "shanghai" in lower_compact)
)
@classmethod @classmethod
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool: def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
return raw return raw
normalized = match.group(1).replace("", "-").replace("", "-").replace("", "") normalized = match.group(1).replace("", "-").replace("", "-").replace("", "")
normalized = normalized.replace("/", "-").replace(".", "-") normalized = normalized.replace("/", "-").replace(".", "-")
normalized = re.sub(r"\s+", "-", normalized)
parts = [part for part in normalized.split("-") if part] parts = [part for part in normalized.split("-") if part]
if len(parts) != 3: if len(parts) != 3:
return match.group(1) return match.group(1)
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip() cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
if not 2 <= len(cleaned) <= 8: if not 2 <= len(cleaned) <= 8:
return "" return ""
if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")): if any(
token in cleaned
for token in (
"电子",
"客票",
"铁路",
"发票",
"税务",
"湖北省",
"中国铁路",
"开票",
"日期",
"车厢",
"座位",
"票价",
"金额",
"行程",
"出发",
"到达",
"车次",
)
):
return "" return ""
return cleaned return cleaned
@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
labeled = cls._extract_first(TRAIN_ID_PATTERN, text) labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
if labeled: if labeled:
return labeled return labeled
fallback = ""
for line in str(text or "").replace("\r", "\n").splitlines(): for line in str(text or "").replace("\r", "\n").splitlines():
compact_line = line.replace(" ", "") compact_line = line.replace(" ", "")
if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")): if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
continue continue
match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line) match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
if match: if not match:
return str(match.group(1) or "").strip() continue
return "" candidate = str(match.group(1) or "").strip()
if "*" in candidate:
return candidate
if not fallback:
fallback = candidate
return fallback
@staticmethod @staticmethod
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]: def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
combined_match = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or "")) combined_match = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or ""))
if combined_match: if combined_match:
return f"{combined_match.group(1)}", combined_match.group(2) return f"{combined_match.group(1)}", combined_match.group(2)
loose_match = TRAIN_LOOSE_SEAT_PATTERN.search(str(text or ""))
if loose_match:
return f"{loose_match.group(1).zfill(2)}", loose_match.group(2).upper()
carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "") carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text) seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text)
return carriage_no, seat_no return carriage_no, seat_no
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
@staticmethod @staticmethod
def _extract_train_fare(text: str) -> str: def _extract_train_fare(text: str) -> str:
match = TRAIN_FARE_PATTERN.search(str(text or "")) match = TRAIN_FARE_PATTERN.search(str(text or ""))
if not match:
match = max(
list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
default=None,
)
if not match: if not match:
return "" return ""
value = str(match.group(1) or "").replace(",", ".").strip() value = str(match.group(1) or "").replace(",", ".").strip()
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
) )
if existing_receipt is not None: if existing_receipt is not None:
enriched.append( enriched.append(
document.model_copy( self._enrich_ocr_document_with_receipt(
update={ document,
"receipt_id": existing_receipt.id, receipt=existing_receipt,
"receipt_status": existing_receipt.status, current_user=current_user,
"receipt_preview_url": existing_receipt.preview_url,
"receipt_source_url": existing_receipt.source_url,
}
) )
) )
continue continue
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
warning = "已上传过同样的单据,请不要重复上传。" warning = "已上传过同样的单据,请不要重复上传。"
existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()] existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
enriched.append( enriched.append(
document.model_copy( self._enrich_ocr_document_with_receipt(
update={ document,
"receipt_id": duplicate_receipt.id, receipt=duplicate_receipt,
"receipt_status": duplicate_receipt.status, current_user=current_user,
"receipt_preview_url": duplicate_receipt.preview_url, extra_warnings=[*existing_warnings, warning],
"receipt_source_url": duplicate_receipt.source_url,
"warnings": list(dict.fromkeys([*existing_warnings, warning])),
}
) )
) )
continue continue
@@ -763,16 +893,77 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
current_user=current_user, current_user=current_user,
) )
enriched.append( enriched.append(
document.model_copy( self._enrich_ocr_document_with_receipt(
update={ document,
receipt=receipt,
current_user=current_user,
)
)
return result.model_copy(update={"documents": enriched})
def _enrich_ocr_document_with_receipt(
self,
document: OcrRecognizeDocumentRead,
*,
receipt: ReceiptFolderItemRead,
current_user: CurrentUserContext,
extra_warnings: list[str] | None = None,
) -> OcrRecognizeDocumentRead:
update: dict[str, Any] = {
"receipt_id": receipt.id, "receipt_id": receipt.id,
"receipt_status": receipt.status, "receipt_status": receipt.status,
"receipt_preview_url": receipt.preview_url, "receipt_preview_url": receipt.preview_url,
"receipt_source_url": receipt.source_url, "receipt_source_url": receipt.source_url,
} }
try:
meta = self._read_receipt_meta(receipt.id, current_user)
except FileNotFoundError:
meta = {}
if meta:
update.update(
{
"text": str(meta.get("ocr_text") or document.text or ""),
"summary": str(meta.get("summary") or document.summary or ""),
"document_type": str(meta.get("document_type") or document.document_type or "other"),
"document_type_label": str(meta.get("document_type_label") or document.document_type_label or "其他单据"),
"scene_code": str(meta.get("scene_code") or document.scene_code or "other"),
"scene_label": str(meta.get("scene_label") or document.scene_label or "其他票据"),
"classification_source": str(meta.get("ocr_classification_source") or document.classification_source or ""),
"classification_confidence": float(
meta.get("ocr_classification_confidence")
or document.classification_confidence
or 0.0
),
"classification_evidence": [
str(value)
for value in list(meta.get("ocr_classification_evidence") or document.classification_evidence or [])
if str(value).strip()
],
"document_fields": self._build_ocr_document_fields_from_meta(meta),
}
) )
warnings = [
str(item)
for item in list(extra_warnings if extra_warnings is not None else document.warnings or [])
if str(item).strip()
]
if warnings:
update["warnings"] = list(dict.fromkeys(warnings))
return document.model_copy(update=update)
def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
return [
OcrRecognizeFieldRead(
key=field.key,
label=field.label,
value=field.value,
) )
return result.model_copy(update={"documents": enriched}) for field in self._resolve_fields(meta)
if field.label and field.value
]
def save_receipt( def save_receipt(
self, self,
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]: def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
meta = self._read_receipt_meta(receipt_id, current_user) meta = self._read_receipt_meta(receipt_id, current_user)
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id) receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
preview_name = str(meta.get("preview_file_name") or "").strip() preview_name = str(meta.get("preview_file_name") or "").strip()
if preview_name: if preview_name:
preview_path = self._assert_child(receipt_dir / preview_name) preview_path = self._assert_child(receipt_dir / preview_name)
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
if self._is_previewable(source_media_type): if self._is_previewable(source_media_type):
return source_path, source_media_type, source_name return source_path, source_media_type, source_name
raise FileNotFoundError("Receipt preview not found") raise FileNotFoundError("Receipt preview not found")

View File

@@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
assert any(field.label == "金额" and field.value == "354元" for field in insight.fields) assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)
def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
    """A ticket whose OCR text holds digits and romanised station names is still a train ticket."""
    ocr_text = (
        ":26429165800002785705\n"
        ":2026 05 18\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    insight = build_document_insight(
        filename="2月20_武汉-上海.pdf",
        summary=":26429165800002785705:2026 05 18Wuhan Shanghaihongqiao G458",
        text=ocr_text,
    )

    classification = (insight.document_type, insight.document_type_label, insight.scene_code)
    assert classification == ("train_ticket", "火车/高铁票", "travel")

    values_by_label = {}
    for extracted in insight.fields:
        values_by_label[extracted.label] = extracted.value

    expected_values = {
        "金额": "354元",
        "列车出发时间": "2026-02-20 07:55",
        "车次/航班": "G458",
        "行程": "武汉-上海",
    }
    for label, expected in expected_values.items():
        assert values_by_label[label] == expected
def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None: def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
insight = build_document_insight( insight = build_document_insight(
filename="铁路电子客票.pdf", filename="铁路电子客票.pdf",

View File

@@ -0,0 +1,169 @@
from __future__ import annotations
import json
from decimal import Decimal
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.ocr import OcrService
from test_reimbursement_endpoints import build_client, seed_claim
def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
    monkeypatch,
    tmp_path,
) -> None:
    """Uploading a train-ticket PDF with stubbed OCR output yields a passing analysis."""
    ocr_text = (
        ":26429165800002785705\n"
        ":2026 05 18\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        # Stand-in for the OCR service: one document with zero score and zero lines.
        recognized = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)

    client, session_factory = build_client()
    with session_factory() as db:
        claim, item = seed_claim(db)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        db.commit()
        claim_id = claim.id
        item_id = item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers=auth_headers,
        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200

    attachment = upload_response.json()["attachment"]
    analysis = attachment["analysis"]
    points = analysis["points"]
    assert attachment["document_info"]["document_type"] == "train_ticket"
    assert analysis["severity"] == "pass"
    for unwanted_phrase in ("未识别到有效文字", "未识别到列车出发时间"):
        assert all(unwanted_phrase not in point for point in points)
def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
    monkeypatch,
    tmp_path,
) -> None:
    """Reading attachment meta replaces a stored high-risk analysis with a passing one."""
    ocr_text = (
        ":26429165800002785705 :2026 05 18\n"
        "G458\n"
        "Wuhan Shanghaihongqiao\n"
        "2026 02 20 07:55 06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        # Stand-in for the OCR service: one document with zero score and zero lines.
        recognized = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)

    client, session_factory = build_client()
    with session_factory() as db:
        claim, item = seed_claim(db)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        db.commit()
        claim_id = claim.id
        item_id = item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers=auth_headers,
        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200

    # Overwrite the persisted analysis with a high-risk verdict before reading it back.
    stale_analysis = {
        "severity": "high",
        "label": "高风险",
        "headline": "AI提示附件不符合票据校验条件",
        "summary": "当前附件存在明显异常,票据类型与当前费用场景不匹配,或无法作为有效报销材料。",
        "points": [
            "附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。",
            "日期字段:未识别到列车出发时间或乘车日期。",
        ],
        "rule_basis": [],
        "suggestion": "建议过滤当前不匹配的票据,重新上传符合当前费用场景的清晰原件。",
    }
    meta_path = next(tmp_path.rglob("*.meta.json"))
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    meta["analysis"] = stale_analysis
    meta_path.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8")

    meta_response = client.get(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta",
        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
    )
    assert meta_response.status_code == 200

    analysis = meta_response.json()["analysis"]
    points = analysis["points"]
    assert analysis["severity"] == "pass"
    for unwanted_phrase in ("未识别到有效文字", "未识别到列车出发时间"):
        assert all(unwanted_phrase not in point for point in points)

View File

@@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path
assert deleted_response.status_code == 404 assert deleted_response.status_code == 404
finally: finally:
get_settings.cache_clear() get_settings.cache_clear()
def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None:
    """The recognize endpoint returns a receipt id plus ID number, carriage, seat and fare fields."""
    ocr_text = (
        ":26429165800002785705\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        # Stand-in for the OCR service: an already classified train ticket with four fields.
        recognized = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.png",
            media_type="image/png",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.92,
            line_count=0,
            page_count=1,
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=[
                OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
                OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
                OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
                OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
            ],
        )
        return OcrRecognizeBatchRead(
            engine="paddleocr_mobile",
            model="PP-OCRv5_mobile",
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    try:
        client = build_client()
        response = client.post(
            "/api/v1/ocr/recognize",
            headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"},
            files=[("files", ("2月20_武汉-上海.png", b"fake-image", "image/png"))],
        )
    finally:
        get_settings.cache_clear()

    assert response.status_code == 200
    document = response.json()["documents"][0]
    values_by_label = {}
    for entry in document["document_fields"]:
        values_by_label[entry["label"]] = entry["value"]

    assert document["receipt_id"]
    expected_values = {
        "身份证号": "4201061987****1615",
        "车厢": "06车",
        "座位号": "01B",
        "票价": "354.00元",
    }
    for label, expected in expected_values.items():
        assert values_by_label[label] == expected

View File

@@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"] assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
def test_ocr_service_recovers_image_text_from_worker_ocr_text(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """A worker result with ``ocr_text`` but no lines still produces text and train-ticket fields."""

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # The worker payload reports zero lines; the text is only present in "ocr_text".
        document_payload = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
            "avg_score": 0.92,
            "line_count": 0,
            "page_count": 1,
            "warnings": [],
            "lines": [],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [document_payload],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files([("train-ticket.png", b"fake-train-image", "image/png")])
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "铁路电子客票" in recognized.text
    assert recognized.document_type == "train_ticket"

    label_value_pairs = {(entry.label, entry.value) for entry in recognized.document_fields}
    assert ("列车出发时间", "2026-02-20 07:55") in label_value_pairs
    assert ("车次/航班", "G458") in label_value_pairs
    assert ("金额", "354元") in label_value_pairs
def test_ocr_service_passes_configured_device_to_worker( def test_ocr_service_passes_configured_device_to_worker(
monkeypatch, monkeypatch,
tmp_path: Path, tmp_path: Path,

View File

@@ -1,8 +1,11 @@
from __future__ import annotations from __future__ import annotations
import base64
from app.api.deps import CurrentUserContext from app.api.deps import CurrentUserContext
from app.core.config import get_settings from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeDocumentRead from app.schemas.ocr import OcrRecognizeDocumentRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.receipt_folder import ReceiptFolderService from app.services.receipt_folder import ReceiptFolderService
@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
get_settings.cache_clear() get_settings.cache_clear()
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
    """A cached PDF preview without a renderer id is re-rendered when the preview is resolved."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        stale_preview = b"stale-preview"
        preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}"

        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=OcrRecognizeDocumentRead(
                filename="2月20_武汉-上海.pdf",
                media_type="application/pdf",
                preview_kind="image",
                preview_data_url=preview_data_url,
            ),
        )

        # Address the receipt directory by id, as the delete test in this file does, so the
        # test does not depend on directory listing order or on anything else stored under
        # the owner directory.
        receipt_dir = service.root / "pytest" / receipt.id
        assert receipt_dir.is_dir()
        preview_path = receipt_dir / "preview.png"
        assert preview_path.read_bytes() == stale_preview

        # Dropping the renderer id makes the stored preview count as stale.
        stale_meta = service._read_meta(receipt_dir)
        stale_meta.pop("preview_rendered_with", None)
        service._write_meta(receipt_dir, stale_meta)

        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
            preview_path.write_bytes(b"refreshed-preview")
            return preview_path

        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)

        resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user)

        assert resolved_path == preview_path
        assert media_type == "image/png"
        assert file_name == "preview.png"
        assert preview_path.read_bytes() == b"refreshed-preview"
        meta = service._read_meta(receipt_dir)
        assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
    finally:
        get_settings.cache_clear()
def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
    """After a receipt is deleted, the same content is no longer reported as a duplicate."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()
        content = b"%PDF-1.4 same receipt"

        def find_duplicate():
            # Same lookup before and after the deletion.
            return service.find_duplicate_receipt(
                filename="same-receipt.pdf",
                content=content,
                current_user=current_user,
            )

        ocr_document = OcrRecognizeDocumentRead(
            filename="same-receipt.pdf",
            media_type="application/pdf",
            text="same receipt amount 354",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )
        receipt = service.save_receipt(
            filename="same-receipt.pdf",
            content=content,
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )
        receipt_dir = service.root / "pytest" / receipt.id
        assert receipt_dir.exists()

        duplicate = find_duplicate()
        assert duplicate is not None
        assert duplicate.id == receipt.id

        service.delete_receipt(receipt_id=receipt.id, current_user=current_user)

        assert not receipt_dir.exists()
        assert find_duplicate() is None
    finally:
        get_settings.cache_clear()
def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
    """A receipt saved as type "other" is presented as a train ticket with recovered fields."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        ocr_text = (
            ":26429165800002785705\n"
            ":2026 05 18\n"
            "G458\n"
            "Wuhan\n"
            "Shanghaihongqiao\n"
            "2026 02 20 07:55\n"
            "06 01B\n"
            ": 354.00\n"
            "4201061987****1615\n"
            ":6580061086021391007342026\n"
            "12306 95306"
        )
        ocr_document = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )

        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )

        assert receipt.document_type == "train_ticket"
        assert receipt.document_type_label == "火车/高铁票"
        assert receipt.scene_code == "travel"
        assert receipt.amount == "354.00元"
        assert receipt.document_date == "2026-02-20"
        assert receipt.merchant_name == "中国铁路"

        detail = service.get_receipt(receipt.id, current_user)
        values_by_label = {}
        for entry in detail.fields:
            values_by_label[entry.label] = entry.value

        expected_values = {
            "行程": "武汉-上海",
            "车次": "G458",
            "列车出发时间": "2026-02-20 07:55",
            "票价": "354.00元",
            "身份证号": "4201061987****1615",
            "车厢": "06车",
            "座位号": "01B",
        }
        for label, expected in expected_values.items():
            assert values_by_label[label] == expected
        # No passenger-name field may be produced from this text.
        assert "乘车人" not in values_by_label
    finally:
        get_settings.cache_clear()
def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None: def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear() get_settings.cache_clear()

View File

@@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import base64 import base64
import json
from collections.abc import Generator from collections.abc import Generator
from datetime import UTC, date, datetime from datetime import UTC, date, datetime
from decimal import Decimal from decimal import Decimal
@@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit
from app.models.risk_observation import RiskObservation, RiskObservationFeedback from app.models.risk_observation import RiskObservation, RiskObservationFeedback
from app.models.role import Role from app.models.role import Role
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.ocr import OcrService from app.services.ocr import OcrService
@@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch,
meta_payload = upload_response.json()["attachment"] meta_payload = upload_response.json()["attachment"]
assert meta_payload["preview_kind"] == "image" assert meta_payload["preview_kind"] == "image"
assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview") assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview")
meta_path = next(tmp_path.rglob("invoice.pdf.meta.json"))
stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
preview_response = client.get( preview_response = client.get(
f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview", f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",