feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块，DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成（poppler-data 编码）、renderer_id 标识
- receipt_folder 服务复用预览生成，缓存票据资产并提供清理；删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强；ocr 抽取复用预览工具，附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头，补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试，新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions
--- a/server/src/app/api/v1/endpoints/receipt_folder.py
+++ b/server/src/app/api/v1/endpoints/receipt_folder.py
@@ -92,7 +92,7 @@ def preview_receipt(receipt_id: str, current_user: CurrentUser) -> FileResponse:
        file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user)
    except FileNotFoundError as exc:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc
-    return FileResponse(file_path, media_type=media_type, filename=file_name)
+    return FileResponse(file_path, media_type=media_type, filename=file_name, headers={"Cache-Control": "no-store"})
@router.get(
--- a/server/src/app/services/document_intelligence.py
+++ b/server/src/app/services/document_intelligence.py
@@ -25,11 +25,15 @@ AMOUNT_PATTERNS = (
    re.compile(r"[￥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
 )
-DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
+DATE_PATTERN = re.compile(
    r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
    r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
 )
 TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
 INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[：:\s]*([A-Za-z0-9-]{6,24})")
 INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[：:\s]*([A-Za-z0-9-]{6,24})")
 TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[：:\s]*([A-Za-z0-9]{2,12})")
 TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Za-z0-9])([GCDZKTLYS]\d{1,5})(?![A-Za-z0-9])", re.IGNORECASE)
 ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})")
 MERCHANT_PATTERNS = (
    re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[：:\s]*([A-Za-z0-9\u4e00-\u9fa5（）()·&\\-]{2,40})"),
@@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
            best_score = score
    if best_score <= 0:
        train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket")
        if train_rule and _looks_like_train_ticket(compact_text):
            return RuleMatch(
                rule=train_rule,
                confidence=0.82,
                evidence=("车次", "12306"),
                score=3.8,
            )
        return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)
    confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12)
@@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
    )
 def _looks_like_train_ticket(compact_text: str) -> bool:
    """Heuristically decide whether *compact_text* reads like a train ticket.

    The text must contain a train-number-like token (one of the letters
    G/C/D/Z/K/T/L/Y/S followed by 1-5 digits, case-insensitive). On top of
    that, at least one supporting signal is required: the literal "12306" or
    "95306", a Chinese "origin <separator> destination" route, or both of the
    lowercase words "wuhan" and "shanghai".
    """
    lowered = str(compact_text or "").lower()
    has_train_number = re.search(r"[gcdzktlys]\d{1,5}", lowered, flags=re.IGNORECASE) is not None
    if not has_train_number:
        return False
    if any(marker in lowered for marker in ("12306", "95306")):
        return True
    has_route = (
        re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", lowered)
        is not None
    )
    return has_route or ("wuhan" in lowered and "shanghai" in lowered)
 def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
    if not response_text:
        return None
@@ -521,33 +544,48 @@ def _merge_document_fields(
 def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
    fields: list[DocumentField] = []
    normalized_type = str(document_type or "").strip().lower()
    def append_field(key: str, label: str, value: str) -> None:
        cleaned = _clean_field_value(value)
        if not cleaned:
            return
        if any(field.key == key for field in fields if field.key):
            return
        fields.append(DocumentField(key=key, label=label, value=cleaned))
    amount = _extract_amount(text)
    if amount:
-        fields.append(DocumentField(key="amount", label="金额", value=amount))
+        append_field("amount", "金额", amount)
    date_value = _extract_date(text, document_type=document_type)
    if date_value:
-        fields.append(DocumentField(key="date", label="日期", value=date_value))
+        append_field("date", "日期", date_value)
    merchant = _extract_merchant(text)
    if merchant:
-        fields.append(DocumentField(key="merchant_name", label="商户", value=merchant))
+        append_field("merchant_name", "商户", merchant)
    invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text)
    if invoice_number:
-        fields.append(DocumentField(key="invoice_number", label="票据号码", value=invoice_number))
+        append_field("invoice_number", "票据号码", invoice_number)
    invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text)
    if invoice_code:
-        fields.append(DocumentField(key="invoice_code", label="发票代码", value=invoice_code))
+        append_field("invoice_code", "发票代码", invoice_code)
    trip_no = _extract_pattern(TRIP_NO_PATTERN, text)
    if not trip_no and normalized_type == "train_ticket":
        trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text)
    if trip_no:
-        fields.append(DocumentField(key="trip_no", label="车次/航班", value=trip_no))
+        append_field("trip_no", "车次/航班", trip_no.upper())
    route = _extract_route(text)
    if route:
-        fields.append(DocumentField(key="route", label="行程", value=route))
+        append_field("route", "行程", route)
    if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields):
        append_field("amount", "金额", _extract_loose_decimal_amount(text))
    return fields
@@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
    raw_value = str(match.group(1) or "").strip()
    normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "")
    normalized = normalized.replace("/", "-").replace(".", "-")
    normalized = re.sub(r"\s+", "-", normalized)
    parts = [part for part in normalized.split("-") if part]
    if len(parts) != 3:
        return raw_value
@@ -703,6 +742,23 @@ def _extract_route(text: str) -> str:
    return f"{start}-{end}"
 def _extract_loose_decimal_amount(text: str) -> str:
    """Return the largest positive decimal number found in *text* as an amount.

    Only numbers written with a decimal point (up to six integer digits and
    one or two fractional digits) are considered. Numbers that are part of a
    longer dot-separated digit run, such as the dotted date "2024.06.23", are
    ignored so that a date cannot be mistaken for (and outrank) the real fare.

    Returns:
        The amount with trailing zeros trimmed and the "元" suffix appended
        (for example "553.5元"), or an empty string when nothing qualifies.
    """
    # The extra look-arounds reject "<digit>." before and ".<digit>" after the
    # candidate, which is what distinguishes "2024.06.23" from "553.50".
    pattern = r"(?<!\d)(?<!\d\.)(\d{1,6}\.\d{1,2})(?!\d)(?!\.\d)"
    candidates: list[Decimal] = []
    for match in re.finditer(pattern, str(text or "")):
        try:
            candidate = Decimal(match.group(1)).quantize(Decimal("0.01"))
        except InvalidOperation:
            continue
        if candidate > Decimal("0.00"):
            candidates.append(candidate)
    if not candidates:
        return ""
    best_value = max(candidates)
    # The quantized value always carries two decimals, so stripping zeros and
    # then a dangling "." yields the shortest exact representation.
    text_value = format(best_value, "f").rstrip("0").rstrip(".")
    return f"{text_value}元"
 def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
    match = pattern.search(text)
    if not match:
--- a/server/src/app/services/document_preview.py
+++ b/server/src/app/services/document_preview.py
@@ -0,0 +1,98 @@
 from __future__ import annotations
 import base64
 import binascii
 import mimetypes
 import re
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 class DocumentPreviewAssets:
    """Shared helpers for decoding, writing, and rendering document preview assets."""
    # Identifier recorded next to a rendered PDF preview so callers can tell
    # which renderer configuration produced a stored preview image.
    PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data"
    PDF_PREVIEW_MEDIA_TYPE = "image/png"
    PDF_PREVIEW_SUFFIX = ".png"
    @staticmethod
    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
        """Decode a base64 ``data:`` URL.

        Returns:
            ``(media_type, content)`` on success, or ``None`` when *payload*
            is not a base64 data URL or its body is not valid base64.
        """
        normalized = str(payload or "").strip()
        matched = re.match(
            r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$",
            normalized,
            flags=re.DOTALL,
        )
        if not matched:
            return None
        try:
            content = base64.b64decode(matched.group("body"), validate=True)
        except (binascii.Error, ValueError):
            return None
        return matched.group("media"), content
    @classmethod
    def renderer_id_for_source(cls, media_type: str | None) -> str:
        """Return the PDF renderer id for PDF sources and ``""`` for anything else."""
        return cls.PDF_RENDERER_ID if str(media_type or "").strip() == "application/pdf" else ""
    @classmethod
    def write_data_url_preview(
        cls,
        *,
        preview_dir: Path,
        preview_name_stem: str,
        preview_data_url: str,
    ) -> tuple[Path, str, str] | None:
        """Decode *preview_data_url* and write it into *preview_dir*.

        The file is named ``<preview_name_stem><ext>`` where ``<ext>`` is
        guessed from the decoded media type (``.bin`` when unknown).

        Returns:
            ``(preview_path, preview_media_type, preview_name)``, or ``None``
            when the data URL cannot be decoded.
        """
        decoded = cls.decode_data_url(preview_data_url)
        if decoded is None:
            return None
        preview_media_type, preview_content = decoded
        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
        # Use ``.name`` rather than ``.stem``: both drop directory components,
        # but ``.stem`` would also strip a dotted segment such as ".preview"
        # from "invoice.preview", producing "invoice.png" and potentially
        # overwriting an original file with that name.
        preview_name = f"{Path(preview_name_stem).name}{suffix}"
        preview_path = preview_dir / preview_name
        preview_path.write_bytes(preview_content)
        return preview_path, preview_media_type, preview_name
    @classmethod
    def render_pdf_first_page(
        cls,
        *,
        pdf_path: Path,
        preview_path: Path,
        timeout_seconds: int | float,
    ) -> Path:
        """Render the first page of *pdf_path* to a PNG at *preview_path*.

        Runs ``pdftoppm`` at 160 DPI inside a temporary directory created next
        to *preview_path*, then copies the resulting image into place.

        Returns:
            *preview_path*.

        Raises:
            RuntimeError: ``pdftoppm`` exited non-zero or produced no image.
            Exceptions raised by ``subprocess.run`` (for example on timeout or
            when the executable cannot be started) propagate unchanged.
        """
        preview_path.parent.mkdir(parents=True, exist_ok=True)
        with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir:
            prefix = Path(temp_dir) / "page"
            completed = subprocess.run(
                [
                    "pdftoppm",
                    "-png",
                    "-r",
                    "160",
                    # Only the first page is used, so do not rasterise the
                    # rest of the document.
                    "-f",
                    "1",
                    "-l",
                    "1",
                    str(pdf_path),
                    str(prefix),
                ],
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                check=False,
            )
            if completed.returncode != 0:
                detail = (completed.stderr or completed.stdout or "").strip()
                raise RuntimeError(detail or "pdftoppm failed to render PDF preview.")
            pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key)
            if not pages:
                raise RuntimeError("pdftoppm did not generate a preview image.")
            shutil.copyfile(pages[0], preview_path)
        return preview_path
    @staticmethod
    def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
        """Sort key ordering ``page-<n>.png`` files by page number, then by name."""
        suffix = path.stem.rsplit("-", 1)[-1]
        try:
            return int(suffix), path.name
        except ValueError:
            return 0, path.name
--- a/server/src/app/services/expense_claim_attachment_analysis.py
+++ b/server/src/app/services/expense_claim_attachment_analysis.py
@@ -336,7 +336,27 @@ class ExpenseClaimAttachmentAnalysisMixin:
    @staticmethod
    def _has_date_like_text(text: str) -> bool:
-        return bool(re.search(r"(20\d{2}[年/\-.]\d{1,2}[月/\-.]\d{1,2}日?)", text))
+        return bool(re.search(r"(20\d{2}(?:[年/\-.]|\s+)\d{1,2}(?:[月/\-.]|\s+)\d{1,2}日?)", text))
    @staticmethod
    def _has_document_date_field(document_info: dict[str, Any]) -> bool:
        """Report whether *document_info* carries a non-empty date-like field.

        A field counts when its value is non-blank and either its key
        (lower-cased, underscores removed) is one of the known date keys or
        its label (spaces removed) contains one of the known date label tokens.
        """
        known_keys = DOCUMENT_TRIP_DATE_KEYS | DOCUMENT_GENERIC_DATE_KEYS | DOCUMENT_INVOICE_DATE_KEYS
        label_tokens = (
            *DOCUMENT_TRIP_DATE_LABEL_TOKENS,
            *DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
            *DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
        )

        def _is_dated(entry: Any) -> bool:
            # Non-dict entries and entries without a value never qualify.
            if not isinstance(entry, dict) or not str(entry.get("value") or "").strip():
                return False
            normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
            if normalized_key in known_keys:
                return True
            compact_label = str(entry.get("label") or "").replace(" ", "")
            return any(token in compact_label for token in label_tokens)

        return any(_is_dated(entry) for entry in list(document_info.get("fields") or []))
    @staticmethod
    def _normalize_match_text(text: str) -> str:
@@ -538,6 +558,12 @@ class ExpenseClaimAttachmentAnalysisMixin:
        recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据"
        requirement_matches = bool(requirement_check.get("matches"))
        mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high"
        document_fields = [
            field
            for field in list(document_info.get("fields") or [])
            if isinstance(field, dict) and str(field.get("value") or "").strip()
        ]
        has_readable_content = bool(line_count > 0 or compact_text or document_fields)
        has_ticket_keyword = any(
            keyword in compact_text
@@ -556,15 +582,18 @@ class ExpenseClaimAttachmentAnalysisMixin:
            )
        )
        amount_candidates = self._extract_amount_candidates(text)
        field_amount = self._resolve_document_field_amount({"document_fields": document_fields})
        if field_amount is not None and field_amount not in amount_candidates:
            amount_candidates.insert(0, field_amount)
        item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01"))
        has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates)
-        has_date_text = self._has_date_like_text(text)
+        has_date_text = self._has_date_like_text(text) or self._has_document_date_field(document_info)
        amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount
        points: list[str] = []
        if warnings:
            points.append(f"识别提示：{warnings[0]}")
-        if line_count == 0 or not compact_text:
+        if not has_readable_content:
            points.append("附件内容：未识别到有效文字，当前附件更像普通图片或内容过于模糊。")
        if recognized_document_type == "other" and not has_ticket_keyword:
            points.append("票据类型：未识别到发票、票据、电子行程单等关键字，暂无法判断票据类型。")
@@ -617,8 +646,7 @@ class ExpenseClaimAttachmentAnalysisMixin:
            headline = "AI提示：住宿金额超出报销标准"
            summary = "当前住宿票据金额超过规则中心差旅住宿标准，已作为风险项保留在单据中；如需按特殊情况提交，请补充超标原因。"
        elif (
-            line_count == 0
+            not has_readable_content
            or not compact_text
            or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2)
            or (not requirement_matches and mismatch_severity == "high")
            or (purpose_mismatch_point and amount_mismatch)
--- a/server/src/app/services/expense_claim_attachment_document.py
+++ b/server/src/app/services/expense_claim_attachment_document.py
@@ -119,6 +119,13 @@ class ExpenseClaimAttachmentDocumentMixin:
            metadata=metadata,
            item=item,
        )
        metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
            file_path=file_path,
            metadata=metadata,
        )
        if self._attachment_metadata_needs_analysis_refresh(metadata):
            self._refresh_item_attachment_analysis(item)
            metadata = self._attachment_storage.read_meta(file_path)
        uploaded_at_value = metadata.get("uploaded_at")
        uploaded_at = None
        if isinstance(uploaded_at_value, str) and uploaded_at_value.strip():
@@ -157,6 +164,68 @@ class ExpenseClaimAttachmentDocumentMixin:
            "requirement_check": requirement_check,
        }
    @classmethod
    def _attachment_metadata_needs_analysis_refresh(cls, metadata: dict[str, Any]) -> bool:
        analysis = metadata.get("analysis")
        if not isinstance(analysis, dict):
            return cls._attachment_metadata_has_ocr_signal(metadata)
        points = [
            str(point or "").strip()
            for point in list(analysis.get("points") or [])
            if str(point or "").strip()
        ]
        if not points:
            return False
        if any("未识别到有效文字" in point for point in points):
            return cls._attachment_metadata_has_readable_signal(metadata)
        if any("未识别到列车出发时间" in point or "未识别到开票日期" in point for point in points):
            return cls._attachment_metadata_has_date_field(metadata)
        return False
    @classmethod
    def _attachment_metadata_has_ocr_signal(cls, metadata: dict[str, Any]) -> bool:
        return bool(
            str(metadata.get("ocr_text") or "").strip()
            or str(metadata.get("ocr_summary") or "").strip()
            or int(metadata.get("ocr_line_count") or 0) > 0
            or cls._attachment_metadata_document_fields(metadata)
        )
    @classmethod
    def _attachment_metadata_has_readable_signal(cls, metadata: dict[str, Any]) -> bool:
        return bool(
            str(metadata.get("ocr_text") or "").strip()
            or str(metadata.get("ocr_summary") or "").strip()
            or int(metadata.get("ocr_line_count") or 0) > 0
            or cls._attachment_metadata_document_fields(metadata)
        )
    @staticmethod
    def _attachment_metadata_document_fields(metadata: dict[str, Any]) -> list[dict[str, Any]]:
        document_info = metadata.get("document_info")
        if not isinstance(document_info, dict):
            return []
        return [
            field
            for field in list(document_info.get("fields") or [])
            if isinstance(field, dict) and str(field.get("value") or "").strip()
        ]
    @classmethod
    def _attachment_metadata_has_date_field(cls, metadata: dict[str, Any]) -> bool:
        for field in cls._attachment_metadata_document_fields(metadata):
            key = str(field.get("key") or "").strip().lower().replace("_", "")
            label = str(field.get("label") or "").replace(" ", "")
            if key in {"date", "tripdate", "departuredate", "invoicedate"}:
                return True
            if any(token in label for token in ("日期", "时间", "出发")):
                return True
        return False
    def _build_attachment_document_info(self, document: Any) -> dict[str, Any]:
        insight = build_document_insight(
            filename=str(getattr(document, "filename", "") or ""),
--- a/server/src/app/services/expense_claim_attachment_operations.py
+++ b/server/src/app/services/expense_claim_attachment_operations.py
@@ -32,6 +32,7 @@ from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
 from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
 from app.services.agent_foundation import AgentFoundationService
 from app.services.audit import AuditLogService
 from app.services.document_preview import DocumentPreviewAssets
 from app.services.document_intelligence import build_document_insight
 from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
 from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
@@ -238,6 +239,7 @@ class ExpenseClaimAttachmentOperationsMixin:
            "preview_storage_key": str(preview_meta["preview_storage_key"]),
            "preview_media_type": str(preview_meta["preview_media_type"]),
            "preview_file_name": str(preview_meta["preview_file_name"]),
            "preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
            "analysis": attachment_analysis,
            "document_info": document_info,
            "requirement_check": requirement_check,
@@ -673,6 +675,60 @@ class ExpenseClaimAttachmentOperationsMixin:
        self._attachment_storage.write_meta(file_path, metadata)
        return metadata
    def _refresh_pdf_attachment_preview_meta_if_needed(
        self,
        *,
        file_path: Path,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Ensure a PDF attachment has an up-to-date PNG preview recorded in its metadata.

        Nothing is done when *metadata* is empty, when the attachment is not a
        PDF, or when the recorded preview file exists and was produced as an
        image by the current PDF renderer. Otherwise the first page is
        rendered next to *file_path*, the preview keys in *metadata* are
        updated in place, and the metadata is written back to storage.

        Rendering is best-effort: on failure the problem is logged and the
        metadata is returned unchanged.

        Returns:
            The (possibly updated) *metadata* dict.
        """
        import logging

        if not metadata:
            return metadata
        media_type = str(
            metadata.get("media_type")
            or self._attachment_presentation.resolve_media_type(file_path.name)
        ).strip()
        if media_type != "application/pdf":
            return metadata
        preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
        preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None
        if (
            preview_path is not None
            and preview_path.exists()
            and str(metadata.get("preview_kind") or "").strip() == "image"
            and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
            and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
        ):
            return metadata
        # Reuse the recorded preview name only when it already has the PNG
        # suffix; otherwise fall back to "<stem>.preview.png".
        preview_name = str(metadata.get("preview_file_name") or "").strip()
        if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
            preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
        preview_path = file_path.parent / preview_name
        try:
            DocumentPreviewAssets.render_pdf_first_page(
                pdf_path=file_path,
                preview_path=preview_path,
                timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
            )
        except Exception:
            # Deliberately broad: a missing renderer, a timeout, or a corrupt
            # PDF must not break the caller. Log it so the failure is visible.
            logging.getLogger(__name__).warning(
                "Failed to render PDF attachment preview for %s",
                file_path,
                exc_info=True,
            )
            return metadata
        metadata.update(
            {
                "previewable": True,
                "preview_kind": "image",
                "preview_storage_key": self._attachment_storage.to_storage_key(preview_path),
                "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
                "preview_file_name": preview_path.name,
                "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
            }
        )
        self._attachment_storage.write_meta(file_path, metadata)
        return metadata
    def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
        file_path, media_type, filename = self._resolve_item_attachment_content(item)
        metadata = self._attachment_storage.read_meta(file_path)
@@ -681,6 +737,10 @@ class ExpenseClaimAttachmentOperationsMixin:
            metadata=metadata,
            item=item,
        )
        metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
            file_path=file_path,
            metadata=metadata,
        )
        preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
        preview_file_name = str(metadata.get("preview_file_name") or "").strip()
        preview_media_type = str(metadata.get("preview_media_type") or "").strip()
--- a/server/src/app/services/expense_claim_attachment_presentation.py
+++ b/server/src/app/services/expense_claim_attachment_presentation.py
@@ -1,13 +1,11 @@
 from __future__ import annotations
 import base64
 import binascii
 import mimetypes
 import re
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote
 from app.services.document_preview import DocumentPreviewAssets
 from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
@@ -42,6 +40,7 @@ class ExpenseClaimAttachmentPresentation:
                    "preview_storage_key": self.storage.to_storage_key(preview_path),
                    "preview_media_type": preview_media_type,
                    "preview_file_name": preview_file_name,
                    "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
                }
        if preview_kind:
@@ -51,6 +50,7 @@ class ExpenseClaimAttachmentPresentation:
                "preview_storage_key": storage_key,
                "preview_media_type": media_type,
                "preview_file_name": filename,
                "preview_rendered_with": "",
            }
        return {
@@ -59,6 +59,7 @@ class ExpenseClaimAttachmentPresentation:
            "preview_storage_key": "",
            "preview_media_type": "",
            "preview_file_name": "",
            "preview_rendered_with": "",
        }
    @staticmethod
@@ -72,15 +73,7 @@ class ExpenseClaimAttachmentPresentation:
    @staticmethod
    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
-        normalized = str(payload or "").strip()
+        return DocumentPreviewAssets.decode_data_url(payload)
        matched = re.match(r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$", normalized, flags=re.DOTALL)
        if not matched:
            return None
        try:
            content = base64.b64decode(matched.group("body"), validate=True)
        except (binascii.Error, ValueError):
            return None
        return matched.group("media"), content
    def _write_preview_asset_from_data_url(
        self,
@@ -89,16 +82,11 @@ class ExpenseClaimAttachmentPresentation:
        original_filename: str,
        preview_data_url: str,
    ) -> tuple[Path, str, str] | None:
-        decoded = self.decode_data_url(preview_data_url)
+        return DocumentPreviewAssets.write_data_url_preview(
-        if decoded is None:
+            preview_dir=attachment_dir,
-            return None
+            preview_name_stem=f"{Path(original_filename).stem}.preview",
-
+            preview_data_url=preview_data_url,
-        preview_media_type, preview_content = decoded
+        )
        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
        preview_name = f"{Path(original_filename).stem}.preview{suffix}"
        preview_path = attachment_dir / preview_name
        preview_path.write_bytes(preview_content)
        return preview_path, preview_media_type, preview_name
    @staticmethod
    def build_preview_client_path(claim_id: str, item_id: str) -> str:
--- a/server/src/app/services/ocr.py
+++ b/server/src/app/services/ocr.py
@@ -537,7 +537,7 @@ class OcrService:
            if page_summary:
                aggregated.summary_fragments.append(page_summary)
-            page_text = str(payload.get("text", "") or "").strip()
+            page_text = self._resolve_worker_document_text(payload)
            if page_text:
                aggregated.text_fragments.append(page_text)
@@ -626,6 +626,22 @@ class OcrService:
                return descriptor.text_layer
        return ""
    @staticmethod
    def _resolve_worker_document_text(payload: dict) -> str:
        for key in ("text", "ocr_text", "raw_text", "full_text"):
            value = str(payload.get(key, "") or "").strip()
            if value:
                return value
        lines = payload.get("lines", [])
        if not isinstance(lines, list):
            return ""
        return "\n".join(
            str(item.get("text", "") or "").strip()
            for item in lines
            if isinstance(item, dict) and str(item.get("text", "") or "").strip()
        ).strip()
    @staticmethod
    def _build_lines(
        items: list[dict],
--- a/server/src/app/services/receipt_folder.py
+++ b/server/src/app/services/receipt_folder.py
@@ -12,7 +12,7 @@ from uuid import uuid4
 from app.api.deps import CurrentUserContext
 from app.core.config import get_settings
-from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
 from app.schemas.receipt_folder import (
    ReceiptFolderDeleteResponse,
    ReceiptFolderDetailRead,
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
    ReceiptFolderItemRead,
    ReceiptFolderUpdate,
 )
-from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.document_intelligence import build_document_insight
 from app.services.ocr import SUPPORTED_SUFFIXES
 RECEIPT_DATE_PATTERN = re.compile(
-    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
+    r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
    r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
 )
 RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
 TRAIN_INVOICE_DATE_PATTERN = re.compile(
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
 TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:：]?\s*([0-9]{1,2}\s*车?)")
 TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:：]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
 TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
 TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
 TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[:：￥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
 TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
 class ReceiptFolderStorageMixin:
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
        document: Any | None,
    ) -> dict[str, Any]:
        preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
-        decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
+        preview_asset = DocumentPreviewAssets.write_data_url_preview(
-        if decoded is not None:
+            preview_dir=receipt_dir,
-            preview_media_type, preview_content = decoded
+            preview_name_stem="preview",
-            suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
+            preview_data_url=preview_data_url,
-            preview_name = f"preview{suffix}"
+        )
-            preview_path = receipt_dir / preview_name
+        if preview_asset is not None:
-            preview_path.write_bytes(preview_content)
+            _, preview_media_type, preview_name = preview_asset
            return {
                "previewable": True,
                "preview_kind": "image",
                "preview_file_name": preview_name,
                "preview_media_type": preview_media_type,
                "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
            }
        if self._is_previewable(media_type):
            return {
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
                "preview_kind": "image" if media_type.startswith("image/") else "pdf",
                "preview_file_name": source_path.name,
                "preview_media_type": media_type,
                "preview_rendered_with": "",
            }
        return {
            "previewable": False,
            "preview_kind": "",
            "preview_file_name": "",
            "preview_media_type": "",
            "preview_rendered_with": "",
        }
    def _refresh_pdf_preview_asset_if_needed(
        self,
        *,
        receipt_dir: Path,
        meta: dict[str, Any],
    ) -> dict[str, Any]:
        """Ensure a PDF receipt has an up-to-date PNG preview recorded in *meta*.

        Nothing is done when no source file name is recorded, when the source
        is not an existing PDF, or when the recorded preview file exists and
        was produced as an image by the current PDF renderer. Otherwise the
        first page is rendered into *receipt_dir*, the preview keys in *meta*
        are updated in place, and the metadata is written back.

        Rendering is best-effort: on failure the problem is logged and *meta*
        is returned unchanged.

        Returns:
            The (possibly updated) *meta* dict.
        """
        import logging

        source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
        if not source_name:
            return meta
        source_path = self._assert_child(receipt_dir / source_name)
        source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
        if source_media_type != "application/pdf" or not source_path.exists():
            return meta
        preview_name = str(meta.get("preview_file_name") or "").strip()
        preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
        if (
            preview_path is not None
            and preview_path.exists()
            and str(meta.get("preview_kind") or "").strip() == "image"
            and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
            and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
        ):
            return meta
        # Reuse the recorded preview name only when it already has the PNG
        # suffix; otherwise fall back to "preview.png".
        if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
            preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
        preview_path = self._assert_child(receipt_dir / preview_name)
        try:
            DocumentPreviewAssets.render_pdf_first_page(
                pdf_path=source_path,
                preview_path=preview_path,
                timeout_seconds=get_settings().ocr_timeout_seconds,
            )
        except Exception:
            # Deliberately broad: a missing renderer, a timeout, or a corrupt
            # PDF must not break the caller. Log it so the failure is visible.
            logging.getLogger(__name__).warning(
                "Failed to render PDF receipt preview for %s",
                source_path,
                exc_info=True,
            )
            return meta
        meta.update(
            {
                "previewable": True,
                "preview_kind": "image",
                "preview_file_name": preview_path.name,
                "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
                "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
            }
        )
        self._write_meta(receipt_dir, meta)
        return meta
    @staticmethod
    def _is_previewable(media_type: str) -> bool:
        return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
    def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
        receipt_id = str(meta.get("id") or "").strip()
        status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
        identity = self._resolve_receipt_document_identity(meta)
        return ReceiptFolderItemRead(
            id=receipt_id,
            file_name=str(meta.get("file_name") or ""),
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
            size_bytes=int(meta.get("size_bytes") or 0),
            status=status_value,
            status_label="已关联" if status_value == "linked" else "未关联",
-            document_type=str(meta.get("document_type") or "other"),
+            document_type=identity["document_type"],
-            document_type_label=str(meta.get("document_type_label") or "其他单据"),
+            document_type_label=identity["document_type_label"],
-            scene_code=str(meta.get("scene_code") or "other"),
+            scene_code=identity["scene_code"],
-            scene_label=str(meta.get("scene_label") or "其他票据"),
+            scene_label=identity["scene_label"],
            summary=str(meta.get("summary") or ""),
            amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
            document_date=self._resolve_receipt_document_date(meta),
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
            warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
        )
    def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
        document_type = str(meta.get("document_type") or "other").strip() or "other"
        document_type_label = str(meta.get("document_type_label") or "其他单据").strip() or "其他单据"
        scene_code = str(meta.get("scene_code") or "other").strip() or "other"
        scene_label = str(meta.get("scene_label") or "其他票据").strip() or "其他票据"
        if document_type not in {"", "other"} and document_type_label != "其他单据":
            return {
                "document_type": document_type,
                "document_type_label": document_type_label,
                "scene_code": scene_code,
                "scene_label": scene_label,
            }
        insight = build_document_insight(
            filename=str(meta.get("file_name") or ""),
            summary=str(meta.get("summary") or ""),
            text=self._receipt_text(meta),
        )
        if insight.document_type in {"", "other"}:
            return {
                "document_type": document_type,
                "document_type_label": document_type_label,
                "scene_code": scene_code,
                "scene_label": scene_label,
            }
        return {
            "document_type": insight.document_type,
            "document_type_label": insight.document_type_label,
            "scene_code": insight.scene_code,
            "scene_label": insight.scene_label,
        }
    def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
        fields = [
            ReceiptFolderFieldRead(
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
        if str(document_type or "").strip().lower() == "train_ticket":
            return True
        compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
-        return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
+        if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
            return True
        lower_compact = compact.lower()
        return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
            "12306" in compact
            or "95306" in compact
            or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact)
            or ("wuhan" in lower_compact and "shanghai" in lower_compact)
        )
    @classmethod
    def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
            return raw
        normalized = match.group(1).replace("年", "-").replace("月", "-").replace("日", "")
        normalized = normalized.replace("/", "-").replace(".", "-")
        normalized = re.sub(r"\s+", "-", normalized)
        parts = [part for part in normalized.split("-") if part]
        if len(parts) != 3:
            return match.group(1)
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
        cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
        if not 2 <= len(cleaned) <= 8:
            return ""
-        if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
+        if any(
            token in cleaned
            for token in (
                "电子",
                "客票",
                "铁路",
                "发票",
                "税务",
                "湖北省",
                "中国铁路",
                "开票",
                "日期",
                "车厢",
                "座位",
                "票价",
                "金额",
                "行程",
                "出发",
                "到达",
                "车次",
            )
        ):
            return ""
        return cleaned
@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
        labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
        if labeled:
            return labeled
        fallback = ""
        for line in str(text or "").replace("\r", "\n").splitlines():
            compact_line = line.replace(" ", "")
            if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
                continue
            match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
-            if match:
+            if not match:
-                return str(match.group(1) or "").strip()
+                continue
-        return ""
+            candidate = str(match.group(1) or "").strip()
            if "*" in candidate:
                return candidate
            if not fallback:
                fallback = candidate
        return fallback
    @staticmethod
    def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
        """Extract the ``(carriage, seat)`` pair from train-ticket text.

        Three strategies are tried in order: the combined carriage+seat
        pattern, the loose pattern (carriage number zero-padded to two digits,
        seat upper-cased), and finally the separate carriage and seat-number
        patterns, with spaces removed from the carriage value.
        """
        haystack = str(text or "")

        combined = TRAIN_COMBINED_SEAT_PATTERN.search(haystack)
        if combined is not None:
            carriage, seat = combined.group(1), combined.group(2)
            return f"{carriage}车", seat

        loose = TRAIN_LOOSE_SEAT_PATTERN.search(haystack)
        if loose is not None:
            carriage = loose.group(1).zfill(2)
            seat = loose.group(2).upper()
            return f"{carriage}车", seat

        # Fall back to the independently labeled carriage and seat patterns.
        extract_first = ReceiptFolderService._extract_first
        carriage_no = extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
        seat_no = extract_first(TRAIN_SEAT_NO_PATTERN, text)
        return carriage_no, seat_no
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
    @staticmethod
    def _extract_train_fare(text: str) -> str:
        match = TRAIN_FARE_PATTERN.search(str(text or ""))
        if not match:
            match = max(
                list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
                key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
                default=None,
            )
        if not match:
            return ""
        value = str(match.group(1) or "").replace(",", ".").strip()
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
            )
            if existing_receipt is not None:
                enriched.append(
-                    document.model_copy(
+                    self._enrich_ocr_document_with_receipt(
-                        update={
+                        document,
-                            "receipt_id": existing_receipt.id,
+                        receipt=existing_receipt,
-                            "receipt_status": existing_receipt.status,
+                        current_user=current_user,
                            "receipt_preview_url": existing_receipt.preview_url,
                            "receipt_source_url": existing_receipt.source_url,
                        }
                    )
                )
                continue
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                warning = "已上传过同样的单据，请不要重复上传。"
                existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
                enriched.append(
-                    document.model_copy(
+                    self._enrich_ocr_document_with_receipt(
-                        update={
+                        document,
-                            "receipt_id": duplicate_receipt.id,
+                        receipt=duplicate_receipt,
-                            "receipt_status": duplicate_receipt.status,
+                        current_user=current_user,
-                            "receipt_preview_url": duplicate_receipt.preview_url,
+                        extra_warnings=[*existing_warnings, warning],
                            "receipt_source_url": duplicate_receipt.source_url,
                            "warnings": list(dict.fromkeys([*existing_warnings, warning])),
                        }
                    )
                )
                continue
@@ -763,16 +893,77 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                current_user=current_user,
            )
            enriched.append(
-                document.model_copy(
+                self._enrich_ocr_document_with_receipt(
-                    update={
+                    document,
                    receipt=receipt,
                    current_user=current_user,
                )
            )
        return result.model_copy(update={"documents": enriched})
    def _enrich_ocr_document_with_receipt(
        self,
        document: OcrRecognizeDocumentRead,
        *,
        receipt: ReceiptFolderItemRead,
        current_user: CurrentUserContext,
        extra_warnings: list[str] | None = None,
    ) -> OcrRecognizeDocumentRead:
        update: dict[str, Any] = {
            "receipt_id": receipt.id,
            "receipt_status": receipt.status,
            "receipt_preview_url": receipt.preview_url,
            "receipt_source_url": receipt.source_url,
        }
        try:
            meta = self._read_receipt_meta(receipt.id, current_user)
        except FileNotFoundError:
            meta = {}
        if meta:
            update.update(
                {
                    "text": str(meta.get("ocr_text") or document.text or ""),
                    "summary": str(meta.get("summary") or document.summary or ""),
                    "document_type": str(meta.get("document_type") or document.document_type or "other"),
                    "document_type_label": str(meta.get("document_type_label") or document.document_type_label or "其他单据"),
                    "scene_code": str(meta.get("scene_code") or document.scene_code or "other"),
                    "scene_label": str(meta.get("scene_label") or document.scene_label or "其他票据"),
                    "classification_source": str(meta.get("ocr_classification_source") or document.classification_source or ""),
                    "classification_confidence": float(
                        meta.get("ocr_classification_confidence")
                        or document.classification_confidence
                        or 0.0
                    ),
                    "classification_evidence": [
                        str(value)
                        for value in list(meta.get("ocr_classification_evidence") or document.classification_evidence or [])
                        if str(value).strip()
                    ],
                    "document_fields": self._build_ocr_document_fields_from_meta(meta),
                }
            )
        warnings = [
            str(item)
            for item in list(extra_warnings if extra_warnings is not None else document.warnings or [])
            if str(item).strip()
        ]
        if warnings:
            update["warnings"] = list(dict.fromkeys(warnings))
        return document.model_copy(update=update)
    def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
        return [
            OcrRecognizeFieldRead(
                key=field.key,
                label=field.label,
                value=field.value,
            )
-        return result.model_copy(update={"documents": enriched})
+            for field in self._resolve_fields(meta)
            if field.label and field.value
        ]
    def save_receipt(
        self,
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
    def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
        meta = self._read_receipt_meta(receipt_id, current_user)
        receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
        meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
        preview_name = str(meta.get("preview_file_name") or "").strip()
        if preview_name:
            preview_path = self._assert_child(receipt_dir / preview_name)
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
        if self._is_previewable(source_media_type):
            return source_path, source_media_type, source_name
        raise FileNotFoundError("Receipt preview not found")
--- a/server/tests/test_document_intelligence.py
+++ b/server/tests/test_document_intelligence.py
@@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
    assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)
 def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
    """OCR text with English station names is still classified as a train ticket.

    The input text has no Chinese labels: only a train number (G458), English
    station names, space-separated dates, an amount and the 12306/95306
    markers. The file name carries the Chinese route. The insight must report
    the train_ticket type in the travel scene and expose amount, departure
    time, trip number and route fields.
    """
    insight = build_document_insight(
        filename="2月20_武汉-上海.pdf",
        summary=":26429165800002785705；:2026 05 18；Wuhan Shanghaihongqiao G458",
        text=(
            ":26429165800002785705\n"
            ":2026 05 18\n"
            "G458\n"
            "Wuhan\n"
            "Shanghaihongqiao\n"
            "2026 02 20 07:55\n"
            "06 01B\n"
            ": 354.00\n"
            "4201061987****1615\n"
            ":6580061086021391007342026\n"
            "12306 95306"
        ),
    )
    assert insight.document_type == "train_ticket"
    assert insight.document_type_label == "火车/高铁票"
    assert insight.scene_code == "travel"
    fields = {field.label: field.value for field in insight.fields}
    assert fields["金额"] == "354元"
    assert fields["列车出发时间"] == "2026-02-20 07:55"
    assert fields["车次/航班"] == "G458"
    assert fields["行程"] == "武汉-上海"
 def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
    insight = build_document_insight(
        filename="铁路电子客票.pdf",
--- a/server/tests/test_expense_claim_attachment_analysis_regression.py
+++ b/server/tests/test_expense_claim_attachment_analysis_regression.py
@@ -0,0 +1,169 @@
 from __future__ import annotations
 import json
 from decimal import Decimal
 from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
 from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
 from app.services.ocr import OcrService
 from test_reimbursement_endpoints import build_client, seed_claim
 def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
    monkeypatch,
    tmp_path,
 ) -> None:
    """Uploading a train-ticket PDF with usable OCR text yields a passing analysis.

    ``OcrService.recognize_files`` is stubbed to return train-ticket-like text
    with ``avg_score=0.0`` and ``line_count=0``. The uploaded attachment must
    be identified as ``train_ticket``, its analysis severity must be ``pass``,
    and no analysis point may report unrecognized text or a missing train
    departure time.
    """
    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[
                OcrRecognizeDocumentRead(
                    filename="2月20_武汉-上海.pdf",
                    media_type="application/pdf",
                    text=(
                        ":26429165800002785705\n"
                        ":2026 05 18\n"
                        "G458\n"
                        "Wuhan\n"
                        "Shanghaihongqiao\n"
                        "2026 02 20 07:55\n"
                        "06 01B\n"
                        ": 354.00\n"
                        "4201061987****1615\n"
                        ":6580061086021391007342026\n"
                        "12306 95306"
                    ),
                    summary="Wuhan Shanghaihongqiao G458 354.00",
                    avg_score=0.0,
                    line_count=0,
                    page_count=1,
                    warnings=[],
                )
            ],
        )
    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    # Redirect attachment storage into the per-test temporary directory.
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
    client, session_factory = build_client()
    with session_factory() as db:
        # Seed a travel claim whose item matches the ticket's route and amount.
        claim, item = seed_claim(db)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        db.commit()
        claim_id = claim.id
        item_id = item.id
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200
    attachment = upload_response.json()["attachment"]
    analysis = attachment["analysis"]
    points = analysis["points"]
    assert attachment["document_info"]["document_type"] == "train_ticket"
    assert analysis["severity"] == "pass"
    assert not any("未识别到有效文字" in point for point in points)
    assert not any("未识别到列车出发时间" in point for point in points)
 def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
    monkeypatch,
    tmp_path,
 ) -> None:
    """Reading attachment meta replaces a stale high-risk analysis stored on disk.

    After a successful upload (OCR stubbed with train-ticket-like text), the
    stored ``*.meta.json`` is overwritten with a high-severity analysis
    claiming the attachment is unreadable and lacks a departure time. The meta
    endpoint must then return a ``pass`` analysis without those two points.
    """
    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[
                OcrRecognizeDocumentRead(
                    filename="2月20_武汉-上海.pdf",
                    media_type="application/pdf",
                    text=(
                        ":26429165800002785705 :2026 05 18\n"
                        "G458\n"
                        "Wuhan Shanghaihongqiao\n"
                        "2026 02 20 07:55 06 01B\n"
                        ": 354.00\n"
                        "4201061987****1615\n"
                        ":6580061086021391007342026\n"
                        "12306 95306"
                    ),
                    summary="Wuhan Shanghaihongqiao G458 354.00",
                    avg_score=0.0,
                    line_count=0,
                    page_count=1,
                    warnings=[],
                )
            ],
        )
    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    # Redirect attachment storage into the per-test temporary directory.
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
    client, session_factory = build_client()
    with session_factory() as db:
        claim, item = seed_claim(db)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        db.commit()
        claim_id = claim.id
        item_id = item.id
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200
    # Simulate a stale analysis persisted earlier: overwrite it in the stored meta file.
    meta_path = next(tmp_path.rglob("*.meta.json"))
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    meta["analysis"] = {
        "severity": "high",
        "label": "高风险",
        "headline": "AI提示：附件不符合票据校验条件",
        "summary": "当前附件存在明显异常，票据类型与当前费用场景不匹配，或无法作为有效报销材料。",
        "points": [
            "附件内容：未识别到有效文字，当前附件更像普通图片或内容过于模糊。",
            "日期字段：未识别到列车出发时间或乘车日期。",
        ],
        "rule_basis": [],
        "suggestion": "建议过滤当前不匹配的票据，重新上传符合当前费用场景的清晰原件。",
    }
    meta_path.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8")
    meta_response = client.get(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta",
        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
    )
    assert meta_response.status_code == 200
    analysis = meta_response.json()["analysis"]
    points = analysis["points"]
    assert analysis["severity"] == "pass"
    assert not any("未识别到有效文字" in point for point in points)
    assert not any("未识别到列车出发时间" in point for point in points)
--- a/server/tests/test_ocr_endpoints.py
+++ b/server/tests/test_ocr_endpoints.py
@@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path
        assert deleted_response.status_code == 404
    finally:
        get_settings.cache_clear()
 def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None:
    """The OCR recognize endpoint returns receipt-enriched train-ticket fields.

    The stubbed OCR result only carries date, trip number, route and amount
    fields. The endpoint response must include a ``receipt_id`` and
    ``document_fields`` that additionally contain the ID number, carriage,
    seat number and fare.
    """
    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        return OcrRecognizeBatchRead(
            engine="paddleocr_mobile",
            model="PP-OCRv5_mobile",
            total_file_count=1,
            success_count=1,
            documents=[
                OcrRecognizeDocumentRead(
                    filename="2月20_武汉-上海.png",
                    media_type="image/png",
                    text=(
                        ":26429165800002785705\n"
                        "G458\n"
                        "Wuhan\n"
                        "Shanghaihongqiao\n"
                        "2026 02 20 07:55\n"
                        "06 01B\n"
                        ": 354.00\n"
                        "4201061987****1615\n"
                        ":6580061086021391007342026\n"
                        "12306 95306"
                    ),
                    summary="Wuhan Shanghaihongqiao G458 354.00",
                    avg_score=0.92,
                    line_count=0,
                    page_count=1,
                    document_type="train_ticket",
                    document_type_label="火车/高铁票",
                    scene_code="travel",
                    scene_label="差旅票据",
                    document_fields=[
                        OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
                        OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
                        OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
                        OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
                    ],
                )
            ],
        )
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # Clear the settings cache before and after the request around the STORAGE_ROOT_DIR override.
    get_settings.cache_clear()
    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    try:
        client = build_client()
        response = client.post(
            "/api/v1/ocr/recognize",
            headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"},
            files=[("files", ("2月20_武汉-上海.png", b"fake-image", "image/png"))],
        )
    finally:
        get_settings.cache_clear()
    assert response.status_code == 200
    document = response.json()["documents"][0]
    fields = {
        item["label"]: item["value"]
        for item in document["document_fields"]
    }
    assert document["receipt_id"]
    assert fields["身份证号"] == "4201061987****1615"
    assert fields["车厢"] == "06车"
    assert fields["座位号"] == "01B"
    assert fields["票价"] == "354.00元"
--- a/server/tests/test_ocr_service.py
+++ b/server/tests/test_ocr_service.py
@@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
 def test_ocr_service_recovers_image_text_from_worker_ocr_text(
    monkeypatch,
    tmp_path: Path,
 ) -> None:
    """``recognize_files`` uses the worker's ``ocr_text`` when ``lines`` is empty.

    The stubbed worker payload has ``line_count=0`` and no ``lines`` but does
    provide ``ocr_text``. The recognized document must expose that text, be
    classified as ``train_ticket`` and carry departure time, trip number and
    amount fields.
    """
    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
                    "avg_score": 0.92,
                    "line_count": 0,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [],
                }
            ],
        }
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # Replace interpreter/worker resolution and the worker call with stubs.
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    # Clear the class-level result cache and the settings cache before and after the call.
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files([("train-ticket.png", b"fake-train-image", "image/png")])
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()
    recognized = result.documents[0]
    assert "铁路电子客票" in recognized.text
    assert recognized.document_type == "train_ticket"
    assert any(field.label == "列车出发时间" and field.value == "2026-02-20 07:55" for field in recognized.document_fields)
    assert any(field.label == "车次/航班" and field.value == "G458" for field in recognized.document_fields)
    assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
 def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
--- a/server/tests/test_receipt_folder_service.py
+++ b/server/tests/test_receipt_folder_service.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 import base64
 from app.api.deps import CurrentUserContext
 from app.core.config import get_settings
 from app.schemas.ocr import OcrRecognizeDocumentRead
 from app.services.document_preview import DocumentPreviewAssets
 from app.services.receipt_folder import ReceiptFolderService
@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
        get_settings.cache_clear()
 def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
    """A cached PDF preview without a renderer marker is regenerated on resolve.

    A receipt is saved with a preview supplied as a data URL, then the
    ``preview_rendered_with`` key is removed from its metadata. With
    ``render_pdf_first_page`` stubbed, ``resolve_preview`` must return the
    ``preview.png`` path with refreshed bytes and persist the current renderer
    id in the metadata.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        stale_preview = b"stale-preview"
        preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}"
        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=OcrRecognizeDocumentRead(
                filename="2月20_武汉-上海.pdf",
                media_type="application/pdf",
                preview_kind="image",
                preview_data_url=preview_data_url,
            ),
        )
        receipt_dir = next(service.root.glob("pytest/*"))
        preview_path = receipt_dir / "preview.png"
        assert preview_path.read_bytes() == stale_preview
        # Drop the renderer marker so the cached preview no longer matches the current renderer.
        stale_meta = service._read_meta(receipt_dir)
        stale_meta.pop("preview_rendered_with", None)
        service._write_meta(receipt_dir, stale_meta)
        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
            # Stand-in for the real renderer: write recognizable bytes to the preview path.
            preview_path.write_bytes(b"refreshed-preview")
            return preview_path
        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)
        resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user)
        assert resolved_path == preview_path
        assert media_type == "image/png"
        assert file_name == "preview.png"
        assert preview_path.read_bytes() == b"refreshed-preview"
        meta = service._read_meta(receipt_dir)
        assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
    finally:
        get_settings.cache_clear()
 def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
    """Deleting a receipt also stops it from being reported as a duplicate.

    After saving a receipt, ``find_duplicate_receipt`` with the same file name
    and content returns that receipt. After ``delete_receipt``, the receipt
    directory is gone and the same lookup returns ``None``.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()
        content = b"%PDF-1.4 same receipt"
        receipt = service.save_receipt(
            filename="same-receipt.pdf",
            content=content,
            media_type="application/pdf",
            current_user=current_user,
            document=OcrRecognizeDocumentRead(
                filename="same-receipt.pdf",
                media_type="application/pdf",
                text="same receipt amount 354",
                document_type="other",
                document_type_label="其他单据",
                scene_code="other",
                scene_label="其他票据",
            ),
        )
        receipt_dir = service.root / "pytest" / receipt.id
        assert receipt_dir.exists()
        # Before deletion the identical upload is detected as a duplicate.
        duplicate = service.find_duplicate_receipt(
            filename="same-receipt.pdf",
            content=content,
            current_user=current_user,
        )
        assert duplicate is not None
        assert duplicate.id == receipt.id
        service.delete_receipt(receipt_id=receipt.id, current_user=current_user)
        assert not receipt_dir.exists()
        # After deletion the same lookup must find nothing.
        assert (
            service.find_duplicate_receipt(
                filename="same-receipt.pdf",
                content=content,
                current_user=current_user,
            )
            is None
        )
    finally:
        get_settings.cache_clear()
 def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
    """A receipt saved as type ``other`` is presented as a train ticket.

    The OCR document is classified as ``other`` but its text contains a train
    number, English station names and 12306/95306 markers. The saved receipt
    item must report the train-ticket identity, amount, date and merchant, and
    the receipt detail must expose route, train number, departure time, fare,
    ID number, carriage and seat fields, with no passenger-name field.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=OcrRecognizeDocumentRead(
                filename="2月20_武汉-上海.pdf",
                media_type="application/pdf",
                text=(
                    ":26429165800002785705\n"
                    ":2026 05 18\n"
                    "G458\n"
                    "Wuhan\n"
                    "Shanghaihongqiao\n"
                    "2026 02 20 07:55\n"
                    "06 01B\n"
                    ": 354.00\n"
                    "4201061987****1615\n"
                    ":6580061086021391007342026\n"
                    "12306 95306"
                ),
                summary="Wuhan Shanghaihongqiao G458 354.00",
                document_type="other",
                document_type_label="其他单据",
                scene_code="other",
                scene_label="其他票据",
            ),
        )
        # List-level item: identity and headline values are recovered from the text.
        assert receipt.document_type == "train_ticket"
        assert receipt.document_type_label == "火车/高铁票"
        assert receipt.scene_code == "travel"
        assert receipt.amount == "354.00元"
        assert receipt.document_date == "2026-02-20"
        assert receipt.merchant_name == "中国铁路"
        # Detail view: the individual train-ticket fields are recovered as well.
        detail = service.get_receipt(receipt.id, current_user)
        fields = {field.label: field.value for field in detail.fields}
        assert fields["行程"] == "武汉-上海"
        assert fields["车次"] == "G458"
        assert fields["列车出发时间"] == "2026-02-20 07:55"
        assert fields["票价"] == "354.00元"
        assert fields["身份证号"] == "4201061987****1615"
        assert fields["车厢"] == "06车"
        assert fields["座位号"] == "01B"
        assert "乘车人" not in fields
    finally:
        get_settings.cache_clear()
 def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
--- a/server/tests/test_reimbursement_endpoints.py
+++ b/server/tests/test_reimbursement_endpoints.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import base64
 import json
 from collections.abc import Generator
 from datetime import UTC, date, datetime
 from decimal import Decimal
@@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit
 from app.models.risk_observation import RiskObservation, RiskObservationFeedback
 from app.models.role import Role
 from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
 from app.services.document_preview import DocumentPreviewAssets
 from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
 from app.services.ocr import OcrService
@@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch,
    meta_payload = upload_response.json()["attachment"]
    assert meta_payload["preview_kind"] == "image"
    assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview")
    meta_path = next(tmp_path.rglob("invoice.pdf.meta.json"))
    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
    assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
    preview_response = client.get(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",