feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块，DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成（poppler-data 编码）、renderer_id 标识
- receipt_folder 服务复用预览生成，缓存票据资产并提供清理；删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强；ocr 抽取复用预览工具，附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头（Cache-Control: no-store），补充/扩展 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试，新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions
--- a/server/src/app/api/v1/endpoints/receipt_folder.py
+++ b/server/src/app/api/v1/endpoints/receipt_folder.py
@@ -92,7 +92,7 @@ def preview_receipt(receipt_id: str, current_user: CurrentUser) -> FileResponse:
        file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user)
    except FileNotFoundError as exc:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc
-    return FileResponse(file_path, media_type=media_type, filename=file_name)
+    return FileResponse(file_path, media_type=media_type, filename=file_name, headers={"Cache-Control": "no-store"})


@router.get(
--- a/server/src/app/services/document_intelligence.py
+++ b/server/src/app/services/document_intelligence.py
@@ -25,11 +25,15 @@ AMOUNT_PATTERNS = (
    re.compile(r"[￥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
 )
-DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
+DATE_PATTERN = re.compile(
+    r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
+    r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
+)
 TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
 INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[：:\s]*([A-Za-z0-9-]{6,24})")
 INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[：:\s]*([A-Za-z0-9-]{6,24})")
 TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[：:\s]*([A-Za-z0-9]{2,12})")
+TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Za-z0-9])([GCDZKTLYS]\d{1,5})(?![A-Za-z0-9])", re.IGNORECASE)
 ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})")
 MERCHANT_PATTERNS = (
    re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[：:\s]*([A-Za-z0-9\u4e00-\u9fa5（）()·&\\-]{2,40})"),
@@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
            best_score = score

    if best_score <= 0:
+        train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket")
+        if train_rule and _looks_like_train_ticket(compact_text):
+            return RuleMatch(
+                rule=train_rule,
+                confidence=0.82,
+                evidence=("车次", "12306"),
+                score=3.8,
+            )
        return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)

    confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12)
@@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
    )


+def _looks_like_train_ticket(compact_text: str) -> bool:
+    text = str(compact_text or "").lower()
+    if not re.search(r"[gcdzktlys]\d{1,5}", text, flags=re.IGNORECASE):
+        return False
+    if "12306" in text or "95306" in text:
+        return True
+    if re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", text):
+        return True
+    return "wuhan" in text and "shanghai" in text
+
+
 def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
    if not response_text:
        return None
@@ -521,33 +544,48 @@ def _merge_document_fields(

 def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
    fields: list[DocumentField] = []
+    normalized_type = str(document_type or "").strip().lower()
+
+    def append_field(key: str, label: str, value: str) -> None:
+        cleaned = _clean_field_value(value)
+        if not cleaned:
+            return
+        if any(field.key == key for field in fields if field.key):
+            return
+        fields.append(DocumentField(key=key, label=label, value=cleaned))
+
    amount = _extract_amount(text)
    if amount:
-        fields.append(DocumentField(key="amount", label="金额", value=amount))
+        append_field("amount", "金额", amount)

    date_value = _extract_date(text, document_type=document_type)
    if date_value:
-        fields.append(DocumentField(key="date", label="日期", value=date_value))
+        append_field("date", "日期", date_value)

    merchant = _extract_merchant(text)
    if merchant:
-        fields.append(DocumentField(key="merchant_name", label="商户", value=merchant))
+        append_field("merchant_name", "商户", merchant)

    invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text)
    if invoice_number:
-        fields.append(DocumentField(key="invoice_number", label="票据号码", value=invoice_number))
+        append_field("invoice_number", "票据号码", invoice_number)

    invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text)
    if invoice_code:
-        fields.append(DocumentField(key="invoice_code", label="发票代码", value=invoice_code))
+        append_field("invoice_code", "发票代码", invoice_code)

    trip_no = _extract_pattern(TRIP_NO_PATTERN, text)
+    if not trip_no and normalized_type == "train_ticket":
+        trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text)
    if trip_no:
-        fields.append(DocumentField(key="trip_no", label="车次/航班", value=trip_no))
+        append_field("trip_no", "车次/航班", trip_no.upper())

    route = _extract_route(text)
    if route:
-        fields.append(DocumentField(key="route", label="行程", value=route))
+        append_field("route", "行程", route)
+
+    if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields):
+        append_field("amount", "金额", _extract_loose_decimal_amount(text))

    return fields

@@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
    raw_value = str(match.group(1) or "").strip()
    normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "")
    normalized = normalized.replace("/", "-").replace(".", "-")
+    normalized = re.sub(r"\s+", "-", normalized)
    parts = [part for part in normalized.split("-") if part]
    if len(parts) != 3:
        return raw_value
@@ -703,6 +742,23 @@ def _extract_route(text: str) -> str:
    return f"{start}-{end}"


+def _extract_loose_decimal_amount(text: str) -> str:
+    best_value: Decimal | None = None
+    for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", str(text or "")):
+        try:
+            candidate = Decimal(match.group(1)).quantize(Decimal("0.01"))
+        except InvalidOperation:
+            continue
+        if candidate <= Decimal("0.00"):
+            continue
+        if best_value is None or candidate > best_value:
+            best_value = candidate
+    if best_value is None:
+        return ""
+    text_value = format(best_value, "f").rstrip("0").rstrip(".")
+    return f"{text_value}元"
+
+
 def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
    match = pattern.search(text)
    if not match:
--- a/server/src/app/services/document_preview.py
+++ b/server/src/app/services/document_preview.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import base64
+import binascii
+import mimetypes
+import re
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+class DocumentPreviewAssets:
+    PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data"
+    PDF_PREVIEW_MEDIA_TYPE = "image/png"
+    PDF_PREVIEW_SUFFIX = ".png"
+
+    @staticmethod
+    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
+        normalized = str(payload or "").strip()
+        matched = re.match(
+            r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$",
+            normalized,
+            flags=re.DOTALL,
+        )
+        if not matched:
+            return None
+        try:
+            content = base64.b64decode(matched.group("body"), validate=True)
+        except (binascii.Error, ValueError):
+            return None
+        return matched.group("media"), content
+
+    @classmethod
+    def renderer_id_for_source(cls, media_type: str | None) -> str:
+        return cls.PDF_RENDERER_ID if str(media_type or "").strip() == "application/pdf" else ""
+
+    @classmethod
+    def write_data_url_preview(
+        cls,
+        *,
+        preview_dir: Path,
+        preview_name_stem: str,
+        preview_data_url: str,
+    ) -> tuple[Path, str, str] | None:
+        decoded = cls.decode_data_url(preview_data_url)
+        if decoded is None:
+            return None
+
+        preview_media_type, preview_content = decoded
+        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
+        preview_name = f"{Path(preview_name_stem).stem}{suffix}"
+        preview_path = preview_dir / preview_name
+        preview_path.write_bytes(preview_content)
+        return preview_path, preview_media_type, preview_name
+
+    @classmethod
+    def render_pdf_first_page(
+        cls,
+        *,
+        pdf_path: Path,
+        preview_path: Path,
+        timeout_seconds: int | float,
+    ) -> Path:
+        preview_path.parent.mkdir(parents=True, exist_ok=True)
+        with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir:
+            prefix = Path(temp_dir) / "page"
+            completed = subprocess.run(
+                [
+                    "pdftoppm",
+                    "-png",
+                    "-r",
+                    "160",
+                    str(pdf_path),
+                    str(prefix),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=timeout_seconds,
+                check=False,
+            )
+            if completed.returncode != 0:
+                detail = (completed.stderr or completed.stdout or "").strip()
+                raise RuntimeError(detail or "pdftoppm failed to render PDF preview.")
+
+            pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key)
+            if not pages:
+                raise RuntimeError("pdftoppm did not generate a preview image.")
+            shutil.copyfile(pages[0], preview_path)
+        return preview_path
+
+    @staticmethod
+    def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
+        suffix = path.stem.rsplit("-", 1)[-1]
+        try:
+            return int(suffix), path.name
+        except ValueError:
+            return 0, path.name
--- a/server/src/app/services/expense_claim_attachment_analysis.py
+++ b/server/src/app/services/expense_claim_attachment_analysis.py
@@ -336,7 +336,27 @@ class ExpenseClaimAttachmentAnalysisMixin:

    @staticmethod
    def _has_date_like_text(text: str) -> bool:
-        return bool(re.search(r"(20\d{2}[年/\-.]\d{1,2}[月/\-.]\d{1,2}日?)", text))
+        return bool(re.search(r"(20\d{2}(?:[年/\-.]|\s+)\d{1,2}(?:[月/\-.]|\s+)\d{1,2}日?)", text))
+
+    @staticmethod
+    def _has_document_date_field(document_info: dict[str, Any]) -> bool:
+        date_keys = DOCUMENT_TRIP_DATE_KEYS | DOCUMENT_GENERIC_DATE_KEYS | DOCUMENT_INVOICE_DATE_KEYS
+        date_label_tokens = (
+            *DOCUMENT_TRIP_DATE_LABEL_TOKENS,
+            *DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
+            *DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
+        )
+        for field in list(document_info.get("fields") or []):
+            if not isinstance(field, dict):
+                continue
+            value = str(field.get("value") or "").strip()
+            if not value:
+                continue
+            key = str(field.get("key") or "").strip().lower().replace("_", "")
+            label = str(field.get("label") or "").replace(" ", "")
+            if key in date_keys or any(token in label for token in date_label_tokens):
+                return True
+        return False

    @staticmethod
    def _normalize_match_text(text: str) -> str:
@@ -538,6 +558,12 @@ class ExpenseClaimAttachmentAnalysisMixin:
        recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据"
        requirement_matches = bool(requirement_check.get("matches"))
        mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high"
+        document_fields = [
+            field
+            for field in list(document_info.get("fields") or [])
+            if isinstance(field, dict) and str(field.get("value") or "").strip()
+        ]
+        has_readable_content = bool(line_count > 0 or compact_text or document_fields)

        has_ticket_keyword = any(
            keyword in compact_text
@@ -556,15 +582,18 @@ class ExpenseClaimAttachmentAnalysisMixin:
            )
        )
        amount_candidates = self._extract_amount_candidates(text)
+        field_amount = self._resolve_document_field_amount({"document_fields": document_fields})
+        if field_amount is not None and field_amount not in amount_candidates:
+            amount_candidates.insert(0, field_amount)
        item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01"))
        has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates)
-        has_date_text = self._has_date_like_text(text)
+        has_date_text = self._has_date_like_text(text) or self._has_document_date_field(document_info)
        amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount

        points: list[str] = []
        if warnings:
            points.append(f"识别提示：{warnings[0]}")
-        if line_count == 0 or not compact_text:
+        if not has_readable_content:
            points.append("附件内容：未识别到有效文字，当前附件更像普通图片或内容过于模糊。")
        if recognized_document_type == "other" and not has_ticket_keyword:
            points.append("票据类型：未识别到发票、票据、电子行程单等关键字，暂无法判断票据类型。")
@@ -617,8 +646,7 @@ class ExpenseClaimAttachmentAnalysisMixin:
            headline = "AI提示：住宿金额超出报销标准"
            summary = "当前住宿票据金额超过规则中心差旅住宿标准，已作为风险项保留在单据中；如需按特殊情况提交，请补充超标原因。"
        elif (
-            line_count == 0
-            or not compact_text
+            not has_readable_content
            or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2)
            or (not requirement_matches and mismatch_severity == "high")
            or (purpose_mismatch_point and amount_mismatch)
--- a/server/src/app/services/expense_claim_attachment_document.py
+++ b/server/src/app/services/expense_claim_attachment_document.py
@@ -119,6 +119,13 @@ class ExpenseClaimAttachmentDocumentMixin:
            metadata=metadata,
            item=item,
        )
+        metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
+            file_path=file_path,
+            metadata=metadata,
+        )
+        if self._attachment_metadata_needs_analysis_refresh(metadata):
+            self._refresh_item_attachment_analysis(item)
+            metadata = self._attachment_storage.read_meta(file_path)
        uploaded_at_value = metadata.get("uploaded_at")
        uploaded_at = None
        if isinstance(uploaded_at_value, str) and uploaded_at_value.strip():
@@ -157,6 +164,68 @@ class ExpenseClaimAttachmentDocumentMixin:
            "requirement_check": requirement_check,
        }

+    @classmethod
+    def _attachment_metadata_needs_analysis_refresh(cls, metadata: dict[str, Any]) -> bool:
+        analysis = metadata.get("analysis")
+        if not isinstance(analysis, dict):
+            return cls._attachment_metadata_has_ocr_signal(metadata)
+
+        points = [
+            str(point or "").strip()
+            for point in list(analysis.get("points") or [])
+            if str(point or "").strip()
+        ]
+        if not points:
+            return False
+
+        if any("未识别到有效文字" in point for point in points):
+            return cls._attachment_metadata_has_readable_signal(metadata)
+
+        if any("未识别到列车出发时间" in point or "未识别到开票日期" in point for point in points):
+            return cls._attachment_metadata_has_date_field(metadata)
+
+        return False
+
+    @classmethod
+    def _attachment_metadata_has_ocr_signal(cls, metadata: dict[str, Any]) -> bool:
+        return bool(
+            str(metadata.get("ocr_text") or "").strip()
+            or str(metadata.get("ocr_summary") or "").strip()
+            or int(metadata.get("ocr_line_count") or 0) > 0
+            or cls._attachment_metadata_document_fields(metadata)
+        )
+
+    @classmethod
+    def _attachment_metadata_has_readable_signal(cls, metadata: dict[str, Any]) -> bool:
+        return bool(
+            str(metadata.get("ocr_text") or "").strip()
+            or str(metadata.get("ocr_summary") or "").strip()
+            or int(metadata.get("ocr_line_count") or 0) > 0
+            or cls._attachment_metadata_document_fields(metadata)
+        )
+
+    @staticmethod
+    def _attachment_metadata_document_fields(metadata: dict[str, Any]) -> list[dict[str, Any]]:
+        document_info = metadata.get("document_info")
+        if not isinstance(document_info, dict):
+            return []
+        return [
+            field
+            for field in list(document_info.get("fields") or [])
+            if isinstance(field, dict) and str(field.get("value") or "").strip()
+        ]
+
+    @classmethod
+    def _attachment_metadata_has_date_field(cls, metadata: dict[str, Any]) -> bool:
+        for field in cls._attachment_metadata_document_fields(metadata):
+            key = str(field.get("key") or "").strip().lower().replace("_", "")
+            label = str(field.get("label") or "").replace(" ", "")
+            if key in {"date", "tripdate", "departuredate", "invoicedate"}:
+                return True
+            if any(token in label for token in ("日期", "时间", "出发")):
+                return True
+        return False
+
    def _build_attachment_document_info(self, document: Any) -> dict[str, Any]:
        insight = build_document_insight(
            filename=str(getattr(document, "filename", "") or ""),
--- a/server/src/app/services/expense_claim_attachment_operations.py
+++ b/server/src/app/services/expense_claim_attachment_operations.py
@@ -32,6 +32,7 @@ from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
 from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
 from app.services.agent_foundation import AgentFoundationService
 from app.services.audit import AuditLogService
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.document_intelligence import build_document_insight
 from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
 from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
@@ -238,6 +239,7 @@ class ExpenseClaimAttachmentOperationsMixin:
            "preview_storage_key": str(preview_meta["preview_storage_key"]),
            "preview_media_type": str(preview_meta["preview_media_type"]),
            "preview_file_name": str(preview_meta["preview_file_name"]),
+            "preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
            "analysis": attachment_analysis,
            "document_info": document_info,
            "requirement_check": requirement_check,
@@ -673,6 +675,60 @@ class ExpenseClaimAttachmentOperationsMixin:
        self._attachment_storage.write_meta(file_path, metadata)
        return metadata

+    def _refresh_pdf_attachment_preview_meta_if_needed(
+        self,
+        *,
+        file_path: Path,
+        metadata: dict[str, Any],
+    ) -> dict[str, Any]:
+        if not metadata:
+            return metadata
+
+        media_type = str(
+            metadata.get("media_type")
+            or self._attachment_presentation.resolve_media_type(file_path.name)
+        ).strip()
+        if media_type != "application/pdf":
+            return metadata
+
+        preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
+        preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None
+        if (
+            preview_path is not None
+            and preview_path.exists()
+            and str(metadata.get("preview_kind") or "").strip() == "image"
+            and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
+            and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
+        ):
+            return metadata
+
+        preview_name = str(metadata.get("preview_file_name") or "").strip()
+        if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
+            preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
+        preview_path = file_path.parent / preview_name
+
+        try:
+            DocumentPreviewAssets.render_pdf_first_page(
+                pdf_path=file_path,
+                preview_path=preview_path,
+                timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
+            )
+        except Exception:
+            return metadata
+
+        metadata.update(
+            {
+                "previewable": True,
+                "preview_kind": "image",
+                "preview_storage_key": self._attachment_storage.to_storage_key(preview_path),
+                "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
+                "preview_file_name": preview_path.name,
+                "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
+            }
+        )
+        self._attachment_storage.write_meta(file_path, metadata)
+        return metadata
+
    def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
        file_path, media_type, filename = self._resolve_item_attachment_content(item)
        metadata = self._attachment_storage.read_meta(file_path)
@@ -681,6 +737,10 @@ class ExpenseClaimAttachmentOperationsMixin:
            metadata=metadata,
            item=item,
        )
+        metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
+            file_path=file_path,
+            metadata=metadata,
+        )
        preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
        preview_file_name = str(metadata.get("preview_file_name") or "").strip()
        preview_media_type = str(metadata.get("preview_media_type") or "").strip()
--- a/server/src/app/services/expense_claim_attachment_presentation.py
+++ b/server/src/app/services/expense_claim_attachment_presentation.py
@@ -1,13 +1,11 @@
 from __future__ import annotations

-import base64
-import binascii
 import mimetypes
-import re
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote

+from app.services.document_preview import DocumentPreviewAssets
 from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage


@@ -42,6 +40,7 @@ class ExpenseClaimAttachmentPresentation:
                    "preview_storage_key": self.storage.to_storage_key(preview_path),
                    "preview_media_type": preview_media_type,
                    "preview_file_name": preview_file_name,
+                    "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
                }

        if preview_kind:
@@ -51,6 +50,7 @@ class ExpenseClaimAttachmentPresentation:
                "preview_storage_key": storage_key,
                "preview_media_type": media_type,
                "preview_file_name": filename,
+                "preview_rendered_with": "",
            }

        return {
@@ -59,6 +59,7 @@ class ExpenseClaimAttachmentPresentation:
            "preview_storage_key": "",
            "preview_media_type": "",
            "preview_file_name": "",
+            "preview_rendered_with": "",
        }

    @staticmethod
@@ -72,15 +73,7 @@ class ExpenseClaimAttachmentPresentation:

    @staticmethod
    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
-        normalized = str(payload or "").strip()
-        matched = re.match(r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$", normalized, flags=re.DOTALL)
-        if not matched:
-            return None
-        try:
-            content = base64.b64decode(matched.group("body"), validate=True)
-        except (binascii.Error, ValueError):
-            return None
-        return matched.group("media"), content
+        return DocumentPreviewAssets.decode_data_url(payload)

    def _write_preview_asset_from_data_url(
        self,
@@ -89,16 +82,11 @@ class ExpenseClaimAttachmentPresentation:
        original_filename: str,
        preview_data_url: str,
    ) -> tuple[Path, str, str] | None:
-        decoded = self.decode_data_url(preview_data_url)
-        if decoded is None:
-            return None
-
-        preview_media_type, preview_content = decoded
-        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
-        preview_name = f"{Path(original_filename).stem}.preview{suffix}"
-        preview_path = attachment_dir / preview_name
-        preview_path.write_bytes(preview_content)
-        return preview_path, preview_media_type, preview_name
+        return DocumentPreviewAssets.write_data_url_preview(
+            preview_dir=attachment_dir,
+            preview_name_stem=f"{Path(original_filename).stem}.preview",
+            preview_data_url=preview_data_url,
+        )

    @staticmethod
    def build_preview_client_path(claim_id: str, item_id: str) -> str:
--- a/server/src/app/services/ocr.py
+++ b/server/src/app/services/ocr.py
@@ -537,7 +537,7 @@ class OcrService:
            if page_summary:
                aggregated.summary_fragments.append(page_summary)

-            page_text = str(payload.get("text", "") or "").strip()
+            page_text = self._resolve_worker_document_text(payload)
            if page_text:
                aggregated.text_fragments.append(page_text)

@@ -626,6 +626,22 @@ class OcrService:
                return descriptor.text_layer
        return ""

+    @staticmethod
+    def _resolve_worker_document_text(payload: dict) -> str:
+        for key in ("text", "ocr_text", "raw_text", "full_text"):
+            value = str(payload.get(key, "") or "").strip()
+            if value:
+                return value
+
+        lines = payload.get("lines", [])
+        if not isinstance(lines, list):
+            return ""
+        return "\n".join(
+            str(item.get("text", "") or "").strip()
+            for item in lines
+            if isinstance(item, dict) and str(item.get("text", "") or "").strip()
+        ).strip()
+
    @staticmethod
    def _build_lines(
        items: list[dict],
--- a/server/src/app/services/receipt_folder.py
+++ b/server/src/app/services/receipt_folder.py
@@ -12,7 +12,7 @@ from uuid import uuid4

 from app.api.deps import CurrentUserContext
 from app.core.config import get_settings
-from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
 from app.schemas.receipt_folder import (
    ReceiptFolderDeleteResponse,
    ReceiptFolderDetailRead,
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
    ReceiptFolderItemRead,
    ReceiptFolderUpdate,
 )
-from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
+from app.services.document_preview import DocumentPreviewAssets
+from app.services.document_intelligence import build_document_insight
 from app.services.ocr import SUPPORTED_SUFFIXES

 RECEIPT_DATE_PATTERN = re.compile(
-    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
+    r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
+    r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
 )
 RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
 TRAIN_INVOICE_DATE_PATTERN = re.compile(
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
 TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:：]?\s*([0-9]{1,2}\s*车?)")
 TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:：]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
 TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
+TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
 TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[:：￥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
+TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")


 class ReceiptFolderStorageMixin:
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
        document: Any | None,
    ) -> dict[str, Any]:
        preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
-        decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
-        if decoded is not None:
-            preview_media_type, preview_content = decoded
-            suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
-            preview_name = f"preview{suffix}"
-            preview_path = receipt_dir / preview_name
-            preview_path.write_bytes(preview_content)
+        preview_asset = DocumentPreviewAssets.write_data_url_preview(
+            preview_dir=receipt_dir,
+            preview_name_stem="preview",
+            preview_data_url=preview_data_url,
+        )
+        if preview_asset is not None:
+            _, preview_media_type, preview_name = preview_asset
            return {
                "previewable": True,
                "preview_kind": "image",
                "preview_file_name": preview_name,
                "preview_media_type": preview_media_type,
+                "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
            }
        if self._is_previewable(media_type):
            return {
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
                "preview_kind": "image" if media_type.startswith("image/") else "pdf",
                "preview_file_name": source_path.name,
                "preview_media_type": media_type,
+                "preview_rendered_with": "",
            }
        return {
            "previewable": False,
            "preview_kind": "",
            "preview_file_name": "",
            "preview_media_type": "",
+            "preview_rendered_with": "",
        }

+    def _refresh_pdf_preview_asset_if_needed(
+        self,
+        *,
+        receipt_dir: Path,
+        meta: dict[str, Any],
+    ) -> dict[str, Any]:
+        source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
+        if not source_name:
+            return meta
+
+        source_path = self._assert_child(receipt_dir / source_name)
+        source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
+        if source_media_type != "application/pdf" or not source_path.exists():
+            return meta
+
+        preview_name = str(meta.get("preview_file_name") or "").strip()
+        preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
+        if (
+            preview_path is not None
+            and preview_path.exists()
+            and str(meta.get("preview_kind") or "").strip() == "image"
+            and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
+            and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
+        ):
+            return meta
+
+        if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
+            preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
+        preview_path = self._assert_child(receipt_dir / preview_name)
+
+        try:
+            DocumentPreviewAssets.render_pdf_first_page(
+                pdf_path=source_path,
+                preview_path=preview_path,
+                timeout_seconds=get_settings().ocr_timeout_seconds,
+            )
+        except Exception:
+            return meta
+
+        meta.update(
+            {
+                "previewable": True,
+                "preview_kind": "image",
+                "preview_file_name": preview_path.name,
+                "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
+                "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
+            }
+        )
+        self._write_meta(receipt_dir, meta)
+        return meta
+
    @staticmethod
    def _is_previewable(media_type: str) -> bool:
        return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
    def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
        receipt_id = str(meta.get("id") or "").strip()
        status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
+        identity = self._resolve_receipt_document_identity(meta)
        return ReceiptFolderItemRead(
            id=receipt_id,
            file_name=str(meta.get("file_name") or ""),
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
            size_bytes=int(meta.get("size_bytes") or 0),
            status=status_value,
            status_label="已关联" if status_value == "linked" else "未关联",
-            document_type=str(meta.get("document_type") or "other"),
-            document_type_label=str(meta.get("document_type_label") or "其他单据"),
-            scene_code=str(meta.get("scene_code") or "other"),
-            scene_label=str(meta.get("scene_label") or "其他票据"),
+            document_type=identity["document_type"],
+            document_type_label=identity["document_type_label"],
+            scene_code=identity["scene_code"],
+            scene_label=identity["scene_label"],
            summary=str(meta.get("summary") or ""),
            amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
            document_date=self._resolve_receipt_document_date(meta),
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
            warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
        )

+    def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
+        """Return the document type/scene identity to expose for a stored receipt.
+
+        A fully specific identity saved in ``meta`` wins. A generic "other" one is replaced
+        by the ``build_document_insight`` classification of the receipt when that is concrete.
+        """
+        def stored_value(key: str, fallback: str) -> str:
+            return str(meta.get(key) or fallback).strip() or fallback
+
+        stored = {
+            "document_type": stored_value("document_type", "other"),
+            "document_type_label": stored_value("document_type_label", "其他单据"),
+            "scene_code": stored_value("scene_code", "other"),
+            "scene_label": stored_value("scene_label", "其他票据"),
+        }
+        if stored["document_type"] != "other" and stored["document_type_label"] != "其他单据":
+            return stored
+
+        insight = build_document_insight(
+            filename=str(meta.get("file_name") or ""),
+            summary=str(meta.get("summary") or ""),
+            text=self._receipt_text(meta),
+        )
+        if insight.document_type in {"", "other"}:
+            return stored
+        return {
+            "document_type": insight.document_type,
+            "document_type_label": insight.document_type_label,
+            "scene_code": insight.scene_code,
+            "scene_label": insight.scene_label,
+        }
+
    def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
        fields = [
            ReceiptFolderFieldRead(
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
        if str(document_type or "").strip().lower() == "train_ticket":
            return True
        compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
-        return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
+        if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
+            return True
+        lower_compact = compact.lower()
+        return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
+            "12306" in compact
+            or "95306" in compact
+            or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact)
+            or ("wuhan" in lower_compact and "shanghai" in lower_compact)
+        )

    @classmethod
    def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
            return raw
        normalized = match.group(1).replace("年", "-").replace("月", "-").replace("日", "")
        normalized = normalized.replace("/", "-").replace(".", "-")
+        normalized = re.sub(r"\s+", "-", normalized)
        parts = [part for part in normalized.split("-") if part]
        if len(parts) != 3:
            return match.group(1)
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
        cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
        if not 2 <= len(cleaned) <= 8:
            return ""
-        if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
+        if any(
+            token in cleaned
+            for token in (
+                "电子",
+                "客票",
+                "铁路",
+                "发票",
+                "税务",
+                "湖北省",
+                "中国铁路",
+                "开票",
+                "日期",
+                "车厢",
+                "座位",
+                "票价",
+                "金额",
+                "行程",
+                "出发",
+                "到达",
+                "车次",
+            )
+        ):
            return ""
        return cleaned

@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
        labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
        if labeled:
            return labeled
+        fallback = ""
        for line in str(text or "").replace("\r", "\n").splitlines():
            compact_line = line.replace(" ", "")
            if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
                continue
            match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
-            if match:
-                return str(match.group(1) or "").strip()
-        return ""
+            if not match:
+                continue
+            candidate = str(match.group(1) or "").strip()
+            if "*" in candidate:
+                return candidate
+            if not fallback:
+                fallback = candidate
+        return fallback

    @staticmethod
    def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
        combined_match = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or ""))
        if combined_match:
            return f"{combined_match.group(1)}车", combined_match.group(2)
+        loose_match = TRAIN_LOOSE_SEAT_PATTERN.search(str(text or ""))
+        if loose_match:
+            return f"{loose_match.group(1).zfill(2)}车", loose_match.group(2).upper()
        carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
        seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text)
        return carriage_no, seat_no
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
    @staticmethod
    def _extract_train_fare(text: str) -> str:
        match = TRAIN_FARE_PATTERN.search(str(text or ""))
+        if not match:
+            match = max(
+                list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
+                key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
+                default=None,
+            )
        if not match:
            return ""
        value = str(match.group(1) or "").replace(",", ".").strip()
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
            )
            if existing_receipt is not None:
                enriched.append(
-                    document.model_copy(
-                        update={
-                            "receipt_id": existing_receipt.id,
-                            "receipt_status": existing_receipt.status,
-                            "receipt_preview_url": existing_receipt.preview_url,
-                            "receipt_source_url": existing_receipt.source_url,
-                        }
+                    self._enrich_ocr_document_with_receipt(
+                        document,
+                        receipt=existing_receipt,
+                        current_user=current_user,
                    )
                )
                continue
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                warning = "已上传过同样的单据，请不要重复上传。"
                existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
                enriched.append(
-                    document.model_copy(
-                        update={
-                            "receipt_id": duplicate_receipt.id,
-                            "receipt_status": duplicate_receipt.status,
-                            "receipt_preview_url": duplicate_receipt.preview_url,
-                            "receipt_source_url": duplicate_receipt.source_url,
-                            "warnings": list(dict.fromkeys([*existing_warnings, warning])),
-                        }
+                    self._enrich_ocr_document_with_receipt(
+                        document,
+                        receipt=duplicate_receipt,
+                        current_user=current_user,
+                        extra_warnings=[*existing_warnings, warning],
                    )
                )
                continue
@@ -763,17 +893,78 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                current_user=current_user,
            )
            enriched.append(
-                document.model_copy(
-                    update={
-                        "receipt_id": receipt.id,
-                        "receipt_status": receipt.status,
-                        "receipt_preview_url": receipt.preview_url,
-                        "receipt_source_url": receipt.source_url,
-                    }
+                self._enrich_ocr_document_with_receipt(
+                    document,
+                    receipt=receipt,
+                    current_user=current_user,
                )
            )
        return result.model_copy(update={"documents": enriched})

+    def _enrich_ocr_document_with_receipt(
+        self,
+        document: OcrRecognizeDocumentRead,
+        *,
+        receipt: ReceiptFolderItemRead,
+        current_user: CurrentUserContext,
+        extra_warnings: list[str] | None = None,
+    ) -> OcrRecognizeDocumentRead:
+        """Return a copy of ``document`` linked to ``receipt`` and refreshed from its meta.
+
+        The receipt id, status and URLs are always copied; when the receipt meta is
+        readable its text, summary, classification and fields override the OCR values.
+        ``extra_warnings``, when given, is used instead of the document's own warnings.
+        """
+        update: dict[str, Any] = {
+            "receipt_id": receipt.id,
+            "receipt_status": receipt.status,
+            "receipt_preview_url": receipt.preview_url,
+            "receipt_source_url": receipt.source_url,
+        }
+        try:
+            meta = self._read_receipt_meta(receipt.id, current_user)
+        except FileNotFoundError:
+            meta = {}
+        if meta:
+            # Resolve the identity the way _build_item does so this document matches the
+            # receipt item; generic placeholders never replace a concrete OCR value.
+            generic = {"other", "其他单据", "其他票据"}
+            identity = {
+                key: str(getattr(document, key) or value) if value in generic else value
+                for key, value in self._resolve_receipt_document_identity(meta).items()
+            }
+            confidence = meta.get("ocr_classification_confidence") or document.classification_confidence or 0.0
+            evidence = list(meta.get("ocr_classification_evidence") or document.classification_evidence or [])
+            update.update(
+                identity,
+                text=str(meta.get("ocr_text") or document.text or ""),
+                summary=str(meta.get("summary") or document.summary or ""),
+                classification_source=str(meta.get("ocr_classification_source") or document.classification_source or ""),
+                classification_confidence=float(confidence),
+                classification_evidence=[str(value) for value in evidence if str(value).strip()],
+                # Keep the OCR fields when nothing usable could be derived from the meta.
+                document_fields=self._build_ocr_document_fields_from_meta(meta) or list(document.document_fields or []),
+            )
+
+        warnings = [
+            str(item)
+            for item in list(extra_warnings if extra_warnings is not None else document.warnings or [])
+            if str(item).strip()
+        ]
+        if warnings:
+            update["warnings"] = list(dict.fromkeys(warnings))
+        return document.model_copy(update=update)
+
+    def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
+        """Convert the receipt fields resolved from ``meta`` into OCR field models.
+
+        Fields with an empty label or an empty value are dropped.
+        """
+        converted: list[OcrRecognizeFieldRead] = []
+        for entry in self._resolve_fields(meta):
+            if entry.label and entry.value:
+                converted.append(OcrRecognizeFieldRead(key=entry.key, label=entry.label, value=entry.value))
+        return converted
+
    def save_receipt(
        self,
        *,
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
    def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
        meta = self._read_receipt_meta(receipt_id, current_user)
        receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
+        meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
        preview_name = str(meta.get("preview_file_name") or "").strip()
        if preview_name:
            preview_path = self._assert_child(receipt_dir / preview_name)
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
        if self._is_previewable(source_media_type):
            return source_path, source_media_type, source_name
        raise FileNotFoundError("Receipt preview not found")
-
--- a/server/tests/test_document_intelligence.py
+++ b/server/tests/test_document_intelligence.py
@@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
    assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)


+def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
+    """Garbled OCR text with romanized station names is still classified as a train ticket."""
+    ocr_text = (
+        ":26429165800002785705\n"
+        ":2026 05 18\n"
+        "G458\n"
+        "Wuhan\n"
+        "Shanghaihongqiao\n"
+        "2026 02 20 07:55\n"
+        "06 01B\n"
+        ": 354.00\n"
+        "4201061987****1615\n"
+        ":6580061086021391007342026\n"
+        "12306 95306"
+    )
+    insight = build_document_insight(
+        filename="2月20_武汉-上海.pdf",
+        summary=":26429165800002785705；:2026 05 18；Wuhan Shanghaihongqiao G458",
+        text=ocr_text,
+    )
+    identity = (insight.document_type, insight.document_type_label, insight.scene_code)
+    extracted = {field.label: field.value for field in insight.fields}
+    assert identity == ("train_ticket", "火车/高铁票", "travel")
+    assert extracted["金额"] == "354元"
+    assert extracted["列车出发时间"] == "2026-02-20 07:55"
+    assert extracted["车次/航班"] == "G458"
+    assert extracted["行程"] == "武汉-上海"
+
+
 def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
    insight = build_document_insight(
        filename="铁路电子客票.pdf",
--- a/server/tests/test_expense_claim_attachment_analysis_regression.py
+++ b/server/tests/test_expense_claim_attachment_analysis_regression.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import json
+from decimal import Decimal
+
+from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
+from app.services.ocr import OcrService
+from test_reimbursement_endpoints import build_client, seed_claim
+
+
+def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
+    monkeypatch,
+    tmp_path,
+) -> None:
+    """A train-ticket PDF with terse OCR output must still pass attachment analysis.
+
+    The stubbed OCR reports no lines and a zero score, yet its text is non-empty; the
+    analysis must pass without "no readable text" or "no departure time" points.
+    """
+
+    def stub_recognize_files(self, files: list[tuple[str, bytes, str | None]]) -> OcrRecognizeBatchRead:
+        recognized = OcrRecognizeDocumentRead(
+            filename="2月20_武汉-上海.pdf",
+            media_type="application/pdf",
+            text=(
+                ":26429165800002785705\n"
+                ":2026 05 18\n"
+                "G458\n"
+                "Wuhan\n"
+                "Shanghaihongqiao\n"
+                "2026 02 20 07:55\n"
+                "06 01B\n"
+                ": 354.00\n"
+                "4201061987****1615\n"
+                ":6580061086021391007342026\n"
+                "12306 95306"
+            ),
+            summary="Wuhan Shanghaihongqiao G458 354.00",
+            avg_score=0.0,
+            line_count=0,
+            page_count=1,
+            warnings=[],
+        )
+        return OcrRecognizeBatchRead(total_file_count=1, success_count=1, documents=[recognized])
+
+    monkeypatch.setattr(OcrService, "recognize_files", stub_recognize_files)
+    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
+    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
+
+    client, session_factory = build_client()
+    with session_factory() as db:
+        # Shape the seeded claim and its item as a 354.00 train trip.
+        claim, item = seed_claim(db)
+        claim.expense_type = "travel"
+        claim.reason = "武汉-上海差旅"
+        claim.location = "上海"
+        claim.amount = Decimal("354.00")
+        item.item_type = "train_ticket"
+        item.item_reason = "武汉-上海"
+        item.item_location = "上海"
+        item.item_amount = Decimal("354.00")
+        db.commit()
+        claim_id = claim.id
+        item_id = item.id
+
+    upload_response = client.post(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
+        headers=auth_headers,
+        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
+    )
+
+    assert upload_response.status_code == 200
+    attachment = upload_response.json()["attachment"]
+    analysis = attachment["analysis"]
+    points = analysis["points"]
+
+    assert attachment["document_info"]["document_type"] == "train_ticket"
+    assert analysis["severity"] == "pass"
+    assert not any("未识别到有效文字" in point for point in points)
+    assert not any("未识别到列车出发时间" in point for point in points)
+
+
+def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
+    monkeypatch,
+    tmp_path,
+) -> None:
+    """Reading attachment meta must replace a stale "unreadable" train-ticket analysis.
+
+    After the upload, the stored ``analysis`` is overwritten with a high-risk verdict;
+    the meta endpoint is then expected to answer with a passing analysis again.
+    """
+
+    def stub_recognize_files(self, files: list[tuple[str, bytes, str | None]]) -> OcrRecognizeBatchRead:
+        recognized = OcrRecognizeDocumentRead(
+            filename="2月20_武汉-上海.pdf",
+            media_type="application/pdf",
+            text=(
+                ":26429165800002785705 :2026 05 18\n"
+                "G458\n"
+                "Wuhan Shanghaihongqiao\n"
+                "2026 02 20 07:55 06 01B\n"
+                ": 354.00\n"
+                "4201061987****1615\n"
+                ":6580061086021391007342026\n"
+                "12306 95306"
+            ),
+            summary="Wuhan Shanghaihongqiao G458 354.00",
+            avg_score=0.0,
+            line_count=0,
+            page_count=1,
+            warnings=[],
+        )
+        return OcrRecognizeBatchRead(total_file_count=1, success_count=1, documents=[recognized])
+
+    monkeypatch.setattr(OcrService, "recognize_files", stub_recognize_files)
+    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
+    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
+
+    client, session_factory = build_client()
+    with session_factory() as db:
+        claim, item = seed_claim(db)
+        claim.expense_type = "travel"
+        claim.reason = "武汉-上海差旅"
+        claim.location = "上海"
+        claim.amount = Decimal("354.00")
+        item.item_type = "train_ticket"
+        item.item_reason = "武汉-上海"
+        item.item_location = "上海"
+        item.item_amount = Decimal("354.00")
+        db.commit()
+        claim_id = claim.id
+        item_id = item.id
+
+    upload_response = client.post(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
+        headers=auth_headers,
+        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
+    )
+    assert upload_response.status_code == 200
+
+    # Overwrite the persisted analysis with a stale high-risk verdict.
+    meta_path = next(tmp_path.rglob("*.meta.json"))
+    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
+    stored_meta["analysis"] = {
+        "severity": "high",
+        "label": "高风险",
+        "headline": "AI提示：附件不符合票据校验条件",
+        "summary": "当前附件存在明显异常，票据类型与当前费用场景不匹配，或无法作为有效报销材料。",
+        "points": [
+            "附件内容：未识别到有效文字，当前附件更像普通图片或内容过于模糊。",
+            "日期字段：未识别到列车出发时间或乘车日期。",
+        ],
+        "rule_basis": [],
+        "suggestion": "建议过滤当前不匹配的票据，重新上传符合当前费用场景的清晰原件。",
+    }
+    meta_path.write_text(json.dumps(stored_meta, ensure_ascii=False), encoding="utf-8")
+
+    meta_response = client.get(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta",
+        headers=auth_headers,
+    )
+
+    assert meta_response.status_code == 200
+    analysis = meta_response.json()["analysis"]
+    points = analysis["points"]
+    assert analysis["severity"] == "pass"
+    assert not any("未识别到有效文字" in point for point in points)
+    assert not any("未识别到列车出发时间" in point for point in points)
--- a/server/tests/test_ocr_endpoints.py
+++ b/server/tests/test_ocr_endpoints.py
@@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path
        assert deleted_response.status_code == 404
    finally:
        get_settings.cache_clear()
+
+
+def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None:
+    """The recognize endpoint returns train fields beyond those the OCR stub supplies.
+
+    The stubbed OCR only carries date, trip number, route and amount fields; the response
+    must additionally expose the ID number, carriage, seat and fare fields.
+    """
+
+    def stub_recognize_files(self, files: list[tuple[str, bytes, str | None]]) -> OcrRecognizeBatchRead:
+        recognized = OcrRecognizeDocumentRead(
+            filename="2月20_武汉-上海.png",
+            media_type="image/png",
+            text=(
+                ":26429165800002785705\n"
+                "G458\n"
+                "Wuhan\n"
+                "Shanghaihongqiao\n"
+                "2026 02 20 07:55\n"
+                "06 01B\n"
+                ": 354.00\n"
+                "4201061987****1615\n"
+                ":6580061086021391007342026\n"
+                "12306 95306"
+            ),
+            summary="Wuhan Shanghaihongqiao G458 354.00",
+            avg_score=0.92,
+            line_count=0,
+            page_count=1,
+            document_type="train_ticket",
+            document_type_label="火车/高铁票",
+            scene_code="travel",
+            scene_label="差旅票据",
+            document_fields=[
+                OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
+                OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
+                OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
+                OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
+            ],
+        )
+        return OcrRecognizeBatchRead(
+            engine="paddleocr_mobile",
+            model="PP-OCRv5_mobile",
+            total_file_count=1,
+            success_count=1,
+            documents=[recognized],
+        )
+
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    monkeypatch.setattr(OcrService, "recognize_files", stub_recognize_files)
+    try:
+        client = build_client()
+        response = client.post(
+            "/api/v1/ocr/recognize",
+            headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"},
+            files=[("files", ("2月20_武汉-上海.png", b"fake-image", "image/png"))],
+        )
+    finally:
+        get_settings.cache_clear()
+
+    assert response.status_code == 200
+    document = response.json()["documents"][0]
+    fields = {item["label"]: item["value"] for item in document["document_fields"]}
+    assert document["receipt_id"]
+    # None of the four labels below is present in the stubbed OCR document_fields.
+    assert fields["身份证号"] == "4201061987****1615"
+    assert fields["车厢"] == "06车"
+    assert fields["座位号"] == "01B"
+    assert fields["票价"] == "354.00元"
--- a/server/tests/test_ocr_service.py
+++ b/server/tests/test_ocr_service.py
@@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]


+def test_ocr_service_recovers_image_text_from_worker_ocr_text(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    """A worker result with ``ocr_text`` but no ``lines`` still yields text and fields.
+
+    The stubbed worker payload has an empty ``lines`` list and ``line_count`` of 0.
+    """
+
+    def stub_invoke_worker(
+        self,
+        *,
+        python_bin: str,
+        worker_path: str,
+        input_paths: list[Path],
+    ) -> dict:
+        worker_document = {
+            "input_path": str(input_paths[0]),
+            "engine": "paddleocr_mobile",
+            "model": "PP-OCRv5_mobile",
+            "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
+            "avg_score": 0.92,
+            "line_count": 0,
+            "page_count": 1,
+            "warnings": [],
+            "lines": [],
+        }
+        return {"engine": "paddleocr_mobile", "model": "PP-OCRv5_mobile", "documents": [worker_document]}
+
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
+    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
+    monkeypatch.setattr(OcrService, "_invoke_worker", stub_invoke_worker)
+    OcrService._result_cache.clear()
+    get_settings.cache_clear()
+    try:
+        result = OcrService().recognize_files([("train-ticket.png", b"fake-train-image", "image/png")])
+    finally:
+        OcrService._result_cache.clear()
+        get_settings.cache_clear()
+
+    recognized = result.documents[0]
+    assert "铁路电子客票" in recognized.text
+    assert recognized.document_type == "train_ticket"
+    assert any(field.label == "列车出发时间" and field.value == "2026-02-20 07:55" for field in recognized.document_fields)
+    assert any(field.label == "车次/航班" and field.value == "G458" for field in recognized.document_fields)
+    assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
+
+
 def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
--- a/server/tests/test_receipt_folder_service.py
+++ b/server/tests/test_receipt_folder_service.py
@@ -1,8 +1,11 @@
 from __future__ import annotations

+import base64
+
 from app.api.deps import CurrentUserContext
 from app.core.config import get_settings
 from app.schemas.ocr import OcrRecognizeDocumentRead
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.receipt_folder import ReceiptFolderService


@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
        get_settings.cache_clear()


+def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
+    """Resolving a PDF preview re-renders a cached image that lacks the renderer marker.
+
+    The receipt is saved with a preview taken from the OCR document's data URL; once
+    ``preview_rendered_with`` is removed from its meta, ``resolve_preview`` must render again.
+    """
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    try:
+        current_user = CurrentUserContext(username="pytest", name="Py Test", role_codes=[], is_admin=False)
+        service = ReceiptFolderService()
+        stale_preview = b"stale-preview"
+        encoded_preview = base64.b64encode(stale_preview).decode("ascii")
+        receipt = service.save_receipt(
+            filename="2月20_武汉-上海.pdf",
+            content=b"%PDF-1.4 fake",
+            media_type="application/pdf",
+            current_user=current_user,
+            document=OcrRecognizeDocumentRead(
+                filename="2月20_武汉-上海.pdf",
+                media_type="application/pdf",
+                preview_kind="image",
+                preview_data_url=f"data:image/png;base64,{encoded_preview}",
+            ),
+        )
+
+        receipt_dir = next(service.root.glob("pytest/*"))
+        preview_path = receipt_dir / "preview.png"
+        assert preview_path.read_bytes() == stale_preview
+        # Drop the renderer marker so the cached preview no longer counts as current.
+        stale_meta = service._read_meta(receipt_dir)
+        stale_meta.pop("preview_rendered_with", None)
+        service._write_meta(receipt_dir, stale_meta)
+
+        def render_stub(*, pdf_path, preview_path, timeout_seconds):
+            preview_path.write_bytes(b"refreshed-preview")
+            return preview_path
+
+        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", render_stub)
+        resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user)
+
+        assert resolved_path == preview_path
+        assert media_type == "image/png"
+        assert file_name == "preview.png"
+        assert preview_path.read_bytes() == b"refreshed-preview"
+        refreshed_meta = service._read_meta(receipt_dir)
+        assert refreshed_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
+    finally:
+        get_settings.cache_clear()
+
+
+def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
+    """Deleting a receipt makes the same file uploadable again without a duplicate hit.
+
+    Before deletion the file is reported as a duplicate; afterwards its directory is gone.
+    """
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    try:
+        current_user = CurrentUserContext(
+            username="pytest",
+            name="Py Test",
+            role_codes=[],
+            is_admin=False,
+        )
+        service = ReceiptFolderService()
+        content = b"%PDF-1.4 same receipt"
+
+        def find_duplicate():
+            return service.find_duplicate_receipt(
+                filename="same-receipt.pdf",
+                content=content,
+                current_user=current_user,
+            )
+
+        receipt = service.save_receipt(
+            filename="same-receipt.pdf",
+            content=content,
+            media_type="application/pdf",
+            current_user=current_user,
+            document=OcrRecognizeDocumentRead(
+                filename="same-receipt.pdf",
+                media_type="application/pdf",
+                text="same receipt amount 354",
+                document_type="other",
+                document_type_label="其他单据",
+                scene_code="other",
+                scene_label="其他票据",
+            ),
+        )
+        receipt_dir = service.root / "pytest" / receipt.id
+
+        assert receipt_dir.exists()
+        duplicate = find_duplicate()
+        assert duplicate is not None
+        assert duplicate.id == receipt.id
+
+        service.delete_receipt(receipt_id=receipt.id, current_user=current_user)
+
+        assert not receipt_dir.exists()
+        assert find_duplicate() is None
+    finally:
+        get_settings.cache_clear()
+
+
+def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
+    """A receipt saved as "other" is exposed as a train ticket with its detail fields.
+
+    The OCR document is classified "other"; the saved item and detail must read as a train ticket.
+    """
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    try:
+        current_user = CurrentUserContext(username="pytest", name="Py Test", role_codes=[], is_admin=False)
+        service = ReceiptFolderService()
+        ocr_document = OcrRecognizeDocumentRead(
+            filename="2月20_武汉-上海.pdf",
+            media_type="application/pdf",
+            text=(
+                ":26429165800002785705\n"
+                ":2026 05 18\n"
+                "G458\n"
+                "Wuhan\n"
+                "Shanghaihongqiao\n"
+                "2026 02 20 07:55\n"
+                "06 01B\n"
+                ": 354.00\n"
+                "4201061987****1615\n"
+                ":6580061086021391007342026\n"
+                "12306 95306"
+            ),
+            summary="Wuhan Shanghaihongqiao G458 354.00",
+            document_type="other",
+            document_type_label="其他单据",
+            scene_code="other",
+            scene_label="其他票据",
+        )
+        receipt = service.save_receipt(
+            filename="2月20_武汉-上海.pdf",
+            content=b"%PDF-1.4 fake",
+            media_type="application/pdf",
+            current_user=current_user,
+            document=ocr_document,
+        )
+
+        assert receipt.document_type == "train_ticket"
+        assert receipt.document_type_label == "火车/高铁票"
+        assert receipt.scene_code == "travel"
+        assert receipt.amount == "354.00元"
+        assert receipt.document_date == "2026-02-20"
+        assert receipt.merchant_name == "中国铁路"
+
+        detail = service.get_receipt(receipt.id, current_user)
+        detail_fields = {field.label: field.value for field in detail.fields}
+        assert detail_fields["行程"] == "武汉-上海"
+        assert detail_fields["车次"] == "G458"
+        assert detail_fields["列车出发时间"] == "2026-02-20 07:55"
+        assert detail_fields["票价"] == "354.00元"
+        assert detail_fields["身份证号"] == "4201061987****1615"
+        assert detail_fields["车厢"] == "06车"
+        assert detail_fields["座位号"] == "01B"
+        assert "乘车人" not in detail_fields
+    finally:
+        get_settings.cache_clear()
+
+
 def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
--- a/server/tests/test_reimbursement_endpoints.py
+++ b/server/tests/test_reimbursement_endpoints.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import base64
+import json
 from collections.abc import Generator
 from datetime import UTC, date, datetime
 from decimal import Decimal
@@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit
 from app.models.risk_observation import RiskObservation, RiskObservationFeedback
 from app.models.role import Role
 from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
 from app.services.ocr import OcrService

@@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch,
    meta_payload = upload_response.json()["attachment"]
    assert meta_payload["preview_kind"] == "image"
    assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview")
+    meta_path = next(tmp_path.rglob("invoice.pdf.meta.json"))
+    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
+    assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID

    preview_response = client.get(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",