feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块，DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成（poppler-data 编码）、renderer_id 标识
- receipt_folder 服务复用预览生成，缓存票据资产并提供清理；删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强；ocr 抽取复用预览工具，附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头；补充/扩展 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试，新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions
--- a/server/src/app/services/receipt_folder.py
+++ b/server/src/app/services/receipt_folder.py
@@ -12,7 +12,7 @@ from uuid import uuid4

 from app.api.deps import CurrentUserContext
 from app.core.config import get_settings
-from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
 from app.schemas.receipt_folder import (
    ReceiptFolderDeleteResponse,
    ReceiptFolderDetailRead,
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
    ReceiptFolderItemRead,
    ReceiptFolderUpdate,
 )
-from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
+from app.services.document_preview import DocumentPreviewAssets
+from app.services.document_intelligence import build_document_insight
 from app.services.ocr import SUPPORTED_SUFFIXES

 RECEIPT_DATE_PATTERN = re.compile(
-    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
+    r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
+    r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
 )
 RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
 TRAIN_INVOICE_DATE_PATTERN = re.compile(
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
 TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:：]?\s*([0-9]{1,2}\s*车?)")
 TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:：]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
 TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
+TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
 TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[:：￥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
+TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")


 class ReceiptFolderStorageMixin:
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
        document: Any | None,
    ) -> dict[str, Any]:
        preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
-        decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
-        if decoded is not None:
-            preview_media_type, preview_content = decoded
-            suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
-            preview_name = f"preview{suffix}"
-            preview_path = receipt_dir / preview_name
-            preview_path.write_bytes(preview_content)
+        preview_asset = DocumentPreviewAssets.write_data_url_preview(
+            preview_dir=receipt_dir,
+            preview_name_stem="preview",
+            preview_data_url=preview_data_url,
+        )
+        if preview_asset is not None:
+            _, preview_media_type, preview_name = preview_asset
            return {
                "previewable": True,
                "preview_kind": "image",
                "preview_file_name": preview_name,
                "preview_media_type": preview_media_type,
+                "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
            }
        if self._is_previewable(media_type):
            return {
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
                "preview_kind": "image" if media_type.startswith("image/") else "pdf",
                "preview_file_name": source_path.name,
                "preview_media_type": media_type,
+                "preview_rendered_with": "",
            }
        return {
            "previewable": False,
            "preview_kind": "",
            "preview_file_name": "",
            "preview_media_type": "",
+            "preview_rendered_with": "",
        }

+    def _refresh_pdf_preview_asset_if_needed(
+        self,
+        *,
+        receipt_dir: Path,
+        meta: dict[str, Any],
+    ) -> dict[str, Any]:
+        source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
+        if not source_name:
+            return meta
+
+        source_path = self._assert_child(receipt_dir / source_name)
+        source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
+        if source_media_type != "application/pdf" or not source_path.exists():
+            return meta
+
+        preview_name = str(meta.get("preview_file_name") or "").strip()
+        preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
+        if (
+            preview_path is not None
+            and preview_path.exists()
+            and str(meta.get("preview_kind") or "").strip() == "image"
+            and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
+            and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
+        ):
+            return meta
+
+        if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
+            preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
+        preview_path = self._assert_child(receipt_dir / preview_name)
+
+        try:
+            DocumentPreviewAssets.render_pdf_first_page(
+                pdf_path=source_path,
+                preview_path=preview_path,
+                timeout_seconds=get_settings().ocr_timeout_seconds,
+            )
+        except Exception:
+            return meta
+
+        meta.update(
+            {
+                "previewable": True,
+                "preview_kind": "image",
+                "preview_file_name": preview_path.name,
+                "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
+                "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
+            }
+        )
+        self._write_meta(receipt_dir, meta)
+        return meta
+
    @staticmethod
    def _is_previewable(media_type: str) -> bool:
        return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
    def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
        receipt_id = str(meta.get("id") or "").strip()
        status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
+        identity = self._resolve_receipt_document_identity(meta)
        return ReceiptFolderItemRead(
            id=receipt_id,
            file_name=str(meta.get("file_name") or ""),
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
            size_bytes=int(meta.get("size_bytes") or 0),
            status=status_value,
            status_label="已关联" if status_value == "linked" else "未关联",
-            document_type=str(meta.get("document_type") or "other"),
-            document_type_label=str(meta.get("document_type_label") or "其他单据"),
-            scene_code=str(meta.get("scene_code") or "other"),
-            scene_label=str(meta.get("scene_label") or "其他票据"),
+            document_type=identity["document_type"],
+            document_type_label=identity["document_type_label"],
+            scene_code=identity["scene_code"],
+            scene_label=identity["scene_label"],
            summary=str(meta.get("summary") or ""),
            amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
            document_date=self._resolve_receipt_document_date(meta),
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
            warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
        )

+    def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
+        document_type = str(meta.get("document_type") or "other").strip() or "other"
+        document_type_label = str(meta.get("document_type_label") or "其他单据").strip() or "其他单据"
+        scene_code = str(meta.get("scene_code") or "other").strip() or "other"
+        scene_label = str(meta.get("scene_label") or "其他票据").strip() or "其他票据"
+        if document_type not in {"", "other"} and document_type_label != "其他单据":
+            return {
+                "document_type": document_type,
+                "document_type_label": document_type_label,
+                "scene_code": scene_code,
+                "scene_label": scene_label,
+            }
+
+        insight = build_document_insight(
+            filename=str(meta.get("file_name") or ""),
+            summary=str(meta.get("summary") or ""),
+            text=self._receipt_text(meta),
+        )
+        if insight.document_type in {"", "other"}:
+            return {
+                "document_type": document_type,
+                "document_type_label": document_type_label,
+                "scene_code": scene_code,
+                "scene_label": scene_label,
+            }
+        return {
+            "document_type": insight.document_type,
+            "document_type_label": insight.document_type_label,
+            "scene_code": insight.scene_code,
+            "scene_label": insight.scene_label,
+        }
+
    def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
        fields = [
            ReceiptFolderFieldRead(
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
        if str(document_type or "").strip().lower() == "train_ticket":
            return True
        compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
-        return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
+        if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
+            return True
+        lower_compact = compact.lower()
+        return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
+            "12306" in compact
+            or "95306" in compact
+            or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact)
+            or ("wuhan" in lower_compact and "shanghai" in lower_compact)
+        )

    @classmethod
    def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
            return raw
        normalized = match.group(1).replace("年", "-").replace("月", "-").replace("日", "")
        normalized = normalized.replace("/", "-").replace(".", "-")
+        normalized = re.sub(r"\s+", "-", normalized)
        parts = [part for part in normalized.split("-") if part]
        if len(parts) != 3:
            return match.group(1)
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
        cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
        if not 2 <= len(cleaned) <= 8:
            return ""
-        if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
+        if any(
+            token in cleaned
+            for token in (
+                "电子",
+                "客票",
+                "铁路",
+                "发票",
+                "税务",
+                "湖北省",
+                "中国铁路",
+                "开票",
+                "日期",
+                "车厢",
+                "座位",
+                "票价",
+                "金额",
+                "行程",
+                "出发",
+                "到达",
+                "车次",
+            )
+        ):
            return ""
        return cleaned

@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
        labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
        if labeled:
            return labeled
+        fallback = ""
        for line in str(text or "").replace("\r", "\n").splitlines():
            compact_line = line.replace(" ", "")
            if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
                continue
            match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
-            if match:
-                return str(match.group(1) or "").strip()
-        return ""
+            if not match:
+                continue
+            candidate = str(match.group(1) or "").strip()
+            if "*" in candidate:
+                return candidate
+            if not fallback:
+                fallback = candidate
+        return fallback

    @staticmethod
    def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
        combined_match = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or ""))
        if combined_match:
            return f"{combined_match.group(1)}车", combined_match.group(2)
+        loose_match = TRAIN_LOOSE_SEAT_PATTERN.search(str(text or ""))
+        if loose_match:
+            return f"{loose_match.group(1).zfill(2)}车", loose_match.group(2).upper()
        carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
        seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text)
        return carriage_no, seat_no
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
    @staticmethod
    def _extract_train_fare(text: str) -> str:
        match = TRAIN_FARE_PATTERN.search(str(text or ""))
+        if not match:
+            match = max(
+                list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
+                key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
+                default=None,
+            )
        if not match:
            return ""
        value = str(match.group(1) or "").replace(",", ".").strip()
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
            )
            if existing_receipt is not None:
                enriched.append(
-                    document.model_copy(
-                        update={
-                            "receipt_id": existing_receipt.id,
-                            "receipt_status": existing_receipt.status,
-                            "receipt_preview_url": existing_receipt.preview_url,
-                            "receipt_source_url": existing_receipt.source_url,
-                        }
+                    self._enrich_ocr_document_with_receipt(
+                        document,
+                        receipt=existing_receipt,
+                        current_user=current_user,
                    )
                )
                continue
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                warning = "已上传过同样的单据，请不要重复上传。"
                existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
                enriched.append(
-                    document.model_copy(
-                        update={
-                            "receipt_id": duplicate_receipt.id,
-                            "receipt_status": duplicate_receipt.status,
-                            "receipt_preview_url": duplicate_receipt.preview_url,
-                            "receipt_source_url": duplicate_receipt.source_url,
-                            "warnings": list(dict.fromkeys([*existing_warnings, warning])),
-                        }
+                    self._enrich_ocr_document_with_receipt(
+                        document,
+                        receipt=duplicate_receipt,
+                        current_user=current_user,
+                        extra_warnings=[*existing_warnings, warning],
                    )
                )
                continue
@@ -763,17 +893,78 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                current_user=current_user,
            )
            enriched.append(
-                document.model_copy(
-                    update={
-                        "receipt_id": receipt.id,
-                        "receipt_status": receipt.status,
-                        "receipt_preview_url": receipt.preview_url,
-                        "receipt_source_url": receipt.source_url,
-                    }
+                self._enrich_ocr_document_with_receipt(
+                    document,
+                    receipt=receipt,
+                    current_user=current_user,
                )
            )
        return result.model_copy(update={"documents": enriched})

+    def _enrich_ocr_document_with_receipt(
+        self,
+        document: OcrRecognizeDocumentRead,
+        *,
+        receipt: ReceiptFolderItemRead,
+        current_user: CurrentUserContext,
+        extra_warnings: list[str] | None = None,
+    ) -> OcrRecognizeDocumentRead:
+        update: dict[str, Any] = {
+            "receipt_id": receipt.id,
+            "receipt_status": receipt.status,
+            "receipt_preview_url": receipt.preview_url,
+            "receipt_source_url": receipt.source_url,
+        }
+
+        try:
+            meta = self._read_receipt_meta(receipt.id, current_user)
+        except FileNotFoundError:
+            meta = {}
+
+        if meta:
+            update.update(
+                {
+                    "text": str(meta.get("ocr_text") or document.text or ""),
+                    "summary": str(meta.get("summary") or document.summary or ""),
+                    "document_type": str(meta.get("document_type") or document.document_type or "other"),
+                    "document_type_label": str(meta.get("document_type_label") or document.document_type_label or "其他单据"),
+                    "scene_code": str(meta.get("scene_code") or document.scene_code or "other"),
+                    "scene_label": str(meta.get("scene_label") or document.scene_label or "其他票据"),
+                    "classification_source": str(meta.get("ocr_classification_source") or document.classification_source or ""),
+                    "classification_confidence": float(
+                        meta.get("ocr_classification_confidence")
+                        or document.classification_confidence
+                        or 0.0
+                    ),
+                    "classification_evidence": [
+                        str(value)
+                        for value in list(meta.get("ocr_classification_evidence") or document.classification_evidence or [])
+                        if str(value).strip()
+                    ],
+                    "document_fields": self._build_ocr_document_fields_from_meta(meta),
+                }
+            )
+
+        warnings = [
+            str(item)
+            for item in list(extra_warnings if extra_warnings is not None else document.warnings or [])
+            if str(item).strip()
+        ]
+        if warnings:
+            update["warnings"] = list(dict.fromkeys(warnings))
+        return document.model_copy(update=update)
+
+    def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
+        return [
+            OcrRecognizeFieldRead(
+                key=field.key,
+                label=field.label,
+                value=field.value,
+            )
+            for field in self._resolve_fields(meta)
+            if field.label and field.value
+        ]
+
    def save_receipt(
        self,
        *,
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
    def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
        meta = self._read_receipt_meta(receipt_id, current_user)
        receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
+        meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
        preview_name = str(meta.get("preview_file_name") or "").strip()
        if preview_name:
            preview_path = self._assert_child(receipt_dir / preview_name)
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
        if self._is_previewable(source_media_type):
            return source_path, source_media_type, source_name
        raise FileNotFoundError("Receipt preview not found")
-