feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块，DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成（poppler-data 编码）、renderer_id 标识
- receipt_folder 服务复用预览生成，缓存票据资产并提供清理；删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强；ocr 抽取复用预览工具，附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头，补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试，新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions
--- a/server/tests/test_document_intelligence.py
+++ b/server/tests/test_document_intelligence.py
@@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
    assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)


+def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
+    insight = build_document_insight(
+        filename="2月20_武汉-上海.pdf",
+        summary=":26429165800002785705；:2026 05 18；Wuhan Shanghaihongqiao G458",
+        text=(
+            ":26429165800002785705\n"
+            ":2026 05 18\n"
+            "G458\n"
+            "Wuhan\n"
+            "Shanghaihongqiao\n"
+            "2026 02 20 07:55\n"
+            "06 01B\n"
+            ": 354.00\n"
+            "4201061987****1615\n"
+            ":6580061086021391007342026\n"
+            "12306 95306"
+        ),
+    )
+
+    assert insight.document_type == "train_ticket"
+    assert insight.document_type_label == "火车/高铁票"
+    assert insight.scene_code == "travel"
+    fields = {field.label: field.value for field in insight.fields}
+    assert fields["金额"] == "354元"
+    assert fields["列车出发时间"] == "2026-02-20 07:55"
+    assert fields["车次/航班"] == "G458"
+    assert fields["行程"] == "武汉-上海"
+
+
 def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
    insight = build_document_insight(
        filename="铁路电子客票.pdf",
--- a/server/tests/test_expense_claim_attachment_analysis_regression.py
+++ b/server/tests/test_expense_claim_attachment_analysis_regression.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import json
+from decimal import Decimal
+
+from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
+from app.services.ocr import OcrService
+from test_reimbursement_endpoints import build_client, seed_claim
+
+
+def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
+    monkeypatch,
+    tmp_path,
+) -> None:
+    def fake_recognize(
+        self,
+        files: list[tuple[str, bytes, str | None]],
+    ) -> OcrRecognizeBatchRead:
+        return OcrRecognizeBatchRead(
+            total_file_count=1,
+            success_count=1,
+            documents=[
+                OcrRecognizeDocumentRead(
+                    filename="2月20_武汉-上海.pdf",
+                    media_type="application/pdf",
+                    text=(
+                        ":26429165800002785705\n"
+                        ":2026 05 18\n"
+                        "G458\n"
+                        "Wuhan\n"
+                        "Shanghaihongqiao\n"
+                        "2026 02 20 07:55\n"
+                        "06 01B\n"
+                        ": 354.00\n"
+                        "4201061987****1615\n"
+                        ":6580061086021391007342026\n"
+                        "12306 95306"
+                    ),
+                    summary="Wuhan Shanghaihongqiao G458 354.00",
+                    avg_score=0.0,
+                    line_count=0,
+                    page_count=1,
+                    warnings=[],
+                )
+            ],
+        )
+
+    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
+    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
+
+    client, session_factory = build_client()
+    with session_factory() as db:
+        claim, item = seed_claim(db)
+        claim.expense_type = "travel"
+        claim.reason = "武汉-上海差旅"
+        claim.location = "上海"
+        claim.amount = Decimal("354.00")
+        item.item_type = "train_ticket"
+        item.item_reason = "武汉-上海"
+        item.item_location = "上海"
+        item.item_amount = Decimal("354.00")
+        db.commit()
+        claim_id = claim.id
+        item_id = item.id
+
+    upload_response = client.post(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
+        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
+        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
+    )
+
+    assert upload_response.status_code == 200
+    attachment = upload_response.json()["attachment"]
+    analysis = attachment["analysis"]
+    points = analysis["points"]
+
+    assert attachment["document_info"]["document_type"] == "train_ticket"
+    assert analysis["severity"] == "pass"
+    assert not any("未识别到有效文字" in point for point in points)
+    assert not any("未识别到列车出发时间" in point for point in points)
+
+
+def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
+    monkeypatch,
+    tmp_path,
+) -> None:
+    def fake_recognize(
+        self,
+        files: list[tuple[str, bytes, str | None]],
+    ) -> OcrRecognizeBatchRead:
+        return OcrRecognizeBatchRead(
+            total_file_count=1,
+            success_count=1,
+            documents=[
+                OcrRecognizeDocumentRead(
+                    filename="2月20_武汉-上海.pdf",
+                    media_type="application/pdf",
+                    text=(
+                        ":26429165800002785705 :2026 05 18\n"
+                        "G458\n"
+                        "Wuhan Shanghaihongqiao\n"
+                        "2026 02 20 07:55 06 01B\n"
+                        ": 354.00\n"
+                        "4201061987****1615\n"
+                        ":6580061086021391007342026\n"
+                        "12306 95306"
+                    ),
+                    summary="Wuhan Shanghaihongqiao G458 354.00",
+                    avg_score=0.0,
+                    line_count=0,
+                    page_count=1,
+                    warnings=[],
+                )
+            ],
+        )
+
+    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
+    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
+
+    client, session_factory = build_client()
+    with session_factory() as db:
+        claim, item = seed_claim(db)
+        claim.expense_type = "travel"
+        claim.reason = "武汉-上海差旅"
+        claim.location = "上海"
+        claim.amount = Decimal("354.00")
+        item.item_type = "train_ticket"
+        item.item_reason = "武汉-上海"
+        item.item_location = "上海"
+        item.item_amount = Decimal("354.00")
+        db.commit()
+        claim_id = claim.id
+        item_id = item.id
+
+    upload_response = client.post(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
+        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
+        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],
+    )
+    assert upload_response.status_code == 200
+
+    meta_path = next(tmp_path.rglob("*.meta.json"))
+    meta = json.loads(meta_path.read_text(encoding="utf-8"))
+    meta["analysis"] = {
+        "severity": "high",
+        "label": "高风险",
+        "headline": "AI提示：附件不符合票据校验条件",
+        "summary": "当前附件存在明显异常，票据类型与当前费用场景不匹配，或无法作为有效报销材料。",
+        "points": [
+            "附件内容：未识别到有效文字，当前附件更像普通图片或内容过于模糊。",
+            "日期字段：未识别到列车出发时间或乘车日期。",
+        ],
+        "rule_basis": [],
+        "suggestion": "建议过滤当前不匹配的票据，重新上传符合当前费用场景的清晰原件。",
+    }
+    meta_path.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8")
+
+    meta_response = client.get(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta",
+        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
+    )
+
+    assert meta_response.status_code == 200
+    analysis = meta_response.json()["analysis"]
+    points = analysis["points"]
+    assert analysis["severity"] == "pass"
+    assert not any("未识别到有效文字" in point for point in points)
+    assert not any("未识别到列车出发时间" in point for point in points)
--- a/server/tests/test_ocr_endpoints.py
+++ b/server/tests/test_ocr_endpoints.py
@@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path
        assert deleted_response.status_code == 404
    finally:
        get_settings.cache_clear()
+
+
+def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None:
+    def fake_recognize(
+        self,
+        files: list[tuple[str, bytes, str | None]],
+    ) -> OcrRecognizeBatchRead:
+        return OcrRecognizeBatchRead(
+            engine="paddleocr_mobile",
+            model="PP-OCRv5_mobile",
+            total_file_count=1,
+            success_count=1,
+            documents=[
+                OcrRecognizeDocumentRead(
+                    filename="2月20_武汉-上海.png",
+                    media_type="image/png",
+                    text=(
+                        ":26429165800002785705\n"
+                        "G458\n"
+                        "Wuhan\n"
+                        "Shanghaihongqiao\n"
+                        "2026 02 20 07:55\n"
+                        "06 01B\n"
+                        ": 354.00\n"
+                        "4201061987****1615\n"
+                        ":6580061086021391007342026\n"
+                        "12306 95306"
+                    ),
+                    summary="Wuhan Shanghaihongqiao G458 354.00",
+                    avg_score=0.92,
+                    line_count=0,
+                    page_count=1,
+                    document_type="train_ticket",
+                    document_type_label="火车/高铁票",
+                    scene_code="travel",
+                    scene_label="差旅票据",
+                    document_fields=[
+                        OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
+                        OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
+                        OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
+                        OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
+                    ],
+                )
+            ],
+        )
+
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
+    try:
+        client = build_client()
+        response = client.post(
+            "/api/v1/ocr/recognize",
+            headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"},
+            files=[("files", ("2月20_武汉-上海.png", b"fake-image", "image/png"))],
+        )
+    finally:
+        get_settings.cache_clear()
+
+    assert response.status_code == 200
+    document = response.json()["documents"][0]
+    fields = {
+        item["label"]: item["value"]
+        for item in document["document_fields"]
+    }
+    assert document["receipt_id"]
+    assert fields["身份证号"] == "4201061987****1615"
+    assert fields["车厢"] == "06车"
+    assert fields["座位号"] == "01B"
+    assert fields["票价"] == "354.00元"
--- a/server/tests/test_ocr_service.py
+++ b/server/tests/test_ocr_service.py
@@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]


+def test_ocr_service_recovers_image_text_from_worker_ocr_text(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    def fake_invoke_worker(
+        self,
+        *,
+        python_bin: str,
+        worker_path: str,
+        input_paths: list[Path],
+    ) -> dict:
+        return {
+            "engine": "paddleocr_mobile",
+            "model": "PP-OCRv5_mobile",
+            "documents": [
+                {
+                    "input_path": str(input_paths[0]),
+                    "engine": "paddleocr_mobile",
+                    "model": "PP-OCRv5_mobile",
+                    "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
+                    "avg_score": 0.92,
+                    "line_count": 0,
+                    "page_count": 1,
+                    "warnings": [],
+                    "lines": [],
+                }
+            ],
+        }
+
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
+    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
+    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
+    OcrService._result_cache.clear()
+    get_settings.cache_clear()
+    try:
+        result = OcrService().recognize_files([("train-ticket.png", b"fake-train-image", "image/png")])
+    finally:
+        OcrService._result_cache.clear()
+        get_settings.cache_clear()
+
+    recognized = result.documents[0]
+    assert "铁路电子客票" in recognized.text
+    assert recognized.document_type == "train_ticket"
+    assert any(field.label == "列车出发时间" and field.value == "2026-02-20 07:55" for field in recognized.document_fields)
+    assert any(field.label == "车次/航班" and field.value == "G458" for field in recognized.document_fields)
+    assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
+
+
 def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
--- a/server/tests/test_receipt_folder_service.py
+++ b/server/tests/test_receipt_folder_service.py
@@ -1,8 +1,11 @@
 from __future__ import annotations

+import base64
+
 from app.api.deps import CurrentUserContext
 from app.core.config import get_settings
 from app.schemas.ocr import OcrRecognizeDocumentRead
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.receipt_folder import ReceiptFolderService


@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
        get_settings.cache_clear()


+def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    try:
+        current_user = CurrentUserContext(
+            username="pytest",
+            name="Py Test",
+            role_codes=[],
+            is_admin=False,
+        )
+        stale_preview = b"stale-preview"
+        preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}"
+        service = ReceiptFolderService()
+        receipt = service.save_receipt(
+            filename="2月20_武汉-上海.pdf",
+            content=b"%PDF-1.4 fake",
+            media_type="application/pdf",
+            current_user=current_user,
+            document=OcrRecognizeDocumentRead(
+                filename="2月20_武汉-上海.pdf",
+                media_type="application/pdf",
+                preview_kind="image",
+                preview_data_url=preview_data_url,
+            ),
+        )
+
+        receipt_dir = next(service.root.glob("pytest/*"))
+        preview_path = receipt_dir / "preview.png"
+        assert preview_path.read_bytes() == stale_preview
+        stale_meta = service._read_meta(receipt_dir)
+        stale_meta.pop("preview_rendered_with", None)
+        service._write_meta(receipt_dir, stale_meta)
+
+        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
+            preview_path.write_bytes(b"refreshed-preview")
+            return preview_path
+
+        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)
+
+        resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user)
+
+        assert resolved_path == preview_path
+        assert media_type == "image/png"
+        assert file_name == "preview.png"
+        assert preview_path.read_bytes() == b"refreshed-preview"
+        meta = service._read_meta(receipt_dir)
+        assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
+    finally:
+        get_settings.cache_clear()
+
+
+def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    try:
+        current_user = CurrentUserContext(
+            username="pytest",
+            name="Py Test",
+            role_codes=[],
+            is_admin=False,
+        )
+        service = ReceiptFolderService()
+        content = b"%PDF-1.4 same receipt"
+
+        receipt = service.save_receipt(
+            filename="same-receipt.pdf",
+            content=content,
+            media_type="application/pdf",
+            current_user=current_user,
+            document=OcrRecognizeDocumentRead(
+                filename="same-receipt.pdf",
+                media_type="application/pdf",
+                text="same receipt amount 354",
+                document_type="other",
+                document_type_label="其他单据",
+                scene_code="other",
+                scene_label="其他票据",
+            ),
+        )
+        receipt_dir = service.root / "pytest" / receipt.id
+
+        assert receipt_dir.exists()
+        duplicate = service.find_duplicate_receipt(
+            filename="same-receipt.pdf",
+            content=content,
+            current_user=current_user,
+        )
+        assert duplicate is not None
+        assert duplicate.id == receipt.id
+
+        service.delete_receipt(receipt_id=receipt.id, current_user=current_user)
+
+        assert not receipt_dir.exists()
+        assert (
+            service.find_duplicate_receipt(
+                filename="same-receipt.pdf",
+                content=content,
+                current_user=current_user,
+            )
+            is None
+        )
+    finally:
+        get_settings.cache_clear()
+
+
+def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
+    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
+    get_settings.cache_clear()
+    try:
+        current_user = CurrentUserContext(
+            username="pytest",
+            name="Py Test",
+            role_codes=[],
+            is_admin=False,
+        )
+        service = ReceiptFolderService()
+        receipt = service.save_receipt(
+            filename="2月20_武汉-上海.pdf",
+            content=b"%PDF-1.4 fake",
+            media_type="application/pdf",
+            current_user=current_user,
+            document=OcrRecognizeDocumentRead(
+                filename="2月20_武汉-上海.pdf",
+                media_type="application/pdf",
+                text=(
+                    ":26429165800002785705\n"
+                    ":2026 05 18\n"
+                    "G458\n"
+                    "Wuhan\n"
+                    "Shanghaihongqiao\n"
+                    "2026 02 20 07:55\n"
+                    "06 01B\n"
+                    ": 354.00\n"
+                    "4201061987****1615\n"
+                    ":6580061086021391007342026\n"
+                    "12306 95306"
+                ),
+                summary="Wuhan Shanghaihongqiao G458 354.00",
+                document_type="other",
+                document_type_label="其他单据",
+                scene_code="other",
+                scene_label="其他票据",
+            ),
+        )
+
+        assert receipt.document_type == "train_ticket"
+        assert receipt.document_type_label == "火车/高铁票"
+        assert receipt.scene_code == "travel"
+        assert receipt.amount == "354.00元"
+        assert receipt.document_date == "2026-02-20"
+        assert receipt.merchant_name == "中国铁路"
+
+        detail = service.get_receipt(receipt.id, current_user)
+        fields = {field.label: field.value for field in detail.fields}
+        assert fields["行程"] == "武汉-上海"
+        assert fields["车次"] == "G458"
+        assert fields["列车出发时间"] == "2026-02-20 07:55"
+        assert fields["票价"] == "354.00元"
+        assert fields["身份证号"] == "4201061987****1615"
+        assert fields["车厢"] == "06车"
+        assert fields["座位号"] == "01B"
+        assert "乘车人" not in fields
+    finally:
+        get_settings.cache_clear()
+
+
 def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
--- a/server/tests/test_reimbursement_endpoints.py
+++ b/server/tests/test_reimbursement_endpoints.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import base64
+import json
 from collections.abc import Generator
 from datetime import UTC, date, datetime
 from decimal import Decimal
@@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit
 from app.models.risk_observation import RiskObservation, RiskObservationFeedback
 from app.models.role import Role
 from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
 from app.services.ocr import OcrService

@@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch,
    meta_payload = upload_response.json()["attachment"]
    assert meta_payload["preview_kind"] == "image"
    assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview")
+    meta_path = next(tmp_path.rglob("invoice.pdf.meta.json"))
+    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
+    assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID

    preview_response = client.get(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",