feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块，DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成（poppler-data 编码）、renderer_id 标识
- receipt_folder 服务复用预览生成，缓存票据资产并提供清理；删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强；ocr 抽取复用预览工具，附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头，补充/扩展 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试，新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions
--- a/server/tests/test_expense_claim_attachment_analysis_regression.py
+++ b/server/tests/test_expense_claim_attachment_analysis_regression.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import json
+from decimal import Decimal
+
+from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
+from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
+from app.services.ocr import OcrService
+from test_reimbursement_endpoints import build_client, seed_claim
+
+
+def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
+    monkeypatch,
+    tmp_path,
+) -> None:  # Regression: uploading a train-ticket PDF whose stubbed OCR text holds structured fields must yield a "pass" analysis.
+    def fake_recognize(  # Replacement for OcrService.recognize_files; ignores its input and returns one canned document.
+        self,
+        files: list[tuple[str, bytes, str | None]],
+    ) -> OcrRecognizeBatchRead:
+        return OcrRecognizeBatchRead(
+            total_file_count=1,
+            success_count=1,
+            documents=[
+                OcrRecognizeDocumentRead(
+                    filename="2月20_武汉-上海.pdf",
+                    media_type="application/pdf",
+                    text=(  # canned OCR text, one field per line
+                        ":26429165800002785705\n"
+                        ":2026 05 18\n"
+                        "G458\n"
+                        "Wuhan\n"
+                        "Shanghaihongqiao\n"
+                        "2026 02 20 07:55\n"
+                        "06 01B\n"
+                        ": 354.00\n"
+                        "4201061987****1615\n"
+                        ":6580061086021391007342026\n"
+                        "12306 95306"
+                    ),
+                    summary="Wuhan Shanghaihongqiao G458 354.00",
+                    avg_score=0.0,  # score and line count are both zero even though text is non-empty
+                    line_count=0,
+                    page_count=1,
+                    warnings=[],
+                )
+            ],
+        )
+
+    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)  # swap the class method for the stub
+    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)  # make root() return the per-test tmp_path
+
+    client, session_factory = build_client()
+    with session_factory() as db:
+        claim, item = seed_claim(db)
+        claim.expense_type = "travel"  # shape the seeded claim and item as a 354.00 train-ticket travel expense
+        claim.reason = "武汉-上海差旅"
+        claim.location = "上海"
+        claim.amount = Decimal("354.00")
+        item.item_type = "train_ticket"
+        item.item_reason = "武汉-上海"
+        item.item_location = "上海"
+        item.item_amount = Decimal("354.00")
+        db.commit()
+        claim_id = claim.id  # keep the ids for use after the session block ends
+        item_id = item.id
+
+    upload_response = client.post(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
+        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
+        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],  # placeholder bytes, not a real PDF document
+    )
+
+    assert upload_response.status_code == 200
+    attachment = upload_response.json()["attachment"]
+    analysis = attachment["analysis"]
+    points = analysis["points"]
+
+    assert attachment["document_info"]["document_type"] == "train_ticket"
+    assert analysis["severity"] == "pass"
+    assert not any("未识别到有效文字" in point for point in points)  # no "no valid text recognized" point
+    assert not any("未识别到列车出发时间" in point for point in points)  # no "train departure time not recognized" point
+
+
+def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
+    monkeypatch,
+    tmp_path,
+) -> None:  # Regression: reading attachment meta after the stored analysis was overwritten with a stale "unreadable" verdict must return "pass".
+    def fake_recognize(  # Replacement for OcrService.recognize_files; ignores its input and returns one canned document.
+        self,
+        files: list[tuple[str, bytes, str | None]],
+    ) -> OcrRecognizeBatchRead:
+        return OcrRecognizeBatchRead(
+            total_file_count=1,
+            success_count=1,
+            documents=[
+                OcrRecognizeDocumentRead(
+                    filename="2月20_武汉-上海.pdf",
+                    media_type="application/pdf",
+                    text=(  # canned OCR text with several fields sharing a line
+                        ":26429165800002785705 :2026 05 18\n"
+                        "G458\n"
+                        "Wuhan Shanghaihongqiao\n"
+                        "2026 02 20 07:55 06 01B\n"
+                        ": 354.00\n"
+                        "4201061987****1615\n"
+                        ":6580061086021391007342026\n"
+                        "12306 95306"
+                    ),
+                    summary="Wuhan Shanghaihongqiao G458 354.00",
+                    avg_score=0.0,  # score and line count are both zero even though text is non-empty
+                    line_count=0,
+                    page_count=1,
+                    warnings=[],
+                )
+            ],
+        )
+
+    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)  # swap the class method for the stub
+    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)  # make root() return the per-test tmp_path
+
+    client, session_factory = build_client()
+    with session_factory() as db:
+        claim, item = seed_claim(db)
+        claim.expense_type = "travel"  # shape the seeded claim and item as a 354.00 train-ticket travel expense
+        claim.reason = "武汉-上海差旅"
+        claim.location = "上海"
+        claim.amount = Decimal("354.00")
+        item.item_type = "train_ticket"
+        item.item_reason = "武汉-上海"
+        item.item_location = "上海"
+        item.item_amount = Decimal("354.00")
+        db.commit()
+        claim_id = claim.id  # keep the ids for use after the session block ends
+        item_id = item.id
+
+    upload_response = client.post(
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
+        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
+        files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))],  # placeholder bytes, not a real PDF document
+    )
+    assert upload_response.status_code == 200
+
+    meta_path = next(tmp_path.rglob("*.meta.json"))  # first *.meta.json found under tmp_path; raises StopIteration if none exists
+    meta = json.loads(meta_path.read_text(encoding="utf-8"))
+    meta["analysis"] = {  # overwrite the stored analysis with a high-severity "unreadable" verdict
+        "severity": "high",
+        "label": "高风险",
+        "headline": "AI提示：附件不符合票据校验条件",
+        "summary": "当前附件存在明显异常，票据类型与当前费用场景不匹配，或无法作为有效报销材料。",
+        "points": [
+            "附件内容：未识别到有效文字，当前附件更像普通图片或内容过于模糊。",
+            "日期字段：未识别到列车出发时间或乘车日期。",
+        ],
+        "rule_basis": [],
+        "suggestion": "建议过滤当前不匹配的票据，重新上传符合当前费用场景的清晰原件。",
+    }
+    meta_path.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8")  # write the tampered metadata back to the same file
+
+    meta_response = client.get(  # read the metadata back through the API
+        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta",
+        headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"},
+    )
+
+    assert meta_response.status_code == 200
+    analysis = meta_response.json()["analysis"]
+    points = analysis["points"]
+    assert analysis["severity"] == "pass"  # the injected "high" severity must not be returned
+    assert not any("未识别到有效文字" in point for point in points)  # no "no valid text recognized" point
+    assert not any("未识别到列车出发时间" in point for point in points)  # no "train departure time not recognized" point