feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
caoxiaozhu
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions

View File

@@ -1,8 +1,11 @@
from __future__ import annotations
import base64
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeDocumentRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.receipt_folder import ReceiptFolderService
@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
get_settings.cache_clear()
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
    """A cached PDF preview without a renderer marker is re-rendered on resolve.

    The receipt is saved with an inline PNG data URL, which ends up as
    ``preview.png`` in the receipt directory.  The ``preview_rendered_with``
    key is then stripped from the stored metadata and the PDF renderer is
    stubbed; ``resolve_preview`` must overwrite the cached image and record
    ``DocumentPreviewAssets.PDF_RENDERER_ID`` in the metadata again.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        stale_bytes = b"stale-preview"
        encoded_preview = base64.b64encode(stale_bytes).decode("ascii")
        ocr_document = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            preview_kind="image",
            preview_data_url=f"data:image/png;base64,{encoded_preview}",
        )

        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=user,
            document=ocr_document,
        )

        # Saving must have materialised the supplied data URL as preview.png.
        receipt_dir = next(service.root.glob("pytest/*"))
        cached_preview = receipt_dir / "preview.png"
        assert cached_preview.read_bytes() == stale_bytes

        # Drop the renderer marker so the cached image no longer carries one.
        meta_without_marker = service._read_meta(receipt_dir)
        meta_without_marker.pop("preview_rendered_with", None)
        service._write_meta(receipt_dir, meta_without_marker)

        def _stub_render(*, pdf_path, preview_path, timeout_seconds):
            # Stand-in for the real renderer: write a recognisable payload.
            preview_path.write_bytes(b"refreshed-preview")
            return preview_path

        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", _stub_render)

        served_path, served_media_type, served_name = service.resolve_preview(receipt.id, user)

        assert served_path == cached_preview
        assert served_media_type == "image/png"
        assert served_name == "preview.png"
        assert cached_preview.read_bytes() == b"refreshed-preview"
        refreshed_meta = service._read_meta(receipt_dir)
        assert refreshed_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
    finally:
        # Keep the env-dependent settings cache from leaking into other tests.
        get_settings.cache_clear()
def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
    """Deleting a receipt also stops it from being reported as a duplicate.

    After saving, ``find_duplicate_receipt`` with the same filename and
    content returns the saved receipt.  Once the receipt is deleted, its
    directory is gone and the same lookup returns ``None``.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()
        payload = b"%PDF-1.4 same receipt"

        def _lookup_duplicate():
            # Same filename and bytes as the stored receipt.
            return service.find_duplicate_receipt(
                filename="same-receipt.pdf",
                content=payload,
                current_user=user,
            )

        ocr_document = OcrRecognizeDocumentRead(
            filename="same-receipt.pdf",
            media_type="application/pdf",
            text="same receipt amount 354",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )
        saved = service.save_receipt(
            filename="same-receipt.pdf",
            content=payload,
            media_type="application/pdf",
            current_user=user,
            document=ocr_document,
        )
        saved_dir = service.root / "pytest" / saved.id
        assert saved_dir.exists()

        match = _lookup_duplicate()
        assert match is not None
        assert match.id == saved.id

        service.delete_receipt(receipt_id=saved.id, current_user=user)

        assert not saved_dir.exists()
        assert _lookup_duplicate() is None
    finally:
        # Keep the env-dependent settings cache from leaking into other tests.
        get_settings.cache_clear()
def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
    """An OCR result labelled "other" is reclassified as a train ticket.

    The OCR text contains only English station names and bare numbers, and
    the incoming document is typed ``other``.  The saved receipt must come
    back as ``train_ticket`` with amount, date and merchant filled in, and
    the detail view must expose the individual ticket fields (without a
    passenger-name field).
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        ocr_text = (
            ":26429165800002785705\n"
            ":2026 05 18\n"
            "G458\n"
            "Wuhan\n"
            "Shanghaihongqiao\n"
            "2026 02 20 07:55\n"
            "06 01B\n"
            ": 354.00\n"
            "4201061987****1615\n"
            ":6580061086021391007342026\n"
            "12306 95306"
        )
        ocr_document = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )

        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=user,
            document=ocr_document,
        )

        # Summary-level attributes on the saved receipt.
        assert receipt.document_type == "train_ticket"
        assert receipt.document_type_label == "火车/高铁票"
        assert receipt.scene_code == "travel"
        assert receipt.amount == "354.00元"
        assert receipt.document_date == "2026-02-20"
        assert receipt.merchant_name == "中国铁路"

        # Index the detail fields by label for direct lookup.
        detail = service.get_receipt(receipt.id, user)
        fields_by_label = {}
        for entry in detail.fields:
            fields_by_label[entry.label] = entry.value

        expected_fields = {
            "行程": "武汉-上海",
            "车次": "G458",
            "列车出发时间": "2026-02-20 07:55",
            "票价": "354.00元",
            "身份证号": "4201061987****1615",
            "车厢": "06车",
            "座位号": "01B",
        }
        for label, expected_value in expected_fields.items():
            assert fields_by_label[label] == expected_value
        assert "乘车人" not in fields_by_label
    finally:
        # Keep the env-dependent settings cache from leaking into other tests.
        get_settings.cache_clear()
def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()