from __future__ import annotations import base64 from app.api.deps import CurrentUserContext from app.core.config import get_settings from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead from app.services.document_preview import DocumentPreviewAssets from app.services.receipt_folder import ReceiptFolderService def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() receipt = service.save_receipt( filename="2月23_上海-武汉.pdf", content=b"%PDF-1.4 fake", media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月23_上海-武汉.pdf", media_type="application/pdf", text=( "电子发票(铁路电子客票)\n" "发票号码:26319166100006175398\n" "电子客票号:E1234567890123\n" "开票日期:2026-02-18\n" "上海虹桥站\n" "武汉站\n" "G456\n" "二等座\n" "06车01B号\n" "2026-02-20 08:30开\n" "票价:¥354.00\n" "1101011990****1234\n" "张三" ), summary="铁路电子客票,上海虹桥至武汉,票价 354 元。", document_type="train_ticket", document_type_label="火车/高铁票", scene_code="travel", scene_label="差旅票据", ), ) assert receipt.document_date == "2026-02-18" assert receipt.merchant_name == "中国铁路" assert receipt.amount == "354.00元" detail = service.get_receipt(receipt.id, current_user) fields = {field.label: field.value for field in detail.fields} assert fields["开票日期"] == "2026-02-18" assert fields["乘车人"] == "张三" assert fields["出发地点"] == "上海虹桥" assert fields["到达地点"] == "武汉" assert fields["车次"] == "G456" assert fields["电子客票号"] == "E1234567890123" assert fields["身份证号"] == "1101011990****1234" assert fields["席别"] == "二等座" assert fields["车厢"] == "06车" assert fields["座位号"] == "01B" assert fields["列车出发时间"] == "2026-02-20 08:30" finally: get_settings.cache_clear() def test_receipt_folder_pdf_save_eagerly_renders_image_preview(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): preview_path.write_bytes(b"rendered-preview") return preview_path monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) service = ReceiptFolderService() receipt = service.save_receipt( filename="2月20_武汉-上海.pdf", content=b"%PDF-1.4 fake", media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", text="铁路电子客票 武汉 上海虹桥 354.00", summary="铁路电子客票,武汉至上海虹桥。", ), ) receipt_dir = next(service.root.glob("pytest/*")) preview_path = receipt_dir / "preview.png" meta = service._read_meta(receipt_dir) assert receipt.preview_kind == "image" assert preview_path.read_bytes() == b"rendered-preview" assert meta["preview_file_name"] == "preview.png" assert meta["preview_media_type"] == "image/png" assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user) assert resolved_path == preview_path assert media_type == "image/png" assert file_name == "preview.png" finally: get_settings.cache_clear() def test_receipt_folder_persist_enriches_pdf_ocr_document_with_image_preview(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): preview_path.write_bytes(b"rendered-preview") return preview_path monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) service = ReceiptFolderService() result = service.persist_ocr_batch( files=[("2月23_上海-武汉.pdf", b"%PDF-1.4 fake", "application/pdf")], result=OcrRecognizeBatchRead( total_file_count=1, success_count=1, documents=[ OcrRecognizeDocumentRead( filename="2月23_上海-武汉.pdf", media_type="application/pdf", text="铁路电子客票 上海虹桥 武汉 G456 354.00", summary="铁路电子客票,上海虹桥至武汉。", document_type="train_ticket", document_type_label="火车/高铁票", scene_code="travel", scene_label="差旅票据", ), ], ), current_user=current_user, ) document = result.documents[0] assert document.receipt_id assert document.receipt_preview_url.endswith(f"/receipt-folder/{document.receipt_id}/preview") assert document.preview_kind == "image" finally: get_settings.cache_clear() def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) stale_preview = b"stale-preview" preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}" service = ReceiptFolderService() receipt = service.save_receipt( filename="2月20_武汉-上海.pdf", content=b"%PDF-1.4 fake", media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", preview_kind="image", preview_data_url=preview_data_url, ), ) receipt_dir = next(service.root.glob("pytest/*")) preview_path = receipt_dir / "preview.png" assert preview_path.read_bytes() == stale_preview stale_meta = service._read_meta(receipt_dir) stale_meta.pop("preview_rendered_with", None) service._write_meta(receipt_dir, stale_meta) def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): preview_path.write_bytes(b"refreshed-preview") return preview_path monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user) assert resolved_path == preview_path assert media_type == "image/png" assert file_name == "preview.png" assert preview_path.read_bytes() == b"refreshed-preview" meta = service._read_meta(receipt_dir) assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID finally: get_settings.cache_clear() def test_receipt_folder_pdf_preview_falls_back_to_source_when_render_fonts_missing( monkeypatch, tmp_path, ) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) stale_preview = b"broken-preview" preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}" service = ReceiptFolderService() receipt = service.save_receipt( filename="2月20_武汉-上海.pdf", content=b"%PDF-1.7 fake", media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", preview_kind="image", preview_data_url=preview_data_url, ), ) receipt_dir = next(service.root.glob("pytest/*")) meta = service._read_meta(receipt_dir) meta["preview_rendered_with"] = "pdftoppm-png-r160-poppler-data" service._write_meta(receipt_dir, meta) def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): raise RuntimeError("Missing language pack for 'Adobe-GB1' mapping") monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user) assert resolved_path == receipt_dir / "2月20_武汉-上海.pdf" assert media_type == "application/pdf" assert file_name == "2月20_武汉-上海.pdf" refreshed_meta = service._read_meta(receipt_dir) assert refreshed_meta["preview_kind"] == "pdf" assert refreshed_meta["preview_file_name"] == "2月20_武汉-上海.pdf" assert refreshed_meta["preview_media_type"] == "application/pdf" assert refreshed_meta["preview_rendered_with"] == "" finally: get_settings.cache_clear() def test_receipt_folder_train_ticket_extracts_passenger_from_id_line_and_purchase_name( monkeypatch, tmp_path, ) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() receipt = service.save_receipt( filename="2月20_武汉-上海.pdf", content=b"%PDF-1.4 fake", media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", text=( "电子发票(铁路电子客票)\n" "发票号码:26429165800002785705 湖北\n" "开票日期:2026年05月18日\n" "武汉站 G458 上海虹桥站\n" "Wuhan Shanghaihongqiao\n" "2026年02月20日 07:55开 06车01B号 二等座\n" "票价:¥354.00\n" "4201061987****1615 曹笑竹\n" "电子客票号:6580061086021391007342026\n" "购买方名称:曹笑竹 统一社会信用代码:\n" "买票请到12306 发货请到95306\n" "中国铁路祝您旅途愉快" ), summary="电子发票(铁路电子客票);发票监;统一 制", document_type="train_ticket", document_type_label="火车/高铁票", scene_code="travel", scene_label="差旅票据", document_fields=[ OcrRecognizeFieldRead(key="merchant_name", label="商户", value="电子发票(铁路"), OcrRecognizeFieldRead(key="amount", label="金额", value="354元"), OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"), OcrRecognizeFieldRead(key="trip_no", label="车次", value="G458"), OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"), ], ), ) assert receipt.merchant_name == "中国铁路" detail = service.get_receipt(receipt.id, current_user) fields = {field.label: field.value for field in detail.fields} assert fields["商户"] == "中国铁路" assert fields["乘车人"] == "曹笑竹" assert fields["出发地点"] == "武汉" assert fields["到达地点"] == "上海虹桥" assert fields["身份证号"] == "4201061987****1615" assert fields["电子客票号"] == "6580061086021391007342026" assert fields["开票日期"] == "2026-05-18" assert fields["列车出发时间"] == "2026-02-20 07:55" assert fields["车厢"] == "06车" assert fields["座位号"] == "01B" finally: get_settings.cache_clear() def test_receipt_folder_train_ticket_repairs_invalid_generated_fields_from_ocr_text( monkeypatch, tmp_path, ) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() receipt = service.save_receipt( filename="2月21日_上海-深圳.png", content=b"fake image", media_type="image/png", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月21日_上海-深圳.png", media_type="image/png", text=( "行程单示意\n" "出票渠道:示例平台\n" "非官方车票\n" "不可报销\n" "仅供演示\n" "创建日期:2026年02月15日\n" "订单号:DEMO202602210001\n" "单据编号:DEMO-IT-000001\n" "上海虹桥\n" "G999\n" "深圳北\n" "站\n" "站\n" "Shanghaihongqiao\n" "Shenzhenbei\n" "2026年02月21日\n" "08:30出发\n" "全程约7小时30分\n" "15:00到达\n" "DEMO\n" "乘客:示例旅客\n" "车厢:05车\n" "席别:二等座\n" "-\n" "扫码无效\n" "证件号:310101199001010000\n" "座位:08A\n" "票价:¥438.00\n" "仅为演示" ), summary="行程单示意;出票渠道:示例平台;非官方车票", document_type="train_ticket", document_type_label="火车/高铁票", scene_code="travel", scene_label="差旅票据", document_fields=[ OcrRecognizeFieldRead(key="amount", label="金额", value="438元"), OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-21 08:30"), OcrRecognizeFieldRead(key="invoice_number", label="票据号码", value="DEMO202602210001"), OcrRecognizeFieldRead(key="trip_no", label="车次", value="G999"), OcrRecognizeFieldRead(key="route", label="行程", value="上海-深圳"), OcrRecognizeFieldRead(key="departure_station", label="出发地点", value="二等座"), OcrRecognizeFieldRead(key="arrival_station", label="到达地点", value="扫码无效"), OcrRecognizeFieldRead(key="passenger_name", label="乘车人", value="席别二等座"), ], ), ) detail = service.get_receipt(receipt.id, current_user) fields = {field.label: field.value for field in detail.fields} assert fields["出发地点"] == "上海虹桥" assert fields["到达地点"] == "深圳北" assert fields["乘车人"] == "示例旅客" assert fields["身份证号"] == "310101199001010000" assert fields["席别"] == "二等座" assert fields["车厢"] == "05车" assert fields["座位号"] == "08A" assert fields["票价"] == "438.00元" finally: get_settings.cache_clear() def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() content = b"%PDF-1.4 same receipt" receipt = service.save_receipt( filename="same-receipt.pdf", content=content, media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="same-receipt.pdf", media_type="application/pdf", text="same receipt amount 354", document_type="other", document_type_label="其他单据", scene_code="other", scene_label="其他票据", ), ) receipt_dir = service.root / "pytest" / receipt.id assert receipt_dir.exists() duplicate = service.find_duplicate_receipt( filename="same-receipt.pdf", content=content, current_user=current_user, ) assert duplicate is not None assert duplicate.id == receipt.id service.delete_receipt(receipt_id=receipt.id, current_user=current_user) assert not receipt_dir.exists() assert ( service.find_duplicate_receipt( filename="same-receipt.pdf", content=content, current_user=current_user, ) is None ) finally: get_settings.cache_clear() def test_receipt_folder_duplicate_uses_newer_ocr_when_existing_meta_is_weaker(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() content = b"%PDF-1.7 same train ticket" stale_receipt = service.save_receipt( filename="2月20_武汉-上海.pdf", content=content, media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", document_type="other", document_type_label="其他单据", scene_code="other", scene_label="其他票据", warnings=["PDF 转图片失败:Missing language pack for Adobe-GB1"], ), ) result = service.persist_ocr_batch( files=[("2月20_武汉-上海.pdf", content, "application/pdf")], result=OcrRecognizeBatchRead( total_file_count=1, success_count=1, documents=[ OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", text="G458 Wuhan Shanghaihongqiao 2026 02 20 07:55 票价: 354.00 12306", summary="Wuhan Shanghaihongqiao G458 354.00", document_type="train_ticket", document_type_label="火车/高铁票", scene_code="travel", scene_label="差旅票据", document_fields=[ OcrRecognizeFieldRead(key="amount", label="金额", value="354元"), OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"), OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"), ], ), ], ), current_user=current_user, ) document = result.documents[0] assert document.receipt_id == stale_receipt.id assert document.document_type == "train_ticket" assert document.document_type_label == "火车/高铁票" assert any(field.label == "金额" and field.value == "354元" for field in document.document_fields) assert any("重复上传" in warning for warning in document.warnings) repaired = service.get_receipt(stale_receipt.id, current_user) assert repaired.document_type == "train_ticket" assert repaired.document_type_label == "火车/高铁票" assert {field.label: field.value for field in repaired.fields}["金额"] == "354元" finally: get_settings.cache_clear() def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() receipt = service.save_receipt( filename="2月20_武汉-上海.pdf", content=b"%PDF-1.4 fake", media_type="application/pdf", current_user=current_user, document=OcrRecognizeDocumentRead( filename="2月20_武汉-上海.pdf", media_type="application/pdf", text=( ":26429165800002785705\n" ":2026 05 18\n" "G458\n" "Wuhan\n" "Shanghaihongqiao\n" "2026 02 20 07:55\n" "06 01B\n" ": 354.00\n" "4201061987****1615\n" ":6580061086021391007342026\n" "12306 95306" ), summary="Wuhan Shanghaihongqiao G458 354.00", document_type="other", document_type_label="其他单据", scene_code="other", scene_label="其他票据", ), ) assert receipt.document_type == "train_ticket" assert receipt.document_type_label == "火车/高铁票" assert receipt.scene_code == "travel" assert receipt.amount == "354.00元" assert receipt.document_date == "2026-02-20" assert receipt.merchant_name == "中国铁路" detail = service.get_receipt(receipt.id, current_user) fields = {field.label: field.value for field in detail.fields} assert fields["行程"] == "武汉-上海" assert fields["车次"] == "G458" assert fields["列车出发时间"] == "2026-02-20 07:55" assert fields["票价"] == "354.00元" assert fields["身份证号"] == "4201061987****1615" assert fields["车厢"] == "06车" assert fields["座位号"] == "01B" assert "乘车人" not in fields finally: get_settings.cache_clear() def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() try: current_user = CurrentUserContext( username="pytest", name="Py Test", role_codes=[], is_admin=False, ) service = ReceiptFolderService() receipt = service.save_receipt( filename="linked-receipt.pdf", content=b"%PDF-1.4 linked", media_type="application/pdf", current_user=current_user, linked_claim_id="claim-1", linked_claim_no="RE-001", linked_item_id="item-1", document=OcrRecognizeDocumentRead( filename="linked-receipt.pdf", media_type="application/pdf", text="invoice number 123 amount 100", document_type="vat_invoice", document_type_label="invoice", scene_code="other", scene_label="receipt", ), ) linked_detail = service.get_receipt(receipt.id, current_user) assert linked_detail.status == "linked" assert linked_detail.linked_claim_id == "claim-1" assert linked_detail.linked_claim_no == "RE-001" assert service.unlink_receipts_for_claim("claim-1") == 1 unlinked_detail = service.get_receipt(receipt.id, current_user) assert unlinked_detail.status == "unlinked" assert unlinked_detail.linked_claim_id == "" assert unlinked_detail.linked_claim_no == "" assert unlinked_detail.linked_at is None finally: get_settings.cache_clear()