From 84a8998e59d0adef4e325116100522d5452ea52e Mon Sep 17 00:00:00 2001 From: caoxiaozhu Date: Tue, 23 Jun 2026 09:42:00 +0800 Subject: [PATCH] =?UTF-8?q?feat(server):=20=E7=A5=A8=E6=8D=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=A4=B9=E8=B5=84=E4=BA=A7=E7=BC=93=E5=AD=98=E4=B8=8E?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E9=A2=84=E8=A7=88=E7=BB=9F=E4=B8=80=E7=94=9F?= =?UTF-8?q?=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识 - receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本 - document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配 - receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试 --- .../app/api/v1/endpoints/receipt_folder.py | 2 +- .../src/app/services/document_intelligence.py | 72 ++++- server/src/app/services/document_preview.py | 98 +++++++ .../expense_claim_attachment_analysis.py | 38 ++- .../expense_claim_attachment_document.py | 69 +++++ .../expense_claim_attachment_operations.py | 60 ++++ .../expense_claim_attachment_presentation.py | 32 +- server/src/app/services/ocr.py | 18 +- server/src/app/services/receipt_folder.py | 275 +++++++++++++++--- server/tests/test_document_intelligence.py | 29 ++ ...se_claim_attachment_analysis_regression.py | 169 +++++++++++ server/tests/test_ocr_endpoints.py | 70 +++++ server/tests/test_ocr_service.py | 49 ++++ server/tests/test_receipt_folder_service.py | 169 +++++++++++ server/tests/test_reimbursement_endpoints.py | 5 + 15 files changed, 1076 insertions(+), 79 deletions(-) create mode 100644 server/src/app/services/document_preview.py create mode 100644 server/tests/test_expense_claim_attachment_analysis_regression.py diff --git a/server/src/app/api/v1/endpoints/receipt_folder.py b/server/src/app/api/v1/endpoints/receipt_folder.py index fd5ee7b..624d0c7 100644 --- a/server/src/app/api/v1/endpoints/receipt_folder.py +++ b/server/src/app/api/v1/endpoints/receipt_folder.py @@ -92,7 +92,7 @@ def preview_receipt(receipt_id: str, current_user: CurrentUser) -> FileResponse: file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user) except FileNotFoundError as exc: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc - return FileResponse(file_path, media_type=media_type, filename=file_name) + return FileResponse(file_path, media_type=media_type, filename=file_name, headers={"Cache-Control": "no-store"}) @router.get( diff --git a/server/src/app/services/document_intelligence.py b/server/src/app/services/document_intelligence.py index 440817d..0250ce5 100644 --- a/server/src/app/services/document_intelligence.py +++ b/server/src/app/services/document_intelligence.py @@ -25,11 +25,15 @@ AMOUNT_PATTERNS = ( re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"), re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"), ) -DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)") +DATE_PATTERN = re.compile( + r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])" + r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)" +) TIME_PATTERN = re.compile(r"(?|-)\s*([\u4e00-\u9fa5]{2,12})") MERCHANT_PATTERNS = ( re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[::\s]*([A-Za-z0-9\u4e00-\u9fa5()()·&\\-]{2,40})"), @@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch: best_score = score if best_score <= 0: + train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket") + if train_rule and _looks_like_train_ticket(compact_text): + return RuleMatch( + rule=train_rule, + confidence=0.82, + evidence=("车次", "12306"), + score=3.8, + ) return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0) confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12) @@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch: ) +def _looks_like_train_ticket(compact_text: str) -> bool: + text = str(compact_text or "").lower() + if not re.search(r"[gcdzktlys]\d{1,5}", text, flags=re.IGNORECASE): + return False + if "12306" in text or "95306" in text: + return True + if re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", text): + return True + return "wuhan" in text and "shanghai" in text + + def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None: if not response_text: return None @@ -521,33 +544,48 @@ def _merge_document_fields( def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]: fields: list[DocumentField] = [] + normalized_type = str(document_type or "").strip().lower() + + def append_field(key: str, label: str, value: str) -> None: + cleaned = _clean_field_value(value) + if not cleaned: + return + if any(field.key == key for field in fields if field.key): + return + fields.append(DocumentField(key=key, label=label, value=cleaned)) + amount = _extract_amount(text) if amount: - fields.append(DocumentField(key="amount", label="金额", value=amount)) + append_field("amount", "金额", amount) date_value = _extract_date(text, document_type=document_type) if date_value: - fields.append(DocumentField(key="date", label="日期", value=date_value)) + append_field("date", "日期", date_value) merchant = _extract_merchant(text) if merchant: - fields.append(DocumentField(key="merchant_name", label="商户", value=merchant)) + append_field("merchant_name", "商户", merchant) invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text) if invoice_number: - fields.append(DocumentField(key="invoice_number", label="票据号码", value=invoice_number)) + append_field("invoice_number", "票据号码", invoice_number) invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text) if invoice_code: - fields.append(DocumentField(key="invoice_code", label="发票代码", value=invoice_code)) + append_field("invoice_code", "发票代码", invoice_code) trip_no = _extract_pattern(TRIP_NO_PATTERN, text) + if not trip_no and normalized_type == "train_ticket": + trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text) if trip_no: - fields.append(DocumentField(key="trip_no", label="车次/航班", value=trip_no)) + append_field("trip_no", "车次/航班", trip_no.upper()) route = _extract_route(text) if route: - fields.append(DocumentField(key="route", label="行程", value=route)) + append_field("route", "行程", route) + + if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields): + append_field("amount", "金额", _extract_loose_decimal_amount(text)) return fields @@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str: raw_value = str(match.group(1) or "").strip() normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "") normalized = normalized.replace("/", "-").replace(".", "-") + normalized = re.sub(r"\s+", "-", normalized) parts = [part for part in normalized.split("-") if part] if len(parts) != 3: return raw_value @@ -703,6 +742,23 @@ def _extract_route(text: str) -> str: return f"{start}-{end}" +def _extract_loose_decimal_amount(text: str) -> str: + best_value: Decimal | None = None + for match in re.finditer(r"(? best_value: + best_value = candidate + if best_value is None: + return "" + text_value = format(best_value, "f").rstrip("0").rstrip(".") + return f"{text_value}元" + + def _extract_pattern(pattern: re.Pattern[str], text: str) -> str: match = pattern.search(text) if not match: diff --git a/server/src/app/services/document_preview.py b/server/src/app/services/document_preview.py new file mode 100644 index 0000000..d4589ef --- /dev/null +++ b/server/src/app/services/document_preview.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import base64 +import binascii +import mimetypes +import re +import shutil +import subprocess +import tempfile +from pathlib import Path + + +class DocumentPreviewAssets: + PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data" + PDF_PREVIEW_MEDIA_TYPE = "image/png" + PDF_PREVIEW_SUFFIX = ".png" + + @staticmethod + def decode_data_url(payload: str) -> tuple[str, bytes] | None: + normalized = str(payload or "").strip() + matched = re.match( + r"^data:(?P[\w.+-]+/[\w.+-]+);base64,(?P.+)$", + normalized, + flags=re.DOTALL, + ) + if not matched: + return None + try: + content = base64.b64decode(matched.group("body"), validate=True) + except (binascii.Error, ValueError): + return None + return matched.group("media"), content + + @classmethod + def renderer_id_for_source(cls, media_type: str | None) -> str: + return cls.PDF_RENDERER_ID if str(media_type or "").strip() == "application/pdf" else "" + + @classmethod + def write_data_url_preview( + cls, + *, + preview_dir: Path, + preview_name_stem: str, + preview_data_url: str, + ) -> tuple[Path, str, str] | None: + decoded = cls.decode_data_url(preview_data_url) + if decoded is None: + return None + + preview_media_type, preview_content = decoded + suffix = mimetypes.guess_extension(preview_media_type) or ".bin" + preview_name = f"{Path(preview_name_stem).stem}{suffix}" + preview_path = preview_dir / preview_name + preview_path.write_bytes(preview_content) + return preview_path, preview_media_type, preview_name + + @classmethod + def render_pdf_first_page( + cls, + *, + pdf_path: Path, + preview_path: Path, + timeout_seconds: int | float, + ) -> Path: + preview_path.parent.mkdir(parents=True, exist_ok=True) + with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir: + prefix = Path(temp_dir) / "page" + completed = subprocess.run( + [ + "pdftoppm", + "-png", + "-r", + "160", + str(pdf_path), + str(prefix), + ], + capture_output=True, + text=True, + timeout=timeout_seconds, + check=False, + ) + if completed.returncode != 0: + detail = (completed.stderr or completed.stdout or "").strip() + raise RuntimeError(detail or "pdftoppm failed to render PDF preview.") + + pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key) + if not pages: + raise RuntimeError("pdftoppm did not generate a preview image.") + shutil.copyfile(pages[0], preview_path) + return preview_path + + @staticmethod + def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]: + suffix = path.stem.rsplit("-", 1)[-1] + try: + return int(suffix), path.name + except ValueError: + return 0, path.name diff --git a/server/src/app/services/expense_claim_attachment_analysis.py b/server/src/app/services/expense_claim_attachment_analysis.py index 216fad8..82e749d 100644 --- a/server/src/app/services/expense_claim_attachment_analysis.py +++ b/server/src/app/services/expense_claim_attachment_analysis.py @@ -336,7 +336,27 @@ class ExpenseClaimAttachmentAnalysisMixin: @staticmethod def _has_date_like_text(text: str) -> bool: - return bool(re.search(r"(20\d{2}[年/\-.]\d{1,2}[月/\-.]\d{1,2}日?)", text)) + return bool(re.search(r"(20\d{2}(?:[年/\-.]|\s+)\d{1,2}(?:[月/\-.]|\s+)\d{1,2}日?)", text)) + + @staticmethod + def _has_document_date_field(document_info: dict[str, Any]) -> bool: + date_keys = DOCUMENT_TRIP_DATE_KEYS | DOCUMENT_GENERIC_DATE_KEYS | DOCUMENT_INVOICE_DATE_KEYS + date_label_tokens = ( + *DOCUMENT_TRIP_DATE_LABEL_TOKENS, + *DOCUMENT_GENERIC_DATE_LABEL_TOKENS, + *DOCUMENT_INVOICE_DATE_LABEL_TOKENS, + ) + for field in list(document_info.get("fields") or []): + if not isinstance(field, dict): + continue + value = str(field.get("value") or "").strip() + if not value: + continue + key = str(field.get("key") or "").strip().lower().replace("_", "") + label = str(field.get("label") or "").replace(" ", "") + if key in date_keys or any(token in label for token in date_label_tokens): + return True + return False @staticmethod def _normalize_match_text(text: str) -> str: @@ -538,6 +558,12 @@ class ExpenseClaimAttachmentAnalysisMixin: recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据" requirement_matches = bool(requirement_check.get("matches")) mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high" + document_fields = [ + field + for field in list(document_info.get("fields") or []) + if isinstance(field, dict) and str(field.get("value") or "").strip() + ] + has_readable_content = bool(line_count > 0 or compact_text or document_fields) has_ticket_keyword = any( keyword in compact_text @@ -556,15 +582,18 @@ class ExpenseClaimAttachmentAnalysisMixin: ) ) amount_candidates = self._extract_amount_candidates(text) + field_amount = self._resolve_document_field_amount({"document_fields": document_fields}) + if field_amount is not None and field_amount not in amount_candidates: + amount_candidates.insert(0, field_amount) item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01")) has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates) - has_date_text = self._has_date_like_text(text) + has_date_text = self._has_date_like_text(text) or self._has_document_date_field(document_info) amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount points: list[str] = [] if warnings: points.append(f"识别提示:{warnings[0]}") - if line_count == 0 or not compact_text: + if not has_readable_content: points.append("附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。") if recognized_document_type == "other" and not has_ticket_keyword: points.append("票据类型:未识别到发票、票据、电子行程单等关键字,暂无法判断票据类型。") @@ -617,8 +646,7 @@ class ExpenseClaimAttachmentAnalysisMixin: headline = "AI提示:住宿金额超出报销标准" summary = "当前住宿票据金额超过规则中心差旅住宿标准,已作为风险项保留在单据中;如需按特殊情况提交,请补充超标原因。" elif ( - line_count == 0 - or not compact_text + not has_readable_content or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2) or (not requirement_matches and mismatch_severity == "high") or (purpose_mismatch_point and amount_mismatch) diff --git a/server/src/app/services/expense_claim_attachment_document.py b/server/src/app/services/expense_claim_attachment_document.py index a01cf20..e26e021 100644 --- a/server/src/app/services/expense_claim_attachment_document.py +++ b/server/src/app/services/expense_claim_attachment_document.py @@ -119,6 +119,13 @@ class ExpenseClaimAttachmentDocumentMixin: metadata=metadata, item=item, ) + metadata = self._refresh_pdf_attachment_preview_meta_if_needed( + file_path=file_path, + metadata=metadata, + ) + if self._attachment_metadata_needs_analysis_refresh(metadata): + self._refresh_item_attachment_analysis(item) + metadata = self._attachment_storage.read_meta(file_path) uploaded_at_value = metadata.get("uploaded_at") uploaded_at = None if isinstance(uploaded_at_value, str) and uploaded_at_value.strip(): @@ -157,6 +164,68 @@ class ExpenseClaimAttachmentDocumentMixin: "requirement_check": requirement_check, } + @classmethod + def _attachment_metadata_needs_analysis_refresh(cls, metadata: dict[str, Any]) -> bool: + analysis = metadata.get("analysis") + if not isinstance(analysis, dict): + return cls._attachment_metadata_has_ocr_signal(metadata) + + points = [ + str(point or "").strip() + for point in list(analysis.get("points") or []) + if str(point or "").strip() + ] + if not points: + return False + + if any("未识别到有效文字" in point for point in points): + return cls._attachment_metadata_has_readable_signal(metadata) + + if any("未识别到列车出发时间" in point or "未识别到开票日期" in point for point in points): + return cls._attachment_metadata_has_date_field(metadata) + + return False + + @classmethod + def _attachment_metadata_has_ocr_signal(cls, metadata: dict[str, Any]) -> bool: + return bool( + str(metadata.get("ocr_text") or "").strip() + or str(metadata.get("ocr_summary") or "").strip() + or int(metadata.get("ocr_line_count") or 0) > 0 + or cls._attachment_metadata_document_fields(metadata) + ) + + @classmethod + def _attachment_metadata_has_readable_signal(cls, metadata: dict[str, Any]) -> bool: + return bool( + str(metadata.get("ocr_text") or "").strip() + or str(metadata.get("ocr_summary") or "").strip() + or int(metadata.get("ocr_line_count") or 0) > 0 + or cls._attachment_metadata_document_fields(metadata) + ) + + @staticmethod + def _attachment_metadata_document_fields(metadata: dict[str, Any]) -> list[dict[str, Any]]: + document_info = metadata.get("document_info") + if not isinstance(document_info, dict): + return [] + return [ + field + for field in list(document_info.get("fields") or []) + if isinstance(field, dict) and str(field.get("value") or "").strip() + ] + + @classmethod + def _attachment_metadata_has_date_field(cls, metadata: dict[str, Any]) -> bool: + for field in cls._attachment_metadata_document_fields(metadata): + key = str(field.get("key") or "").strip().lower().replace("_", "") + label = str(field.get("label") or "").replace(" ", "") + if key in {"date", "tripdate", "departuredate", "invoicedate"}: + return True + if any(token in label for token in ("日期", "时间", "出发")): + return True + return False + def _build_attachment_document_info(self, document: Any) -> dict[str, Any]: insight = build_document_insight( filename=str(getattr(document, "filename", "") or ""), diff --git a/server/src/app/services/expense_claim_attachment_operations.py b/server/src/app/services/expense_claim_attachment_operations.py index aae47e7..6f7a185 100644 --- a/server/src/app/services/expense_claim_attachment_operations.py +++ b/server/src/app/services/expense_claim_attachment_operations.py @@ -32,6 +32,7 @@ from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY from app.services.agent_foundation import AgentFoundationService from app.services.audit import AuditLogService +from app.services.document_preview import DocumentPreviewAssets from app.services.document_intelligence import build_document_insight from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation @@ -238,6 +239,7 @@ class ExpenseClaimAttachmentOperationsMixin: "preview_storage_key": str(preview_meta["preview_storage_key"]), "preview_media_type": str(preview_meta["preview_media_type"]), "preview_file_name": str(preview_meta["preview_file_name"]), + "preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""), "analysis": attachment_analysis, "document_info": document_info, "requirement_check": requirement_check, @@ -673,6 +675,60 @@ class ExpenseClaimAttachmentOperationsMixin: self._attachment_storage.write_meta(file_path, metadata) return metadata + def _refresh_pdf_attachment_preview_meta_if_needed( + self, + *, + file_path: Path, + metadata: dict[str, Any], + ) -> dict[str, Any]: + if not metadata: + return metadata + + media_type = str( + metadata.get("media_type") + or self._attachment_presentation.resolve_media_type(file_path.name) + ).strip() + if media_type != "application/pdf": + return metadata + + preview_storage_key = str(metadata.get("preview_storage_key") or "").strip() + preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None + if ( + preview_path is not None + and preview_path.exists() + and str(metadata.get("preview_kind") or "").strip() == "image" + and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE + and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID + ): + return metadata + + preview_name = str(metadata.get("preview_file_name") or "").strip() + if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX): + preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}" + preview_path = file_path.parent / preview_name + + try: + DocumentPreviewAssets.render_pdf_first_page( + pdf_path=file_path, + preview_path=preview_path, + timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds, + ) + except Exception: + return metadata + + metadata.update( + { + "previewable": True, + "preview_kind": "image", + "preview_storage_key": self._attachment_storage.to_storage_key(preview_path), + "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE, + "preview_file_name": preview_path.name, + "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID, + } + ) + self._attachment_storage.write_meta(file_path, metadata) + return metadata + def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]: file_path, media_type, filename = self._resolve_item_attachment_content(item) metadata = self._attachment_storage.read_meta(file_path) @@ -681,6 +737,10 @@ class ExpenseClaimAttachmentOperationsMixin: metadata=metadata, item=item, ) + metadata = self._refresh_pdf_attachment_preview_meta_if_needed( + file_path=file_path, + metadata=metadata, + ) preview_storage_key = str(metadata.get("preview_storage_key") or "").strip() preview_file_name = str(metadata.get("preview_file_name") or "").strip() preview_media_type = str(metadata.get("preview_media_type") or "").strip() diff --git a/server/src/app/services/expense_claim_attachment_presentation.py b/server/src/app/services/expense_claim_attachment_presentation.py index 75ac294..3f88d51 100644 --- a/server/src/app/services/expense_claim_attachment_presentation.py +++ b/server/src/app/services/expense_claim_attachment_presentation.py @@ -1,13 +1,11 @@ from __future__ import annotations -import base64 -import binascii import mimetypes -import re from pathlib import Path from typing import Any from urllib.parse import quote +from app.services.document_preview import DocumentPreviewAssets from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage @@ -42,6 +40,7 @@ class ExpenseClaimAttachmentPresentation: "preview_storage_key": self.storage.to_storage_key(preview_path), "preview_media_type": preview_media_type, "preview_file_name": preview_file_name, + "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type), } if preview_kind: @@ -51,6 +50,7 @@ class ExpenseClaimAttachmentPresentation: "preview_storage_key": storage_key, "preview_media_type": media_type, "preview_file_name": filename, + "preview_rendered_with": "", } return { @@ -59,6 +59,7 @@ class ExpenseClaimAttachmentPresentation: "preview_storage_key": "", "preview_media_type": "", "preview_file_name": "", + "preview_rendered_with": "", } @staticmethod @@ -72,15 +73,7 @@ class ExpenseClaimAttachmentPresentation: @staticmethod def decode_data_url(payload: str) -> tuple[str, bytes] | None: - normalized = str(payload or "").strip() - matched = re.match(r"^data:(?P[\w.+-]+/[\w.+-]+);base64,(?P.+)$", normalized, flags=re.DOTALL) - if not matched: - return None - try: - content = base64.b64decode(matched.group("body"), validate=True) - except (binascii.Error, ValueError): - return None - return matched.group("media"), content + return DocumentPreviewAssets.decode_data_url(payload) def _write_preview_asset_from_data_url( self, @@ -89,16 +82,11 @@ class ExpenseClaimAttachmentPresentation: original_filename: str, preview_data_url: str, ) -> tuple[Path, str, str] | None: - decoded = self.decode_data_url(preview_data_url) - if decoded is None: - return None - - preview_media_type, preview_content = decoded - suffix = mimetypes.guess_extension(preview_media_type) or ".bin" - preview_name = f"{Path(original_filename).stem}.preview{suffix}" - preview_path = attachment_dir / preview_name - preview_path.write_bytes(preview_content) - return preview_path, preview_media_type, preview_name + return DocumentPreviewAssets.write_data_url_preview( + preview_dir=attachment_dir, + preview_name_stem=f"{Path(original_filename).stem}.preview", + preview_data_url=preview_data_url, + ) @staticmethod def build_preview_client_path(claim_id: str, item_id: str) -> str: diff --git a/server/src/app/services/ocr.py b/server/src/app/services/ocr.py index ad79620..0ededfa 100644 --- a/server/src/app/services/ocr.py +++ b/server/src/app/services/ocr.py @@ -537,7 +537,7 @@ class OcrService: if page_summary: aggregated.summary_fragments.append(page_summary) - page_text = str(payload.get("text", "") or "").strip() + page_text = self._resolve_worker_document_text(payload) if page_text: aggregated.text_fragments.append(page_text) @@ -626,6 +626,22 @@ class OcrService: return descriptor.text_layer return "" + @staticmethod + def _resolve_worker_document_text(payload: dict) -> str: + for key in ("text", "ocr_text", "raw_text", "full_text"): + value = str(payload.get(key, "") or "").strip() + if value: + return value + + lines = payload.get("lines", []) + if not isinstance(lines, list): + return "" + return "\n".join( + str(item.get("text", "") or "").strip() + for item in lines + if isinstance(item, dict) and str(item.get("text", "") or "").strip() + ).strip() + @staticmethod def _build_lines( items: list[dict], diff --git a/server/src/app/services/receipt_folder.py b/server/src/app/services/receipt_folder.py index 729b6ed..5fa9387 100644 --- a/server/src/app/services/receipt_folder.py +++ b/server/src/app/services/receipt_folder.py @@ -12,7 +12,7 @@ from uuid import uuid4 from app.api.deps import CurrentUserContext from app.core.config import get_settings -from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead +from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead from app.schemas.receipt_folder import ( ReceiptFolderDeleteResponse, ReceiptFolderDetailRead, @@ -20,11 +20,13 @@ from app.schemas.receipt_folder import ( ReceiptFolderItemRead, ReceiptFolderUpdate, ) -from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation +from app.services.document_preview import DocumentPreviewAssets +from app.services.document_intelligence import build_document_insight from app.services.ocr import SUPPORTED_SUFFIXES RECEIPT_DATE_PATTERN = re.compile( - r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)" + r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])" + r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)" ) RECEIPT_TIME_PATTERN = re.compile(r"(? dict[str, Any]: preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip() - decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url) - if decoded is not None: - preview_media_type, preview_content = decoded - suffix = mimetypes.guess_extension(preview_media_type) or ".bin" - preview_name = f"preview{suffix}" - preview_path = receipt_dir / preview_name - preview_path.write_bytes(preview_content) + preview_asset = DocumentPreviewAssets.write_data_url_preview( + preview_dir=receipt_dir, + preview_name_stem="preview", + preview_data_url=preview_data_url, + ) + if preview_asset is not None: + _, preview_media_type, preview_name = preview_asset return { "previewable": True, "preview_kind": "image", "preview_file_name": preview_name, "preview_media_type": preview_media_type, + "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type), } if self._is_previewable(media_type): return { @@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin: "preview_kind": "image" if media_type.startswith("image/") else "pdf", "preview_file_name": source_path.name, "preview_media_type": media_type, + "preview_rendered_with": "", } return { "previewable": False, "preview_kind": "", "preview_file_name": "", "preview_media_type": "", + "preview_rendered_with": "", } + def _refresh_pdf_preview_asset_if_needed( + self, + *, + receipt_dir: Path, + meta: dict[str, Any], + ) -> dict[str, Any]: + source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip() + if not source_name: + return meta + + source_path = self._assert_child(receipt_dir / source_name) + source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or "")) + if source_media_type != "application/pdf" or not source_path.exists(): + return meta + + preview_name = str(meta.get("preview_file_name") or "").strip() + preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None + if ( + preview_path is not None + and preview_path.exists() + and str(meta.get("preview_kind") or "").strip() == "image" + and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE + and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID + ): + return meta + + if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX): + preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}" + preview_path = self._assert_child(receipt_dir / preview_name) + + try: + DocumentPreviewAssets.render_pdf_first_page( + pdf_path=source_path, + preview_path=preview_path, + timeout_seconds=get_settings().ocr_timeout_seconds, + ) + except Exception: + return meta + + meta.update( + { + "previewable": True, + "preview_kind": "image", + "preview_file_name": preview_path.name, + "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE, + "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID, + } + ) + self._write_meta(receipt_dir, meta) + return meta + @staticmethod def _is_previewable(media_type: str) -> bool: return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf" @@ -256,6 +314,7 @@ class ReceiptFolderItemMixin: def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead: receipt_id = str(meta.get("id") or "").strip() status_value = str(meta.get("status") or "unlinked").strip() or "unlinked" + identity = self._resolve_receipt_document_identity(meta) return ReceiptFolderItemRead( id=receipt_id, file_name=str(meta.get("file_name") or ""), @@ -263,10 +322,10 @@ class ReceiptFolderItemMixin: size_bytes=int(meta.get("size_bytes") or 0), status=status_value, status_label="已关联" if status_value == "linked" else "未关联", - document_type=str(meta.get("document_type") or "other"), - document_type_label=str(meta.get("document_type_label") or "其他单据"), - scene_code=str(meta.get("scene_code") or "other"), - scene_label=str(meta.get("scene_label") or "其他票据"), + document_type=identity["document_type"], + document_type_label=identity["document_type_label"], + scene_code=identity["scene_code"], + scene_label=identity["scene_label"], summary=str(meta.get("summary") or ""), amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")), document_date=self._resolve_receipt_document_date(meta), @@ -283,6 +342,38 @@ class ReceiptFolderItemMixin: warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()], ) + def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]: + document_type = str(meta.get("document_type") or "other").strip() or "other" + document_type_label = str(meta.get("document_type_label") or "其他单据").strip() or "其他单据" + scene_code = str(meta.get("scene_code") or "other").strip() or "other" + scene_label = str(meta.get("scene_label") or "其他票据").strip() or "其他票据" + if document_type not in {"", "other"} and document_type_label != "其他单据": + return { + "document_type": document_type, + "document_type_label": document_type_label, + "scene_code": scene_code, + "scene_label": scene_label, + } + + insight = build_document_insight( + filename=str(meta.get("file_name") or ""), + summary=str(meta.get("summary") or ""), + text=self._receipt_text(meta), + ) + if insight.document_type in {"", "other"}: + return { + "document_type": document_type, + "document_type_label": document_type_label, + "scene_code": scene_code, + "scene_label": scene_label, + } + return { + "document_type": insight.document_type, + "document_type_label": insight.document_type_label, + "scene_code": insight.scene_code, + "scene_label": insight.scene_label, + } + def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]: fields = [ ReceiptFolderFieldRead( @@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin: if str(document_type or "").strip().lower() == "train_ticket": return True compact = "".join([document_type_label, scene_label, text]).replace(" ", "") - return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")) + if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")): + return True + lower_compact = compact.lower() + return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and ( + "12306" in compact + or "95306" in compact + or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact) + or ("wuhan" in lower_compact and "shanghai" in lower_compact) + ) @classmethod def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool: @@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin: return raw normalized = match.group(1).replace("年", "-").replace("月", "-").replace("日", "") normalized = normalized.replace("/", "-").replace(".", "-") + normalized = re.sub(r"\s+", "-", normalized) parts = [part for part in normalized.split("-") if part] if len(parts) != 3: return match.group(1) @@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin: cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip() if not 2 <= len(cleaned) <= 8: return "" - if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")): + if any( + token in cleaned + for token in ( + "电子", + "客票", + "铁路", + "发票", + "税务", + "湖北省", + "中国铁路", + "开票", + "日期", + "车厢", + "座位", + "票价", + "金额", + "行程", + "出发", + "到达", + "车次", + ) + ): return "" return cleaned @@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin: labeled = cls._extract_first(TRAIN_ID_PATTERN, text) if labeled: return labeled + fallback = "" for line in str(text or "").replace("\r", "\n").splitlines(): compact_line = line.replace(" ", "") if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")): continue match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line) - if match: - return str(match.group(1) or "").strip() - return "" + if not match: + continue + candidate = str(match.group(1) or "").strip() + if "*" in candidate: + return candidate + if not fallback: + fallback = candidate + return fallback @staticmethod def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]: combined_match = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or "")) if combined_match: return f"{combined_match.group(1)}车", combined_match.group(2) + loose_match = TRAIN_LOOSE_SEAT_PATTERN.search(str(text or "")) + if loose_match: + return f"{loose_match.group(1).zfill(2)}车", loose_match.group(2).upper() carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "") seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text) return carriage_no, seat_no @@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin: @staticmethod def _extract_train_fare(text: str) -> str: match = TRAIN_FARE_PATTERN.search(str(text or "")) + if not match: + match = max( + list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))), + key=lambda item: float(str(item.group(1) or "0").replace(",", ".")), + default=None, + ) if not match: return "" value = str(match.group(1) or "").replace(",", ".").strip() @@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re ) if existing_receipt is not None: enriched.append( - document.model_copy( - update={ - "receipt_id": existing_receipt.id, - "receipt_status": existing_receipt.status, - "receipt_preview_url": existing_receipt.preview_url, - "receipt_source_url": existing_receipt.source_url, - } + self._enrich_ocr_document_with_receipt( + document, + receipt=existing_receipt, + current_user=current_user, ) ) continue @@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re warning = "已上传过同样的单据,请不要重复上传。" existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()] enriched.append( - document.model_copy( - update={ - "receipt_id": duplicate_receipt.id, - "receipt_status": duplicate_receipt.status, - "receipt_preview_url": duplicate_receipt.preview_url, - "receipt_source_url": duplicate_receipt.source_url, - "warnings": list(dict.fromkeys([*existing_warnings, warning])), - } + self._enrich_ocr_document_with_receipt( + document, + receipt=duplicate_receipt, + current_user=current_user, + extra_warnings=[*existing_warnings, warning], ) ) continue @@ -763,17 +893,78 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re current_user=current_user, ) enriched.append( - document.model_copy( - update={ - "receipt_id": receipt.id, - "receipt_status": receipt.status, - "receipt_preview_url": receipt.preview_url, - "receipt_source_url": receipt.source_url, - } + self._enrich_ocr_document_with_receipt( + document, + receipt=receipt, + current_user=current_user, ) ) return result.model_copy(update={"documents": enriched}) + def _enrich_ocr_document_with_receipt( + self, + document: OcrRecognizeDocumentRead, + *, + receipt: ReceiptFolderItemRead, + current_user: CurrentUserContext, + extra_warnings: list[str] | None = None, + ) -> OcrRecognizeDocumentRead: + update: dict[str, Any] = { + "receipt_id": receipt.id, + "receipt_status": receipt.status, + "receipt_preview_url": receipt.preview_url, + "receipt_source_url": receipt.source_url, + } + + try: + meta = self._read_receipt_meta(receipt.id, current_user) + except FileNotFoundError: + meta = {} + + if meta: + update.update( + { + "text": str(meta.get("ocr_text") or document.text or ""), + "summary": str(meta.get("summary") or document.summary or ""), + "document_type": str(meta.get("document_type") or document.document_type or "other"), + "document_type_label": str(meta.get("document_type_label") or document.document_type_label or "其他单据"), + "scene_code": str(meta.get("scene_code") or document.scene_code or "other"), + "scene_label": str(meta.get("scene_label") or document.scene_label or "其他票据"), + "classification_source": str(meta.get("ocr_classification_source") or document.classification_source or ""), + "classification_confidence": float( + meta.get("ocr_classification_confidence") + or document.classification_confidence + or 0.0 + ), + "classification_evidence": [ + str(value) + for value in list(meta.get("ocr_classification_evidence") or document.classification_evidence or []) + if str(value).strip() + ], + "document_fields": self._build_ocr_document_fields_from_meta(meta), + } + ) + + warnings = [ + str(item) + for item in list(extra_warnings if extra_warnings is not None else document.warnings or []) + if str(item).strip() + ] + if warnings: + update["warnings"] = list(dict.fromkeys(warnings)) + return document.model_copy(update=update) + + def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]: + return [ + OcrRecognizeFieldRead( + key=field.key, + label=field.label, + value=field.value, + ) + for field in self._resolve_fields(meta) + if field.label and field.value + ] + def save_receipt( self, *, @@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]: meta = self._read_receipt_meta(receipt_id, current_user) receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id) + meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta) preview_name = str(meta.get("preview_file_name") or "").strip() if preview_name: preview_path = self._assert_child(receipt_dir / preview_name) @@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re if self._is_previewable(source_media_type): return source_path, source_media_type, source_name raise FileNotFoundError("Receipt preview not found") - diff --git a/server/tests/test_document_intelligence.py b/server/tests/test_document_intelligence.py index e927301..a94f0a5 100644 --- a/server/tests/test_document_intelligence.py +++ b/server/tests/test_document_intelligence.py @@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice assert any(field.label == "金额" and field.value == "354元" for field in insight.fields) +def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None: + insight = build_document_insight( + filename="2月20_武汉-上海.pdf", + summary=":26429165800002785705;:2026 05 18;Wuhan Shanghaihongqiao G458", + text=( + ":26429165800002785705\n" + ":2026 05 18\n" + "G458\n" + "Wuhan\n" + "Shanghaihongqiao\n" + "2026 02 20 07:55\n" + "06 01B\n" + ": 354.00\n" + "4201061987****1615\n" + ":6580061086021391007342026\n" + "12306 95306" + ), + ) + + assert insight.document_type == "train_ticket" + assert insight.document_type_label == "火车/高铁票" + assert insight.scene_code == "travel" + fields = {field.label: field.value for field in insight.fields} + assert fields["金额"] == "354元" + assert fields["列车出发时间"] == "2026-02-20 07:55" + assert fields["车次/航班"] == "G458" + assert fields["行程"] == "武汉-上海" + + def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None: insight = build_document_insight( filename="铁路电子客票.pdf", diff --git a/server/tests/test_expense_claim_attachment_analysis_regression.py b/server/tests/test_expense_claim_attachment_analysis_regression.py new file mode 100644 index 0000000..b6a914e --- /dev/null +++ b/server/tests/test_expense_claim_attachment_analysis_regression.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +import json +from decimal import Decimal + +from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead +from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage +from app.services.ocr import OcrService +from test_reimbursement_endpoints import build_client, seed_claim + + +def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable( + monkeypatch, + tmp_path, +) -> None: + def fake_recognize( + self, + files: list[tuple[str, bytes, str | None]], + ) -> OcrRecognizeBatchRead: + return OcrRecognizeBatchRead( + total_file_count=1, + success_count=1, + documents=[ + OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + text=( + ":26429165800002785705\n" + ":2026 05 18\n" + "G458\n" + "Wuhan\n" + "Shanghaihongqiao\n" + "2026 02 20 07:55\n" + "06 01B\n" + ": 354.00\n" + "4201061987****1615\n" + ":6580061086021391007342026\n" + "12306 95306" + ), + summary="Wuhan Shanghaihongqiao G458 354.00", + avg_score=0.0, + line_count=0, + page_count=1, + warnings=[], + ) + ], + ) + + monkeypatch.setattr(OcrService, "recognize_files", fake_recognize) + monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path) + + client, session_factory = build_client() + with session_factory() as db: + claim, item = seed_claim(db) + claim.expense_type = "travel" + claim.reason = "武汉-上海差旅" + claim.location = "上海" + claim.amount = Decimal("354.00") + item.item_type = "train_ticket" + item.item_reason = "武汉-上海" + item.item_location = "上海" + item.item_amount = Decimal("354.00") + db.commit() + claim_id = claim.id + item_id = item.id + + upload_response = client.post( + f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment", + headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}, + files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))], + ) + + assert upload_response.status_code == 200 + attachment = upload_response.json()["attachment"] + analysis = attachment["analysis"] + points = analysis["points"] + + assert attachment["document_info"]["document_type"] == "train_ticket" + assert analysis["severity"] == "pass" + assert not any("未识别到有效文字" in point for point in points) + assert not any("未识别到列车出发时间" in point for point in points) + + +def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis( + monkeypatch, + tmp_path, +) -> None: + def fake_recognize( + self, + files: list[tuple[str, bytes, str | None]], + ) -> OcrRecognizeBatchRead: + return OcrRecognizeBatchRead( + total_file_count=1, + success_count=1, + documents=[ + OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + text=( + ":26429165800002785705 :2026 05 18\n" + "G458\n" + "Wuhan Shanghaihongqiao\n" + "2026 02 20 07:55 06 01B\n" + ": 354.00\n" + "4201061987****1615\n" + ":6580061086021391007342026\n" + "12306 95306" + ), + summary="Wuhan Shanghaihongqiao G458 354.00", + avg_score=0.0, + line_count=0, + page_count=1, + warnings=[], + ) + ], + ) + + monkeypatch.setattr(OcrService, "recognize_files", fake_recognize) + monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path) + + client, session_factory = build_client() + with session_factory() as db: + claim, item = seed_claim(db) + claim.expense_type = "travel" + claim.reason = "武汉-上海差旅" + claim.location = "上海" + claim.amount = Decimal("354.00") + item.item_type = "train_ticket" + item.item_reason = "武汉-上海" + item.item_location = "上海" + item.item_amount = Decimal("354.00") + db.commit() + claim_id = claim.id + item_id = item.id + + upload_response = client.post( + f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment", + headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}, + files=[("file", ("2月20_武汉-上海.pdf", b"%PDF-1.4 fake", "application/pdf"))], + ) + assert upload_response.status_code == 200 + + meta_path = next(tmp_path.rglob("*.meta.json")) + meta = json.loads(meta_path.read_text(encoding="utf-8")) + meta["analysis"] = { + "severity": "high", + "label": "高风险", + "headline": "AI提示:附件不符合票据校验条件", + "summary": "当前附件存在明显异常,票据类型与当前费用场景不匹配,或无法作为有效报销材料。", + "points": [ + "附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。", + "日期字段:未识别到列车出发时间或乘车日期。", + ], + "rule_basis": [], + "suggestion": "建议过滤当前不匹配的票据,重新上传符合当前费用场景的清晰原件。", + } + meta_path.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8") + + meta_response = client.get( + f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta", + headers={"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}, + ) + + assert meta_response.status_code == 200 + analysis = meta_response.json()["analysis"] + points = analysis["points"] + assert analysis["severity"] == "pass" + assert not any("未识别到有效文字" in point for point in points) + assert not any("未识别到列车出发时间" in point for point in points) diff --git a/server/tests/test_ocr_endpoints.py b/server/tests/test_ocr_endpoints.py index 115178a..8853a5d 100644 --- a/server/tests/test_ocr_endpoints.py +++ b/server/tests/test_ocr_endpoints.py @@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path assert deleted_response.status_code == 404 finally: get_settings.cache_clear() + + +def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None: + def fake_recognize( + self, + files: list[tuple[str, bytes, str | None]], + ) -> OcrRecognizeBatchRead: + return OcrRecognizeBatchRead( + engine="paddleocr_mobile", + model="PP-OCRv5_mobile", + total_file_count=1, + success_count=1, + documents=[ + OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.png", + media_type="image/png", + text=( + ":26429165800002785705\n" + "G458\n" + "Wuhan\n" + "Shanghaihongqiao\n" + "2026 02 20 07:55\n" + "06 01B\n" + ": 354.00\n" + "4201061987****1615\n" + ":6580061086021391007342026\n" + "12306 95306" + ), + summary="Wuhan Shanghaihongqiao G458 354.00", + avg_score=0.92, + line_count=0, + page_count=1, + document_type="train_ticket", + document_type_label="火车/高铁票", + scene_code="travel", + scene_label="差旅票据", + document_fields=[ + OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"), + OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"), + OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"), + OcrRecognizeFieldRead(key="amount", label="金额", value="354元"), + ], + ) + ], + ) + + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + monkeypatch.setattr(OcrService, "recognize_files", fake_recognize) + try: + client = build_client() + response = client.post( + "/api/v1/ocr/recognize", + headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"}, + files=[("files", ("2月20_武汉-上海.png", b"fake-image", "image/png"))], + ) + finally: + get_settings.cache_clear() + + assert response.status_code == 200 + document = response.json()["documents"][0] + fields = { + item["label"]: item["value"] + for item in document["document_fields"] + } + assert document["receipt_id"] + assert fields["身份证号"] == "4201061987****1615" + assert fields["车厢"] == "06车" + assert fields["座位号"] == "01B" + assert fields["票价"] == "354.00元" diff --git a/server/tests/test_ocr_service.py b/server/tests/test_ocr_service.py index 461daa7..2e86a40 100644 --- a/server/tests/test_ocr_service.py +++ b/server/tests/test_ocr_service.py @@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False)) assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"] +def test_ocr_service_recovers_image_text_from_worker_ocr_text( + monkeypatch, + tmp_path: Path, +) -> None: + def fake_invoke_worker( + self, + *, + python_bin: str, + worker_path: str, + input_paths: list[Path], + ) -> dict: + return { + "engine": "paddleocr_mobile", + "model": "PP-OCRv5_mobile", + "documents": [ + { + "input_path": str(input_paths[0]), + "engine": "paddleocr_mobile", + "model": "PP-OCRv5_mobile", + "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306", + "avg_score": 0.92, + "line_count": 0, + "page_count": 1, + "warnings": [], + "lines": [], + } + ], + } + + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python") + monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py") + monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker) + OcrService._result_cache.clear() + get_settings.cache_clear() + try: + result = OcrService().recognize_files([("train-ticket.png", b"fake-train-image", "image/png")]) + finally: + OcrService._result_cache.clear() + get_settings.cache_clear() + + recognized = result.documents[0] + assert "铁路电子客票" in recognized.text + assert recognized.document_type == "train_ticket" + assert any(field.label == "列车出发时间" and field.value == "2026-02-20 07:55" for field in recognized.document_fields) + assert any(field.label == "车次/航班" and field.value == "G458" for field in recognized.document_fields) + assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields) + + def test_ocr_service_passes_configured_device_to_worker( monkeypatch, tmp_path: Path, diff --git a/server/tests/test_receipt_folder_service.py b/server/tests/test_receipt_folder_service.py index cd34f8b..bc814f5 100644 --- a/server/tests/test_receipt_folder_service.py +++ b/server/tests/test_receipt_folder_service.py @@ -1,8 +1,11 @@ from __future__ import annotations +import base64 + from app.api.deps import CurrentUserContext from app.core.config import get_settings from app.schemas.ocr import OcrRecognizeDocumentRead +from app.services.document_preview import DocumentPreviewAssets from app.services.receipt_folder import ReceiptFolderService @@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke get_settings.cache_clear() +def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + stale_preview = b"stale-preview" + preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}" + service = ReceiptFolderService() + receipt = service.save_receipt( + filename="2月20_武汉-上海.pdf", + content=b"%PDF-1.4 fake", + media_type="application/pdf", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + preview_kind="image", + preview_data_url=preview_data_url, + ), + ) + + receipt_dir = next(service.root.glob("pytest/*")) + preview_path = receipt_dir / "preview.png" + assert preview_path.read_bytes() == stale_preview + stale_meta = service._read_meta(receipt_dir) + stale_meta.pop("preview_rendered_with", None) + service._write_meta(receipt_dir, stale_meta) + + def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): + preview_path.write_bytes(b"refreshed-preview") + return preview_path + + monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) + + resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user) + + assert resolved_path == preview_path + assert media_type == "image/png" + assert file_name == "preview.png" + assert preview_path.read_bytes() == b"refreshed-preview" + meta = service._read_meta(receipt_dir) + assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID + finally: + get_settings.cache_clear() + + +def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + service = ReceiptFolderService() + content = b"%PDF-1.4 same receipt" + + receipt = service.save_receipt( + filename="same-receipt.pdf", + content=content, + media_type="application/pdf", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="same-receipt.pdf", + media_type="application/pdf", + text="same receipt amount 354", + document_type="other", + document_type_label="其他单据", + scene_code="other", + scene_label="其他票据", + ), + ) + receipt_dir = service.root / "pytest" / receipt.id + + assert receipt_dir.exists() + duplicate = service.find_duplicate_receipt( + filename="same-receipt.pdf", + content=content, + current_user=current_user, + ) + assert duplicate is not None + assert duplicate.id == receipt.id + + service.delete_receipt(receipt_id=receipt.id, current_user=current_user) + + assert not receipt_dir.exists() + assert ( + service.find_duplicate_receipt( + filename="same-receipt.pdf", + content=content, + current_user=current_user, + ) + is None + ) + finally: + get_settings.cache_clear() + + +def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + service = ReceiptFolderService() + receipt = service.save_receipt( + filename="2月20_武汉-上海.pdf", + content=b"%PDF-1.4 fake", + media_type="application/pdf", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + text=( + ":26429165800002785705\n" + ":2026 05 18\n" + "G458\n" + "Wuhan\n" + "Shanghaihongqiao\n" + "2026 02 20 07:55\n" + "06 01B\n" + ": 354.00\n" + "4201061987****1615\n" + ":6580061086021391007342026\n" + "12306 95306" + ), + summary="Wuhan Shanghaihongqiao G458 354.00", + document_type="other", + document_type_label="其他单据", + scene_code="other", + scene_label="其他票据", + ), + ) + + assert receipt.document_type == "train_ticket" + assert receipt.document_type_label == "火车/高铁票" + assert receipt.scene_code == "travel" + assert receipt.amount == "354.00元" + assert receipt.document_date == "2026-02-20" + assert receipt.merchant_name == "中国铁路" + + detail = service.get_receipt(receipt.id, current_user) + fields = {field.label: field.value for field in detail.fields} + assert fields["行程"] == "武汉-上海" + assert fields["车次"] == "G458" + assert fields["列车出发时间"] == "2026-02-20 07:55" + assert fields["票价"] == "354.00元" + assert fields["身份证号"] == "4201061987****1615" + assert fields["车厢"] == "06车" + assert fields["座位号"] == "01B" + assert "乘车人" not in fields + finally: + get_settings.cache_clear() + + def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() diff --git a/server/tests/test_reimbursement_endpoints.py b/server/tests/test_reimbursement_endpoints.py index 64c3bd3..f7b79f2 100644 --- a/server/tests/test_reimbursement_endpoints.py +++ b/server/tests/test_reimbursement_endpoints.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import json from collections.abc import Generator from datetime import UTC, date, datetime from decimal import Decimal @@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit from app.models.risk_observation import RiskObservation, RiskObservationFeedback from app.models.role import Role from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead +from app.services.document_preview import DocumentPreviewAssets from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage from app.services.ocr import OcrService @@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch, meta_payload = upload_response.json()["attachment"] assert meta_payload["preview_kind"] == "image" assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview") + meta_path = next(tmp_path.rglob("invoice.pdf.meta.json")) + stored_meta = json.loads(meta_path.read_text(encoding="utf-8")) + assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID preview_response = client.get( f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",