feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
caoxiaozhu
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions

View File

@@ -92,7 +92,7 @@ def preview_receipt(receipt_id: str, current_user: CurrentUser) -> FileResponse:
file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user)
except FileNotFoundError as exc:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc
return FileResponse(file_path, media_type=media_type, filename=file_name)
return FileResponse(file_path, media_type=media_type, filename=file_name, headers={"Cache-Control": "no-store"})
@router.get(

View File

@@ -25,11 +25,15 @@ AMOUNT_PATTERNS = (
re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)
DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
DATE_PATTERN = re.compile(
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
)
TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)")
INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[:\s]*([A-Za-z0-9-]{6,24})")
INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[:\s]*([A-Za-z0-9-]{6,24})")
TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[:\s]*([A-Za-z0-9]{2,12})")
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Za-z0-9])([GCDZKTLYS]\d{1,5})(?![A-Za-z0-9])", re.IGNORECASE)
ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})")
MERCHANT_PATTERNS = (
re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[:\s]*([A-Za-z0-9\u4e00-\u9fa5()·&\\-]{2,40})"),
@@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
best_score = score
if best_score <= 0:
train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket")
if train_rule and _looks_like_train_ticket(compact_text):
return RuleMatch(
rule=train_rule,
confidence=0.82,
evidence=("车次", "12306"),
score=3.8,
)
return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)
confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12)
@@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
)
def _looks_like_train_ticket(compact_text: str) -> bool:
text = str(compact_text or "").lower()
if not re.search(r"[gcdzktlys]\d{1,5}", text, flags=re.IGNORECASE):
return False
if "12306" in text or "95306" in text:
return True
if re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—||-)[\u4e00-\u9fa5]{2,12}", text):
return True
return "wuhan" in text and "shanghai" in text
def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
if not response_text:
return None
@@ -521,33 +544,48 @@ def _merge_document_fields(
def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
fields: list[DocumentField] = []
normalized_type = str(document_type or "").strip().lower()
def append_field(key: str, label: str, value: str) -> None:
cleaned = _clean_field_value(value)
if not cleaned:
return
if any(field.key == key for field in fields if field.key):
return
fields.append(DocumentField(key=key, label=label, value=cleaned))
amount = _extract_amount(text)
if amount:
fields.append(DocumentField(key="amount", label="金额", value=amount))
append_field("amount", "金额", amount)
date_value = _extract_date(text, document_type=document_type)
if date_value:
fields.append(DocumentField(key="date", label="日期", value=date_value))
append_field("date", "日期", date_value)
merchant = _extract_merchant(text)
if merchant:
fields.append(DocumentField(key="merchant_name", label="商户", value=merchant))
append_field("merchant_name", "商户", merchant)
invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text)
if invoice_number:
fields.append(DocumentField(key="invoice_number", label="票据号码", value=invoice_number))
append_field("invoice_number", "票据号码", invoice_number)
invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text)
if invoice_code:
fields.append(DocumentField(key="invoice_code", label="发票代码", value=invoice_code))
append_field("invoice_code", "发票代码", invoice_code)
trip_no = _extract_pattern(TRIP_NO_PATTERN, text)
if not trip_no and normalized_type == "train_ticket":
trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text)
if trip_no:
fields.append(DocumentField(key="trip_no", label="车次/航班", value=trip_no))
append_field("trip_no", "车次/航班", trip_no.upper())
route = _extract_route(text)
if route:
fields.append(DocumentField(key="route", label="行程", value=route))
append_field("route", "行程", route)
if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields):
append_field("amount", "金额", _extract_loose_decimal_amount(text))
return fields
@@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
raw_value = str(match.group(1) or "").strip()
normalized = raw_value.replace("", "-").replace("", "-").replace("", "")
normalized = normalized.replace("/", "-").replace(".", "-")
normalized = re.sub(r"\s+", "-", normalized)
parts = [part for part in normalized.split("-") if part]
if len(parts) != 3:
return raw_value
@@ -703,6 +742,23 @@ def _extract_route(text: str) -> str:
return f"{start}-{end}"
def _extract_loose_decimal_amount(text: str) -> str:
best_value: Decimal | None = None
for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", str(text or "")):
try:
candidate = Decimal(match.group(1)).quantize(Decimal("0.01"))
except InvalidOperation:
continue
if candidate <= Decimal("0.00"):
continue
if best_value is None or candidate > best_value:
best_value = candidate
if best_value is None:
return ""
text_value = format(best_value, "f").rstrip("0").rstrip(".")
return f"{text_value}"
def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
match = pattern.search(text)
if not match:

View File

@@ -0,0 +1,98 @@
from __future__ import annotations
import base64
import binascii
import mimetypes
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
class DocumentPreviewAssets:
    """Shared helpers for producing preview assets of uploaded documents.

    Covers three concerns: decoding base64 ``data:`` URLs, writing a decoded
    preview next to its source document, and rendering the first page of a PDF
    to PNG with the ``pdftoppm`` command-line tool.
    """

    # Identifier stored with a generated PDF preview so callers can tell which
    # renderer configuration produced it.
    PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data"
    PDF_PREVIEW_MEDIA_TYPE = "image/png"
    PDF_PREVIEW_SUFFIX = ".png"

    @staticmethod
    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
        """Decode a base64 ``data:`` URL.

        Args:
            payload: Text of the form ``data:<media type>;base64,<body>``.
                Surrounding whitespace is ignored; falsy values are treated
                as an empty string.

        Returns:
            ``(media_type, content)`` on success, or ``None`` when the payload
            is not a base64 data URL or the body is not valid base64.
        """
        normalized = str(payload or "").strip()
        matched = re.match(
            r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$",
            normalized,
            flags=re.DOTALL,
        )
        if not matched:
            return None
        try:
            content = base64.b64decode(matched.group("body"), validate=True)
        except (binascii.Error, ValueError):
            return None
        return matched.group("media"), content

    @classmethod
    def renderer_id_for_source(cls, media_type: str | None) -> str:
        """Return the PDF renderer id for PDF sources, else an empty string."""
        if str(media_type or "").strip() == "application/pdf":
            return cls.PDF_RENDERER_ID
        return ""

    @classmethod
    def write_data_url_preview(
        cls,
        *,
        preview_dir: Path,
        preview_name_stem: str,
        preview_data_url: str,
    ) -> tuple[Path, str, str] | None:
        """Decode *preview_data_url* and write it into *preview_dir*.

        Args:
            preview_dir: Directory that receives the preview file.
            preview_name_stem: File name without extension, e.g. ``"preview"``
                or ``"invoice.preview"``. Directory components are dropped.
            preview_data_url: Base64 ``data:`` URL holding the preview bytes.

        Returns:
            ``(preview_path, preview_media_type, preview_file_name)``, or
            ``None`` when the data URL cannot be decoded.
        """
        decoded = cls.decode_data_url(preview_data_url)
        if decoded is None:
            return None
        preview_media_type, preview_content = decoded
        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
        # Use ``.name`` rather than ``.stem``: ``.stem`` treats the ".preview"
        # part of "invoice.preview" as an extension and drops it, so the
        # preview would be written as "invoice.png" and could overwrite a
        # source file of that name.
        preview_name = f"{Path(preview_name_stem).name}{suffix}"
        preview_path = preview_dir / preview_name
        preview_path.write_bytes(preview_content)
        return preview_path, preview_media_type, preview_name

    @classmethod
    def render_pdf_first_page(
        cls,
        *,
        pdf_path: Path,
        preview_path: Path,
        timeout_seconds: int | float,
    ) -> Path:
        """Render the first page of *pdf_path* to a PNG at *preview_path*.

        Rendering happens in a temporary directory created next to the target
        and the result is copied into place afterwards.

        Args:
            pdf_path: Source PDF document.
            preview_path: Destination of the PNG preview; parent directories
                are created when missing.
            timeout_seconds: Time limit handed to ``subprocess.run``.

        Returns:
            *preview_path*.

        Raises:
            RuntimeError: ``pdftoppm`` exited non-zero or produced no image.
            subprocess.TimeoutExpired: Rendering exceeded *timeout_seconds*.
            OSError: The ``pdftoppm`` executable could not be started.
        """
        preview_path.parent.mkdir(parents=True, exist_ok=True)
        with tempfile.TemporaryDirectory(
            prefix=".pdf-preview-",
            dir=str(preview_path.parent),
        ) as temp_dir:
            prefix = Path(temp_dir) / "page"
            completed = subprocess.run(
                [
                    "pdftoppm",
                    "-png",
                    "-r",
                    "160",
                    # Only the first page is kept, so do not rasterize the
                    # rest of a long document within the timeout budget.
                    "-f",
                    "1",
                    "-l",
                    "1",
                    str(pdf_path),
                    str(prefix),
                ],
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                check=False,
            )
            if completed.returncode != 0:
                detail = (completed.stderr or completed.stdout or "").strip()
                raise RuntimeError(detail or "pdftoppm failed to render PDF preview.")
            pages = sorted(
                Path(temp_dir).glob("page-*.png"),
                key=cls._extract_pdf_page_sort_key,
            )
            if not pages:
                raise RuntimeError("pdftoppm did not generate a preview image.")
            shutil.copyfile(pages[0], preview_path)
        return preview_path

    @staticmethod
    def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
        """Sort key ordering ``page-<n>.png`` files by their numeric page index.

        Files whose trailing component is not an integer sort first (index 0),
        with the file name as tie-breaker.
        """
        suffix = path.stem.rsplit("-", 1)[-1]
        try:
            return int(suffix), path.name
        except ValueError:
            return 0, path.name

View File

@@ -336,7 +336,27 @@ class ExpenseClaimAttachmentAnalysisMixin:
@staticmethod
def _has_date_like_text(text: str) -> bool:
return bool(re.search(r"(20\d{2}[年/\-.]\d{1,2}[月/\-.]\d{1,2}日?)", text))
return bool(re.search(r"(20\d{2}(?:[年/\-.]|\s+)\d{1,2}(?:[月/\-.]|\s+)\d{1,2}日?)", text))
@staticmethod
def _has_document_date_field(document_info: dict[str, Any]) -> bool:
    """Report whether *document_info* carries a non-empty date-like field.

    A field counts when its value is non-blank and either its key (lower-cased,
    underscores removed) is one of the known date keys or its label (spaces
    removed) contains one of the known date label tokens.
    """
    known_keys = DOCUMENT_TRIP_DATE_KEYS | DOCUMENT_GENERIC_DATE_KEYS | DOCUMENT_INVOICE_DATE_KEYS
    label_tokens = (
        *DOCUMENT_TRIP_DATE_LABEL_TOKENS,
        *DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
        *DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
    )

    def is_filled_date_field(entry: Any) -> bool:
        # Non-dict entries and entries with a blank value are ignored.
        if not isinstance(entry, dict):
            return False
        if not str(entry.get("value") or "").strip():
            return False
        normalized_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        compact_label = str(entry.get("label") or "").replace(" ", "")
        if normalized_key in known_keys:
            return True
        return any(token in compact_label for token in label_tokens)

    return any(is_filled_date_field(entry) for entry in list(document_info.get("fields") or []))
@staticmethod
def _normalize_match_text(text: str) -> str:
@@ -538,6 +558,12 @@ class ExpenseClaimAttachmentAnalysisMixin:
recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据"
requirement_matches = bool(requirement_check.get("matches"))
mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high"
document_fields = [
field
for field in list(document_info.get("fields") or [])
if isinstance(field, dict) and str(field.get("value") or "").strip()
]
has_readable_content = bool(line_count > 0 or compact_text or document_fields)
has_ticket_keyword = any(
keyword in compact_text
@@ -556,15 +582,18 @@ class ExpenseClaimAttachmentAnalysisMixin:
)
)
amount_candidates = self._extract_amount_candidates(text)
field_amount = self._resolve_document_field_amount({"document_fields": document_fields})
if field_amount is not None and field_amount not in amount_candidates:
amount_candidates.insert(0, field_amount)
item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01"))
has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates)
has_date_text = self._has_date_like_text(text)
has_date_text = self._has_date_like_text(text) or self._has_document_date_field(document_info)
amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount
points: list[str] = []
if warnings:
points.append(f"识别提示:{warnings[0]}")
if line_count == 0 or not compact_text:
if not has_readable_content:
points.append("附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。")
if recognized_document_type == "other" and not has_ticket_keyword:
points.append("票据类型:未识别到发票、票据、电子行程单等关键字,暂无法判断票据类型。")
@@ -617,8 +646,7 @@ class ExpenseClaimAttachmentAnalysisMixin:
headline = "AI提示住宿金额超出报销标准"
summary = "当前住宿票据金额超过规则中心差旅住宿标准,已作为风险项保留在单据中;如需按特殊情况提交,请补充超标原因。"
elif (
line_count == 0
or not compact_text
not has_readable_content
or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2)
or (not requirement_matches and mismatch_severity == "high")
or (purpose_mismatch_point and amount_mismatch)

View File

@@ -119,6 +119,13 @@ class ExpenseClaimAttachmentDocumentMixin:
metadata=metadata,
item=item,
)
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
file_path=file_path,
metadata=metadata,
)
if self._attachment_metadata_needs_analysis_refresh(metadata):
self._refresh_item_attachment_analysis(item)
metadata = self._attachment_storage.read_meta(file_path)
uploaded_at_value = metadata.get("uploaded_at")
uploaded_at = None
if isinstance(uploaded_at_value, str) and uploaded_at_value.strip():
@@ -157,6 +164,68 @@ class ExpenseClaimAttachmentDocumentMixin:
"requirement_check": requirement_check,
}
@classmethod
def _attachment_metadata_needs_analysis_refresh(cls, metadata: dict[str, Any]) -> bool:
analysis = metadata.get("analysis")
if not isinstance(analysis, dict):
return cls._attachment_metadata_has_ocr_signal(metadata)
points = [
str(point or "").strip()
for point in list(analysis.get("points") or [])
if str(point or "").strip()
]
if not points:
return False
if any("未识别到有效文字" in point for point in points):
return cls._attachment_metadata_has_readable_signal(metadata)
if any("未识别到列车出发时间" in point or "未识别到开票日期" in point for point in points):
return cls._attachment_metadata_has_date_field(metadata)
return False
@classmethod
def _attachment_metadata_has_ocr_signal(cls, metadata: dict[str, Any]) -> bool:
return bool(
str(metadata.get("ocr_text") or "").strip()
or str(metadata.get("ocr_summary") or "").strip()
or int(metadata.get("ocr_line_count") or 0) > 0
or cls._attachment_metadata_document_fields(metadata)
)
@classmethod
def _attachment_metadata_has_readable_signal(cls, metadata: dict[str, Any]) -> bool:
return bool(
str(metadata.get("ocr_text") or "").strip()
or str(metadata.get("ocr_summary") or "").strip()
or int(metadata.get("ocr_line_count") or 0) > 0
or cls._attachment_metadata_document_fields(metadata)
)
@staticmethod
def _attachment_metadata_document_fields(metadata: dict[str, Any]) -> list[dict[str, Any]]:
document_info = metadata.get("document_info")
if not isinstance(document_info, dict):
return []
return [
field
for field in list(document_info.get("fields") or [])
if isinstance(field, dict) and str(field.get("value") or "").strip()
]
@classmethod
def _attachment_metadata_has_date_field(cls, metadata: dict[str, Any]) -> bool:
for field in cls._attachment_metadata_document_fields(metadata):
key = str(field.get("key") or "").strip().lower().replace("_", "")
label = str(field.get("label") or "").replace(" ", "")
if key in {"date", "tripdate", "departuredate", "invoicedate"}:
return True
if any(token in label for token in ("日期", "时间", "出发")):
return True
return False
def _build_attachment_document_info(self, document: Any) -> dict[str, Any]:
insight = build_document_insight(
filename=str(getattr(document, "filename", "") or ""),

View File

@@ -32,6 +32,7 @@ from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
from app.services.agent_foundation import AgentFoundationService
from app.services.audit import AuditLogService
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import build_document_insight
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
@@ -238,6 +239,7 @@ class ExpenseClaimAttachmentOperationsMixin:
"preview_storage_key": str(preview_meta["preview_storage_key"]),
"preview_media_type": str(preview_meta["preview_media_type"]),
"preview_file_name": str(preview_meta["preview_file_name"]),
"preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
"analysis": attachment_analysis,
"document_info": document_info,
"requirement_check": requirement_check,
@@ -673,6 +675,60 @@ class ExpenseClaimAttachmentOperationsMixin:
self._attachment_storage.write_meta(file_path, metadata)
return metadata
def _refresh_pdf_attachment_preview_meta_if_needed(
    self,
    *,
    file_path: Path,
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Ensure a PDF attachment has a PNG preview from the current renderer.

    Nothing happens for empty metadata, non-PDF attachments, or when the
    stored preview file exists and is already tagged as an image/PNG rendered
    with ``DocumentPreviewAssets.PDF_RENDERER_ID``. Otherwise the first page
    is rendered next to the attachment, the preview keys in *metadata* are
    updated in place and persisted. Rendering is best-effort: on any failure
    the metadata is returned unchanged.
    """
    if not metadata:
        return metadata

    def meta_text(key: str) -> str:
        return str(metadata.get(key) or "").strip()

    source_media_type = str(
        metadata.get("media_type")
        or self._attachment_presentation.resolve_media_type(file_path.name)
    ).strip()
    if source_media_type != "application/pdf":
        return metadata

    storage_key = meta_text("preview_storage_key")
    existing_preview = self._attachment_storage.resolve_path(storage_key) if storage_key else None
    if existing_preview is not None:
        preview_is_current = (
            existing_preview.exists()
            and meta_text("preview_kind") == "image"
            and meta_text("preview_media_type") == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
            and meta_text("preview_rendered_with") == DocumentPreviewAssets.PDF_RENDERER_ID
        )
        if preview_is_current:
            return metadata

    # Reuse the stored preview name only when it already is a PNG name.
    target_name = meta_text("preview_file_name")
    png_suffix = DocumentPreviewAssets.PDF_PREVIEW_SUFFIX
    if not (target_name and target_name.lower().endswith(png_suffix)):
        target_name = f"{file_path.stem}.preview{png_suffix}"
    target_path = file_path.parent / target_name

    try:
        DocumentPreviewAssets.render_pdf_first_page(
            pdf_path=file_path,
            preview_path=target_path,
            timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
        )
    except Exception:
        # Best-effort: keep serving whatever preview metadata already exists.
        return metadata

    metadata.update(
        {
            "previewable": True,
            "preview_kind": "image",
            "preview_storage_key": self._attachment_storage.to_storage_key(target_path),
            "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            "preview_file_name": target_path.name,
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }
    )
    self._attachment_storage.write_meta(file_path, metadata)
    return metadata
def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
file_path, media_type, filename = self._resolve_item_attachment_content(item)
metadata = self._attachment_storage.read_meta(file_path)
@@ -681,6 +737,10 @@ class ExpenseClaimAttachmentOperationsMixin:
metadata=metadata,
item=item,
)
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
file_path=file_path,
metadata=metadata,
)
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
preview_file_name = str(metadata.get("preview_file_name") or "").strip()
preview_media_type = str(metadata.get("preview_media_type") or "").strip()

View File

@@ -1,13 +1,11 @@
from __future__ import annotations
import base64
import binascii
import mimetypes
import re
from pathlib import Path
from typing import Any
from urllib.parse import quote
from app.services.document_preview import DocumentPreviewAssets
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
@@ -42,6 +40,7 @@ class ExpenseClaimAttachmentPresentation:
"preview_storage_key": self.storage.to_storage_key(preview_path),
"preview_media_type": preview_media_type,
"preview_file_name": preview_file_name,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
}
if preview_kind:
@@ -51,6 +50,7 @@ class ExpenseClaimAttachmentPresentation:
"preview_storage_key": storage_key,
"preview_media_type": media_type,
"preview_file_name": filename,
"preview_rendered_with": "",
}
return {
@@ -59,6 +59,7 @@ class ExpenseClaimAttachmentPresentation:
"preview_storage_key": "",
"preview_media_type": "",
"preview_file_name": "",
"preview_rendered_with": "",
}
@staticmethod
@@ -72,15 +73,7 @@ class ExpenseClaimAttachmentPresentation:
@staticmethod
def decode_data_url(payload: str) -> tuple[str, bytes] | None:
normalized = str(payload or "").strip()
matched = re.match(r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$", normalized, flags=re.DOTALL)
if not matched:
return None
try:
content = base64.b64decode(matched.group("body"), validate=True)
except (binascii.Error, ValueError):
return None
return matched.group("media"), content
return DocumentPreviewAssets.decode_data_url(payload)
def _write_preview_asset_from_data_url(
self,
@@ -89,16 +82,11 @@ class ExpenseClaimAttachmentPresentation:
original_filename: str,
preview_data_url: str,
) -> tuple[Path, str, str] | None:
decoded = self.decode_data_url(preview_data_url)
if decoded is None:
return None
preview_media_type, preview_content = decoded
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
preview_name = f"{Path(original_filename).stem}.preview{suffix}"
preview_path = attachment_dir / preview_name
preview_path.write_bytes(preview_content)
return preview_path, preview_media_type, preview_name
return DocumentPreviewAssets.write_data_url_preview(
preview_dir=attachment_dir,
preview_name_stem=f"{Path(original_filename).stem}.preview",
preview_data_url=preview_data_url,
)
@staticmethod
def build_preview_client_path(claim_id: str, item_id: str) -> str:

View File

@@ -537,7 +537,7 @@ class OcrService:
if page_summary:
aggregated.summary_fragments.append(page_summary)
page_text = str(payload.get("text", "") or "").strip()
page_text = self._resolve_worker_document_text(payload)
if page_text:
aggregated.text_fragments.append(page_text)
@@ -626,6 +626,22 @@ class OcrService:
return descriptor.text_layer
return ""
@staticmethod
def _resolve_worker_document_text(payload: dict) -> str:
for key in ("text", "ocr_text", "raw_text", "full_text"):
value = str(payload.get(key, "") or "").strip()
if value:
return value
lines = payload.get("lines", [])
if not isinstance(lines, list):
return ""
return "\n".join(
str(item.get("text", "") or "").strip()
for item in lines
if isinstance(item, dict) and str(item.get("text", "") or "").strip()
).strip()
@staticmethod
def _build_lines(
items: list[dict],

View File

@@ -12,7 +12,7 @@ from uuid import uuid4
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.schemas.receipt_folder import (
ReceiptFolderDeleteResponse,
ReceiptFolderDetailRead,
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
ReceiptFolderItemRead,
ReceiptFolderUpdate,
)
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import build_document_insight
from app.services.ocr import SUPPORTED_SUFFIXES
RECEIPT_DATE_PATTERN = re.compile(
r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
)
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)")
TRAIN_INVOICE_DATE_PATTERN = re.compile(
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:]?\s*([0-9]{1,2}\s*车?)")
TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
class ReceiptFolderStorageMixin:
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
document: Any | None,
) -> dict[str, Any]:
preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
if decoded is not None:
preview_media_type, preview_content = decoded
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
preview_name = f"preview{suffix}"
preview_path = receipt_dir / preview_name
preview_path.write_bytes(preview_content)
preview_asset = DocumentPreviewAssets.write_data_url_preview(
preview_dir=receipt_dir,
preview_name_stem="preview",
preview_data_url=preview_data_url,
)
if preview_asset is not None:
_, preview_media_type, preview_name = preview_asset
return {
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_name,
"preview_media_type": preview_media_type,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
}
if self._is_previewable(media_type):
return {
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
"preview_kind": "image" if media_type.startswith("image/") else "pdf",
"preview_file_name": source_path.name,
"preview_media_type": media_type,
"preview_rendered_with": "",
}
return {
"previewable": False,
"preview_kind": "",
"preview_file_name": "",
"preview_media_type": "",
"preview_rendered_with": "",
}
def _refresh_pdf_preview_asset_if_needed(
    self,
    *,
    receipt_dir: Path,
    meta: dict[str, Any],
) -> dict[str, Any]:
    """Ensure a PDF receipt has a PNG preview from the current renderer.

    Nothing happens when the receipt has no source file name, the source is
    not an existing PDF, or the stored preview exists and is already tagged as
    an image/PNG rendered with ``DocumentPreviewAssets.PDF_RENDERER_ID``.
    Otherwise the first page is rendered into *receipt_dir*, the preview keys
    in *meta* are updated in place and persisted. Rendering is best-effort: on
    any failure *meta* is returned unchanged.
    """

    def meta_text(key: str) -> str:
        return str(meta.get(key) or "").strip()

    source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
    if not source_name:
        return meta

    source_path = self._assert_child(receipt_dir / source_name)
    source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
    if source_media_type != "application/pdf" or not source_path.exists():
        return meta

    preview_name = meta_text("preview_file_name")
    existing_preview = self._assert_child(receipt_dir / preview_name) if preview_name else None
    if existing_preview is not None:
        preview_is_current = (
            existing_preview.exists()
            and meta_text("preview_kind") == "image"
            and meta_text("preview_media_type") == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
            and meta_text("preview_rendered_with") == DocumentPreviewAssets.PDF_RENDERER_ID
        )
        if preview_is_current:
            return meta

    # Reuse the stored preview name only when it already is a PNG name.
    png_suffix = DocumentPreviewAssets.PDF_PREVIEW_SUFFIX
    if not (preview_name and preview_name.lower().endswith(png_suffix)):
        preview_name = f"preview{png_suffix}"
    target_path = self._assert_child(receipt_dir / preview_name)

    try:
        DocumentPreviewAssets.render_pdf_first_page(
            pdf_path=source_path,
            preview_path=target_path,
            timeout_seconds=get_settings().ocr_timeout_seconds,
        )
    except Exception:
        # Best-effort: keep serving whatever preview metadata already exists.
        return meta

    meta.update(
        {
            "previewable": True,
            "preview_kind": "image",
            "preview_file_name": target_path.name,
            "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }
    )
    self._write_meta(receipt_dir, meta)
    return meta
@staticmethod
def _is_previewable(media_type: str) -> bool:
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
receipt_id = str(meta.get("id") or "").strip()
status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
identity = self._resolve_receipt_document_identity(meta)
return ReceiptFolderItemRead(
id=receipt_id,
file_name=str(meta.get("file_name") or ""),
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
size_bytes=int(meta.get("size_bytes") or 0),
status=status_value,
status_label="已关联" if status_value == "linked" else "未关联",
document_type=str(meta.get("document_type") or "other"),
document_type_label=str(meta.get("document_type_label") or "其他单据"),
scene_code=str(meta.get("scene_code") or "other"),
scene_label=str(meta.get("scene_label") or "其他票据"),
document_type=identity["document_type"],
document_type_label=identity["document_type_label"],
scene_code=identity["scene_code"],
scene_label=identity["scene_label"],
summary=str(meta.get("summary") or ""),
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
document_date=self._resolve_receipt_document_date(meta),
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
)
def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
    """Resolve the document type and scene shown for a receipt.

    The stored identity is returned when it already names a specific document
    type. Otherwise the receipt's file name, summary and text are passed to
    ``build_document_insight``; its result replaces the stored identity only
    when it identifies something other than "other".
    """

    def stored_text(key: str, fallback: str) -> str:
        return str(meta.get(key) or fallback).strip() or fallback

    stored_identity = {
        "document_type": stored_text("document_type", "other"),
        "document_type_label": stored_text("document_type_label", "其他单据"),
        "scene_code": stored_text("scene_code", "other"),
        "scene_label": stored_text("scene_label", "其他票据"),
    }
    already_specific = (
        stored_identity["document_type"] not in {"", "other"}
        and stored_identity["document_type_label"] != "其他单据"
    )
    if already_specific:
        return stored_identity

    insight = build_document_insight(
        filename=str(meta.get("file_name") or ""),
        summary=str(meta.get("summary") or ""),
        text=self._receipt_text(meta),
    )
    if insight.document_type in {"", "other"}:
        return stored_identity
    return {
        "document_type": insight.document_type,
        "document_type_label": insight.document_type_label,
        "scene_code": insight.scene_code,
        "scene_label": insight.scene_label,
    }
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
fields = [
ReceiptFolderFieldRead(
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
if str(document_type or "").strip().lower() == "train_ticket":
return True
compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
return True
lower_compact = compact.lower()
return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
"12306" in compact
or "95306" in compact
or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—||-)[\u4e00-\u9fa5]{2,12}", compact)
or ("wuhan" in lower_compact and "shanghai" in lower_compact)
)
@classmethod
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
return raw
normalized = match.group(1).replace("", "-").replace("", "-").replace("", "")
normalized = normalized.replace("/", "-").replace(".", "-")
normalized = re.sub(r"\s+", "-", normalized)
parts = [part for part in normalized.split("-") if part]
if len(parts) != 3:
return match.group(1)
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
if not 2 <= len(cleaned) <= 8:
return ""
if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
if any(
token in cleaned
for token in (
"电子",
"客票",
"铁路",
"发票",
"税务",
"湖北省",
"中国铁路",
"开票",
"日期",
"车厢",
"座位",
"票价",
"金额",
"行程",
"出发",
"到达",
"车次",
)
):
return ""
return cleaned
@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
if labeled:
return labeled
fallback = ""
for line in str(text or "").replace("\r", "\n").splitlines():
compact_line = line.replace(" ", "")
if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
continue
match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
if match:
return str(match.group(1) or "").strip()
return ""
if not match:
continue
candidate = str(match.group(1) or "").strip()
if "*" in candidate:
return candidate
if not fallback:
fallback = candidate
return fallback
@staticmethod
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
combined_match = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or ""))
if combined_match:
return f"{combined_match.group(1)}", combined_match.group(2)
loose_match = TRAIN_LOOSE_SEAT_PATTERN.search(str(text or ""))
if loose_match:
return f"{loose_match.group(1).zfill(2)}", loose_match.group(2).upper()
carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text)
return carriage_no, seat_no
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
@staticmethod
def _extract_train_fare(text: str) -> str:
match = TRAIN_FARE_PATTERN.search(str(text or ""))
if not match:
match = max(
list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
default=None,
)
if not match:
return ""
value = str(match.group(1) or "").replace(",", ".").strip()
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
)
if existing_receipt is not None:
enriched.append(
document.model_copy(
update={
"receipt_id": existing_receipt.id,
"receipt_status": existing_receipt.status,
"receipt_preview_url": existing_receipt.preview_url,
"receipt_source_url": existing_receipt.source_url,
}
self._enrich_ocr_document_with_receipt(
document,
receipt=existing_receipt,
current_user=current_user,
)
)
continue
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
warning = "已上传过同样的单据,请不要重复上传。"
existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
enriched.append(
document.model_copy(
update={
"receipt_id": duplicate_receipt.id,
"receipt_status": duplicate_receipt.status,
"receipt_preview_url": duplicate_receipt.preview_url,
"receipt_source_url": duplicate_receipt.source_url,
"warnings": list(dict.fromkeys([*existing_warnings, warning])),
}
self._enrich_ocr_document_with_receipt(
document,
receipt=duplicate_receipt,
current_user=current_user,
extra_warnings=[*existing_warnings, warning],
)
)
continue
@@ -763,17 +893,78 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
current_user=current_user,
)
enriched.append(
document.model_copy(
update={
"receipt_id": receipt.id,
"receipt_status": receipt.status,
"receipt_preview_url": receipt.preview_url,
"receipt_source_url": receipt.source_url,
}
self._enrich_ocr_document_with_receipt(
document,
receipt=receipt,
current_user=current_user,
)
)
return result.model_copy(update={"documents": enriched})
def _enrich_ocr_document_with_receipt(
    self,
    document: OcrRecognizeDocumentRead,
    *,
    receipt: ReceiptFolderItemRead,
    current_user: CurrentUserContext,
    extra_warnings: list[str] | None = None,
) -> OcrRecognizeDocumentRead:
    """Return a copy of ``document`` annotated with the receipt it maps to.

    The receipt id, status and preview/source URLs are always copied onto the
    document. When stored metadata can be read for the receipt, its OCR text,
    summary, classification data and resolved fields take precedence, each
    falling back to the value already present on ``document``.

    Args:
        document: OCR result to enrich; it is not mutated.
        receipt: Receipt whose identifiers and URLs are attached.
        current_user: Passed to ``_read_receipt_meta`` when loading metadata.
        extra_warnings: When not ``None``, used instead of ``document.warnings``
            as the source of the warning list.

    Returns:
        A new document produced by ``document.model_copy(update=...)``.
    """
    update: dict[str, Any] = {
        "receipt_id": receipt.id,
        "receipt_status": receipt.status,
        "receipt_preview_url": receipt.preview_url,
        "receipt_source_url": receipt.source_url,
    }
    try:
        meta = self._read_receipt_meta(receipt.id, current_user)
    except FileNotFoundError:
        # Missing metadata is tolerated: only the receipt linkage is applied.
        meta = {}
    if meta:
        update.update(
            {
                "text": str(meta.get("ocr_text") or document.text or ""),
                "summary": str(meta.get("summary") or document.summary or ""),
                "document_type": str(meta.get("document_type") or document.document_type or "other"),
                "document_type_label": str(meta.get("document_type_label") or document.document_type_label or "其他单据"),
                "scene_code": str(meta.get("scene_code") or document.scene_code or "other"),
                "scene_label": str(meta.get("scene_label") or document.scene_label or "其他票据"),
                "classification_source": str(meta.get("ocr_classification_source") or document.classification_source or ""),
                "classification_confidence": float(
                    meta.get("ocr_classification_confidence")
                    or document.classification_confidence
                    or 0.0
                ),
                "classification_evidence": [
                    str(value)
                    for value in list(meta.get("ocr_classification_evidence") or document.classification_evidence or [])
                    if str(value).strip()
                ],
            }
        )
        # Every key above falls back to the document's own value; do the same
        # for the fields. Overriding with an empty list would silently discard
        # fields already recognised on the document.
        meta_fields = self._build_ocr_document_fields_from_meta(meta)
        if meta_fields:
            update["document_fields"] = meta_fields
    warnings = [
        str(item)
        for item in list(extra_warnings if extra_warnings is not None else document.warnings or [])
        if str(item).strip()
    ]
    if warnings:
        # dict.fromkeys de-duplicates while keeping first-seen order. When no
        # warning survives, the document's existing warnings stay untouched.
        update["warnings"] = list(dict.fromkeys(warnings))
    return document.model_copy(update=update)
def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
    """Convert the fields resolved from receipt metadata into OCR field models.

    Entries whose label or value is falsy are left out.
    """
    ocr_fields: list[OcrRecognizeFieldRead] = []
    for resolved in self._resolve_fields(meta):
        if not (resolved.label and resolved.value):
            continue
        ocr_fields.append(
            OcrRecognizeFieldRead(
                key=resolved.key,
                label=resolved.label,
                value=resolved.value,
            )
        )
    return ocr_fields
def save_receipt(
self,
*,
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
meta = self._read_receipt_meta(receipt_id, current_user)
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
preview_name = str(meta.get("preview_file_name") or "").strip()
if preview_name:
preview_path = self._assert_child(receipt_dir / preview_name)
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
if self._is_previewable(source_media_type):
return source_path, source_media_type, source_name
raise FileNotFoundError("Receipt preview not found")

View File

@@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)
def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
    """OCR text with English station names is still classified as a train ticket."""
    ocr_text = (
        ":26429165800002785705\n"
        ":2026 05 18\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    insight = build_document_insight(
        filename="2月20_武汉-上海.pdf",
        summary=":26429165800002785705:2026 05 18Wuhan Shanghaihongqiao G458",
        text=ocr_text,
    )

    assert insight.document_type == "train_ticket"
    assert insight.document_type_label == "火车/高铁票"
    assert insight.scene_code == "travel"

    # Check the extracted fields by label, in the same order as before.
    fields = dict((entry.label, entry.value) for entry in insight.fields)
    expected_fields = {
        "金额": "354元",
        "列车出发时间": "2026-02-20 07:55",
        "车次/航班": "G458",
        "行程": "武汉-上海",
    }
    for label, expected_value in expected_fields.items():
        assert fields[label] == expected_value
def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
insight = build_document_insight(
filename="铁路电子客票.pdf",

View File

@@ -0,0 +1,169 @@
from __future__ import annotations
import json
from decimal import Decimal
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.ocr import OcrService
from test_reimbursement_endpoints import build_client, seed_claim
def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
    monkeypatch,
    tmp_path,
) -> None:
    """Upload a train-ticket PDF whose OCR reports no lines and expect a passing analysis.

    The stubbed OCR result has ``avg_score=0.0`` and ``line_count=0``, yet its
    text still contains values such as ``G458``, the station names, a
    departure timestamp and ``354.00``.
    """
    ticket_name = "2月20_武汉-上海.pdf"
    ocr_text = (
        ":26429165800002785705\n"
        ":2026 05 18\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        recognized = OcrRecognizeDocumentRead(
            filename=ticket_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
    client, session_factory = build_client()

    # Seed a travel claim whose single item matches the ticket's route and fare.
    with session_factory() as db:
        claim, item = seed_claim(db)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        db.commit()
        claim_id = claim.id
        item_id = item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers=auth_headers,
        files=[("file", (ticket_name, b"%PDF-1.4 fake", "application/pdf"))],
    )

    assert upload_response.status_code == 200
    attachment = upload_response.json()["attachment"]
    analysis = attachment["analysis"]
    points = analysis["points"]
    assert attachment["document_info"]["document_type"] == "train_ticket"
    assert analysis["severity"] == "pass"
    assert all("未识别到有效文字" not in point for point in points)
    assert all("未识别到列车出发时间" not in point for point in points)
def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
    monkeypatch,
    tmp_path,
) -> None:
    """Reading attachment meta must replace a stale high-risk analysis with a passing one.

    After a successful upload, the stored ``analysis`` is overwritten on disk
    with an "unreadable attachment" verdict; the meta endpoint is then expected
    to return a ``pass`` severity without those stale points.
    """
    ticket_name = "2月20_武汉-上海.pdf"
    ocr_text = (
        ":26429165800002785705 :2026 05 18\n"
        "G458\n"
        "Wuhan Shanghaihongqiao\n"
        "2026 02 20 07:55 06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        recognized = OcrRecognizeDocumentRead(
            filename=ticket_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
    client, session_factory = build_client()

    with session_factory() as db:
        claim, item = seed_claim(db)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        db.commit()
        claim_id = claim.id
        item_id = item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers=auth_headers,
        files=[("file", (ticket_name, b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200

    # Overwrite the persisted analysis with a stale "unreadable" verdict.
    stale_analysis = {
        "severity": "high",
        "label": "高风险",
        "headline": "AI提示附件不符合票据校验条件",
        "summary": "当前附件存在明显异常,票据类型与当前费用场景不匹配,或无法作为有效报销材料。",
        "points": [
            "附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。",
            "日期字段:未识别到列车出发时间或乘车日期。",
        ],
        "rule_basis": [],
        "suggestion": "建议过滤当前不匹配的票据,重新上传符合当前费用场景的清晰原件。",
    }
    meta_path = next(tmp_path.rglob("*.meta.json"))
    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
    stored_meta["analysis"] = stale_analysis
    meta_path.write_text(json.dumps(stored_meta, ensure_ascii=False), encoding="utf-8")

    meta_response = client.get(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/meta",
        headers=auth_headers,
    )

    assert meta_response.status_code == 200
    analysis = meta_response.json()["analysis"]
    points = analysis["points"]
    assert analysis["severity"] == "pass"
    assert all("未识别到有效文字" not in point for point in points)
    assert all("未识别到列车出发时间" not in point for point in points)

View File

@@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path
assert deleted_response.status_code == 404
finally:
get_settings.cache_clear()
def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None:
    """The recognize endpoint returns a receipt id and receipt-derived train-ticket fields.

    The stubbed OCR document carries only date, trip number, route and amount;
    the response is expected to also expose the ID number, carriage, seat and
    fare fields.
    """
    image_name = "2月20_武汉-上海.png"
    ocr_text = (
        ":26429165800002785705\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        structured_fields = [
            OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
            OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
            OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
            OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
        ]
        recognized = OcrRecognizeDocumentRead(
            filename=image_name,
            media_type="image/png",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.92,
            line_count=0,
            page_count=1,
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=structured_fields,
        )
        return OcrRecognizeBatchRead(
            engine="paddleocr_mobile",
            model="PP-OCRv5_mobile",
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    try:
        client = build_client()
        response = client.post(
            "/api/v1/ocr/recognize",
            headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"},
            files=[("files", (image_name, b"fake-image", "image/png"))],
        )
    finally:
        # Settings are cached; reset so later tests see their own environment.
        get_settings.cache_clear()

    assert response.status_code == 200
    document = response.json()["documents"][0]
    fields = dict((entry["label"], entry["value"]) for entry in document["document_fields"])
    assert document["receipt_id"]
    expected_fields = {
        "身份证号": "4201061987****1615",
        "车厢": "06车",
        "座位号": "01B",
        "票价": "354.00元",
    }
    for label, expected_value in expected_fields.items():
        assert fields[label] == expected_value

View File

@@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
def test_ocr_service_recovers_image_text_from_worker_ocr_text(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """The service uses the worker's ``ocr_text`` when the worker returns no lines."""

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
            "avg_score": 0.92,
            "line_count": 0,
            "page_count": 1,
            "warnings": [],
            "lines": [],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files([("train-ticket.png", b"fake-train-image", "image/png")])
    finally:
        # Clear both caches so this test leaves no state behind.
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "铁路电子客票" in recognized.text
    assert recognized.document_type == "train_ticket"

    label_value_pairs = [(entry.label, entry.value) for entry in recognized.document_fields]
    assert ("列车出发时间", "2026-02-20 07:55") in label_value_pairs
    assert ("车次/航班", "G458") in label_value_pairs
    assert ("金额", "354元") in label_value_pairs
def test_ocr_service_passes_configured_device_to_worker(
monkeypatch,
tmp_path: Path,

View File

@@ -1,8 +1,11 @@
from __future__ import annotations
import base64
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeDocumentRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.receipt_folder import ReceiptFolderService
@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
get_settings.cache_clear()
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
    """A cached PDF preview lacking ``preview_rendered_with`` is re-rendered on resolve."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        stale_preview = b"stale-preview"
        preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}"
        pdf_name = "2月20_武汉-上海.pdf"
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            preview_kind="image",
            preview_data_url=preview_data_url,
        )

        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )
        receipt_dir = next(service.root.glob("pytest/*"))
        preview_path = receipt_dir / "preview.png"
        assert preview_path.read_bytes() == stale_preview

        # Drop the renderer marker so the stored preview counts as stale.
        outdated_meta = service._read_meta(receipt_dir)
        outdated_meta.pop("preview_rendered_with", None)
        service._write_meta(receipt_dir, outdated_meta)

        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
            preview_path.write_bytes(b"refreshed-preview")
            return preview_path

        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)

        resolved_path, resolved_media_type, resolved_name = service.resolve_preview(receipt.id, current_user)

        assert resolved_path == preview_path
        assert resolved_media_type == "image/png"
        assert resolved_name == "preview.png"
        assert preview_path.read_bytes() == b"refreshed-preview"
        refreshed_meta = service._read_meta(receipt_dir)
        assert refreshed_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
    finally:
        get_settings.cache_clear()
def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
    """Deleting a receipt removes its directory and stops duplicate detection for its content."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()
        receipt_name = "same-receipt.pdf"
        content = b"%PDF-1.4 same receipt"
        ocr_document = OcrRecognizeDocumentRead(
            filename=receipt_name,
            media_type="application/pdf",
            text="same receipt amount 354",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )
        receipt = service.save_receipt(
            filename=receipt_name,
            content=content,
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )
        receipt_dir = service.root / "pytest" / receipt.id
        assert receipt_dir.exists()

        # Before deletion the same content is reported as a duplicate.
        found_before = service.find_duplicate_receipt(
            filename=receipt_name,
            content=content,
            current_user=current_user,
        )
        assert found_before is not None
        assert found_before.id == receipt.id

        service.delete_receipt(receipt_id=receipt.id, current_user=current_user)

        assert not receipt_dir.exists()
        found_after = service.find_duplicate_receipt(
            filename=receipt_name,
            content=content,
            current_user=current_user,
        )
        assert found_after is None
    finally:
        get_settings.cache_clear()
def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
    """A document saved as type ``other`` is stored as a train ticket with full details."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        pdf_name = "2月20_武汉-上海.pdf"
        ocr_text = (
            ":26429165800002785705\n"
            ":2026 05 18\n"
            "G458\n"
            "Wuhan\n"
            "Shanghaihongqiao\n"
            "2026 02 20 07:55\n"
            "06 01B\n"
            ": 354.00\n"
            "4201061987****1615\n"
            ":6580061086021391007342026\n"
            "12306 95306"
        )
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )

        service = ReceiptFolderService()
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )

        assert receipt.document_type == "train_ticket"
        assert receipt.document_type_label == "火车/高铁票"
        assert receipt.scene_code == "travel"
        assert receipt.amount == "354.00元"
        assert receipt.document_date == "2026-02-20"
        assert receipt.merchant_name == "中国铁路"

        detail = service.get_receipt(receipt.id, current_user)
        fields = dict((entry.label, entry.value) for entry in detail.fields)
        expected_fields = {
            "行程": "武汉-上海",
            "车次": "G458",
            "列车出发时间": "2026-02-20 07:55",
            "票价": "354.00元",
            "身份证号": "4201061987****1615",
            "车厢": "06车",
            "座位号": "01B",
        }
        for label, expected_value in expected_fields.items():
            assert fields[label] == expected_value
        # No passenger-name field is expected for this OCR text.
        assert "乘车人" not in fields
    finally:
        get_settings.cache_clear()
def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import base64
import json
from collections.abc import Generator
from datetime import UTC, date, datetime
from decimal import Decimal
@@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit
from app.models.risk_observation import RiskObservation, RiskObservationFeedback
from app.models.role import Role
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.ocr import OcrService
@@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch,
meta_payload = upload_response.json()["attachment"]
assert meta_payload["preview_kind"] == "image"
assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview")
meta_path = next(tmp_path.rglob("invoice.pdf.meta.json"))
stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
preview_response = client.get(
f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",