feat(server): 票据文件夹资产缓存与文档预览统一生成
- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
@@ -92,7 +92,7 @@ def preview_receipt(receipt_id: str, current_user: CurrentUser) -> FileResponse:
|
||||
file_path, media_type, file_name = ReceiptFolderService().resolve_preview(receipt_id, current_user)
|
||||
except FileNotFoundError as exc:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Receipt preview not found") from exc
|
||||
return FileResponse(file_path, media_type=media_type, filename=file_name)
|
||||
return FileResponse(file_path, media_type=media_type, filename=file_name, headers={"Cache-Control": "no-store"})
|
||||
|
||||
|
||||
@router.get(
|
||||
|
||||
@@ -25,11 +25,15 @@ AMOUNT_PATTERNS = (
|
||||
re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
|
||||
re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
|
||||
)
|
||||
DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
|
||||
DATE_PATTERN = re.compile(
|
||||
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
|
||||
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
|
||||
)
|
||||
TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[::]([0-5]\d)(?!\d)")
|
||||
INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[::\s]*([A-Za-z0-9-]{6,24})")
|
||||
INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[::\s]*([A-Za-z0-9-]{6,24})")
|
||||
TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[::\s]*([A-Za-z0-9]{2,12})")
|
||||
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Za-z0-9])([GCDZKTLYS]\d{1,5})(?![A-Za-z0-9])", re.IGNORECASE)
|
||||
ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})")
|
||||
MERCHANT_PATTERNS = (
|
||||
re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[::\s]*([A-Za-z0-9\u4e00-\u9fa5()()·&\\-]{2,40})"),
|
||||
@@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
|
||||
best_score = score
|
||||
|
||||
if best_score <= 0:
|
||||
train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket")
|
||||
if train_rule and _looks_like_train_ticket(compact_text):
|
||||
return RuleMatch(
|
||||
rule=train_rule,
|
||||
confidence=0.82,
|
||||
evidence=("车次", "12306"),
|
||||
score=3.8,
|
||||
)
|
||||
return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)
|
||||
|
||||
confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12)
|
||||
@@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
|
||||
)
|
||||
|
||||
|
||||
def _looks_like_train_ticket(compact_text: str) -> bool:
|
||||
text = str(compact_text or "").lower()
|
||||
if not re.search(r"[gcdzktlys]\d{1,5}", text, flags=re.IGNORECASE):
|
||||
return False
|
||||
if "12306" in text or "95306" in text:
|
||||
return True
|
||||
if re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", text):
|
||||
return True
|
||||
return "wuhan" in text and "shanghai" in text
|
||||
|
||||
|
||||
def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
|
||||
if not response_text:
|
||||
return None
|
||||
@@ -521,33 +544,48 @@ def _merge_document_fields(
|
||||
|
||||
def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
    """Extract structured fields (amount, date, merchant, ...) from OCR text.

    Every field goes through ``append_field`` so that values are cleaned with
    ``_clean_field_value`` and each key is emitted at most once.

    Args:
        text: OCR text to scan.
        document_type: Recognised document type; ``"train_ticket"`` enables
            the stand-alone train-number and loose-amount fallbacks.

    Returns:
        The extracted fields in extraction order.
    """
    fields: list[DocumentField] = []
    normalized_type = str(document_type or "").strip().lower()

    def append_field(key: str, label: str, value: str) -> None:
        # Skip values that clean down to nothing and keys already collected.
        cleaned = _clean_field_value(value)
        if not cleaned:
            return
        if any(field.key == key for field in fields if field.key):
            return
        fields.append(DocumentField(key=key, label=label, value=cleaned))

    amount = _extract_amount(text)
    if amount:
        append_field("amount", "金额", amount)

    date_value = _extract_date(text, document_type=document_type)
    if date_value:
        append_field("date", "日期", date_value)

    merchant = _extract_merchant(text)
    if merchant:
        append_field("merchant_name", "商户", merchant)

    invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text)
    if invoice_number:
        append_field("invoice_number", "票据号码", invoice_number)

    invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text)
    if invoice_code:
        append_field("invoice_code", "发票代码", invoice_code)

    trip_no = _extract_pattern(TRIP_NO_PATTERN, text)
    if not trip_no and normalized_type == "train_ticket":
        # Train tickets often print the train number without a label.
        trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text)
    if trip_no:
        append_field("trip_no", "车次/航班", trip_no.upper())

    route = _extract_route(text)
    if route:
        append_field("route", "行程", route)

    # Last resort for train tickets: take a bare decimal number as the fare.
    if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields):
        append_field("amount", "金额", _extract_loose_decimal_amount(text))

    return fields
|
||||
|
||||
@@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
|
||||
raw_value = str(match.group(1) or "").strip()
|
||||
normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "")
|
||||
normalized = normalized.replace("/", "-").replace(".", "-")
|
||||
normalized = re.sub(r"\s+", "-", normalized)
|
||||
parts = [part for part in normalized.split("-") if part]
|
||||
if len(parts) != 3:
|
||||
return raw_value
|
||||
@@ -703,6 +742,23 @@ def _extract_route(text: str) -> str:
|
||||
return f"{start}-{end}"
|
||||
|
||||
|
||||
def _extract_loose_decimal_amount(text: str) -> str:
|
||||
best_value: Decimal | None = None
|
||||
for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", str(text or "")):
|
||||
try:
|
||||
candidate = Decimal(match.group(1)).quantize(Decimal("0.01"))
|
||||
except InvalidOperation:
|
||||
continue
|
||||
if candidate <= Decimal("0.00"):
|
||||
continue
|
||||
if best_value is None or candidate > best_value:
|
||||
best_value = candidate
|
||||
if best_value is None:
|
||||
return ""
|
||||
text_value = format(best_value, "f").rstrip("0").rstrip(".")
|
||||
return f"{text_value}元"
|
||||
|
||||
|
||||
def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
|
||||
match = pattern.search(text)
|
||||
if not match:
|
||||
|
||||
98
server/src/app/services/document_preview.py
Normal file
98
server/src/app/services/document_preview.py
Normal file
@@ -0,0 +1,98 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
import mimetypes
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class DocumentPreviewAssets:
    """Shared helpers for decoding data-URL previews and rendering PDF previews."""

    # Identifier stored alongside a rendered preview; callers compare it to
    # decide whether an existing preview has to be regenerated.
    PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data"
    PDF_PREVIEW_MEDIA_TYPE = "image/png"
    PDF_PREVIEW_SUFFIX = ".png"

    @staticmethod
    def decode_data_url(payload: str) -> tuple[str, bytes] | None:
        """Decode a ``data:<media>;base64,<body>`` URL.

        Returns:
            ``(media_type, content)`` on success, or ``None`` when the payload
            is not a base64 data URL or the body is not valid base64.
        """
        normalized = str(payload or "").strip()
        matched = re.match(
            r"^data:(?P<media>[\w.+-]+/[\w.+-]+);base64,(?P<body>.+)$",
            normalized,
            flags=re.DOTALL,
        )
        if not matched:
            return None
        try:
            content = base64.b64decode(matched.group("body"), validate=True)
        except (binascii.Error, ValueError):
            return None
        return matched.group("media"), content

    @classmethod
    def renderer_id_for_source(cls, media_type: str | None) -> str:
        """Return the renderer id for PDF sources and "" for everything else."""
        return cls.PDF_RENDERER_ID if str(media_type or "").strip() == "application/pdf" else ""

    @classmethod
    def write_data_url_preview(
        cls,
        *,
        preview_dir: Path,
        preview_name_stem: str,
        preview_data_url: str,
    ) -> tuple[Path, str, str] | None:
        """Decode *preview_data_url* and write it into *preview_dir*.

        Args:
            preview_dir: Directory that receives the preview file.
            preview_name_stem: File name without extension; it may itself
                contain dots (e.g. ``"invoice.preview"``).
            preview_data_url: Base64 data URL holding the preview bytes.

        Returns:
            ``(preview_path, media_type, preview_name)``, or ``None`` when the
            data URL cannot be decoded.
        """
        decoded = cls.decode_data_url(preview_data_url)
        if decoded is None:
            return None

        preview_media_type, preview_content = decoded
        suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
        # Use .name, not .stem: .stem would reduce "invoice.preview" to
        # "invoice", and "invoice.png" could then overwrite the source file.
        # .name still drops any directory components.
        base_name = Path(str(preview_name_stem or "")).name or "preview"
        preview_name = f"{base_name}{suffix}"
        preview_path = preview_dir / preview_name
        preview_path.write_bytes(preview_content)
        return preview_path, preview_media_type, preview_name

    @classmethod
    def render_pdf_first_page(
        cls,
        *,
        pdf_path: Path,
        preview_path: Path,
        timeout_seconds: int | float,
    ) -> Path:
        """Render the first page of *pdf_path* to a PNG at *preview_path*.

        Args:
            pdf_path: Source PDF.
            preview_path: Destination PNG; its parent directory is created.
            timeout_seconds: Timeout handed to ``subprocess.run``.

        Returns:
            *preview_path*.

        Raises:
            RuntimeError: ``pdftoppm`` exited non-zero or produced no image.
            subprocess.TimeoutExpired: ``pdftoppm`` exceeded the timeout.
            OSError: ``pdftoppm`` could not be started or files not written.
        """
        preview_path.parent.mkdir(parents=True, exist_ok=True)
        # The scratch directory is a sibling of the destination, so the final
        # move below stays on the same filesystem.
        with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir:
            prefix = Path(temp_dir) / "page"
            completed = subprocess.run(
                [
                    "pdftoppm",
                    "-png",
                    "-r",
                    "160",
                    # Only page 1 is kept, so do not render the whole document.
                    "-f",
                    "1",
                    "-l",
                    "1",
                    str(pdf_path),
                    str(prefix),
                ],
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                check=False,
            )
            if completed.returncode != 0:
                detail = (completed.stderr or completed.stdout or "").strip()
                raise RuntimeError(detail or "pdftoppm failed to render PDF preview.")

            pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key)
            if not pages:
                raise RuntimeError("pdftoppm did not generate a preview image.")
            # Move instead of copy so readers never see a half-written preview.
            pages[0].replace(preview_path)
        return preview_path

    @staticmethod
    def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
        """Sort key for ``page-<n>.png`` files: numeric page first, then name."""
        suffix = path.stem.rsplit("-", 1)[-1]
        try:
            return int(suffix), path.name
        except ValueError:
            # A non-numeric suffix sorts before every real page number.
            return 0, path.name
|
||||
@@ -336,7 +336,27 @@ class ExpenseClaimAttachmentAnalysisMixin:
|
||||
|
||||
@staticmethod
|
||||
def _has_date_like_text(text: str) -> bool:
|
||||
return bool(re.search(r"(20\d{2}[年/\-.]\d{1,2}[月/\-.]\d{1,2}日?)", text))
|
||||
return bool(re.search(r"(20\d{2}(?:[年/\-.]|\s+)\d{1,2}(?:[月/\-.]|\s+)\d{1,2}日?)", text))
|
||||
|
||||
@staticmethod
def _has_document_date_field(document_info: dict[str, Any]) -> bool:
    """Return True when *document_info* has a non-empty date-like field.

    A field counts as a date when its normalised key (lower-cased,
    underscores removed) is in one of the ``DOCUMENT_*_DATE_KEYS`` sets, or
    its label (spaces removed) contains one of the
    ``DOCUMENT_*_DATE_LABEL_TOKENS``.
    """
    date_keys = DOCUMENT_TRIP_DATE_KEYS | DOCUMENT_GENERIC_DATE_KEYS | DOCUMENT_INVOICE_DATE_KEYS
    date_label_tokens = (
        *DOCUMENT_TRIP_DATE_LABEL_TOKENS,
        *DOCUMENT_GENERIC_DATE_LABEL_TOKENS,
        *DOCUMENT_INVOICE_DATE_LABEL_TOKENS,
    )
    for field in document_info.get("fields") or []:
        if not isinstance(field, dict):
            continue
        # Fields with no value carry no date information.
        if not str(field.get("value") or "").strip():
            continue
        key = str(field.get("key") or "").strip().lower().replace("_", "")
        label = str(field.get("label") or "").replace(" ", "")
        if key in date_keys or any(token in label for token in date_label_tokens):
            return True
    return False
|
||||
|
||||
@staticmethod
|
||||
def _normalize_match_text(text: str) -> str:
|
||||
@@ -538,6 +558,12 @@ class ExpenseClaimAttachmentAnalysisMixin:
|
||||
recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据"
|
||||
requirement_matches = bool(requirement_check.get("matches"))
|
||||
mismatch_severity = str(requirement_check.get("mismatch_severity") or "high").strip().lower() or "high"
|
||||
document_fields = [
|
||||
field
|
||||
for field in list(document_info.get("fields") or [])
|
||||
if isinstance(field, dict) and str(field.get("value") or "").strip()
|
||||
]
|
||||
has_readable_content = bool(line_count > 0 or compact_text or document_fields)
|
||||
|
||||
has_ticket_keyword = any(
|
||||
keyword in compact_text
|
||||
@@ -556,15 +582,18 @@ class ExpenseClaimAttachmentAnalysisMixin:
|
||||
)
|
||||
)
|
||||
amount_candidates = self._extract_amount_candidates(text)
|
||||
field_amount = self._resolve_document_field_amount({"document_fields": document_fields})
|
||||
if field_amount is not None and field_amount not in amount_candidates:
|
||||
amount_candidates.insert(0, field_amount)
|
||||
item_amount = Decimal(item.item_amount or Decimal("0.00")).quantize(Decimal("0.01"))
|
||||
has_matching_amount = any(abs(candidate - item_amount) <= Decimal("1.00") for candidate in amount_candidates)
|
||||
has_date_text = self._has_date_like_text(text)
|
||||
has_date_text = self._has_date_like_text(text) or self._has_document_date_field(document_info)
|
||||
amount_mismatch = bool(amount_candidates) and item_amount > Decimal("0.00") and not has_matching_amount
|
||||
|
||||
points: list[str] = []
|
||||
if warnings:
|
||||
points.append(f"识别提示:{warnings[0]}")
|
||||
if line_count == 0 or not compact_text:
|
||||
if not has_readable_content:
|
||||
points.append("附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。")
|
||||
if recognized_document_type == "other" and not has_ticket_keyword:
|
||||
points.append("票据类型:未识别到发票、票据、电子行程单等关键字,暂无法判断票据类型。")
|
||||
@@ -617,8 +646,7 @@ class ExpenseClaimAttachmentAnalysisMixin:
|
||||
headline = "AI提示:住宿金额超出报销标准"
|
||||
summary = "当前住宿票据金额超过规则中心差旅住宿标准,已作为风险项保留在单据中;如需按特殊情况提交,请补充超标原因。"
|
||||
elif (
|
||||
line_count == 0
|
||||
or not compact_text
|
||||
not has_readable_content
|
||||
or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2)
|
||||
or (not requirement_matches and mismatch_severity == "high")
|
||||
or (purpose_mismatch_point and amount_mismatch)
|
||||
|
||||
@@ -119,6 +119,13 @@ class ExpenseClaimAttachmentDocumentMixin:
|
||||
metadata=metadata,
|
||||
item=item,
|
||||
)
|
||||
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
|
||||
file_path=file_path,
|
||||
metadata=metadata,
|
||||
)
|
||||
if self._attachment_metadata_needs_analysis_refresh(metadata):
|
||||
self._refresh_item_attachment_analysis(item)
|
||||
metadata = self._attachment_storage.read_meta(file_path)
|
||||
uploaded_at_value = metadata.get("uploaded_at")
|
||||
uploaded_at = None
|
||||
if isinstance(uploaded_at_value, str) and uploaded_at_value.strip():
|
||||
@@ -157,6 +164,68 @@ class ExpenseClaimAttachmentDocumentMixin:
|
||||
"requirement_check": requirement_check,
|
||||
}
|
||||
|
||||
@classmethod
def _attachment_metadata_needs_analysis_refresh(cls, metadata: dict[str, Any]) -> bool:
    """Decide whether the stored attachment analysis should be recomputed.

    A refresh is requested when:
    * no analysis dict is stored but the metadata carries an OCR signal;
    * the analysis says no readable text was found, yet the metadata now
      has a readable signal;
    * the analysis says the departure time or invoice date is missing, yet
      the metadata now has a date-like document field.
    """
    analysis = metadata.get("analysis")
    if not isinstance(analysis, dict):
        return cls._attachment_metadata_has_ocr_signal(metadata)

    stripped_points = (str(point or "").strip() for point in analysis.get("points") or [])
    points = [point for point in stripped_points if point]
    if not points:
        return False

    if any("未识别到有效文字" in point for point in points):
        return cls._attachment_metadata_has_readable_signal(metadata)

    if any("未识别到列车出发时间" in point or "未识别到开票日期" in point for point in points):
        return cls._attachment_metadata_has_date_field(metadata)

    return False
|
||||
|
||||
@classmethod
def _attachment_metadata_has_ocr_signal(cls, metadata: dict[str, Any]) -> bool:
    """Return True when the metadata holds any OCR output.

    Signals, checked in order: non-blank ``ocr_text``, non-blank
    ``ocr_summary``, a positive ``ocr_line_count``, or at least one
    document field with a value.
    """
    if str(metadata.get("ocr_text") or "").strip():
        return True
    if str(metadata.get("ocr_summary") or "").strip():
        return True

    try:
        line_count = int(metadata.get("ocr_line_count") or 0)
    except (TypeError, ValueError):
        # A malformed stored count must not break reads; treat it as zero.
        line_count = 0
    if line_count > 0:
        return True

    return bool(cls._attachment_metadata_document_fields(metadata))
|
||||
|
||||
@classmethod
def _attachment_metadata_has_readable_signal(cls, metadata: dict[str, Any]) -> bool:
    """Return True when the metadata shows the attachment had readable content.

    The criteria are the same as for an OCR signal (text, summary, line
    count or document fields), so this delegates instead of keeping a copy.
    """
    return cls._attachment_metadata_has_ocr_signal(metadata)
|
||||
|
||||
@staticmethod
|
||||
def _attachment_metadata_document_fields(metadata: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
document_info = metadata.get("document_info")
|
||||
if not isinstance(document_info, dict):
|
||||
return []
|
||||
return [
|
||||
field
|
||||
for field in list(document_info.get("fields") or [])
|
||||
if isinstance(field, dict) and str(field.get("value") or "").strip()
|
||||
]
|
||||
|
||||
@classmethod
def _attachment_metadata_has_date_field(cls, metadata: dict[str, Any]) -> bool:
    """Return True when any populated document field looks like a date.

    A field matches when its normalised key (lower-cased, underscores
    removed) is one of date/tripdate/departuredate/invoicedate, or its
    label (spaces removed) contains "日期", "时间" or "出发".
    """
    date_keys = {"date", "tripdate", "departuredate", "invoicedate"}
    # NOTE(review): "出发" would also match labels such as a departure
    # station, which is not a date -- confirm whether that is intended.
    label_tokens = ("日期", "时间", "出发")

    for field in cls._attachment_metadata_document_fields(metadata):
        key = str(field.get("key") or "").strip().lower().replace("_", "")
        if key in date_keys:
            return True
        label = str(field.get("label") or "").replace(" ", "")
        if any(token in label for token in label_tokens):
            return True
    return False
|
||||
|
||||
def _build_attachment_document_info(self, document: Any) -> dict[str, Any]:
|
||||
insight = build_document_insight(
|
||||
filename=str(getattr(document, "filename", "") or ""),
|
||||
|
||||
@@ -32,6 +32,7 @@ from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
|
||||
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
|
||||
from app.services.agent_foundation import AgentFoundationService
|
||||
from app.services.audit import AuditLogService
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.document_intelligence import build_document_insight
|
||||
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
|
||||
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
|
||||
@@ -238,6 +239,7 @@ class ExpenseClaimAttachmentOperationsMixin:
|
||||
"preview_storage_key": str(preview_meta["preview_storage_key"]),
|
||||
"preview_media_type": str(preview_meta["preview_media_type"]),
|
||||
"preview_file_name": str(preview_meta["preview_file_name"]),
|
||||
"preview_rendered_with": str(preview_meta.get("preview_rendered_with") or ""),
|
||||
"analysis": attachment_analysis,
|
||||
"document_info": document_info,
|
||||
"requirement_check": requirement_check,
|
||||
@@ -673,6 +675,60 @@ class ExpenseClaimAttachmentOperationsMixin:
|
||||
self._attachment_storage.write_meta(file_path, metadata)
|
||||
return metadata
|
||||
|
||||
def _refresh_pdf_attachment_preview_meta_if_needed(
    self,
    *,
    file_path: Path,
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Make sure a PDF attachment has an up-to-date PNG preview.

    Nothing happens when the metadata is empty, the attachment is not a
    PDF, or the stored preview exists and was produced by the current
    renderer. Otherwise the first page is rendered next to the attachment
    and the preview keys are updated and written back.

    Args:
        file_path: Path of the stored attachment.
        metadata: Attachment metadata; updated in place on success.

    Returns:
        The metadata; unchanged when no refresh was needed or rendering
        failed.
    """
    if not metadata:
        return metadata

    media_type = str(
        metadata.get("media_type")
        or self._attachment_presentation.resolve_media_type(file_path.name)
    ).strip()
    if media_type != "application/pdf":
        return metadata

    preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
    preview_path = self._attachment_storage.resolve_path(preview_storage_key) if preview_storage_key else None
    if (
        preview_path is not None
        and preview_path.exists()
        and str(metadata.get("preview_kind") or "").strip() == "image"
        and str(metadata.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
        and str(metadata.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
    ):
        return metadata

    # Keep only the final path component so a stored name can never point
    # outside the attachment directory.
    preview_name = Path(str(metadata.get("preview_file_name") or "").strip()).name
    if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
        preview_name = f"{file_path.stem}.preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
    preview_path = file_path.parent / preview_name

    try:
        DocumentPreviewAssets.render_pdf_first_page(
            pdf_path=file_path,
            preview_path=preview_path,
            timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
        )
    except Exception:
        # Best effort: a failed render leaves the previous metadata in place.
        return metadata

    metadata.update(
        {
            "previewable": True,
            "preview_kind": "image",
            "preview_storage_key": self._attachment_storage.to_storage_key(preview_path),
            "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            "preview_file_name": preview_path.name,
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }
    )
    self._attachment_storage.write_meta(file_path, metadata)
    return metadata
|
||||
|
||||
def _resolve_item_attachment_preview_content(self, item: ExpenseClaimItem) -> tuple[Path, str, str]:
|
||||
file_path, media_type, filename = self._resolve_item_attachment_content(item)
|
||||
metadata = self._attachment_storage.read_meta(file_path)
|
||||
@@ -681,6 +737,10 @@ class ExpenseClaimAttachmentOperationsMixin:
|
||||
metadata=metadata,
|
||||
item=item,
|
||||
)
|
||||
metadata = self._refresh_pdf_attachment_preview_meta_if_needed(
|
||||
file_path=file_path,
|
||||
metadata=metadata,
|
||||
)
|
||||
preview_storage_key = str(metadata.get("preview_storage_key") or "").strip()
|
||||
preview_file_name = str(metadata.get("preview_file_name") or "").strip()
|
||||
preview_media_type = str(metadata.get("preview_media_type") or "").strip()
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
import mimetypes
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
|
||||
|
||||
|
||||
@@ -42,6 +40,7 @@ class ExpenseClaimAttachmentPresentation:
|
||||
"preview_storage_key": self.storage.to_storage_key(preview_path),
|
||||
"preview_media_type": preview_media_type,
|
||||
"preview_file_name": preview_file_name,
|
||||
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
|
||||
}
|
||||
|
||||
if preview_kind:
|
||||
@@ -51,6 +50,7 @@ class ExpenseClaimAttachmentPresentation:
|
||||
"preview_storage_key": storage_key,
|
||||
"preview_media_type": media_type,
|
||||
"preview_file_name": filename,
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
|
||||
return {
|
||||
@@ -59,6 +59,7 @@ class ExpenseClaimAttachmentPresentation:
|
||||
"preview_storage_key": "",
|
||||
"preview_media_type": "",
|
||||
"preview_file_name": "",
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -72,15 +73,7 @@ class ExpenseClaimAttachmentPresentation:
|
||||
|
||||
@staticmethod
def decode_data_url(payload: str) -> tuple[str, bytes] | None:
    """Decode a base64 data URL via ``DocumentPreviewAssets.decode_data_url``.

    Kept as a thin wrapper so existing callers of this class keep working.
    """
    return DocumentPreviewAssets.decode_data_url(payload)
|
||||
|
||||
def _write_preview_asset_from_data_url(
|
||||
self,
|
||||
@@ -89,16 +82,11 @@ class ExpenseClaimAttachmentPresentation:
|
||||
original_filename: str,
|
||||
preview_data_url: str,
|
||||
) -> tuple[Path, str, str] | None:
|
||||
decoded = self.decode_data_url(preview_data_url)
|
||||
if decoded is None:
|
||||
return None
|
||||
|
||||
preview_media_type, preview_content = decoded
|
||||
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
|
||||
preview_name = f"{Path(original_filename).stem}.preview{suffix}"
|
||||
preview_path = attachment_dir / preview_name
|
||||
preview_path.write_bytes(preview_content)
|
||||
return preview_path, preview_media_type, preview_name
|
||||
return DocumentPreviewAssets.write_data_url_preview(
|
||||
preview_dir=attachment_dir,
|
||||
preview_name_stem=f"{Path(original_filename).stem}.preview",
|
||||
preview_data_url=preview_data_url,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def build_preview_client_path(claim_id: str, item_id: str) -> str:
|
||||
|
||||
@@ -537,7 +537,7 @@ class OcrService:
|
||||
if page_summary:
|
||||
aggregated.summary_fragments.append(page_summary)
|
||||
|
||||
page_text = str(payload.get("text", "") or "").strip()
|
||||
page_text = self._resolve_worker_document_text(payload)
|
||||
if page_text:
|
||||
aggregated.text_fragments.append(page_text)
|
||||
|
||||
@@ -626,6 +626,22 @@ class OcrService:
|
||||
return descriptor.text_layer
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _resolve_worker_document_text(payload: dict) -> str:
|
||||
for key in ("text", "ocr_text", "raw_text", "full_text"):
|
||||
value = str(payload.get(key, "") or "").strip()
|
||||
if value:
|
||||
return value
|
||||
|
||||
lines = payload.get("lines", [])
|
||||
if not isinstance(lines, list):
|
||||
return ""
|
||||
return "\n".join(
|
||||
str(item.get("text", "") or "").strip()
|
||||
for item in lines
|
||||
if isinstance(item, dict) and str(item.get("text", "") or "").strip()
|
||||
).strip()
|
||||
|
||||
@staticmethod
|
||||
def _build_lines(
|
||||
items: list[dict],
|
||||
|
||||
@@ -12,7 +12,7 @@ from uuid import uuid4
|
||||
|
||||
from app.api.deps import CurrentUserContext
|
||||
from app.core.config import get_settings
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
|
||||
from app.schemas.receipt_folder import (
|
||||
ReceiptFolderDeleteResponse,
|
||||
ReceiptFolderDetailRead,
|
||||
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
|
||||
ReceiptFolderItemRead,
|
||||
ReceiptFolderUpdate,
|
||||
)
|
||||
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.document_intelligence import build_document_insight
|
||||
from app.services.ocr import SUPPORTED_SUFFIXES
|
||||
|
||||
RECEIPT_DATE_PATTERN = re.compile(
|
||||
r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
|
||||
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
|
||||
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
|
||||
)
|
||||
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[::]([0-5]\d)(?!\d)")
|
||||
TRAIN_INVOICE_DATE_PATTERN = re.compile(
|
||||
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
|
||||
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[::]?\s*([0-9]{1,2}\s*车?)")
|
||||
TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[::]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
|
||||
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
|
||||
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
|
||||
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
|
||||
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
|
||||
|
||||
|
||||
class ReceiptFolderStorageMixin:
|
||||
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
|
||||
document: Any | None,
|
||||
) -> dict[str, Any]:
|
||||
preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
|
||||
decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
|
||||
if decoded is not None:
|
||||
preview_media_type, preview_content = decoded
|
||||
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
|
||||
preview_name = f"preview{suffix}"
|
||||
preview_path = receipt_dir / preview_name
|
||||
preview_path.write_bytes(preview_content)
|
||||
preview_asset = DocumentPreviewAssets.write_data_url_preview(
|
||||
preview_dir=receipt_dir,
|
||||
preview_name_stem="preview",
|
||||
preview_data_url=preview_data_url,
|
||||
)
|
||||
if preview_asset is not None:
|
||||
_, preview_media_type, preview_name = preview_asset
|
||||
return {
|
||||
"previewable": True,
|
||||
"preview_kind": "image",
|
||||
"preview_file_name": preview_name,
|
||||
"preview_media_type": preview_media_type,
|
||||
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
|
||||
}
|
||||
if self._is_previewable(media_type):
|
||||
return {
|
||||
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
|
||||
"preview_kind": "image" if media_type.startswith("image/") else "pdf",
|
||||
"preview_file_name": source_path.name,
|
||||
"preview_media_type": media_type,
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
return {
|
||||
"previewable": False,
|
||||
"preview_kind": "",
|
||||
"preview_file_name": "",
|
||||
"preview_media_type": "",
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
|
||||
def _refresh_pdf_preview_asset_if_needed(
    self,
    *,
    receipt_dir: Path,
    meta: dict[str, Any],
) -> dict[str, Any]:
    """Make sure a PDF receipt has an up-to-date PNG preview.

    Nothing happens when the source file name is unknown, the source is not
    an existing PDF, or the stored preview exists and was produced by the
    current renderer. Otherwise the first page is rendered into
    *receipt_dir* and the preview keys are updated and written back.

    Args:
        receipt_dir: Directory holding the receipt's files.
        meta: Receipt metadata; updated in place on success.

    Returns:
        The metadata; unchanged when no refresh was needed or rendering
        failed.
    """
    source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
    if not source_name:
        return meta

    source_path = self._assert_child(receipt_dir / source_name)
    source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
    if source_media_type != "application/pdf" or not source_path.exists():
        return meta

    preview_name = str(meta.get("preview_file_name") or "").strip()
    preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
    if (
        preview_path is not None
        and preview_path.exists()
        and str(meta.get("preview_kind") or "").strip() == "image"
        and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
        and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
    ):
        return meta

    # Reuse the stored name only when it already has the PNG suffix.
    if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
        preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
    preview_path = self._assert_child(receipt_dir / preview_name)

    try:
        DocumentPreviewAssets.render_pdf_first_page(
            pdf_path=source_path,
            preview_path=preview_path,
            timeout_seconds=get_settings().ocr_timeout_seconds,
        )
    except Exception:
        # Best effort: a failed render leaves the previous metadata in place.
        return meta

    meta.update(
        {
            "previewable": True,
            "preview_kind": "image",
            "preview_file_name": preview_path.name,
            "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }
    )
    self._write_meta(receipt_dir, meta)
    return meta
|
||||
|
||||
@staticmethod
|
||||
def _is_previewable(media_type: str) -> bool:
|
||||
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
|
||||
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
|
||||
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
|
||||
receipt_id = str(meta.get("id") or "").strip()
|
||||
status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
|
||||
identity = self._resolve_receipt_document_identity(meta)
|
||||
return ReceiptFolderItemRead(
|
||||
id=receipt_id,
|
||||
file_name=str(meta.get("file_name") or ""),
|
||||
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
|
||||
size_bytes=int(meta.get("size_bytes") or 0),
|
||||
status=status_value,
|
||||
status_label="已关联" if status_value == "linked" else "未关联",
|
||||
document_type=str(meta.get("document_type") or "other"),
|
||||
document_type_label=str(meta.get("document_type_label") or "其他单据"),
|
||||
scene_code=str(meta.get("scene_code") or "other"),
|
||||
scene_label=str(meta.get("scene_label") or "其他票据"),
|
||||
document_type=identity["document_type"],
|
||||
document_type_label=identity["document_type_label"],
|
||||
scene_code=identity["scene_code"],
|
||||
scene_label=identity["scene_label"],
|
||||
summary=str(meta.get("summary") or ""),
|
||||
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
|
||||
document_date=self._resolve_receipt_document_date(meta),
|
||||
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
|
||||
warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
|
||||
)
|
||||
|
||||
def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
|
||||
document_type = str(meta.get("document_type") or "other").strip() or "other"
|
||||
document_type_label = str(meta.get("document_type_label") or "其他单据").strip() or "其他单据"
|
||||
scene_code = str(meta.get("scene_code") or "other").strip() or "other"
|
||||
scene_label = str(meta.get("scene_label") or "其他票据").strip() or "其他票据"
|
||||
if document_type not in {"", "other"} and document_type_label != "其他单据":
|
||||
return {
|
||||
"document_type": document_type,
|
||||
"document_type_label": document_type_label,
|
||||
"scene_code": scene_code,
|
||||
"scene_label": scene_label,
|
||||
}
|
||||
|
||||
insight = build_document_insight(
|
||||
filename=str(meta.get("file_name") or ""),
|
||||
summary=str(meta.get("summary") or ""),
|
||||
text=self._receipt_text(meta),
|
||||
)
|
||||
if insight.document_type in {"", "other"}:
|
||||
return {
|
||||
"document_type": document_type,
|
||||
"document_type_label": document_type_label,
|
||||
"scene_code": scene_code,
|
||||
"scene_label": scene_label,
|
||||
}
|
||||
return {
|
||||
"document_type": insight.document_type,
|
||||
"document_type_label": insight.document_type_label,
|
||||
"scene_code": insight.scene_code,
|
||||
"scene_label": insight.scene_label,
|
||||
}
|
||||
|
||||
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
|
||||
fields = [
|
||||
ReceiptFolderFieldRead(
|
||||
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
|
||||
if str(document_type or "").strip().lower() == "train_ticket":
|
||||
return True
|
||||
compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
|
||||
return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
|
||||
if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
|
||||
return True
|
||||
lower_compact = compact.lower()
|
||||
return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
|
||||
"12306" in compact
|
||||
or "95306" in compact
|
||||
or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact)
|
||||
or ("wuhan" in lower_compact and "shanghai" in lower_compact)
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
|
||||
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
|
||||
return raw
|
||||
normalized = match.group(1).replace("年", "-").replace("月", "-").replace("日", "")
|
||||
normalized = normalized.replace("/", "-").replace(".", "-")
|
||||
normalized = re.sub(r"\s+", "-", normalized)
|
||||
parts = [part for part in normalized.split("-") if part]
|
||||
if len(parts) != 3:
|
||||
return match.group(1)
|
||||
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
|
||||
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
|
||||
if not 2 <= len(cleaned) <= 8:
|
||||
return ""
|
||||
if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
|
||||
if any(
|
||||
token in cleaned
|
||||
for token in (
|
||||
"电子",
|
||||
"客票",
|
||||
"铁路",
|
||||
"发票",
|
||||
"税务",
|
||||
"湖北省",
|
||||
"中国铁路",
|
||||
"开票",
|
||||
"日期",
|
||||
"车厢",
|
||||
"座位",
|
||||
"票价",
|
||||
"金额",
|
||||
"行程",
|
||||
"出发",
|
||||
"到达",
|
||||
"车次",
|
||||
)
|
||||
):
|
||||
return ""
|
||||
return cleaned
|
||||
|
||||
@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
|
||||
labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
|
||||
if labeled:
|
||||
return labeled
|
||||
fallback = ""
|
||||
for line in str(text or "").replace("\r", "\n").splitlines():
|
||||
compact_line = line.replace(" ", "")
|
||||
if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
|
||||
continue
|
||||
match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
|
||||
if match:
|
||||
return str(match.group(1) or "").strip()
|
||||
return ""
|
||||
if not match:
|
||||
continue
|
||||
candidate = str(match.group(1) or "").strip()
|
||||
if "*" in candidate:
|
||||
return candidate
|
||||
if not fallback:
|
||||
fallback = candidate
|
||||
return fallback
|
||||
|
||||
@staticmethod
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
    """Return ``(carriage, seat)`` extracted from train ticket text.

    ``TRAIN_COMBINED_SEAT_PATTERN`` is tried first, then
    ``TRAIN_LOOSE_SEAT_PATTERN`` (carriage zero-padded to two digits, seat
    upper-cased). When neither matches, carriage and seat are looked up
    independently with ``TRAIN_CARRIAGE_PATTERN`` and
    ``TRAIN_SEAT_NO_PATTERN``.
    """
    haystack = str(text or "")

    combined = TRAIN_COMBINED_SEAT_PATTERN.search(haystack)
    if combined is not None:
        carriage_digits, seat = combined.group(1, 2)
        return f"{carriage_digits}车", seat

    loose = TRAIN_LOOSE_SEAT_PATTERN.search(haystack)
    if loose is not None:
        return f"{loose.group(1).zfill(2)}车", loose.group(2).upper()

    extract_first = ReceiptFolderService._extract_first
    # Spaces inside the carriage match are removed; the seat is returned as found.
    carriage_no = extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
    seat_no = extract_first(TRAIN_SEAT_NO_PATTERN, text)
    return carriage_no, seat_no
|
||||
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
|
||||
@staticmethod
|
||||
def _extract_train_fare(text: str) -> str:
|
||||
match = TRAIN_FARE_PATTERN.search(str(text or ""))
|
||||
if not match:
|
||||
match = max(
|
||||
list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
|
||||
key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
|
||||
default=None,
|
||||
)
|
||||
if not match:
|
||||
return ""
|
||||
value = str(match.group(1) or "").replace(",", ".").strip()
|
||||
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
)
|
||||
if existing_receipt is not None:
|
||||
enriched.append(
|
||||
document.model_copy(
|
||||
update={
|
||||
"receipt_id": existing_receipt.id,
|
||||
"receipt_status": existing_receipt.status,
|
||||
"receipt_preview_url": existing_receipt.preview_url,
|
||||
"receipt_source_url": existing_receipt.source_url,
|
||||
}
|
||||
self._enrich_ocr_document_with_receipt(
|
||||
document,
|
||||
receipt=existing_receipt,
|
||||
current_user=current_user,
|
||||
)
|
||||
)
|
||||
continue
|
||||
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
warning = "已上传过同样的单据,请不要重复上传。"
|
||||
existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
|
||||
enriched.append(
|
||||
document.model_copy(
|
||||
update={
|
||||
"receipt_id": duplicate_receipt.id,
|
||||
"receipt_status": duplicate_receipt.status,
|
||||
"receipt_preview_url": duplicate_receipt.preview_url,
|
||||
"receipt_source_url": duplicate_receipt.source_url,
|
||||
"warnings": list(dict.fromkeys([*existing_warnings, warning])),
|
||||
}
|
||||
self._enrich_ocr_document_with_receipt(
|
||||
document,
|
||||
receipt=duplicate_receipt,
|
||||
current_user=current_user,
|
||||
extra_warnings=[*existing_warnings, warning],
|
||||
)
|
||||
)
|
||||
continue
|
||||
@@ -763,16 +893,77 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
current_user=current_user,
|
||||
)
|
||||
enriched.append(
|
||||
document.model_copy(
|
||||
update={
|
||||
self._enrich_ocr_document_with_receipt(
|
||||
document,
|
||||
receipt=receipt,
|
||||
current_user=current_user,
|
||||
)
|
||||
)
|
||||
return result.model_copy(update={"documents": enriched})
|
||||
|
||||
def _enrich_ocr_document_with_receipt(
    self,
    document: OcrRecognizeDocumentRead,
    *,
    receipt: ReceiptFolderItemRead,
    current_user: CurrentUserContext,
    extra_warnings: list[str] | None = None,
) -> OcrRecognizeDocumentRead:
    """Return a copy of ``document`` annotated with data from ``receipt``.

    The copy always carries the receipt id, status and preview/source URLs.
    When the receipt's stored metadata can be read, its OCR text, summary,
    classification and resolved fields take precedence, with the document's
    own values as fallback. Warnings come from ``extra_warnings`` when it is
    given, otherwise from the document; blank entries are dropped, duplicates
    are removed in first-seen order, and the key is only set when at least
    one warning remains.
    """
    changes: dict[str, Any] = dict(
        receipt_id=receipt.id,
        receipt_status=receipt.status,
        receipt_preview_url=receipt.preview_url,
        receipt_source_url=receipt.source_url,
    )

    try:
        stored = self._read_receipt_meta(receipt.id, current_user)
    except FileNotFoundError:
        # No stored metadata: only the receipt linkage is applied.
        stored = {}

    if stored:
        changes["text"] = str(stored.get("ocr_text") or document.text or "")
        changes["summary"] = str(stored.get("summary") or document.summary or "")
        changes["document_type"] = str(
            stored.get("document_type") or document.document_type or "other"
        )
        changes["document_type_label"] = str(
            stored.get("document_type_label") or document.document_type_label or "其他单据"
        )
        changes["scene_code"] = str(stored.get("scene_code") or document.scene_code or "other")
        changes["scene_label"] = str(stored.get("scene_label") or document.scene_label or "其他票据")
        changes["classification_source"] = str(
            stored.get("ocr_classification_source") or document.classification_source or ""
        )
        changes["classification_confidence"] = float(
            stored.get("ocr_classification_confidence")
            or document.classification_confidence
            or 0.0
        )
        evidence_source = (
            stored.get("ocr_classification_evidence") or document.classification_evidence or []
        )
        changes["classification_evidence"] = [
            str(entry) for entry in list(evidence_source) if str(entry).strip()
        ]
        changes["document_fields"] = self._build_ocr_document_fields_from_meta(stored)

    warning_source = extra_warnings if extra_warnings is not None else document.warnings or []
    unique_warnings = list(
        dict.fromkeys(str(entry) for entry in list(warning_source) if str(entry).strip())
    )
    if unique_warnings:
        changes["warnings"] = unique_warnings

    return document.model_copy(update=changes)
|
||||
|
||||
def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
    """Convert the fields resolved from ``meta`` into OCR field models.

    Entries returned by ``self._resolve_fields(meta)`` whose label or value
    is falsy are skipped. The remaining entries are copied (key, label,
    value) into ``OcrRecognizeFieldRead`` instances in resolution order.
    """
    return [
        OcrRecognizeFieldRead(
            key=field.key,
            label=field.label,
            value=field.value,
        )
        for field in self._resolve_fields(meta)
        if field.label and field.value
    ]
|
||||
|
||||
def save_receipt(
|
||||
self,
|
||||
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
|
||||
meta = self._read_receipt_meta(receipt_id, current_user)
|
||||
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
|
||||
meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
|
||||
preview_name = str(meta.get("preview_file_name") or "").strip()
|
||||
if preview_name:
|
||||
preview_path = self._assert_child(receipt_dir / preview_name)
|
||||
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
if self._is_previewable(source_media_type):
|
||||
return source_path, source_media_type, source_name
|
||||
raise FileNotFoundError("Receipt preview not found")
|
||||
|
||||
|
||||
@@ -84,6 +84,35 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
|
||||
assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)
|
||||
|
||||
|
||||
def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
    """OCR text with English station names is still classified as a train ticket."""
    ocr_lines = (
        ":26429165800002785705",
        ":2026 05 18",
        "G458",
        "Wuhan",
        "Shanghaihongqiao",
        "2026 02 20 07:55",
        "06 01B",
        ": 354.00",
        "4201061987****1615",
        ":6580061086021391007342026",
        "12306 95306",
    )
    insight = build_document_insight(
        filename="2月20_武汉-上海.pdf",
        summary=":26429165800002785705;:2026 05 18;Wuhan Shanghaihongqiao G458",
        text="\n".join(ocr_lines),
    )

    assert insight.document_type == "train_ticket"
    assert insight.document_type_label == "火车/高铁票"
    assert insight.scene_code == "travel"

    value_by_label = dict((entry.label, entry.value) for entry in insight.fields)
    expected_fields = {
        "金额": "354元",
        "列车出发时间": "2026-02-20 07:55",
        "车次/航班": "G458",
        "行程": "武汉-上海",
    }
    for label, expected_value in expected_fields.items():
        assert value_by_label[label] == expected_value
|
||||
|
||||
|
||||
def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
|
||||
insight = build_document_insight(
|
||||
filename="铁路电子客票.pdf",
|
||||
|
||||
@@ -0,0 +1,169 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from decimal import Decimal
|
||||
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
|
||||
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
|
||||
from app.services.ocr import OcrService
|
||||
from test_reimbursement_endpoints import build_client, seed_claim
|
||||
|
||||
|
||||
def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
    monkeypatch,
    tmp_path,
) -> None:
    """A train ticket whose OCR text yields structured fields passes attachment analysis."""
    ticket_name = "2月20_武汉-上海.pdf"
    ocr_text = "\n".join(
        (
            ":26429165800002785705",
            ":2026 05 18",
            "G458",
            "Wuhan",
            "Shanghaihongqiao",
            "2026 02 20 07:55",
            "06 01B",
            ": 354.00",
            "4201061987****1615",
            ":6580061086021391007342026",
            "12306 95306",
        )
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        recognized = OcrRecognizeDocumentRead(
            filename=ticket_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)

    client, session_factory = build_client()
    with session_factory() as db:
        claim, item = seed_claim(db)
        claim_values = {
            "expense_type": "travel",
            "reason": "武汉-上海差旅",
            "location": "上海",
            "amount": Decimal("354.00"),
        }
        item_values = {
            "item_type": "train_ticket",
            "item_reason": "武汉-上海",
            "item_location": "上海",
            "item_amount": Decimal("354.00"),
        }
        for attr, value in claim_values.items():
            setattr(claim, attr, value)
        for attr, value in item_values.items():
            setattr(item, attr, value)
        db.commit()
        claim_id = claim.id
        item_id = item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers=auth_headers,
        files=[("file", (ticket_name, b"%PDF-1.4 fake", "application/pdf"))],
    )

    assert upload_response.status_code == 200
    attachment = upload_response.json()["attachment"]
    analysis = attachment["analysis"]
    points = analysis["points"]

    assert attachment["document_info"]["document_type"] == "train_ticket"
    assert analysis["severity"] == "pass"
    # Neither the "unreadable" nor the "missing departure time" finding may appear.
    assert all("未识别到有效文字" not in point for point in points)
    assert all("未识别到列车出发时间" not in point for point in points)
|
||||
|
||||
|
||||
def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
    monkeypatch,
    tmp_path,
) -> None:
    """Reading attachment meta replaces a stale high-risk analysis stored on disk."""
    ticket_name = "2月20_武汉-上海.pdf"
    ocr_text = "\n".join(
        (
            ":26429165800002785705 :2026 05 18",
            "G458",
            "Wuhan Shanghaihongqiao",
            "2026 02 20 07:55 06 01B",
            ": 354.00",
            "4201061987****1615",
            ":6580061086021391007342026",
            "12306 95306",
        )
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        recognized = OcrRecognizeDocumentRead(
            filename=ticket_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)

    client, session_factory = build_client()
    with session_factory() as db:
        claim, item = seed_claim(db)
        claim_values = {
            "expense_type": "travel",
            "reason": "武汉-上海差旅",
            "location": "上海",
            "amount": Decimal("354.00"),
        }
        item_values = {
            "item_type": "train_ticket",
            "item_reason": "武汉-上海",
            "item_location": "上海",
            "item_amount": Decimal("354.00"),
        }
        for attr, value in claim_values.items():
            setattr(claim, attr, value)
        for attr, value in item_values.items():
            setattr(item, attr, value)
        db.commit()
        claim_id = claim.id
        item_id = item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    attachment_url = f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment"

    upload_response = client.post(
        attachment_url,
        headers=auth_headers,
        files=[("file", (ticket_name, b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200

    # Overwrite the stored analysis with an outdated "unreadable" verdict.
    stale_analysis = {
        "severity": "high",
        "label": "高风险",
        "headline": "AI提示:附件不符合票据校验条件",
        "summary": "当前附件存在明显异常,票据类型与当前费用场景不匹配,或无法作为有效报销材料。",
        "points": [
            "附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。",
            "日期字段:未识别到列车出发时间或乘车日期。",
        ],
        "rule_basis": [],
        "suggestion": "建议过滤当前不匹配的票据,重新上传符合当前费用场景的清晰原件。",
    }
    meta_path = next(tmp_path.rglob("*.meta.json"))
    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
    meta_path.write_text(
        json.dumps({**stored_meta, "analysis": stale_analysis}, ensure_ascii=False),
        encoding="utf-8",
    )

    meta_response = client.get(f"{attachment_url}/meta", headers=auth_headers)

    assert meta_response.status_code == 200
    analysis = meta_response.json()["analysis"]
    points = analysis["points"]
    assert analysis["severity"] == "pass"
    assert all("未识别到有效文字" not in point for point in points)
    assert all("未识别到列车出发时间" not in point for point in points)
|
||||
@@ -176,3 +176,73 @@ def test_ocr_recognize_endpoint_returns_structured_payload(monkeypatch, tmp_path
|
||||
assert deleted_response.status_code == 404
|
||||
finally:
|
||||
get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_ocr_recognize_endpoint_returns_receipt_enriched_train_fields(monkeypatch, tmp_path) -> None:
    """The recognize endpoint returns train fields enriched from the saved receipt."""
    image_name = "2月20_武汉-上海.png"
    ocr_text = "\n".join(
        (
            ":26429165800002785705",
            "G458",
            "Wuhan",
            "Shanghaihongqiao",
            "2026 02 20 07:55",
            "06 01B",
            ": 354.00",
            "4201061987****1615",
            ":6580061086021391007342026",
            "12306 95306",
        )
    )

    def fake_recognize(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        recognized = OcrRecognizeDocumentRead(
            filename=image_name,
            media_type="image/png",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.92,
            line_count=0,
            page_count=1,
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=[
                OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
                OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
                OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
                OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
            ],
        )
        return OcrRecognizeBatchRead(
            engine="paddleocr_mobile",
            model="PP-OCRv5_mobile",
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    monkeypatch.setattr(OcrService, "recognize_files", fake_recognize)
    try:
        client = build_client()
        response = client.post(
            "/api/v1/ocr/recognize",
            headers={"x-auth-username": "pytest", "x-auth-name": "Py Test"},
            files=[("files", (image_name, b"fake-image", "image/png"))],
        )
    finally:
        # Settings are cached; reset so later tests see their own storage root.
        get_settings.cache_clear()

    assert response.status_code == 200
    document = response.json()["documents"][0]
    value_by_label = dict((entry["label"], entry["value"]) for entry in document["document_fields"])
    assert document["receipt_id"]

    expected_fields = {
        "身份证号": "4201061987****1615",
        "车厢": "06车",
        "座位号": "01B",
        "票价": "354.00元",
    }
    for label, expected_value in expected_fields.items():
        assert value_by_label[label] == expected_value
|
||||
|
||||
@@ -101,6 +101,55 @@ print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
|
||||
assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
|
||||
|
||||
|
||||
def test_ocr_service_recovers_image_text_from_worker_ocr_text(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Worker output with only ``ocr_text`` (no lines) still yields text and fields."""

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
            "avg_score": 0.92,
            "line_count": 0,
            "page_count": 1,
            "warnings": [],
            "lines": [],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        upload = ("train-ticket.png", b"fake-train-image", "image/png")
        result = OcrService().recognize_files([upload])
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "铁路电子客票" in recognized.text
    assert recognized.document_type == "train_ticket"

    label_value_pairs = {(entry.label, entry.value) for entry in recognized.document_fields}
    assert ("列车出发时间", "2026-02-20 07:55") in label_value_pairs
    assert ("车次/航班", "G458") in label_value_pairs
    assert ("金额", "354元") in label_value_pairs
|
||||
|
||||
|
||||
def test_ocr_service_passes_configured_device_to_worker(
|
||||
monkeypatch,
|
||||
tmp_path: Path,
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
|
||||
from app.api.deps import CurrentUserContext
|
||||
from app.core.config import get_settings
|
||||
from app.schemas.ocr import OcrRecognizeDocumentRead
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.receipt_folder import ReceiptFolderService
|
||||
|
||||
|
||||
@@ -69,6 +72,172 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
|
||||
get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
    """A cached PDF preview without a renderer id is re-rendered on resolve."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        pdf_name = "2月20_武汉-上海.pdf"
        stale_preview = b"stale-preview"
        preview_data_url = "data:image/png;base64," + base64.b64encode(stale_preview).decode("ascii")
        service = ReceiptFolderService()
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            preview_kind="image",
            preview_data_url=preview_data_url,
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )

        receipt_dir = next(service.root.glob("pytest/*"))
        preview_path = receipt_dir / "preview.png"
        assert preview_path.read_bytes() == stale_preview

        # Drop the renderer marker so the stored preview looks stale.
        stored_meta = service._read_meta(receipt_dir)
        without_marker = {
            key: value for key, value in stored_meta.items() if key != "preview_rendered_with"
        }
        service._write_meta(receipt_dir, without_marker)

        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
            preview_path.write_bytes(b"refreshed-preview")
            return preview_path

        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)

        resolved = service.resolve_preview(receipt.id, current_user)
        resolved_path, media_type, file_name = resolved

        assert resolved_path == preview_path
        assert media_type == "image/png"
        assert file_name == "preview.png"
        assert preview_path.read_bytes() == b"refreshed-preview"
        refreshed_meta = service._read_meta(receipt_dir)
        assert refreshed_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
    finally:
        get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
    """Deleting a receipt also stops it from being reported as a duplicate."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()
        file_name = "same-receipt.pdf"
        content = b"%PDF-1.4 same receipt"

        def lookup_duplicate():
            return service.find_duplicate_receipt(
                filename=file_name,
                content=content,
                current_user=current_user,
            )

        ocr_document = OcrRecognizeDocumentRead(
            filename=file_name,
            media_type="application/pdf",
            text="same receipt amount 354",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )
        receipt = service.save_receipt(
            filename=file_name,
            content=content,
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )
        receipt_dir = service.root / "pytest" / receipt.id

        assert receipt_dir.exists()
        duplicate = lookup_duplicate()
        assert duplicate is not None
        assert duplicate.id == receipt.id

        service.delete_receipt(receipt_id=receipt.id, current_user=current_user)

        assert not receipt_dir.exists()
        assert lookup_duplicate() is None
    finally:
        get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
    """A receipt saved as "other" is shown as a train ticket with its detail fields."""
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        pdf_name = "2月20_武汉-上海.pdf"
        ocr_text = "\n".join(
            (
                ":26429165800002785705",
                ":2026 05 18",
                "G458",
                "Wuhan",
                "Shanghaihongqiao",
                "2026 02 20 07:55",
                "06 01B",
                ": 354.00",
                "4201061987****1615",
                ":6580061086021391007342026",
                "12306 95306",
            )
        )
        service = ReceiptFolderService()
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=current_user,
            document=ocr_document,
        )

        expected_summary = (
            ("document_type", "train_ticket"),
            ("document_type_label", "火车/高铁票"),
            ("scene_code", "travel"),
            ("amount", "354.00元"),
            ("document_date", "2026-02-20"),
            ("merchant_name", "中国铁路"),
        )
        for attribute, expected_value in expected_summary:
            assert getattr(receipt, attribute) == expected_value

        detail = service.get_receipt(receipt.id, current_user)
        value_by_label = dict((entry.label, entry.value) for entry in detail.fields)
        expected_fields = {
            "行程": "武汉-上海",
            "车次": "G458",
            "列车出发时间": "2026-02-20 07:55",
            "票价": "354.00元",
            "身份证号": "4201061987****1615",
            "车厢": "06车",
            "座位号": "01B",
        }
        for label, expected_value in expected_fields.items():
            assert value_by_label[label] == expected_value
        # No passenger name field is expected for this OCR text.
        assert "乘车人" not in value_by_label
    finally:
        get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_unlink_receipts_for_claim_marks_linked_receipts_unlinked(monkeypatch, tmp_path) -> None:
|
||||
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
|
||||
get_settings.cache_clear()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
from collections.abc import Generator
|
||||
from datetime import UTC, date, datetime
|
||||
from decimal import Decimal
|
||||
@@ -19,6 +20,7 @@ from app.models.organization import OrganizationUnit
|
||||
from app.models.risk_observation import RiskObservation, RiskObservationFeedback
|
||||
from app.models.role import Role
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
|
||||
from app.services.ocr import OcrService
|
||||
|
||||
@@ -686,6 +688,9 @@ def test_claim_item_pdf_attachment_preview_returns_generated_image(monkeypatch,
|
||||
meta_payload = upload_response.json()["attachment"]
|
||||
assert meta_payload["preview_kind"] == "image"
|
||||
assert meta_payload["preview_url"].endswith(f"/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview")
|
||||
meta_path = next(tmp_path.rglob("invoice.pdf.meta.json"))
|
||||
stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
assert stored_meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID
|
||||
|
||||
preview_response = client.get(
|
||||
f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment/preview",
|
||||
|
||||
Reference in New Issue
Block a user