feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
caoxiaozhu
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions

View File

@@ -12,7 +12,7 @@ from uuid import uuid4
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.schemas.receipt_folder import (
ReceiptFolderDeleteResponse,
ReceiptFolderDetailRead,
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
ReceiptFolderItemRead,
ReceiptFolderUpdate,
)
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import build_document_insight
from app.services.ocr import SUPPORTED_SUFFIXES
RECEIPT_DATE_PATTERN = re.compile(
r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
)
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)")
TRAIN_INVOICE_DATE_PATTERN = re.compile(
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:]?\s*([0-9]{1,2}\s*车?)")
TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
class ReceiptFolderStorageMixin:
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
document: Any | None,
) -> dict[str, Any]:
preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
if decoded is not None:
preview_media_type, preview_content = decoded
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
preview_name = f"preview{suffix}"
preview_path = receipt_dir / preview_name
preview_path.write_bytes(preview_content)
preview_asset = DocumentPreviewAssets.write_data_url_preview(
preview_dir=receipt_dir,
preview_name_stem="preview",
preview_data_url=preview_data_url,
)
if preview_asset is not None:
_, preview_media_type, preview_name = preview_asset
return {
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_name,
"preview_media_type": preview_media_type,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
}
if self._is_previewable(media_type):
return {
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
"preview_kind": "image" if media_type.startswith("image/") else "pdf",
"preview_file_name": source_path.name,
"preview_media_type": media_type,
"preview_rendered_with": "",
}
return {
"previewable": False,
"preview_kind": "",
"preview_file_name": "",
"preview_media_type": "",
"preview_rendered_with": "",
}
def _refresh_pdf_preview_asset_if_needed(
self,
*,
receipt_dir: Path,
meta: dict[str, Any],
) -> dict[str, Any]:
source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
if not source_name:
return meta
source_path = self._assert_child(receipt_dir / source_name)
source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
if source_media_type != "application/pdf" or not source_path.exists():
return meta
preview_name = str(meta.get("preview_file_name") or "").strip()
preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
if (
preview_path is not None
and preview_path.exists()
and str(meta.get("preview_kind") or "").strip() == "image"
and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
):
return meta
if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
preview_path = self._assert_child(receipt_dir / preview_name)
try:
DocumentPreviewAssets.render_pdf_first_page(
pdf_path=source_path,
preview_path=preview_path,
timeout_seconds=get_settings().ocr_timeout_seconds,
)
except Exception:
return meta
meta.update(
{
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_path.name,
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
}
)
self._write_meta(receipt_dir, meta)
return meta
@staticmethod
def _is_previewable(media_type: str) -> bool:
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
receipt_id = str(meta.get("id") or "").strip()
status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
identity = self._resolve_receipt_document_identity(meta)
return ReceiptFolderItemRead(
id=receipt_id,
file_name=str(meta.get("file_name") or ""),
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
size_bytes=int(meta.get("size_bytes") or 0),
status=status_value,
status_label="已关联" if status_value == "linked" else "未关联",
document_type=str(meta.get("document_type") or "other"),
document_type_label=str(meta.get("document_type_label") or "其他单据"),
scene_code=str(meta.get("scene_code") or "other"),
scene_label=str(meta.get("scene_label") or "其他票据"),
document_type=identity["document_type"],
document_type_label=identity["document_type_label"],
scene_code=identity["scene_code"],
scene_label=identity["scene_label"],
summary=str(meta.get("summary") or ""),
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
document_date=self._resolve_receipt_document_date(meta),
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
)
def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
document_type = str(meta.get("document_type") or "other").strip() or "other"
document_type_label = str(meta.get("document_type_label") or "其他单据").strip() or "其他单据"
scene_code = str(meta.get("scene_code") or "other").strip() or "other"
scene_label = str(meta.get("scene_label") or "其他票据").strip() or "其他票据"
if document_type not in {"", "other"} and document_type_label != "其他单据":
return {
"document_type": document_type,
"document_type_label": document_type_label,
"scene_code": scene_code,
"scene_label": scene_label,
}
insight = build_document_insight(
filename=str(meta.get("file_name") or ""),
summary=str(meta.get("summary") or ""),
text=self._receipt_text(meta),
)
if insight.document_type in {"", "other"}:
return {
"document_type": document_type,
"document_type_label": document_type_label,
"scene_code": scene_code,
"scene_label": scene_label,
}
return {
"document_type": insight.document_type,
"document_type_label": insight.document_type_label,
"scene_code": insight.scene_code,
"scene_label": insight.scene_label,
}
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
fields = [
ReceiptFolderFieldRead(
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
if str(document_type or "").strip().lower() == "train_ticket":
return True
compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
return True
lower_compact = compact.lower()
return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
"12306" in compact
or "95306" in compact
or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—||-)[\u4e00-\u9fa5]{2,12}", compact)
or ("wuhan" in lower_compact and "shanghai" in lower_compact)
)
@classmethod
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
return raw
normalized = match.group(1).replace("", "-").replace("", "-").replace("", "")
normalized = normalized.replace("/", "-").replace(".", "-")
normalized = re.sub(r"\s+", "-", normalized)
parts = [part for part in normalized.split("-") if part]
if len(parts) != 3:
return match.group(1)
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
if not 2 <= len(cleaned) <= 8:
return ""
if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
if any(
token in cleaned
for token in (
"电子",
"客票",
"铁路",
"发票",
"税务",
"湖北省",
"中国铁路",
"开票",
"日期",
"车厢",
"座位",
"票价",
"金额",
"行程",
"出发",
"到达",
"车次",
)
):
return ""
return cleaned
@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
if labeled:
return labeled
fallback = ""
for line in str(text or "").replace("\r", "\n").splitlines():
compact_line = line.replace(" ", "")
if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
continue
match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
if match:
return str(match.group(1) or "").strip()
return ""
if not match:
continue
candidate = str(match.group(1) or "").strip()
if "*" in candidate:
return candidate
if not fallback:
fallback = candidate
return fallback
@staticmethod
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
    """Pull the carriage number and seat number out of train-ticket text.

    Three strategies are tried in order:

    1. ``TRAIN_COMBINED_SEAT_PATTERN`` - both groups are returned exactly
       as matched.
    2. ``TRAIN_LOOSE_SEAT_PATTERN`` - the carriage is zero-padded to two
       characters and the seat is upper-cased.
    3. The separately labelled carriage and seat patterns, via
       ``ReceiptFolderService._extract_first``; spaces are removed from
       the carriage value.

    Args:
        text: Recognised ticket text; a falsy value is searched as "".

    Returns:
        ``(carriage_no, seat_no)``.
    """
    haystack = str(text or "")

    combined = TRAIN_COMBINED_SEAT_PATTERN.search(haystack)
    if combined is not None:
        carriage, seat = combined.group(1, 2)
        return f"{carriage}", seat

    loose = TRAIN_LOOSE_SEAT_PATTERN.search(haystack)
    if loose is not None:
        return loose.group(1).zfill(2), loose.group(2).upper()

    extract_first = ReceiptFolderService._extract_first
    carriage_no = extract_first(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
    return carriage_no, extract_first(TRAIN_SEAT_NO_PATTERN, text)
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
@staticmethod
def _extract_train_fare(text: str) -> str:
match = TRAIN_FARE_PATTERN.search(str(text or ""))
if not match:
match = max(
list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
default=None,
)
if not match:
return ""
value = str(match.group(1) or "").replace(",", ".").strip()
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
)
if existing_receipt is not None:
enriched.append(
document.model_copy(
update={
"receipt_id": existing_receipt.id,
"receipt_status": existing_receipt.status,
"receipt_preview_url": existing_receipt.preview_url,
"receipt_source_url": existing_receipt.source_url,
}
self._enrich_ocr_document_with_receipt(
document,
receipt=existing_receipt,
current_user=current_user,
)
)
continue
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
warning = "已上传过同样的单据,请不要重复上传。"
existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
enriched.append(
document.model_copy(
update={
"receipt_id": duplicate_receipt.id,
"receipt_status": duplicate_receipt.status,
"receipt_preview_url": duplicate_receipt.preview_url,
"receipt_source_url": duplicate_receipt.source_url,
"warnings": list(dict.fromkeys([*existing_warnings, warning])),
}
self._enrich_ocr_document_with_receipt(
document,
receipt=duplicate_receipt,
current_user=current_user,
extra_warnings=[*existing_warnings, warning],
)
)
continue
@@ -763,17 +893,78 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
current_user=current_user,
)
enriched.append(
document.model_copy(
update={
"receipt_id": receipt.id,
"receipt_status": receipt.status,
"receipt_preview_url": receipt.preview_url,
"receipt_source_url": receipt.source_url,
}
self._enrich_ocr_document_with_receipt(
document,
receipt=receipt,
current_user=current_user,
)
)
return result.model_copy(update={"documents": enriched})
def _enrich_ocr_document_with_receipt(
    self,
    document: OcrRecognizeDocumentRead,
    *,
    receipt: ReceiptFolderItemRead,
    current_user: CurrentUserContext,
    extra_warnings: list[str] | None = None,
) -> OcrRecognizeDocumentRead:
    """Return a copy of ``document`` linked to ``receipt``.

    The copy always carries the receipt id, status, preview URL and source
    URL. When the receipt's metadata can be read and is non-empty, the
    text, summary, classification values and document fields are taken
    from it, each falling back to the document's own value and then to a
    default.

    Args:
        document: OCR result to enrich.
        receipt: Receipt the document corresponds to.
        current_user: User context used to read the receipt metadata.
        extra_warnings: When not ``None``, replaces the document's warnings
            as the warning source.

    Returns:
        The enriched copy produced by ``document.model_copy``.
    """
    changes: dict[str, Any] = {}
    changes["receipt_id"] = receipt.id
    changes["receipt_status"] = receipt.status
    changes["receipt_preview_url"] = receipt.preview_url
    changes["receipt_source_url"] = receipt.source_url

    try:
        stored = self._read_receipt_meta(receipt.id, current_user)
    except FileNotFoundError:
        # No stored metadata: only the receipt link is applied.
        stored = {}

    if stored:
        changes["text"] = str(stored.get("ocr_text") or document.text or "")
        changes["summary"] = str(stored.get("summary") or document.summary or "")
        changes["document_type"] = str(stored.get("document_type") or document.document_type or "other")
        changes["document_type_label"] = str(
            stored.get("document_type_label") or document.document_type_label or "其他单据"
        )
        changes["scene_code"] = str(stored.get("scene_code") or document.scene_code or "other")
        changes["scene_label"] = str(stored.get("scene_label") or document.scene_label or "其他票据")
        changes["classification_source"] = str(
            stored.get("ocr_classification_source") or document.classification_source or ""
        )
        changes["classification_confidence"] = float(
            stored.get("ocr_classification_confidence") or document.classification_confidence or 0.0
        )

        evidence_source = stored.get("ocr_classification_evidence") or document.classification_evidence or []
        evidence: list[str] = []
        for entry in list(evidence_source):
            rendered = str(entry)
            if rendered.strip():
                evidence.append(rendered)
        changes["classification_evidence"] = evidence

        changes["document_fields"] = self._build_ocr_document_fields_from_meta(stored)

    if extra_warnings is not None:
        warning_source = extra_warnings
    else:
        warning_source = document.warnings or []
    kept_warnings = [str(item) for item in list(warning_source) if str(item).strip()]
    if kept_warnings:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        changes["warnings"] = list(dict.fromkeys(kept_warnings))

    return document.model_copy(update=changes)
def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
    """Convert the receipt's resolved fields into OCR field entries.

    Args:
        meta: Receipt metadata passed to ``self._resolve_fields``.

    Returns:
        One ``OcrRecognizeFieldRead`` per resolved field that has both a
        truthy label and a truthy value, in the order they were resolved.
    """
    ocr_fields: list[OcrRecognizeFieldRead] = []
    for entry in self._resolve_fields(meta):
        if not (entry.label and entry.value):
            continue
        ocr_fields.append(
            OcrRecognizeFieldRead(
                key=entry.key,
                label=entry.label,
                value=entry.value,
            )
        )
    return ocr_fields
def save_receipt(
self,
*,
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
meta = self._read_receipt_meta(receipt_id, current_user)
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
preview_name = str(meta.get("preview_file_name") or "").strip()
if preview_name:
preview_path = self._assert_child(receipt_dir / preview_name)
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
if self._is_previewable(source_media_type):
return source_path, source_media_type, source_name
raise FileNotFoundError("Receipt preview not found")