feat(server): 票据文件夹资产缓存与文档预览统一生成
- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
@@ -12,7 +12,7 @@ from uuid import uuid4
|
||||
|
||||
from app.api.deps import CurrentUserContext
|
||||
from app.core.config import get_settings
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
|
||||
from app.schemas.receipt_folder import (
|
||||
ReceiptFolderDeleteResponse,
|
||||
ReceiptFolderDetailRead,
|
||||
@@ -20,11 +20,13 @@ from app.schemas.receipt_folder import (
|
||||
ReceiptFolderItemRead,
|
||||
ReceiptFolderUpdate,
|
||||
)
|
||||
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.document_intelligence import build_document_insight
|
||||
from app.services.ocr import SUPPORTED_SUFFIXES
|
||||
|
||||
RECEIPT_DATE_PATTERN = re.compile(
|
||||
r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
|
||||
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
|
||||
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
|
||||
)
|
||||
# Clock time such as "8:30" or "23:05": group 1 is the hour (0-23), group 2 the
# minute (00-59); neither side may touch another digit.
# The separator class previously listed the ASCII colon twice, so a time written
# with the full-width colon (U+FF1A) never matched. The full-width form is spelled
# as an escape so it cannot be folded back into ASCII by text normalisation.
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:\uff1a]([0-5]\d)(?!\d)")
|
||||
TRAIN_INVOICE_DATE_PATTERN = re.compile(
|
||||
@@ -45,7 +47,9 @@ TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座
|
||||
# NOTE: the separator classes below used to list the ASCII colon (and the yen
# sign) twice, which left the full-width variants (U+FF1A, U+FFE5) unmatched.
# The full-width characters are written as \uXXXX escapes so they survive any
# text normalisation of this file.

# Labelled carriage number, e.g. "车厢: 05车"; group 1 is the number with an
# optional trailing "车".
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:\uff1a]?\s*([0-9]{1,2}\s*车?)")

# Labelled seat number, e.g. "座位号: 12A"; group 1 is 1-3 digits plus an
# optional A-F letter or "号".
TRAIN_SEAT_NO_PATTERN = re.compile(
    r"(?:座位|座位号)\s*[:\uff1a]?\s*([0-9]{1,3}[A-F号]?)",
    re.IGNORECASE,
)

# Carriage and seat printed together, e.g. "05车12A号"; group 1 is the carriage
# digits, group 2 the seat.
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)

# Unlabelled "<carriage> <seat>" pair separated only by whitespace, e.g. "05 12A".
TRAIN_LOOSE_SEAT_PATTERN = re.compile(
    r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])",
    re.IGNORECASE,
)

# Labelled fare, e.g. "票价: ¥263.50"; group 1 is the amount with an optional
# one- or two-digit fraction after "." or ",".
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[:\uff1a¥\uffe5\s]*([0-9]+(?:[.,][0-9]{1,2})?)")

# Any standalone decimal number with up to six integer digits, e.g. "263.50".
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
|
||||
|
||||
|
||||
class ReceiptFolderStorageMixin:
|
||||
@@ -101,18 +105,19 @@ class ReceiptFolderStorageMixin:
|
||||
document: Any | None,
|
||||
) -> dict[str, Any]:
|
||||
preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
|
||||
decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
|
||||
if decoded is not None:
|
||||
preview_media_type, preview_content = decoded
|
||||
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
|
||||
preview_name = f"preview{suffix}"
|
||||
preview_path = receipt_dir / preview_name
|
||||
preview_path.write_bytes(preview_content)
|
||||
preview_asset = DocumentPreviewAssets.write_data_url_preview(
|
||||
preview_dir=receipt_dir,
|
||||
preview_name_stem="preview",
|
||||
preview_data_url=preview_data_url,
|
||||
)
|
||||
if preview_asset is not None:
|
||||
_, preview_media_type, preview_name = preview_asset
|
||||
return {
|
||||
"previewable": True,
|
||||
"preview_kind": "image",
|
||||
"preview_file_name": preview_name,
|
||||
"preview_media_type": preview_media_type,
|
||||
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
|
||||
}
|
||||
if self._is_previewable(media_type):
|
||||
return {
|
||||
@@ -120,14 +125,67 @@ class ReceiptFolderStorageMixin:
|
||||
"preview_kind": "image" if media_type.startswith("image/") else "pdf",
|
||||
"preview_file_name": source_path.name,
|
||||
"preview_media_type": media_type,
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
return {
|
||||
"previewable": False,
|
||||
"preview_kind": "",
|
||||
"preview_file_name": "",
|
||||
"preview_media_type": "",
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
|
||||
def _refresh_pdf_preview_asset_if_needed(
    self,
    *,
    receipt_dir: Path,
    meta: dict[str, Any],
) -> dict[str, Any]:
    """Make sure a PDF receipt has an up-to-date rendered image preview.

    Nothing is done (and ``meta`` is returned untouched) when the receipt has
    no source file name, the source is not an existing ``application/pdf``
    file, or the recorded preview file already exists and is tagged as an
    image with ``DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE`` rendered by
    ``DocumentPreviewAssets.PDF_RENDERER_ID``.

    Otherwise the first PDF page is rendered via
    ``DocumentPreviewAssets.render_pdf_first_page``; on success the preview
    keys of ``meta`` are updated in place and persisted with
    ``self._write_meta``.

    Args:
        receipt_dir: Directory holding the receipt's files.
        meta: Receipt metadata; mutated in place when a preview is rendered.

    Returns:
        The same ``meta`` object, refreshed or unchanged.
    """
    import logging

    source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
    if not source_name:
        return meta

    source_path = self._assert_child(receipt_dir / source_name)
    source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
    if source_media_type != "application/pdf" or not source_path.exists():
        return meta

    preview_name = str(meta.get("preview_file_name") or "").strip()
    preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
    if (
        preview_path is not None
        and preview_path.exists()
        and str(meta.get("preview_kind") or "").strip() == "image"
        and str(meta.get("preview_media_type") or "").strip() == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
        and str(meta.get("preview_rendered_with") or "").strip() == DocumentPreviewAssets.PDF_RENDERER_ID
    ):
        # The stored preview was produced by the current renderer; keep it.
        return meta

    # Never render over a file that does not carry the preview suffix (for
    # example the source PDF itself when it doubles as the preview).
    if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
        preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
    preview_path = self._assert_child(receipt_dir / preview_name)

    try:
        DocumentPreviewAssets.render_pdf_first_page(
            pdf_path=source_path,
            preview_path=preview_path,
            timeout_seconds=get_settings().ocr_timeout_seconds,
        )
    except Exception:
        # Rendering is best-effort: callers fall back to the existing
        # metadata. The failure is logged rather than silently discarded so
        # that a broken renderer can be diagnosed.
        logging.getLogger(__name__).warning(
            "Failed to render PDF preview for %s",
            source_path,
            exc_info=True,
        )
        return meta

    meta.update(
        {
            "previewable": True,
            "preview_kind": "image",
            "preview_file_name": preview_path.name,
            "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }
    )
    self._write_meta(receipt_dir, meta)
    return meta
|
||||
|
||||
@staticmethod
|
||||
def _is_previewable(media_type: str) -> bool:
    """Report whether a media type can be previewed directly.

    Any ``image/*`` type and exactly ``application/pdf`` qualify; a falsy
    value is treated as the empty string and is therefore not previewable.
    """
    normalized = str(media_type or "")
    if normalized == "application/pdf":
        return True
    return normalized.startswith("image/")
|
||||
@@ -256,6 +314,7 @@ class ReceiptFolderItemMixin:
|
||||
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
|
||||
receipt_id = str(meta.get("id") or "").strip()
|
||||
status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
|
||||
identity = self._resolve_receipt_document_identity(meta)
|
||||
return ReceiptFolderItemRead(
|
||||
id=receipt_id,
|
||||
file_name=str(meta.get("file_name") or ""),
|
||||
@@ -263,10 +322,10 @@ class ReceiptFolderItemMixin:
|
||||
size_bytes=int(meta.get("size_bytes") or 0),
|
||||
status=status_value,
|
||||
status_label="已关联" if status_value == "linked" else "未关联",
|
||||
document_type=str(meta.get("document_type") or "other"),
|
||||
document_type_label=str(meta.get("document_type_label") or "其他单据"),
|
||||
scene_code=str(meta.get("scene_code") or "other"),
|
||||
scene_label=str(meta.get("scene_label") or "其他票据"),
|
||||
document_type=identity["document_type"],
|
||||
document_type_label=identity["document_type_label"],
|
||||
scene_code=identity["scene_code"],
|
||||
scene_label=identity["scene_label"],
|
||||
summary=str(meta.get("summary") or ""),
|
||||
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
|
||||
document_date=self._resolve_receipt_document_date(meta),
|
||||
@@ -283,6 +342,38 @@ class ReceiptFolderItemMixin:
|
||||
warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
|
||||
)
|
||||
|
||||
def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
    """Return the document type / scene identity to show for a receipt.

    The identity stored in ``meta`` is used as-is when it is already specific
    (type other than "other" and a label other than the generic one).
    Otherwise ``build_document_insight`` is asked to classify the receipt
    from its file name, summary and text; its answer is used unless it is
    also "other" or empty, in which case the stored values are returned.

    Returns:
        A dict with ``document_type``, ``document_type_label``,
        ``scene_code`` and ``scene_label``.
    """

    def stored_value(key: str, default: str) -> str:
        # Missing, falsy or whitespace-only values collapse to the default.
        return str(meta.get(key) or default).strip() or default

    stored = {
        "document_type": stored_value("document_type", "other"),
        "document_type_label": stored_value("document_type_label", "其他单据"),
        "scene_code": stored_value("scene_code", "other"),
        "scene_label": stored_value("scene_label", "其他票据"),
    }

    is_generic_type = stored["document_type"] in {"", "other"}
    is_generic_label = stored["document_type_label"] == "其他单据"
    if not (is_generic_type or is_generic_label):
        return stored

    insight = build_document_insight(
        filename=str(meta.get("file_name") or ""),
        summary=str(meta.get("summary") or ""),
        text=self._receipt_text(meta),
    )
    if insight.document_type in {"", "other"}:
        return stored

    return {
        "document_type": insight.document_type,
        "document_type_label": insight.document_type_label,
        "scene_code": insight.scene_code,
        "scene_label": insight.scene_label,
    }
|
||||
|
||||
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
|
||||
fields = [
|
||||
ReceiptFolderFieldRead(
|
||||
@@ -503,7 +594,15 @@ class ReceiptFolderTrainTicketMixin:
|
||||
if str(document_type or "").strip().lower() == "train_ticket":
|
||||
return True
|
||||
compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
|
||||
return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
|
||||
if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
|
||||
return True
|
||||
lower_compact = compact.lower()
|
||||
return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
|
||||
"12306" in compact
|
||||
or "95306" in compact
|
||||
or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact)
|
||||
or ("wuhan" in lower_compact and "shanghai" in lower_compact)
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
|
||||
@@ -581,6 +680,7 @@ class ReceiptFolderTrainTicketMixin:
|
||||
return raw
|
||||
normalized = match.group(1).replace("年", "-").replace("月", "-").replace("日", "")
|
||||
normalized = normalized.replace("/", "-").replace(".", "-")
|
||||
normalized = re.sub(r"\s+", "-", normalized)
|
||||
parts = [part for part in normalized.split("-") if part]
|
||||
if len(parts) != 3:
|
||||
return match.group(1)
|
||||
@@ -651,7 +751,28 @@ class ReceiptFolderTrainTicketMixin:
|
||||
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
|
||||
if not 2 <= len(cleaned) <= 8:
|
||||
return ""
|
||||
if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
|
||||
if any(
|
||||
token in cleaned
|
||||
for token in (
|
||||
"电子",
|
||||
"客票",
|
||||
"铁路",
|
||||
"发票",
|
||||
"税务",
|
||||
"湖北省",
|
||||
"中国铁路",
|
||||
"开票",
|
||||
"日期",
|
||||
"车厢",
|
||||
"座位",
|
||||
"票价",
|
||||
"金额",
|
||||
"行程",
|
||||
"出发",
|
||||
"到达",
|
||||
"车次",
|
||||
)
|
||||
):
|
||||
return ""
|
||||
return cleaned
|
||||
|
||||
@@ -660,20 +781,29 @@ class ReceiptFolderTrainTicketMixin:
|
||||
labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
|
||||
if labeled:
|
||||
return labeled
|
||||
fallback = ""
|
||||
for line in str(text or "").replace("\r", "\n").splitlines():
|
||||
compact_line = line.replace(" ", "")
|
||||
if any(token in compact_line for token in ("发票号码", "电子客票号", "客票号", "订单号")):
|
||||
continue
|
||||
match = TRAIN_ID_FALLBACK_PATTERN.search(compact_line)
|
||||
if match:
|
||||
return str(match.group(1) or "").strip()
|
||||
return ""
|
||||
if not match:
|
||||
continue
|
||||
candidate = str(match.group(1) or "").strip()
|
||||
if "*" in candidate:
|
||||
return candidate
|
||||
if not fallback:
|
||||
fallback = candidate
|
||||
return fallback
|
||||
|
||||
@staticmethod
|
||||
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
    """Extract ``(carriage, seat)`` from train-ticket text.

    Three strategies are tried in order:

    1. ``TRAIN_COMBINED_SEAT_PATTERN`` - both values printed together; the
       carriage digits get a "车" suffix and the seat is returned as matched.
    2. ``TRAIN_LOOSE_SEAT_PATTERN`` - a bare whitespace-separated pair; the
       carriage is zero-padded to two digits and the seat upper-cased.
    3. The separately labelled carriage and seat patterns, each looked up
       through ``ReceiptFolderService._extract_first``; spaces are removed
       from the carriage value.
    """
    haystack = str(text or "")

    together = TRAIN_COMBINED_SEAT_PATTERN.search(haystack)
    if together:
        carriage_digits, seat = together.group(1), together.group(2)
        return f"{carriage_digits}车", seat

    bare_pair = TRAIN_LOOSE_SEAT_PATTERN.search(haystack)
    if bare_pair:
        padded_carriage = bare_pair.group(1).zfill(2)
        return f"{padded_carriage}车", bare_pair.group(2).upper()

    labelled_carriage = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text)
    labelled_seat = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text)
    return labelled_carriage.replace(" ", ""), labelled_seat
|
||||
@@ -681,6 +811,12 @@ class ReceiptFolderTrainTicketMixin:
|
||||
@staticmethod
|
||||
def _extract_train_fare(text: str) -> str:
|
||||
match = TRAIN_FARE_PATTERN.search(str(text or ""))
|
||||
if not match:
|
||||
match = max(
|
||||
list(TRAIN_LOOSE_FARE_PATTERN.finditer(str(text or ""))),
|
||||
key=lambda item: float(str(item.group(1) or "0").replace(",", ".")),
|
||||
default=None,
|
||||
)
|
||||
if not match:
|
||||
return ""
|
||||
value = str(match.group(1) or "").replace(",", ".").strip()
|
||||
@@ -721,13 +857,10 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
)
|
||||
if existing_receipt is not None:
|
||||
enriched.append(
|
||||
document.model_copy(
|
||||
update={
|
||||
"receipt_id": existing_receipt.id,
|
||||
"receipt_status": existing_receipt.status,
|
||||
"receipt_preview_url": existing_receipt.preview_url,
|
||||
"receipt_source_url": existing_receipt.source_url,
|
||||
}
|
||||
self._enrich_ocr_document_with_receipt(
|
||||
document,
|
||||
receipt=existing_receipt,
|
||||
current_user=current_user,
|
||||
)
|
||||
)
|
||||
continue
|
||||
@@ -744,14 +877,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
warning = "已上传过同样的单据,请不要重复上传。"
|
||||
existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
|
||||
enriched.append(
|
||||
document.model_copy(
|
||||
update={
|
||||
"receipt_id": duplicate_receipt.id,
|
||||
"receipt_status": duplicate_receipt.status,
|
||||
"receipt_preview_url": duplicate_receipt.preview_url,
|
||||
"receipt_source_url": duplicate_receipt.source_url,
|
||||
"warnings": list(dict.fromkeys([*existing_warnings, warning])),
|
||||
}
|
||||
self._enrich_ocr_document_with_receipt(
|
||||
document,
|
||||
receipt=duplicate_receipt,
|
||||
current_user=current_user,
|
||||
extra_warnings=[*existing_warnings, warning],
|
||||
)
|
||||
)
|
||||
continue
|
||||
@@ -763,17 +893,78 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
current_user=current_user,
|
||||
)
|
||||
enriched.append(
|
||||
document.model_copy(
|
||||
update={
|
||||
"receipt_id": receipt.id,
|
||||
"receipt_status": receipt.status,
|
||||
"receipt_preview_url": receipt.preview_url,
|
||||
"receipt_source_url": receipt.source_url,
|
||||
}
|
||||
self._enrich_ocr_document_with_receipt(
|
||||
document,
|
||||
receipt=receipt,
|
||||
current_user=current_user,
|
||||
)
|
||||
)
|
||||
return result.model_copy(update={"documents": enriched})
|
||||
|
||||
def _enrich_ocr_document_with_receipt(
    self,
    document: OcrRecognizeDocumentRead,
    *,
    receipt: ReceiptFolderItemRead,
    current_user: CurrentUserContext,
    extra_warnings: list[str] | None = None,
) -> OcrRecognizeDocumentRead:
    """Return a copy of an OCR document annotated with its stored receipt.

    The copy always carries the receipt id, status and preview/source URLs.
    When the receipt's metadata can be read, text, summary, classification
    data and document fields are taken from it, falling back to the values
    already on ``document`` (document fields are always rebuilt from the
    metadata). A missing metadata file is tolerated.

    Args:
        document: OCR result to enrich; it is not modified.
        receipt: Receipt folder item the document belongs to.
        current_user: User context used to locate the receipt metadata.
        extra_warnings: When not ``None``, replaces ``document.warnings`` as
            the source of warnings for the copy.

    Returns:
        A new document produced with ``model_copy(update=...)``.
    """
    changes: dict[str, Any] = dict(
        receipt_id=receipt.id,
        receipt_status=receipt.status,
        receipt_preview_url=receipt.preview_url,
        receipt_source_url=receipt.source_url,
    )

    try:
        meta = self._read_receipt_meta(receipt.id, current_user)
    except FileNotFoundError:
        meta = {}

    if meta:
        evidence_source = meta.get("ocr_classification_evidence") or document.classification_evidence or []
        confidence_source = (
            meta.get("ocr_classification_confidence")
            or document.classification_confidence
            or 0.0
        )
        changes["text"] = str(meta.get("ocr_text") or document.text or "")
        changes["summary"] = str(meta.get("summary") or document.summary or "")
        changes["document_type"] = str(meta.get("document_type") or document.document_type or "other")
        changes["document_type_label"] = str(
            meta.get("document_type_label") or document.document_type_label or "其他单据"
        )
        changes["scene_code"] = str(meta.get("scene_code") or document.scene_code or "other")
        changes["scene_label"] = str(meta.get("scene_label") or document.scene_label or "其他票据")
        changes["classification_source"] = str(
            meta.get("ocr_classification_source") or document.classification_source or ""
        )
        changes["classification_confidence"] = float(confidence_source)
        changes["classification_evidence"] = [
            str(entry) for entry in list(evidence_source) if str(entry).strip()
        ]
        changes["document_fields"] = self._build_ocr_document_fields_from_meta(meta)

    if extra_warnings is not None:
        warning_source = extra_warnings
    else:
        warning_source = document.warnings or []
    kept_warnings = [str(entry) for entry in list(warning_source) if str(entry).strip()]
    if kept_warnings:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        changes["warnings"] = list(dict.fromkeys(kept_warnings))

    return document.model_copy(update=changes)
|
||||
|
||||
def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
    """Convert the receipt's resolved fields into OCR field models.

    Fields produced by ``self._resolve_fields`` that lack a label or a value
    are left out.
    """
    ocr_fields: list[OcrRecognizeFieldRead] = []
    for resolved in self._resolve_fields(meta):
        if not (resolved.label and resolved.value):
            continue
        ocr_fields.append(
            OcrRecognizeFieldRead(
                key=resolved.key,
                label=resolved.label,
                value=resolved.value,
            )
        )
    return ocr_fields
|
||||
|
||||
def save_receipt(
|
||||
self,
|
||||
*,
|
||||
@@ -1024,6 +1215,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
|
||||
meta = self._read_receipt_meta(receipt_id, current_user)
|
||||
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
|
||||
meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)
|
||||
preview_name = str(meta.get("preview_file_name") or "").strip()
|
||||
if preview_name:
|
||||
preview_path = self._assert_child(receipt_dir / preview_name)
|
||||
@@ -1038,4 +1230,3 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
|
||||
if self._is_previewable(source_media_type):
|
||||
return source_path, source_media_type, source_name
|
||||
raise FileNotFoundError("Receipt preview not found")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user