- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存 - 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项 - OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache - receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配 - 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
1416 lines
59 KiB
Python
1416 lines
59 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import hashlib
|
||
import mimetypes
|
||
import re
|
||
import shutil
|
||
from datetime import UTC, datetime
|
||
from pathlib import Path
|
||
from typing import Any
|
||
from uuid import uuid4
|
||
|
||
from app.api.deps import CurrentUserContext
|
||
from app.core.config import get_settings
|
||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
|
||
from app.schemas.receipt_folder import (
|
||
ReceiptFolderDeleteResponse,
|
||
ReceiptFolderDetailRead,
|
||
ReceiptFolderFieldRead,
|
||
ReceiptFolderItemRead,
|
||
ReceiptFolderUpdate,
|
||
)
|
||
from app.services.document_preview import DocumentPreviewAssets
|
||
from app.services.document_intelligence import build_document_insight
|
||
from app.services.ocr import SUPPORTED_SUFFIXES
|
||
|
||
# NOTE: label separators accept both the ASCII colon and the full-width colon
# (U+FF1A); amounts accept both the yen sign and its full-width form (U+FFE5).
# The full-width forms are written as \uXXXX escapes so they survive any
# tooling that folds full-width punctuation to ASCII.

# A calendar date such as 2024-05-01, 2024/5/1, 2024年5月1日 or "2024 05 01".
RECEIPT_DATE_PATTERN = re.compile(
    r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
    r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
)
# A clock time HH:MM that is not embedded in a longer digit run.
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:\uff1a]([0-5]\d)(?!\d)")
# A date that directly follows an invoice-date label.
TRAIN_INVOICE_DATE_PATTERN = re.compile(
    r"(?:开票日期|发票日期|开票时间)\s*[:\uff1a]?\s*"
    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
)
# "<station> 至/到/→/- <station>".
TRAIN_ROUTE_PATTERN = re.compile(
    r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—|–|-)\s*"
    r"([\u4e00-\u9fa5]{2,12})站?"
)
# "<station> <train number> <station>" separated by whitespace.
TRAIN_ROUTE_WITH_NO_PATTERN = re.compile(
    r"([\u4e00-\u9fa5]{2,12})站?\s+[GCDZKTLYS]\d{1,5}\s+"
    r"([\u4e00-\u9fa5]{2,12})站?",
    re.IGNORECASE,
)
# Train number introduced by a label.
TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[:\uff1a]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE)
# Train number standing on its own (not part of a longer alphanumeric token).
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Z0-9])([GCDZKTLYS]\d{1,5})(?![A-Z0-9])", re.IGNORECASE)
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|乘客|旅客姓名|姓名)\s*[:\uff1a]?\s*([\u4e00-\u9fa5·]{2,20})")
TRAIN_PURCHASER_NAME_PATTERN = re.compile(
    r"购买方名称\s*[:\uff1a]?\s*([·\u4e00-\u9fa5]{2,20}?)(?=\s*(?:统一社会信用代码|纳税人识别号|$))"
)
TRAIN_ID_PATTERN = re.compile(
    r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号|证件号)\s*[:\uff1a]?\s*([0-9Xx*]{6,24})"
)
TRAIN_ID_FALLBACK_PATTERN = re.compile(r"(?<![0-9A-Za-z])([0-9]{6,17}[0-9Xx*]{2,8})(?![0-9A-Za-z])")
TRAIN_ETICKET_PATTERN = re.compile(r"(?:电子客票号|客票号)\s*[:\uff1a]?\s*([A-Z0-9]{6,32})", re.IGNORECASE)
TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座|一等卧|二等卧|软卧|硬卧|软座|硬座|无座)")
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:\uff1a]?\s*([0-9]{1,2}\s*车?)")
TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:\uff1a]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[:\uff1a¥\uffe5\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
# Field keys / labels whose values must look like a station name.
TRAIN_STATION_FIELD_KEYS = {"departure_station", "arrival_station"}
TRAIN_STATION_FIELD_LABELS = {"出发地点", "到达地点"}
# Substrings that disqualify a candidate from being treated as a station name.
TRAIN_INVALID_STATION_TOKENS = (
    "座",
    "席",
    "扫码",
    "无效",
    "票价",
    "金额",
    "车厢",
    "座位",
    "乘客",
    "证件",
    "身份证",
    "订单",
    "单据",
    "日期",
    "渠道",
    "官方",
    "演示",
    "不可报销",
)
|
||
|
||
|
||
class ReceiptFolderStorageMixin:
|
||
@staticmethod
|
||
def normalize_filename(filename: str | None) -> str:
|
||
normalized = Path(str(filename or "").strip()).name
|
||
normalized = re.sub(r"[^\w.\-\u4e00-\u9fff]+", "_", normalized).strip("._")
|
||
return normalized or "receipt.bin"
|
||
|
||
@staticmethod
|
||
def resolve_media_type(filename: str, fallback: str | None = None) -> str:
|
||
return str(mimetypes.guess_type(filename)[0] or fallback or "application/octet-stream")
|
||
|
||
def _owner_root(self, owner_key: str) -> Path:
|
||
return self._assert_child(self.root / owner_key)
|
||
|
||
def _receipt_dir(self, owner_key: str, receipt_id: str) -> Path:
|
||
normalized = str(receipt_id or "").strip()
|
||
if not re.fullmatch(r"[0-9a-fA-F-]{32,36}", normalized):
|
||
raise FileNotFoundError("Receipt not found")
|
||
path = self._assert_child(self._owner_root(owner_key) / normalized)
|
||
if not path.exists() or not path.is_dir():
|
||
raise FileNotFoundError("Receipt not found")
|
||
return path
|
||
|
||
def _assert_child(self, path: Path) -> Path:
|
||
self.root.mkdir(parents=True, exist_ok=True)
|
||
resolved = path.resolve()
|
||
try:
|
||
resolved.relative_to(self.root)
|
||
except ValueError as exc:
|
||
raise FileNotFoundError("Receipt path is invalid") from exc
|
||
return resolved
|
||
|
||
@staticmethod
|
||
def _owner_key(current_user: CurrentUserContext) -> str:
|
||
raw = str(current_user.username or current_user.name or "anonymous").strip().lower()
|
||
normalized = re.sub(r"[^\w.\-\u4e00-\u9fff]+", "_", raw).strip("._")
|
||
return normalized or "anonymous"
|
||
|
||
@staticmethod
|
||
def _should_persist_source(filename: str, content: bytes) -> bool:
|
||
if not content:
|
||
return False
|
||
return Path(str(filename or "")).suffix.lower() in SUPPORTED_SUFFIXES
|
||
|
||
def _write_preview_asset(
    self,
    *,
    receipt_dir: Path,
    source_path: Path,
    media_type: str,
    document: Any | None,
) -> dict[str, Any]:
    """Create a preview for a stored receipt and describe it.

    Strategy, in order:
      1. write the preview carried by ``document.preview_data_url``, if any;
      2. for PDFs, render the first page to an image; if rendering fails,
         fall back to serving the PDF itself;
      3. for other previewable media types, serve the source file directly;
      4. otherwise report the receipt as not previewable.

    Returns:
        A dict with the keys ``previewable``, ``preview_kind``,
        ``preview_file_name``, ``preview_media_type`` and
        ``preview_rendered_with``.
    """

    def describe(kind: str, file_name: str, asset_media_type: str, renderer: str) -> dict[str, Any]:
        # All previewable outcomes share the same shape.
        return {
            "previewable": True,
            "preview_kind": kind,
            "preview_file_name": file_name,
            "preview_media_type": asset_media_type,
            "preview_rendered_with": renderer,
        }

    data_url = str(getattr(document, "preview_data_url", "") or "").strip()
    written = DocumentPreviewAssets.write_data_url_preview(
        preview_dir=receipt_dir,
        preview_name_stem="preview",
        preview_data_url=data_url,
    )
    if written is not None:
        _, written_media_type, written_name = written
        return describe(
            "image",
            written_name,
            written_media_type,
            DocumentPreviewAssets.renderer_id_for_source(media_type),
        )

    if str(media_type or "").strip() == "application/pdf":
        rendered_path = receipt_dir / f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
        try:
            DocumentPreviewAssets.render_pdf_first_page(
                pdf_path=source_path,
                preview_path=rendered_path,
                timeout_seconds=get_settings().ocr_timeout_seconds,
            )
        except Exception:
            # Best effort: any rendering failure degrades to the raw PDF.
            return describe("pdf", source_path.name, media_type, "")
        return describe(
            "image",
            rendered_path.name,
            DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            DocumentPreviewAssets.PDF_RENDERER_ID,
        )

    if not self._is_previewable(media_type):
        return {
            "previewable": False,
            "preview_kind": "",
            "preview_file_name": "",
            "preview_media_type": "",
            "preview_rendered_with": "",
        }

    direct_kind = "image" if media_type.startswith("image/") else "pdf"
    return describe(direct_kind, source_path.name, media_type, "")
|
||
|
||
def _refresh_pdf_preview_asset_if_needed(
    self,
    *,
    receipt_dir: Path,
    meta: dict[str, Any],
) -> dict[str, Any]:
    """Re-render the image preview of a stored PDF when it is missing or stale.

    Nothing happens when the metadata names no source file, the source is not
    a PDF, the source file is gone, or the metadata already records an
    existing preview image produced by the current PDF renderer. Otherwise
    the first page is rendered again; on failure the preview falls back to
    the PDF itself. In both cases *meta* is updated in place and written back
    to disk.

    Returns:
        The (possibly updated) *meta* dict.
    """

    def recorded(key: str) -> str:
        return str(meta.get(key) or "").strip()

    source_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
    if not source_name:
        return meta

    source_path = self._assert_child(receipt_dir / source_name)
    source_media_type = self.resolve_media_type(source_path.name, str(meta.get("media_type") or ""))
    is_pdf = source_media_type == "application/pdf"
    if not (is_pdf and source_path.exists()):
        return meta

    preview_name = recorded("preview_file_name")
    preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
    already_current = (
        preview_path is not None
        and preview_path.exists()
        and recorded("preview_kind") == "image"
        and recorded("preview_media_type") == DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE
        and recorded("preview_rendered_with") == DocumentPreviewAssets.PDF_RENDERER_ID
    )
    if already_current:
        return meta

    # Reuse the recorded name only when it already carries the rendered
    # preview suffix; otherwise use the default preview file name.
    if not preview_name or not preview_name.lower().endswith(DocumentPreviewAssets.PDF_PREVIEW_SUFFIX):
        preview_name = f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
    preview_path = self._assert_child(receipt_dir / preview_name)

    try:
        DocumentPreviewAssets.render_pdf_first_page(
            pdf_path=source_path,
            preview_path=preview_path,
            timeout_seconds=get_settings().ocr_timeout_seconds,
        )
    except Exception:
        # Best effort: any rendering failure degrades to the raw PDF.
        outcome = {
            "previewable": True,
            "preview_kind": "pdf",
            "preview_file_name": source_path.name,
            "preview_media_type": "application/pdf",
            "preview_rendered_with": "",
        }
    else:
        outcome = {
            "previewable": True,
            "preview_kind": "image",
            "preview_file_name": preview_path.name,
            "preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }

    meta.update(outcome)
    self._write_meta(receipt_dir, meta)
    return meta
|
||
|
||
@staticmethod
|
||
def _is_previewable(media_type: str) -> bool:
|
||
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
|
||
|
||
@classmethod
def _build_document_meta(cls, document: Any | None) -> dict[str, Any]:
    """Convert a recognised document into the metadata dict stored per receipt.

    Attributes are read defensively with ``getattr``, so *document* may be
    ``None`` or lack attributes; missing or falsy values take the defaults
    shown below. Fields without both a label and a value are dropped, and
    train tickets get their fields enriched from the recognised text.
    """

    def peek(name: str, fallback: Any) -> Any:
        # Missing and falsy attributes both collapse to the fallback.
        return getattr(document, name, None) or fallback

    def cell(field: Any, name: str) -> str:
        # Fields arrive either as dicts or as attribute-style objects.
        raw = field.get(name) if isinstance(field, dict) else getattr(field, name, "")
        return str(raw or "").strip()

    candidates = (
        {name: cell(field, name) for name in ("key", "label", "value")}
        for field in list(peek("document_fields", []))
    )
    fields = [entry for entry in candidates if entry["label"] and entry["value"]]

    ocr_text = str(peek("text", ""))
    summary = str(peek("summary", ""))
    document_type = str(peek("document_type", "other"))
    document_type_label = str(peek("document_type_label", "其他单据"))
    scene_label = str(peek("scene_label", "其他票据"))

    looks_like_train_ticket = cls._is_train_ticket_values(
        document_type=document_type,
        document_type_label=document_type_label,
        scene_label=scene_label,
        text=f"{summary}\n{ocr_text}",
    )
    if looks_like_train_ticket:
        fields = cls._enrich_train_ticket_field_dicts(
            fields,
            text=f"{ocr_text}\n{summary}\n{str(getattr(document, 'filename', '') or '')}",
        )

    evidence = [
        str(value) for value in list(peek("classification_evidence", [])) if str(value).strip()
    ]
    warnings = [str(value) for value in list(peek("warnings", [])) if str(value).strip()]

    return {
        "engine": str(peek("engine", "")),
        "model": str(peek("model", "")),
        "ocr_text": ocr_text,
        "summary": summary,
        "ocr_avg_score": float(peek("avg_score", 0.0)),
        "ocr_line_count": int(peek("line_count", 0)),
        "page_count": int(peek("page_count", 1)),
        "document_type": document_type,
        "document_type_label": document_type_label,
        "scene_code": str(peek("scene_code", "other")),
        "scene_label": scene_label,
        "ocr_classification_source": str(peek("classification_source", "")),
        "ocr_classification_confidence": float(peek("classification_confidence", 0.0)),
        "ocr_classification_evidence": evidence,
        "document_fields": fields,
        "editable_fields": {},
        "ocr_warnings": warnings,
    }
|
||
|
||
def _iter_owner_meta(self, owner_key: str) -> list[dict[str, Any]]:
|
||
owner_root = self._owner_root(owner_key)
|
||
if not owner_root.exists():
|
||
return []
|
||
metas = []
|
||
for meta_path in owner_root.glob("*/meta.json"):
|
||
meta = self._read_meta(meta_path.parent)
|
||
if meta:
|
||
metas.append(meta)
|
||
return metas
|
||
|
||
def _read_receipt_meta(self, receipt_id: str, current_user: CurrentUserContext) -> dict[str, Any]:
|
||
return self._read_meta(self._receipt_dir(self._owner_key(current_user), receipt_id))
|
||
|
||
def _resolve_existing_item(
    self,
    receipt_id: str | None,
    current_user: CurrentUserContext,
) -> ReceiptFolderItemRead | None:
    """Return the list item for an existing receipt.

    Returns ``None`` when *receipt_id* is blank or when a
    ``FileNotFoundError`` is raised while loading or building the item.
    """
    receipt_key = str(receipt_id or "").strip()
    if receipt_key:
        try:
            meta = self._read_receipt_meta(receipt_key, current_user)
            return self._build_item(meta)
        except FileNotFoundError:
            return None
    return None
|
||
|
||
@staticmethod
|
||
def _meta_path(receipt_dir: Path) -> Path:
|
||
return receipt_dir / "meta.json"
|
||
|
||
def _read_meta(self, receipt_dir: Path) -> dict[str, Any]:
|
||
meta_path = self._meta_path(receipt_dir)
|
||
if not meta_path.exists():
|
||
raise FileNotFoundError("Receipt not found")
|
||
try:
|
||
payload = json.loads(meta_path.read_text(encoding="utf-8"))
|
||
except (OSError, json.JSONDecodeError) as exc:
|
||
raise FileNotFoundError("Receipt metadata not found") from exc
|
||
return payload if isinstance(payload, dict) else {}
|
||
|
||
def _write_meta(self, receipt_dir: Path, payload: dict[str, Any]) -> None:
|
||
self._meta_path(receipt_dir).write_text(
|
||
json.dumps(payload, ensure_ascii=False, indent=2),
|
||
encoding="utf-8",
|
||
)
|
||
|
||
@staticmethod
|
||
def _content_hash(content: bytes) -> str:
|
||
return hashlib.sha256(content or b"").hexdigest() if content else ""
|
||
|
||
@staticmethod
|
||
def _operator_label(current_user: CurrentUserContext) -> str:
|
||
return str(current_user.name or current_user.username or "当前用户").strip() or "当前用户"
|
||
|
||
|
||
class ReceiptFolderItemMixin:
|
||
@staticmethod
|
||
def _matches_status(meta: dict[str, Any], status_filter: str) -> bool:
|
||
if status_filter in {"", "all"}:
|
||
return True
|
||
return str(meta.get("status") or "unlinked").strip().lower() == status_filter
|
||
|
||
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
    """Build the list-view representation of a receipt from its metadata.

    Missing values take neutral defaults (empty strings, ``0``,
    ``"unlinked"``). The document identity, amount, date and merchant are
    resolved through the dedicated helper methods; preview and source URLs
    are only produced when the receipt has an id.
    """

    def text(key: str, default: str = "") -> str:
        return str(meta.get(key) or default)

    receipt_id = text("id").strip()
    status_value = text("status", "unlinked").strip() or "unlinked"
    identity = self._resolve_receipt_document_identity(meta)
    previewable = bool(meta.get("previewable"))

    preview_url = ""
    if previewable and receipt_id:
        preview_url = f"/receipt-folder/{receipt_id}/preview"
    source_url = f"/receipt-folder/{receipt_id}/source" if receipt_id else ""
    status_label = "已关联" if status_value == "linked" else "未关联"

    payload = {
        "id": receipt_id,
        "file_name": text("file_name"),
        "media_type": text("media_type", "application/octet-stream"),
        "size_bytes": int(meta.get("size_bytes") or 0),
        "status": status_value,
        "status_label": status_label,
        "document_type": identity["document_type"],
        "document_type_label": identity["document_type_label"],
        "scene_code": identity["scene_code"],
        "scene_label": identity["scene_label"],
        "summary": text("summary"),
        "amount": self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
        "document_date": self._resolve_receipt_document_date(meta),
        "merchant_name": self._resolve_receipt_merchant_name(meta),
        "avg_score": float(meta.get("ocr_avg_score") or 0.0),
        "uploaded_at": self._parse_datetime(meta.get("uploaded_at")),
        "linked_at": self._parse_datetime(meta.get("linked_at")),
        "linked_claim_id": text("linked_claim_id"),
        "linked_claim_no": text("linked_claim_no"),
        "previewable": previewable,
        "preview_kind": text("preview_kind"),
        "preview_url": preview_url,
        "source_url": source_url,
        "warnings": [str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
    }
    return ReceiptFolderItemRead(**payload)
|
||
|
||
def _resolve_receipt_document_identity(self, meta: dict[str, Any]) -> dict[str, str]:
    """Determine the document type and scene to show for a receipt.

    The stored classification is used when it is specific (type is not
    ``"other"`` and the label is not ``"其他单据"``). Otherwise the receipt is
    re-classified with ``build_document_insight``; the stored values are kept
    if that also yields no specific type.

    Returns:
        A dict with ``document_type``, ``document_type_label``,
        ``scene_code`` and ``scene_label``.
    """

    def stored_value(key: str, default: str) -> str:
        return str(meta.get(key) or default).strip() or default

    stored = {
        "document_type": stored_value("document_type", "other"),
        "document_type_label": stored_value("document_type_label", "其他单据"),
        "scene_code": stored_value("scene_code", "other"),
        "scene_label": stored_value("scene_label", "其他票据"),
    }
    unspecific = {"", "other"}
    if stored["document_type"] not in unspecific and stored["document_type_label"] != "其他单据":
        return stored

    insight = build_document_insight(
        filename=str(meta.get("file_name") or ""),
        summary=str(meta.get("summary") or ""),
        text=self._receipt_text(meta),
    )
    if insight.document_type in unspecific:
        return stored
    return {
        "document_type": insight.document_type,
        "document_type_label": insight.document_type_label,
        "scene_code": insight.scene_code,
        "scene_label": insight.scene_label,
    }
|
||
|
||
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
    """Return the structured fields of a receipt.

    Only dict entries with a non-blank label are kept. For train tickets the
    fields are additionally passed through the train-ticket enrichment using
    the receipt's combined text.
    """
    collected: list[ReceiptFolderFieldRead] = []
    for raw in list(meta.get("document_fields") or []):
        if not isinstance(raw, dict):
            continue
        if not str(raw.get("label") or "").strip():
            continue
        collected.append(
            ReceiptFolderFieldRead(
                key=str(raw.get("key") or ""),
                label=str(raw.get("label") or ""),
                value=str(raw.get("value") or ""),
            )
        )

    if not self._is_train_ticket_meta(meta):
        return collected

    enriched = self._enrich_train_ticket_field_dicts(
        [item.model_dump() for item in collected],
        text=self._receipt_text(meta),
    )
    return [ReceiptFolderFieldRead(**entry) for entry in enriched]
|
||
|
||
def _resolve_edit_logs(self, meta: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the sanitised edit history stored in *meta*.

    Non-dict log entries are ignored, as are change entries that are not
    dicts or have neither a label nor a key. Logs left without any change are
    dropped. A missing operator is reported as ``"当前用户"``.
    """
    resolved: list[dict[str, Any]] = []
    for entry in list(meta.get("edit_logs") or []):
        if not isinstance(entry, dict):
            continue

        changes = []
        for change in list(entry.get("changes") or []):
            if not isinstance(change, dict):
                continue
            if not str(change.get("label") or change.get("key") or "").strip():
                continue
            changes.append(
                {
                    "key": str(change.get("key") or ""),
                    "label": str(change.get("label") or ""),
                    "before": str(change.get("before") or ""),
                    "after": str(change.get("after") or ""),
                }
            )

        if changes:
            resolved.append(
                {
                    "operated_at": self._parse_datetime(entry.get("operated_at")),
                    "operator": str(entry.get("operator") or "当前用户").strip() or "当前用户",
                    "changes": changes,
                }
            )
    return resolved
|
||
|
||
def _build_edit_changes(self, before_meta: dict[str, Any], after_meta: dict[str, Any]) -> list[dict[str, str]]:
    """List the editable values that differ between two metadata snapshots.

    Values are compared after trimming. The result is ordered by key; each
    entry holds ``key``, ``label`` (taken from the newer snapshot when
    available, else the older one, else the key), ``before`` and ``after``.
    """
    old_values = self._flatten_editable_receipt_values(before_meta)
    new_values = self._flatten_editable_receipt_values(after_meta)

    differences: list[dict[str, str]] = []
    for key in sorted(old_values.keys() | new_values.keys()):
        old_entry = old_values.get(key, {})
        new_entry = new_values.get(key, {})
        old_text = str(old_entry.get("value") or "").strip()
        new_text = str(new_entry.get("value") or "").strip()
        if old_text != new_text:
            differences.append(
                {
                    "key": key,
                    "label": str(new_entry.get("label") or old_entry.get("label") or key).strip(),
                    "before": old_text,
                    "after": new_text,
                }
            )
    return differences
|
||
|
||
def _flatten_editable_receipt_values(self, meta: dict[str, Any]) -> dict[str, dict[str, str]]:
    """Flatten the user-editable values of a receipt into ``{key: {label, value}}``.

    The mapping starts with the fixed entries (type label, scene label,
    summary, amount, document date, merchant) and is then extended with every
    dict entry of ``document_fields``. A field is stored under its own key
    when it has one, otherwise under ``field_<index>_<label>``. Fields that
    have neither a key nor a label are skipped.

    NOTE(review): a document field whose key equals one of the fixed keys
    (e.g. ``"amount"``) replaces the fixed entry — confirm this is intended.
    """
    values = {
        "document_type_label": {
            "label": "票据类型",
            "value": str(meta.get("document_type_label") or "").strip(),
        },
        "scene_label": {
            "label": "费用场景",
            "value": str(meta.get("scene_label") or "").strip(),
        },
        "summary": {
            "label": "摘要",
            "value": str(meta.get("summary") or "").strip(),
        },
        "amount": {
            "label": "金额",
            "value": self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
        },
        "document_date": {
            "label": "票据日期",
            "value": self._resolve_receipt_document_date(meta),
        },
        "merchant_name": {
            "label": "商户",
            "value": self._resolve_receipt_merchant_name(meta),
        },
    }
    for index, field in enumerate(list(meta.get("document_fields") or [])):
        if not isinstance(field, dict):
            continue
        key = str(field.get("key") or "").strip()
        label = str(field.get("label") or "").strip()
        value = str(field.get("value") or "").strip()
        # The generated fallback key is never empty, so the "nothing to
        # identify this field" check has to look at the raw key and label.
        if not key and not label:
            continue
        stable_key = key or f"field_{index}_{label}"
        values[stable_key] = {
            "label": label or stable_key,
            "value": value,
        }
    return values
|
||
|
||
def _resolve_receipt_document_date(self, meta: dict[str, Any]) -> str:
    """Pick the date to display for a receipt.

    Priority: a user-edited ``document_date``; an invoice/issue date field;
    for train tickets, the invoice date found in the receipt text; finally a
    generic date or travel-date field. Returns ``""`` when none is found.
    """
    editable = meta.get("editable_fields")
    if isinstance(editable, dict):
        override = str(editable.get("document_date") or "").strip()
        if override:
            return override

    fields = self._resolve_fields(meta)

    invoice_field = next(
        (
            field
            for field in fields
            if field.key in {"invoice_date", "issue_date"} or field.label in {"开票日期", "发票日期"}
        ),
        None,
    )
    if invoice_field is not None:
        return self._normalize_receipt_date_value(invoice_field.value)

    if self._is_train_ticket_meta(meta):
        invoice_date = self._extract_train_invoice_date(self._receipt_text(meta))
        if invoice_date:
            return invoice_date

    generic_field = next(
        (
            field
            for field in fields
            if field.key == "document_date" or field.label in {"日期", "乘车日期", "列车出发时间", "行程日期"}
        ),
        None,
    )
    if generic_field is not None:
        return self._normalize_receipt_date_value(generic_field.value)
    return ""
|
||
|
||
def _resolve_receipt_merchant_name(self, meta: dict[str, Any]) -> str:
    """Return the merchant of a receipt.

    Uses the edited or recognised merchant when present; train tickets
    without one default to ``"中国铁路"``; everything else yields ``""``.
    """
    merchant = self._resolve_editable_or_field(
        meta,
        "merchant_name",
        labels=("商户", "销售方", "收款方", "开票方"),
    )
    if merchant:
        return merchant
    return "中国铁路" if self._is_train_ticket_meta(meta) else ""
|
||
|
||
def _resolve_editable_or_field(self, meta: dict[str, Any], key: str, *, labels: tuple[str, ...]) -> str:
    """Return a value from the user edits, falling back to recognised fields.

    A non-blank ``editable_fields[key]`` wins. Otherwise the value of the
    first resolved field whose label is in *labels* or whose key equals *key*
    is returned, or ``""`` when there is none.
    """
    editable = meta.get("editable_fields")
    if isinstance(editable, dict):
        edited = str(editable.get(key) or "").strip()
        if edited:
            return edited

    accepted_labels = set(labels)
    return next(
        (
            field.value
            for field in self._resolve_fields(meta)
            if field.label in accepted_labels or field.key == key
        ),
        "",
    )
|
||
|
||
|
||
class ReceiptFolderTrainTicketMixin:
|
||
@classmethod
def _enrich_train_ticket_field_dicts(
    cls,
    fields: list[dict[str, Any]],
    *,
    text: str,
) -> list[dict[str, str]]:
    """Clean recognised train-ticket fields and add values parsed from *text*.

    Existing fields are kept when they have a label and a value, are not the
    merchant, and pass ``_should_keep_train_ticket_field``; the label
    ``"车次/航班"`` on key ``trip_no`` is shortened to ``"车次"``. Values
    extracted from *text* are then appended in a fixed order, each only when
    non-blank and when no kept field already uses the same key or label.
    """
    normalized: list[dict[str, str]] = []
    for field in fields:
        key = str(field.get("key") or "").strip()
        label = str(field.get("label") or "").strip()
        value = str(field.get("value") or "").strip()
        if not (label and value):
            continue
        if key == "merchant_name" or label == "商户":
            # The merchant is always re-added below as the railway operator.
            continue
        if not cls._should_keep_train_ticket_field(key=key, label=label, value=value):
            continue
        if key == "trip_no" and label == "车次/航班":
            label = "车次"
        normalized.append({"key": key, "label": label, "value": value})

    seen_keys = {item["key"] for item in normalized if item["key"]}
    seen_labels = {item["label"] for item in normalized}

    def add_field(key: str, label: str, value: str) -> None:
        cleaned = str(value or "").strip()
        if not cleaned or key in seen_keys or label in seen_labels:
            return
        normalized.append({"key": key, "label": label, "value": cleaned})
        if key:
            seen_keys.add(key)
        seen_labels.add(label)

    invoice_date = cls._extract_train_invoice_date(text)
    trip_datetime = cls._extract_train_trip_datetime(text)
    departure, arrival = cls._extract_train_route_points(text)
    train_no = cls._extract_first(TRAIN_NO_PATTERN, text) or cls._extract_first(TRAIN_STANDALONE_NO_PATTERN, text)
    id_number = cls._extract_train_id_number(text)
    passenger_name = cls._extract_train_passenger_name(text, id_number=id_number)
    eticket_no = cls._extract_first(TRAIN_ETICKET_PATTERN, text)
    seat_class = cls._extract_first(TRAIN_SEAT_CLASS_PATTERN, text)
    carriage_no, seat_no = cls._extract_train_carriage_and_seat(text)
    fare = cls._extract_train_fare(text)

    derived = (
        ("merchant_name", "商户", "中国铁路"),
        ("invoice_date", "开票日期", invoice_date),
        ("trip_date", "列车出发时间", trip_datetime),
        ("departure_station", "出发地点", departure),
        ("arrival_station", "到达地点", arrival),
        # The route is only meaningful when both end points are known.
        ("route", "行程", f"{departure}-{arrival}" if departure and arrival else ""),
        ("train_no", "车次", train_no),
        ("passenger_name", "乘车人", passenger_name),
        ("id_number", "身份证号", id_number),
        ("electronic_ticket_no", "电子客票号", eticket_no),
        ("seat_class", "席别", seat_class),
        ("carriage_no", "车厢", carriage_no),
        ("seat_no", "座位号", seat_no),
        ("fare", "票价", fare),
    )
    for entry in derived:
        add_field(*entry)
    return normalized
|
||
|
||
@staticmethod
|
||
def _is_train_ticket_values(
|
||
*,
|
||
document_type: str,
|
||
document_type_label: str,
|
||
scene_label: str,
|
||
text: str,
|
||
) -> bool:
|
||
if str(document_type or "").strip().lower() == "train_ticket":
|
||
return True
|
||
compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
|
||
if any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次")):
|
||
return True
|
||
lower_compact = compact.lower()
|
||
return bool(re.search(r"[GCDZKTLYS]\d{1,5}", compact, flags=re.IGNORECASE)) and (
|
||
"12306" in compact
|
||
or "95306" in compact
|
||
or re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—|–|-)[\u4e00-\u9fa5]{2,12}", compact)
|
||
or ("wuhan" in lower_compact and "shanghai" in lower_compact)
|
||
)
|
||
|
||
@classmethod
|
||
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
|
||
return cls._is_train_ticket_values(
|
||
document_type=str(meta.get("document_type") or ""),
|
||
document_type_label=str(meta.get("document_type_label") or ""),
|
||
scene_label=str(meta.get("scene_label") or ""),
|
||
text=cls._receipt_text(meta),
|
||
)
|
||
|
||
@staticmethod
|
||
def _receipt_text(meta: dict[str, Any]) -> str:
|
||
field_text = "\n".join(
|
||
f"{field.get('label', '')} {field.get('value', '')}"
|
||
for field in list(meta.get("document_fields") or [])
|
||
if isinstance(field, dict)
|
||
)
|
||
return "\n".join(
|
||
value
|
||
for value in (
|
||
str(meta.get("ocr_text") or ""),
|
||
str(meta.get("summary") or ""),
|
||
str(meta.get("file_name") or ""),
|
||
field_text,
|
||
)
|
||
if value
|
||
)
|
||
|
||
@classmethod
def _extract_train_invoice_date(cls, text: str) -> str:
    """Return the normalised date following an invoice-date label, or ``""``."""
    found = TRAIN_INVOICE_DATE_PATTERN.search(str(text or ""))
    if found is None:
        return ""
    return cls._normalize_receipt_date_value(found.group(1))
|
||
|
||
@classmethod
def _extract_train_trip_datetime(cls, text: str) -> str:
    """Pick the most likely departure date (and time) from ticket text.

    Every date in *text* is a candidate unless an invoice-date label appears
    in the window from 14 characters before it to 8 characters after it.
    Candidates score +8 when a time was found next to the date, +6 when
    departure-related keywords appear within 32 characters, and +3 when a
    seat class does. The highest score wins; ties go to the earliest date.
    Returns ``""`` when there is no candidate.
    """
    raw_text = str(text or "")
    invoice_labels = ("开票日期", "发票日期", "开票时间")
    departure_hints = ("开车时间", "发车时间", "乘车日期", "乘车时间", "检票", "车次")
    seat_hints = ("二等座", "一等座", "商务座", "硬座", "软卧", "硬卧")

    best_score = -1
    best_value = ""
    for match in RECEIPT_DATE_PATTERN.finditer(raw_text):
        label_window = raw_text[max(0, match.start() - 14): match.end() + 8].replace(" ", "")
        if any(token in label_window for token in invoice_labels):
            continue

        value = cls._format_date_match_with_time(raw_text, match)
        context = raw_text[max(0, match.start() - 32): match.end() + 32].replace(" ", "")

        score = 0
        if ":" in value:
            score += 8
        if any(token in context for token in departure_hints):
            score += 6
        if any(token in context for token in seat_hints):
            score += 3

        # Strictly greater keeps the earliest candidate on equal scores.
        if score > best_score:
            best_score, best_value = score, value
    return best_value
|
||
|
||
@classmethod
def _format_date_match_with_time(cls, text: str, match: re.Match[str]) -> str:
    """Format a matched date, appending a nearby clock time when present.

    The time is searched in the window from 18 characters before the date to
    24 characters after it. Returns ``"<date>"`` or ``"<date> HH:MM"``, or
    ``""`` when the date normalises to nothing.
    """
    date_value = cls._normalize_receipt_date_value(match.group(1))
    if not date_value:
        return ""

    window_start = max(0, match.start() - 18)
    window = str(text or "")[window_start: match.end() + 24]
    clock = RECEIPT_TIME_PATTERN.search(window)
    if clock is None:
        return date_value

    hour = str(clock.group(1)).zfill(2)
    minute = str(clock.group(2)).zfill(2)
    return f"{date_value} {hour}:{minute}"
|
||
|
||
@staticmethod
def _normalize_receipt_date_value(value: str) -> str:
    """Normalise the first date found in *value* to ``YYYY-MM-DD``.

    Returns the trimmed input unchanged when it contains no date, and the raw
    matched text when it does not split into exactly year, month and day.
    """
    raw = str(value or "").strip()
    found = RECEIPT_DATE_PATTERN.search(raw)
    if found is None:
        return raw

    matched = found.group(1)
    # Map every separator to "-" and drop the trailing day marker.
    separators = str.maketrans({"年": "-", "月": "-", "日": None, "/": "-", ".": "-"})
    dashed = re.sub(r"\s+", "-", matched.translate(separators))
    pieces = [piece for piece in dashed.split("-") if piece]
    if len(pieces) != 3:
        return matched

    year, month, day = pieces
    return f"{year.zfill(4)}-{month.zfill(2)}-{day.zfill(2)}"
|
||
|
||
@classmethod
def _extract_train_route_points(cls, text: str) -> tuple[str, str]:
    """Extract the departure and arrival station from ticket text.

    Three strategies are tried in order:
      1. "<station> <train number> <station>" on one line;
      2. the first two distinct valid station names on lines ending in "站";
      3. "<station> 至/到/- <station>".
    A pair is only accepted when both names are valid stations and differ.
    Returns ``("", "")`` when nothing is found.
    """
    raw_text = str(text or "")

    def accepted_pair(found: re.Match[str] | None) -> tuple[str, str] | None:
        if not found:
            return None
        start = cls._clean_train_station(found.group(1))
        end = cls._clean_train_station(found.group(2))
        both_valid = cls._is_valid_train_station_value(start) and cls._is_valid_train_station_value(end)
        if both_valid and start != end:
            return start, end
        return None

    pair = accepted_pair(TRAIN_ROUTE_WITH_NO_PATTERN.search(raw_text))
    if pair is not None:
        return pair

    stations: list[str] = []
    for line in raw_text.replace("\r", "\n").splitlines():
        if not str(line or "").strip().endswith("站"):
            continue
        name = cls._clean_train_station(line)
        if not name or name in stations:
            continue
        if not cls._is_valid_train_station_value(name):
            continue
        stations.append(name)
        if len(stations) == 2:
            return stations[0], stations[1]

    pair = accepted_pair(TRAIN_ROUTE_PATTERN.search(raw_text))
    if pair is not None:
        return pair
    return "", ""
|
||
|
||
@staticmethod
|
||
def _clean_train_station(value: str) -> str:
|
||
cleaned = re.sub(r"[^A-Za-z0-9\u4e00-\u9fa5()()·]", "", str(value or ""))
|
||
cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned)
|
||
return cleaned.strip()
|
||
|
||
@classmethod
def _should_keep_train_ticket_field(cls, *, key: str, label: str, value: str) -> bool:
    """Tell whether a recognised train-ticket field is plausible enough to keep.

    Station fields must hold a valid station name and passenger fields a
    plausible passenger name; every other field is kept.
    """
    is_station_field = key in TRAIN_STATION_FIELD_KEYS or label in TRAIN_STATION_FIELD_LABELS
    if is_station_field:
        return cls._is_valid_train_station_value(value)

    is_passenger_field = key == "passenger_name" or label == "乘车人"
    if is_passenger_field:
        return bool(cls._clean_train_passenger_candidate(value))

    return True
|
||
|
||
@classmethod
def _is_valid_train_station_value(cls, value: str) -> bool:
    """Tell whether *value* can be a station name.

    After cleaning, the name must be 2-12 characters long, contain none of
    ``TRAIN_INVALID_STATION_TOKENS`` and no ASCII letters or digits.
    """
    station = cls._clean_train_station(value)
    if len(station) < 2 or len(station) > 12:
        return False
    contains_noise = any(token in station for token in TRAIN_INVALID_STATION_TOKENS)
    contains_ascii = re.search(r"[A-Za-z0-9]", station) is not None
    return not (contains_noise or contains_ascii)
|
||
|
||
@staticmethod
|
||
def _extract_first(pattern: re.Pattern[str], text: str) -> str:
|
||
match = pattern.search(str(text or ""))
|
||
return str(match.group(1) or "").strip() if match else ""
|
||
|
||
@classmethod
def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str:
    """Find the passenger name in train-ticket OCR text.

    Three passes are tried in order, each returning the first candidate that
    survives ``_clean_train_passenger_candidate``:

    1. a value captured by TRAIN_PASSENGER_PATTERN on any line;
    2. when ``id_number`` is given, the line containing it (with the number
       removed), then the neighbouring lines at offsets +1, -1 and +2;
    3. a name captured by TRAIN_PURCHASER_NAME_PATTERN on any line.

    Returns "" when nothing qualifies.
    """
    normalized = str(text or "").replace("\r", "\n")
    lines = [stripped for stripped in (raw.strip() for raw in normalized.splitlines()) if stripped]

    # Pass 1: labelled passenger value.
    for current in lines:
        by_label = cls._clean_train_passenger_candidate(cls._extract_first(TRAIN_PASSENGER_PATTERN, current))
        if by_label:
            return by_label

    # Pass 2: look on and around the line that carries the ID number.
    if id_number:
        total = len(lines)
        for position, current in enumerate(lines):
            if id_number not in current:
                continue
            same_line = cls._clean_train_passenger_candidate(current.replace(id_number, " "))
            if same_line:
                return same_line
            for step in (1, -1, 2):
                neighbour = position + step
                if not 0 <= neighbour < total:
                    continue
                nearby = cls._clean_train_passenger_candidate(lines[neighbour])
                if nearby:
                    return nearby

    # Pass 3: fall back to the purchaser name.
    for current in lines:
        purchaser = TRAIN_PURCHASER_NAME_PATTERN.search(current)
        if purchaser is None:
            continue
        purchaser_name = cls._clean_train_passenger_candidate(purchaser.group(1))
        if purchaser_name:
            return purchaser_name
    return ""
|
||
|
||
@staticmethod
|
||
def _clean_train_passenger_candidate(value: str) -> str:
|
||
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
|
||
if not 2 <= len(cleaned) <= 8:
|
||
return ""
|
||
if any(
|
||
token in cleaned
|
||
for token in (
|
||
"电子",
|
||
"客票",
|
||
"铁路",
|
||
"发票",
|
||
"税务",
|
||
"湖北省",
|
||
"中国铁路",
|
||
"开票",
|
||
"日期",
|
||
"车厢",
|
||
"席别",
|
||
"二等座",
|
||
"一等座",
|
||
"商务座",
|
||
"特等座",
|
||
"软座",
|
||
"硬座",
|
||
"无座",
|
||
"软卧",
|
||
"硬卧",
|
||
"座位",
|
||
"票价",
|
||
"金额",
|
||
"行程",
|
||
"出发",
|
||
"到达",
|
||
"车次",
|
||
"公司",
|
||
"信用代码",
|
||
"纳税人",
|
||
"扫码",
|
||
"无效",
|
||
"二维码",
|
||
"座席",
|
||
"身份",
|
||
"身份证号",
|
||
"证件",
|
||
)
|
||
):
|
||
return ""
|
||
return cleaned
|
||
|
||
@classmethod
def _extract_train_id_number(cls, text: str) -> str:
    """Extract the passenger ID number from train-ticket OCR text.

    A value captured by TRAIN_ID_PATTERN wins. Otherwise each line (spaces
    removed) is scanned with TRAIN_ID_FALLBACK_PATTERN, skipping lines that
    mention an invoice, e-ticket or order number. A candidate containing
    ``*`` is returned immediately; failing that, the first candidate found
    is returned, or "" when there is none.
    """
    by_label = cls._extract_first(TRAIN_ID_PATTERN, text)
    if by_label:
        return by_label

    skip_markers = ("发票号码", "电子客票号", "客票号", "订单号")
    first_unmasked = ""
    for raw_line in str(text or "").replace("\r", "\n").splitlines():
        squeezed = raw_line.replace(" ", "")
        if any(marker in squeezed for marker in skip_markers):
            continue
        hit = TRAIN_ID_FALLBACK_PATTERN.search(squeezed)
        if hit is None:
            continue
        number = str(hit.group(1) or "").strip()
        if "*" in number:
            return number
        # Remember only the earliest candidate without "*".
        first_unmasked = first_unmasked or number
    return first_unmasked
|
||
|
||
@staticmethod
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
    """Return ``(carriage, seat)`` parsed from train-ticket OCR text.

    Tried in order: TRAIN_COMBINED_SEAT_PATTERN, then
    TRAIN_LOOSE_SEAT_PATTERN (carriage zero-padded to two digits, seat
    upper-cased), then the separate carriage and seat patterns. Either
    element may be "" when the last fallback finds nothing.
    """
    haystack = str(text or "")

    combined = TRAIN_COMBINED_SEAT_PATTERN.search(haystack)
    if combined:
        return f"{combined.group(1)}车", combined.group(2)

    loose = TRAIN_LOOSE_SEAT_PATTERN.search(haystack)
    if loose:
        carriage_digits = loose.group(1).zfill(2)
        return f"{carriage_digits}车", loose.group(2).upper()

    extract = ReceiptFolderService._extract_first
    carriage_no = extract(TRAIN_CARRIAGE_PATTERN, text).replace(" ", "")
    seat_no = extract(TRAIN_SEAT_NO_PATTERN, text)
    return carriage_no, seat_no
|
||
|
||
@staticmethod
def _extract_train_fare(text: str) -> str:
    """Return the ticket fare formatted as ``"<amount>元"``, or "".

    TRAIN_FARE_PATTERN is preferred. Without a match, the
    TRAIN_LOOSE_FARE_PATTERN hit with the largest numeric value is used.
    A comma in the captured amount is read as a decimal point.
    """
    source_text = str(text or "")

    def _amount_of(candidate: re.Match[str]) -> float:
        return float(str(candidate.group(1) or "0").replace(",", "."))

    fare_match = TRAIN_FARE_PATTERN.search(source_text)
    if not fare_match:
        loose_hits = list(TRAIN_LOOSE_FARE_PATTERN.finditer(source_text))
        fare_match = max(loose_hits, key=_amount_of, default=None)
    if not fare_match:
        return ""

    amount = str(fare_match.group(1) or "").replace(",", ".").strip()
    if not amount:
        return ""
    return f"{amount}元"
|
||
|
||
@staticmethod
|
||
def _parse_datetime(value: Any) -> datetime | None:
|
||
raw = str(value or "").strip()
|
||
if not raw:
|
||
return None
|
||
try:
|
||
return datetime.fromisoformat(raw)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, ReceiptFolderTrainTicketMixin):
|
||
def __init__(self) -> None:
    """Point ``self.root`` at the resolved ``receipt_folder`` directory under the storage root."""
    storage_root = get_settings().resolved_storage_root_dir
    self.root = (storage_root / "receipt_folder").resolve()
|
||
|
||
def persist_ocr_batch(
    self,
    *,
    files: list[tuple[str, bytes, str | None]],
    result: OcrRecognizeBatchRead,
    current_user: CurrentUserContext,
    receipt_ids: list[str] | None = None,
) -> OcrRecognizeBatchRead:
    """Store the uploaded sources of an OCR batch and attach receipt info.

    Documents are paired with ``files`` (and ``receipt_ids``) by position.
    For each document, in order of precedence:

    * no file at that position: the document is passed through unchanged;
    * the supplied receipt id resolves to an existing receipt: the document
      is enriched from that receipt;
    * the source should not be persisted: passed through unchanged;
    * an identical file was already stored: the stored receipt is refreshed
      if the new recognition is stronger, and a duplicate-upload warning is
      appended to the document's own warnings;
    * otherwise a new receipt is saved and the document enriched from it.

    Returns a copy of ``result`` whose ``documents`` are the processed ones.
    """
    known_ids = receipt_ids or []
    file_count = len(files)
    enriched: list[OcrRecognizeDocumentRead] = []

    for position, document in enumerate(list(result.documents or [])):
        if position >= file_count:
            enriched.append(document)
            continue

        requested_id = known_ids[position] if position < len(known_ids) else ""
        existing = self._resolve_existing_item(requested_id, current_user)
        if existing is not None:
            enriched.append(
                self._enrich_ocr_document_with_receipt(
                    document,
                    receipt=existing,
                    current_user=current_user,
                )
            )
            continue

        filename, content, media_type = files[position]
        if not self._should_persist_source(filename, content):
            enriched.append(document)
            continue

        duplicate = self.find_duplicate_receipt(
            filename=filename,
            content=content,
            current_user=current_user,
        )
        if duplicate is None:
            saved = self.save_receipt(
                filename=filename,
                content=content,
                media_type=media_type or document.media_type,
                document=document,
                current_user=current_user,
            )
            enriched.append(
                self._enrich_ocr_document_with_receipt(
                    document,
                    receipt=saved,
                    current_user=current_user,
                )
            )
            continue

        # Same file uploaded before: reuse the stored receipt and warn.
        refreshed = self._refresh_duplicate_receipt_from_document_if_stronger(
            receipt=duplicate,
            document=document,
            current_user=current_user,
        )
        carried_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
        carried_warnings.append("已上传过同样的单据,请不要重复上传。")
        enriched.append(
            self._enrich_ocr_document_with_receipt(
                document,
                receipt=refreshed,
                current_user=current_user,
                extra_warnings=carried_warnings,
            )
        )

    return result.model_copy(update={"documents": enriched})
|
||
|
||
def _enrich_ocr_document_with_receipt(
    self,
    document: OcrRecognizeDocumentRead,
    *,
    receipt: ReceiptFolderItemRead,
    current_user: CurrentUserContext,
    extra_warnings: list[str] | None = None,
) -> OcrRecognizeDocumentRead:
    """Return a copy of ``document`` carrying the receipt's identity and data.

    The receipt id, status and URLs are always copied. When the receipt's
    stored meta can be read, its text, summary, classification and fields
    take precedence over the document's own values. Warnings come from
    ``extra_warnings`` when given (even if empty), otherwise from the
    document; blanks are dropped and duplicates removed in order.
    """
    update: dict[str, Any] = dict(
        receipt_id=receipt.id,
        receipt_status=receipt.status,
        receipt_preview_url=receipt.preview_url,
        receipt_source_url=receipt.source_url,
    )

    try:
        meta = self._read_receipt_meta(receipt.id, current_user)
    except FileNotFoundError:
        # No stored meta: only the receipt identity is attached.
        meta = {}

    if meta:
        update["text"] = str(meta.get("ocr_text") or document.text or "")
        update["summary"] = str(meta.get("summary") or document.summary or "")
        update["document_type"] = str(meta.get("document_type") or document.document_type or "other")
        update["document_type_label"] = str(
            meta.get("document_type_label") or document.document_type_label or "其他单据"
        )
        update["scene_code"] = str(meta.get("scene_code") or document.scene_code or "other")
        update["scene_label"] = str(meta.get("scene_label") or document.scene_label or "其他票据")
        update["classification_source"] = str(
            meta.get("ocr_classification_source") or document.classification_source or ""
        )
        update["classification_confidence"] = float(
            meta.get("ocr_classification_confidence") or document.classification_confidence or 0.0
        )
        evidence_source = meta.get("ocr_classification_evidence") or document.classification_evidence or []
        update["classification_evidence"] = [
            str(value) for value in list(evidence_source) if str(value).strip()
        ]
        update["document_fields"] = self._build_ocr_document_fields_from_meta(meta)
        update["preview_kind"] = str(meta.get("preview_kind") or document.preview_kind or "")

    warning_source = extra_warnings if extra_warnings is not None else (document.warnings or [])
    unique_warnings = list(
        dict.fromkeys(str(item) for item in list(warning_source) if str(item).strip())
    )
    if unique_warnings:
        update["warnings"] = unique_warnings
    return document.model_copy(update=update)
|
||
|
||
def _refresh_duplicate_receipt_from_document_if_stronger(
    self,
    *,
    receipt: ReceiptFolderItemRead,
    document: OcrRecognizeDocumentRead,
    current_user: CurrentUserContext,
) -> ReceiptFolderItemRead:
    """Overwrite a stored receipt's recognition data when the new one is stronger.

    Returns ``receipt`` untouched when its meta cannot be read or the
    incoming document meta is not judged stronger. Otherwise the
    recognition-related keys are replaced, ``updated_at`` is bumped, the meta
    is written back and a freshly built item is returned.
    """
    try:
        stored_meta = self._read_receipt_meta(receipt.id, current_user)
    except FileNotFoundError:
        return receipt

    fresh_meta = self._build_document_meta(document)
    if not self._is_incoming_document_meta_stronger(stored_meta, fresh_meta):
        return receipt

    refreshed_keys = (
        "engine",
        "model",
        "ocr_text",
        "summary",
        "ocr_avg_score",
        "ocr_line_count",
        "page_count",
        "document_type",
        "document_type_label",
        "scene_code",
        "scene_label",
        "ocr_classification_source",
        "ocr_classification_confidence",
        "ocr_classification_evidence",
        "document_fields",
        "ocr_warnings",
    )
    for name in refreshed_keys:
        stored_meta[name] = fresh_meta[name]
    stored_meta["updated_at"] = datetime.now(UTC).isoformat()

    target_dir = self._receipt_dir(self._owner_key(current_user), receipt.id)
    self._write_meta(target_dir, stored_meta)
    return self._build_item(stored_meta)
|
||
|
||
@staticmethod
|
||
def _is_incoming_document_meta_stronger(existing_meta: dict[str, Any], incoming_meta: dict[str, Any]) -> bool:
|
||
existing_type = str(existing_meta.get("document_type") or "other").strip() or "other"
|
||
incoming_type = str(incoming_meta.get("document_type") or "other").strip() or "other"
|
||
existing_fields = [field for field in list(existing_meta.get("document_fields") or []) if isinstance(field, dict)]
|
||
incoming_fields = [field for field in list(incoming_meta.get("document_fields") or []) if isinstance(field, dict)]
|
||
existing_text = str(existing_meta.get("ocr_text") or "").strip()
|
||
incoming_text = str(incoming_meta.get("ocr_text") or "").strip()
|
||
|
||
if incoming_type != "other" and existing_type == "other":
|
||
return True
|
||
if incoming_fields and not existing_fields:
|
||
return True
|
||
if incoming_text and not existing_text:
|
||
return True
|
||
return False
|
||
|
||
def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
    """Convert the receipt's resolved fields into OCR field models.

    Fields with an empty label or an empty value are left out.
    """
    converted: list[OcrRecognizeFieldRead] = []
    for resolved in self._resolve_fields(meta):
        if not (resolved.label and resolved.value):
            continue
        converted.append(
            OcrRecognizeFieldRead(
                key=resolved.key,
                label=resolved.label,
                value=resolved.value,
            )
        )
    return converted
|
||
|
||
def save_receipt(
    self,
    *,
    filename: str,
    content: bytes,
    media_type: str | None,
    document: Any | None,
    current_user: CurrentUserContext,
    linked_claim_id: str = "",
    linked_claim_no: str = "",
    linked_item_id: str = "",
) -> ReceiptFolderItemRead:
    """Persist an uploaded file as a new receipt and return its item view.

    A new directory named after a random UUID is created under the owner's
    root. The source bytes, a preview asset and ``meta.json`` are written
    into it. The receipt is stored as "linked" when ``linked_claim_id`` is
    non-blank, otherwise as "unlinked".

    If any write fails, the newly created directory is removed before the
    error is re-raised, so no half-written receipt is left on disk.
    """
    owner_key = self._owner_key(current_user)
    receipt_id = str(uuid4())
    receipt_dir = self._owner_root(owner_key) / receipt_id
    receipt_dir.mkdir(parents=True, exist_ok=True)

    try:
        normalized_name = self.normalize_filename(filename)
        source_path = receipt_dir / normalized_name
        source_path.write_bytes(content)
        resolved_media_type = self.resolve_media_type(normalized_name, media_type)
        preview_meta = self._write_preview_asset(
            receipt_dir=receipt_dir,
            source_path=source_path,
            media_type=resolved_media_type,
            document=document,
        )
        now = datetime.now(UTC)
        linked = bool(str(linked_claim_id or "").strip())
        meta = {
            "id": receipt_id,
            "owner_key": owner_key,
            "file_name": normalized_name,
            "source_file_name": normalized_name,
            "media_type": resolved_media_type,
            "size_bytes": len(content),
            "file_sha256": self._content_hash(content),
            "uploaded_at": now.isoformat(),
            "status": "linked" if linked else "unlinked",
            "linked_claim_id": str(linked_claim_id or "").strip(),
            "linked_claim_no": str(linked_claim_no or "").strip(),
            "linked_item_id": str(linked_item_id or "").strip(),
            "linked_at": now.isoformat() if linked else "",
            **self._build_document_meta(document),
            **preview_meta,
        }
        self._write_meta(receipt_dir, meta)
    except Exception:
        # The directory is brand new (fresh UUID), so removing it cannot
        # touch another receipt; the original error still propagates.
        shutil.rmtree(receipt_dir, ignore_errors=True)
        raise
    return self._build_item(meta)
|
||
|
||
def save_linked_attachment(
    self,
    *,
    file_path: Path,
    media_type: str,
    document: Any | None,
    current_user: CurrentUserContext,
    claim_id: str,
    claim_no: str,
    item_id: str,
    source_receipt_id: str = "",
) -> ReceiptFolderItemRead | None:
    """Make sure a claim attachment is represented as a linked receipt.

    When ``source_receipt_id`` names an existing receipt, that receipt is
    marked as linked and returned. Otherwise the file is copied into the
    receipt folder as a new linked receipt, but only if it lives inside the
    storage root.

    Returns ``None`` when ``file_path`` is not an existing regular file or
    lies outside the storage root.
    """
    if not file_path.is_file():
        return None

    normalized_source_id = str(source_receipt_id or "").strip()
    if normalized_source_id:
        try:
            return self.mark_receipt_linked(
                receipt_id=normalized_source_id,
                current_user=current_user,
                claim_id=claim_id,
                claim_no=claim_no,
                item_id=item_id,
            )
        except FileNotFoundError:
            # The referenced receipt is gone; fall through and store the
            # attachment file as a new receipt instead.
            pass

    # Resolve both sides so a symlinked storage root does not make files
    # inside it look like outsiders.
    storage_root = Path(get_settings().resolved_storage_root_dir).resolve()
    try:
        file_path.resolve().relative_to(storage_root)
    except ValueError:
        return None

    return self.save_receipt(
        filename=file_path.name,
        content=file_path.read_bytes(),
        media_type=media_type,
        document=document,
        current_user=current_user,
        linked_claim_id=claim_id,
        linked_claim_no=claim_no,
        linked_item_id=item_id,
    )
|
||
|
||
def mark_receipt_linked(
    self,
    *,
    receipt_id: str,
    current_user: CurrentUserContext,
    claim_id: str,
    claim_no: str,
    item_id: str,
) -> ReceiptFolderItemRead:
    """Mark one of the current user's receipts as linked to a claim item.

    Stores the stripped claim id, claim number and item id, sets the status
    to "linked", and stamps both ``linked_at`` and ``updated_at`` with the
    same UTC timestamp. Errors from reading the receipt meta propagate to
    the caller.
    """
    receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
    meta = self._read_meta(receipt_dir)

    linked_at = datetime.now(UTC).isoformat()
    meta.update(
        {
            "status": "linked",
            "linked_claim_id": str(claim_id or "").strip(),
            "linked_claim_no": str(claim_no or "").strip(),
            "linked_item_id": str(item_id or "").strip(),
            "linked_at": linked_at,
            # Unlinking and editing bump updated_at; linking does too.
            "updated_at": linked_at,
        }
    )
    self._write_meta(receipt_dir, meta)
    return self._build_item(meta)
|
||
|
||
def list_receipts(
    self,
    *,
    current_user: CurrentUserContext,
    status_filter: str = "all",
) -> list[ReceiptFolderItemRead]:
    """List the current user's receipts, newest upload first.

    ``status_filter`` is trimmed and lower-cased (blank means "all") before
    being matched against each receipt. Items without an upload time sort
    last.
    """
    wanted_status = str(status_filter or "all").strip().lower()
    owner_key = self._owner_key(current_user)
    matching = [
        self._build_item(meta)
        for meta in self._iter_owner_meta(owner_key)
        if self._matches_status(meta, wanted_status)
    ]
    matching.sort(
        key=lambda entry: entry.uploaded_at or datetime.min.replace(tzinfo=UTC),
        reverse=True,
    )
    return matching
|
||
|
||
def get_receipt(self, receipt_id: str, current_user: CurrentUserContext) -> ReceiptFolderDetailRead:
    """Return the detail view of one of the current user's receipts.

    The view is the list item's fields plus OCR engine/model, text, line and
    page counts (page count is at least 1), classification data, resolved
    fields, the raw meta and the edit log.
    """
    meta = self._read_receipt_meta(receipt_id, current_user)
    base_fields = self._build_item(meta).model_dump()
    extras: dict[str, Any] = {
        "engine": str(meta.get("engine") or ""),
        "model": str(meta.get("model") or ""),
        "ocr_text": str(meta.get("ocr_text") or ""),
        "line_count": int(meta.get("ocr_line_count") or 0),
        "page_count": max(1, int(meta.get("page_count") or 1)),
        "classification_confidence": float(meta.get("ocr_classification_confidence") or 0.0),
        "classification_evidence": [
            str(value) for value in list(meta.get("ocr_classification_evidence") or []) if str(value).strip()
        ],
        "fields": self._resolve_fields(meta),
        "raw_meta": meta,
        "edit_logs": self._resolve_edit_logs(meta),
    }
    # Two ** expansions still raise on a duplicated keyword.
    return ReceiptFolderDetailRead(**base_fields, **extras)
|
||
|
||
def find_duplicate_receipt(
    self,
    *,
    filename: str,
    content: bytes,
    current_user: CurrentUserContext,
) -> ReceiptFolderItemRead | None:
    """Find a receipt of the current user holding exactly the same bytes.

    Sources that should not be persisted never count as duplicates. Matching
    compares the content hash with each stored ``file_sha256``; the first
    match is returned, or ``None`` when there is none.
    """
    if not self._should_persist_source(filename, content):
        return None

    digest = self._content_hash(content)
    owner_metas = self._iter_owner_meta(self._owner_key(current_user))
    matched = next(
        (
            meta
            for meta in owner_metas
            if digest and str(meta.get("file_sha256") or "").strip() == digest
        ),
        None,
    )
    if matched is None:
        return None
    return self._build_item(matched)
|
||
|
||
def update_receipt(
    self,
    *,
    receipt_id: str,
    payload: ReceiptFolderUpdate,
    current_user: CurrentUserContext,
) -> ReceiptFolderDetailRead:
    """Apply a user's edits to a receipt and return the refreshed detail.

    Only values that were explicitly set and are not ``None`` are applied.
    Classification and summary values go straight into the meta, while
    amount, date and merchant go into ``editable_fields``. A supplied field
    list replaces ``document_fields``. When anything changed, an entry is
    put at the front of ``edit_logs``, which is capped at 50 entries.
    """
    receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
    meta = self._read_meta(receipt_dir)
    # JSON round-trip gives an independent snapshot to diff against.
    snapshot = json.loads(json.dumps(meta, ensure_ascii=False))
    updates = payload.model_dump(exclude_unset=True)

    def _provided(name: str) -> bool:
        return updates.get(name) is not None

    for name in ("document_type", "document_type_label", "scene_code", "scene_label", "summary"):
        if _provided(name):
            meta[name] = str(updates[name] or "").strip()

    editable = dict(meta.get("editable_fields") or {})
    for name in ("amount", "document_date", "merchant_name"):
        if _provided(name):
            editable[name] = str(updates[name] or "").strip()

    if _provided("fields"):
        meta["document_fields"] = [
            field.model_dump() if isinstance(field, ReceiptFolderFieldRead) else dict(field)
            for field in payload.fields or []
        ]
    meta["editable_fields"] = editable

    changes = self._build_edit_changes(snapshot, meta)
    if changes:
        previous_logs = list(meta.get("edit_logs") or [])
        entry = {
            "operated_at": datetime.now(UTC).isoformat(),
            "operator": self._operator_label(current_user),
            "changes": changes,
        }
        meta["edit_logs"] = [entry, *previous_logs][:50]

    meta["updated_at"] = datetime.now(UTC).isoformat()
    self._write_meta(receipt_dir, meta)
    return self.get_receipt(receipt_id, current_user)
|
||
|
||
def delete_receipt(
    self,
    *,
    receipt_id: str,
    current_user: CurrentUserContext,
) -> ReceiptFolderDeleteResponse:
    """Remove one of the current user's receipts, directory and all.

    Errors raised by ``shutil.rmtree`` (for example a missing directory)
    propagate to the caller.
    """
    target_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
    shutil.rmtree(target_dir)
    return ReceiptFolderDeleteResponse(receipt_id=receipt_id, message="票据已删除。")
|
||
|
||
def unlink_receipts_for_claim(self, claim_id: str) -> int:
    """Detach every stored receipt linked to ``claim_id``.

    Scans ``meta.json`` files two levels under the root. Matching receipts
    get their link fields cleared, their status set to "unlinked" and
    ``updated_at`` refreshed. A blank claim id does nothing.

    Returns the number of receipts that were detached.
    """
    target_claim = str(claim_id or "").strip()
    if target_claim == "":
        return 0

    self.root.mkdir(parents=True, exist_ok=True)
    cleared_link = {
        "status": "unlinked",
        "linked_claim_id": "",
        "linked_claim_no": "",
        "linked_item_id": "",
        "linked_at": "",
    }
    touched = 0
    # The glob is materialised before any meta file is rewritten.
    for meta_file in list(self.root.glob("*/*/meta.json")):
        receipt_dir = meta_file.parent
        try:
            meta = self._read_meta(receipt_dir)
        except FileNotFoundError:
            continue
        if str(meta.get("linked_claim_id") or "").strip() == target_claim:
            meta.update(cleared_link)
            meta["updated_at"] = datetime.now(UTC).isoformat()
            self._write_meta(receipt_dir, meta)
            touched += 1
    return touched
|
||
|
||
def delete_receipts_for_claim(self, claim_id: str) -> int:
    """Detach receipts from a claim by delegating to ``unlink_receipts_for_claim``.

    Despite the name, this method itself removes nothing; it returns the
    count reported by the unlink call.
    """
    detached_count = self.unlink_receipts_for_claim(claim_id)
    return detached_count
|
||
|
||
def resolve_source(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
    """Locate the original uploaded file of a receipt.

    Returns ``(path, media_type, display_name)``. The file name is read from
    the meta's ``source_file_name`` (falling back to ``file_name``) and the
    path is checked by ``_assert_child``.

    Raises:
        FileNotFoundError: when the meta records no file name, or the
            recorded name does not point at an existing regular file.
    """
    meta = self._read_receipt_meta(receipt_id, current_user)
    receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
    file_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
    if not file_name:
        # An empty name would make the path the receipt directory itself.
        raise FileNotFoundError("Receipt source not found")

    path = self._assert_child(receipt_dir / file_name)
    if not path.is_file():
        # Directories must not be handed back as a downloadable source.
        raise FileNotFoundError("Receipt source not found")

    media_type = self.resolve_media_type(path.name, str(meta.get("media_type") or ""))
    return path, media_type, str(meta.get("file_name") or path.name)
|
||
|
||
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
    """Locate the file to show as a receipt's preview.

    Returns ``(path, media_type, file_name)``. The stored preview asset is
    used when the meta names one and it exists on disk; otherwise the source
    file is returned if its media type is previewable.

    Raises:
        FileNotFoundError: when neither a preview asset nor a previewable
            source is available.
    """
    meta = self._read_receipt_meta(receipt_id, current_user)
    receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
    meta = self._refresh_pdf_preview_asset_if_needed(receipt_dir=receipt_dir, meta=meta)

    preview_name = str(meta.get("preview_file_name") or "").strip()
    preview_path = self._assert_child(receipt_dir / preview_name) if preview_name else None
    if preview_path is not None and preview_path.exists():
        preview_media_type = self.resolve_media_type(
            preview_path.name,
            str(meta.get("preview_media_type") or ""),
        )
        return preview_path, preview_media_type, preview_path.name

    # No usable preview asset: fall back to the source file.
    source_path, source_media_type, source_name = self.resolve_source(receipt_id, current_user)
    if not self._is_previewable(source_media_type):
        raise FileNotFoundError("Receipt preview not found")
    return source_path, source_media_type, source_name
|