Files
X-Financial/server/src/app/services/receipt_folder.py
caoxiaozhu 4c59941ec6 feat: 新增票据夹模块并优化 OCR 与员工画像服务
后端新增票据夹端点、数据模型和服务模块,优化 OCR 端点
Schema 和附件操作逻辑,完善员工行为画像服务和辅助函数,
前端新增票据夹视图和服务层,优化文档中心样式和侧边栏导
航,完善员工画像详情弹窗和权限控制,补充单元测试。
2026-05-29 14:51:18 +08:00

533 lines
22 KiB
Python

from __future__ import annotations
import json
import mimetypes
import re
import shutil
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from uuid import uuid4
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.schemas.receipt_folder import (
ReceiptFolderDeleteResponse,
ReceiptFolderDetailRead,
ReceiptFolderFieldRead,
ReceiptFolderItemRead,
ReceiptFolderUpdate,
)
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
from app.services.ocr import SUPPORTED_SUFFIXES
class ReceiptFolderService:
def __init__(self) -> None:
self.root = (get_settings().resolved_storage_root_dir / "receipt_folder").resolve()
def persist_ocr_batch(
self,
*,
files: list[tuple[str, bytes, str | None]],
result: OcrRecognizeBatchRead,
current_user: CurrentUserContext,
receipt_ids: list[str] | None = None,
) -> OcrRecognizeBatchRead:
documents = list(result.documents or [])
enriched: list[OcrRecognizeDocumentRead] = []
for index, document in enumerate(documents):
if index >= len(files):
enriched.append(document)
continue
existing_receipt = self._resolve_existing_item(
receipt_ids[index] if receipt_ids and index < len(receipt_ids) else "",
current_user,
)
if existing_receipt is not None:
enriched.append(
document.model_copy(
update={
"receipt_id": existing_receipt.id,
"receipt_status": existing_receipt.status,
"receipt_preview_url": existing_receipt.preview_url,
"receipt_source_url": existing_receipt.source_url,
}
)
)
continue
filename, content, media_type = files[index]
if not self._should_persist_source(filename, content):
enriched.append(document)
continue
receipt = self.save_receipt(
filename=filename,
content=content,
media_type=media_type or document.media_type,
document=document,
current_user=current_user,
)
enriched.append(
document.model_copy(
update={
"receipt_id": receipt.id,
"receipt_status": receipt.status,
"receipt_preview_url": receipt.preview_url,
"receipt_source_url": receipt.source_url,
}
)
)
return result.model_copy(update={"documents": enriched})
def save_receipt(
self,
*,
filename: str,
content: bytes,
media_type: str | None,
document: Any | None,
current_user: CurrentUserContext,
linked_claim_id: str = "",
linked_claim_no: str = "",
linked_item_id: str = "",
) -> ReceiptFolderItemRead:
owner_key = self._owner_key(current_user)
receipt_id = str(uuid4())
receipt_dir = self._owner_root(owner_key) / receipt_id
receipt_dir.mkdir(parents=True, exist_ok=True)
normalized_name = self.normalize_filename(filename)
source_path = receipt_dir / normalized_name
source_path.write_bytes(content)
resolved_media_type = self.resolve_media_type(normalized_name, media_type)
preview_meta = self._write_preview_asset(
receipt_dir=receipt_dir,
source_path=source_path,
media_type=resolved_media_type,
document=document,
)
now = datetime.now(UTC)
linked = bool(str(linked_claim_id or "").strip())
meta = {
"id": receipt_id,
"owner_key": owner_key,
"file_name": normalized_name,
"source_file_name": normalized_name,
"media_type": resolved_media_type,
"size_bytes": len(content),
"uploaded_at": now.isoformat(),
"status": "linked" if linked else "unlinked",
"linked_claim_id": str(linked_claim_id or "").strip(),
"linked_claim_no": str(linked_claim_no or "").strip(),
"linked_item_id": str(linked_item_id or "").strip(),
"linked_at": now.isoformat() if linked else "",
**self._build_document_meta(document),
**preview_meta,
}
self._write_meta(receipt_dir, meta)
return self._build_item(meta)
def save_linked_attachment(
self,
*,
file_path: Path,
media_type: str,
document: Any | None,
current_user: CurrentUserContext,
claim_id: str,
claim_no: str,
item_id: str,
source_receipt_id: str = "",
) -> ReceiptFolderItemRead | None:
if not file_path.exists() or not file_path.is_file():
return None
if str(source_receipt_id or "").strip():
try:
return self.mark_receipt_linked(
receipt_id=source_receipt_id,
current_user=current_user,
claim_id=claim_id,
claim_no=claim_no,
item_id=item_id,
)
except FileNotFoundError:
pass
storage_root = get_settings().resolved_storage_root_dir
try:
file_path.resolve().relative_to(storage_root)
except ValueError:
return None
return self.save_receipt(
filename=file_path.name,
content=file_path.read_bytes(),
media_type=media_type,
document=document,
current_user=current_user,
linked_claim_id=claim_id,
linked_claim_no=claim_no,
linked_item_id=item_id,
)
def mark_receipt_linked(
self,
*,
receipt_id: str,
current_user: CurrentUserContext,
claim_id: str,
claim_no: str,
item_id: str,
) -> ReceiptFolderItemRead:
owner_key = self._owner_key(current_user)
receipt_dir = self._receipt_dir(owner_key, receipt_id)
meta = self._read_meta(receipt_dir)
meta["status"] = "linked"
meta["linked_claim_id"] = str(claim_id or "").strip()
meta["linked_claim_no"] = str(claim_no or "").strip()
meta["linked_item_id"] = str(item_id or "").strip()
meta["linked_at"] = datetime.now(UTC).isoformat()
self._write_meta(receipt_dir, meta)
return self._build_item(meta)
def list_receipts(
self,
*,
current_user: CurrentUserContext,
status_filter: str = "all",
) -> list[ReceiptFolderItemRead]:
status_filter = str(status_filter or "all").strip().lower()
items = [
self._build_item(meta)
for meta in self._iter_owner_meta(self._owner_key(current_user))
if self._matches_status(meta, status_filter)
]
return sorted(items, key=lambda item: item.uploaded_at or datetime.min.replace(tzinfo=UTC), reverse=True)
def get_receipt(self, receipt_id: str, current_user: CurrentUserContext) -> ReceiptFolderDetailRead:
meta = self._read_receipt_meta(receipt_id, current_user)
item = self._build_item(meta)
return ReceiptFolderDetailRead(
**item.model_dump(),
engine=str(meta.get("engine") or ""),
model=str(meta.get("model") or ""),
ocr_text=str(meta.get("ocr_text") or ""),
line_count=int(meta.get("ocr_line_count") or 0),
page_count=max(1, int(meta.get("page_count") or 1)),
classification_confidence=float(meta.get("ocr_classification_confidence") or 0.0),
classification_evidence=[
str(value) for value in list(meta.get("ocr_classification_evidence") or []) if str(value).strip()
],
fields=self._resolve_fields(meta),
raw_meta=meta,
)
def update_receipt(
self,
*,
receipt_id: str,
payload: ReceiptFolderUpdate,
current_user: CurrentUserContext,
) -> ReceiptFolderDetailRead:
owner_key = self._owner_key(current_user)
receipt_dir = self._receipt_dir(owner_key, receipt_id)
meta = self._read_meta(receipt_dir)
updates = payload.model_dump(exclude_unset=True)
for key in ("document_type", "document_type_label", "scene_code", "scene_label", "summary"):
if key in updates and updates[key] is not None:
meta[key] = str(updates[key] or "").strip()
editable = dict(meta.get("editable_fields") or {})
for key in ("amount", "document_date", "merchant_name"):
if key in updates and updates[key] is not None:
editable[key] = str(updates[key] or "").strip()
if "fields" in updates and updates["fields"] is not None:
meta["document_fields"] = [
field.model_dump() if isinstance(field, ReceiptFolderFieldRead) else dict(field)
for field in payload.fields or []
]
meta["editable_fields"] = editable
meta["updated_at"] = datetime.now(UTC).isoformat()
self._write_meta(receipt_dir, meta)
return self.get_receipt(receipt_id, current_user)
def delete_receipt(
self,
*,
receipt_id: str,
current_user: CurrentUserContext,
) -> ReceiptFolderDeleteResponse:
owner_key = self._owner_key(current_user)
receipt_dir = self._receipt_dir(owner_key, receipt_id)
shutil.rmtree(receipt_dir)
return ReceiptFolderDeleteResponse(message="票据已删除。", receipt_id=receipt_id)
def resolve_source(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
meta = self._read_receipt_meta(receipt_id, current_user)
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
file_name = str(meta.get("source_file_name") or meta.get("file_name") or "").strip()
path = self._assert_child(receipt_dir / file_name)
if not path.exists():
raise FileNotFoundError("Receipt source not found")
media_type = self.resolve_media_type(path.name, str(meta.get("media_type") or ""))
return path, media_type, str(meta.get("file_name") or path.name)
def resolve_preview(self, receipt_id: str, current_user: CurrentUserContext) -> tuple[Path, str, str]:
meta = self._read_receipt_meta(receipt_id, current_user)
receipt_dir = self._receipt_dir(self._owner_key(current_user), receipt_id)
preview_name = str(meta.get("preview_file_name") or "").strip()
if preview_name:
preview_path = self._assert_child(receipt_dir / preview_name)
if preview_path.exists():
return (
preview_path,
self.resolve_media_type(preview_path.name, str(meta.get("preview_media_type") or "")),
preview_path.name,
)
source_path, source_media_type, source_name = self.resolve_source(receipt_id, current_user)
if self._is_previewable(source_media_type):
return source_path, source_media_type, source_name
raise FileNotFoundError("Receipt preview not found")
@staticmethod
def normalize_filename(filename: str | None) -> str:
normalized = Path(str(filename or "").strip()).name
normalized = re.sub(r"[^\w.\-\u4e00-\u9fff]+", "_", normalized).strip("._")
return normalized or "receipt.bin"
@staticmethod
def resolve_media_type(filename: str, fallback: str | None = None) -> str:
return str(mimetypes.guess_type(filename)[0] or fallback or "application/octet-stream")
def _owner_root(self, owner_key: str) -> Path:
return self._assert_child(self.root / owner_key)
def _receipt_dir(self, owner_key: str, receipt_id: str) -> Path:
normalized = str(receipt_id or "").strip()
if not re.fullmatch(r"[0-9a-fA-F-]{32,36}", normalized):
raise FileNotFoundError("Receipt not found")
path = self._assert_child(self._owner_root(owner_key) / normalized)
if not path.exists() or not path.is_dir():
raise FileNotFoundError("Receipt not found")
return path
def _assert_child(self, path: Path) -> Path:
self.root.mkdir(parents=True, exist_ok=True)
resolved = path.resolve()
try:
resolved.relative_to(self.root)
except ValueError as exc:
raise FileNotFoundError("Receipt path is invalid") from exc
return resolved
@staticmethod
def _owner_key(current_user: CurrentUserContext) -> str:
raw = str(current_user.username or current_user.name or "anonymous").strip().lower()
normalized = re.sub(r"[^\w.\-\u4e00-\u9fff]+", "_", raw).strip("._")
return normalized or "anonymous"
@staticmethod
def _should_persist_source(filename: str, content: bytes) -> bool:
if not content:
return False
return Path(str(filename or "")).suffix.lower() in SUPPORTED_SUFFIXES
def _write_preview_asset(
self,
*,
receipt_dir: Path,
source_path: Path,
media_type: str,
document: Any | None,
) -> dict[str, Any]:
preview_data_url = str(getattr(document, "preview_data_url", "") or "").strip()
decoded = ExpenseClaimAttachmentPresentation.decode_data_url(preview_data_url)
if decoded is not None:
preview_media_type, preview_content = decoded
suffix = mimetypes.guess_extension(preview_media_type) or ".bin"
preview_name = f"preview{suffix}"
preview_path = receipt_dir / preview_name
preview_path.write_bytes(preview_content)
return {
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_name,
"preview_media_type": preview_media_type,
}
if self._is_previewable(media_type):
return {
"previewable": True,
"preview_kind": "image" if media_type.startswith("image/") else "pdf",
"preview_file_name": source_path.name,
"preview_media_type": media_type,
}
return {
"previewable": False,
"preview_kind": "",
"preview_file_name": "",
"preview_media_type": "",
}
@staticmethod
def _is_previewable(media_type: str) -> bool:
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@staticmethod
def _build_document_meta(document: Any | None) -> dict[str, Any]:
fields = []
for field in list(getattr(document, "document_fields", []) or []):
if isinstance(field, dict):
fields.append(
{
"key": str(field.get("key") or "").strip(),
"label": str(field.get("label") or "").strip(),
"value": str(field.get("value") or "").strip(),
}
)
else:
fields.append(
{
"key": str(getattr(field, "key", "") or "").strip(),
"label": str(getattr(field, "label", "") or "").strip(),
"value": str(getattr(field, "value", "") or "").strip(),
}
)
fields = [field for field in fields if field["label"] and field["value"]]
return {
"engine": str(getattr(document, "engine", "") or ""),
"model": str(getattr(document, "model", "") or ""),
"ocr_text": str(getattr(document, "text", "") or ""),
"summary": str(getattr(document, "summary", "") or ""),
"ocr_avg_score": float(getattr(document, "avg_score", 0.0) or 0.0),
"ocr_line_count": int(getattr(document, "line_count", 0) or 0),
"page_count": int(getattr(document, "page_count", 1) or 1),
"document_type": str(getattr(document, "document_type", "") or "other"),
"document_type_label": str(getattr(document, "document_type_label", "") or "其他单据"),
"scene_code": str(getattr(document, "scene_code", "") or "other"),
"scene_label": str(getattr(document, "scene_label", "") or "其他票据"),
"ocr_classification_source": str(getattr(document, "classification_source", "") or ""),
"ocr_classification_confidence": float(getattr(document, "classification_confidence", 0.0) or 0.0),
"ocr_classification_evidence": [
str(value) for value in list(getattr(document, "classification_evidence", []) or []) if str(value).strip()
],
"document_fields": fields,
"editable_fields": {},
"ocr_warnings": [str(value) for value in list(getattr(document, "warnings", []) or []) if str(value).strip()],
}
def _iter_owner_meta(self, owner_key: str) -> list[dict[str, Any]]:
owner_root = self._owner_root(owner_key)
if not owner_root.exists():
return []
metas = []
for meta_path in owner_root.glob("*/meta.json"):
meta = self._read_meta(meta_path.parent)
if meta:
metas.append(meta)
return metas
def _read_receipt_meta(self, receipt_id: str, current_user: CurrentUserContext) -> dict[str, Any]:
return self._read_meta(self._receipt_dir(self._owner_key(current_user), receipt_id))
def _resolve_existing_item(
self,
receipt_id: str | None,
current_user: CurrentUserContext,
) -> ReceiptFolderItemRead | None:
normalized = str(receipt_id or "").strip()
if not normalized:
return None
try:
return self._build_item(self._read_receipt_meta(normalized, current_user))
except FileNotFoundError:
return None
@staticmethod
def _meta_path(receipt_dir: Path) -> Path:
return receipt_dir / "meta.json"
def _read_meta(self, receipt_dir: Path) -> dict[str, Any]:
meta_path = self._meta_path(receipt_dir)
if not meta_path.exists():
raise FileNotFoundError("Receipt not found")
try:
payload = json.loads(meta_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError) as exc:
raise FileNotFoundError("Receipt metadata not found") from exc
return payload if isinstance(payload, dict) else {}
def _write_meta(self, receipt_dir: Path, payload: dict[str, Any]) -> None:
self._meta_path(receipt_dir).write_text(
json.dumps(payload, ensure_ascii=False, indent=2),
encoding="utf-8",
)
@staticmethod
def _matches_status(meta: dict[str, Any], status_filter: str) -> bool:
if status_filter in {"", "all"}:
return True
return str(meta.get("status") or "unlinked").strip().lower() == status_filter
def _build_item(self, meta: dict[str, Any]) -> ReceiptFolderItemRead:
receipt_id = str(meta.get("id") or "").strip()
status_value = str(meta.get("status") or "unlinked").strip() or "unlinked"
return ReceiptFolderItemRead(
id=receipt_id,
file_name=str(meta.get("file_name") or ""),
media_type=str(meta.get("media_type") or "application/octet-stream"),
size_bytes=int(meta.get("size_bytes") or 0),
status=status_value,
status_label="已关联" if status_value == "linked" else "未关联",
document_type=str(meta.get("document_type") or "other"),
document_type_label=str(meta.get("document_type_label") or "其他单据"),
scene_code=str(meta.get("scene_code") or "other"),
scene_label=str(meta.get("scene_label") or "其他票据"),
summary=str(meta.get("summary") or ""),
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
document_date=self._resolve_editable_or_field(meta, "document_date", labels=("日期", "开票日期", "乘车日期")),
merchant_name=self._resolve_editable_or_field(meta, "merchant_name", labels=("商户", "销售方", "收款方")),
avg_score=float(meta.get("ocr_avg_score") or 0.0),
uploaded_at=self._parse_datetime(meta.get("uploaded_at")),
linked_at=self._parse_datetime(meta.get("linked_at")),
linked_claim_id=str(meta.get("linked_claim_id") or ""),
linked_claim_no=str(meta.get("linked_claim_no") or ""),
previewable=bool(meta.get("previewable")),
preview_kind=str(meta.get("preview_kind") or ""),
preview_url=f"/receipt-folder/{receipt_id}/preview" if bool(meta.get("previewable")) and receipt_id else "",
source_url=f"/receipt-folder/{receipt_id}/source" if receipt_id else "",
warnings=[str(value) for value in list(meta.get("ocr_warnings") or []) if str(value).strip()],
)
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
return [
ReceiptFolderFieldRead(
key=str(field.get("key") or ""),
label=str(field.get("label") or ""),
value=str(field.get("value") or ""),
)
for field in list(meta.get("document_fields") or [])
if isinstance(field, dict) and str(field.get("label") or "").strip()
]
def _resolve_editable_or_field(self, meta: dict[str, Any], key: str, *, labels: tuple[str, ...]) -> str:
editable = meta.get("editable_fields")
if isinstance(editable, dict):
value = str(editable.get(key) or "").strip()
if value:
return value
label_set = set(labels)
for field in self._resolve_fields(meta):
if field.label in label_set or field.key == key:
return field.value
return ""
@staticmethod
def _parse_datetime(value: Any) -> datetime | None:
raw = str(value or "").strip()
if not raw:
return None
try:
return datetime.fromisoformat(raw)
except ValueError:
return None