feat: 报销审批流重构与管家计划全链路贯通
- 重构报销状态注册表、审批流路由与平台风险标记
- 完善管家意图规划器与模型计划构建器全链路
- 新增 OCR Worker 脚本、数据库会话管理与通知状态
- 优化文档中心、日志视图、预算中心与员工管理交互
- 增强工作台摘要、图标资源与全局主题样式
- 补充审批路由、状态注册、OCR 服务与管家规划器测试覆盖
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import threading
|
||||
from collections import OrderedDict
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
@@ -17,6 +20,7 @@ from app.services.document_intelligence import DocumentIntelligenceService
|
||||
|
||||
# NOTE(review): presumably the marker that precedes the worker's JSON payload
# in its output; the parsing code is not visible in this excerpt -- confirm.
WORKER_JSON_PREFIX = "__OCR_JSON__="
|
||||
# File suffixes (common image formats plus PDF) treated as supported.
# NOTE(review): the check that consults this set is outside this excerpt.
SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"}
|
||||
# Upper bound on the number of entries kept in OcrService._result_cache; once
# exceeded, entries at the front of the OrderedDict (the least recently
# written or read ones) are evicted.
OCR_RESULT_CACHE_LIMIT = 32
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@@ -50,6 +54,12 @@ class AggregatedOcrDocument:
|
||||
|
||||
|
||||
class OcrService:
|
||||
_cache_lock = threading.Lock()
|
||||
_result_cache: OrderedDict[str, OcrRecognizeDocumentRead] = OrderedDict()
|
||||
_worker_semaphore_lock = threading.Lock()
|
||||
_worker_semaphore: threading.Semaphore | None = None
|
||||
_worker_semaphore_limit = 0
|
||||
|
||||
def __init__(self, db: Session | None = None) -> None:
|
||||
self.settings = get_settings()
|
||||
self.document_intelligence_service = DocumentIntelligenceService(db)
|
||||
@@ -70,6 +80,7 @@ class OcrService:
|
||||
python_bin = self._resolve_python_bin()
|
||||
worker_path = self._resolve_worker_path()
|
||||
worker_payload: dict = {}
|
||||
cache_keys_by_source: dict[str, str] = {}
|
||||
|
||||
try:
|
||||
for filename, content, media_type in files:
|
||||
@@ -109,6 +120,16 @@ class OcrService:
|
||||
)
|
||||
continue
|
||||
|
||||
cache_key = self._build_cache_key(content)
|
||||
cached_document = self._read_cached_document(
|
||||
cache_key,
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
)
|
||||
if cached_document is not None:
|
||||
documents.append(cached_document)
|
||||
continue
|
||||
|
||||
temp_path = temp_root / f"{uuid4().hex}{suffix}"
|
||||
temp_path.write_bytes(content)
|
||||
cleanup_paths.append(temp_path)
|
||||
@@ -116,15 +137,16 @@ class OcrService:
|
||||
if suffix == ".pdf":
|
||||
try:
|
||||
text_layer = self._extract_pdf_text_layer(temp_path)
|
||||
prepared_inputs.extend(
|
||||
self._prepare_pdf_inputs(
|
||||
pdf_path=temp_path,
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
cleanup_paths=cleanup_paths,
|
||||
text_layer=text_layer,
|
||||
)
|
||||
pdf_inputs = self._prepare_pdf_inputs(
|
||||
pdf_path=temp_path,
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
cleanup_paths=cleanup_paths,
|
||||
text_layer=text_layer,
|
||||
)
|
||||
prepared_inputs.extend(pdf_inputs)
|
||||
for item in pdf_inputs:
|
||||
cache_keys_by_source.setdefault(item.source_key, cache_key)
|
||||
except RuntimeError as exc:
|
||||
documents.append(
|
||||
OcrRecognizeDocumentRead(
|
||||
@@ -135,10 +157,11 @@ class OcrService:
|
||||
)
|
||||
continue
|
||||
|
||||
source_key = uuid4().hex
|
||||
prepared_inputs.append(
|
||||
PreparedOcrInput(
|
||||
input_path=temp_path,
|
||||
source_key=uuid4().hex,
|
||||
source_key=source_key,
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
preview_kind="image" if resolved_media_type.startswith("image/") else "",
|
||||
@@ -149,6 +172,7 @@ class OcrService:
|
||||
),
|
||||
)
|
||||
)
|
||||
cache_keys_by_source[source_key] = cache_key
|
||||
|
||||
if prepared_inputs:
|
||||
worker_payload = self._invoke_worker(
|
||||
@@ -156,11 +180,15 @@ class OcrService:
|
||||
worker_path=worker_path,
|
||||
input_paths=[item.input_path for item in prepared_inputs],
|
||||
)
|
||||
documents.extend(
|
||||
self._build_documents(
|
||||
worker_documents=worker_payload.get("documents", []),
|
||||
prepared_inputs=prepared_inputs,
|
||||
)
|
||||
recognized_documents = self._build_documents(
|
||||
worker_documents=worker_payload.get("documents", []),
|
||||
prepared_inputs=prepared_inputs,
|
||||
)
|
||||
documents.extend(recognized_documents)
|
||||
self._write_cached_documents(
|
||||
recognized_documents,
|
||||
prepared_inputs=prepared_inputs,
|
||||
cache_keys_by_source=cache_keys_by_source,
|
||||
)
|
||||
|
||||
success_count = sum(
|
||||
@@ -215,6 +243,79 @@ class OcrService:
|
||||
raise RuntimeError(f"OCR worker 不存在:{worker_path}")
|
||||
return str(worker_path)
|
||||
|
||||
def _build_cache_key(self, content: bytes) -> str:
|
||||
digest = hashlib.sha256(content).hexdigest()
|
||||
return "|".join(
|
||||
[
|
||||
self.settings.ocr_language,
|
||||
self.settings.ocr_text_detection_model,
|
||||
self.settings.ocr_text_recognition_model,
|
||||
digest,
|
||||
]
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _read_cached_document(
|
||||
cls,
|
||||
cache_key: str,
|
||||
*,
|
||||
filename: str,
|
||||
media_type: str,
|
||||
) -> OcrRecognizeDocumentRead | None:
|
||||
if not cache_key:
|
||||
return None
|
||||
with cls._cache_lock:
|
||||
cached = cls._result_cache.get(cache_key)
|
||||
if cached is None:
|
||||
return None
|
||||
cls._result_cache.move_to_end(cache_key)
|
||||
return cached.model_copy(update={"filename": filename, "media_type": media_type})
|
||||
|
||||
@classmethod
def _write_cached_documents(
    cls,
    documents: list[OcrRecognizeDocumentRead],
    *,
    prepared_inputs: list[PreparedOcrInput],
    cache_keys_by_source: dict[str, str],
) -> None:
    """Store freshly recognized documents in the shared result cache.

    Args:
        documents: Recognized documents to cache.
        prepared_inputs: Worker inputs; their ``source_key`` values, with
            duplicates removed in first-seen order, are paired
            positionally with ``documents``.
        cache_keys_by_source: Maps a ``source_key`` to its cache key.
            Sources without a (non-empty) cache key are not cached.

    Returns:
        None. The class-level cache is updated in place and trimmed to
        ``OCR_RESULT_CACHE_LIMIT`` entries.
    """
    if not documents or not cache_keys_by_source:
        return

    # Unique source keys in first-seen order (several prepared inputs may
    # share one source key).
    source_order = list(dict.fromkeys(item.source_key for item in prepared_inputs))

    with cls._cache_lock:
        # NOTE(review): pairing is purely positional and zip() stops at the
        # shorter sequence; this assumes ``documents`` holds one entry per
        # unique source in the same order -- confirm against the code that
        # builds ``documents``.
        for source_key, document in zip(source_order, documents, strict=False):
            cache_key = cache_keys_by_source.get(source_key, "")
            if not cache_key:
                continue
            # Receipt fields are blanked before the entry is stored.
            # Deep copy so the stored entry shares no nested mutable state
            # with the document object that the caller keeps using.
            cls._result_cache[cache_key] = document.model_copy(
                update={
                    "receipt_id": "",
                    "receipt_status": "",
                    "receipt_preview_url": "",
                    "receipt_source_url": "",
                },
                deep=True,
            )
            cls._result_cache.move_to_end(cache_key)
            # Evict from the front (least recently used) until within limit.
            while len(cls._result_cache) > OCR_RESULT_CACHE_LIMIT:
                cls._result_cache.popitem(last=False)
|
||||
|
||||
@classmethod
|
||||
def _resolve_worker_semaphore(cls, limit: int) -> threading.Semaphore:
|
||||
normalized_limit = max(1, int(limit or 1))
|
||||
with cls._worker_semaphore_lock:
|
||||
if cls._worker_semaphore is None or cls._worker_semaphore_limit != normalized_limit:
|
||||
cls._worker_semaphore = threading.Semaphore(normalized_limit)
|
||||
cls._worker_semaphore_limit = normalized_limit
|
||||
return cls._worker_semaphore
|
||||
|
||||
def _invoke_worker(
|
||||
self,
|
||||
*,
|
||||
@@ -235,13 +336,15 @@ class OcrService:
|
||||
for path in input_paths:
|
||||
command.extend(["--input", str(path)])
|
||||
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.settings.ocr_timeout_seconds,
|
||||
check=False,
|
||||
)
|
||||
semaphore = self._resolve_worker_semaphore(self.settings.ocr_max_concurrent_workers)
|
||||
with semaphore:
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.settings.ocr_timeout_seconds,
|
||||
check=False,
|
||||
)
|
||||
if completed.returncode != 0:
|
||||
detail = (completed.stderr or completed.stdout or "").strip()
|
||||
raise RuntimeError(f"OCR 执行失败:{detail or 'worker 返回非 0 状态码。'}")
|
||||
|
||||
Reference in New Issue
Block a user