refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务

- user_agent 拆分 application/locations/knowledge/response/review 五个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
This commit is contained in:
caoxiaozhu
2026-06-24 10:42:24 +08:00
parent 332f77389d
commit 0264a4b5b4
41 changed files with 1273 additions and 182 deletions

View File

@@ -16,11 +16,13 @@ from sqlalchemy.orm import Session
from app.core.config import SERVER_DIR, get_settings
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead, OcrRecognizeLineRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import DocumentIntelligenceService
WORKER_JSON_PREFIX = "__OCR_JSON__="
SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"}
OCR_RESULT_CACHE_LIMIT = 32
OCR_RESULT_CACHE_PIPELINE_VERSION = f"pdf-image-ocr:{DocumentPreviewAssets.PDF_RENDERER_ID}:no-pdf-direct-v2"
@dataclass(slots=True)
@@ -142,16 +144,6 @@ class OcrService:
cleanup_paths=cleanup_paths,
text_layer=text_layer,
)
if self._has_usable_pdf_text_layer(text_layer):
document = self._build_text_layer_document(
filename=normalized_name,
media_type=resolved_media_type,
text_layer=text_layer,
pdf_inputs=pdf_inputs,
)
documents.append(document)
self._write_cached_document(cache_key, document)
continue
prepared_inputs.extend(pdf_inputs)
for item in pdf_inputs:
cache_keys_by_source.setdefault(item.source_key, cache_key)
@@ -257,6 +249,7 @@ class OcrService:
digest = hashlib.sha256(content).hexdigest()
return "|".join(
[
OCR_RESULT_CACHE_PIPELINE_VERSION,
self.settings.ocr_language,
self.settings.ocr_device,
self.settings.ocr_text_detection_model,
@@ -406,11 +399,15 @@ class OcrService:
output_dir.mkdir(parents=True, exist_ok=True)
cleanup_paths.append(output_dir)
image_paths = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
image_paths, preview_usable = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
if not image_paths:
raise RuntimeError("PDF 转图片后未生成可识别页面。")
preview_data_url = self._build_preview_data_url(image_paths[0], media_type="image/png")
preview_data_url = (
self._build_preview_data_url(image_paths[0], media_type="image/png")
if preview_usable
else ""
)
source_key = uuid4().hex
descriptors: list[PreparedOcrInput] = []
for page_index, image_path in enumerate(image_paths):
@@ -421,7 +418,7 @@ class OcrService:
filename=filename,
media_type=media_type,
page_index=page_index,
preview_kind="image" if page_index == 0 else "",
preview_kind="image" if page_index == 0 and preview_data_url else "",
preview_data_url=preview_data_url if page_index == 0 else "",
text_layer=text_layer if page_index == 0 else "",
)
@@ -450,27 +447,17 @@ class OcrService:
return self._normalize_extracted_text(completed.stdout)
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
prefix = output_dir / "page"
completed = subprocess.run(
[
"pdftoppm",
"-png",
"-r",
"160",
str(pdf_path),
str(prefix),
],
capture_output=True,
text=True,
timeout=self.settings.ocr_timeout_seconds,
check=False,
)
if completed.returncode != 0:
detail = (completed.stderr or completed.stdout or "").strip()
raise RuntimeError(f"PDF 转图片失败:{detail or 'pdftoppm 返回非 0 状态码。'}")
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
try:
pages = DocumentPreviewAssets.render_pdf_pages(
pdf_path=pdf_path,
output_dir=output_dir,
timeout_seconds=self.settings.ocr_timeout_seconds,
)
except RuntimeError as exc:
raise RuntimeError(f"PDF 转图片失败:{exc}") from exc
return sorted(output_dir.glob("page-*.png"), key=self._extract_pdf_page_sort_key)
return pages, True
@staticmethod
def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
@@ -595,30 +582,6 @@ class OcrService:
return documents
def _build_text_layer_document(
self,
*,
filename: str,
media_type: str,
text_layer: str,
pdf_inputs: list[PreparedOcrInput],
) -> OcrRecognizeDocumentRead:
first_input = pdf_inputs[0] if pdf_inputs else None
aggregated = AggregatedOcrDocument(
filename=filename,
media_type=media_type,
source_key=first_input.source_key if first_input is not None else uuid4().hex,
page_count=max(1, len(pdf_inputs)),
preview_kind=str(first_input.preview_kind if first_input is not None else ""),
preview_data_url=str(first_input.preview_data_url if first_input is not None else ""),
)
aggregated.text_layer_fragments.append(text_layer)
return self._finalize_document(aggregated)
@classmethod
def _has_usable_pdf_text_layer(cls, text_layer: str) -> bool:
return cls._meaningful_char_count(text_layer) >= 8
@staticmethod
def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
for descriptor in descriptors:
@@ -685,13 +648,6 @@ class OcrService:
summary = self._summarize_text(full_text)
preview_kind = aggregated.preview_kind
preview_data_url = aggregated.preview_data_url
if (
used_text_layer
and aggregated.media_type == "application/pdf"
and self._placeholder_ratio(ocr_text) >= 0.12
):
preview_kind = ""
preview_data_url = ""
insight = self.document_intelligence_service.build_document_insight(
filename=aggregated.filename,
summary=summary,