refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务

- user_agent 拆分为 application/locations/knowledge/response/review 五个子模块，接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构，travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存，expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖，paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表，同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
2026-06-24 10:42:24 +08:00
parent 332f77389d
commit 0264a4b5b4
41 changed files with 1273 additions and 182 deletions
--- a/server/src/app/services/ocr.py
+++ b/server/src/app/services/ocr.py
@@ -16,11 +16,13 @@ from sqlalchemy.orm import Session

 from app.core.config import SERVER_DIR, get_settings
 from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead, OcrRecognizeLineRead
+from app.services.document_preview import DocumentPreviewAssets
 from app.services.document_intelligence import DocumentIntelligenceService

 WORKER_JSON_PREFIX = "__OCR_JSON__="
 SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"}
 OCR_RESULT_CACHE_LIMIT = 32
+OCR_RESULT_CACHE_PIPELINE_VERSION = f"pdf-image-ocr:{DocumentPreviewAssets.PDF_RENDERER_ID}:no-pdf-direct-v2"


@dataclass(slots=True)
@@ -142,16 +144,6 @@ class OcrService:
                            cleanup_paths=cleanup_paths,
                            text_layer=text_layer,
                        )
-                        if self._has_usable_pdf_text_layer(text_layer):
-                            document = self._build_text_layer_document(
-                                filename=normalized_name,
-                                media_type=resolved_media_type,
-                                text_layer=text_layer,
-                                pdf_inputs=pdf_inputs,
-                            )
-                            documents.append(document)
-                            self._write_cached_document(cache_key, document)
-                            continue
                        prepared_inputs.extend(pdf_inputs)
                        for item in pdf_inputs:
                            cache_keys_by_source.setdefault(item.source_key, cache_key)
@@ -257,6 +249,7 @@ class OcrService:
        digest = hashlib.sha256(content).hexdigest()
        return "|".join(
            [
+                OCR_RESULT_CACHE_PIPELINE_VERSION,
                self.settings.ocr_language,
                self.settings.ocr_device,
                self.settings.ocr_text_detection_model,
@@ -406,11 +399,15 @@ class OcrService:
        output_dir.mkdir(parents=True, exist_ok=True)
        cleanup_paths.append(output_dir)

-        image_paths = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
+        image_paths, preview_usable = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
        if not image_paths:
            raise RuntimeError("PDF 转图片后未生成可识别页面。")

-        preview_data_url = self._build_preview_data_url(image_paths[0], media_type="image/png")
+        preview_data_url = (
+            self._build_preview_data_url(image_paths[0], media_type="image/png")
+            if preview_usable
+            else ""
+        )
        source_key = uuid4().hex
        descriptors: list[PreparedOcrInput] = []
        for page_index, image_path in enumerate(image_paths):
@@ -421,7 +418,7 @@ class OcrService:
                    filename=filename,
                    media_type=media_type,
                    page_index=page_index,
-                    preview_kind="image" if page_index == 0 else "",
+                    preview_kind="image" if page_index == 0 and preview_data_url else "",
                    preview_data_url=preview_data_url if page_index == 0 else "",
                    text_layer=text_layer if page_index == 0 else "",
                )
@@ -450,27 +447,17 @@ class OcrService:

        return self._normalize_extracted_text(completed.stdout)

-    def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
-        prefix = output_dir / "page"
-        completed = subprocess.run(
-            [
-                "pdftoppm",
-                "-png",
-                "-r",
-                "160",
-                str(pdf_path),
-                str(prefix),
-            ],
-            capture_output=True,
-            text=True,
-            timeout=self.settings.ocr_timeout_seconds,
-            check=False,
-        )
-        if completed.returncode != 0:
-            detail = (completed.stderr or completed.stdout or "").strip()
-            raise RuntimeError(f"PDF 转图片失败：{detail or 'pdftoppm 返回非 0 状态码。'}")
+    def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
+        try:
+            pages = DocumentPreviewAssets.render_pdf_pages(
+                pdf_path=pdf_path,
+                output_dir=output_dir,
+                timeout_seconds=self.settings.ocr_timeout_seconds,
+            )
+        except RuntimeError as exc:
+            raise RuntimeError(f"PDF 转图片失败：{exc}") from exc

-        return sorted(output_dir.glob("page-*.png"), key=self._extract_pdf_page_sort_key)
+        return pages, True

    @staticmethod
    def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
@@ -595,30 +582,6 @@ class OcrService:

        return documents

-    def _build_text_layer_document(
-        self,
-        *,
-        filename: str,
-        media_type: str,
-        text_layer: str,
-        pdf_inputs: list[PreparedOcrInput],
-    ) -> OcrRecognizeDocumentRead:
-        first_input = pdf_inputs[0] if pdf_inputs else None
-        aggregated = AggregatedOcrDocument(
-            filename=filename,
-            media_type=media_type,
-            source_key=first_input.source_key if first_input is not None else uuid4().hex,
-            page_count=max(1, len(pdf_inputs)),
-            preview_kind=str(first_input.preview_kind if first_input is not None else ""),
-            preview_data_url=str(first_input.preview_data_url if first_input is not None else ""),
-        )
-        aggregated.text_layer_fragments.append(text_layer)
-        return self._finalize_document(aggregated)
-
-    @classmethod
-    def _has_usable_pdf_text_layer(cls, text_layer: str) -> bool:
-        return cls._meaningful_char_count(text_layer) >= 8
-
    @staticmethod
    def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
        for descriptor in descriptors:
@@ -685,13 +648,6 @@ class OcrService:
            summary = self._summarize_text(full_text)
        preview_kind = aggregated.preview_kind
        preview_data_url = aggregated.preview_data_url
-        if (
-            used_text_layer
-            and aggregated.media_type == "application/pdf"
-            and self._placeholder_ratio(ocr_text) >= 0.12
-        ):
-            preview_kind = ""
-            preview_data_url = ""
        insight = self.document_intelligence_service.build_document_insight(
            filename=aggregated.filename,
            summary=summary,