feat: 增加差旅报销标准测算和财务终审流程

新增差旅报销测算接口及 Spreadsheet 规则解析，审批流程拆分直属领导审批与财务终审两阶段并细分权限，修复 PDF 文本层缺失时自动回退 OCR，提交后清理关联会话，前端适配审批流交互并补充单元测试。
2026-05-21 09:28:33 +08:00
parent 002bf4f756
commit 8f65661809
43 changed files with 4366 additions and 410 deletions
--- a/server/src/app/services/ocr.py
+++ b/server/src/app/services/ocr.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import base64
 import json
+import re
 import shutil
 import subprocess
 from dataclasses import dataclass, field
@@ -27,6 +28,7 @@ class PreparedOcrInput:
    page_index: int | None = None
    preview_kind: str = ""
    preview_data_url: str = ""
+    text_layer: str = ""


@dataclass(slots=True)
@@ -38,6 +40,7 @@ class AggregatedOcrDocument:
    model: str = "PP-OCRv5_mobile"
    summary_fragments: list[str] = field(default_factory=list)
    text_fragments: list[str] = field(default_factory=list)
+    text_layer_fragments: list[str] = field(default_factory=list)
    score_values: list[float] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    lines: list[OcrRecognizeLineRead] = field(default_factory=list)
@@ -112,12 +115,14 @@ class OcrService:

                if suffix == ".pdf":
                    try:
+                        text_layer = self._extract_pdf_text_layer(temp_path)
                        prepared_inputs.extend(
                            self._prepare_pdf_inputs(
                                pdf_path=temp_path,
                                filename=normalized_name,
                                media_type=resolved_media_type,
                                cleanup_paths=cleanup_paths,
+                                text_layer=text_layer,
                            )
                        )
                    except RuntimeError as exc:
@@ -261,6 +266,7 @@ class OcrService:
        filename: str,
        media_type: str,
        cleanup_paths: list[Path],
+        text_layer: str = "",
    ) -> list[PreparedOcrInput]:
        output_dir = pdf_path.with_suffix("")
        output_dir.mkdir(parents=True, exist_ok=True)
@@ -283,10 +289,33 @@ class OcrService:
                    page_index=page_index,
                    preview_kind="image" if page_index == 0 else "",
                    preview_data_url=preview_data_url if page_index == 0 else "",
+                    text_layer=text_layer if page_index == 0 else "",
                )
            )
        return descriptors

+    def _extract_pdf_text_layer(self, pdf_path: Path) -> str:
+        try:
+            completed = subprocess.run(
+                [
+                    "pdftotext",
+                    "-layout",
+                    str(pdf_path),
+                    "-",
+                ],
+                capture_output=True,
+                text=True,
+                timeout=self.settings.ocr_timeout_seconds,
+                check=False,
+            )
+        except (OSError, subprocess.SubprocessError, UnicodeError):
+            return ""
+
+        if completed.returncode != 0:
+            return ""
+
+        return self._normalize_extracted_text(completed.stdout)
+
    def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        prefix = output_dir / "page"
        completed = subprocess.run(
@@ -367,6 +396,8 @@ class OcrService:
                aggregated.preview_kind = descriptor.preview_kind
            if descriptor.preview_data_url and not aggregated.preview_data_url:
                aggregated.preview_data_url = descriptor.preview_data_url
+            if descriptor.text_layer and descriptor.text_layer not in aggregated.text_layer_fragments:
+                aggregated.text_layer_fragments.append(descriptor.text_layer)

            page_summary = str(payload.get("summary", "") or "").strip()
            if page_summary:
@@ -401,6 +432,20 @@ class OcrService:
            aggregated = aggregated_by_source.get(source_key)
            if aggregated is None:
                first_descriptor = descriptors[0]
+                text_layer = self._collect_descriptor_text_layer(descriptors)
+                if text_layer:
+                    fallback = AggregatedOcrDocument(
+                        filename=first_descriptor.filename,
+                        media_type=first_descriptor.media_type,
+                        source_key=first_descriptor.source_key,
+                        page_count=max(1, len(descriptors)),
+                        preview_kind=first_descriptor.preview_kind,
+                        preview_data_url=first_descriptor.preview_data_url,
+                        warnings=["OCR worker 未返回该文件的识别结果，已使用 PDF 文本层。"],
+                    )
+                    fallback.text_layer_fragments.append(text_layer)
+                    documents.append(self._finalize_document(fallback))
+                    continue
                documents.append(
                    OcrRecognizeDocumentRead(
                        filename=first_descriptor.filename,
@@ -416,6 +461,13 @@ class OcrService:

        return documents

+    @staticmethod
+    def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
+        for descriptor in descriptors:
+            if descriptor.text_layer:
+                return descriptor.text_layer
+        return ""
+
    @staticmethod
    def _build_lines(
        items: list[dict],
@@ -451,13 +503,26 @@ class OcrService:
        return summary

    def _finalize_document(self, aggregated: AggregatedOcrDocument) -> OcrRecognizeDocumentRead:
-        full_text = "\n".join(fragment for fragment in aggregated.text_fragments if fragment).strip()
+        ocr_text = "\n".join(fragment for fragment in aggregated.text_fragments if fragment).strip()
+        text_layer = "\n".join(fragment for fragment in aggregated.text_layer_fragments if fragment).strip()
+        full_text, used_text_layer = self._choose_document_text(ocr_text=ocr_text, text_layer=text_layer)
        summary = self._truncate_summary(aggregated.summary_fragments or aggregated.text_fragments)
+        if used_text_layer or self._placeholder_ratio(summary) >= 0.12:
+            summary = self._summarize_text(full_text)
+        preview_kind = aggregated.preview_kind
+        preview_data_url = aggregated.preview_data_url
+        if (
+            used_text_layer
+            and aggregated.media_type == "application/pdf"
+            and self._placeholder_ratio(ocr_text) >= 0.12
+        ):
+            preview_kind = ""
+            preview_data_url = ""
        insight = self.document_intelligence_service.build_document_insight(
            filename=aggregated.filename,
            summary=summary,
            text=full_text,
-            preview_data_url=aggregated.preview_data_url,
+            preview_data_url=preview_data_url,
        )
        warnings = list(aggregated.warnings)
        for warning in insight.warnings:
@@ -493,8 +558,8 @@ class OcrService:
                )
                for field in insight.fields
            ],
-            preview_kind=aggregated.preview_kind,
-            preview_data_url=aggregated.preview_data_url,
+            preview_kind=preview_kind,
+            preview_data_url=preview_data_url,
            warnings=warnings,
            lines=sorted(
                aggregated.lines,
@@ -502,6 +567,45 @@ class OcrService:
            ),
        )

+    @classmethod
+    def _choose_document_text(cls, *, ocr_text: str, text_layer: str) -> tuple[str, bool]:
+        normalized_ocr_text = cls._normalize_extracted_text(ocr_text)
+        normalized_text_layer = cls._normalize_extracted_text(text_layer)
+        if not normalized_text_layer:
+            return normalized_ocr_text, False
+        if not normalized_ocr_text:
+            return normalized_text_layer, True
+        if cls._placeholder_ratio(normalized_ocr_text) >= 0.12 and cls._meaningful_char_count(normalized_text_layer) >= 8:
+            return normalized_text_layer, True
+        if cls._meaningful_char_count(normalized_text_layer) > cls._meaningful_char_count(normalized_ocr_text) * 1.3:
+            return normalized_text_layer, True
+        return normalized_ocr_text, False
+
+    @staticmethod
+    def _normalize_extracted_text(value: str) -> str:
+        lines = [re.sub(r"[ \t]+", " ", line).strip() for line in str(value or "").replace("\r", "\n").split("\n")]
+        return "\n".join(line for line in lines if line).strip()
+
+    @staticmethod
+    def _summarize_text(value: str) -> str:
+        lines = [line.strip() for line in str(value or "").splitlines() if line.strip()]
+        summary = "；".join(lines[:3])
+        if len(summary) > 180:
+            return f"{summary[:177]}..."
+        return summary
+
+    @staticmethod
+    def _meaningful_char_count(value: str) -> int:
+        return len(re.findall(r"[0-9A-Za-z\u4e00-\u9fff]", str(value or "")))
+
+    @staticmethod
+    def _placeholder_ratio(value: str) -> float:
+        chars = [char for char in str(value or "") if not char.isspace()]
+        if not chars:
+            return 0.0
+        placeholder_count = sum(1 for char in chars if char in {"□", "\ufffd"})
+        return placeholder_count / len(chars)
+
    @staticmethod
    def _cleanup_temp_paths(paths: list[Path]) -> None:
        for path in reversed(paths):