feat: 增加差旅报销标准测算和财务终审流程

新增差旅报销测算接口及 Spreadsheet 规则解析,审批流程拆分
直属领导审批与财务终审两阶段并细分权限,修复 PDF 文本层
缺失时自动回退 OCR,提交后清理关联会话,前端适配审批流
交互并补充单元测试。
This commit is contained in:
caoxiaozhu
2026-05-21 09:28:33 +08:00
parent 002bf4f756
commit 8f65661809
43 changed files with 4366 additions and 410 deletions

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import base64
import json
import re
import shutil
import subprocess
from dataclasses import dataclass, field
@@ -27,6 +28,7 @@ class PreparedOcrInput:
page_index: int | None = None
preview_kind: str = ""
preview_data_url: str = ""
text_layer: str = ""
@dataclass(slots=True)
@@ -38,6 +40,7 @@ class AggregatedOcrDocument:
model: str = "PP-OCRv5_mobile"
summary_fragments: list[str] = field(default_factory=list)
text_fragments: list[str] = field(default_factory=list)
text_layer_fragments: list[str] = field(default_factory=list)
score_values: list[float] = field(default_factory=list)
warnings: list[str] = field(default_factory=list)
lines: list[OcrRecognizeLineRead] = field(default_factory=list)
@@ -112,12 +115,14 @@ class OcrService:
if suffix == ".pdf":
try:
text_layer = self._extract_pdf_text_layer(temp_path)
prepared_inputs.extend(
self._prepare_pdf_inputs(
pdf_path=temp_path,
filename=normalized_name,
media_type=resolved_media_type,
cleanup_paths=cleanup_paths,
text_layer=text_layer,
)
)
except RuntimeError as exc:
@@ -261,6 +266,7 @@ class OcrService:
filename: str,
media_type: str,
cleanup_paths: list[Path],
text_layer: str = "",
) -> list[PreparedOcrInput]:
output_dir = pdf_path.with_suffix("")
output_dir.mkdir(parents=True, exist_ok=True)
@@ -283,10 +289,33 @@ class OcrService:
page_index=page_index,
preview_kind="image" if page_index == 0 else "",
preview_data_url=preview_data_url if page_index == 0 else "",
text_layer=text_layer if page_index == 0 else "",
)
)
return descriptors
def _extract_pdf_text_layer(self, pdf_path: Path) -> str:
try:
completed = subprocess.run(
[
"pdftotext",
"-layout",
str(pdf_path),
"-",
],
capture_output=True,
text=True,
timeout=self.settings.ocr_timeout_seconds,
check=False,
)
except (OSError, subprocess.SubprocessError, UnicodeError):
return ""
if completed.returncode != 0:
return ""
return self._normalize_extracted_text(completed.stdout)
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
prefix = output_dir / "page"
completed = subprocess.run(
@@ -367,6 +396,8 @@ class OcrService:
aggregated.preview_kind = descriptor.preview_kind
if descriptor.preview_data_url and not aggregated.preview_data_url:
aggregated.preview_data_url = descriptor.preview_data_url
if descriptor.text_layer and descriptor.text_layer not in aggregated.text_layer_fragments:
aggregated.text_layer_fragments.append(descriptor.text_layer)
page_summary = str(payload.get("summary", "") or "").strip()
if page_summary:
@@ -401,6 +432,20 @@ class OcrService:
aggregated = aggregated_by_source.get(source_key)
if aggregated is None:
first_descriptor = descriptors[0]
text_layer = self._collect_descriptor_text_layer(descriptors)
if text_layer:
fallback = AggregatedOcrDocument(
filename=first_descriptor.filename,
media_type=first_descriptor.media_type,
source_key=first_descriptor.source_key,
page_count=max(1, len(descriptors)),
preview_kind=first_descriptor.preview_kind,
preview_data_url=first_descriptor.preview_data_url,
warnings=["OCR worker 未返回该文件的识别结果,已使用 PDF 文本层。"],
)
fallback.text_layer_fragments.append(text_layer)
documents.append(self._finalize_document(fallback))
continue
documents.append(
OcrRecognizeDocumentRead(
filename=first_descriptor.filename,
@@ -416,6 +461,13 @@ class OcrService:
return documents
@staticmethod
def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
for descriptor in descriptors:
if descriptor.text_layer:
return descriptor.text_layer
return ""
@staticmethod
def _build_lines(
items: list[dict],
@@ -451,13 +503,26 @@ class OcrService:
return summary
def _finalize_document(self, aggregated: AggregatedOcrDocument) -> OcrRecognizeDocumentRead:
full_text = "\n".join(fragment for fragment in aggregated.text_fragments if fragment).strip()
ocr_text = "\n".join(fragment for fragment in aggregated.text_fragments if fragment).strip()
text_layer = "\n".join(fragment for fragment in aggregated.text_layer_fragments if fragment).strip()
full_text, used_text_layer = self._choose_document_text(ocr_text=ocr_text, text_layer=text_layer)
summary = self._truncate_summary(aggregated.summary_fragments or aggregated.text_fragments)
if used_text_layer or self._placeholder_ratio(summary) >= 0.12:
summary = self._summarize_text(full_text)
preview_kind = aggregated.preview_kind
preview_data_url = aggregated.preview_data_url
if (
used_text_layer
and aggregated.media_type == "application/pdf"
and self._placeholder_ratio(ocr_text) >= 0.12
):
preview_kind = ""
preview_data_url = ""
insight = self.document_intelligence_service.build_document_insight(
filename=aggregated.filename,
summary=summary,
text=full_text,
preview_data_url=aggregated.preview_data_url,
preview_data_url=preview_data_url,
)
warnings = list(aggregated.warnings)
for warning in insight.warnings:
@@ -493,8 +558,8 @@ class OcrService:
)
for field in insight.fields
],
preview_kind=aggregated.preview_kind,
preview_data_url=aggregated.preview_data_url,
preview_kind=preview_kind,
preview_data_url=preview_data_url,
warnings=warnings,
lines=sorted(
aggregated.lines,
@@ -502,6 +567,45 @@ class OcrService:
),
)
@classmethod
def _choose_document_text(cls, *, ocr_text: str, text_layer: str) -> tuple[str, bool]:
    """Pick between OCR output and the PDF text layer for a document.

    Both inputs are normalized first. The text layer is preferred when:

    * the OCR text is empty, or
    * at least 12% of the OCR text's non-whitespace characters are
      placeholders and the text layer has at least 8 meaningful
      characters, or
    * the text layer has more than 1.3 times as many meaningful characters
      as the OCR text.

    Returns:
        ``(text, used_text_layer)`` where ``used_text_layer`` is ``True``
        only when the returned text comes from the text layer.
    """
    ocr_clean = cls._normalize_extracted_text(ocr_text)
    layer_clean = cls._normalize_extracted_text(text_layer)

    # Guard clauses: one side missing makes the choice trivial.
    if not layer_clean:
        return ocr_clean, False
    if not ocr_clean:
        return layer_clean, True

    layer_chars = cls._meaningful_char_count(layer_clean)
    ocr_is_garbled = cls._placeholder_ratio(ocr_clean) >= 0.12
    layer_is_richer = layer_chars > cls._meaningful_char_count(ocr_clean) * 1.3

    prefer_layer = (ocr_is_garbled and layer_chars >= 8) or layer_is_richer
    if prefer_layer:
        return layer_clean, True
    return ocr_clean, False
@staticmethod
def _normalize_extracted_text(value: str) -> str:
lines = [re.sub(r"[ \t]+", " ", line).strip() for line in str(value or "").replace("\r", "\n").split("\n")]
return "\n".join(line for line in lines if line).strip()
@staticmethod
def _summarize_text(value: str) -> str:
lines = [line.strip() for line in str(value or "").splitlines() if line.strip()]
summary = "".join(lines[:3])
if len(summary) > 180:
return f"{summary[:177]}..."
return summary
@staticmethod
def _meaningful_char_count(value: str) -> int:
return len(re.findall(r"[0-9A-Za-z\u4e00-\u9fff]", str(value or "")))
@staticmethod
def _placeholder_ratio(value: str) -> float:
chars = [char for char in str(value or "") if not char.isspace()]
if not chars:
return 0.0
placeholder_count = sum(1 for char in chars if char in {"", "<EFBFBD>"})
return placeholder_count / len(chars)
@staticmethod
def _cleanup_temp_paths(paths: list[Path]) -> None:
for path in reversed(paths):