feat: 增加差旅报销标准测算和财务终审流程
新增差旅报销测算接口及 Spreadsheet 规则解析,审批流程拆分为直属领导审批与财务终审两阶段并细分权限,修复 PDF 文本层缺失时自动回退 OCR,提交后清理关联会话,前端适配审批流交互并补充单元测试。
This commit is contained in:
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from dataclasses import dataclass, field
|
||||
@@ -27,6 +28,7 @@ class PreparedOcrInput:
|
||||
page_index: int | None = None
|
||||
preview_kind: str = ""
|
||||
preview_data_url: str = ""
|
||||
text_layer: str = ""
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@@ -38,6 +40,7 @@ class AggregatedOcrDocument:
|
||||
model: str = "PP-OCRv5_mobile"
|
||||
summary_fragments: list[str] = field(default_factory=list)
|
||||
text_fragments: list[str] = field(default_factory=list)
|
||||
text_layer_fragments: list[str] = field(default_factory=list)
|
||||
score_values: list[float] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
lines: list[OcrRecognizeLineRead] = field(default_factory=list)
|
||||
@@ -112,12 +115,14 @@ class OcrService:
|
||||
|
||||
if suffix == ".pdf":
|
||||
try:
|
||||
text_layer = self._extract_pdf_text_layer(temp_path)
|
||||
prepared_inputs.extend(
|
||||
self._prepare_pdf_inputs(
|
||||
pdf_path=temp_path,
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
cleanup_paths=cleanup_paths,
|
||||
text_layer=text_layer,
|
||||
)
|
||||
)
|
||||
except RuntimeError as exc:
|
||||
@@ -261,6 +266,7 @@ class OcrService:
|
||||
filename: str,
|
||||
media_type: str,
|
||||
cleanup_paths: list[Path],
|
||||
text_layer: str = "",
|
||||
) -> list[PreparedOcrInput]:
|
||||
output_dir = pdf_path.with_suffix("")
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -283,10 +289,33 @@ class OcrService:
|
||||
page_index=page_index,
|
||||
preview_kind="image" if page_index == 0 else "",
|
||||
preview_data_url=preview_data_url if page_index == 0 else "",
|
||||
text_layer=text_layer if page_index == 0 else "",
|
||||
)
|
||||
)
|
||||
return descriptors
|
||||
|
||||
def _extract_pdf_text_layer(self, pdf_path: Path) -> str:
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
[
|
||||
"pdftotext",
|
||||
"-layout",
|
||||
str(pdf_path),
|
||||
"-",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.settings.ocr_timeout_seconds,
|
||||
check=False,
|
||||
)
|
||||
except (OSError, subprocess.SubprocessError, UnicodeError):
|
||||
return ""
|
||||
|
||||
if completed.returncode != 0:
|
||||
return ""
|
||||
|
||||
return self._normalize_extracted_text(completed.stdout)
|
||||
|
||||
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
|
||||
prefix = output_dir / "page"
|
||||
completed = subprocess.run(
|
||||
@@ -367,6 +396,8 @@ class OcrService:
|
||||
aggregated.preview_kind = descriptor.preview_kind
|
||||
if descriptor.preview_data_url and not aggregated.preview_data_url:
|
||||
aggregated.preview_data_url = descriptor.preview_data_url
|
||||
if descriptor.text_layer and descriptor.text_layer not in aggregated.text_layer_fragments:
|
||||
aggregated.text_layer_fragments.append(descriptor.text_layer)
|
||||
|
||||
page_summary = str(payload.get("summary", "") or "").strip()
|
||||
if page_summary:
|
||||
@@ -401,6 +432,20 @@ class OcrService:
|
||||
aggregated = aggregated_by_source.get(source_key)
|
||||
if aggregated is None:
|
||||
first_descriptor = descriptors[0]
|
||||
text_layer = self._collect_descriptor_text_layer(descriptors)
|
||||
if text_layer:
|
||||
fallback = AggregatedOcrDocument(
|
||||
filename=first_descriptor.filename,
|
||||
media_type=first_descriptor.media_type,
|
||||
source_key=first_descriptor.source_key,
|
||||
page_count=max(1, len(descriptors)),
|
||||
preview_kind=first_descriptor.preview_kind,
|
||||
preview_data_url=first_descriptor.preview_data_url,
|
||||
warnings=["OCR worker 未返回该文件的识别结果,已使用 PDF 文本层。"],
|
||||
)
|
||||
fallback.text_layer_fragments.append(text_layer)
|
||||
documents.append(self._finalize_document(fallback))
|
||||
continue
|
||||
documents.append(
|
||||
OcrRecognizeDocumentRead(
|
||||
filename=first_descriptor.filename,
|
||||
@@ -416,6 +461,13 @@ class OcrService:
|
||||
|
||||
return documents
|
||||
|
||||
@staticmethod
|
||||
def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
|
||||
for descriptor in descriptors:
|
||||
if descriptor.text_layer:
|
||||
return descriptor.text_layer
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _build_lines(
|
||||
items: list[dict],
|
||||
@@ -451,13 +503,26 @@ class OcrService:
|
||||
return summary
|
||||
|
||||
def _finalize_document(self, aggregated: AggregatedOcrDocument) -> OcrRecognizeDocumentRead:
|
||||
full_text = "\n".join(fragment for fragment in aggregated.text_fragments if fragment).strip()
|
||||
ocr_text = "\n".join(fragment for fragment in aggregated.text_fragments if fragment).strip()
|
||||
text_layer = "\n".join(fragment for fragment in aggregated.text_layer_fragments if fragment).strip()
|
||||
full_text, used_text_layer = self._choose_document_text(ocr_text=ocr_text, text_layer=text_layer)
|
||||
summary = self._truncate_summary(aggregated.summary_fragments or aggregated.text_fragments)
|
||||
if used_text_layer or self._placeholder_ratio(summary) >= 0.12:
|
||||
summary = self._summarize_text(full_text)
|
||||
preview_kind = aggregated.preview_kind
|
||||
preview_data_url = aggregated.preview_data_url
|
||||
if (
|
||||
used_text_layer
|
||||
and aggregated.media_type == "application/pdf"
|
||||
and self._placeholder_ratio(ocr_text) >= 0.12
|
||||
):
|
||||
preview_kind = ""
|
||||
preview_data_url = ""
|
||||
insight = self.document_intelligence_service.build_document_insight(
|
||||
filename=aggregated.filename,
|
||||
summary=summary,
|
||||
text=full_text,
|
||||
preview_data_url=aggregated.preview_data_url,
|
||||
preview_data_url=preview_data_url,
|
||||
)
|
||||
warnings = list(aggregated.warnings)
|
||||
for warning in insight.warnings:
|
||||
@@ -493,8 +558,8 @@ class OcrService:
|
||||
)
|
||||
for field in insight.fields
|
||||
],
|
||||
preview_kind=aggregated.preview_kind,
|
||||
preview_data_url=aggregated.preview_data_url,
|
||||
preview_kind=preview_kind,
|
||||
preview_data_url=preview_data_url,
|
||||
warnings=warnings,
|
||||
lines=sorted(
|
||||
aggregated.lines,
|
||||
@@ -502,6 +567,45 @@ class OcrService:
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
def _choose_document_text(cls, *, ocr_text: str, text_layer: str) -> tuple[str, bool]:
    """Pick the document text from the OCR output or the PDF text layer.

    Both candidates are normalized first. The text layer wins when the OCR
    text is empty, when the OCR text has a placeholder ratio of at least
    0.12 and the text layer has at least 8 meaningful characters, or when
    the text layer has more than 1.3 times the meaningful characters of
    the OCR text.

    Returns:
        ``(text, used_text_layer)`` where the flag is True when the text
        layer was chosen.
    """
    ocr_candidate = cls._normalize_extracted_text(ocr_text)
    layer_candidate = cls._normalize_extracted_text(text_layer)

    keep_ocr = (ocr_candidate, False)
    use_layer = (layer_candidate, True)

    # Without a text layer there is nothing to compare against.
    if not layer_candidate:
        return keep_ocr
    if not ocr_candidate:
        return use_layer

    layer_chars = cls._meaningful_char_count(layer_candidate)
    ocr_is_garbled = cls._placeholder_ratio(ocr_candidate) >= 0.12
    if ocr_is_garbled and layer_chars >= 8:
        return use_layer

    if layer_chars > cls._meaningful_char_count(ocr_candidate) * 1.3:
        return use_layer

    return keep_ocr
|
||||
|
||||
@staticmethod
|
||||
def _normalize_extracted_text(value: str) -> str:
|
||||
lines = [re.sub(r"[ \t]+", " ", line).strip() for line in str(value or "").replace("\r", "\n").split("\n")]
|
||||
return "\n".join(line for line in lines if line).strip()
|
||||
|
||||
@staticmethod
|
||||
def _summarize_text(value: str) -> str:
|
||||
lines = [line.strip() for line in str(value or "").splitlines() if line.strip()]
|
||||
summary = ";".join(lines[:3])
|
||||
if len(summary) > 180:
|
||||
return f"{summary[:177]}..."
|
||||
return summary
|
||||
|
||||
@staticmethod
|
||||
def _meaningful_char_count(value: str) -> int:
|
||||
return len(re.findall(r"[0-9A-Za-z\u4e00-\u9fff]", str(value or "")))
|
||||
|
||||
@staticmethod
|
||||
def _placeholder_ratio(value: str) -> float:
|
||||
chars = [char for char in str(value or "") if not char.isspace()]
|
||||
if not chars:
|
||||
return 0.0
|
||||
placeholder_count = sum(1 for char in chars if char in {"□", "<EFBFBD>"})
|
||||
return placeholder_count / len(chars)
|
||||
|
||||
@staticmethod
|
||||
def _cleanup_temp_paths(paths: list[Path]) -> None:
|
||||
for path in reversed(paths):
|
||||
|
||||
Reference in New Issue
Block a user