refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务
- user_agent 拆分 application/locations/knowledge/response/review 四个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
This commit is contained in:
@@ -16,11 +16,13 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import SERVER_DIR, get_settings
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead, OcrRecognizeLineRead
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.document_intelligence import DocumentIntelligenceService
|
||||
|
||||
WORKER_JSON_PREFIX = "__OCR_JSON__="
|
||||
SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"}
|
||||
OCR_RESULT_CACHE_LIMIT = 32
|
||||
OCR_RESULT_CACHE_PIPELINE_VERSION = f"pdf-image-ocr:{DocumentPreviewAssets.PDF_RENDERER_ID}:no-pdf-direct-v2"
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@@ -142,16 +144,6 @@ class OcrService:
|
||||
cleanup_paths=cleanup_paths,
|
||||
text_layer=text_layer,
|
||||
)
|
||||
if self._has_usable_pdf_text_layer(text_layer):
|
||||
document = self._build_text_layer_document(
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
text_layer=text_layer,
|
||||
pdf_inputs=pdf_inputs,
|
||||
)
|
||||
documents.append(document)
|
||||
self._write_cached_document(cache_key, document)
|
||||
continue
|
||||
prepared_inputs.extend(pdf_inputs)
|
||||
for item in pdf_inputs:
|
||||
cache_keys_by_source.setdefault(item.source_key, cache_key)
|
||||
@@ -257,6 +249,7 @@ class OcrService:
|
||||
digest = hashlib.sha256(content).hexdigest()
|
||||
return "|".join(
|
||||
[
|
||||
OCR_RESULT_CACHE_PIPELINE_VERSION,
|
||||
self.settings.ocr_language,
|
||||
self.settings.ocr_device,
|
||||
self.settings.ocr_text_detection_model,
|
||||
@@ -406,11 +399,15 @@ class OcrService:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
cleanup_paths.append(output_dir)
|
||||
|
||||
image_paths = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
|
||||
image_paths, preview_usable = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
|
||||
if not image_paths:
|
||||
raise RuntimeError("PDF 转图片后未生成可识别页面。")
|
||||
|
||||
preview_data_url = self._build_preview_data_url(image_paths[0], media_type="image/png")
|
||||
preview_data_url = (
|
||||
self._build_preview_data_url(image_paths[0], media_type="image/png")
|
||||
if preview_usable
|
||||
else ""
|
||||
)
|
||||
source_key = uuid4().hex
|
||||
descriptors: list[PreparedOcrInput] = []
|
||||
for page_index, image_path in enumerate(image_paths):
|
||||
@@ -421,7 +418,7 @@ class OcrService:
|
||||
filename=filename,
|
||||
media_type=media_type,
|
||||
page_index=page_index,
|
||||
preview_kind="image" if page_index == 0 else "",
|
||||
preview_kind="image" if page_index == 0 and preview_data_url else "",
|
||||
preview_data_url=preview_data_url if page_index == 0 else "",
|
||||
text_layer=text_layer if page_index == 0 else "",
|
||||
)
|
||||
@@ -450,27 +447,17 @@ class OcrService:
|
||||
|
||||
return self._normalize_extracted_text(completed.stdout)
|
||||
|
||||
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
|
||||
prefix = output_dir / "page"
|
||||
completed = subprocess.run(
|
||||
[
|
||||
"pdftoppm",
|
||||
"-png",
|
||||
"-r",
|
||||
"160",
|
||||
str(pdf_path),
|
||||
str(prefix),
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.settings.ocr_timeout_seconds,
|
||||
check=False,
|
||||
)
|
||||
if completed.returncode != 0:
|
||||
detail = (completed.stderr or completed.stdout or "").strip()
|
||||
raise RuntimeError(f"PDF 转图片失败:{detail or 'pdftoppm 返回非 0 状态码。'}")
|
||||
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
|
||||
try:
|
||||
pages = DocumentPreviewAssets.render_pdf_pages(
|
||||
pdf_path=pdf_path,
|
||||
output_dir=output_dir,
|
||||
timeout_seconds=self.settings.ocr_timeout_seconds,
|
||||
)
|
||||
except RuntimeError as exc:
|
||||
raise RuntimeError(f"PDF 转图片失败:{exc}") from exc
|
||||
|
||||
return sorted(output_dir.glob("page-*.png"), key=self._extract_pdf_page_sort_key)
|
||||
return pages, True
|
||||
|
||||
@staticmethod
|
||||
def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
|
||||
@@ -595,30 +582,6 @@ class OcrService:
|
||||
|
||||
return documents
|
||||
|
||||
def _build_text_layer_document(
|
||||
self,
|
||||
*,
|
||||
filename: str,
|
||||
media_type: str,
|
||||
text_layer: str,
|
||||
pdf_inputs: list[PreparedOcrInput],
|
||||
) -> OcrRecognizeDocumentRead:
|
||||
first_input = pdf_inputs[0] if pdf_inputs else None
|
||||
aggregated = AggregatedOcrDocument(
|
||||
filename=filename,
|
||||
media_type=media_type,
|
||||
source_key=first_input.source_key if first_input is not None else uuid4().hex,
|
||||
page_count=max(1, len(pdf_inputs)),
|
||||
preview_kind=str(first_input.preview_kind if first_input is not None else ""),
|
||||
preview_data_url=str(first_input.preview_data_url if first_input is not None else ""),
|
||||
)
|
||||
aggregated.text_layer_fragments.append(text_layer)
|
||||
return self._finalize_document(aggregated)
|
||||
|
||||
@classmethod
|
||||
def _has_usable_pdf_text_layer(cls, text_layer: str) -> bool:
|
||||
return cls._meaningful_char_count(text_layer) >= 8
|
||||
|
||||
@staticmethod
|
||||
def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
|
||||
for descriptor in descriptors:
|
||||
@@ -685,13 +648,6 @@ class OcrService:
|
||||
summary = self._summarize_text(full_text)
|
||||
preview_kind = aggregated.preview_kind
|
||||
preview_data_url = aggregated.preview_data_url
|
||||
if (
|
||||
used_text_layer
|
||||
and aggregated.media_type == "application/pdf"
|
||||
and self._placeholder_ratio(ocr_text) >= 0.12
|
||||
):
|
||||
preview_kind = ""
|
||||
preview_data_url = ""
|
||||
insight = self.document_intelligence_service.build_document_insight(
|
||||
filename=aggregated.filename,
|
||||
summary=summary,
|
||||
|
||||
Reference in New Issue
Block a user