Files
X-Financial/server/tests/test_ocr_service.py
caoxiaozhu 0264a4b5b4 refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务
- user_agent 拆分 application/locations/knowledge/response/review 四个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
2026-06-24 10:42:24 +08:00

600 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import stat
import subprocess
from pathlib import Path
from app.core.config import get_settings
from app.services import document_preview
from app.services.ocr import OcrService
def test_ocr_runtime_installers_include_cjk_safe_pdf_rendering_tools() -> None:
    """Check that each OCR runtime installer names the PDF rendering packages.

    Both compose files and both PaddleOCR bootstrap scripts are read as UTF-8
    text; each one must mention ``poppler-data`` and ``mupdf-tools``.
    """
    project_root = Path(__file__).resolve().parents[2]
    scripts_dir = project_root / "server" / "scripts"
    installer_files = (
        project_root / "docker-compose.yml",
        project_root / "docker-compose.full.yml",
        scripts_dir / "bootstrap_paddleocr_mobile.sh",
        scripts_dir / "bootstrap_paddleocr_gpu.sh",
    )
    required_packages = ("poppler-data", "mupdf-tools")

    for installer in installer_files:
        installer_text = installer.read_text(encoding="utf-8")
        for package in required_packages:
            assert package in installer_text
def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Run ``recognize_files`` against a fake worker executable.

    A small executable script is written into ``tmp_path`` and exposed through
    ``OCR_PYTHON_BIN``.  For every ``--input`` argument it receives, the script
    reports one recognised invoice document on a ``__OCR_JSON__=`` line.  The
    test then checks that the PNG upload is recognised and classified as a VAT
    invoice, while the plain-text upload stays in the result with a warning and
    no recognised lines.

    ``OcrService._result_cache`` is emptied before and after the call, as the
    other cache-aware tests in this module do, so a result cached for the same
    bytes by another test can neither leak in nor leak out.
    """
    fake_python = tmp_path / "fake-ocr-python.py"
    # The script body starts at column 0 so the shebang is the first bytes of
    # the file; its nested indentation is what makes it a valid Python script.
    fake_python.write_text(
        """#!/usr/bin/env python3
import json
import sys
inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                "summary": "增值税电子发票,金额 100 元。",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )
payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
""",
        encoding="utf-8",
    )
    # The script is used as the interpreter binary, so it must be executable.
    fake_python.chmod(fake_python.stat().st_mode | stat.S_IEXEC)

    monkeypatch.setenv("OCR_PYTHON_BIN", str(fake_python))
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("invoice.png", b"fake-image", "image/png"),
                ("notes.txt", b"plain-text", "text/plain"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert result.engine == "paddleocr_mobile"
    assert result.model == "PP-OCRv5_mobile"
    assert result.total_file_count == 2
    assert result.success_count == 1
    assert len(result.documents) == 2

    recognized = next(item for item in result.documents if item.filename == "invoice.png")
    assert recognized.summary == "增值税电子发票,金额 100 元。"
    assert recognized.line_count == 1
    assert recognized.document_type == "vat_invoice"
    assert recognized.document_type_label == "增值税发票"
    assert any(field.label == "金额" and field.value == "100元" for field in recognized.document_fields)
    assert any(field.label == "票据号码" and field.value == "12345678" for field in recognized.document_fields)
    assert any(field.label == "日期" and field.value == "2026-05-13" for field in recognized.document_fields)
    assert recognized.lines[0].text == "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13"

    skipped = next(item for item in result.documents if item.filename == "notes.txt")
    assert skipped.line_count == 0
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
def test_ocr_service_recovers_image_text_from_worker_ocr_text(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check recognition when the worker supplies only ``ocr_text``.

    The fake worker returns a document with an ``ocr_text`` value, no ``text``
    key and an empty ``lines`` list.  The test asserts that the recognised
    document still exposes the ticket text, is classified as a train ticket and
    carries the departure time, train number and amount fields.
    """

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "ocr_text": "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306",
            "avg_score": 0.92,
            "line_count": 0,
            "page_count": 1,
            "warnings": [],
            "lines": [],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    service_patches = {
        "_resolve_python_bin": lambda self: "python",
        "_resolve_worker_path": lambda self: "worker.py",
        "_invoke_worker": fake_invoke_worker,
    }
    for attribute, replacement in service_patches.items():
        monkeypatch.setattr(OcrService, attribute, replacement)

    uploads = [("train-ticket.png", b"fake-train-image", "image/png")]
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(uploads)
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "铁路电子客票" in recognized.text
    assert recognized.document_type == "train_ticket"

    # Compare (label, value) pairs instead of scanning the fields per assertion.
    extracted = [(field.label, field.value) for field in recognized.document_fields]
    assert ("列车出发时间", "2026-02-20 07:55") in extracted
    assert ("车次/航班", "G458") in extracted
    assert ("金额", "354元") in extracted
def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that ``OCR_DEVICE`` is forwarded as the worker's ``--device`` value.

    ``subprocess.run`` is replaced by a recorder that returns a successful
    process with an empty ``__OCR_JSON__`` payload; the recorded command must
    contain ``--device`` immediately followed by ``gpu:0``.
    """
    recorded_commands: list[list[str]] = []
    worker_stdout = '__OCR_JSON__={"engine":"paddleocr_mobile","model":"PP-OCRv5_mobile","documents":[]}\n'

    def fake_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
        env: dict[str, str] | None = None,
    ) -> subprocess.CompletedProcess[str]:
        recorded_commands.append(command)
        completed = subprocess.CompletedProcess(
            args=command,
            returncode=0,
            stdout=worker_stdout,
            stderr="",
        )
        return completed

    monkeypatch.setenv("OCR_DEVICE", "gpu:0")
    get_settings.cache_clear()
    monkeypatch.setattr(subprocess, "run", fake_run)
    try:
        service = OcrService()
        payload = service._invoke_worker(
            python_bin="python",
            worker_path="worker.py",
            input_paths=[tmp_path / "invoice.png"],
        )
    finally:
        get_settings.cache_clear()

    assert payload["engine"] == "paddleocr_mobile"
    first_command = recorded_commands[0]
    flag_position = first_command.index("--device")
    assert first_command[flag_position + 1] == "gpu:0"
def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that a PDF is rendered to page images and merged into one document.

    The fake converter writes two page images and reports the preview as
    usable; the fake worker verifies it receives exactly those two pages and
    returns one document per page.  The test asserts a single merged document
    with ``page_count == 2``, an image preview data URL, train-ticket
    classification, and lines whose ``page_index`` is 0 and 1 even though the
    fake worker payload carries no ``page_index``.

    ``OcrService._result_cache`` is emptied before and after the call, as the
    other cache-aware tests in this module do: another test in this module
    uploads the same ``b"%PDF-1.4 fake"`` bytes, and a result cached for them
    must not be reused here or left behind.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        first = output_dir / "page-1.png"
        second = output_dir / "page-2.png"
        first.write_bytes(b"fake-page-1")
        second.write_bytes(b"fake-page-2")
        return [first, second], True

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # The worker must be handed the rendered pages, not the PDF itself.
        assert [path.name for path in input_paths] == ["page-1.png", "page-2.png"]
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                    "summary": "高铁票第一页",
                    "avg_score": 0.97,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                            "score": 0.97,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
                {
                    "input_path": str(input_paths[1]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "乘车人 张三",
                    "summary": "高铁票第二页",
                    "avg_score": 0.94,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "乘车人 张三",
                            "score": 0.94,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert result.success_count == 1
    assert len(result.documents) == 1

    recognized = result.documents[0]
    assert recognized.filename == "train-ticket.pdf"
    assert recognized.page_count == 2
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
    assert recognized.document_type == "train_ticket"
    assert any(field.label == "金额" and field.value == "188元" for field in recognized.document_fields)
    assert any(field.label == "车次/航班" and field.value == "G1234" for field in recognized.document_fields)
    assert recognized.lines[0].page_index == 0
    assert recognized.lines[1].page_index == 1
def test_ocr_service_rejects_pdf_ocr_when_rendered_image_fonts_are_broken(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that a failed PDF render yields a warning and skips the worker.

    The fake converter raises ``RuntimeError`` and the fake worker raises
    ``AssertionError`` if it is ever called.  The test asserts that the
    returned document has no lines, no preview, and exactly the converter's
    error message as its only warning.

    ``OcrService._result_cache`` is emptied before and after the call, as the
    other cache-aware tests in this module do: another test in this module
    uploads the same ``b"%PDF-1.7 fake"`` bytes, and a result cached for them
    must not be reused here or left behind.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        raise RuntimeError("PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。")

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        raise AssertionError("PDF 转图片已确认丢中文时,不应继续调用 OCR worker。")

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("2月20_武汉-上海.pdf", b"%PDF-1.7 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    failed = result.documents[0]
    assert failed.line_count == 0
    assert failed.preview_kind == ""
    assert failed.preview_data_url == ""
    assert failed.warnings == ["PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。"]
def test_ocr_pdf_conversion_tries_next_renderer_when_poppler_font_mapping_fails(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check the fallback from ``pdftoppm`` to ``mutool`` during PDF rendering.

    ``shutil.which`` (as seen by ``document_preview``) reports both tools as
    installed.  The fake ``subprocess.run`` makes ``pdftoppm`` exit 0 but write
    a language-pack error to stderr, while any other tool succeeds silently.
    The test asserts the page list, a usable preview flag, and that the tools
    ran in the order ``pdftoppm`` then ``mutool``.
    """
    pages_dir = tmp_path / "pages"
    pages_dir.mkdir()
    rendered_page = pages_dir / "page-1.png"
    invoked_tools: list[str] = []

    def fake_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
    ) -> subprocess.CompletedProcess[str]:
        tool_name = Path(command[0]).name
        invoked_tools.append(tool_name)
        if tool_name != "pdftoppm":
            page_bytes = b"rendered-with-chinese"
            stderr_text = ""
        else:
            page_bytes = b"broken-preview"
            stderr_text = "Syntax Error: Missing language pack for 'Adobe-GB1' mapping"
        rendered_page.write_bytes(page_bytes)
        return subprocess.CompletedProcess(
            args=command,
            returncode=0,
            stdout="",
            stderr=stderr_text,
        )

    def fake_which(name):
        # Only the two PDF renderers are reported as installed.
        return f"/usr/bin/{name}" if name in {"pdftoppm", "mutool"} else None

    monkeypatch.setattr(document_preview.shutil, "which", fake_which)
    monkeypatch.setattr(subprocess, "run", fake_run)

    service = OcrService()
    pages, preview_usable = service._convert_pdf_to_images(
        pdf_path=tmp_path / "ticket.pdf",
        output_dir=pages_dir,
    )

    assert pages == [rendered_page]
    assert preview_usable is True
    assert invoked_tools == ["pdftoppm", "mutool"]
def test_ocr_service_invokes_worker_even_when_pdf_text_layer_is_usable(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that the OCR worker still runs when the PDF has a text layer.

    ``_extract_pdf_text_layer`` is patched to return readable ticket text, the
    converter produces one page image, and the fake worker counts its calls.
    The test asserts exactly one worker call plus train-ticket classification,
    the amount field and an image preview.

    ``OcrService._result_cache`` is emptied before and after the call, as the
    other cache-aware tests in this module do: another test in this module
    uploads the same ``b"%PDF-1.7 fake"`` bytes, and a cached result would
    bypass the worker and break the call-count assertion.
    """
    calls = {"worker": 0}

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        page = output_dir / "page-1.png"
        page.write_bytes(b"fake-rendered-page")
        return [page], True

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        calls["worker"] += 1
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00",
                    "summary": "铁路电子客票",
                    "avg_score": 0.95,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00",
                            "score": 0.95,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                }
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    monkeypatch.setattr(
        OcrService,
        "_extract_pdf_text_layer",
        lambda self, pdf_path: (
            "电子发票(铁路电子客票)\n"
            "发票号码:26429165800002785705\n"
            "武汉站\n"
            "上海虹桥站\n"
            "G458\n"
            "票价:¥354.00\n"
            "电子客票号:6580061086021391007342026"
        ),
    )
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.7 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert result.success_count == 1
    assert calls["worker"] == 1
    assert recognized.document_type == "train_ticket"
    assert "电子发票(铁路电子客票)" in recognized.text
    assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
def test_ocr_service_reuses_cached_document_for_same_content(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that identical bytes are recognised by the worker only once.

    Two uploads with different filenames but the same content are processed by
    two separate ``OcrService`` instances.  The fake worker records each call;
    the test asserts a single call, that each result keeps its own filename,
    and that both share the same summary.
    """
    worker_calls: list[list[Path]] = []
    invoice_text = "增值税电子发票 金额 20 元"

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_calls.append(input_paths)
        recognised_line = {
            "text": invoice_text,
            "score": 0.97,
            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
        }
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "text": invoice_text,
            "summary": "增值税电子发票,金额 20 元。",
            "avg_score": 0.97,
            "line_count": 1,
            "page_count": 1,
            "warnings": [],
            "lines": [recognised_line],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    service_patches = {
        "_resolve_python_bin": lambda self: "python",
        "_resolve_worker_path": lambda self: "worker.py",
        "_invoke_worker": fake_invoke_worker,
    }
    for attribute, replacement in service_patches.items():
        monkeypatch.setattr(OcrService, attribute, replacement)

    shared_bytes = b"same-image"
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        first = OcrService().recognize_files([("first.png", shared_bytes, "image/png")])
        second = OcrService().recognize_files([("second.png", shared_bytes, "image/png")])
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert len(worker_calls) == 1
    first_document = first.documents[0]
    second_document = second.documents[0]
    assert first_document.filename == "first.png"
    assert second_document.filename == "second.png"
    assert second_document.summary == first_document.summary
def test_ocr_cache_key_includes_pdf_render_pipeline_version() -> None:
    """Check that the cache key embeds the PDF rendering pipeline markers.

    The key built for a byte string must contain the ``pdf-image-ocr:`` prefix,
    the renderer id published by ``DocumentPreviewAssets`` and the
    ``no-pdf-direct`` marker.
    """
    service = OcrService()
    key = service._build_cache_key(b"same-pdf-content")
    renderer_id = document_preview.DocumentPreviewAssets.PDF_RENDERER_ID

    for fragment in ("pdf-image-ocr:", renderer_id, "no-pdf-direct"):
        assert fragment in key
def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that the PDF text layer wins over placeholder-filled OCR output.

    The fake worker returns text and a summary made mostly of ``□``
    placeholders, while ``_extract_pdf_text_layer`` is patched to return the
    readable ticket text.  The test asserts that the recognised text contains
    the readable strings, that the summary holds no run of four placeholders,
    and that the document is a train ticket with an image preview.

    ``OcrService._result_cache`` is emptied before and after the call, as the
    other cache-aware tests in this module do: another test in this module
    uploads the same ``b"%PDF-1.4 fake"`` bytes, and a result cached for them
    must not be reused here or left behind.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        page = output_dir / "page-1.png"
        page.write_bytes(b"fake-page")
        return [page], True

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "□□□□□□\n□□□□26319166100006175398\nG456\n□□:□354.00",
                    "summary": "□□□□□□□□□□26319166100006175398",
                    "avg_score": 0.88,
                    "line_count": 4,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "□□□□□□",
                            "score": 0.88,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                }
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    monkeypatch.setattr(
        OcrService,
        "_extract_pdf_text_layer",
        lambda self, pdf_path: (
            "电子发票(铁路电子客票)\n"
            "发票号码:26319166100006175398\n"
            "上海虹桥站\n"
            "武汉站\n"
            "G456\n"
            "票价:¥354.00"
        ),
    )
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "电子发票(铁路电子客票)" in recognized.text
    assert "上海虹桥站" in recognized.text
    assert "□□□□" not in recognized.summary
    assert recognized.document_type == "train_ticket"
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")