Files
X-Financial/server/tests/test_ocr_service.py
caoxiaozhu 88e91a5900 feat(ocr): PDF 文本层可用时跳过 worker 调用并补装 poppler-data
- OcrService 提取 PDF 文本层后若有效字符达到阈值,直接构建文档并写入结果缓存,不再触发 OCR worker,仅无文本层时才解析 python_bin/worker_path 调用 worker
- _build_text_layer_document 复用 AggregatedOcrDocument 聚合文本层片段,_has_usable_pdf_text_layer 基于 meaningful_char_count 判定
- docker-compose 与 paddleocr bootstrap 脚本补装 poppler-data,保证 PDF 文本层抽取的中文编码正确
- 新增文本层直取与运行时依赖两项 ocr_service 单测
2026-06-21 23:23:59 +08:00

421 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import stat
import subprocess
from pathlib import Path
from app.core.config import get_settings
from app.services.ocr import OcrService
def test_ocr_runtime_installers_include_poppler_cjk_data() -> None:
    """Every OCR runtime installer file must mention the ``poppler-data`` package."""
    # This file sits two directories below the repository root.
    project_root = Path(__file__).resolve().parents[2]
    scripts_dir = project_root / "server" / "scripts"
    installer_files = [
        project_root / "docker-compose.yml",
        scripts_dir / "bootstrap_paddleocr_mobile.sh",
        scripts_dir / "bootstrap_paddleocr_gpu.sh",
    ]

    for installer in installer_files:
        contents = installer.read_text(encoding="utf-8")
        assert "poppler-data" in contents
def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Run recognition through a stand-in worker executable and check both outcomes.

    A small script is installed as ``OCR_PYTHON_BIN``; it emits one canned
    document for every ``--input`` argument it receives. The PNG is expected
    to come back as a recognized VAT invoice, while the plain-text file must
    be returned as a document with no lines and a single warning.
    """
    # The stand-in worker is executed as a real script, so the text inside the
    # literal must keep valid Python indentation; otherwise it fails before
    # printing the ``__OCR_JSON__`` payload.
    fake_python = tmp_path / "fake-ocr-python.py"
    fake_python.write_text(
        """#!/usr/bin/env python3
import json
import sys
inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                "summary": "增值税电子发票,金额 100 元。",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )
payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
""",
        encoding="utf-8",
    )
    # Mark the script executable so it can be launched directly via its shebang.
    fake_python.chmod(fake_python.stat().st_mode | stat.S_IEXEC)

    monkeypatch.setenv("OCR_PYTHON_BIN", str(fake_python))
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # ``OcrService._result_cache`` lives on the class and is shared between
    # tests; reset it so a previously cached document cannot replace this run.
    OcrService._result_cache.clear()
    # Settings are cached; drop the cache so the patched environment is read.
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("invoice.png", b"fake-image", "image/png"),
                ("notes.txt", b"plain-text", "text/plain"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert result.engine == "paddleocr_mobile"
    assert result.model == "PP-OCRv5_mobile"
    assert result.total_file_count == 2
    assert result.success_count == 1
    assert len(result.documents) == 2

    # The image goes through the worker and is classified as a VAT invoice.
    recognized = next(item for item in result.documents if item.filename == "invoice.png")
    assert recognized.summary == "增值税电子发票,金额 100 元。"
    assert recognized.line_count == 1
    assert recognized.document_type == "vat_invoice"
    assert recognized.document_type_label == "增值税发票"
    assert any(field.label == "金额" and field.value == "100元" for field in recognized.document_fields)
    assert any(field.label == "票据号码" and field.value == "12345678" for field in recognized.document_fields)
    assert any(field.label == "日期" and field.value == "2026-05-13" for field in recognized.document_fields)
    assert recognized.lines[0].text == "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13"

    # The text file is kept in the result but only carries a warning.
    skipped = next(item for item in result.documents if item.filename == "notes.txt")
    assert skipped.line_count == 0
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Check that the ``OCR_DEVICE`` setting reaches the worker command as ``--device``."""
    seen_commands: list[list[str]] = []
    canned_stdout = '__OCR_JSON__={"engine":"paddleocr_mobile","model":"PP-OCRv5_mobile","documents":[]}\n'

    def recording_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
    ) -> subprocess.CompletedProcess[str]:
        # Stand-in for subprocess.run: remember the argv and report success.
        seen_commands.append(command)
        return subprocess.CompletedProcess(command, 0, stdout=canned_stdout, stderr="")

    monkeypatch.setenv("OCR_DEVICE", "gpu:0")
    # Settings are cached; drop the cache so the patched environment is read.
    get_settings.cache_clear()
    monkeypatch.setattr(subprocess, "run", recording_run)

    image_path = tmp_path / "invoice.png"
    try:
        payload = OcrService()._invoke_worker(
            python_bin="python",
            worker_path="worker.py",
            input_paths=[image_path],
        )
    finally:
        get_settings.cache_clear()

    assert payload["engine"] == "paddleocr_mobile"
    first_command = seen_commands[0]
    flag_position = first_command.index("--device")
    # The value must directly follow the flag.
    assert first_command[flag_position + 1] == "gpu:0"
def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """A PDF rendered to two page images yields one merged document with an image preview.

    PDF rendering and the worker call are both replaced by fakes: the renderer
    writes two placeholder PNG files and the worker returns one canned document
    per page. The test then checks the page count, preview kind, detected
    document type, extracted fields and per-line page indices.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        # Pretend the PDF rendered to two pages.
        first = output_dir / "page-1.png"
        second = output_dir / "page-2.png"
        first.write_bytes(b"fake-page-1")
        second.write_bytes(b"fake-page-2")
        return [first, second]

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # The worker must receive the rendered pages, in page order.
        assert [path.name for path in input_paths] == ["page-1.png", "page-2.png"]
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                    "summary": "高铁票第一页",
                    "avg_score": 0.97,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                            "score": 0.97,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
                {
                    "input_path": str(input_paths[1]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "乘车人 张三",
                    "summary": "高铁票第二页",
                    "avg_score": 0.94,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "乘车人 张三",
                            "score": 0.94,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    # ``OcrService._result_cache`` lives on the class and is shared between
    # tests. Reset it so a document cached for the same file bytes elsewhere
    # cannot short-circuit this run, and so this run leaves nothing behind.
    OcrService._result_cache.clear()
    # Settings are cached; drop the cache so the patched environment is read.
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert result.success_count == 1
    assert len(result.documents) == 1
    recognized = result.documents[0]
    assert recognized.filename == "train-ticket.pdf"
    assert recognized.page_count == 2
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
    assert recognized.document_type == "train_ticket"
    assert any(field.label == "金额" and field.value == "188元" for field in recognized.document_fields)
    assert any(field.label == "车次/航班" and field.value == "G1234" for field in recognized.document_fields)
    # The fake worker omits page_index, so the service is expected to assign it per page.
    assert recognized.lines[0].page_index == 0
    assert recognized.lines[1].page_index == 1
def test_ocr_service_uses_pdf_text_layer_without_worker_runtime(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """With a usable PDF text layer, recognition must succeed without touching the worker.

    Resolving the worker interpreter and invoking the worker are both patched
    to raise, so either call fails the test.
    """
    text_layer = "\n".join(
        [
            "电子发票(铁路电子客票)",
            "发票号码:26429165800002785705",
            "武汉站",
            "上海虹桥站",
            "G458",
            "票价:¥354.00",
            "电子客票号:6580061086021391007342026",
        ]
    )

    def render_single_page(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        # One placeholder page image for the rendering step.
        rendered = output_dir / "page-1.png"
        rendered.write_bytes(b"fake-rendered-page")
        return [rendered]

    def forbid_python_resolution(self) -> str:
        raise AssertionError("PDF 文本层可用时不应强制解析 OCR worker。")

    def forbid_worker_call(self, **kwargs) -> dict:
        raise AssertionError("PDF 文本层可用时不应调用 OCR worker。")

    def canned_text_layer(self, pdf_path):
        return text_layer

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", forbid_python_resolution)
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", render_single_page)
    monkeypatch.setattr(OcrService, "_invoke_worker", forbid_worker_call)
    monkeypatch.setattr(OcrService, "_extract_pdf_text_layer", canned_text_layer)
    # Settings are cached; drop the cache so the patched environment is read.
    get_settings.cache_clear()

    uploads = [("train-ticket.pdf", b"%PDF-1.7 fake", "application/pdf")]
    try:
        result = OcrService().recognize_files(uploads)
    finally:
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert result.success_count == 1
    assert recognized.document_type == "train_ticket"
    assert "电子发票(铁路电子客票)" in recognized.text
    assert "电子客票号:6580061086021391007342026" in recognized.text
    assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
def test_ocr_service_reuses_cached_document_for_same_content(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Two uploads with identical bytes must trigger a single worker call.

    Each returned document keeps the filename of its own upload, while the
    summary is shared between the two results.
    """
    worker_invocations: list[list[Path]] = []

    def counting_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # Record every call so the test can count how often the worker ran.
        worker_invocations.append(input_paths)
        recognized_line = {
            "text": "增值税电子发票 金额 20 元",
            "score": 0.97,
            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
        }
        document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "text": "增值税电子发票 金额 20 元",
            "summary": "增值税电子发票,金额 20 元。",
            "avg_score": 0.97,
            "line_count": 1,
            "page_count": 1,
            "warnings": [],
            "lines": [recognized_line],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_invoke_worker", counting_worker)
    # Start from an empty result cache and freshly loaded settings.
    OcrService._result_cache.clear()
    get_settings.cache_clear()

    same_bytes = b"same-image"
    try:
        first = OcrService().recognize_files([("first.png", same_bytes, "image/png")])
        second = OcrService().recognize_files([("second.png", same_bytes, "image/png")])
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert len(worker_invocations) == 1
    first_document = first.documents[0]
    second_document = second.documents[0]
    assert first_document.filename == "first.png"
    assert second_document.filename == "second.png"
    assert second_document.summary == first_document.summary
def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """The readable PDF text layer must win over OCR output full of placeholder glyphs.

    The fake worker returns text dominated by ``□`` characters, while the
    patched text-layer extractor returns clean ticket text. The resulting
    document is expected to carry the clean text, a placeholder-free summary,
    the train-ticket type and an image preview.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        # One placeholder page image for the rendering step.
        page = output_dir / "page-1.png"
        page.write_bytes(b"fake-page")
        return [page]

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # OCR result in which most characters are unreadable placeholders.
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "□□□□□□\n□□□□26319166100006175398\nG456\n□□:□354.00",
                    "summary": "□□□□□□□□□□26319166100006175398",
                    "avg_score": 0.88,
                    "line_count": 4,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "□□□□□□",
                            "score": 0.88,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                }
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    monkeypatch.setattr(
        OcrService,
        "_extract_pdf_text_layer",
        lambda self, pdf_path: (
            "电子发票(铁路电子客票)\n"
            "发票号码:26319166100006175398\n"
            "上海虹桥站\n"
            "武汉站\n"
            "G456\n"
            "票价:¥354.00"
        ),
    )
    # ``OcrService._result_cache`` lives on the class and is shared between
    # tests. Reset it so a document cached for the same file bytes elsewhere
    # cannot be returned instead of the text-layer result checked below.
    OcrService._result_cache.clear()
    # Settings are cached; drop the cache so the patched environment is read.
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "电子发票(铁路电子客票)" in recognized.text
    assert "上海虹桥站" in recognized.text
    assert "□□□□" not in recognized.summary
    assert recognized.document_type == "train_ticket"
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")