Files
X-Financial/server/tests/test_ocr_service.py
caoxiaozhu 9a5ed0e94a feat(server): 系统缓存清理接口与 OCR 文本层兜底增强
- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
2026-06-24 12:35:51 +08:00

662 lines
24 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import stat
import subprocess
from pathlib import Path
from app.core.config import get_settings
from app.services import document_preview
from app.services.ocr import OcrService
def test_ocr_runtime_installers_include_cjk_safe_pdf_rendering_tools() -> None:
    """Check that every OCR runtime installer mentions the PDF rendering packages.

    The compose files and bootstrap scripts listed below are read as UTF-8
    text, and each one must contain both ``poppler-data`` and ``mupdf-tools``.
    """
    # Two levels above this test file's directory is treated as the repo root.
    project_root = Path(__file__).resolve().parents[2]
    bootstrap_dir = project_root / "server" / "scripts"
    installer_files = (
        project_root / "docker-compose.yml",
        project_root / "docker-compose.full.yml",
        bootstrap_dir / "bootstrap_paddleocr_mobile.sh",
        bootstrap_dir / "bootstrap_paddleocr_gpu.sh",
    )
    required_packages = ("poppler-data", "mupdf-tools")

    for installer_file in installer_files:
        installer_text = installer_file.read_text(encoding="utf-8")
        for package in required_packages:
            assert package in installer_text
def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Run OCR through a fake worker executable and check both outcomes.

    ``OCR_PYTHON_BIN`` is pointed at an executable script that prints an
    ``__OCR_JSON__=`` payload containing one document per ``--input``
    argument.  The PNG upload must come back recognized as a VAT invoice with
    extracted fields, while the plain-text upload must still appear in the
    result, carrying only an "unsupported file" warning.

    Args:
        monkeypatch: pytest fixture used to set environment variables.
        tmp_path: pytest temporary directory holding the fake worker and the
            storage root.
    """
    fake_python = tmp_path / "fake-ocr-python.py"
    fake_python.write_text(
        """#!/usr/bin/env python3
import json
import sys
inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                "summary": "增值税电子发票,金额 100 元。",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )
payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
""",
        encoding="utf-8",
    )
    # The script is launched directly via its shebang, so it must be executable.
    fake_python.chmod(fake_python.stat().st_mode | stat.S_IEXEC)
    monkeypatch.setenv("OCR_PYTHON_BIN", str(fake_python))
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # The result cache is stored on the OcrService class and is shared by all
    # tests; reset it so a result cached for the same bytes elsewhere cannot
    # bypass the fake worker, and so this test leaves nothing behind.
    OcrService._result_cache.clear()
    # Settings are cached; drop them so the environment overrides are read.
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("invoice.png", b"fake-image", "image/png"),
                ("notes.txt", b"plain-text", "text/plain"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert result.engine == "paddleocr_mobile"
    assert result.model == "PP-OCRv5_mobile"
    assert result.total_file_count == 2
    assert result.success_count == 1
    assert len(result.documents) == 2

    recognized = next(item for item in result.documents if item.filename == "invoice.png")
    assert recognized.summary == "增值税电子发票,金额 100 元。"
    assert recognized.line_count == 1
    assert recognized.document_type == "vat_invoice"
    assert recognized.document_type_label == "增值税发票"
    assert any(field.label == "金额" and field.value == "100元" for field in recognized.document_fields)
    assert any(field.label == "票据号码" and field.value == "12345678" for field in recognized.document_fields)
    assert any(field.label == "日期" and field.value == "2026-05-13" for field in recognized.document_fields)
    assert recognized.lines[0].text == "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13"

    skipped = next(item for item in result.documents if item.filename == "notes.txt")
    assert skipped.line_count == 0
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
def test_ocr_service_recovers_image_text_from_worker_ocr_text(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that a worker document carrying only ``ocr_text`` is recognized.

    The stubbed worker returns an empty ``lines`` list and a ``line_count`` of
    0; the text is supplied solely through the ``ocr_text`` key.  The service
    result must still expose that text and classify the image as a train
    ticket with the expected fields.
    """
    ticket_text = "铁路电子客票 武汉-上海 2026 02 20 07:55 G458 : 354.00 12306 95306"

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "ocr_text": ticket_text,
            "avg_score": 0.92,
            "line_count": 0,
            "page_count": 1,
            "warnings": [],
            "lines": [],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    stubs = (
        ("_resolve_python_bin", lambda self: "python"),
        ("_resolve_worker_path", lambda self: "worker.py"),
        ("_invoke_worker", fake_invoke_worker),
    )
    for attribute, replacement in stubs:
        monkeypatch.setattr(OcrService, attribute, replacement)

    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        uploads = [("train-ticket.png", b"fake-train-image", "image/png")]
        result = OcrService().recognize_files(uploads)
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]

    def has_field(label: str, value: str) -> bool:
        return any(
            field.label == label and field.value == value
            for field in recognized.document_fields
        )

    assert "铁路电子客票" in recognized.text
    assert recognized.document_type == "train_ticket"
    assert has_field("列车出发时间", "2026-02-20 07:55")
    assert has_field("车次/航班", "G458")
    assert has_field("金额", "354元")
def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that the ``OCR_DEVICE`` setting reaches the worker command line.

    ``subprocess.run`` is replaced by a recorder that returns a successful,
    empty OCR payload.  The recorded command must contain ``--device``
    immediately followed by the configured value ``gpu:0``.
    """
    recorded: list[list[str]] = []
    worker_stdout = '__OCR_JSON__={"engine":"paddleocr_mobile","model":"PP-OCRv5_mobile","documents":[]}\n'

    def fake_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
        env: dict[str, str] | None = None,
    ) -> subprocess.CompletedProcess[str]:
        recorded.append(command)
        return subprocess.CompletedProcess(
            args=command,
            returncode=0,
            stdout=worker_stdout,
            stderr="",
        )

    monkeypatch.setattr(subprocess, "run", fake_run)
    monkeypatch.setenv("OCR_DEVICE", "gpu:0")
    # Settings are cached; clear them so the new environment value is picked up.
    get_settings.cache_clear()
    try:
        payload = OcrService()._invoke_worker(
            python_bin="python",
            worker_path="worker.py",
            input_paths=[tmp_path / "invoice.png"],
        )
    finally:
        get_settings.cache_clear()

    assert payload["engine"] == "paddleocr_mobile"
    worker_command = recorded[0]
    flag_position = worker_command.index("--device")
    assert worker_command[flag_position + 1] == "gpu:0"
def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that a two-page PDF is rendered, OCR'd per page and merged.

    PDF rendering is stubbed to write two page images, and the worker is
    stubbed to return one document per page.  The service must return a single
    document for the PDF with ``page_count == 2``, an image preview, train
    ticket fields, and lines whose ``page_index`` follows the page order even
    though the stubbed worker lines carry no ``page_index``.

    Args:
        monkeypatch: pytest fixture used to stub ``OcrService`` internals.
        tmp_path: pytest temporary directory used as the storage root.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        first = output_dir / "page-1.png"
        second = output_dir / "page-2.png"
        first.write_bytes(b"fake-page-1")
        second.write_bytes(b"fake-page-2")
        return [first, second], True

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # The worker must receive the rendered pages, not the PDF itself.
        assert [path.name for path in input_paths] == ["page-1.png", "page-2.png"]
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                    "summary": "高铁票第一页",
                    "avg_score": 0.97,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                            "score": 0.97,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
                {
                    "input_path": str(input_paths[1]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "乘车人 张三",
                    "summary": "高铁票第二页",
                    "avg_score": 0.94,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "乘车人 张三",
                            "score": 0.94,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    # The result cache is stored on the OcrService class and is shared by all
    # tests. Another test in this module uploads the same PDF bytes, so reset
    # the cache before and after to keep the outcome independent of test order.
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    assert result.success_count == 1
    assert len(result.documents) == 1

    recognized = result.documents[0]
    assert recognized.filename == "train-ticket.pdf"
    assert recognized.page_count == 2
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
    assert recognized.document_type == "train_ticket"
    assert any(field.label == "金额" and field.value == "188元" for field in recognized.document_fields)
    assert any(field.label == "车次/航班" and field.value == "G1234" for field in recognized.document_fields)
    assert recognized.lines[0].page_index == 0
    assert recognized.lines[1].page_index == 1
def test_ocr_service_rejects_pdf_ocr_when_rendered_image_fonts_are_broken(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that a PDF rendering failure surfaces as a warning without OCR.

    The rendering stub raises ``RuntimeError`` and the worker stub fails the
    test if it is ever called.  The returned document must have no lines, no
    preview, and the rendering error message as its only warning.
    """
    render_error = "PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。"

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        raise RuntimeError(render_error)

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        raise AssertionError("PDF 转图片已确认丢中文时,不应继续调用 OCR worker。")

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    stubs = (
        ("_resolve_python_bin", lambda self: "python"),
        ("_resolve_worker_path", lambda self: "worker.py"),
        ("_convert_pdf_to_images", fake_convert_pdf_to_images),
        ("_invoke_worker", fake_invoke_worker),
    )
    for attribute, replacement in stubs:
        monkeypatch.setattr(OcrService, attribute, replacement)

    get_settings.cache_clear()
    OcrService._result_cache.clear()
    try:
        uploads = [("2月20_武汉-上海.pdf", b"%PDF-1.7 fake", "application/pdf")]
        result = OcrService().recognize_files(uploads)
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    failed = result.documents[0]
    assert failed.line_count == 0
    assert failed.preview_kind == ""
    assert failed.preview_data_url == ""
    assert failed.warnings == [render_error]
def test_ocr_service_uses_pdf_text_layer_when_rendering_fails(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify the PDF text-layer fallback when rendering to images fails.

    Rendering raises, the worker stub fails the test if called, and the
    text-layer extractor is stubbed to return train ticket text.  The document
    must count as a success, be classified as a train ticket, have no preview,
    and carry two warnings: the rendering failure followed by a note that the
    PDF text layer was used.
    """
    text_layer = "\n".join(
        (
            "G458",
            "Wuhan Shanghaihongqiao",
            "2026 02 20 07:55",
            "票价: 354.00",
            "12306 95306",
        )
    )

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        raise RuntimeError("PDF 转图片失败Missing language pack for Adobe-GB1")

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        raise AssertionError("PDF 转图失败但文本层可用时,不应调用 OCR worker。")

    def fake_extract_pdf_text_layer(self, pdf_path):
        return text_layer

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    stubs = (
        ("_resolve_python_bin", lambda self: "python"),
        ("_resolve_worker_path", lambda self: "worker.py"),
        ("_convert_pdf_to_images", fake_convert_pdf_to_images),
        ("_invoke_worker", fake_invoke_worker),
        ("_extract_pdf_text_layer", fake_extract_pdf_text_layer),
    )
    for attribute, replacement in stubs:
        monkeypatch.setattr(OcrService, attribute, replacement)

    get_settings.cache_clear()
    OcrService._result_cache.clear()
    try:
        uploads = [("2月20_武汉-上海.pdf", b"%PDF-1.7 text-layer-fallback", "application/pdf")]
        result = OcrService().recognize_files(uploads)
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recovered = result.documents[0]

    def has_field(label: str, value: str) -> bool:
        return any(
            field.label == label and field.value == value
            for field in recovered.document_fields
        )

    assert result.success_count == 1
    assert recovered.document_type == "train_ticket"
    assert recovered.document_type_label == "火车/高铁票"
    assert recovered.preview_kind == ""
    assert recovered.preview_data_url == ""
    assert has_field("金额", "354元")
    assert has_field("车次/航班", "G458")
    assert has_field("行程", "武汉-上海")
    assert "PDF 转图片失败" in recovered.warnings[0]
    assert "已使用 PDF 文本层" in recovered.warnings[1]
def test_ocr_pdf_conversion_tries_next_renderer_when_poppler_font_mapping_fails(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that conversion moves on to ``mutool`` after a ``pdftoppm`` font error.

    ``shutil.which`` is stubbed so both tools appear installed, and
    ``subprocess.run`` is stubbed so ``pdftoppm`` exits 0 but reports a missing
    ``Adobe-GB1`` language pack on stderr, while any other tool succeeds
    cleanly.  The conversion must return the single rendered page, mark the
    preview usable, and have invoked the two tools in that order.
    """
    output_dir = tmp_path / "pages"
    output_dir.mkdir()
    rendered_page = output_dir / "page-1.png"
    invoked_tools: list[str] = []

    def fake_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
        env: dict[str, str] | None = None,
    ) -> subprocess.CompletedProcess[str]:
        tool = Path(command[0]).name
        invoked_tools.append(tool)
        if tool == "pdftoppm":
            page_bytes = b"broken-preview"
            stderr_text = "Syntax Error: Missing language pack for 'Adobe-GB1' mapping"
        else:
            page_bytes = b"rendered-with-chinese"
            stderr_text = ""
        rendered_page.write_bytes(page_bytes)
        return subprocess.CompletedProcess(
            args=command,
            returncode=0,
            stdout="",
            stderr=stderr_text,
        )

    def fake_which(name):
        if name in {"pdftoppm", "mutool"}:
            return f"/usr/bin/{name}"
        return None

    monkeypatch.setattr(document_preview.shutil, "which", fake_which)
    monkeypatch.setattr(subprocess, "run", fake_run)

    pages, preview_usable = OcrService()._convert_pdf_to_images(
        pdf_path=tmp_path / "ticket.pdf",
        output_dir=output_dir,
    )

    assert pages == [output_dir / "page-1.png"]
    assert preview_usable is True
    assert invoked_tools == ["pdftoppm", "mutool"]
def test_ocr_service_invokes_worker_even_when_pdf_text_layer_is_usable(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that image OCR still runs when the PDF also has a text layer.

    Rendering succeeds with one page, the worker stub counts its invocations,
    and the text-layer extractor returns usable ticket text.  The worker must
    be called exactly once and the result must be a train ticket with an image
    preview.
    """
    ocr_line = "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00"
    text_layer = (
        "电子发票(铁路电子客票)\n"
        "发票号码:26429165800002785705\n"
        "武汉站\n"
        "上海虹桥站\n"
        "G458\n"
        "票价:¥354.00\n"
        "电子客票号:6580061086021391007342026"
    )
    worker_invocations: list[list[Path]] = []

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        page = output_dir / "page-1.png"
        page.write_bytes(b"fake-rendered-page")
        return [page], True

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_invocations.append(input_paths)
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "text": ocr_line,
            "summary": "铁路电子客票",
            "avg_score": 0.95,
            "line_count": 1,
            "page_count": 1,
            "warnings": [],
            "lines": [
                {
                    "text": ocr_line,
                    "score": 0.95,
                    "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                }
            ],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    def fake_extract_pdf_text_layer(self, pdf_path):
        return text_layer

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    stubs = (
        ("_resolve_python_bin", lambda self: "python"),
        ("_resolve_worker_path", lambda self: "worker.py"),
        ("_convert_pdf_to_images", fake_convert_pdf_to_images),
        ("_invoke_worker", fake_invoke_worker),
        ("_extract_pdf_text_layer", fake_extract_pdf_text_layer),
    )
    for attribute, replacement in stubs:
        monkeypatch.setattr(OcrService, attribute, replacement)

    get_settings.cache_clear()
    OcrService._result_cache.clear()
    try:
        uploads = [("train-ticket.pdf", b"%PDF-1.7 fake", "application/pdf")]
        result = OcrService().recognize_files(uploads)
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert result.success_count == 1
    assert len(worker_invocations) == 1
    assert recognized.document_type == "train_ticket"
    assert "电子发票(铁路电子客票)" in recognized.text
    assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
def test_ocr_service_reuses_cached_document_for_same_content(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that identical file bytes are OCR'd only once.

    Two uploads with different filenames but the same content are recognized
    through separate ``OcrService`` instances.  The worker stub must be called
    a single time, each result must keep its own filename, and both results
    must share the same summary.
    """
    invoice_text = "增值税电子发票 金额 20 元"
    worker_invocations: list[list[Path]] = []

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        worker_invocations.append(input_paths)
        worker_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "text": invoice_text,
            "summary": "增值税电子发票,金额 20 元。",
            "avg_score": 0.97,
            "line_count": 1,
            "page_count": 1,
            "warnings": [],
            "lines": [
                {
                    "text": invoice_text,
                    "score": 0.97,
                    "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                }
            ],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [worker_document],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    stubs = (
        ("_resolve_python_bin", lambda self: "python"),
        ("_resolve_worker_path", lambda self: "worker.py"),
        ("_invoke_worker", fake_invoke_worker),
    )
    for attribute, replacement in stubs:
        monkeypatch.setattr(OcrService, attribute, replacement)

    # Start from an empty cache so the first call is guaranteed to hit the worker.
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        first = OcrService().recognize_files([("first.png", b"same-image", "image/png")])
        second = OcrService().recognize_files([("second.png", b"same-image", "image/png")])
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    first_document = first.documents[0]
    second_document = second.documents[0]
    assert len(worker_invocations) == 1
    assert first_document.filename == "first.png"
    assert second_document.filename == "second.png"
    assert second_document.summary == first_document.summary
def test_ocr_cache_key_includes_pdf_render_pipeline_version() -> None:
    """Check that the OCR cache key embeds the PDF rendering pipeline markers.

    The key built for arbitrary bytes must contain the ``pdf-image-ocr:``
    marker, the renderer id exposed by ``DocumentPreviewAssets``, and the
    ``no-pdf-direct`` marker.
    """
    cache_key = OcrService()._build_cache_key(b"same-pdf-content")
    expected_fragments = (
        "pdf-image-ocr:",
        document_preview.DocumentPreviewAssets.PDF_RENDERER_ID,
        "no-pdf-direct",
    )
    for fragment in expected_fragments:
        assert fragment in cache_key
def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Verify that text-layer content replaces OCR output full of ``□`` glyphs.

    Rendering succeeds, but the stubbed worker returns text dominated by
    placeholder squares.  With the text-layer extractor stubbed to return
    readable ticket text, the resulting document must expose the readable
    text, have a summary free of placeholder runs, be classified as a train
    ticket, and still carry the image preview.

    Args:
        monkeypatch: pytest fixture used to stub ``OcrService`` internals.
        tmp_path: pytest temporary directory used as the storage root.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
        page = output_dir / "page-1.png"
        page.write_bytes(b"fake-page")
        return [page], True

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "□□□□□□\n□□□□26319166100006175398\nG456\n□□:□354.00",
                    "summary": "□□□□□□□□□□26319166100006175398",
                    "avg_score": 0.88,
                    "line_count": 4,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "□□□□□□",
                            "score": 0.88,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                }
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    monkeypatch.setattr(
        OcrService,
        "_extract_pdf_text_layer",
        lambda self, pdf_path: (
            "电子发票(铁路电子客票)\n"
            "发票号码:26319166100006175398\n"
            "上海虹桥站\n"
            "武汉站\n"
            "G456\n"
            "票价:¥354.00"
        ),
    )
    # The result cache is stored on the OcrService class and is shared by all
    # tests. Another test in this module uploads the same PDF bytes, so reset
    # the cache before and after; otherwise a stale entry could be returned
    # instead of exercising the stubs above.
    OcrService._result_cache.clear()
    get_settings.cache_clear()
    try:
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )
    finally:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    recognized = result.documents[0]
    assert "电子发票(铁路电子客票)" in recognized.text
    assert "上海虹桥站" in recognized.text
    assert "□□□□" not in recognized.summary
    assert recognized.document_type == "train_ticket"
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")