"""Tests for ``OcrService``: worker invocation, PDF handling and result caching."""

from __future__ import annotations

import stat
import subprocess
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path

from app.core.config import get_settings
from app.services.ocr import OcrService


@contextmanager
def _isolated_ocr_caches() -> Iterator[None]:
    """Run the enclosed test body with clean settings and OCR result caches.

    Both caches outlive a single test: ``get_settings`` is cached (it exposes
    ``cache_clear``) and ``OcrService._result_cache`` is shared across
    instances, as the cache-reuse test below relies on. They are cleared before
    the body so environment overrides set via ``monkeypatch`` are picked up and
    no earlier result is served, and again afterwards so nothing leaks into the
    next test. This matters because the two PDF tests submit byte-identical
    payloads while expecting different results.
    """

    def _reset() -> None:
        OcrService._result_cache.clear()
        get_settings.cache_clear()

    _reset()
    try:
        yield
    finally:
        _reset()


def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """A supported image goes through the worker; an unsupported file only gets a warning."""
    # Stand-in for the OCR interpreter: an executable script that echoes one
    # recognised document per ``--input`` argument in the worker's output format.
    fake_python = tmp_path / "fake-ocr-python.py"
    fake_python.write_text(
        """#!/usr/bin/env python3
import json
import sys

inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                "summary": "增值税电子发票,金额 100 元。",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )

payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
""",
        encoding="utf-8",
    )
    fake_python.chmod(fake_python.stat().st_mode | stat.S_IEXEC)

    monkeypatch.setenv("OCR_PYTHON_BIN", str(fake_python))
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))

    with _isolated_ocr_caches():
        result = OcrService().recognize_files(
            [
                ("invoice.png", b"fake-image", "image/png"),
                ("notes.txt", b"plain-text", "text/plain"),
            ]
        )

    assert result.engine == "paddleocr_mobile"
    assert result.model == "PP-OCRv5_mobile"
    assert result.total_file_count == 2
    assert result.success_count == 1
    assert len(result.documents) == 2

    recognized = next(item for item in result.documents if item.filename == "invoice.png")
    assert recognized.summary == "增值税电子发票,金额 100 元。"
    assert recognized.line_count == 1
    assert recognized.document_type == "vat_invoice"
    assert recognized.document_type_label == "增值税发票"
    assert any(
        field.label == "金额" and field.value == "100元"
        for field in recognized.document_fields
    )
    assert any(
        field.label == "票据号码" and field.value == "12345678"
        for field in recognized.document_fields
    )
    assert any(
        field.label == "日期" and field.value == "2026-05-13"
        for field in recognized.document_fields
    )
    assert recognized.lines[0].text == "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13"

    skipped = next(item for item in result.documents if item.filename == "notes.txt")
    assert skipped.line_count == 0
    assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]


def test_ocr_service_passes_configured_device_to_worker(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """The ``OCR_DEVICE`` setting is forwarded to the worker as ``--device <value>``."""
    captured_commands: list[list[str]] = []

    def fake_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
    ) -> subprocess.CompletedProcess[str]:
        # Record the command line instead of spawning a process and answer
        # with an empty but well-formed worker payload.
        captured_commands.append(command)
        return subprocess.CompletedProcess(
            args=command,
            returncode=0,
            stdout='__OCR_JSON__={"engine":"paddleocr_mobile","model":"PP-OCRv5_mobile","documents":[]}\n',
            stderr="",
        )

    monkeypatch.setenv("OCR_DEVICE", "gpu:0")
    monkeypatch.setattr(subprocess, "run", fake_run)

    with _isolated_ocr_caches():
        payload = OcrService()._invoke_worker(
            python_bin="python",
            worker_path="worker.py",
            input_paths=[tmp_path / "invoice.png"],
        )

    assert payload["engine"] == "paddleocr_mobile"
    command = captured_commands[0]
    device_index = command.index("--device")
    assert command[device_index + 1] == "gpu:0"


def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """A PDF is rendered to page images, OCR'd per page and merged into one document."""

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        # Pretend the PDF rendered into two page images.
        first = output_dir / "page-1.png"
        second = output_dir / "page-2.png"
        first.write_bytes(b"fake-page-1")
        second.write_bytes(b"fake-page-2")
        return [first, second]

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # The worker must receive the rendered pages, not the original PDF.
        assert [path.name for path in input_paths] == ["page-1.png", "page-2.png"]
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                    "summary": "高铁票第一页",
                    "avg_score": 0.97,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                            "score": 0.97,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
                {
                    "input_path": str(input_paths[1]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "乘车人 张三",
                    "summary": "高铁票第二页",
                    "avg_score": 0.94,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "乘车人 张三",
                            "score": 0.94,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                },
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)

    with _isolated_ocr_caches():
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )

    assert result.success_count == 1
    assert len(result.documents) == 1

    recognized = result.documents[0]
    assert recognized.filename == "train-ticket.pdf"
    assert recognized.page_count == 2
    assert recognized.preview_kind == "image"
    assert recognized.preview_data_url.startswith("data:image/png;base64,")
    assert recognized.document_type == "train_ticket"
    assert any(
        field.label == "金额" and field.value == "188元"
        for field in recognized.document_fields
    )
    assert any(
        field.label == "车次/航班" and field.value == "G1234"
        for field in recognized.document_fields
    )
    assert recognized.lines[0].page_index == 0
    assert recognized.lines[1].page_index == 1


def test_ocr_service_reuses_cached_document_for_same_content(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Identical bytes under a different filename are served without a second worker call."""
    calls = {"count": 0}

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        calls["count"] += 1
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "增值税电子发票 金额 20 元",
                    "summary": "增值税电子发票,金额 20 元。",
                    "avg_score": 0.97,
                    "line_count": 1,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "增值税电子发票 金额 20 元",
                            "score": 0.97,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                }
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)

    with _isolated_ocr_caches():
        first = OcrService().recognize_files([("first.png", b"same-image", "image/png")])
        second = OcrService().recognize_files([("second.png", b"same-image", "image/png")])

    assert calls["count"] == 1
    assert first.documents[0].filename == "first.png"
    assert second.documents[0].filename == "second.png"
    assert second.documents[0].summary == first.documents[0].summary


def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """When OCR output is mostly placeholder glyphs, the PDF's embedded text layer wins."""

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        page = output_dir / "page-1.png"
        page.write_bytes(b"fake-page")
        return [page]

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        # OCR result dominated by "□" placeholder characters.
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [
                {
                    "input_path": str(input_paths[0]),
                    "engine": "paddleocr_mobile",
                    "model": "PP-OCRv5_mobile",
                    "text": "□□□□□□\n□□□□:26319166100006175398\nG456\n□□:□354.00",
                    "summary": "□□□□□□;□□□□:26319166100006175398",
                    "avg_score": 0.88,
                    "line_count": 4,
                    "page_count": 1,
                    "warnings": [],
                    "lines": [
                        {
                            "text": "□□□□□□",
                            "score": 0.88,
                            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        }
                    ],
                }
            ],
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    monkeypatch.setattr(
        OcrService,
        "_extract_pdf_text_layer",
        lambda self, pdf_path: (
            "电子发票(铁路电子客票)\n"
            "发票号码:26319166100006175398\n"
            "上海虹桥站\n"
            "武汉站\n"
            "G456\n"
            "票价:¥354.00"
        ),
    )

    # Same bytes as the PDF-to-images test above: the cache reset performed by
    # the helper keeps this test independent of execution order.
    with _isolated_ocr_caches():
        result = OcrService().recognize_files(
            [
                ("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf"),
            ]
        )

    recognized = result.documents[0]
    assert "电子发票(铁路电子客票)" in recognized.text
    assert "上海虹桥站" in recognized.text
    assert "□□□□" not in recognized.summary
    assert recognized.document_type == "train_ticket"
    assert recognized.preview_kind == ""
    assert recognized.preview_data_url == ""