server/tests/test_ocr_service.py

from __future__ import annotations

import stat
from pathlib import Path

from app.core.config import get_settings
from app.services.ocr import OcrService


def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Run ``OcrService.recognize_files`` against a stub OCR interpreter.

    A small executable script is written into ``tmp_path`` and exported via
    ``OCR_PYTHON_BIN``. For every ``--input <path>`` pair on its command line
    the script emits one canned document, then prints the whole payload as
    JSON behind an ``__OCR_JSON__=`` prefix.

    The test submits one PNG and one plain-text file and expects:

    * the engine/model reported by the stub to surface on the result;
    * both files to be listed, with only one counted as a success;
    * the PNG to carry the canned text, and the text file to carry zero
      lines plus a single "unsupported file type" warning.
    """
    # Source of the stand-in interpreter. Kept verbatim: it is executed as a
    # program, so every character is part of the test's behaviour.
    worker_stub_source = """#!/usr/bin/env python3
import json
import sys

inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "发票金额 100 元",
                "summary": "发票金额 100 元",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "发票金额 100 元",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )

payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
"""
    worker_stub = tmp_path / "fake-ocr-python.py"
    worker_stub.write_text(worker_stub_source, encoding="utf-8")
    # Add the owner-execute bit so the shebang script can be launched directly.
    current_mode = worker_stub.stat().st_mode
    worker_stub.chmod(current_mode | stat.S_IEXEC)

    # Point the service at the stub and at a throwaway storage directory.
    patched_environment = (
        ("OCR_PYTHON_BIN", str(worker_stub)),
        ("STORAGE_ROOT_DIR", str(tmp_path / "storage")),
    )
    for variable, value in patched_environment:
        monkeypatch.setenv(variable, value)

    uploads = [
        ("invoice.png", b"fake-image", "image/png"),
        ("notes.txt", b"plain-text", "text/plain"),
    ]

    # NOTE(review): presumably get_settings is memoised and reads the
    # environment; the cache is dropped before the call so the patched values
    # are seen, and again afterwards so they do not leak into other tests.
    get_settings.cache_clear()
    try:
        outcome = OcrService().recognize_files(uploads)
    finally:
        get_settings.cache_clear()

    def first_document_named(wanted: str):
        """Return the first result document whose filename equals *wanted*."""
        return next(filter(lambda doc: doc.filename == wanted, outcome.documents))

    # Result-level metadata comes straight from the stub's payload.
    assert outcome.engine == "paddleocr_mobile"
    assert outcome.model == "PP-OCRv5_mobile"
    assert outcome.total_file_count == 2
    assert outcome.success_count == 1
    assert len(outcome.documents) == 2

    # The image is recognised with the canned text ("invoice amount 100 yuan").
    canned_text = "发票金额 100 元"
    image_document = first_document_named("invoice.png")
    assert image_document.summary == canned_text
    assert image_document.line_count == 1
    assert image_document.lines[0].text == canned_text

    # The text file is kept in the result, but only as a warning
    # ("Currently only images and PDF files are supported for OCR.").
    text_document = first_document_named("notes.txt")
    assert text_document.line_count == 0
    assert text_document.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
test(server): add OCR endpoint and service tests

New tests:
- server/tests/test_ocr_endpoints.py: OCR API endpoint tests
- server/tests/test_ocr_service.py: OCR service unit tests

Updated tests:
- server/tests/test_openapi_schema.py: update OpenAPI schema tests
- server/tests/test_orchestrator_service.py: update orchestrator service tests

2026-05-12 03:05:05 +00:00

			`from __future__ import annotations`

			`import stat`
			`from pathlib import Path`

			`from app.core.config import get_settings`
			`from app.services.ocr import OcrService`


			`def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(`
			`monkeypatch,`
			`tmp_path: Path,`
			`) -> None:`
			`fake_python = tmp_path / "fake-ocr-python.py"`
			`fake_python.write_text(`
			`"""#!/usr/bin/env python3`
			`import json`
			`import sys`

			`inputs = []`
			`for index, arg in enumerate(sys.argv):`
			`if arg == "--input" and index + 1 < len(sys.argv):`
			`input_path = sys.argv[index + 1]`
			`inputs.append(`
			`{`
			`"input_path": input_path,`
			`"engine": "paddleocr_mobile",`
			`"model": "PP-OCRv5_mobile",`
			`"text": "发票金额 100 元",`
			`"summary": "发票金额 100 元",`
			`"avg_score": 0.98,`
			`"line_count": 1,`
			`"page_count": 1,`
			`"warnings": [],`
			`"lines": [`
			`{`
			`"text": "发票金额 100 元",`
			`"score": 0.98,`
			`"box": [[1, 2], [10, 2], [10, 8], [1, 8]],`
			`"page_index": 0,`
			`}`
			`],`
			`}`
			`)`

			`payload = {`
			`"engine": "paddleocr_mobile",`
			`"model": "PP-OCRv5_mobile",`
			`"documents": inputs,`
			`}`
			`print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))`
			`""",`
			`encoding="utf-8",`
			`)`
			`fake_python.chmod(fake_python.stat().st_mode \| stat.S_IEXEC)`

			`monkeypatch.setenv("OCR_PYTHON_BIN", str(fake_python))`
			`monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))`
			`get_settings.cache_clear()`
			`try:`
			`result = OcrService().recognize_files(`
			`[`
			`("invoice.png", b"fake-image", "image/png"),`
			`("notes.txt", b"plain-text", "text/plain"),`
			`]`
			`)`
			`finally:`
			`get_settings.cache_clear()`

			`assert result.engine == "paddleocr_mobile"`
			`assert result.model == "PP-OCRv5_mobile"`
			`assert result.total_file_count == 2`
			`assert result.success_count == 1`
			`assert len(result.documents) == 2`

			`recognized = next(item for item in result.documents if item.filename == "invoice.png")`
			`assert recognized.summary == "发票金额 100 元"`
			`assert recognized.line_count == 1`
			`assert recognized.lines[0].text == "发票金额 100 元"`

			`skipped = next(item for item in result.documents if item.filename == "notes.txt")`
			`assert skipped.line_count == 0`
			`assert skipped.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]`