# X-Financial/server/tests/test_ocr_service.py

from __future__ import annotations

import stat
from pathlib import Path

from app.core.config import get_settings
from app.services.ocr import OcrService


def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Recognize a PNG and a text file with a stub script as the OCR interpreter.

    ``OCR_PYTHON_BIN`` is pointed at an executable stub that prints one canned
    OCR document per ``--input`` argument it receives, prefixed with
    ``__OCR_JSON__=``. The test verifies that the PNG comes back with the
    canned text and extracted invoice fields, while the ``.txt`` upload stays
    in the result with a warning and no recognized lines.
    """
    # The stub's source is a runtime artifact: it must stay exactly as written.
    stub_source = """#!/usr/bin/env python3
import json
import sys

inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                "summary": "增值税电子发票，金额 100 元。",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )

payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
"""
    stub_interpreter = tmp_path / "fake-ocr-python.py"
    stub_interpreter.write_text(stub_source, encoding="utf-8")
    # Add the owner-execute bit so the stub can be launched directly.
    current_mode = stub_interpreter.stat().st_mode
    stub_interpreter.chmod(current_mode | stat.S_IXUSR)

    uploads = [
        ("invoice.png", b"fake-image", "image/png"),
        ("notes.txt", b"plain-text", "text/plain"),
    ]

    monkeypatch.setenv("OCR_PYTHON_BIN", str(stub_interpreter))
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # Drop cached settings before and after so the patched environment is
    # picked up here and does not leak into whatever reads settings next.
    get_settings.cache_clear()
    try:
        service = OcrService()
        result = service.recognize_files(uploads)
    finally:
        get_settings.cache_clear()

    def document_named(filename: str):
        """Return the first result document carrying the given filename."""
        return next(doc for doc in result.documents if doc.filename == filename)

    # Batch-level bookkeeping.
    assert result.engine == "paddleocr_mobile"
    assert result.model == "PP-OCRv5_mobile"
    assert result.total_file_count == 2
    assert result.success_count == 1
    assert len(result.documents) == 2

    # The image went through the stub worker and was classified as an invoice.
    invoice = document_named("invoice.png")
    assert invoice.summary == "增值税电子发票，金额 100 元。"
    assert invoice.line_count == 1
    assert invoice.document_type == "vat_invoice"
    assert invoice.document_type_label == "增值税发票"
    expected_fields = (
        ("金额", "100元"),
        ("票据号码", "12345678"),
        ("日期", "2026-05-13"),
    )
    for label, value in expected_fields:
        assert any(
            entry.label == label and entry.value == value
            for entry in invoice.document_fields
        )
    assert invoice.lines[0].text == "增值税电子发票 发票号码12345678 金额 100 元 2026-05-13"

    # The text file is reported back with a warning rather than dropped.
    unsupported = document_named("notes.txt")
    assert unsupported.line_count == 0
    assert unsupported.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]


def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Recognize a two-page PDF with PDF rendering and the worker both stubbed.

    The stubbed converter writes two page images and the stubbed worker
    returns one canned OCR document per page. The test verifies that the
    service reports a single two-page document with a PNG data-URL preview,
    train-ticket fields, and per-line page indices of 0 and 1.
    """

    def canned_page(page_path: Path, text: str, summary: str, score: float) -> dict:
        """Build the worker-style OCR document for one rendered page."""
        return {
            "input_path": str(page_path),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "text": text,
            "summary": summary,
            "avg_score": score,
            "line_count": 1,
            "page_count": 1,
            "warnings": [],
            "lines": [
                {
                    "text": text,
                    "score": score,
                    "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                }
            ],
        }

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        """Write two placeholder page images and return their paths in order."""
        rendered: list[Path] = []
        page_stubs = (
            ("page-1.png", b"fake-page-1"),
            ("page-2.png", b"fake-page-2"),
        )
        for file_name, content in page_stubs:
            rendered.append(output_dir / file_name)
        # Write only after both paths exist, mirroring create-then-write order.
        for page_path, (_, content) in zip(rendered, page_stubs):
            page_path.write_bytes(content)
        return rendered

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        """Return canned OCR output after checking the pages handed over."""
        received_names = [page.name for page in input_paths]
        assert received_names == ["page-1.png", "page-2.png"]
        page_documents = [
            canned_page(
                input_paths[0],
                "高铁票 深圳北-广州南 车次 G1234 2026-05-13 金额 188 元",
                "高铁票第一页",
                0.97,
            ),
            canned_page(input_paths[1], "乘车人 张三", "高铁票第二页", 0.94),
        ]
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": page_documents,
        }

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)

    uploads = [("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf")]
    # Reset cached settings around the call so the patched storage root is used
    # and not carried over to later readers of the settings.
    get_settings.cache_clear()
    try:
        service = OcrService()
        result = service.recognize_files(uploads)
    finally:
        get_settings.cache_clear()

    assert result.success_count == 1
    assert len(result.documents) == 1

    ticket = result.documents[0]
    assert ticket.filename == "train-ticket.pdf"
    assert ticket.page_count == 2
    assert ticket.preview_kind == "image"
    assert ticket.preview_data_url.startswith("data:image/png;base64,")
    assert ticket.document_type == "train_ticket"
    for label, value in (("金额", "188元"), ("车次/航班", "G1234")):
        assert any(
            entry.label == label and entry.value == value
            for entry in ticket.document_fields
        )
    # The stubbed worker omits page_index; the result is expected to carry it.
    for position in (0, 1):
        assert ticket.lines[position].page_index == position


def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Recognize a PDF whose rendered-page OCR is mostly placeholder glyphs.

    The stubbed worker returns text dominated by ``□`` characters while the
    stubbed text-layer extractor returns readable ticket text. The test
    verifies that the result text contains the text-layer content, that the
    summary holds no placeholder run, that the document is classified as a
    train ticket, and that no preview is attached.
    """

    def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]:
        """Write a single placeholder page image and return it in a list."""
        rendered_page = output_dir / "page-1.png"
        rendered_page.write_bytes(b"fake-page")
        return [rendered_page]

    def fake_invoke_worker(
        self,
        *,
        python_bin: str,
        worker_path: str,
        input_paths: list[Path],
    ) -> dict:
        """Return one canned OCR document full of placeholder glyphs."""
        garbled_line = {
            "text": "□□□□□□",
            "score": 0.88,
            "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
        }
        garbled_document = {
            "input_path": str(input_paths[0]),
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "text": "□□□□□□\n□□□□：26319166100006175398\nG456\n□□:□354.00",
            "summary": "□□□□□□；□□□□：26319166100006175398",
            "avg_score": 0.88,
            "line_count": 4,
            "page_count": 1,
            "warnings": [],
            "lines": [garbled_line],
        }
        return {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": [garbled_document],
        }

    def fake_extract_pdf_text_layer(self, pdf_path):
        """Return the readable text that stands in for the PDF's text layer."""
        text_layer_lines = [
            "电子发票（铁路电子客票）",
            "发票号码:26319166100006175398",
            "上海虹桥站",
            "武汉站",
            "G456",
            "票价:￥354.00",
        ]
        return "\n".join(text_layer_lines)

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
    monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
    monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
    monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
    monkeypatch.setattr(OcrService, "_extract_pdf_text_layer", fake_extract_pdf_text_layer)

    uploads = [("train-ticket.pdf", b"%PDF-1.4 fake", "application/pdf")]
    # Reset cached settings around the call so the patched storage root is used
    # and not carried over to later readers of the settings.
    get_settings.cache_clear()
    try:
        service = OcrService()
        result = service.recognize_files(uploads)
    finally:
        get_settings.cache_clear()

    ticket = result.documents[0]
    # Readable text-layer content must be present in the final text.
    for expected_fragment in ("电子发票（铁路电子客票）", "上海虹桥站"):
        assert expected_fragment in ticket.text
    assert "□□□□" not in ticket.summary
    assert ticket.document_type == "train_ticket"
    # No preview is expected on this path.
    assert ticket.preview_kind == ""
    assert ticket.preview_data_url == ""