84 lines
2.6 KiB
Python
84 lines
2.6 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import stat
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from app.core.config import get_settings
|
||
|
|
from app.services.ocr import OcrService
|
||
|
|
|
||
|
|
|
||
|
|
def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """Run ``OcrService.recognize_files`` against a stub OCR interpreter.

    A small executable script is written into ``tmp_path`` and exported via
    ``OCR_PYTHON_BIN``. The stub emits one canned document for every
    ``--input <path>`` pair on its command line and prints the payload on a
    single line prefixed with ``__OCR_JSON__=``.

    One PNG and one plain-text upload are submitted. The test expects the PNG
    to come back with the stub's canned text, and the text file to remain in
    the result with zero lines and a single "unsupported file" warning.
    """
    # Source of the stand-in interpreter. It ignores file contents entirely
    # and fabricates one result entry per ``--input`` argument it receives.
    stub_source = """#!/usr/bin/env python3
import json
import sys

inputs = []
for index, arg in enumerate(sys.argv):
    if arg == "--input" and index + 1 < len(sys.argv):
        input_path = sys.argv[index + 1]
        inputs.append(
            {
                "input_path": input_path,
                "engine": "paddleocr_mobile",
                "model": "PP-OCRv5_mobile",
                "text": "发票金额 100 元",
                "summary": "发票金额 100 元",
                "avg_score": 0.98,
                "line_count": 1,
                "page_count": 1,
                "warnings": [],
                "lines": [
                    {
                        "text": "发票金额 100 元",
                        "score": 0.98,
                        "box": [[1, 2], [10, 2], [10, 8], [1, 8]],
                        "page_index": 0,
                    }
                ],
            }
        )

payload = {
    "engine": "paddleocr_mobile",
    "model": "PP-OCRv5_mobile",
    "documents": inputs,
}
print("__OCR_JSON__=" + json.dumps(payload, ensure_ascii=False))
"""
    worker_stub = tmp_path / "fake-ocr-python.py"
    worker_stub.write_text(stub_source, encoding="utf-8")

    # Add the owner-execute bit on top of the existing mode so the stub can be
    # launched directly through its shebang line.
    executable_mode = worker_stub.stat().st_mode | stat.S_IEXEC
    worker_stub.chmod(executable_mode)

    storage_root = tmp_path / "storage"
    monkeypatch.setenv("OCR_PYTHON_BIN", str(worker_stub))
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(storage_root))

    # Drop any cached settings so the patched environment is read, and clear
    # again afterwards so this test's values are not left in the cache.
    get_settings.cache_clear()
    try:
        # NOTE(review): tuples look like (filename, raw bytes, content type);
        # confirm against OcrService.recognize_files.
        uploads = [
            ("invoice.png", b"fake-image", "image/png"),
            ("notes.txt", b"plain-text", "text/plain"),
        ]
        result = OcrService().recognize_files(uploads)
    finally:
        get_settings.cache_clear()

    # Batch-level metadata: both uploads are counted, only one succeeded.
    assert result.engine == "paddleocr_mobile"
    assert result.model == "PP-OCRv5_mobile"
    assert result.total_file_count == 2
    assert result.success_count == 1
    assert len(result.documents) == 2

    # The image upload carries the stub's canned recognition output.
    image_doc = next(doc for doc in result.documents if doc.filename == "invoice.png")
    assert image_doc.summary == "发票金额 100 元"
    assert image_doc.line_count == 1
    assert image_doc.lines[0].text == "发票金额 100 元"

    # The text upload is kept in the result, with no lines and one warning
    # (the message reads: "Currently only images and PDF files are supported
    # for OCR.").
    text_doc = next(doc for doc in result.documents if doc.filename == "notes.txt")
    assert text_doc.line_count == 0
    assert text_doc.warnings == ["当前仅支持图片和 PDF 文件进行 OCR。"]
|