2026-05-12 03:04:10 +00:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import json
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
from statistics import fmean
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
# Provide a default for PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK without
# overriding a value the caller already exported (setdefault semantics).
# NOTE(review): this deliberately runs before the `paddleocr` import further
# down (hence the E402 suppression there) - presumably the variable is read
# at import time; confirm against the Paddle/PaddleX documentation.
os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
|
|
|
|
|
|
|
|
|
|
|
|
from paddleocr import PaddleOCR # noqa: E402
|
|
|
|
|
|
|
|
|
|
|
|
# Marker that main() prepends to the single JSON payload line it prints.
# Presumably lets the parent process pick that line out of any other stdout
# output - verify against the consumer of this worker.
WORKER_JSON_PREFIX = "__OCR_JSON__="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the worker's command-line options from ``sys.argv``.

    Recognised options:
        --input: path to process; repeatable and required at least once.
            All occurrences are collected, in order, into ``inputs``.
        --lang: language code (default ``"ch"``).
        --text-detection-model: detection model name
            (default ``"PP-OCRv5_mobile_det"``).
        --text-recognition-model: recognition model name
            (default ``"PP-OCRv5_mobile_rec"``).
        --device: device string; defaults to the ``OCR_DEVICE`` environment
            variable, or ``""`` when that variable is unset.
        --enable-mkldnn: boolean flag, off unless given.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(description="Run PaddleOCR mobile worker.")
    cli.add_argument("--input", dest="inputs", action="append", required=True)

    # Plain string options, registered in the same order as they appear in
    # the generated --help output.
    string_options = (
        ("--lang", "ch"),
        ("--text-detection-model", "PP-OCRv5_mobile_det"),
        ("--text-recognition-model", "PP-OCRv5_mobile_rec"),
        # The environment is consulted on every call, not at import time.
        ("--device", os.environ.get("OCR_DEVICE", "")),
    )
    for flag, default_value in string_options:
        cli.add_argument(flag, default=default_value)

    cli.add_argument("--enable-mkldnn", action="store_true")
    return cli.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def coerce_box(box: Any) -> list[list[int]]:
    """Convert a polygon into a list of ``[x, y]`` integer points.

    The conversion is lenient: anything that does not look like a usable
    point is dropped rather than raising, so one malformed polygon cannot
    abort the processing of a whole document.

    Args:
        box: Expected to be a list or tuple of two-element points (each
            itself a list or tuple). Any other value yields ``[]``.

    Returns:
        One ``[int(x), int(y)]`` pair per well-formed point, in input order.
        Coordinates are converted with ``int()``, so floats are truncated
        toward zero. Points that are not two-element sequences, or whose
        coordinates cannot be converted (``None``, NaN, infinity,
        non-numeric strings), are skipped.
    """
    if not isinstance(box, (list, tuple)):
        return []

    points: list[list[int]] = []
    for point in box:
        if not isinstance(point, (list, tuple)) or len(point) != 2:
            continue
        try:
            x, y = int(point[0]), int(point[1])
        except (TypeError, ValueError, OverflowError):
            # Non-numeric / NaN / infinite coordinate: treat like any other
            # malformed point and skip it instead of crashing the worker.
            continue
        points.append([x, y])
    return points
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_document(input_path: str, results: list[Any]) -> dict[str, Any]:
    """Flatten per-page OCR results into one JSON-serialisable document.

    Args:
        input_path: Path of the processed input; echoed back unchanged.
        results: Per-page result objects. Each must expose a ``json``
            attribute holding either a dict or a JSON string. The dict may
            wrap the actual fields under a ``"res"`` key. Entries whose
            payload is not a dict are skipped.

    Returns:
        A dict with the keys:
            ``input_path``, ``engine``, ``model``: identification fields.
            ``text``: all recognised lines joined with newlines.
            ``summary``: the first three lines joined with a full-width
                semicolon, truncated to 180 characters (ellipsis included).
            ``avg_score``: mean recognition score, ``0.0`` with no lines.
            ``line_count``: number of non-blank recognised lines.
            ``page_count``: ``len(results)``, including skipped entries.
            ``warnings``: list of human-readable warnings.
            ``lines``: one dict per line with ``text``, ``score``, ``box``
                and ``page_index``.

    Raises:
        json.JSONDecodeError: if a string payload is not valid JSON.
    """
    lines: list[dict[str, Any]] = []
    all_texts: list[str] = []
    all_scores: list[float] = []

    # Only invent a page index when there is actually more than one page.
    multi_page = len(results) > 1

    for fallback_page_index, result in enumerate(results):
        payload = result.json
        if isinstance(payload, str):
            payload = json.loads(payload)
        if not isinstance(payload, dict):
            continue

        # Fields may be nested under "res" or sit at the top level.
        res = payload.get("res", payload)
        if not isinstance(res, dict):
            continue

        page_index = res.get("page_index")
        if page_index is None:
            page_index = fallback_page_index if multi_page else None

        # `or []` (not a .get default) so that a key that is present but
        # holds None is treated the same as a missing key.
        texts = res.get("rec_texts") or []
        scores = res.get("rec_scores") or []
        boxes = res.get("rec_polys") or res.get("dt_polys") or []

        for index, text in enumerate(texts):
            normalized_text = str(text or "").strip()
            if not normalized_text:
                continue

            # Scores and boxes are matched to texts by position; missing
            # entries fall back to 0.0 / an empty box.
            score = float(scores[index] if index < len(scores) else 0.0)
            box = coerce_box(boxes[index] if index < len(boxes) else [])

            lines.append(
                {
                    "text": normalized_text,
                    "score": score,
                    "box": box,
                    "page_index": page_index,
                }
            )
            all_texts.append(normalized_text)
            all_scores.append(score)

    summary = ";".join(all_texts[:3])
    if len(summary) > 180:
        summary = f"{summary[:177]}..."

    warnings: list[str] = []
    if not lines:
        warnings.append("未识别到可用文本。")

    return {
        "input_path": input_path,
        "engine": "paddleocr_mobile",
        "model": "PP-OCRv5_mobile",
        "text": "\n".join(all_texts),
        "summary": summary,
        "avg_score": float(fmean(all_scores)) if all_scores else 0.0,
        "line_count": len(lines),
        "page_count": len(results),
        "warnings": warnings,
        "lines": lines,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int:
    """Run OCR over every ``--input`` path and print one prefixed JSON line.

    Builds a single ``PaddleOCR`` instance from the command-line options,
    runs ``predict`` on each input in the order given, converts each result
    set with ``build_document`` and prints the combined payload to stdout,
    prefixed with ``WORKER_JSON_PREFIX``.

    Returns:
        ``0`` once the payload has been printed.
    """
    args = parse_args()

    ocr_options = dict(
        text_detection_model_name=args.text_detection_model,
        text_recognition_model_name=args.text_recognition_model,
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        lang=args.lang,
        # PaddlePaddle 3.3.x CPU oneDNN can fail on PP-OCRv5 static
        # inference, so this stays off unless --enable-mkldnn is passed.
        enable_mkldnn=args.enable_mkldnn,
    )

    # Pass "device" only when one was actually configured; a blank value
    # leaves the option out entirely.
    device = str(args.device or "").strip()
    if device:
        ocr_options["device"] = device

    engine = PaddleOCR(**ocr_options)

    documents = [
        build_document(path, engine.predict(path)) for path in args.inputs
    ]

    encoded = json.dumps(
        {
            "engine": "paddleocr_mobile",
            "model": "PP-OCRv5_mobile",
            "documents": documents,
        },
        ensure_ascii=False,
    )
    print(f"{WORKER_JSON_PREFIX}{encoded}")
    return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|