Files
X-Financial/server/scripts/paddle_ocr_worker.py
caoxiaozhu fb23a6976a feat(server): add OCR invoice processing functionality
New endpoints:
- server/src/app/api/v1/endpoints/ocr.py: OCR API endpoints for invoice scanning

New schemas:
- server/src/app/schemas/ocr.py: OCR request/response data schemas

New services:
- server/src/app/services/ocr.py: OCR processing business logic
- server/src/app/services/expense_claims.py: expense claims management service

Scripts:
- server/scripts/bootstrap_paddleocr_mobile.sh: PaddleOCR mobile setup script
- server/scripts/paddle_ocr_worker.py: PaddleOCR worker process
2026-05-12 03:04:10 +00:00

127 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import sys
from statistics import fmean
from typing import Any
# The environment default is set before paddleocr is imported, so the value is
# already present when the library loads; setdefault keeps any value the caller
# exported. That ordering is why the import below carries a noqa for E402.
# NOTE(review): presumably this disables the library's model-source check at
# start-up - confirm against the PaddleOCR/PaddleX documentation.
os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
from paddleocr import PaddleOCR  # noqa: E402
# Marker prepended to the single JSON line that main() prints to stdout.
# NOTE(review): presumably lets the consumer locate the payload among other
# stdout output - confirm with the calling service.
WORKER_JSON_PREFIX = "__OCR_JSON__="
def parse_args() -> argparse.Namespace:
    """Parse the worker's command-line options from ``sys.argv``.

    Returns:
        A namespace with ``inputs`` (list of every ``--input`` value, at least
        one is required), ``lang``, ``text_detection_model`` and
        ``text_recognition_model``.
    """
    cli = argparse.ArgumentParser(description="Run PaddleOCR mobile worker.")

    # ``--input`` is repeatable; each occurrence is appended to ``inputs``.
    cli.add_argument("--input", dest="inputs", action="append", required=True)

    # Optional string flags and their defaults.
    defaulted_options = (
        ("--lang", "ch"),
        ("--text-detection-model", "PP-OCRv5_mobile_det"),
        ("--text-recognition-model", "PP-OCRv5_mobile_rec"),
    )
    for flag, fallback in defaulted_options:
        cli.add_argument(flag, default=fallback)

    return cli.parse_args()
def coerce_box(box: Any) -> list[list[int]]:
    """Normalize a polygon into a list of ``[x, y]`` integer pairs.

    Args:
        box: Candidate polygon. Lists and tuples of two-element points are
            accepted, as is any object exposing ``tolist()`` (for example an
            array-backed polygon). Anything else yields an empty list.

    Returns:
        The points that could be converted, in their original order, with each
        coordinate truncated through ``int()``. Points that do not have exactly
        two coordinates, or whose coordinates cannot be converted (non-numeric,
        NaN, infinite), are skipped rather than aborting the whole box.
    """
    # Array-like containers are unwrapped into plain Python sequences first.
    if hasattr(box, "tolist"):
        box = box.tolist()
    if not isinstance(box, (list, tuple)):
        return []

    points: list[list[int]] = []
    for point in box:
        if hasattr(point, "tolist"):
            point = point.tolist()
        if not isinstance(point, (list, tuple)) or len(point) != 2:
            continue
        try:
            pair = [int(point[0]), int(point[1])]
        except (TypeError, ValueError, OverflowError):
            # One bad coordinate is treated like any other malformed point:
            # drop it and keep the rest of the polygon.
            continue
        points.append(pair)
    return points
def build_document(input_path: str, results: list[Any]) -> dict[str, Any]:
    """Flatten the per-page OCR results of one input into a JSON-ready dict.

    Args:
        input_path: Path of the scanned input; echoed back in the output.
        results: Per-page result objects. Each must expose a ``json``
            attribute holding either a dict or a JSON string; the recognition
            fields are read from its ``"res"`` entry when present, otherwise
            from the payload itself.

    Returns:
        A dict with the joined text, a short summary (first three recognized
        lines concatenated, capped at 180 characters), the mean recognition
        score (``0.0`` when nothing was recognized), line and page counts,
        warnings, and the individual lines with text, score, box and page
        index. ``page_count`` is ``len(results)``, including pages whose
        payload was skipped.

    Raises:
        json.JSONDecodeError: If a string payload is not valid JSON.
    """
    lines: list[dict[str, Any]] = []
    all_texts: list[str] = []
    all_scores: list[float] = []

    for fallback_page_index, result in enumerate(results):
        payload = result.json
        if isinstance(payload, str):
            payload = json.loads(payload)
        if not isinstance(payload, dict):
            continue
        res = payload.get("res", payload)
        if not isinstance(res, dict):
            continue

        page_index = res.get("page_index")
        if page_index is None:
            # Only invent a page number when there is more than one page;
            # single-page inputs keep ``None``.
            page_index = fallback_page_index if len(results) > 1 else None

        # A key that is present but null must behave like a missing key,
        # otherwise enumerate()/len() below raise TypeError.
        texts = res.get("rec_texts")
        if texts is None:
            texts = []
        scores = res.get("rec_scores")
        if scores is None:
            scores = []
        boxes = res.get("rec_polys") or res.get("dt_polys") or []

        for index, text in enumerate(texts):
            normalized_text = str(text or "").strip()
            if not normalized_text:
                continue
            # Scores and boxes are matched to texts by position; missing
            # entries fall back to 0.0 and an empty box.
            score = float(scores[index] if index < len(scores) else 0.0)
            box = coerce_box(boxes[index] if index < len(boxes) else [])
            lines.append(
                {
                    "text": normalized_text,
                    "score": score,
                    "box": box,
                    "page_index": page_index,
                }
            )
            all_texts.append(normalized_text)
            all_scores.append(score)

    summary = "".join(all_texts[:3])
    if len(summary) > 180:
        # 177 characters plus the ellipsis keeps the summary at 180.
        summary = f"{summary[:177]}..."

    warnings: list[str] = []
    if not lines:
        warnings.append("未识别到可用文本。")

    return {
        "input_path": input_path,
        "engine": "paddleocr_mobile",
        "model": "PP-OCRv5_mobile",
        "text": "\n".join(all_texts),
        "summary": summary,
        "avg_score": float(fmean(all_scores)) if all_scores else 0.0,
        "line_count": len(lines),
        "page_count": len(results),
        "warnings": warnings,
        "lines": lines,
    }
def main() -> int:
    """Run OCR over every ``--input`` path and print one prefixed JSON line.

    Returns:
        ``0`` once the payload has been printed.
    """
    options = parse_args()

    # Orientation classification, unwarping and text-line orientation are
    # all switched off for this pipeline.
    engine = PaddleOCR(
        text_detection_model_name=options.text_detection_model,
        text_recognition_model_name=options.text_recognition_model,
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        lang=options.lang,
    )

    # One document per input, in the order the inputs were given.
    documents = [
        build_document(path, engine.predict(path)) for path in options.inputs
    ]

    envelope = {
        "engine": "paddleocr_mobile",
        "model": "PP-OCRv5_mobile",
        "documents": documents,
    }
    # ensure_ascii=False keeps non-ASCII text readable in the emitted JSON.
    serialized = json.dumps(envelope, ensure_ascii=False)
    print(WORKER_JSON_PREFIX + serialized)
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())