refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务

- user_agent 拆分 application/locations/knowledge/response/review 五个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
This commit is contained in:
caoxiaozhu
2026-06-24 10:42:24 +08:00
parent 332f77389d
commit 0264a4b5b4
41 changed files with 1273 additions and 182 deletions

View File

@@ -4,7 +4,7 @@ import base64
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeDocumentRead
from app.schemas.ocr import OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.receipt_folder import ReceiptFolderService
@@ -72,6 +72,55 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
get_settings.cache_clear()
def test_receipt_folder_pdf_save_eagerly_renders_image_preview(monkeypatch, tmp_path) -> None:
    """Saving a PDF receipt should produce and register a PNG preview right away.

    The PDF renderer is swapped for a stub that writes fixed bytes, so the test
    inspects the stored preview file, the receipt metadata and the result of
    ``resolve_preview`` without invoking a real renderer.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # Presumably needed so the patched STORAGE_ROOT_DIR is re-read -- confirm
    # against get_settings.
    get_settings.cache_clear()
    try:
        acting_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )

        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
            # Stand-in renderer: drop known bytes at the requested preview path.
            preview_path.write_bytes(b"rendered-preview")
            return preview_path

        monkeypatch.setattr(
            DocumentPreviewAssets,
            "render_pdf_first_page",
            fake_render_pdf_first_page,
        )

        pdf_name = "2月20_武汉-上海.pdf"
        service = ReceiptFolderService()
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            text="铁路电子客票 武汉 上海虹桥 354.00",
            summary="铁路电子客票,武汉至上海虹桥。",
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=acting_user,
            document=ocr_document,
        )

        # The test expects exactly one receipt directory under the user's folder.
        receipt_dir = next(service.root.glob("pytest/*"))
        preview_path = receipt_dir / "preview.png"
        meta = service._read_meta(receipt_dir)

        assert receipt.preview_kind == "image"
        assert preview_path.read_bytes() == b"rendered-preview"
        expected_meta = {
            "preview_file_name": "preview.png",
            "preview_media_type": "image/png",
            "preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
        }
        for meta_key, expected_value in expected_meta.items():
            assert meta[meta_key] == expected_value

        resolved_path, media_type, file_name = service.resolve_preview(receipt.id, acting_user)
        assert (resolved_path, media_type, file_name) == (
            preview_path,
            "image/png",
            "preview.png",
        )
    finally:
        # Always reset the settings cache so later tests do not see this storage root.
        get_settings.cache_clear()
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()
@@ -123,6 +172,213 @@ def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch,
get_settings.cache_clear()
def test_receipt_folder_pdf_preview_falls_back_to_source_when_render_fonts_missing(
    monkeypatch,
    tmp_path,
) -> None:
    """A failing PDF re-render should make the preview fall back to the source PDF.

    The receipt is saved with an image preview supplied as a data URL, its
    recorded renderer id is overwritten, and the renderer is then patched to
    raise ``RuntimeError``. ``resolve_preview`` is expected to return the
    original PDF and to rewrite the preview metadata accordingly.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        acting_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        pdf_name = "2月20_武汉-上海.pdf"
        stale_preview = b"broken-preview"
        preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}"

        service = ReceiptFolderService()
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            preview_kind="image",
            preview_data_url=preview_data_url,
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.7 fake",
            media_type="application/pdf",
            current_user=acting_user,
            document=ocr_document,
        )

        receipt_dir = next(service.root.glob("pytest/*"))
        # NOTE(review): presumably this id differs from the current renderer id
        # so the cached preview is treated as stale -- confirm against the service.
        stored_meta = service._read_meta(receipt_dir)
        stored_meta["preview_rendered_with"] = "pdftoppm-png-r160-poppler-data"
        service._write_meta(receipt_dir, stored_meta)

        def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
            # Simulate the renderer failing because of a missing language pack.
            raise RuntimeError("Missing language pack for 'Adobe-GB1' mapping")

        # Patched only after the receipt was saved, so the save itself is unaffected.
        monkeypatch.setattr(
            DocumentPreviewAssets,
            "render_pdf_first_page",
            fake_render_pdf_first_page,
        )

        resolved_path, media_type, file_name = service.resolve_preview(receipt.id, acting_user)
        assert (resolved_path, media_type, file_name) == (
            receipt_dir / pdf_name,
            "application/pdf",
            pdf_name,
        )

        refreshed_meta = service._read_meta(receipt_dir)
        expected_meta = {
            "preview_kind": "pdf",
            "preview_file_name": pdf_name,
            "preview_media_type": "application/pdf",
            "preview_rendered_with": "",
        }
        for meta_key, expected_value in expected_meta.items():
            assert refreshed_meta[meta_key] == expected_value
    finally:
        get_settings.cache_clear()
def test_receipt_folder_train_ticket_extracts_passenger_from_id_line_and_purchase_name(
    monkeypatch,
    tmp_path,
) -> None:
    """Train-ticket receipts should expose fields derived from the raw OCR text.

    The OCR document carries only five pre-extracted fields (with a truncated
    merchant name); the saved receipt is expected to report the merchant as
    "中国铁路" and to list passenger, stations, ID number, e-ticket number,
    invoice date, departure time, carriage and seat.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        acting_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        pdf_name = "2月20_武汉-上海.pdf"
        ocr_text = (
            "电子发票(铁路电子客票)\n"
            "发票号码:26429165800002785705 湖北\n"
            "开票日期:2026年05月18日\n"
            "武汉站 G458 上海虹桥站\n"
            "Wuhan Shanghaihongqiao\n"
            "2026年02月20日 07:55开 06车01B号 二等座\n"
            "票价:¥354.00\n"
            "4201061987****1615 曹笑竹\n"
            "电子客票号:6580061086021391007342026\n"
            "购买方名称:曹笑竹 统一社会信用代码:\n"
            "买票请到12306 发货请到95306\n"
            "中国铁路祝您旅途愉快"
        )
        # (key, label, value) triples for the fields the OCR step already produced.
        provided_fields = (
            ("merchant_name", "商户", "电子发票(铁路"),
            ("amount", "金额", "354元"),
            ("date", "列车出发时间", "2026-02-20 07:55"),
            ("trip_no", "车次", "G458"),
            ("route", "行程", "武汉-上海"),
        )

        service = ReceiptFolderService()
        ocr_document = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="电子发票(铁路电子客票);发票监;统一 制",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=[
                OcrRecognizeFieldRead(key=key, label=label, value=value)
                for key, label, value in provided_fields
            ],
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type="application/pdf",
            current_user=acting_user,
            document=ocr_document,
        )
        assert receipt.merchant_name == "中国铁路"

        detail = service.get_receipt(receipt.id, acting_user)
        fields = {}
        for field in detail.fields:
            fields[field.label] = field.value

        expected_fields = {
            "商户": "中国铁路",
            "乘车人": "曹笑竹",
            "出发地点": "武汉",
            "到达地点": "上海虹桥",
            "身份证号": "4201061987****1615",
            "电子客票号": "6580061086021391007342026",
            "开票日期": "2026-05-18",
            "列车出发时间": "2026-02-20 07:55",
            "车厢": "06车",
            "座位号": "01B",
        }
        for label, expected_value in expected_fields.items():
            assert fields[label] == expected_value
    finally:
        get_settings.cache_clear()
def test_receipt_folder_train_ticket_repairs_invalid_generated_fields_from_ocr_text(
    monkeypatch,
    tmp_path,
) -> None:
    """Wrong pre-extracted ticket fields should be corrected from the OCR text.

    The OCR document supplies departure station, arrival station and passenger
    values that do not match the ticket text ("二等座", "扫码无效",
    "席别二等座"). The receipt detail is expected to show the values found in
    the raw text instead, plus ID number, seat class, carriage, seat and fare.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        acting_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        image_name = "2月21日_上海-深圳.png"
        ocr_text = (
            "行程单示意\n"
            "出票渠道:示例平台\n"
            "非官方车票\n"
            "不可报销\n"
            "仅供演示\n"
            "创建日期2026年02月15日\n"
            "订单号DEMO202602210001\n"
            "单据编号DEMO-IT-000001\n"
            "上海虹桥\n"
            "G999\n"
            "深圳北\n"
            "\n"
            "\n"
            "Shanghaihongqiao\n"
            "Shenzhenbei\n"
            "2026年02月21日\n"
            "08:30出发\n"
            "全程约7小时30分\n"
            "15:00到达\n"
            "DEMO\n"
            "乘客:示例旅客\n"
            "车厢05车\n"
            "席别:二等座\n"
            "-\n"
            "扫码无效\n"
            "证件号310101199001010000\n"
            "座位08A\n"
            "票价¥438.00\n"
            "仅为演示"
        )
        # (key, label, value) triples; the last three carry deliberately wrong values.
        provided_fields = (
            ("amount", "金额", "438元"),
            ("date", "列车出发时间", "2026-02-21 08:30"),
            ("invoice_number", "票据号码", "DEMO202602210001"),
            ("trip_no", "车次", "G999"),
            ("route", "行程", "上海-深圳"),
            ("departure_station", "出发地点", "二等座"),
            ("arrival_station", "到达地点", "扫码无效"),
            ("passenger_name", "乘车人", "席别二等座"),
        )

        service = ReceiptFolderService()
        ocr_document = OcrRecognizeDocumentRead(
            filename=image_name,
            media_type="image/png",
            text=ocr_text,
            summary="行程单示意;出票渠道:示例平台;非官方车票",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=[
                OcrRecognizeFieldRead(key=key, label=label, value=value)
                for key, label, value in provided_fields
            ],
        )
        receipt = service.save_receipt(
            filename=image_name,
            content=b"fake image",
            media_type="image/png",
            current_user=acting_user,
            document=ocr_document,
        )

        detail = service.get_receipt(receipt.id, acting_user)
        fields = {}
        for field in detail.fields:
            fields[field.label] = field.value

        expected_fields = {
            "出发地点": "上海虹桥",
            "到达地点": "深圳北",
            "乘车人": "示例旅客",
            "身份证号": "310101199001010000",
            "席别": "二等座",
            "车厢": "05车",
            "座位号": "08A",
            "票价": "438.00元",
        }
        for label, expected_value in expected_fields.items():
            assert fields[label] == expected_value
    finally:
        get_settings.cache_clear()
def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()