Files
X-Financial/server/tests/test_expense_claim_attachment_analysis_regression.py
caoxiaozhu 84a8998e59 feat(server): 票据文件夹资产缓存与文档预览统一生成
- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00

170 lines
6.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import json
from decimal import Decimal
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.ocr import OcrService
from test_reimbursement_endpoints import build_client, seed_claim
def test_train_ticket_attachment_with_structured_fields_is_not_flagged_as_unreadable(
    monkeypatch,
    tmp_path,
) -> None:
    """Uploading a train-ticket PDF with usable OCR text must pass analysis.

    The stubbed OCR result reports ``avg_score=0.0`` and ``line_count=0`` but
    still carries text with a train number, stations, a departure timestamp
    and an amount. The upload response must classify the attachment as a
    ``train_ticket``, report severity ``pass``, and contain no analysis point
    with the "no valid text recognized" or "train departure time not
    recognized" messages.
    """
    pdf_name = "2月20_武汉-上海.pdf"
    # NOTE(review): the field labels look stripped (values start with a bare
    # colon) — presumably this mimics a PDF whose CJK glyphs were not
    # extracted; confirm against the real sample.
    ocr_text = (
        ":26429165800002785705\n"
        ":2026 05 18\n"
        "G458\n"
        "Wuhan\n"
        "Shanghaihongqiao\n"
        "2026 02 20 07:55\n"
        "06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def stub_recognize_files(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        # Always return one successful document, regardless of the input files.
        recognized = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    def storage_root(self):
        # Keep every attachment written by the endpoint inside the pytest tmp dir.
        return tmp_path

    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", storage_root)
    monkeypatch.setattr(OcrService, "recognize_files", stub_recognize_files)

    client, session_factory = build_client()

    # Seed a travel claim whose single item matches the ticket on the stub.
    with session_factory() as session:
        claim, item = seed_claim(session)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        session.commit()
        claim_id, item_id = claim.id, item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    pdf_upload = (pdf_name, b"%PDF-1.4 fake", "application/pdf")
    upload_response = client.post(
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment",
        headers=auth_headers,
        files=[("file", pdf_upload)],
    )

    assert upload_response.status_code == 200

    attachment = upload_response.json()["attachment"]
    analysis = attachment["analysis"]
    analysis_points = analysis["points"]

    assert attachment["document_info"]["document_type"] == "train_ticket"
    assert analysis["severity"] == "pass"
    # Neither the "unreadable" nor the "missing departure time" finding may appear.
    for forbidden in ("未识别到有效文字", "未识别到列车出发时间"):
        assert all(forbidden not in point for point in analysis_points)
def test_attachment_meta_read_repairs_stale_unreadable_train_ticket_analysis(
    monkeypatch,
    tmp_path,
) -> None:
    """Reading attachment meta must replace a stale "unreadable" analysis.

    After a normal upload, the stored ``*.meta.json`` file is overwritten with
    a high-severity analysis whose points carry the "no valid text recognized"
    and "train departure time not recognized" messages. The meta endpoint must
    then return severity ``pass`` with neither message in its points.
    """
    pdf_name = "2月20_武汉-上海.pdf"
    # Same ticket content as a line-per-field OCR result would give, but with
    # several fields merged onto shared lines.
    ocr_text = (
        ":26429165800002785705 :2026 05 18\n"
        "G458\n"
        "Wuhan Shanghaihongqiao\n"
        "2026 02 20 07:55 06 01B\n"
        ": 354.00\n"
        "4201061987****1615\n"
        ":6580061086021391007342026\n"
        "12306 95306"
    )

    def stub_recognize_files(
        self,
        files: list[tuple[str, bytes, str | None]],
    ) -> OcrRecognizeBatchRead:
        # Always return one successful document, regardless of the input files.
        recognized = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type="application/pdf",
            text=ocr_text,
            summary="Wuhan Shanghaihongqiao G458 354.00",
            avg_score=0.0,
            line_count=0,
            page_count=1,
            warnings=[],
        )
        return OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )

    def storage_root(self):
        # Keep attachment and meta files inside the pytest tmp dir so the
        # test can locate and rewrite them below.
        return tmp_path

    monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", storage_root)
    monkeypatch.setattr(OcrService, "recognize_files", stub_recognize_files)

    client, session_factory = build_client()

    # Seed a travel claim whose single item matches the ticket on the stub.
    with session_factory() as session:
        claim, item = seed_claim(session)
        claim.expense_type = "travel"
        claim.reason = "武汉-上海差旅"
        claim.location = "上海"
        claim.amount = Decimal("354.00")
        item.item_type = "train_ticket"
        item.item_reason = "武汉-上海"
        item.item_location = "上海"
        item.item_amount = Decimal("354.00")
        session.commit()
        claim_id, item_id = claim.id, item.id

    auth_headers = {"x-auth-username": "emp-1", "x-auth-name": "Zhang San"}
    attachment_url = (
        f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment"
    )

    upload_response = client.post(
        attachment_url,
        headers=auth_headers,
        files=[("file", (pdf_name, b"%PDF-1.4 fake", "application/pdf"))],
    )
    assert upload_response.status_code == 200

    # Overwrite the persisted analysis with the stale high-risk verdict.
    meta_path = next(tmp_path.rglob("*.meta.json"))
    stored_meta = json.loads(meta_path.read_text(encoding="utf-8"))
    stale_analysis = {
        "severity": "high",
        "label": "高风险",
        "headline": "AI提示附件不符合票据校验条件",
        "summary": "当前附件存在明显异常,票据类型与当前费用场景不匹配,或无法作为有效报销材料。",
        "points": [
            "附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。",
            "日期字段:未识别到列车出发时间或乘车日期。",
        ],
        "rule_basis": [],
        "suggestion": "建议过滤当前不匹配的票据,重新上传符合当前费用场景的清晰原件。",
    }
    stored_meta["analysis"] = stale_analysis
    meta_path.write_text(
        json.dumps(stored_meta, ensure_ascii=False),
        encoding="utf-8",
    )

    meta_response = client.get(f"{attachment_url}/meta", headers=auth_headers)
    assert meta_response.status_code == 200

    analysis = meta_response.json()["analysis"]
    analysis_points = analysis["points"]

    assert analysis["severity"] == "pass"
    # The stale findings written into the meta file must not be returned.
    for forbidden in ("未识别到有效文字", "未识别到列车出发时间"):
        assert all(forbidden not in point for point in analysis_points)