Files
X-Financial/server/tests/test_document_intelligence.py
caoxiaozhu 84a8998e59 feat(server): 票据文件夹资产缓存与文档预览统一生成
- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
2026-06-23 09:42:00 +08:00

153 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool
from app.services.document_intelligence import DocumentIntelligenceService, build_document_insight
def test_build_document_insight_prefers_transport_for_didi_text_with_hotel_noise() -> None:
    """Didi ride text that also mentions a hotel is still classified as a taxi receipt.

    The OCR text contains a hotel name as noise; the test expects the
    transport scene, the taxi receipt type/label, and an amount field of 48.
    """
    ocr_text = "滴滴出行电子发票 订单号 12345678 上车点 深圳湾 下车点 后海 全季酒店 里程 12.4 公里 金额 48 元"
    result = build_document_insight(
        filename="didi-trip.png",
        summary="滴滴出行行程单",
        text=ocr_text,
    )

    assert result.document_type == "taxi_receipt"
    assert result.document_type_label == "出租车/网约车票据"
    assert result.scene_code == "transport"

    # Collect every value reported under the amount label, then check membership.
    amount_values = [item.value for item in result.fields if item.label == "金额"]
    assert "48元" in amount_values
def test_document_intelligence_service_uses_rule_result_when_preview_available() -> None:
    """The service reports a rule-based classification even when a preview image is supplied.

    Builds a throwaway in-memory SQLite session, runs the service on noisy
    OCR text (hotel name mixed into a Didi receipt) together with a preview
    data URL, and expects the taxi receipt type with ``classification_source``
    equal to ``"rule"``.
    """
    engine = create_engine(
        "sqlite+pysqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    session = sessionmaker(bind=engine, autoflush=False, autocommit=False)()
    try:
        insight = DocumentIntelligenceService(session).build_document_insight(
            filename="mixed-noise.png",
            summary="OCR 混入酒店名称",
            text="全季酒店 滴滴出行 订单号 12345678 上车 下车 金额 52 元",
            preview_data_url="data:image/png;base64,ZmFrZQ==",
        )
    finally:
        session.close()
        # StaticPool keeps its single connection open until the engine is
        # disposed; release it so the test does not leak the SQLite handle.
        engine.dispose()

    assert insight.document_type == "taxi_receipt"
    assert insight.classification_source == "rule"
def test_document_intelligence_extracts_larger_decimal_amount_from_multiple_candidates() -> None:
    """When the text lists both 1 and 13.4 as amounts, an amount field of 13.4 is expected."""
    result = build_document_insight(
        filename="taxi-amount.png",
        summary="滴滴出行电子行程单",
        text="滴滴出行 支付金额 1 元,实付 13.4 元,订单号 12345678",
    )

    # Gather the values of all fields labelled as the amount.
    amount_values = [item.value for item in result.fields if item.label == "金额"]
    assert "13.4元" in amount_values
def test_document_intelligence_extracts_hotel_total_fee_instead_of_date_year() -> None:
    """The hotel total fee (828) is extracted as the amount, not the year of a nearby date.

    The text places a date directly after the amount keyword, so the year
    2026 must not be reported as an amount value.
    """
    result = build_document_insight(
        filename="hotel-invoice.png",
        summary="酒店住宿票据",
        text="北京中心酒店 金额 2026-02-20 入住 总费用是828元 离店日期 2026-02-21",
    )

    assert result.document_type == "hotel_invoice"

    # One pass over the fields serves both the positive and the negative check.
    amount_values = [item.value for item in result.fields if item.label == "金额"]
    assert "828元" in amount_values
    assert "2026元" not in amount_values
def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice_text() -> None:
    """A railway e-ticket invoice is classified as a train ticket in the travel scene."""
    # One OCR line per entry; joined with newlines to form the document text.
    ocr_lines = [
        "电子发票(铁路电子客票)",
        "发票号码:26319166100006175398",
        "上海虹桥站",
        "武汉站",
        "G456",
        "二等座",
        "票价:¥354.00",
    ]
    result = build_document_insight(
        filename="铁路电子客票.pdf",
        summary="电子发票(铁路电子客票)",
        text="\n".join(ocr_lines),
    )

    assert result.document_type == "train_ticket"
    assert result.document_type_label == "火车/高铁票"
    assert result.scene_code == "travel"

    amount_values = [item.value for item in result.fields if item.label == "金额"]
    assert "354元" in amount_values
def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
    """OCR text with English station names and stripped labels is still read as a train ticket.

    Expects the train ticket type/label, the travel scene, and the amount,
    departure time, train number and route fields listed below.
    """
    # One OCR line per entry; joined with newlines to form the document text.
    ocr_lines = [
        ":26429165800002785705",
        ":2026 05 18",
        "G458",
        "Wuhan",
        "Shanghaihongqiao",
        "2026 02 20 07:55",
        "06 01B",
        ": 354.00",
        "4201061987****1615",
        ":6580061086021391007342026",
        "12306 95306",
    ]
    result = build_document_insight(
        filename="2月20_武汉-上海.pdf",
        summary=":26429165800002785705:2026 05 18Wuhan Shanghaihongqiao G458",
        text="\n".join(ocr_lines),
    )

    assert result.document_type == "train_ticket"
    assert result.document_type_label == "火车/高铁票"
    assert result.scene_code == "travel"

    # Index field values by label; a missing label surfaces as a KeyError below.
    value_by_label = {item.label: item.value for item in result.fields}
    expected_values = {
        "金额": "354元",
        "列车出发时间": "2026-02-20 07:55",
        "车次/航班": "G458",
        "行程": "武汉-上海",
    }
    for label, expected in expected_values.items():
        assert value_by_label[label] == expected
def test_document_intelligence_labels_train_ticket_date_as_train_departure_time() -> None:
    """The date field of a train ticket carries the departure time, not the invoice date.

    The text contains both an invoice date and a departure date/time; the
    ``date`` field must use the departure label and value, and no field may
    use the invoice-date label.
    """
    text_segments = [
        "中国铁路电子客票 开票日期 2026-02-18",
        "G456 上海虹桥-武汉 2026-02-20 08:30开 票价:¥354.00",
    ]
    result = build_document_insight(
        filename="铁路电子客票.pdf",
        summary="铁路电子客票",
        text=" ".join(text_segments),
    )

    assert result.document_type == "train_ticket"

    departure_fields = [
        item
        for item in result.fields
        if item.key == "date"
        and item.label == "列车出发时间"
        and item.value == "2026-02-20 08:30"
    ]
    assert departure_fields
    assert all(item.label != "开票日期" for item in result.fields)
def test_document_intelligence_service_keeps_rule_fields_without_model_correction() -> None:
    """The service returns the rule-extracted amount unchanged and emits no warnings.

    Builds a throwaway in-memory SQLite session, runs the service on a Didi
    receipt whose only amount is 1, supplies a preview data URL, and expects
    the amount field to stay at 1 with an empty ``warnings`` tuple.
    """
    engine = create_engine(
        "sqlite+pysqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    session = sessionmaker(bind=engine, autoflush=False, autocommit=False)()
    try:
        insight = DocumentIntelligenceService(session).build_document_insight(
            filename="didi-corrected.png",
            summary="滴滴出行电子行程单",
            text="滴滴出行 支付金额 1 元 订单号 12345678",
            preview_data_url="data:image/png;base64,ZmFrZQ==",
        )
    finally:
        session.close()
        # StaticPool keeps its single connection open until the engine is
        # disposed; release it so the test does not leak the SQLite handle.
        engine.dispose()

    assert any(field.label == "金额" and field.value == "1元" for field in insight.fields)
    assert insight.warnings == ()