feat: 完善差旅票据行程提取与费用明细回填逻辑
增强文档智能识别的票据场景关键词和字段提取能力，优化会话关联草稿报销单的解析路径，修复费用明细合并和票据去重边界问题，前端改进报销创建和审批详情交互，补充单元测试覆盖。
This commit is contained in:
@@ -184,6 +184,7 @@ AMOUNT_PATTERNS = (
|
||||
re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
|
||||
)
|
||||
# Calendar date with a four-digit 19xx/20xx year; accepts "-", "/", "." or the
# CJK 年/月/日 markers as separators (e.g. "2024-05-01", "2024年5月1日").
DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
# 24-hour clock time "H:MM"/"HH:MM" that is not embedded in a longer digit run.
# Both the ASCII colon and the full-width colon "：" are accepted, because
# Chinese receipts and OCR output frequently use the full-width form.
TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:：]([0-5]\d)(?!\d)")
# Labelled identifiers. The label may be followed by any mix of ASCII colons,
# full-width colons and whitespace before the captured value.
INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[:：\s]*([A-Za-z0-9-]{6,24})")
INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[:：\s]*([A-Za-z0-9-]{6,24})")
# Train number / flight number following a "车次" or "航班(号)" label.
TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[:：\s]*([A-Za-z0-9]{2,12})")
|
||||
@@ -192,6 +193,58 @@ MERCHANT_PATTERNS = (
|
||||
re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[::\s]*([A-Za-z0-9\u4e00-\u9fa5()()·&\\-]{2,40})"),
|
||||
re.compile(r"([A-Za-z0-9\u4e00-\u9fa5()()·&\\-]{2,40}(?:酒店|宾馆|饭店|酒楼|餐厅|航空|铁路|滴滴出行|停车场|服务区))"),
|
||||
)
|
||||
# Normalised (lower-case) field keys that are treated as carrying a date or
# time value when deciding whether a field's label should be replaced by a
# document-type specific trip-date label.
DATE_FIELD_KEYS = {
    # Generic and invoice-level keys.
    "date", "time",
    "issued_at", "invoice_date", "issue_date",
    # Generic trip / journey keys.
    "travel_date", "trip_date", "journey_date",
    "departure_date", "departure_time",
    "depart_date", "depart_time",
    "boarding_date", "boarding_time",
    "scheduled_departure_time",
    "start_time",
    # Train-specific keys.
    "train_date", "train_time", "train_departure_time",
    # Flight-specific keys.
    "flight_date", "flight_time",
    # Ride-hailing / taxi keys.
    "ride_date", "ride_time", "pickup_time",
}
|
||||
# Display label used for the date field of each travel-related document type.
# Membership in this mapping is also what marks a document type as a
# "trip" document for date extraction and label rewriting.
TRIP_DATE_LABEL_BY_DOCUMENT_TYPE = {
    "train_ticket": "列车出发时间",
    "flight_itinerary": "起飞日期",
    "taxi_receipt": "乘车时间",
    "transport_receipt": "乘车时间",
    "parking_toll_receipt": "通行日期",
}
|
||||
# Substrings that mark a field label as describing a date/time. They are
# matched with a plain ``in`` test against the label with spaces removed.
TRIP_DATE_FIELD_LABEL_TOKENS = (
    # Generic date / time wording.
    "日期", "时间", "开票日期", "发生时间",
    # Departure wording shared by several transport types.
    "行程日期", "出发日期", "出发时间",
    # Train wording.
    "列车出发时间", "发车日期", "发车时间", "开车时间",
    # Boarding / riding wording.
    "乘车日期", "乘车时间",
    # Flight wording.
    "起飞日期", "航班日期",
    # Ride-hailing wording.
    "上车时间", "用车时间",
)
|
||||
|
||||
|
||||
class DocumentIntelligenceService:
|
||||
@@ -212,7 +265,10 @@ class DocumentIntelligenceService:
|
||||
compact = re.sub(r"\s+", "", raw_text).lower()
|
||||
rule_match = _match_document_rule(compact)
|
||||
base_rule = rule_match.rule or DEFAULT_RULE
|
||||
fields = tuple(_extract_document_fields(raw_text))
|
||||
fields = _apply_document_type_field_labels(
|
||||
tuple(_extract_document_fields(raw_text, base_rule.document_type)),
|
||||
base_rule.document_type,
|
||||
)
|
||||
rule_insight = DocumentInsight(
|
||||
document_type=base_rule.document_type,
|
||||
document_type_label=base_rule.document_type_label,
|
||||
@@ -275,7 +331,10 @@ class DocumentIntelligenceService:
|
||||
for item in parsed.evidence
|
||||
if str(item or "").strip()
|
||||
][:4]
|
||||
normalized_fields = _normalize_llm_document_fields(parsed.fields)
|
||||
normalized_fields = _apply_document_type_field_labels(
|
||||
tuple(_normalize_llm_document_fields(parsed.fields)),
|
||||
normalized_type,
|
||||
)
|
||||
|
||||
return LlmDocumentClassification(
|
||||
document_type=normalized_type,
|
||||
@@ -312,7 +371,10 @@ class DocumentIntelligenceService:
|
||||
scene_code=rule_insight.scene_code,
|
||||
scene_label=rule_insight.scene_label,
|
||||
expense_type=rule_insight.expense_type,
|
||||
fields=merged_fields,
|
||||
fields=_apply_document_type_field_labels(
|
||||
merged_fields,
|
||||
rule_insight.document_type,
|
||||
),
|
||||
classification_source=rule_insight.classification_source,
|
||||
classification_confidence=rule_insight.classification_confidence,
|
||||
evidence=rule_insight.evidence,
|
||||
@@ -337,7 +399,10 @@ class DocumentIntelligenceService:
|
||||
scene_code=rule_insight.scene_code,
|
||||
scene_label=rule_insight.scene_label,
|
||||
expense_type=rule_insight.expense_type,
|
||||
fields=merged_fields,
|
||||
fields=_apply_document_type_field_labels(
|
||||
merged_fields,
|
||||
rule_insight.document_type,
|
||||
),
|
||||
classification_source=rule_insight.classification_source,
|
||||
classification_confidence=rule_insight.classification_confidence,
|
||||
evidence=rule_insight.evidence,
|
||||
@@ -354,7 +419,7 @@ class DocumentIntelligenceService:
|
||||
scene_code=rule.scene_code if parsed.scene_code == "other" else parsed.scene_code,
|
||||
scene_label=rule.scene_label if parsed.scene_label == "其他票据" else parsed.scene_label,
|
||||
expense_type=rule.expense_type if parsed.expense_type == "other" else parsed.expense_type,
|
||||
fields=merged_fields,
|
||||
fields=_apply_document_type_field_labels(merged_fields, rule.document_type),
|
||||
classification_source=source,
|
||||
classification_confidence=max(parsed.confidence, rule_insight.classification_confidence),
|
||||
evidence=tuple(parsed.evidence or rule_insight.evidence),
|
||||
@@ -464,8 +529,49 @@ def _normalize_llm_document_field_key(key: str, label: str) -> str:
|
||||
token in compact_label for token in ("金额", "价税合计", "合计", "总额", "总计", "票价", "支付金额", "实付金额", "实收金额")
|
||||
):
|
||||
return "amount"
|
||||
if compact_key in {"date", "time", "issued_at", "invoice_date"} or any(
|
||||
token in compact_label for token in ("日期", "时间", "开票日期", "发生时间")
|
||||
if compact_key in {
|
||||
"travel_date",
|
||||
"trip_date",
|
||||
"journey_date",
|
||||
"departure_date",
|
||||
"departure_time",
|
||||
"depart_date",
|
||||
"depart_time",
|
||||
"boarding_date",
|
||||
"boarding_time",
|
||||
"train_date",
|
||||
"train_time",
|
||||
"train_departure_time",
|
||||
"scheduled_departure_time",
|
||||
"flight_date",
|
||||
"flight_time",
|
||||
"ride_date",
|
||||
"ride_time",
|
||||
"pickup_time",
|
||||
"start_time",
|
||||
} or any(
|
||||
token in compact_label
|
||||
for token in (
|
||||
"行程日期",
|
||||
"出发日期",
|
||||
"出发时间",
|
||||
"列车出发时间",
|
||||
"发车日期",
|
||||
"发车时间",
|
||||
"开车时间",
|
||||
"乘车日期",
|
||||
"乘车时间",
|
||||
"起飞日期",
|
||||
"航班日期",
|
||||
"上车时间",
|
||||
"用车时间",
|
||||
)
|
||||
):
|
||||
return "trip_date"
|
||||
if compact_key in {"issued_at", "issue_date", "invoice_date"} or "开票日期" in compact_label:
|
||||
return "invoice_date"
|
||||
if compact_key in {"date", "time"} or any(
|
||||
token in compact_label for token in ("日期", "时间", "发生时间")
|
||||
):
|
||||
return "date"
|
||||
if compact_key in {"merchant_name", "merchant", "seller_name", "vendor_name"} or any(
|
||||
@@ -504,7 +610,7 @@ def _normalize_llm_document_field_value(key: str, value: str) -> str:
|
||||
return ""
|
||||
text_value = format(candidate.quantize(Decimal("0.01")), "f").rstrip("0").rstrip(".")
|
||||
return f"{text_value}元"
|
||||
if key == "date":
|
||||
if key in {"date", "time", "invoice_date", "trip_date"}:
|
||||
return _extract_date(raw_value) or _clean_field_value(raw_value)
|
||||
if key == "route":
|
||||
return _extract_route(raw_value) or _clean_field_value(
|
||||
@@ -517,6 +623,8 @@ def _llm_document_field_label(key: str) -> str:
|
||||
return {
|
||||
"amount": "金额",
|
||||
"date": "日期",
|
||||
"invoice_date": "开票日期",
|
||||
"trip_date": "行程日期",
|
||||
"merchant_name": "商户",
|
||||
"invoice_number": "票据号码",
|
||||
"invoice_code": "发票代码",
|
||||
@@ -525,6 +633,35 @@ def _llm_document_field_label(key: str) -> str:
|
||||
}.get(key, key)
|
||||
|
||||
|
||||
def _apply_document_type_field_labels(
    fields: tuple[DocumentField, ...],
    document_type: str,
) -> tuple[DocumentField, ...]:
    """Relabel date-like fields with the trip-date label of ``document_type``.

    Args:
        fields: Extracted document fields, in display order.
        document_type: Document type code; it is stripped and lower-cased
            before being looked up in ``TRIP_DATE_LABEL_BY_DOCUMENT_TYPE``.

    Returns:
        ``fields`` itself when the document type has no trip-date label.
        Otherwise a new tuple in the same order, where every date-like field
        is rebuilt with the type-specific label (key and value kept) and all
        other fields, including invoice-date fields, are passed through.
    """
    type_code = str(document_type or "").strip().lower()
    trip_label = TRIP_DATE_LABEL_BY_DOCUMENT_TYPE.get(type_code)
    if not trip_label:
        return fields

    def relabel(item: DocumentField) -> DocumentField:
        key = str(item.key or "").strip().lower()
        label = str(item.label or "").replace(" ", "")

        # Invoice-issue dates keep their own label: they are not the trip date.
        if key in {"issued_at", "issue_date", "invoice_date"}:
            return item
        if "开票日期" in label or "发票日期" in label:
            return item

        looks_like_date = key in DATE_FIELD_KEYS
        if not looks_like_date:
            looks_like_date = any(
                token in label for token in TRIP_DATE_FIELD_LABEL_TOKENS
            )
        if not looks_like_date:
            return item

        return DocumentField(key=item.key, label=trip_label, value=item.value)

    return tuple(relabel(item) for item in fields)
|
||||
|
||||
|
||||
def _merge_document_fields(
|
||||
base_fields: tuple[DocumentField, ...],
|
||||
override_fields: tuple[DocumentField, ...],
|
||||
@@ -540,13 +677,13 @@ def _merge_document_fields(
|
||||
return tuple(merged[key] for key in order if key in merged)
|
||||
|
||||
|
||||
def _extract_document_fields(text: str) -> list[DocumentField]:
|
||||
def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
|
||||
fields: list[DocumentField] = []
|
||||
amount = _extract_amount(text)
|
||||
if amount:
|
||||
fields.append(DocumentField(key="amount", label="金额", value=amount))
|
||||
|
||||
date_value = _extract_date(text)
|
||||
date_value = _extract_date(text, document_type=document_type)
|
||||
if date_value:
|
||||
fields.append(DocumentField(key="date", label="日期", value=date_value))
|
||||
|
||||
@@ -594,10 +731,33 @@ def _extract_amount(text: str) -> str:
|
||||
return f"{text_value}元"
|
||||
|
||||
|
||||
def _extract_date(text: str, *, document_type: str = "") -> str:
    """Pick the most relevant date found in ``text``.

    Args:
        text: Raw document text to scan with ``DATE_PATTERN``.
        document_type: Optional document type code (stripped and lower-cased
            before use). Types listed in ``TRIP_DATE_LABEL_BY_DOCUMENT_TYPE``
            switch on trip-date selection.

    Returns:
        An empty string when ``text`` contains no date. For trip document
        types: the formatted value of the best-scoring date that is not in an
        invoice-date context (the earliest one wins a tie), or an empty string
        when every date is in an invoice-date context. For all other types:
        the formatted value of the first date in the text.
    """
    found = list(DATE_PATTERN.finditer(text))
    if not found:
        return ""

    type_code = str(document_type or "").strip().lower()
    if type_code not in TRIP_DATE_LABEL_BY_DOCUMENT_TYPE:
        return _format_date_match_with_time(text, found[0])

    best_value = ""
    best_score = None
    for hit in found:
        formatted = _format_date_match_with_time(text, hit)
        if not formatted:
            continue
        # An invoice issue date is never reported as the trip date.
        if _is_invoice_date_context(text, hit):
            continue
        score = _score_trip_date_context(text, hit, formatted, False)
        # Strict comparison keeps the earliest match among equal scores.
        if best_score is None or score > best_score:
            best_score = score
            best_value = formatted
    return best_value
|
||||
|
||||
|
||||
def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
|
||||
raw_value = str(match.group(1) or "").strip()
|
||||
normalized = raw_value.replace("年", "-").replace("月", "-").replace("日", "")
|
||||
normalized = normalized.replace("/", "-").replace(".", "-")
|
||||
@@ -605,7 +765,60 @@ def _extract_date(text: str) -> str:
|
||||
if len(parts) != 3:
|
||||
return raw_value
|
||||
year, month, day = parts
|
||||
return f"{year.zfill(4)}-{month.zfill(2)}-{day.zfill(2)}"
|
||||
date_value = f"{year.zfill(4)}-{month.zfill(2)}-{day.zfill(2)}"
|
||||
surrounding = str(text or "")[max(0, match.start() - 18): match.end() + 24]
|
||||
time_match = TIME_PATTERN.search(surrounding)
|
||||
if time_match:
|
||||
hour = str(time_match.group(1) or "").zfill(2)
|
||||
minute = str(time_match.group(2) or "").zfill(2)
|
||||
return f"{date_value} {hour}:{minute}"
|
||||
return date_value
|
||||
|
||||
|
||||
def _is_invoice_date_context(text: str, match: re.Match[str]) -> bool:
    """Tell whether a date match sits next to invoice-issue wording.

    The check looks at a window from 12 characters before the match to
    8 characters after it, with all whitespace removed.

    Args:
        text: The text the match was found in.
        match: A date match inside ``text``.

    Returns:
        True when the window contains "开票" (which also covers "开票日期" and
        "开票时间") or "发票日期".
    """
    window = str(text or "")[max(0, match.start() - 12): match.end() + 8]
    # Drop every kind of whitespace (tabs, newlines, full-width spaces), not
    # only ASCII spaces, so spaced-out labels such as "开 票 日 期" still match.
    compact = "".join(window.split())
    return "开票" in compact or "发票日期" in compact
|
||||
|
||||
|
||||
def _score_trip_date_context(
    text: str,
    match: re.Match[str],
    value: str,
    invoice_context: bool,
) -> int:
    """Score how likely a date match is the travel date of a trip document.

    The score is built from the formatted value and from a window of
    32 characters on each side of the match, with all whitespace removed.

    Args:
        text: The text the match was found in.
        match: A date match inside ``text``.
        value: The formatted value produced for the match.
        invoice_context: Whether the match is in an invoice-date context.

    Returns:
        The sum of: -20 for an invoice context, +8 when ``value`` carries a
        clock time, +6 for trip-date wording, +3 for train ticket wording,
        +2 for a trip-number-like token and +2 for a route-like phrase.
    """
    window = str(text or "")[max(0, match.start() - 32): match.end() + 32]
    # Remove all whitespace, including tabs, newlines and full-width spaces.
    compact = "".join(window.split())

    score = -20 if invoice_context else 0

    # A clock time may be written with an ASCII or a full-width colon.
    if ":" in value or "：" in value:
        score += 8

    trip_date_tokens = (
        "行程日期",
        "出发日期",
        "出发时间",
        "列车出发时间",
        "发车日期",
        "发车时间",
        "开车时间",
        "乘车日期",
        "乘车时间",
        "起飞日期",
        "起飞时间",
        "航班日期",
        "上车时间",
        "用车时间",
    )
    if any(token in compact for token in trip_date_tokens):
        score += 6

    train_tokens = ("车次", "检票", "二等座", "一等座", "商务座", "软卧", "硬卧")
    if any(token in compact for token in train_tokens):
        score += 3

    # An upper-case letter followed by digits, e.g. a train number.
    if re.search(r"[A-Z]\d{1,4}", compact):
        score += 2

    # "<origin><separator><destination>". "-" is a valid separator, so an ISO
    # date such as "2024-05" would match by itself; endpoints made only of
    # digits are therefore not accepted as a route.
    endpoint = r"[\u4e00-\u9fa5A-Za-z0-9()（）·]{2,20}"
    route_pattern = rf"({endpoint})(?:至|到|→|->|—|–|-)({endpoint})"
    if any(
        not origin.isdigit() and not destination.isdigit()
        for origin, destination in re.findall(route_pattern, compact)
    ):
        score += 2

    return score
|
||||
|
||||
|
||||
def _extract_merchant(text: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user