feat(server): 新增文档智能识别服务,扩展OCR接口支持 Azure Document Intelligence
This commit is contained in:
582
server/src/app/services/document_intelligence.py
Normal file
582
server/src/app/services/document_intelligence.py
Normal file
@@ -0,0 +1,582 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field, ValidationError
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.services.runtime_chat import RuntimeChatService
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class DocumentField:
|
||||
key: str
|
||||
label: str
|
||||
value: str
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class DocumentInsight:
|
||||
document_type: str
|
||||
document_type_label: str
|
||||
scene_code: str
|
||||
scene_label: str
|
||||
expense_type: str
|
||||
fields: tuple[DocumentField, ...] = ()
|
||||
classification_source: str = "rule"
|
||||
classification_confidence: float = 0.0
|
||||
evidence: tuple[str, ...] = ()
|
||||
warnings: tuple[str, ...] = ()
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class DocumentRule:
|
||||
document_type: str
|
||||
document_type_label: str
|
||||
scene_code: str
|
||||
scene_label: str
|
||||
expense_type: str
|
||||
keywords: tuple[str, ...]
|
||||
score_bias: float = 0.0
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class RuleMatch:
|
||||
rule: DocumentRule | None
|
||||
confidence: float
|
||||
evidence: tuple[str, ...]
|
||||
score: float
|
||||
|
||||
|
||||
class LlmDocumentClassification(BaseModel):
    """Schema of the JSON classification object returned by the model.

    Every field has a default, so a partial object still validates; the
    confidence is constrained to the closed interval [0.0, 1.0].
    """

    # Document type code; falls back to "other" when absent.
    document_type: str = Field(default="other")
    # Scene code and display label.
    scene_code: str = Field(default="other")
    scene_label: str = Field(default="其他票据")
    # Expense category code.
    expense_type: str = Field(default="other")
    # Model-reported confidence, validated to lie within [0.0, 1.0].
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Evidence strings supplied by the model.
    evidence: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
# Fallback rule used when no keyword rule matches: everything is "other"
# and there are no keywords to match on.
DEFAULT_RULE = DocumentRule(
    document_type="other",
    document_type_label="其他单据",
    scene_code="other",
    scene_label="其他票据",
    expense_type="other",
    keywords=(),
    score_bias=0.0,
)
|
||||
|
||||
# Keyword rules, evaluated in this order. On equal scores the earlier rule
# wins, because the matcher only replaces the best rule on a strictly
# higher score. The generic rules at the end carry a negative bias.
DOCUMENT_RULES: tuple[DocumentRule, ...] = (
    # --- travel ---
    DocumentRule(
        document_type="flight_itinerary",
        document_type_label="机票/航班行程单",
        scene_code="travel",
        scene_label="差旅票据",
        expense_type="travel",
        keywords=("电子行程单", "航班号", "航班", "机票", "登机", "航空", "客票"),
        score_bias=0.34,
    ),
    DocumentRule(
        document_type="train_ticket",
        document_type_label="火车/高铁票",
        scene_code="travel",
        scene_label="差旅票据",
        expense_type="travel",
        keywords=("高铁", "火车", "动车", "铁路", "车次", "检票", "二等座", "一等座"),
        score_bias=0.32,
    ),
    # --- hotel ---
    DocumentRule(
        document_type="hotel_invoice",
        document_type_label="酒店住宿票据",
        scene_code="hotel",
        scene_label="住宿票据",
        expense_type="hotel",
        keywords=("住宿", "房费", "客房", "入住", "离店", "酒店", "宾馆", "间夜"),
        score_bias=0.16,
    ),
    # --- transport ---
    DocumentRule(
        document_type="taxi_receipt",
        document_type_label="出租车/网约车票据",
        scene_code="transport",
        scene_label="交通票据",
        expense_type="transport",
        keywords=(
            "滴滴出行",
            "滴滴",
            "网约车",
            "出租车",
            "打车",
            "快车",
            "专车",
            "订单号",
            "上车",
            "下车",
            "起点",
            "终点",
            "里程",
            "司机",
        ),
        score_bias=0.38,
    ),
    DocumentRule(
        document_type="parking_toll_receipt",
        document_type_label="停车/通行费票据",
        scene_code="transport",
        scene_label="交通票据",
        expense_type="transport",
        keywords=("停车费", "通行费", "过路费", "收费站", "停车场", "停车"),
        score_bias=0.28,
    ),
    # --- meal ---
    DocumentRule(
        document_type="meal_receipt",
        document_type_label="餐饮票据",
        scene_code="meal",
        scene_label="餐饮票据",
        expense_type="meal",
        keywords=("餐饮", "餐费", "用餐", "饭店", "酒楼", "餐厅", "食品", "外卖", "咖啡"),
        score_bias=0.14,
    ),
    # --- office ---
    DocumentRule(
        document_type="office_invoice",
        document_type_label="办公用品票据",
        scene_code="office",
        scene_label="办公用品票据",
        expense_type="office",
        keywords=("办公用品", "文具", "耗材", "打印纸", "墨盒", "硒鼓", "键盘", "鼠标"),
        score_bias=0.14,
    ),
    # --- meeting ---
    DocumentRule(
        document_type="meeting_invoice",
        document_type_label="会议/会务票据",
        scene_code="meeting",
        scene_label="会务票据",
        expense_type="meeting",
        keywords=("会议", "会务", "会展", "论坛", "会议室", "会场"),
        score_bias=0.12,
    ),
    # --- training ---
    DocumentRule(
        document_type="training_invoice",
        document_type_label="培训票据",
        scene_code="training",
        scene_label="培训票据",
        expense_type="training",
        keywords=("培训", "课程", "讲师", "教材", "学费", "认证"),
        score_bias=0.12,
    ),
    # --- generic (negative bias) ---
    DocumentRule(
        document_type="vat_invoice",
        document_type_label="增值税发票",
        scene_code="other",
        scene_label="通用发票",
        expense_type="other",
        keywords=("发票代码", "发票号码", "价税合计", "增值税", "电子发票"),
        score_bias=-0.08,
    ),
    DocumentRule(
        document_type="receipt",
        document_type_label="一般收据/凭证",
        scene_code="other",
        scene_label="其他票据",
        expense_type="other",
        keywords=("收据", "凭证", "票据"),
        score_bias=-0.18,
    ),
)
|
||||
|
||||
# Lookup from document type code to its rule, in declaration order.
DOCUMENT_TYPE_RULE_MAP = {entry.document_type: entry for entry in DOCUMENT_RULES}

# Every type code accepted from the model: the rule types plus "other".
SUPPORTED_DOCUMENT_TYPES = (*DOCUMENT_TYPE_RULE_MAP, "other")
|
||||
|
||||
# Amount patterns, tried in order. The first requires a labelling keyword
# before the number; the second accepts any number followed by "元".
AMOUNT_PATTERNS = (
    re.compile(
        r"(?:价税合计|合计|金额|总额|票价|支付金额|实付金额|实收金额)"  # label
        r"[::\s¥¥]*"  # optional colons, whitespace, currency signs
        r"([0-9]+(?:[.,][0-9]{1,2})?)"  # integer part + up to 2 decimals
    ),
    re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)

# Year (19xx/20xx), month and day separated by "-", "/", "." or 年/月,
# with an optional trailing 日.
DATE_PATTERN = re.compile(
    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
)

# 6-24 alphanumeric/hyphen characters following a number-like label.
INVOICE_NUMBER_PATTERN = re.compile(
    r"(?:发票号码|票号|单号|订单号)[::\s]*([A-Za-z0-9-]{6,24})"
)
INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[::\s]*([A-Za-z0-9-]{6,24})")

# 2-12 alphanumeric characters following a train-number or flight label.
TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[::\s]*([A-Za-z0-9]{2,12})")

# Two runs of 2-12 CJK characters joined by 至, →, "->" or "-".
ROUTE_PATTERN = re.compile(
    r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})"
)

# Merchant patterns, tried in order: an explicitly labelled name first,
# then any name ending in one of the listed business suffixes.
MERCHANT_PATTERNS = (
    re.compile(
        r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)"  # label
        r"[::\s]*"  # optional colons / whitespace
        r"([A-Za-z0-9\u4e00-\u9fa5()()·&\\-]{2,40})"  # the name itself
    ),
    re.compile(
        r"([A-Za-z0-9\u4e00-\u9fa5()()·&\\-]{2,40}"
        r"(?:酒店|宾馆|饭店|酒楼|餐厅|航空|铁路|滴滴出行|停车场|服务区))"
    ),
)
|
||||
|
||||
|
||||
class DocumentIntelligenceService:
    """Classify expense documents from OCR output, with optional LLM review.

    The keyword rules always produce a baseline insight. When the service is
    constructed with a database session, a chat model is additionally asked
    to review the classification, and its answer is merged with the rule
    result according to confidence thresholds.
    """

    def __init__(self, db: Session | None = None) -> None:
        """Create the service.

        Args:
            db: Optional session. Without it no chat service is created and
                classification is rule-only.
        """
        self.runtime_chat_service = RuntimeChatService(db) if db is not None else None

    def build_document_insight(
        self,
        *,
        filename: str = "",
        summary: str = "",
        text: str = "",
        preview_data_url: str = "",
    ) -> DocumentInsight:
        """Build the classification insight for one document.

        Args:
            filename: Original file name; takes part in keyword matching.
            summary: Short summary of the document, if any.
            text: OCR text of the document.
            preview_data_url: Optional image URL passed to the vision model.

        Returns:
            The rule-based insight, or the merged rule/model insight when a
            model answer is available and accepted.
        """
        # Normalise every input exactly once so that all later steps
        # (matching, prompting, preview detection) see the same values.
        filename = str(filename or "").strip()
        summary = str(summary or "").strip()
        text = str(text or "").strip()
        preview_data_url = str(preview_data_url or "").strip()

        raw_text = " ".join([filename, summary, text]).strip()
        # Keyword matching runs on whitespace-free, lower-cased text.
        compact = re.sub(r"\s+", "", raw_text).lower()
        rule_match = _match_document_rule(compact)
        base_rule = rule_match.rule or DEFAULT_RULE
        fields = tuple(_extract_document_fields(raw_text))

        rule_insight = DocumentInsight(
            document_type=base_rule.document_type,
            document_type_label=base_rule.document_type_label,
            scene_code=base_rule.scene_code,
            scene_label=base_rule.scene_label,
            expense_type=base_rule.expense_type,
            fields=fields,
            classification_source="rule",
            classification_confidence=rule_match.confidence,
            evidence=rule_match.evidence,
        )

        llm_result = self._classify_with_model(
            filename=filename,
            summary=summary,
            text=text,
            preview_data_url=preview_data_url,
            rule_insight=rule_insight,
            fields=fields,
        )
        if llm_result is None:
            return rule_insight

        return self._merge_rule_and_model(
            rule_insight=rule_insight,
            llm_result=llm_result,
            fields=fields,
            # A whitespace-only URL is not a preview.
            has_preview=bool(preview_data_url),
        )

    def _classify_with_model(
        self,
        *,
        filename: str,
        summary: str,
        text: str,
        preview_data_url: str,
        rule_insight: DocumentInsight,
        fields: tuple[DocumentField, ...],
    ) -> tuple[str, LlmDocumentClassification] | None:
        """Ask the chat model to review the rule-based classification.

        The vision slot is tried first when a preview URL is given; if that
        yields no parseable answer, the text-only slots are tried.

        Returns:
            ``(source, classification)`` where source is ``"llm_vision"`` or
            ``"llm_text"``, or ``None`` when no chat service is configured,
            there is neither OCR text nor a summary, or no answer parsed.
        """
        if self.runtime_chat_service is None:
            return None

        trimmed_text = text.strip()
        if not trimmed_text and not summary.strip():
            return None

        # Facts handed to the model; the summary and OCR text are truncated.
        facts = {
            "filename": filename,
            "summary": summary[:300],
            "ocr_text_excerpt": trimmed_text[:2000],
            "rule_candidate": {
                "document_type": rule_insight.document_type,
                "document_type_label": rule_insight.document_type_label,
                "scene_code": rule_insight.scene_code,
                "scene_label": rule_insight.scene_label,
                "expense_type": rule_insight.expense_type,
                "confidence": round(rule_insight.classification_confidence, 2),
                "evidence": list(rule_insight.evidence),
            },
            "extracted_fields": [
                {"key": field.key, "label": field.label, "value": field.value}
                for field in fields
            ],
            "allowed_document_types": list(SUPPORTED_DOCUMENT_TYPES),
        }

        system_prompt = (
            "你是企业报销票据识别复核器。"
            "你的任务不是 OCR,而是在已有 OCR 文本和票据预览基础上判断票据类型。"
            "只输出 JSON 对象,不要输出 Markdown、解释或代码块。"
            "document_type 只能是:"
            f"{', '.join(SUPPORTED_DOCUMENT_TYPES)}。"
            "如果证据不足,返回 other。"
            "严禁编造 OCR 中不存在的商户、酒店、航司、路线或金额。"
            "如果 OCR 出现冲突碎片,应优先依据票据主体信息,而不是单个噪声词。"
            "例如滴滴行程单/网约车发票,即使 OCR 混入酒店名称,也不能直接判成酒店票据。"
            "输出字段:document_type, scene_code, scene_label, expense_type, confidence, evidence。"
        )
        user_prompt = (
            "请根据以下票据事实给出最终分类 JSON:\n"
            f"{json.dumps(facts, ensure_ascii=False, indent=2)}\n\n"
            "示例输出:\n"
            "{\n"
            ' "document_type": "taxi_receipt",\n'
            ' "scene_code": "transport",\n'
            ' "scene_label": "交通票据",\n'
            ' "expense_type": "transport",\n'
            ' "confidence": 0.86,\n'
            ' "evidence": ["OCR 中出现 滴滴出行、订单号、上车/下车 等交通特征"]\n'
            "}"
        )

        # NOTE(review): exceptions raised by `complete` are not caught here
        # and propagate to the caller -- confirm that is intended.
        if preview_data_url:
            response_text = self.runtime_chat_service.complete(
                [
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": user_prompt},
                            {"type": "image_url", "image_url": {"url": preview_data_url}},
                        ],
                    },
                ],
                slot_priority=("vlm",),
                max_tokens=320,
                temperature=0.0,
            )
            parsed = self._parse_llm_payload(response_text)
            if parsed is not None:
                return "llm_vision", parsed

        # Text-only attempt: either no preview was given or the vision
        # answer could not be parsed.
        response_text = self.runtime_chat_service.complete(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            slot_priority=("main", "backup"),
            max_tokens=320,
            temperature=0.0,
        )
        parsed = self._parse_llm_payload(response_text)
        if parsed is not None:
            return "llm_text", parsed
        return None

    @staticmethod
    def _parse_llm_payload(response_text: str | None) -> LlmDocumentClassification | None:
        """Parse and normalise the model's raw answer.

        Returns:
            A classification whose ``document_type`` is one of
            ``SUPPORTED_DOCUMENT_TYPES`` (unknown types become ``"other"``),
            with blank scene/expense values replaced by the matching rule's
            values and at most four non-empty evidence strings; ``None`` when
            no JSON object is found or it fails schema validation.
        """
        payload_json = _extract_json_payload(response_text)
        if payload_json is None:
            return None

        try:
            parsed = LlmDocumentClassification.model_validate(payload_json)
        except ValidationError:
            return None

        normalized_type = str(parsed.document_type or "other").strip().lower() or "other"
        if normalized_type not in SUPPORTED_DOCUMENT_TYPES:
            normalized_type = "other"

        base_rule = DOCUMENT_TYPE_RULE_MAP.get(normalized_type, DEFAULT_RULE)
        # Strip each evidence item once, drop blanks, keep the first four.
        stripped = (str(item or "").strip() for item in parsed.evidence)
        evidence = [item for item in stripped if item][:4]

        return LlmDocumentClassification(
            document_type=normalized_type,
            scene_code=str(parsed.scene_code or base_rule.scene_code).strip() or base_rule.scene_code,
            scene_label=str(parsed.scene_label or base_rule.scene_label).strip() or base_rule.scene_label,
            expense_type=str(parsed.expense_type or base_rule.expense_type).strip() or base_rule.expense_type,
            confidence=float(parsed.confidence),
            evidence=evidence,
        )

    @staticmethod
    def _merge_rule_and_model(
        *,
        rule_insight: DocumentInsight,
        llm_result: tuple[str, LlmDocumentClassification],
        fields: tuple[DocumentField, ...],
        has_preview: bool,
    ) -> DocumentInsight:
        """Decide whether the model's answer replaces the rule-based insight.

        Args:
            rule_insight: Baseline insight from the keyword rules.
            llm_result: ``(source, classification)`` from the model.
            fields: Extracted fields to attach to the merged insight.
            has_preview: Whether a preview image was supplied by the caller.

        Returns:
            ``rule_insight`` unchanged when the model is not confident
            enough, otherwise a new insight based on the model's answer.
        """
        source, parsed = llm_result
        # Answers below this confidence are ignored outright.
        if parsed.confidence < 0.55:
            return rule_insight

        if parsed.document_type == rule_insight.document_type:
            # Model agrees with the rules.
            should_override = True
        elif rule_insight.document_type == "other":
            # Rules found nothing; the model proposes a concrete type
            # (it cannot be "other" here, that case is handled above).
            should_override = True
        elif parsed.document_type != "other":
            # Model and rules disagree. The relaxed threshold applies only
            # when the answer really came from the vision model: an answer
            # from the text-only fallback never saw the image, so it must
            # clear the stricter text threshold even if a preview exists.
            vision_backed = has_preview and source == "llm_vision"
            if vision_backed:
                threshold = 0.60
            else:
                threshold = max(0.76, rule_insight.classification_confidence + 0.12)
            should_override = parsed.confidence >= threshold
        else:
            # Model says "other" while the rules found a concrete type.
            should_override = False

        if not should_override:
            return rule_insight

        rule = DOCUMENT_TYPE_RULE_MAP.get(parsed.document_type, DEFAULT_RULE)
        warnings = list(rule_insight.warnings)
        if parsed.document_type != rule_insight.document_type:
            warnings.append("票据类型已结合大模型复核结果修正,建议人工再核对原图。")

        # Placeholder scene/expense values from the model defer to the rule
        # belonging to the accepted document type.
        return DocumentInsight(
            document_type=rule.document_type,
            document_type_label=rule.document_type_label,
            scene_code=rule.scene_code if parsed.scene_code == "other" else parsed.scene_code,
            scene_label=rule.scene_label if parsed.scene_label == "其他票据" else parsed.scene_label,
            expense_type=rule.expense_type if parsed.expense_type == "other" else parsed.expense_type,
            fields=fields,
            classification_source=source,
            classification_confidence=max(parsed.confidence, rule_insight.classification_confidence),
            evidence=tuple(parsed.evidence or rule_insight.evidence),
            warnings=tuple(warnings),
        )
|
||||
|
||||
|
||||
def build_document_insight(
    *,
    filename: str = "",
    summary: str = "",
    text: str = "",
    preview_data_url: str = "",
) -> DocumentInsight:
    """Build a document insight using a service created without a session.

    The service is constructed with no database session, so it has no chat
    service and the result comes from the keyword rules alone.
    """
    service = DocumentIntelligenceService()
    return service.build_document_insight(
        filename=filename,
        summary=summary,
        text=text,
        preview_data_url=preview_data_url,
    )
|
||||
|
||||
|
||||
def _match_document_rule(compact_text: str) -> RuleMatch:
    """Pick the best-scoring keyword rule for whitespace-free, lower-case text.

    Each rule scores its bias, plus 0.92 per matched keyword, plus 0.08 per
    character of each matched keyword (capped at 6 characters). The first
    rule with the highest positive score wins.

    Returns:
        The winning rule with a confidence capped at 0.94 and up to four
        matched keywords, or a match with ``rule=None`` when nothing scored
        above zero.
    """
    scored = []
    for rule in DOCUMENT_RULES:
        hits = tuple(kw for kw in rule.keywords if kw.lower() in compact_text)
        if not hits:
            continue
        length_bonus = sum(min(len(kw), 6) * 0.08 for kw in hits)
        scored.append((float(rule.score_bias) + len(hits) * 0.92 + length_bonus, rule, hits))

    # max() returns the first maximal entry, i.e. the earliest rule on ties.
    winner = max(scored, key=lambda entry: entry[0], default=None)
    if winner is None or winner[0] <= 0:
        return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)

    top_score, top_rule, top_hits = winner
    confidence = min(0.94, 0.30 + min(top_score, 4.8) * 0.12)
    return RuleMatch(
        rule=top_rule,
        confidence=round(confidence, 2),
        evidence=top_hits[:4],
        score=top_score,
    )
|
||||
|
||||
|
||||
def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
|
||||
if not response_text:
|
||||
return None
|
||||
|
||||
cleaned = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL | re.IGNORECASE).strip()
|
||||
if not cleaned:
|
||||
return None
|
||||
|
||||
fenced_match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", cleaned, flags=re.DOTALL)
|
||||
candidates = [fenced_match.group(1)] if fenced_match else []
|
||||
candidates.append(cleaned)
|
||||
|
||||
start = cleaned.find("{")
|
||||
end = cleaned.rfind("}")
|
||||
if start != -1 and end != -1 and end > start:
|
||||
candidates.append(cleaned[start : end + 1])
|
||||
|
||||
for candidate in candidates:
|
||||
try:
|
||||
parsed = json.loads(candidate)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if isinstance(parsed, dict):
|
||||
return parsed
|
||||
return None
|
||||
|
||||
|
||||
def _extract_document_fields(text: str) -> list[DocumentField]:
    """Extract every recognisable field from *text*.

    Returns:
        The non-empty fields in a fixed order: amount, date, merchant,
        invoice number, invoice code, trip number, route.
    """
    # (key, label, extracted value); an empty value means "not found".
    extracted = (
        ("amount", "金额", _extract_amount(text)),
        ("date", "日期", _extract_date(text)),
        ("merchant_name", "商户", _extract_merchant(text)),
        ("invoice_number", "票据号码", _extract_pattern(INVOICE_NUMBER_PATTERN, text)),
        ("invoice_code", "发票代码", _extract_pattern(INVOICE_CODE_PATTERN, text)),
        ("trip_no", "车次/航班", _extract_pattern(TRIP_NO_PATTERN, text)),
        ("route", "行程", _extract_route(text)),
    )
    return [
        DocumentField(key=key, label=label, value=value)
        for key, label, value in extracted
        if value
    ]
|
||||
|
||||
|
||||
def _extract_amount(text: str) -> str:
    """Return the largest positive amount found in *text*, suffixed with "元".

    The patterns in ``AMOUNT_PATTERNS`` are tried in order; the first one
    that yields a positive amount decides the result, and among its matches
    the largest value is taken. The value is rounded to two decimals with
    trailing zeros removed. Returns ``""`` when no amount is found.
    """
    # NOTE(review): a comma is treated as a decimal separator here, so
    # thousands separators are not handled -- confirm that is intended.
    for pattern in AMOUNT_PATTERNS:
        positives = []
        for match in pattern.finditer(text):
            literal = str(match.group(1) or "").replace(",", ".").strip()
            try:
                value = Decimal(literal)
            except InvalidOperation:
                continue
            if value > Decimal("0.00"):
                positives.append(value)
        if positives:
            largest = max(positives)
            break
    else:
        # No pattern produced a usable amount.
        return ""

    rendered = format(largest.quantize(Decimal("0.01")), "f")
    trimmed = rendered.rstrip("0").rstrip(".")
    return f"{trimmed}元"
|
||||
|
||||
|
||||
def _extract_date(text: str) -> str:
    """Return the first date found in *text* as ``YYYY-MM-DD``.

    Returns ``""`` when no date matches. If the matched text does not split
    into exactly three parts, it is returned as found.
    """
    found = DATE_PATTERN.search(text)
    if found is None:
        return ""

    raw = str(found.group(1) or "").strip()
    # Map every separator to "-" and drop the trailing 日 in one pass.
    unified = raw.translate(
        str.maketrans({"年": "-", "月": "-", "/": "-", ".": "-", "日": None})
    )
    pieces = [piece for piece in unified.split("-") if piece]
    if len(pieces) != 3:
        return raw

    year, month, day = pieces
    return "-".join((year.zfill(4), month.zfill(2), day.zfill(2)))
|
||||
|
||||
|
||||
def _extract_merchant(text: str) -> str:
    """Return the first non-empty merchant name found in *text*, or ``""``.

    The patterns in ``MERCHANT_PATTERNS`` are tried in order; a match whose
    cleaned value is empty falls through to the next pattern.
    """
    for pattern in MERCHANT_PATTERNS:
        found = pattern.search(text)
        if found is None:
            continue
        name = _clean_field_value(found.group(1))
        if name:
            return name
    return ""
|
||||
|
||||
|
||||
def _extract_route(text: str) -> str:
    """Return the first route in *text* as ``"origin-destination"``.

    Returns ``""`` when no route matches, when either endpoint is empty
    after cleaning, or when both endpoints are identical.
    """
    found = ROUTE_PATTERN.search(text)
    if found is None:
        return ""

    origin = _clean_field_value(found.group(1))
    destination = _clean_field_value(found.group(2))
    if origin and destination and origin != destination:
        return f"{origin}-{destination}"
    return ""
|
||||
|
||||
|
||||
def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
    """Return the cleaned first capture group of *pattern* in *text*.

    Returns ``""`` when the pattern does not match.
    """
    found = pattern.search(text)
    return _clean_field_value(found.group(1)) if found else ""
|
||||
|
||||
|
||||
def _clean_field_value(value: str) -> str:
|
||||
return str(value or "").strip().strip("::,,。.;;")
|
||||
Reference in New Issue
Block a user