feat(server): 票据文件夹资产缓存与文档预览统一生成

- 新增 document_preview 模块,DocumentPreviewAssets 统一处理 data URL 解码、pdftoppm PNG 预览生成(poppler-data 编码)、renderer_id 标识
- receipt_folder 服务复用预览生成,缓存票据资产并提供清理;删除票据时保留已关联报销单的附件副本
- document_intelligence 新增票据预览/资产缓存接入与字段提取增强;ocr 抽取复用预览工具,附件分析/文档/操作/展示四个子模块同步适配
- receipt_folder 端点补充资产缓存头,补/扩 document_intelligence、ocr_endpoints、ocr_service、receipt_folder_service、reimbursement_endpoints 测试,新增 attachment_analysis 回归测试
This commit is contained in:
caoxiaozhu
2026-06-23 09:42:00 +08:00
parent bc743adef3
commit 84a8998e59
15 changed files with 1076 additions and 79 deletions

View File

@@ -25,11 +25,15 @@ AMOUNT_PATTERNS = (
re.compile(r"[¥¥]\s*([0-9]+(?:[.,][0-9]{1,2})?)"),
re.compile(r"([0-9]+(?:[.,][0-9]{1,2})?)\s*元"),
)
# Date with a 19xx/20xx year, a 1-12 month and a 1-31 day; the separators are
# "-", "/", "." or the 年/月 markers, with an optional trailing 日.
# NOTE: this binding is immediately replaced by the assignment below.
DATE_PATTERN = re.compile(r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)")
# Same shape as above, but a run of whitespace is also accepted between the
# year, month and day parts.
DATE_PATTERN = re.compile(
r"((?:20\d{2}|19\d{2})(?:[-/年.]|\s+)(?:1[0-2]|0?[1-9])"
r"(?:[-/月.]|\s+)(?:3[01]|[12]\d|0?[1-9])日?)"
)
# 24-hour clock time (hour 0-23, minute 00-59) that is not adjacent to other digits.
TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:]([0-5]\d)(?!\d)")
# 6-24 letters/digits/hyphens following one of the number labels
# (发票号码 / 票号 / 单号 / 订单号), optionally separated by ":" or whitespace.
INVOICE_NUMBER_PATTERN = re.compile(r"(?:发票号码|票号|单号|订单号)[:\s]*([A-Za-z0-9-]{6,24})")
# 6-24 letters/digits/hyphens following the 发票代码 label.
INVOICE_CODE_PATTERN = re.compile(r"(?:发票代码)[:\s]*([A-Za-z0-9-]{6,24})")
# 2-12 letters/digits following a 车次 or 航班(号) label.
TRIP_NO_PATTERN = re.compile(r"(?:车次|航班(?:号)?)[:\s]*([A-Za-z0-9]{2,12})")
# Unlabelled identifier: one letter from GCDZKTLYS (case-insensitive) plus 1-5
# digits, not embedded in a longer alphanumeric run.
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Za-z0-9])([GCDZKTLYS]\d{1,5})(?![A-Za-z0-9])", re.IGNORECASE)
# Two runs of 2-12 CJK characters joined by 至, →, "->" or "-"; group 1 is the
# left run and group 2 the right run.
ROUTE_PATTERN = re.compile(r"([\u4e00-\u9fa5]{2,12})\s*(?:至|→|->|-)\s*([\u4e00-\u9fa5]{2,12})")
MERCHANT_PATTERNS = (
re.compile(r"(?:销售方(?:名称)?|商户(?:名称)?|开票方(?:名称)?|收款方(?:名称)?)[:\s]*([A-Za-z0-9\u4e00-\u9fa5()·&\\-]{2,40})"),
@@ -300,6 +304,14 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
best_score = score
if best_score <= 0:
train_rule = DOCUMENT_TYPE_RULE_MAP.get("train_ticket")
if train_rule and _looks_like_train_ticket(compact_text):
return RuleMatch(
rule=train_rule,
confidence=0.82,
evidence=("车次", "12306"),
score=3.8,
)
return RuleMatch(rule=None, confidence=0.0, evidence=(), score=0.0)
confidence = min(0.94, 0.30 + min(best_score, 4.8) * 0.12)
@@ -311,6 +323,17 @@ def _match_document_rule(compact_text: str) -> RuleMatch:
)
def _looks_like_train_ticket(compact_text: str) -> bool:
text = str(compact_text or "").lower()
if not re.search(r"[gcdzktlys]\d{1,5}", text, flags=re.IGNORECASE):
return False
if "12306" in text or "95306" in text:
return True
if re.search(r"[\u4e00-\u9fa5]{2,12}(?:至|到|→|->|—||-)[\u4e00-\u9fa5]{2,12}", text):
return True
return "wuhan" in text and "shanghai" in text
def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
if not response_text:
return None
@@ -521,33 +544,48 @@ def _merge_document_fields(
def _extract_document_fields(text: str, document_type: str = "") -> list[DocumentField]:
    """Extract structured fields from recognised document text.

    Each extractor is tried in a fixed order (amount, date, merchant, invoice
    number, invoice code, trip number, route). Every value is added through
    ``append_field`` so that it is cleaned with ``_clean_field_value`` and a
    key is never added twice.

    Args:
        text: Recognised document text to scan.
        document_type: Optional document type; it is stripped and lower-cased
            before comparison. ``"train_ticket"`` enables the standalone
            train-number fallback and the loose decimal amount fallback.

    Returns:
        The extracted fields, in the order they were found.
    """
    fields: list[DocumentField] = []
    normalized_type = str(document_type or "").strip().lower()

    def append_field(key: str, label: str, value: str) -> None:
        # Single entry point for adding a field: skip values that are empty
        # after cleaning and keys that are already present.
        cleaned = _clean_field_value(value)
        if not cleaned:
            return
        if any(field.key == key for field in fields if field.key):
            return
        fields.append(DocumentField(key=key, label=label, value=cleaned))

    amount = _extract_amount(text)
    if amount:
        append_field("amount", "金额", amount)

    date_value = _extract_date(text, document_type=document_type)
    if date_value:
        append_field("date", "日期", date_value)

    merchant = _extract_merchant(text)
    if merchant:
        append_field("merchant_name", "商户", merchant)

    invoice_number = _extract_pattern(INVOICE_NUMBER_PATTERN, text)
    if invoice_number:
        append_field("invoice_number", "票据号码", invoice_number)

    invoice_code = _extract_pattern(INVOICE_CODE_PATTERN, text)
    if invoice_code:
        append_field("invoice_code", "发票代码", invoice_code)

    trip_no = _extract_pattern(TRIP_NO_PATTERN, text)
    if not trip_no and normalized_type == "train_ticket":
        # No labelled trip number: fall back to an unlabelled train-number token.
        trip_no = _extract_pattern(TRAIN_STANDALONE_NO_PATTERN, text)
    if trip_no:
        # The standalone pattern is case-insensitive, so normalise the case.
        append_field("trip_no", "车次/航班", trip_no.upper())

    route = _extract_route(text)
    if route:
        append_field("route", "行程", route)

    # Train tickets only: when no amount was found above, fall back to the
    # largest bare decimal number in the text.
    if normalized_type == "train_ticket" and not any(field.key == "amount" for field in fields):
        append_field("amount", "金额", _extract_loose_decimal_amount(text))

    return fields
@@ -621,6 +659,7 @@ def _format_date_match_with_time(text: str, match: re.Match[str]) -> str:
raw_value = str(match.group(1) or "").strip()
normalized = raw_value.replace("", "-").replace("", "-").replace("", "")
normalized = normalized.replace("/", "-").replace(".", "-")
normalized = re.sub(r"\s+", "-", normalized)
parts = [part for part in normalized.split("-") if part]
if len(parts) != 3:
return raw_value
@@ -703,6 +742,23 @@ def _extract_route(text: str) -> str:
return f"{start}-{end}"
def _extract_loose_decimal_amount(text: str) -> str:
best_value: Decimal | None = None
for match in re.finditer(r"(?<!\d)(\d{1,6}\.\d{1,2})(?!\d)", str(text or "")):
try:
candidate = Decimal(match.group(1)).quantize(Decimal("0.01"))
except InvalidOperation:
continue
if candidate <= Decimal("0.00"):
continue
if best_value is None or candidate > best_value:
best_value = candidate
if best_value is None:
return ""
text_value = format(best_value, "f").rstrip("0").rstrip(".")
return f"{text_value}"
def _extract_pattern(pattern: re.Pattern[str], text: str) -> str:
match = pattern.search(text)
if not match: