refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务

- user_agent 拆分 application/locations/knowledge/response/review 五个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
This commit is contained in:
caoxiaozhu
2026-06-24 10:42:24 +08:00
parent 332f77389d
commit 0264a4b5b4
41 changed files with 1273 additions and 182 deletions

View File

@@ -37,10 +37,18 @@ TRAIN_ROUTE_PATTERN = re.compile(
r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—||-)\s*"
r"([\u4e00-\u9fa5]{2,12})站?"
)
# Route layout where a train number sits between the two station names,
# separated by whitespace (``\s+`` also spans line breaks):
#   <station 1>[站] <train no> <station 2>[站]
# Group 1 and group 2 capture the two station names (2-12 CJK characters each);
# the train number itself is matched but not captured.
TRAIN_ROUTE_WITH_NO_PATTERN = re.compile(
r"([\u4e00-\u9fa5]{2,12})站?\s+[GCDZKTLYS]\d{1,5}\s+"
r"([\u4e00-\u9fa5]{2,12})站?",
re.IGNORECASE,
)
TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[:]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE)
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Z0-9])([GCDZKTLYS]\d{1,5})(?![A-Z0-9])", re.IGNORECASE)
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|旅客姓名|姓名)\s*[:]?\s*([\u4e00-\u9fa5·]{2,20})")
TRAIN_ID_PATTERN = re.compile(r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号)\s*[:]?\s*([0-9Xx*]{6,24})")
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|乘客|旅客姓名|姓名)\s*[:]?\s*([\u4e00-\u9fa5·]{2,20})")
TRAIN_PURCHASER_NAME_PATTERN = re.compile(
r"购买方名称\s*[:]?\s*([·\u4e00-\u9fa5]{2,20}?)(?=\s*(?:统一社会信用代码|纳税人识别号|$))"
)
TRAIN_ID_PATTERN = re.compile(r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号|证件号)\s*[:]?\s*([0-9Xx*]{6,24})")
TRAIN_ID_FALLBACK_PATTERN = re.compile(r"(?<![0-9A-Za-z])([0-9]{6,17}[0-9Xx*]{2,8})(?![0-9A-Za-z])")
TRAIN_ETICKET_PATTERN = re.compile(r"(?:电子客票号|客票号)\s*[:]?\s*([A-Z0-9]{6,32})", re.IGNORECASE)
TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座|一等卧|二等卧|软卧|硬卧|软座|硬座|无座)")
@@ -50,6 +58,28 @@ TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
# Field keys and display labels that identify a departure/arrival station field.
TRAIN_STATION_FIELD_KEYS = {"departure_station", "arrival_station"}
TRAIN_STATION_FIELD_LABELS = {"出发地点", "到达地点"}
# Substrings that disqualify a candidate station name.
# NOTE(review): the first two entries are empty strings as shown here. A plain
# ``token in value`` test is True for an empty token against any value, so these
# entries would match everything. They may be non-ASCII tokens lost in
# extraction — confirm against the repository copy.
TRAIN_INVALID_STATION_TOKENS = (
"",
"",
"扫码",
"无效",
"票价",
"金额",
"车厢",
"座位",
"乘客",
"证件",
"身份证",
"订单",
"单据",
"日期",
"渠道",
"官方",
"演示",
"不可报销",
)
class ReceiptFolderStorageMixin:
@@ -119,6 +149,29 @@ class ReceiptFolderStorageMixin:
"preview_media_type": preview_media_type,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
}
if str(media_type or "").strip() == "application/pdf":
preview_path = receipt_dir / f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
try:
DocumentPreviewAssets.render_pdf_first_page(
pdf_path=source_path,
preview_path=preview_path,
timeout_seconds=get_settings().ocr_timeout_seconds,
)
except Exception:
return {
"previewable": True,
"preview_kind": "pdf",
"preview_file_name": source_path.name,
"preview_media_type": media_type,
"preview_rendered_with": "",
}
return {
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_path.name,
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
}
if self._is_previewable(media_type):
return {
"previewable": True,
@@ -172,6 +225,16 @@ class ReceiptFolderStorageMixin:
timeout_seconds=get_settings().ocr_timeout_seconds,
)
except Exception:
meta.update(
{
"previewable": True,
"preview_kind": "pdf",
"preview_file_name": source_path.name,
"preview_media_type": "application/pdf",
"preview_rendered_with": "",
}
)
self._write_meta(receipt_dir, meta)
return meta
meta.update(
@@ -543,6 +606,10 @@ class ReceiptFolderTrainTicketMixin:
value = str(field.get("value") or "").strip()
if not label or not value:
continue
if key == "merchant_name" or label == "商户":
continue
if not cls._should_keep_train_ticket_field(key=key, label=label, value=value):
continue
if key == "trip_no" and label == "车次/航班":
label = "车次"
if key == "route" and label == "行程":
@@ -559,6 +626,8 @@ class ReceiptFolderTrainTicketMixin:
return
normalized.append({"key": key, "label": label, "value": cleaned})
add_field("merchant_name", "商户", "中国铁路")
invoice_date = cls._extract_train_invoice_date(text)
add_field("invoice_date", "开票日期", invoice_date)
@@ -690,6 +759,13 @@ class ReceiptFolderTrainTicketMixin:
@classmethod
def _extract_train_route_points(cls, text: str) -> tuple[str, str]:
raw_text = str(text or "")
split_line_match = TRAIN_ROUTE_WITH_NO_PATTERN.search(raw_text)
if split_line_match:
departure = cls._clean_train_station(split_line_match.group(1))
arrival = cls._clean_train_station(split_line_match.group(2))
if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival:
return departure, arrival
station_candidates: list[str] = []
for line in raw_text.replace("\r", "\n").splitlines():
candidate = cls._clean_train_station(line)
@@ -697,7 +773,7 @@ class ReceiptFolderTrainTicketMixin:
continue
if not str(line or "").strip().endswith(""):
continue
if any(token in candidate for token in ("发票", "客票", "铁路", "票价", "日期")):
if not cls._is_valid_train_station_value(candidate):
continue
station_candidates.append(candidate)
if len(station_candidates) >= 2:
@@ -707,7 +783,7 @@ class ReceiptFolderTrainTicketMixin:
if match:
departure = cls._clean_train_station(match.group(1))
arrival = cls._clean_train_station(match.group(2))
if departure and arrival and departure != arrival:
if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival:
return departure, arrival
return "", ""
@@ -717,6 +793,25 @@ class ReceiptFolderTrainTicketMixin:
cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned)
return cleaned.strip()
@classmethod
def _should_keep_train_ticket_field(cls, *, key: str, label: str, value: str) -> bool:
    """Decide whether an extracted train-ticket field should be kept.

    Station fields (matched by key or by label) are kept only when the value
    passes ``_is_valid_train_station_value``. Passenger-name fields are kept
    only when ``_clean_train_passenger_candidate`` returns a truthy result.
    Every other field is kept unconditionally.
    """
    is_station_field = key in TRAIN_STATION_FIELD_KEYS or label in TRAIN_STATION_FIELD_LABELS
    if is_station_field:
        return cls._is_valid_train_station_value(value)

    is_passenger_field = key == "passenger_name" or label == "乘车人"
    if not is_passenger_field:
        return True

    cleaned_name = cls._clean_train_passenger_candidate(value)
    return bool(cleaned_name)
@classmethod
def _is_valid_train_station_value(cls, value: str) -> bool:
    """Return True when *value* is acceptable as a station name.

    The value is first normalised with ``_clean_train_station``. It is
    rejected when the cleaned text is shorter than 2 or longer than 12
    characters, contains any non-empty entry of
    ``TRAIN_INVALID_STATION_TOKENS``, or contains an ASCII letter or digit.
    """
    cleaned = cls._clean_train_station(value)
    if not 2 <= len(cleaned) <= 12:
        return False
    # Skip empty tokens: "" is a substring of every string, so an empty entry
    # in the token tuple would otherwise reject every candidate.
    if any(token and token in cleaned for token in TRAIN_INVALID_STATION_TOKENS):
        return False
    if re.search(r"[A-Za-z0-9]", cleaned):
        return False
    return True
@staticmethod
def _extract_first(pattern: re.Pattern[str], text: str) -> str:
match = pattern.search(str(text or ""))
@@ -724,24 +819,30 @@ class ReceiptFolderTrainTicketMixin:
@classmethod
def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str:
labeled = cls._extract_first(TRAIN_PASSENGER_PATTERN, text)
if labeled:
return labeled
lines = [line.strip() for line in str(text or "").replace("\r", "\n").splitlines() if line.strip()]
for index, line in enumerate(lines):
if id_number and id_number not in line:
continue
for offset in (1, -1, 2):
target_index = index + offset
if target_index < 0 or target_index >= len(lines):
for line in lines:
labeled = cls._clean_train_passenger_candidate(cls._extract_first(TRAIN_PASSENGER_PATTERN, line))
if labeled:
return labeled
if id_number:
for index, line in enumerate(lines):
if id_number not in line:
continue
candidate = cls._clean_train_passenger_candidate(lines[target_index])
candidate = cls._clean_train_passenger_candidate(line.replace(id_number, " "))
if candidate:
return candidate
for offset in (1, -1, 2):
target_index = index + offset
if target_index < 0 or target_index >= len(lines):
continue
candidate = cls._clean_train_passenger_candidate(lines[target_index])
if candidate:
return candidate
for line in lines:
if "购买方名称" in line:
candidate = cls._clean_train_passenger_candidate(line.split(":", 1)[-1].split("", 1)[-1])
purchase_match = TRAIN_PURCHASER_NAME_PATTERN.search(line)
if purchase_match:
candidate = cls._clean_train_passenger_candidate(purchase_match.group(1))
if candidate:
return candidate
return ""
@@ -764,6 +865,16 @@ class ReceiptFolderTrainTicketMixin:
"开票",
"日期",
"车厢",
"席别",
"二等座",
"一等座",
"商务座",
"特等座",
"软座",
"硬座",
"无座",
"软卧",
"硬卧",
"座位",
"票价",
"金额",
@@ -771,6 +882,14 @@ class ReceiptFolderTrainTicketMixin:
"出发",
"到达",
"车次",
"公司",
"信用代码",
"纳税人",
"扫码",
"无效",
"二维码",
"座席",
"证件",
)
):
return ""