refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务
- user_agent 拆分 application/locations/knowledge/response/review 四个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
This commit is contained in:
@@ -37,10 +37,18 @@ TRAIN_ROUTE_PATTERN = re.compile(
|
||||
r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—|–|-)\s*"
|
||||
r"([\u4e00-\u9fa5]{2,12})站?"
|
||||
)
|
||||
TRAIN_ROUTE_WITH_NO_PATTERN = re.compile(
|
||||
r"([\u4e00-\u9fa5]{2,12})站?\s+[GCDZKTLYS]\d{1,5}\s+"
|
||||
r"([\u4e00-\u9fa5]{2,12})站?",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[::]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE)
|
||||
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Z0-9])([GCDZKTLYS]\d{1,5})(?![A-Z0-9])", re.IGNORECASE)
|
||||
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|旅客姓名|姓名)\s*[::]?\s*([\u4e00-\u9fa5·]{2,20})")
|
||||
TRAIN_ID_PATTERN = re.compile(r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号)\s*[::]?\s*([0-9Xx*]{6,24})")
|
||||
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|乘客|旅客姓名|姓名)\s*[::]?\s*([\u4e00-\u9fa5·]{2,20})")
|
||||
TRAIN_PURCHASER_NAME_PATTERN = re.compile(
|
||||
r"购买方名称\s*[::]?\s*([·\u4e00-\u9fa5]{2,20}?)(?=\s*(?:统一社会信用代码|纳税人识别号|$))"
|
||||
)
|
||||
TRAIN_ID_PATTERN = re.compile(r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号|证件号)\s*[::]?\s*([0-9Xx*]{6,24})")
|
||||
TRAIN_ID_FALLBACK_PATTERN = re.compile(r"(?<![0-9A-Za-z])([0-9]{6,17}[0-9Xx*]{2,8})(?![0-9A-Za-z])")
|
||||
TRAIN_ETICKET_PATTERN = re.compile(r"(?:电子客票号|客票号)\s*[::]?\s*([A-Z0-9]{6,32})", re.IGNORECASE)
|
||||
TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座|一等卧|二等卧|软卧|硬卧|软座|硬座|无座)")
|
||||
@@ -50,6 +58,28 @@ TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])
|
||||
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
|
||||
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
|
||||
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
|
||||
TRAIN_STATION_FIELD_KEYS = {"departure_station", "arrival_station"}
|
||||
TRAIN_STATION_FIELD_LABELS = {"出发地点", "到达地点"}
|
||||
TRAIN_INVALID_STATION_TOKENS = (
|
||||
"座",
|
||||
"席",
|
||||
"扫码",
|
||||
"无效",
|
||||
"票价",
|
||||
"金额",
|
||||
"车厢",
|
||||
"座位",
|
||||
"乘客",
|
||||
"证件",
|
||||
"身份证",
|
||||
"订单",
|
||||
"单据",
|
||||
"日期",
|
||||
"渠道",
|
||||
"官方",
|
||||
"演示",
|
||||
"不可报销",
|
||||
)
|
||||
|
||||
|
||||
class ReceiptFolderStorageMixin:
|
||||
@@ -119,6 +149,29 @@ class ReceiptFolderStorageMixin:
|
||||
"preview_media_type": preview_media_type,
|
||||
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
|
||||
}
|
||||
if str(media_type or "").strip() == "application/pdf":
|
||||
preview_path = receipt_dir / f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
|
||||
try:
|
||||
DocumentPreviewAssets.render_pdf_first_page(
|
||||
pdf_path=source_path,
|
||||
preview_path=preview_path,
|
||||
timeout_seconds=get_settings().ocr_timeout_seconds,
|
||||
)
|
||||
except Exception:
|
||||
return {
|
||||
"previewable": True,
|
||||
"preview_kind": "pdf",
|
||||
"preview_file_name": source_path.name,
|
||||
"preview_media_type": media_type,
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
return {
|
||||
"previewable": True,
|
||||
"preview_kind": "image",
|
||||
"preview_file_name": preview_path.name,
|
||||
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
|
||||
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
|
||||
}
|
||||
if self._is_previewable(media_type):
|
||||
return {
|
||||
"previewable": True,
|
||||
@@ -172,6 +225,16 @@ class ReceiptFolderStorageMixin:
|
||||
timeout_seconds=get_settings().ocr_timeout_seconds,
|
||||
)
|
||||
except Exception:
|
||||
meta.update(
|
||||
{
|
||||
"previewable": True,
|
||||
"preview_kind": "pdf",
|
||||
"preview_file_name": source_path.name,
|
||||
"preview_media_type": "application/pdf",
|
||||
"preview_rendered_with": "",
|
||||
}
|
||||
)
|
||||
self._write_meta(receipt_dir, meta)
|
||||
return meta
|
||||
|
||||
meta.update(
|
||||
@@ -543,6 +606,10 @@ class ReceiptFolderTrainTicketMixin:
|
||||
value = str(field.get("value") or "").strip()
|
||||
if not label or not value:
|
||||
continue
|
||||
if key == "merchant_name" or label == "商户":
|
||||
continue
|
||||
if not cls._should_keep_train_ticket_field(key=key, label=label, value=value):
|
||||
continue
|
||||
if key == "trip_no" and label == "车次/航班":
|
||||
label = "车次"
|
||||
if key == "route" and label == "行程":
|
||||
@@ -559,6 +626,8 @@ class ReceiptFolderTrainTicketMixin:
|
||||
return
|
||||
normalized.append({"key": key, "label": label, "value": cleaned})
|
||||
|
||||
add_field("merchant_name", "商户", "中国铁路")
|
||||
|
||||
invoice_date = cls._extract_train_invoice_date(text)
|
||||
add_field("invoice_date", "开票日期", invoice_date)
|
||||
|
||||
@@ -690,6 +759,13 @@ class ReceiptFolderTrainTicketMixin:
|
||||
@classmethod
|
||||
def _extract_train_route_points(cls, text: str) -> tuple[str, str]:
|
||||
raw_text = str(text or "")
|
||||
split_line_match = TRAIN_ROUTE_WITH_NO_PATTERN.search(raw_text)
|
||||
if split_line_match:
|
||||
departure = cls._clean_train_station(split_line_match.group(1))
|
||||
arrival = cls._clean_train_station(split_line_match.group(2))
|
||||
if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival:
|
||||
return departure, arrival
|
||||
|
||||
station_candidates: list[str] = []
|
||||
for line in raw_text.replace("\r", "\n").splitlines():
|
||||
candidate = cls._clean_train_station(line)
|
||||
@@ -697,7 +773,7 @@ class ReceiptFolderTrainTicketMixin:
|
||||
continue
|
||||
if not str(line or "").strip().endswith("站"):
|
||||
continue
|
||||
if any(token in candidate for token in ("发票", "客票", "铁路", "票价", "日期")):
|
||||
if not cls._is_valid_train_station_value(candidate):
|
||||
continue
|
||||
station_candidates.append(candidate)
|
||||
if len(station_candidates) >= 2:
|
||||
@@ -707,7 +783,7 @@ class ReceiptFolderTrainTicketMixin:
|
||||
if match:
|
||||
departure = cls._clean_train_station(match.group(1))
|
||||
arrival = cls._clean_train_station(match.group(2))
|
||||
if departure and arrival and departure != arrival:
|
||||
if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival:
|
||||
return departure, arrival
|
||||
return "", ""
|
||||
|
||||
@@ -717,6 +793,25 @@ class ReceiptFolderTrainTicketMixin:
|
||||
cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned)
|
||||
return cleaned.strip()
|
||||
|
||||
@classmethod
def _should_keep_train_ticket_field(cls, *, key: str, label: str, value: str) -> bool:
    """Decide whether an already-extracted train-ticket field is kept.

    Station fields (matched by key or by label) are kept only when the
    value passes ``_is_valid_train_station_value``. Passenger fields are
    kept only when ``_clean_train_passenger_candidate`` returns a
    non-empty result. Every other field is kept unconditionally.
    """
    is_station_field = key in TRAIN_STATION_FIELD_KEYS or label in TRAIN_STATION_FIELD_LABELS
    if is_station_field:
        return cls._is_valid_train_station_value(value)

    is_passenger_field = key == "passenger_name" or label == "乘车人"
    if not is_passenger_field:
        return True
    return bool(cls._clean_train_passenger_candidate(value))
|
||||
|
||||
@classmethod
def _is_valid_train_station_value(cls, value: str) -> bool:
    """Return True when ``value`` looks like a plausible station name.

    The value is first normalised with ``_clean_train_station``. It is
    rejected when the cleaned text is shorter than 2 or longer than 12
    characters, contains any entry of ``TRAIN_INVALID_STATION_TOKENS``,
    or contains an ASCII letter or digit.
    """
    station = cls._clean_train_station(value)

    if len(station) < 2 or len(station) > 12:
        return False

    for token in TRAIN_INVALID_STATION_TOKENS:
        if token in station:
            return False

    # Any ASCII letter or digit disqualifies the candidate.
    return re.search(r"[A-Za-z0-9]", station) is None
|
||||
|
||||
@staticmethod
|
||||
def _extract_first(pattern: re.Pattern[str], text: str) -> str:
|
||||
match = pattern.search(str(text or ""))
|
||||
@@ -724,24 +819,30 @@ class ReceiptFolderTrainTicketMixin:
|
||||
|
||||
@classmethod
|
||||
def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str:
|
||||
labeled = cls._extract_first(TRAIN_PASSENGER_PATTERN, text)
|
||||
if labeled:
|
||||
return labeled
|
||||
|
||||
lines = [line.strip() for line in str(text or "").replace("\r", "\n").splitlines() if line.strip()]
|
||||
for index, line in enumerate(lines):
|
||||
if id_number and id_number not in line:
|
||||
continue
|
||||
for offset in (1, -1, 2):
|
||||
target_index = index + offset
|
||||
if target_index < 0 or target_index >= len(lines):
|
||||
for line in lines:
|
||||
labeled = cls._clean_train_passenger_candidate(cls._extract_first(TRAIN_PASSENGER_PATTERN, line))
|
||||
if labeled:
|
||||
return labeled
|
||||
|
||||
if id_number:
|
||||
for index, line in enumerate(lines):
|
||||
if id_number not in line:
|
||||
continue
|
||||
candidate = cls._clean_train_passenger_candidate(lines[target_index])
|
||||
candidate = cls._clean_train_passenger_candidate(line.replace(id_number, " "))
|
||||
if candidate:
|
||||
return candidate
|
||||
for offset in (1, -1, 2):
|
||||
target_index = index + offset
|
||||
if target_index < 0 or target_index >= len(lines):
|
||||
continue
|
||||
candidate = cls._clean_train_passenger_candidate(lines[target_index])
|
||||
if candidate:
|
||||
return candidate
|
||||
for line in lines:
|
||||
if "购买方名称" in line:
|
||||
candidate = cls._clean_train_passenger_candidate(line.split(":", 1)[-1].split(":", 1)[-1])
|
||||
purchase_match = TRAIN_PURCHASER_NAME_PATTERN.search(line)
|
||||
if purchase_match:
|
||||
candidate = cls._clean_train_passenger_candidate(purchase_match.group(1))
|
||||
if candidate:
|
||||
return candidate
|
||||
return ""
|
||||
@@ -764,6 +865,16 @@ class ReceiptFolderTrainTicketMixin:
|
||||
"开票",
|
||||
"日期",
|
||||
"车厢",
|
||||
"席别",
|
||||
"二等座",
|
||||
"一等座",
|
||||
"商务座",
|
||||
"特等座",
|
||||
"软座",
|
||||
"硬座",
|
||||
"无座",
|
||||
"软卧",
|
||||
"硬卧",
|
||||
"座位",
|
||||
"票价",
|
||||
"金额",
|
||||
@@ -771,6 +882,14 @@ class ReceiptFolderTrainTicketMixin:
|
||||
"出发",
|
||||
"到达",
|
||||
"车次",
|
||||
"公司",
|
||||
"信用代码",
|
||||
"纳税人",
|
||||
"扫码",
|
||||
"无效",
|
||||
"二维码",
|
||||
"座席",
|
||||
"证件",
|
||||
)
|
||||
):
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user