feat: 新增风险图谱算法与系统仪表盘及操作反馈体系

后端新增风险图谱算法模块、风险观察与反馈服务、规则 DSL
校验器和可解释性引擎,完善系统仪表盘和财务仪表盘统计,
优化 agent 运行和编排执行链路,清理旧开发文档,前端新增
系统趋势、负载热力图等多种仪表盘图表组件,完善操作反馈
对话框和工作台日期选择器,优化报销创建和审批详情交互,
补充单元测试覆盖。
This commit is contained in:
caoxiaozhu
2026-05-30 15:46:51 +08:00
parent 4c59941ec6
commit 7989f3a159
314 changed files with 30073 additions and 20626 deletions

View File

@@ -22,6 +22,30 @@ from app.schemas.receipt_folder import (
from app.services.expense_claim_attachment_presentation import ExpenseClaimAttachmentPresentation
from app.services.ocr import SUPPORTED_SUFFIXES
# --- Receipt / train-ticket OCR patterns -------------------------------------
# Label separators accept both the ASCII colon and the full-width colon
# (U+FF1A), the same way TRAIN_FARE_PATTERN does.

# A 19xx/20xx calendar date whose parts are separated by "-", "/", "." or the
# CJK year/month markers, with an optional trailing day marker.
RECEIPT_DATE_PATTERN = re.compile(
    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
)
# 24-hour H:MM / HH:MM that is not embedded in a longer digit run.
RECEIPT_TIME_PATTERN = re.compile(r"(?<!\d)([01]?\d|2[0-3])[:\uff1a]([0-5]\d)(?!\d)")
# Same date shape as RECEIPT_DATE_PATTERN, but only directly after an
# invoice-date label.
TRAIN_INVOICE_DATE_PATTERN = re.compile(
    r"(?:开票日期|发票日期|开票时间)\s*[:\uff1a]?\s*"
    r"((?:20\d{2}|19\d{2})[-/年.](?:1[0-2]|0?[1-9])[-/月.](?:3[01]|[12]\d|0?[1-9])日?)"
)
# "<departure> <separator> <arrival>". The separator is mandatory: an empty
# alternative would let any run of four or more CJK characters match as a
# route. En dash (U+2013) and full-width hyphen (U+FF0D) are accepted as well.
TRAIN_ROUTE_PATTERN = re.compile(
    r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—|\u2013|\uff0d|-)\s*"
    r"([\u4e00-\u9fa5]{2,12})站?"
)
# Train number after an explicit label, e.g. G1234.
TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[:\uff1a]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE)
# Train number without a label, not glued to other letters or digits.
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Z0-9])([GCDZKTLYS]\d{1,5})(?![A-Z0-9])", re.IGNORECASE)
# Passenger name after an explicit label.
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|旅客姓名|姓名)\s*[:\uff1a]?\s*([\u4e00-\u9fa5·]{2,20})")
# ID number (digits, X/x or "*" mask characters) after an explicit label.
TRAIN_ID_PATTERN = re.compile(
    r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号)\s*[:\uff1a]?\s*([0-9Xx*]{6,24})"
)
# Unlabelled ID-like token: 6-17 digits followed by 2-8 digits/X/mask chars.
TRAIN_ID_FALLBACK_PATTERN = re.compile(r"(?<![0-9A-Za-z])([0-9]{6,17}[0-9Xx*]{2,8})(?![0-9A-Za-z])")
# Electronic ticket number after an explicit label.
TRAIN_ETICKET_PATTERN = re.compile(r"(?:电子客票号|客票号)\s*[:\uff1a]?\s*([A-Z0-9]{6,32})", re.IGNORECASE)
# Seat class keywords.
TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座|一等卧|二等卧|软卧|硬卧|软座|硬座|无座)")
# Carriage number after an explicit label.
TRAIN_CARRIAGE_PATTERN = re.compile(r"(?:车厢|车厢号)\s*[:\uff1a]?\s*([0-9]{1,2}\s*车?)")
# Seat number after an explicit label.
TRAIN_SEAT_NO_PATTERN = re.compile(r"(?:座位|座位号)\s*[:\uff1a]?\s*([0-9]{1,3}[A-F号]?)", re.IGNORECASE)
# Combined "<carriage>车<seat>号" form.
TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])号?", re.IGNORECASE)
# Fare amount after a price label; colon / currency-sign / whitespace fillers
# between the label and the number are skipped.
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
class ReceiptFolderService:
def __init__(self) -> None:
@@ -372,8 +396,8 @@ class ReceiptFolderService:
def _is_previewable(media_type: str) -> bool:
return str(media_type or "").startswith("image/") or str(media_type or "") == "application/pdf"
@staticmethod
def _build_document_meta(document: Any | None) -> dict[str, Any]:
@classmethod
def _build_document_meta(cls, document: Any | None) -> dict[str, Any]:
fields = []
for field in list(getattr(document, "document_fields", []) or []):
if isinstance(field, dict):
@@ -393,18 +417,33 @@ class ReceiptFolderService:
}
)
fields = [field for field in fields if field["label"] and field["value"]]
ocr_text = str(getattr(document, "text", "") or "")
summary = str(getattr(document, "summary", "") or "")
document_type = str(getattr(document, "document_type", "") or "other")
document_type_label = str(getattr(document, "document_type_label", "") or "其他单据")
scene_label = str(getattr(document, "scene_label", "") or "其他票据")
if cls._is_train_ticket_values(
document_type=document_type,
document_type_label=document_type_label,
scene_label=scene_label,
text=f"{summary}\n{ocr_text}",
):
fields = cls._enrich_train_ticket_field_dicts(
fields,
text=f"{ocr_text}\n{summary}\n{str(getattr(document, 'filename', '') or '')}",
)
return {
"engine": str(getattr(document, "engine", "") or ""),
"model": str(getattr(document, "model", "") or ""),
"ocr_text": str(getattr(document, "text", "") or ""),
"summary": str(getattr(document, "summary", "") or ""),
"ocr_text": ocr_text,
"summary": summary,
"ocr_avg_score": float(getattr(document, "avg_score", 0.0) or 0.0),
"ocr_line_count": int(getattr(document, "line_count", 0) or 0),
"page_count": int(getattr(document, "page_count", 1) or 1),
"document_type": str(getattr(document, "document_type", "") or "other"),
"document_type_label": str(getattr(document, "document_type_label", "") or "其他单据"),
"document_type": document_type,
"document_type_label": document_type_label,
"scene_code": str(getattr(document, "scene_code", "") or "other"),
"scene_label": str(getattr(document, "scene_label", "") or "其他票据"),
"scene_label": scene_label,
"ocr_classification_source": str(getattr(document, "classification_source", "") or ""),
"ocr_classification_confidence": float(getattr(document, "classification_confidence", 0.0) or 0.0),
"ocr_classification_evidence": [
@@ -484,8 +523,8 @@ class ReceiptFolderService:
scene_label=str(meta.get("scene_label") or "其他票据"),
summary=str(meta.get("summary") or ""),
amount=self._resolve_editable_or_field(meta, "amount", labels=("金额", "价税合计", "票价")),
document_date=self._resolve_editable_or_field(meta, "document_date", labels=("日期", "开票日期", "乘车日期")),
merchant_name=self._resolve_editable_or_field(meta, "merchant_name", labels=("商户", "销售方", "收款方")),
document_date=self._resolve_receipt_document_date(meta),
merchant_name=self._resolve_receipt_merchant_name(meta),
avg_score=float(meta.get("ocr_avg_score") or 0.0),
uploaded_at=self._parse_datetime(meta.get("uploaded_at")),
linked_at=self._parse_datetime(meta.get("linked_at")),
@@ -499,7 +538,7 @@ class ReceiptFolderService:
)
def _resolve_fields(self, meta: dict[str, Any]) -> list[ReceiptFolderFieldRead]:
return [
fields = [
ReceiptFolderFieldRead(
key=str(field.get("key") or ""),
label=str(field.get("label") or ""),
@@ -508,6 +547,45 @@ class ReceiptFolderService:
for field in list(meta.get("document_fields") or [])
if isinstance(field, dict) and str(field.get("label") or "").strip()
]
if self._is_train_ticket_meta(meta):
return [
ReceiptFolderFieldRead(**field)
for field in self._enrich_train_ticket_field_dicts(
[field.model_dump() for field in fields],
text=self._receipt_text(meta),
)
]
return fields
def _resolve_receipt_document_date(self, meta: dict[str, Any]) -> str:
    """Choose the date to display for a receipt.

    Sources are tried in this order and the first non-empty result wins:

    1. ``meta["editable_fields"]["document_date"]`` (a manual edit).
    2. A resolved field keyed ``invoice_date`` / ``issue_date`` or labelled
       as an invoice date.
    3. For train tickets, an invoice date parsed from the receipt text.
    4. A resolved field keyed ``document_date`` or labelled as a generic,
       travel or departure date.

    Args:
        meta: Stored receipt metadata.

    Returns:
        The resolved date string, or ``""`` when no source provides one.
    """
    editable = meta.get("editable_fields")
    if isinstance(editable, dict):
        manual = str(editable.get("document_date") or "").strip()
        if manual:
            return manual

    fields = self._resolve_fields(meta)

    for field in fields:
        if field.key in {"invoice_date", "issue_date"} or field.label in {"开票日期", "发票日期"}:
            resolved = self._normalize_receipt_date_value(field.value)
            # A matching field with a blank value must not short-circuit
            # the remaining fallbacks.
            if resolved:
                return resolved

    if self._is_train_ticket_meta(meta):
        invoice_date = self._extract_train_invoice_date(self._receipt_text(meta))
        if invoice_date:
            return invoice_date

    for field in fields:
        if field.key == "document_date" or field.label in {"日期", "乘车日期", "列车出发时间", "行程日期"}:
            resolved = self._normalize_receipt_date_value(field.value)
            if resolved:
                return resolved

    return ""
def _resolve_receipt_merchant_name(self, meta: dict[str, Any]) -> str:
    """Return the merchant name for a receipt.

    The editable value or a merchant-like field is preferred. When neither
    is present and the receipt is recognised as a train ticket, a fixed
    railway merchant name is returned. Otherwise the result is ``""``.
    """
    merchant = self._resolve_editable_or_field(
        meta,
        "merchant_name",
        labels=("商户", "销售方", "收款方", "开票方"),
    )
    if not merchant and self._is_train_ticket_meta(meta):
        return "中国铁路"
    return merchant or ""
def _resolve_editable_or_field(self, meta: dict[str, Any], key: str, *, labels: tuple[str, ...]) -> str:
editable = meta.get("editable_fields")
@@ -521,6 +599,254 @@ class ReceiptFolderService:
return field.value
return ""
@classmethod
def _enrich_train_ticket_field_dicts(
    cls,
    fields: list[dict[str, Any]],
    *,
    text: str,
) -> list[dict[str, str]]:
    """Normalise existing fields and append train-ticket fields parsed from text.

    Existing entries are stripped, entries lacking a label or a value are
    dropped, and a ``trip_no`` field labelled "车次/航班" is relabelled "车次".
    Values extracted from ``text`` are appended afterwards, but only when
    neither their key nor their label is already present.

    Args:
        fields: Field dicts with ``key`` / ``label`` / ``value`` entries.
        text: Receipt text the extra fields are extracted from.

    Returns:
        A new list of ``{"key", "label", "value"}`` dicts; existing entries
        come first, in their original order.
    """
    enriched: list[dict[str, str]] = []
    for raw in fields:
        key = str(raw.get("key") or "").strip()
        label = str(raw.get("label") or "").strip()
        value = str(raw.get("value") or "").strip()
        if not (label and value):
            continue
        if key == "trip_no" and label == "车次/航班":
            label = "车次"
        enriched.append({"key": key, "label": label, "value": value})

    # Track what is already present so each extracted field is added once.
    known_keys = {item["key"] for item in enriched if item["key"]}
    known_labels = {item["label"] for item in enriched}

    def add_field(key: str, label: str, value: str) -> None:
        cleaned = str(value or "").strip()
        if not cleaned or key in known_keys or label in known_labels:
            return
        enriched.append({"key": key, "label": label, "value": cleaned})
        if key:
            known_keys.add(key)
        known_labels.add(label)

    add_field("invoice_date", "开票日期", cls._extract_train_invoice_date(text))
    add_field("trip_date", "列车出发时间", cls._extract_train_trip_datetime(text))

    departure, arrival = cls._extract_train_route_points(text)
    add_field("departure_station", "出发地点", departure)
    add_field("arrival_station", "到达地点", arrival)
    if departure and arrival:
        add_field("route", "行程", f"{departure}-{arrival}")

    train_no = cls._extract_first(TRAIN_NO_PATTERN, text) or cls._extract_first(
        TRAIN_STANDALONE_NO_PATTERN, text
    )
    add_field("train_no", "车次", train_no)

    id_number = cls._extract_train_id_number(text)
    add_field("passenger_name", "乘车人", cls._extract_train_passenger_name(text, id_number=id_number))
    add_field("id_number", "身份证号", id_number)
    add_field("electronic_ticket_no", "电子客票号", cls._extract_first(TRAIN_ETICKET_PATTERN, text))
    add_field("seat_class", "席别", cls._extract_first(TRAIN_SEAT_CLASS_PATTERN, text))

    carriage_no, seat_no = cls._extract_train_carriage_and_seat(text)
    add_field("carriage_no", "车厢", carriage_no)
    add_field("seat_no", "座位号", seat_no)
    add_field("fare", "票价", cls._extract_train_fare(text))
    return enriched
@staticmethod
def _is_train_ticket_values(
*,
document_type: str,
document_type_label: str,
scene_label: str,
text: str,
) -> bool:
if str(document_type or "").strip().lower() == "train_ticket":
return True
compact = "".join([document_type_label, scene_label, text]).replace(" ", "")
return any(token in compact for token in ("火车", "高铁", "动车", "铁路", "电子客票", "车次"))
@classmethod
def _is_train_ticket_meta(cls, meta: dict[str, Any]) -> bool:
return cls._is_train_ticket_values(
document_type=str(meta.get("document_type") or ""),
document_type_label=str(meta.get("document_type_label") or ""),
scene_label=str(meta.get("scene_label") or ""),
text=cls._receipt_text(meta),
)
@staticmethod
def _receipt_text(meta: dict[str, Any]) -> str:
field_text = "\n".join(
f"{field.get('label', '')} {field.get('value', '')}"
for field in list(meta.get("document_fields") or [])
if isinstance(field, dict)
)
return "\n".join(
value
for value in (
str(meta.get("ocr_text") or ""),
str(meta.get("summary") or ""),
str(meta.get("file_name") or ""),
field_text,
)
if value
)
@classmethod
def _extract_train_invoice_date(cls, text: str) -> str:
    """Return the normalised date that follows an invoice-date label, or ""."""
    found = TRAIN_INVOICE_DATE_PATTERN.search(str(text or ""))
    return cls._normalize_receipt_date_value(found.group(1)) if found else ""
@classmethod
def _extract_train_trip_datetime(cls, text: str) -> str:
    """Pick the date (with time when available) most likely to be the departure.

    Every date found in ``text`` is a candidate, except those sitting next to
    an invoice-date label. Candidates are scored:

    * +8 when a clock time was found near the date,
    * +6 when departure / boarding / train-number wording is nearby,
    * +3 when a seat-class keyword is nearby.

    The highest score wins; ties go to the earliest occurrence.

    Args:
        text: Receipt text to scan.

    Returns:
        ``YYYY-MM-DD`` or ``YYYY-MM-DD HH:MM``, or ``""`` when no candidate
        date exists.
    """
    raw_text = str(text or "")
    candidates: list[tuple[int, int, str]] = []
    for index, match in enumerate(RECEIPT_DATE_PATTERN.finditer(raw_text)):
        # Narrow window: skip dates that belong to an invoice-date label.
        window = raw_text[max(0, match.start() - 14): match.end() + 8].replace(" ", "")
        if any(token in window for token in ("开票日期", "发票日期", "开票时间")):
            continue

        value = cls._format_date_match_with_time(raw_text, match)
        # Wider window for contextual hints.
        compact = raw_text[max(0, match.start() - 32): match.end() + 32].replace(" ", "")

        score = 0
        # The formatter always emits an ASCII colon, so this is true exactly
        # when a time was attached. An empty-string membership test here
        # would be true for every candidate and cancel the bonus.
        if ":" in value:
            score += 8
        if any(token in compact for token in ("开车时间", "发车时间", "乘车日期", "乘车时间", "检票", "车次")):
            score += 6
        if any(token in compact for token in ("二等座", "一等座", "商务座", "硬座", "软卧", "硬卧")):
            score += 3
        # Negative index so that max() prefers the earliest match on ties.
        candidates.append((score, -index, value))

    if not candidates:
        return ""
    return max(candidates, key=lambda item: (item[0], item[1]))[2]
@classmethod
def _format_date_match_with_time(cls, text: str, match: re.Match[str]) -> str:
    """Format a date match, appending a nearby clock time when one exists.

    The time is searched in the slice of ``text`` from 18 characters before
    the match to 24 characters after it. Returns ``""`` when the date
    normalises to an empty string, and the bare date when no time is found.
    """
    date_part = cls._normalize_receipt_date_value(match.group(1))
    if not date_part:
        return ""

    lower = max(0, match.start() - 18)
    upper = match.end() + 24
    clock = RECEIPT_TIME_PATTERN.search(str(text or "")[lower:upper])
    if clock is None:
        return date_part

    hour = str(clock.group(1)).zfill(2)
    minute = str(clock.group(2)).zfill(2)
    return f"{date_part} {hour}:{minute}"
@staticmethod
def _normalize_receipt_date_value(value: str) -> str:
    """Normalise the first date found in ``value`` to ``YYYY-MM-DD``.

    Args:
        value: Raw field or OCR text that may contain a date.

    Returns:
        The zero-padded ISO-style date when ``RECEIPT_DATE_PATTERN`` matches;
        otherwise the stripped input unchanged (``""`` for falsy input).
    """
    raw = str(value or "").strip()
    match = RECEIPT_DATE_PATTERN.search(raw)
    if not match:
        return raw
    # A matched date is year / month / day digit runs separated by "-", "/",
    # "." or CJK date markers, so pulling the digit runs handles every
    # separator style. Chained str.replace calls with an empty search string
    # would instead insert a dash between all characters and defeat the
    # 3-part check below.
    parts = re.findall(r"\d+", match.group(1))
    if len(parts) != 3:
        return match.group(1)
    year, month, day = parts
    return f"{year.zfill(4)}-{month.zfill(2)}-{day.zfill(2)}"
@classmethod
def _extract_train_route_points(cls, text: str) -> tuple[str, str]:
raw_text = str(text or "")
station_candidates: list[str] = []
for line in raw_text.replace("\r", "\n").splitlines():
candidate = cls._clean_train_station(line)
if not candidate or candidate in station_candidates:
continue
if not str(line or "").strip().endswith(""):
continue
if any(token in candidate for token in ("发票", "客票", "铁路", "票价", "日期")):
continue
station_candidates.append(candidate)
if len(station_candidates) >= 2:
return station_candidates[0], station_candidates[1]
match = TRAIN_ROUTE_PATTERN.search(raw_text)
if match:
departure = cls._clean_train_station(match.group(1))
arrival = cls._clean_train_station(match.group(2))
if departure and arrival and departure != arrival:
return departure, arrival
return "", ""
@staticmethod
def _clean_train_station(value: str) -> str:
cleaned = re.sub(r"[^A-Za-z0-9\u4e00-\u9fa5()·]", "", str(value or ""))
cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned)
return cleaned.strip()
@staticmethod
def _extract_first(pattern: re.Pattern[str], text: str) -> str:
match = pattern.search(str(text or ""))
return str(match.group(1) or "").strip() if match else ""
@classmethod
def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str:
    """Extract the passenger name from train-ticket text.

    Sources, first hit wins:

    1. A name following an explicit passenger label.
    2. A name-like line adjacent to the line containing ``id_number``
       (next line, previous line, then two lines below).
    3. The text following a buyer-name label.

    Args:
        text: Receipt text to scan.
        id_number: Previously extracted ID number used to locate the name.

    Returns:
        The passenger name, or ``""`` when none is found.
    """
    labeled = cls._extract_first(TRAIN_PASSENGER_PATTERN, text)
    if labeled:
        return labeled

    lines = [line.strip() for line in str(text or "").replace("\r", "\n").splitlines() if line.strip()]

    # NOTE(review): when id_number is empty every line is treated as the
    # anchor, so the first name-like neighbouring line wins. Confirm this
    # best-effort behaviour is intended.
    for index, line in enumerate(lines):
        if id_number and id_number not in line:
            continue
        for offset in (1, -1, 2):
            target_index = index + offset
            if target_index < 0 or target_index >= len(lines):
                continue
            candidate = cls._clean_train_passenger_candidate(lines[target_index])
            if candidate:
                return candidate

    for line in lines:
        if "购买方名称" in line:
            # Take whatever follows the label. The candidate cleaner strips
            # colons (ASCII or full-width) and other non-name characters, so
            # no separator split is needed. Splitting on an empty separator
            # would raise ValueError.
            candidate = cls._clean_train_passenger_candidate(line.partition("购买方名称")[2])
            if candidate:
                return candidate
    return ""
@staticmethod
def _clean_train_passenger_candidate(value: str) -> str:
cleaned = re.sub(r"[^·\u4e00-\u9fa5]", "", str(value or "")).strip()
if not 2 <= len(cleaned) <= 8:
return ""
if any(token in cleaned for token in ("电子", "客票", "铁路", "发票", "税务", "湖北省", "中国铁路", "开票", "日期")):
return ""
return cleaned
@classmethod
def _extract_train_id_number(cls, text: str) -> str:
    """Extract the passenger ID number from train-ticket text.

    A number following an explicit ID label is preferred. Otherwise the text
    is scanned line by line (spaces removed) for an ID-like token, skipping
    lines that mention an invoice, ticket or order number. Returns ``""``
    when nothing matches.
    """
    labeled = cls._extract_first(TRAIN_ID_PATTERN, text)
    if labeled:
        return labeled

    excluded = ("发票号码", "电子客票号", "客票号", "订单号")
    for raw_line in str(text or "").replace("\r", "\n").splitlines():
        squeezed = raw_line.replace(" ", "")
        if any(marker in squeezed for marker in excluded):
            continue
        found = TRAIN_ID_FALLBACK_PATTERN.search(squeezed)
        if found:
            return str(found.group(1) or "").strip()
    return ""
@staticmethod
def _extract_train_carriage_and_seat(text: str) -> tuple[str, str]:
    """Return ``(carriage, seat)`` parsed from train-ticket text.

    The combined "<carriage>车<seat>号" form is tried first. Otherwise the
    labelled carriage (spaces removed) and labelled seat number are
    extracted independently; either may be ``""``.
    """
    combined = TRAIN_COMBINED_SEAT_PATTERN.search(str(text or ""))
    if combined is not None:
        carriage, seat = combined.group(1), combined.group(2)
        return str(carriage), seat

    carriage_no = ReceiptFolderService._extract_first(TRAIN_CARRIAGE_PATTERN, text)
    seat_no = ReceiptFolderService._extract_first(TRAIN_SEAT_NO_PATTERN, text)
    return carriage_no.replace(" ", ""), seat_no
@staticmethod
def _extract_train_fare(text: str) -> str:
    """Return the fare amount that follows a price label, or "".

    A comma used as the decimal separator is converted to a dot.
    """
    found = TRAIN_FARE_PATTERN.search(str(text or ""))
    if found is None:
        return ""
    return str(found.group(1) or "").replace(",", ".").strip()
@staticmethod
def _parse_datetime(value: Any) -> datetime | None:
raw = str(value or "").strip()