Files
X-Financial/server/src/app/services/risk_rule_dsl_validator.py

331 lines
14 KiB
Python
Raw Normal View History

from __future__ import annotations
from copy import deepcopy
from typing import Any
from app.services.risk_rule_generation_interpreter import (
COMPOSITE_RULE_OPERATORS,
COMPOSITE_RULE_TEMPLATE_KEY,
)
from app.services.risk_rule_generation_ontology import RiskRuleField
from app.services.risk_rule_generation_semantics import CITY_CONSISTENCY_SEMANTIC_TYPE
# Phrases that, when found in the rule text, make _requires_structured_dsl
# demand a structured rule instead of a plain keyword match.
STRUCTURED_TERMS = (
    "一致",
    "不一致",
    "匹配",
    "不匹配",
    "范围",
    "早于",
    "晚于",
    "超过",
    "超出",
    "超预算",
    "预算",
    "余额",
    "阈值",
    "重复",
    "同一发票",
    "未上传",
    "缺少附件",
)

# Vocabulary buckets: each one gates a condition built by
# _build_structured_conditions (CITY_TERMS is also read by
# _looks_like_city_rule).
CITY_TERMS = (
    "城市",
    "地点",
    "目的地",
    "行程",
    "交通票",
    "住宿",
)
DATE_TERMS = (
    "日期",
    "时间",
    "开始",
    "结束",
    "早于",
    "晚于",
    "入住",
    "离店",
)
AMOUNT_TERMS = (
    "金额",
    "预算",
    "余额",
    "阈值",
    "超过",
    "超出",
    "超预算",
)
ATTACHMENT_TERMS = (
    "附件",
    "票据",
    "发票",
    "水单",
    "上传",
    "未上传",
)
DUPLICATE_TERMS = (
    "重复",
    "同一发票",
    "发票号",
    "票据号",
)

# Phrases that mark a condition summary as keyword-fallback wording; the
# normalizers replace such summaries with a generic structured one.
KEYWORD_FALLBACK_TERMS = (
    "风险关键词",
    "关键词匹配",
    "规则描述中的风险关键词",
)
def validate_risk_rule_draft(
    draft: dict[str, Any],
    *,
    fields: list[RiskRuleField],
    natural_language: str,
) -> dict[str, Any]:
    """Normalize generated DSL and record validation issues.

    This guardrail is intentionally deterministic. Hermes may provide semantic
    understanding, but executable JSON must still pass a controlled schema.

    Args:
        draft: Generated rule draft. It is deep-copied, so the caller's object
            is never mutated; a non-dict value is treated as an empty draft.
        fields: Field catalogue. Only keys present here survive in
            ``field_keys``; when none survive, the first eight catalogue keys
            are used instead.
        natural_language: The user's rule description. It is scanned together
            with the draft's ``description``, ``condition_summary`` and
            ``formula``.

    Returns:
        The normalized draft. It always carries a ``dsl_validation`` entry
        with a fixed ``"passed"`` status, the issue codes recorded while
        normalizing, the final template key, and the operators of the final
        conditions.
    """
    normalized = deepcopy(draft) if isinstance(draft, dict) else {}
    field_by_key = {field.key: field for field in fields}
    field_keys = _filter_fields(_read_string_list(normalized.get("field_keys")), field_by_key)
    if not field_keys:
        field_keys = [field.key for field in fields[:8]]
    normalized["field_keys"] = field_keys

    issues: list[str] = []
    text = _join_text(
        natural_language,
        normalized.get("description"),
        normalized.get("condition_summary"),
        normalized.get("formula"),
    )

    template_key = str(normalized.get("template_key") or "field_required_v1").strip()
    if normalized.get("template_key"):
        # Store the cleaned key so the composite dispatch further down compares
        # the same value the branch checks used. Otherwise a key with stray
        # whitespace is treated as composite here but routed to the
        # non-composite normalizer below. A missing key stays missing.
        normalized["template_key"] = template_key

    if template_key != COMPOSITE_RULE_TEMPLATE_KEY and _looks_like_city_rule(text, field_keys):
        normalized["template_key"] = "field_compare_v1"
        normalized["semantic_type"] = CITY_CONSISTENCY_SEMANTIC_TYPE
        normalized["keywords"] = []
        issues.append("city_rule_normalized_to_structured_compare")
    elif template_key == "keyword_match_v1" and _requires_structured_dsl(text, field_keys, field_by_key):
        normalized = _rewrite_keyword_rule_to_composite(normalized, text=text, fields=fields)
        issues.append("keyword_rule_rewritten_to_composite_dsl")
    elif template_key == COMPOSITE_RULE_TEMPLATE_KEY and not _read_list(normalized.get("conditions")):
        # A composite rule without conditions is rebuilt from the rule text.
        normalized = _rewrite_keyword_rule_to_composite(normalized, text=text, fields=fields)
        issues.append("empty_composite_rule_built_from_structured_fields")

    if normalized.get("template_key") == COMPOSITE_RULE_TEMPLATE_KEY:
        normalized = _normalize_composite_rule(normalized, fields=fields, issues=issues)
    else:
        normalized = _normalize_non_composite_rule(normalized, fields=fields, issues=issues)

    normalized["dsl_validation"] = {
        "status": "passed",
        "issues": issues,
        "template_key": normalized.get("template_key"),
        "operators": [
            str(item.get("operator") or "").strip()
            for item in _read_list(normalized.get("conditions"))
            if isinstance(item, dict)
        ],
    }
    return normalized
def _normalize_non_composite_rule(
    draft: dict[str, Any],
    *,
    fields: list[RiskRuleField],
    issues: list[str],
) -> dict[str, Any]:
    """Clean up a draft that does not use the composite template.

    Works on a shallow copy of ``draft``: unknown field keys are dropped, and
    a condition summary containing keyword-fallback wording is replaced with a
    generic structured summary unless the rule is a keyword-match rule. The
    replacement is recorded in ``issues``.
    """
    catalogue = {field.key: field for field in fields}
    result = dict(draft)
    requested_keys = _read_string_list(result.get("field_keys"))
    result["field_keys"] = _filter_fields(requested_keys, catalogue)

    summary_text = str(result.get("condition_summary") or "").strip()
    mentions_fallback = any(term in summary_text for term in KEYWORD_FALLBACK_TERMS)
    is_keyword_rule = result.get("template_key") == "keyword_match_v1"
    if mentions_fallback and not is_keyword_rule:
        result["condition_summary"] = _generic_structured_summary(result.get("field_keys") or [])
        issues.append("keyword_fallback_summary_replaced")
    return result
def _normalize_composite_rule(
    draft: dict[str, Any],
    *,
    fields: list[RiskRuleField],
    issues: list[str],
) -> dict[str, Any]:
    """Validate every condition of a composite rule and rebuild its metadata.

    Works on a shallow copy of ``draft``. Non-dict and invalid conditions are
    dropped (each drop is recorded in ``issues``); if nothing survives, a
    fallback condition is created. ``field_keys``, ``hit_logic`` and, when
    empty or keyword-flavoured, ``condition_summary`` are then recomputed from
    the surviving conditions. ``keywords`` is always cleared.
    """
    catalogue = {field.key: field for field in fields}
    result = dict(draft)

    kept: list[dict[str, Any]] = []
    raw_conditions = _read_list(result.get("conditions"))
    for position, raw in enumerate(raw_conditions, start=1):
        if isinstance(raw, dict):
            cleaned = _normalize_condition(raw, index=position, field_by_key=catalogue)
            if cleaned:
                kept.append(cleaned)
            else:
                issues.append(f"invalid_condition_removed:{position}")
        else:
            issues.append("non_dict_condition_removed")

    if not kept:
        kept = _build_fallback_conditions(fields)
        issues.append("fallback_conditions_created")
    result["conditions"] = kept

    referenced_keys = _collect_condition_fields(kept)
    if not referenced_keys:
        referenced_keys = [field.key for field in fields[:8]]
    result["field_keys"] = referenced_keys
    result["hit_logic"] = _normalize_hit_logic(result.get("hit_logic"), kept)

    summary_text = str(result.get("condition_summary") or "").strip()
    needs_new_summary = not summary_text or any(
        term in summary_text for term in KEYWORD_FALLBACK_TERMS
    )
    if needs_new_summary:
        result["condition_summary"] = _generic_structured_summary(result["field_keys"])
        issues.append("keyword_fallback_summary_replaced")

    result["keywords"] = []
    return result
def _normalize_condition(
    condition: dict[str, Any],
    *,
    index: int,
    field_by_key: dict[str, RiskRuleField],
) -> dict[str, Any] | None:
    """Return a cleaned copy of one composite condition, or None if unusable.

    Args:
        condition: Raw condition mapping from the draft (not mutated).
        index: 1-based position of the condition, used to build a default id.
        field_by_key: Known fields; field references outside it are dropped.

    Returns:
        A new dict with a non-empty ``id``, the stripped ``operator`` and all
        field-list slots filtered to known keys. ``None`` is returned when the
        operator is not in ``COMPOSITE_RULE_OPERATORS`` or when the operator's
        required inputs are missing.
    """
    operator = str(condition.get("operator") or "").strip()
    if operator not in COMPOSITE_RULE_OPERATORS:
        return None

    item = dict(condition)
    # Strip before falling back: a whitespace-only id is truthy, so applying
    # the default first would leave an empty id that hit logic cannot address.
    item["id"] = str(item.get("id") or "").strip() or f"condition_{index}"
    item["operator"] = operator
    for key in ("fields", "left_fields", "right_fields", "date_fields", "range_start_fields", "range_end_fields"):
        item[key] = _filter_fields(_read_string_list(item.get(key)), field_by_key)

    if operator in {"contains_any", "not_contains_any"}:
        keywords = _read_string_list(item.get("keywords"))
        if not keywords:
            return None
        # Keep at most twelve keywords per condition.
        item["keywords"] = keywords[:12]

    if operator == "date_outside_range" and not item["date_fields"]:
        return None

    if operator == "numeric_compare":
        # "comparator" is accepted as an alias; the default comparison is "gt".
        item["compare"] = str(item.get("compare") or item.get("comparator") or "gt").strip()
        if not item["left_fields"] and item["fields"]:
            item["left_fields"] = item["fields"]
        # The right-hand side may be a field list or a literal threshold/value.
        has_right = bool(item["right_fields"]) or item.get("threshold") is not None or item.get("value") is not None
        if not item["left_fields"] or not has_right:
            return None

    if operator == "duplicate_value" and not item["fields"]:
        return None
    return item
def _rewrite_keyword_rule_to_composite(
    draft: dict[str, Any],
    *,
    text: str,
    fields: list[RiskRuleField],
) -> dict[str, Any]:
    """Turn a draft into a composite rule built from the rule text.

    Returns a shallow copy of ``draft`` whose template key, conditions and hit
    logic are replaced and whose keywords are cleared. A missing or
    keyword-flavoured condition summary is replaced with a generic one.
    """
    conditions = _build_structured_conditions(text, fields)
    rewritten = {
        **draft,
        "template_key": COMPOSITE_RULE_TEMPLATE_KEY,
        "conditions": conditions,
        "hit_logic": _logic_for_conditions(conditions),
        "keywords": [],
    }

    current_summary = rewritten.get("condition_summary")
    if current_summary:
        summary_text = str(current_summary)
        replace_summary = any(term in summary_text for term in KEYWORD_FALLBACK_TERMS)
    else:
        replace_summary = True

    if replace_summary:
        condition_keys = _collect_condition_fields(conditions)
        rewritten["condition_summary"] = _generic_structured_summary(condition_keys)
    return rewritten
def _build_structured_conditions(text: str, fields: list[RiskRuleField]) -> list[dict[str, Any]]:
    """Derive composite conditions from the rule text and available fields.

    Each candidate condition is added only when the fields it needs are
    available and the text mentions a term from the matching vocabulary. When
    nothing qualifies, a single ``exists_any`` condition over the first four
    field keys is returned.
    """
    all_keys = [field.key for field in fields]

    def pick(allowed: set[str]) -> list[str]:
        # Keys from the catalogue that belong to ``allowed``, in catalogue order.
        return [key for key in all_keys if key in allowed]

    def mentions(terms: tuple[str, ...]) -> bool:
        return any(term in text for term in terms)

    attachment_keys = [key for key in all_keys if key.startswith("attachment.")]
    city_evidence = pick({"attachment.hotel_city", "attachment.route_cities"})
    city_declared = pick({"claim.location", "item.item_location"})
    evidence_dates = [
        key
        for key in all_keys
        if _field_type(key, fields) == "date" and key.startswith("attachment.")
    ]
    window_start = pick({"claim.trip_start_date", "item.item_date"})
    window_end = pick({"claim.trip_end_date", "item.item_date"})
    spent_amounts = pick({"claim.amount", "item.item_amount"})
    budget_amounts = [key for key in all_keys if key.startswith("budget.")]
    invoice_keys = pick({"attachment.invoice_no", "item.invoice_id"})

    built: list[dict[str, Any]] = []

    if attachment_keys and mentions(ATTACHMENT_TERMS):
        built.append(
            {
                "id": "attachment_evidence_present",
                "operator": "exists_any",
                "fields": attachment_keys[:4],
            }
        )

    if city_evidence and city_declared and mentions(CITY_TERMS):
        built.append(
            {
                "id": "city_outside_business_scope",
                "operator": "not_in_scope",
                "left_fields": city_evidence,
                "right_fields": city_declared,
            }
        )

    if evidence_dates and (window_start or window_end) and mentions(DATE_TERMS):
        built.append(
            {
                "id": "date_outside_business_range",
                "operator": "date_outside_range",
                "date_fields": evidence_dates,
                "range_start_fields": window_start,
                "range_end_fields": window_end,
            }
        )

    if spent_amounts and budget_amounts and mentions(AMOUNT_TERMS):
        built.append(
            {
                "id": "amount_exceeds_budget",
                "operator": "numeric_compare",
                "left_fields": spent_amounts[:1],
                "right_fields": budget_amounts[:1],
                "compare": "gt",
            }
        )

    if invoice_keys and mentions(DUPLICATE_TERMS):
        built.append(
            {
                "id": "duplicate_invoice_no",
                "operator": "duplicate_value",
                "fields": invoice_keys,
            }
        )

    exception_keywords = draft_exception_keywords_from_text(text)
    reason_keys = pick({"claim.reason", "item.item_reason"})
    if reason_keys and exception_keywords:
        built.append(
            {
                "id": "missing_reasonable_exception",
                "operator": "not_contains_any",
                "fields": reason_keys,
                "keywords": exception_keywords,
            }
        )

    if built:
        return built
    return [{"id": "structured_fields_present", "operator": "exists_any", "fields": all_keys[:4]}]
def draft_exception_keywords_from_text(text: str) -> list[str]:
    """Return the known exception phrases that occur in ``text``.

    The result follows the order of the built-in candidate list, not the
    order in which the phrases appear in the text.
    """
    found: list[str] = []
    for phrase in ("延期", "改签", "临时任务", "跨城", "绕行", "补充说明", "审批说明"):
        if phrase in text:
            found.append(phrase)
    return found
def _logic_for_conditions(conditions: list[dict[str, Any]]) -> dict[str, Any]:
required = [item["id"] for item in conditions if item.get("operator") in {"exists_any", "exists_all", "all_present"}]
exceptions = [item["id"] for item in conditions if item.get("operator") == "not_contains_any"]
anomaly = [item["id"] for item in conditions if item["id"] not in {*required, *exceptions}]
parts: list[Any] = [*required]
if len(anomaly) == 1:
parts.append(anomaly[0])
elif anomaly:
parts.append({"any": anomaly})
parts.extend(exceptions)
return {"all": parts or [item["id"] for item in conditions]}
def _normalize_hit_logic(value: Any, conditions: list[dict[str, Any]]) -> Any:
ids = {str(item.get("id") or "").strip() for item in conditions}
def normalize(node: Any) -> Any:
if isinstance(node, str):
return node if node in ids else None
if isinstance(node, list):
return [item for item in (normalize(child) for child in node) if item]
if isinstance(node, dict):
result = {}
for key in ("all", "any"):
values = normalize(node.get(key))
if values:
result[key] = values
if "not" in node:
result["not"] = normalize(node.get("not"))
return result or None
return None
normalized = normalize(value)
return normalized if normalized else _logic_for_conditions(conditions)
def _build_fallback_conditions(fields: list[RiskRuleField]) -> list[dict[str, Any]]:
return [{"id": "required_evidence_present", "operator": "exists_any", "fields": [field.key for field in fields[:4]]}]
def _requires_structured_dsl(
    text: str,
    field_keys: list[str],
    field_by_key: dict[str, RiskRuleField],
) -> bool:
    """Tell whether a rule needs structured conditions rather than keywords.

    True when the text contains any structured term, or when any selected
    field has the type ``date``, ``number`` or ``list``.
    """
    for term in STRUCTURED_TERMS:
        if term in text:
            return True

    structured_types = {"date", "number", "list"}
    for key in field_keys:
        field = field_by_key.get(key)
        if field and field.field_type in structured_types:
            return True
    return False
def _looks_like_city_rule(text: str, field_keys: list[str]) -> bool:
has_city_field = any(key in {"claim.location", "item.item_location", "attachment.hotel_city", "attachment.route_cities"} for key in field_keys)
return has_city_field and any(term in text for term in CITY_TERMS) and any(term in text for term in ("一致", "匹配", "对应", "绕行", "跨城", "改签"))
def _collect_condition_fields(conditions: list[dict[str, Any]]) -> list[str]:
    """List every field key referenced by the conditions, without duplicates.

    Keys keep the order in which they are first encountered, scanning the
    conditions in order and, within each, the field-list slots in a fixed
    order.
    """
    slots = (
        "fields",
        "left_fields",
        "right_fields",
        "date_fields",
        "range_start_fields",
        "range_end_fields",
    )
    # A dict keeps insertion order, giving an ordered de-duplication.
    ordered: dict[str, None] = {}
    for condition in conditions:
        for slot in slots:
            for key in _read_string_list(condition.get(slot)):
                ordered.setdefault(key, None)
    return list(ordered)
def _generic_structured_summary(field_keys: list[str]) -> str:
fields = "".join(field_keys[:6]) or "规则字段"
return f"按结构化字段执行判断:读取 {fields},根据字段关系、范围、阈值和例外说明决定是否命中风险。"
def _filter_fields(values: list[str], field_by_key: dict[str, RiskRuleField]) -> list[str]:
return [key for key in values if key in field_by_key]
def _field_type(key: str, fields: list[RiskRuleField]) -> str:
for field in fields:
if field.key == key:
return field.field_type
return ""
def _join_text(*values: Any) -> str:
return "\n".join(str(value or "") for value in values if str(value or "").strip())
def _read_list(value: Any) -> list[Any]:
return value if isinstance(value, list) else []
def _read_string_list(value: Any) -> list[str]:
if not isinstance(value, list):
return []
return [str(item or "").strip() for item in value if str(item or "").strip()]