331 lines
14 KiB
Python
331 lines
14 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from copy import deepcopy
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from app.services.risk_rule_generation_interpreter import (
|
||
|
|
COMPOSITE_RULE_OPERATORS,
|
||
|
|
COMPOSITE_RULE_TEMPLATE_KEY,
|
||
|
|
)
|
||
|
|
from app.services.risk_rule_generation_ontology import RiskRuleField
|
||
|
|
from app.services.risk_rule_generation_semantics import CITY_CONSISTENCY_SEMANTIC_TYPE
|
||
|
|
|
||
|
|
|
||
|
|
# Vocabulary that makes _requires_structured_dsl treat a rule text as needing
# structured conditions instead of plain keyword matching.
STRUCTURED_TERMS = (
    # consistency / matching
    "一致", "不一致", "匹配", "不匹配",
    # ranges and ordering
    "范围", "早于", "晚于",
    # amounts, budgets and thresholds
    "超过", "超出", "超预算", "预算", "余额", "阈值",
    # duplicates
    "重复", "同一发票",
    # missing evidence
    "未上传", "缺少附件",
)

# Per-topic vocabularies consulted when conditions are derived from rule text.
CITY_TERMS = (
    "城市",
    "地点",
    "目的地",
    "行程",
    "交通票",
    "住宿",
)
DATE_TERMS = (
    "日期",
    "时间",
    "开始",
    "结束",
    "早于",
    "晚于",
    "入住",
    "离店",
)
AMOUNT_TERMS = (
    "金额",
    "预算",
    "余额",
    "阈值",
    "超过",
    "超出",
    "超预算",
)
ATTACHMENT_TERMS = (
    "附件",
    "票据",
    "发票",
    "水单",
    "上传",
    "未上传",
)
DUPLICATE_TERMS = (
    "重复",
    "同一发票",
    "发票号",
    "票据号",
)

# Phrases that mark a condition summary as keyword-fallback wording; summaries
# containing them are replaced with a generic structured summary.
KEYWORD_FALLBACK_TERMS = (
    "风险关键词",
    "关键词匹配",
    "规则描述中的风险关键词",
)
|
||
|
|
|
||
|
|
|
||
|
|
def validate_risk_rule_draft(
    draft: dict[str, Any],
    *,
    fields: list[RiskRuleField],
    natural_language: str,
) -> dict[str, Any]:
    """Normalize generated DSL and record validation issues.

    This guardrail is intentionally deterministic. Hermes may provide semantic
    understanding, but executable JSON must still pass a controlled schema.

    Args:
        draft: Generated rule draft. Anything that is not a dict is treated as
            an empty draft. The input is deep-copied and never mutated.
        fields: Fields the rule may reference; unknown field keys are dropped.
        natural_language: The rule text, used together with the draft's
            description, condition summary and formula for term detection.

    Returns:
        The normalized draft, including a ``dsl_validation`` entry that lists
        the recorded issues, the final template key and the condition
        operators.
    """
    normalized = deepcopy(draft) if isinstance(draft, dict) else {}
    field_by_key = {field.key: field for field in fields}
    field_keys = _filter_fields(_read_string_list(normalized.get("field_keys")), field_by_key)
    if not field_keys:
        # No usable keys in the draft: fall back to the first eight known fields.
        field_keys = [field.key for field in fields[:8]]
    normalized["field_keys"] = field_keys

    issues: list[str] = []
    text = _join_text(
        natural_language,
        normalized.get("description"),
        normalized.get("condition_summary"),
        normalized.get("formula"),
    )

    # Strip before defaulting so a whitespace-only key also gets the default,
    # then persist the result: the dispatch below and the validation report
    # both read the key back from ``normalized`` and must see the same value
    # the branch selection used.
    template_key = str(normalized.get("template_key") or "").strip() or "field_required_v1"
    normalized["template_key"] = template_key

    if template_key != COMPOSITE_RULE_TEMPLATE_KEY and _looks_like_city_rule(text, field_keys):
        normalized["template_key"] = "field_compare_v1"
        normalized["semantic_type"] = CITY_CONSISTENCY_SEMANTIC_TYPE
        normalized["keywords"] = []
        issues.append("city_rule_normalized_to_structured_compare")
    elif template_key == "keyword_match_v1" and _requires_structured_dsl(text, field_keys, field_by_key):
        normalized = _rewrite_keyword_rule_to_composite(normalized, text=text, fields=fields)
        issues.append("keyword_rule_rewritten_to_composite_dsl")
    elif template_key == COMPOSITE_RULE_TEMPLATE_KEY and not _read_list(normalized.get("conditions")):
        normalized = _rewrite_keyword_rule_to_composite(normalized, text=text, fields=fields)
        issues.append("empty_composite_rule_built_from_structured_fields")

    if normalized.get("template_key") == COMPOSITE_RULE_TEMPLATE_KEY:
        normalized = _normalize_composite_rule(normalized, fields=fields, issues=issues)
    else:
        normalized = _normalize_non_composite_rule(normalized, fields=fields, issues=issues)

    normalized["dsl_validation"] = {
        "status": "passed",
        "issues": issues,
        "template_key": normalized.get("template_key"),
        "operators": [
            str(item.get("operator") or "").strip()
            for item in _read_list(normalized.get("conditions"))
            if isinstance(item, dict)
        ],
    }
    return normalized
|
||
|
|
|
||
|
|
|
||
|
|
def _normalize_non_composite_rule(
    draft: dict[str, Any],
    *,
    fields: list[RiskRuleField],
    issues: list[str],
) -> dict[str, Any]:
    """Return a shallow copy of a non-composite draft with cleaned field keys.

    Unknown field keys are dropped. When the condition summary uses
    keyword-fallback wording although the template is not ``keyword_match_v1``,
    the summary is replaced by a generic structured one and the replacement is
    recorded in ``issues``.
    """
    known_fields = {field.key: field for field in fields}
    result = dict(draft)
    result["field_keys"] = _filter_fields(
        _read_string_list(result.get("field_keys")),
        known_fields,
    )

    summary_text = str(result.get("condition_summary") or "").strip()
    uses_fallback_wording = any(term in summary_text for term in KEYWORD_FALLBACK_TERMS)
    is_keyword_template = result.get("template_key") == "keyword_match_v1"
    if uses_fallback_wording and not is_keyword_template:
        result["condition_summary"] = _generic_structured_summary(result.get("field_keys") or [])
        issues.append("keyword_fallback_summary_replaced")

    return result
|
||
|
|
|
||
|
|
|
||
|
|
def _normalize_composite_rule(
    draft: dict[str, Any],
    *,
    fields: list[RiskRuleField],
    issues: list[str],
) -> dict[str, Any]:
    """Return a shallow copy of a composite draft with validated conditions.

    Non-dict and invalid conditions are dropped (each drop is recorded in
    ``issues``); if nothing survives, fallback conditions are generated. Field
    keys are re-derived from the surviving conditions, the hit logic is
    normalized against them, an empty or keyword-fallback summary is replaced,
    and ``keywords`` is always cleared.
    """
    known_fields = {field.key: field for field in fields}
    result = dict(draft)

    kept: list[dict[str, Any]] = []
    for position, raw in enumerate(_read_list(result.get("conditions")), start=1):
        if not isinstance(raw, dict):
            issues.append("non_dict_condition_removed")
            continue
        cleaned = _normalize_condition(raw, index=position, field_by_key=known_fields)
        if not cleaned:
            issues.append(f"invalid_condition_removed:{position}")
            continue
        kept.append(cleaned)

    if not kept:
        kept = _build_fallback_conditions(fields)
        issues.append("fallback_conditions_created")
    result["conditions"] = kept

    referenced = _collect_condition_fields(kept)
    if not referenced:
        # Conditions reference no fields: fall back to the first eight known fields.
        referenced = [field.key for field in fields[:8]]
    result["field_keys"] = referenced

    result["hit_logic"] = _normalize_hit_logic(result.get("hit_logic"), kept)

    summary_text = str(result.get("condition_summary") or "").strip()
    if not summary_text or any(term in summary_text for term in KEYWORD_FALLBACK_TERMS):
        result["condition_summary"] = _generic_structured_summary(result["field_keys"])
        issues.append("keyword_fallback_summary_replaced")

    result["keywords"] = []
    return result
|
||
|
|
|
||
|
|
|
||
|
|
def _normalize_condition(
    condition: dict[str, Any],
    *,
    index: int,
    field_by_key: dict[str, RiskRuleField],
) -> dict[str, Any] | None:
    """Validate one composite condition and return a cleaned copy.

    Args:
        condition: Raw condition mapping from the draft.
        index: 1-based position of the condition, used to derive a default id.
        field_by_key: Known fields; field references outside it are dropped.

    Returns:
        A normalized copy of the condition, or ``None`` when the operator is
        unknown or the operator's required inputs are missing.
    """
    operator = str(condition.get("operator") or "").strip()
    if operator not in COMPOSITE_RULE_OPERATORS:
        return None

    item = dict(condition)
    # Strip first, then fall back: a whitespace-only id would otherwise pass
    # the ``or`` and end up as an empty id that hit logic can never reference.
    item["id"] = str(item.get("id") or "").strip() or f"condition_{index}"
    item["operator"] = operator
    for key in ("fields", "left_fields", "right_fields", "date_fields", "range_start_fields", "range_end_fields"):
        item[key] = _filter_fields(_read_string_list(item.get(key)), field_by_key)

    if operator in {"contains_any", "not_contains_any"}:
        keywords = _read_string_list(item.get("keywords"))
        if not keywords:
            return None
        # Keep at most twelve keywords per condition.
        item["keywords"] = keywords[:12]

    if operator == "date_outside_range" and not item["date_fields"]:
        return None

    if operator == "numeric_compare":
        # ``comparator`` is accepted as an alias for ``compare``; default is "gt".
        item["compare"] = str(item.get("compare") or item.get("comparator") or "gt").strip()
        if not item["left_fields"] and item["fields"]:
            item["left_fields"] = item["fields"]
        # The right-hand side may be other fields or a literal threshold/value.
        has_right = bool(item["right_fields"]) or item.get("threshold") is not None or item.get("value") is not None
        if not item["left_fields"] or not has_right:
            return None

    if operator == "duplicate_value" and not item["fields"]:
        return None

    return item
|
||
|
|
|
||
|
|
|
||
|
|
def _rewrite_keyword_rule_to_composite(
    draft: dict[str, Any],
    *,
    text: str,
    fields: list[RiskRuleField],
) -> dict[str, Any]:
    """Return a copy of ``draft`` converted to the composite rule template.

    Conditions are derived from ``text`` and ``fields``, hit logic is built
    from those conditions and ``keywords`` is cleared. A missing or
    keyword-fallback condition summary is replaced by a generic structured one.
    """
    structured = _build_structured_conditions(text, fields)

    result = dict(draft)
    result.update(
        template_key=COMPOSITE_RULE_TEMPLATE_KEY,
        conditions=structured,
        hit_logic=_logic_for_conditions(structured),
        keywords=[],
    )

    current_summary = result.get("condition_summary")
    needs_summary = not current_summary or any(
        term in str(current_summary or "") for term in KEYWORD_FALLBACK_TERMS
    )
    if needs_summary:
        result["condition_summary"] = _generic_structured_summary(_collect_condition_fields(structured))
    return result
|
||
|
|
|
||
|
|
|
||
|
|
def _build_structured_conditions(text: str, fields: list[RiskRuleField]) -> list[dict[str, Any]]:
    """Derive composite conditions from rule text and the available fields.

    A condition is emitted for each topic (attachments, city, date, amount,
    duplicates, exception reasons) whose fields are available and, except for
    exception reasons, whose vocabulary occurs in ``text``. Exception reasons
    are driven by the exception keywords found in ``text``.

    Args:
        text: Combined rule text to scan for topic terms.
        fields: Fields available to the rule.

    Returns:
        The derived conditions, or a single ``exists_any`` condition over the
        first four field keys when no topic matched.
    """
    field_keys = [field.key for field in fields]

    # Build the key -> type table once instead of rescanning ``fields`` for
    # every key. ``setdefault`` keeps the first definition of a duplicated
    # key, which is what a front-to-back scan would return.
    type_by_key: dict[str, str] = {}
    for field in fields:
        type_by_key.setdefault(field.key, field.field_type)

    def pick(allowed: set[str]) -> list[str]:
        """Return the available keys that are in ``allowed``, in field order."""
        return [key for key in field_keys if key in allowed]

    attachment_fields = [key for key in field_keys if key.startswith("attachment.")]
    city_left = pick({"attachment.hotel_city", "attachment.route_cities"})
    city_right = pick({"claim.location", "item.item_location", "employee.location"})
    date_fields = [
        key for key in field_keys
        if type_by_key.get(key, "") == "date" and key.startswith("attachment.")
    ]
    # ``item.item_date`` is listed as both a range start and a range end.
    range_start = pick({"claim.trip_start_date", "item.item_date"})
    range_end = pick({"claim.trip_end_date", "item.item_date"})
    amount_left = pick({"claim.amount", "item.item_amount"})
    amount_right = [key for key in field_keys if key.startswith("budget.")]
    duplicate_fields = pick({"attachment.invoice_no", "item.invoice_id"})

    conditions: list[dict[str, Any]] = []
    if attachment_fields and any(term in text for term in ATTACHMENT_TERMS):
        conditions.append({"id": "attachment_evidence_present", "operator": "exists_any", "fields": attachment_fields[:4]})
    if city_left and city_right and any(term in text for term in CITY_TERMS):
        conditions.append({"id": "city_outside_business_scope", "operator": "not_in_scope", "left_fields": city_left, "right_fields": city_right})
    if date_fields and (range_start or range_end) and any(term in text for term in DATE_TERMS):
        conditions.append({"id": "date_outside_business_range", "operator": "date_outside_range", "date_fields": date_fields, "range_start_fields": range_start, "range_end_fields": range_end})
    if amount_left and amount_right and any(term in text for term in AMOUNT_TERMS):
        conditions.append({"id": "amount_exceeds_budget", "operator": "numeric_compare", "left_fields": amount_left[:1], "right_fields": amount_right[:1], "compare": "gt"})
    if duplicate_fields and any(term in text for term in DUPLICATE_TERMS):
        conditions.append({"id": "duplicate_invoice_no", "operator": "duplicate_value", "fields": duplicate_fields})

    exception_keywords = draft_exception_keywords_from_text(text)
    exception_fields = pick({"claim.reason", "item.item_reason"})
    if exception_fields and exception_keywords:
        conditions.append({"id": "missing_reasonable_exception", "operator": "not_contains_any", "fields": exception_fields, "keywords": exception_keywords})

    return conditions or [{"id": "structured_fields_present", "operator": "exists_any", "fields": field_keys[:4]}]
|
||
|
|
|
||
|
|
|
||
|
|
def draft_exception_keywords_from_text(text: str) -> list[str]:
    """Return the known exception keywords that occur in ``text``.

    The result follows the order of the built-in keyword list, not the order
    in which the keywords appear in ``text``.
    """
    matched: list[str] = []
    for keyword in ("延期", "改签", "临时任务", "跨城", "绕行", "补充说明", "审批说明"):
        if keyword in text:
            matched.append(keyword)
    return matched
|
||
|
|
|
||
|
|
|
||
|
|
def _logic_for_conditions(conditions: list[dict[str, Any]]) -> dict[str, Any]:
    """Build the default hit logic for a list of conditions.

    Presence checks (``exists_any`` / ``exists_all`` / ``all_present``) come
    first, then the remaining "anomaly" conditions (a single id, or an
    ``{"any": [...]}`` group when there are several), then the
    ``not_contains_any`` exception checks. Everything is combined under
    ``"all"``.

    Args:
        conditions: Conditions, each carrying an ``"id"`` key.

    Returns:
        ``{"all": [...]}``; the list is empty when ``conditions`` is empty.

    Raises:
        KeyError: If a condition that must be referenced has no ``"id"``.
    """
    presence_operators = {"exists_any", "exists_all", "all_present"}
    required = [item["id"] for item in conditions if item.get("operator") in presence_operators]
    exceptions = [item["id"] for item in conditions if item.get("operator") == "not_contains_any"]

    # Build the lookup set once rather than on every loop iteration.
    classified = {*required, *exceptions}
    anomaly = [item["id"] for item in conditions if item["id"] not in classified]

    parts: list[Any] = [*required]
    if len(anomaly) == 1:
        parts.append(anomaly[0])
    elif anomaly:
        parts.append({"any": anomaly})
    parts.extend(exceptions)

    # Every condition lands in exactly one bucket, so ``parts`` is empty only
    # when ``conditions`` is empty; no separate fallback is needed.
    return {"all": parts}
|
||
|
|
|
||
|
|
|
||
|
|
def _normalize_hit_logic(value: Any, conditions: list[dict[str, Any]]) -> Any:
    """Drop references to unknown condition ids from a hit-logic tree.

    Args:
        value: Hit logic from the draft: a condition id, a list of nodes, or a
            dict using the ``"all"``, ``"any"`` and ``"not"`` keys.
        conditions: The normalized conditions whose ids may be referenced.

    Returns:
        The pruned logic tree, or the default logic for ``conditions`` when
        nothing valid remains.
    """
    ids = {str(item.get("id") or "").strip() for item in conditions}

    def normalize(node: Any) -> Any:
        """Return ``node`` without unknown ids, or a falsy value if empty."""
        if isinstance(node, str):
            return node if node in ids else None
        if isinstance(node, list):
            return [item for item in (normalize(child) for child in node) if item]
        if isinstance(node, dict):
            result = {}
            for key in ("all", "any"):
                values = normalize(node.get(key))
                if values:
                    result[key] = values
            # Keep "not" only when its operand survives; storing a None/empty
            # operand would make the node truthy and suppress the default logic.
            negated = normalize(node.get("not"))
            if negated:
                result["not"] = negated
            return result or None
        return None

    normalized = normalize(value)
    return normalized if normalized else _logic_for_conditions(conditions)
|
||
|
|
|
||
|
|
|
||
|
|
def _build_fallback_conditions(fields: list[RiskRuleField]) -> list[dict[str, Any]]:
    """Return one ``exists_any`` condition over the first four field keys."""
    leading_keys = [field.key for field in fields[:4]]
    fallback = {
        "id": "required_evidence_present",
        "operator": "exists_any",
        "fields": leading_keys,
    }
    return [fallback]
|
||
|
|
|
||
|
|
|
||
|
|
def _requires_structured_dsl(
    text: str,
    field_keys: list[str],
    field_by_key: dict[str, RiskRuleField],
) -> bool:
    """Report whether a rule needs structured conditions.

    True when ``text`` contains any structured term, or when any referenced
    field is known and has type ``date``, ``number`` or ``list``.
    """
    for term in STRUCTURED_TERMS:
        if term in text:
            return True

    structured_types = {"date", "number", "list"}
    for key in field_keys:
        field = field_by_key.get(key)
        if field and field.field_type in structured_types:
            return True
    return False
|
||
|
|
|
||
|
|
|
||
|
|
def _looks_like_city_rule(text: str, field_keys: list[str]) -> bool:
    """Report whether a rule looks like a city-consistency check.

    Requires a city-related field key, a city term in ``text`` and a
    relation term (consistency, matching, detour, ...) in ``text``.
    """
    city_field_keys = {
        "claim.location",
        "item.item_location",
        "attachment.hotel_city",
        "attachment.route_cities",
    }
    if not any(key in city_field_keys for key in field_keys):
        return False
    if not any(term in text for term in CITY_TERMS):
        return False
    relation_terms = ("一致", "匹配", "对应", "绕行", "跨城", "改签")
    return any(term in text for term in relation_terms)
|
||
|
|
|
||
|
|
|
||
|
|
def _collect_condition_fields(conditions: list[dict[str, Any]]) -> list[str]:
    """Return every field key referenced by ``conditions``.

    Keys are de-duplicated and kept in first-seen order.
    """
    slots = (
        "fields",
        "left_fields",
        "right_fields",
        "date_fields",
        "range_start_fields",
        "range_end_fields",
    )
    # A dict serves as an insertion-ordered set.
    ordered: dict[str, None] = {}
    for condition in conditions:
        for slot in slots:
            for key in _read_string_list(condition.get(slot)):
                ordered.setdefault(key, None)
    return list(ordered)
|
||
|
|
|
||
|
|
|
||
|
|
def _generic_structured_summary(field_keys: list[str]) -> str:
    """Return the generic condition summary naming up to six field keys.

    A placeholder label is used when the joined keys are empty.
    """
    label = "、".join(field_keys[:6])
    if not label:
        label = "规则字段"
    return f"按结构化字段执行判断:读取 {label},根据字段关系、范围、阈值和例外说明决定是否命中风险。"
|
||
|
|
|
||
|
|
|
||
|
|
def _filter_fields(values: list[str], field_by_key: dict[str, RiskRuleField]) -> list[str]:
    """Return the entries of ``values`` that are known field keys, in order."""
    known: list[str] = []
    for candidate in values:
        if candidate in field_by_key:
            known.append(candidate)
    return known
|
||
|
|
|
||
|
|
|
||
|
|
def _field_type(key: str, fields: list[RiskRuleField]) -> str:
    """Return the type of the first field whose key is ``key``, else ``""``."""
    matching_types = (field.field_type for field in fields if field.key == key)
    return next(matching_types, "")
|
||
|
|
|
||
|
|
|
||
|
|
def _join_text(*values: Any) -> str:
    """Join the non-blank values with newlines.

    Falsy values and values that are blank after stripping are skipped; kept
    values are not stripped.
    """
    pieces: list[str] = []
    for value in values:
        rendered = str(value or "")
        if rendered.strip():
            pieces.append(rendered)
    return "\n".join(pieces)
|
||
|
|
|
||
|
|
|
||
|
|
def _read_list(value: Any) -> list[Any]:
    """Return ``value`` itself when it is a list, otherwise a new empty list."""
    if isinstance(value, list):
        return value
    return []
|
||
|
|
|
||
|
|
|
||
|
|
def _read_string_list(value: Any) -> list[str]:
    """Return the stripped, non-empty string forms of a list's items.

    Anything that is not a list yields an empty list; falsy items and items
    that are blank after stripping are skipped.
    """
    if not isinstance(value, list):
        return []
    cleaned: list[str] = []
    for item in value:
        token = str(item or "").strip()
        if token:
            cleaned.append(token)
    return cleaned
|