feat: 新增风险图谱算法与系统仪表盘及操作反馈体系
后端新增风险图谱算法模块、风险观察与反馈服务、规则 DSL 校验器和可解释性引擎,完善系统仪表盘和财务仪表盘统计, 优化 agent 运行和编排执行链路,清理旧开发文档,前端新增 系统趋势、负载热力图等多种仪表盘图表组件,完善操作反馈 对话框和工作台日期选择器,优化报销创建和审批详情交互, 补充单元测试覆盖。
This commit is contained in:
@@ -18,21 +18,45 @@ from .employee_behavior_profile import (
|
||||
score_by_bands,
|
||||
)
|
||||
from .employee_behavior_profile_tags import build_profile_radar, build_profile_tags
|
||||
from .risk_graph import (
|
||||
ALGORITHM_VERSION as FINANCIAL_RISK_GRAPH_ALGORITHM_VERSION,
|
||||
RiskGraphClaimItemSnapshot,
|
||||
RiskGraphClaimSnapshot,
|
||||
RiskGraphEvaluationContext,
|
||||
RiskGraphEvaluationResult,
|
||||
RiskHistoryStats,
|
||||
RiskObservationDraft,
|
||||
evaluate_financial_risk_graph,
|
||||
map_ontology_to_risk_graph,
|
||||
normalize_risk_signal,
|
||||
normalize_risk_signals,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ApplicantExpenseProfileInput",
|
||||
"ApplicantExpenseProfileResult",
|
||||
"EMPLOYEE_BEHAVIOR_PROFILE_ALGORITHM_VERSION",
|
||||
"FINANCIAL_RISK_GRAPH_ALGORITHM_VERSION",
|
||||
"ProfileComponent",
|
||||
"ProfileScoreResult",
|
||||
"RiskGraphClaimItemSnapshot",
|
||||
"RiskGraphClaimSnapshot",
|
||||
"RiskGraphEvaluationContext",
|
||||
"RiskGraphEvaluationResult",
|
||||
"RiskHistoryStats",
|
||||
"RiskObservationDraft",
|
||||
"build_review_suggestions",
|
||||
"build_profile_radar",
|
||||
"build_profile_tags",
|
||||
"calculate_review_priority_score",
|
||||
"evaluate_applicant_expense_profile",
|
||||
"evaluate_financial_risk_graph",
|
||||
"evaluate_weighted_profile",
|
||||
"map_ontology_to_risk_graph",
|
||||
"employee_profile_level_from_score",
|
||||
"normalize_by_peer_percentiles",
|
||||
"normalize_risk_signal",
|
||||
"normalize_risk_signals",
|
||||
"percentile",
|
||||
"score_by_bands",
|
||||
]
|
||||
|
||||
33
server/src/app/algorithem/risk_graph/__init__.py
Normal file
33
server/src/app/algorithem/risk_graph/__init__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""Financial behavior graph risk engine."""
|
||||
|
||||
from .engine import evaluate_financial_risk_graph
|
||||
from .models import (
|
||||
ALGORITHM_VERSION,
|
||||
RiskGraphClaimItemSnapshot,
|
||||
RiskGraphClaimSnapshot,
|
||||
RiskGraphEvaluationContext,
|
||||
RiskGraphEvaluationResult,
|
||||
RiskHistoryStats,
|
||||
RiskObservationDraft,
|
||||
)
|
||||
from .ontology import OntologyRiskGraphMapping, map_ontology_to_risk_graph
|
||||
from .profile_baselines import ProfileBaselineSnapshot, ProfileBaselineUpdater
|
||||
from .signals import NormalizedRiskSignal, normalize_risk_signal, normalize_risk_signals
|
||||
|
||||
__all__ = [
|
||||
"ALGORITHM_VERSION",
|
||||
"NormalizedRiskSignal",
|
||||
"OntologyRiskGraphMapping",
|
||||
"RiskGraphClaimItemSnapshot",
|
||||
"RiskGraphClaimSnapshot",
|
||||
"RiskGraphEvaluationContext",
|
||||
"RiskGraphEvaluationResult",
|
||||
"RiskHistoryStats",
|
||||
"RiskObservationDraft",
|
||||
"ProfileBaselineSnapshot",
|
||||
"ProfileBaselineUpdater",
|
||||
"evaluate_financial_risk_graph",
|
||||
"map_ontology_to_risk_graph",
|
||||
"normalize_risk_signal",
|
||||
"normalize_risk_signals",
|
||||
]
|
||||
175
server/src/app/algorithem/risk_graph/anomaly_models.py
Normal file
175
server/src/app/algorithem/risk_graph/anomaly_models.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""Deterministic multi-model anomaly detection for risk graph features."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from statistics import median
|
||||
from typing import Any
|
||||
|
||||
ZERO = Decimal("0")
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class AnomalyPoint:
|
||||
key: str
|
||||
amount: Decimal
|
||||
occurred_at: datetime | None = None
|
||||
segment: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class AnomalyModelSignal:
|
||||
method: str
|
||||
score: int
|
||||
reason: str
|
||||
related_keys: list[str] = field(default_factory=list)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"method": self.method,
|
||||
"score": self.score,
|
||||
"reason": self.reason,
|
||||
"related_keys": list(self.related_keys),
|
||||
}
|
||||
|
||||
|
||||
class MultiModelAnomalyDetector:
|
||||
def detect(
|
||||
self,
|
||||
points: list[AnomalyPoint],
|
||||
*,
|
||||
target_key: str,
|
||||
) -> list[AnomalyModelSignal]:
|
||||
target = next((point for point in points if point.key == target_key), None)
|
||||
if target is None:
|
||||
return []
|
||||
peers = [
|
||||
point
|
||||
for point in points
|
||||
if point.key != target.key and (not target.segment or point.segment == target.segment)
|
||||
]
|
||||
if len(peers) < 3:
|
||||
return []
|
||||
|
||||
signals = [
|
||||
self._robust_statistical_signal(target, peers),
|
||||
self._isolation_proxy_signal(target, peers),
|
||||
self._local_outlier_signal(target, peers),
|
||||
self._temporal_jump_signal(target, peers),
|
||||
self._periodic_deviation_signal(target, peers),
|
||||
]
|
||||
return [signal for signal in signals if signal is not None]
|
||||
|
||||
def _robust_statistical_signal(
|
||||
self,
|
||||
target: AnomalyPoint,
|
||||
peers: list[AnomalyPoint],
|
||||
) -> AnomalyModelSignal | None:
|
||||
values = [point.amount for point in peers if point.amount >= ZERO]
|
||||
if len(values) < 3:
|
||||
return None
|
||||
center = Decimal(str(median(values)))
|
||||
deviations = [abs(value - center) for value in values]
|
||||
mad = Decimal(str(median(deviations))) or Decimal("1")
|
||||
modified_z = abs(target.amount - center) / mad
|
||||
if modified_z < Decimal("3"):
|
||||
return None
|
||||
return AnomalyModelSignal(
|
||||
method="robust_statistics",
|
||||
score=min(100, int(modified_z * Decimal("18"))),
|
||||
reason="Target amount deviates from peer median by robust MAD.",
|
||||
related_keys=[point.key for point in peers],
|
||||
)
|
||||
|
||||
def _isolation_proxy_signal(
|
||||
self,
|
||||
target: AnomalyPoint,
|
||||
peers: list[AnomalyPoint],
|
||||
) -> AnomalyModelSignal | None:
|
||||
values = sorted(point.amount for point in peers)
|
||||
if target.amount <= values[-1] * Decimal("1.8"):
|
||||
return None
|
||||
return AnomalyModelSignal(
|
||||
method="isolation_forest_proxy",
|
||||
score=min(100, int((target.amount / max(values[-1], Decimal("1"))) * Decimal("45"))),
|
||||
reason="Target amount is isolated beyond the peer maximum envelope.",
|
||||
related_keys=[point.key for point in peers[-5:]],
|
||||
)
|
||||
|
||||
def _local_outlier_signal(
|
||||
self,
|
||||
target: AnomalyPoint,
|
||||
peers: list[AnomalyPoint],
|
||||
) -> AnomalyModelSignal | None:
|
||||
distances = sorted((abs(target.amount - point.amount), point.key) for point in peers)
|
||||
nearest = distances[: min(3, len(distances))]
|
||||
peer_distances = [
|
||||
abs(left.amount - right.amount)
|
||||
for index, left in enumerate(peers)
|
||||
for right in peers[index + 1 :]
|
||||
]
|
||||
local_scale = Decimal(str(median(peer_distances))) if peer_distances else Decimal("1")
|
||||
local_scale = max(local_scale, Decimal("1"))
|
||||
target_distance = sum((distance for distance, _ in nearest), ZERO) / Decimal(len(nearest))
|
||||
ratio = target_distance / local_scale
|
||||
if ratio < Decimal("2.5"):
|
||||
return None
|
||||
return AnomalyModelSignal(
|
||||
method="local_outlier_factor_proxy",
|
||||
score=min(100, int(ratio * Decimal("24"))),
|
||||
reason="Target is far away from its nearest peer neighborhood.",
|
||||
related_keys=[key for _, key in nearest],
|
||||
)
|
||||
|
||||
def _temporal_jump_signal(
|
||||
self,
|
||||
target: AnomalyPoint,
|
||||
peers: list[AnomalyPoint],
|
||||
) -> AnomalyModelSignal | None:
|
||||
if target.occurred_at is None:
|
||||
return None
|
||||
previous = [
|
||||
point
|
||||
for point in peers
|
||||
if point.occurred_at is not None and point.occurred_at < target.occurred_at
|
||||
]
|
||||
previous = sorted(previous, key=lambda item: item.occurred_at or datetime.min)[-3:]
|
||||
if len(previous) < 3:
|
||||
return None
|
||||
average = sum((point.amount for point in previous), ZERO) / Decimal(len(previous))
|
||||
if average <= ZERO or target.amount < average * Decimal("2.2"):
|
||||
return None
|
||||
return AnomalyModelSignal(
|
||||
method="temporal_jump",
|
||||
score=min(100, int((target.amount / average) * Decimal("32"))),
|
||||
reason="Target amount jumps above the recent moving average.",
|
||||
related_keys=[point.key for point in previous],
|
||||
)
|
||||
|
||||
def _periodic_deviation_signal(
|
||||
self,
|
||||
target: AnomalyPoint,
|
||||
peers: list[AnomalyPoint],
|
||||
) -> AnomalyModelSignal | None:
|
||||
if target.occurred_at is None:
|
||||
return None
|
||||
same_period = [
|
||||
point
|
||||
for point in peers
|
||||
if point.occurred_at is not None
|
||||
and point.occurred_at.weekday() == target.occurred_at.weekday()
|
||||
]
|
||||
if len(same_period) < 2:
|
||||
return None
|
||||
average = sum((point.amount for point in same_period), ZERO) / Decimal(len(same_period))
|
||||
if average <= ZERO or target.amount < average * Decimal("2"):
|
||||
return None
|
||||
return AnomalyModelSignal(
|
||||
method="periodic_deviation",
|
||||
score=min(100, int((target.amount / average) * Decimal("30"))),
|
||||
reason="Target deviates from same-weekday periodic peer behavior.",
|
||||
related_keys=[point.key for point in same_period],
|
||||
)
|
||||
183
server/src/app/algorithem/risk_graph/consistency.py
Normal file
183
server/src/app/algorithem/risk_graph/consistency.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Multi-evidence and spatiotemporal consistency checks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskEvidence, RiskGraphClaimSnapshot
|
||||
from .signals import NormalizedRiskSignal, normalize_risk_signals
|
||||
|
||||
ZERO = Decimal("0")
|
||||
|
||||
|
||||
def evaluate_claim_consistency(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
) -> tuple[list[RiskEvidence], list[NormalizedRiskSignal]]:
|
||||
evidence: list[RiskEvidence] = []
|
||||
signals: list[NormalizedRiskSignal] = []
|
||||
|
||||
if _has_location_mismatch(claim):
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="location_mismatch_graph",
|
||||
title="Location mismatch graph",
|
||||
detail="Claim location and item location are not aligned.",
|
||||
source="spatiotemporal",
|
||||
score=64,
|
||||
)
|
||||
)
|
||||
signals.extend(normalize_risk_signals(["location_mismatch"], source="spatiotemporal"))
|
||||
|
||||
amount_mismatch = _document_amount_mismatch(claim)
|
||||
if amount_mismatch:
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="document_amount_mismatch",
|
||||
title="Document amount mismatch",
|
||||
detail="Claim amount and item amount sum are not aligned.",
|
||||
source="multi_evidence",
|
||||
score=72,
|
||||
metadata=amount_mismatch,
|
||||
)
|
||||
)
|
||||
signals.extend(
|
||||
normalize_risk_signals(
|
||||
[{"risk_signal": "document_expense_mismatch", "score": 72}],
|
||||
source="multi_evidence",
|
||||
)
|
||||
)
|
||||
|
||||
invoice_count_mismatch = _invoice_count_mismatch(claim)
|
||||
if invoice_count_mismatch:
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="invoice_count_mismatch",
|
||||
title="Invoice count mismatch",
|
||||
detail="Declared invoice count and attached invoice count are not aligned.",
|
||||
source="multi_evidence",
|
||||
score=62,
|
||||
metadata=invoice_count_mismatch,
|
||||
)
|
||||
)
|
||||
signals.extend(
|
||||
normalize_risk_signals(
|
||||
[{"risk_signal": "document_expense_mismatch", "score": 62}],
|
||||
source="multi_evidence",
|
||||
)
|
||||
)
|
||||
|
||||
date_mismatch = _item_date_outside_claim_window(claim)
|
||||
if date_mismatch:
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="date_outside_claim_window",
|
||||
title="Date outside claim window",
|
||||
detail="Item date is too far away from the claim occurrence date.",
|
||||
source="spatiotemporal",
|
||||
score=78,
|
||||
metadata=date_mismatch,
|
||||
)
|
||||
)
|
||||
signals.extend(normalize_risk_signals(["date_outside_trip"], source="spatiotemporal"))
|
||||
|
||||
return evidence, signals
|
||||
|
||||
|
||||
def _has_location_mismatch(claim: RiskGraphClaimSnapshot) -> bool:
|
||||
claim_location = _canonical_key(claim.location)
|
||||
if not claim_location or not claim.items:
|
||||
return False
|
||||
item_locations = {
|
||||
_canonical_key(item.item_location)
|
||||
for item in claim.items
|
||||
if str(item.item_location or "").strip()
|
||||
}
|
||||
if not item_locations:
|
||||
return False
|
||||
return any(location and location != claim_location for location in item_locations)
|
||||
|
||||
|
||||
def _document_amount_mismatch(claim: RiskGraphClaimSnapshot) -> dict[str, str] | None:
|
||||
if not claim.items:
|
||||
return None
|
||||
claim_amount = _to_decimal(claim.amount)
|
||||
item_amount_sum = sum((_to_decimal(item.item_amount) for item in claim.items), ZERO)
|
||||
if claim_amount <= ZERO or item_amount_sum <= ZERO:
|
||||
return None
|
||||
difference = abs(claim_amount - item_amount_sum)
|
||||
tolerance = max(Decimal("1"), claim_amount * Decimal("0.02"))
|
||||
if difference <= tolerance:
|
||||
return None
|
||||
return {
|
||||
"claim_amount": str(claim_amount),
|
||||
"item_amount_sum": str(item_amount_sum),
|
||||
"difference": str(difference),
|
||||
"tolerance": str(tolerance),
|
||||
}
|
||||
|
||||
|
||||
def _invoice_count_mismatch(claim: RiskGraphClaimSnapshot) -> dict[str, Any] | None:
|
||||
declared_count = int(claim.invoice_count or 0)
|
||||
if declared_count <= 0:
|
||||
return None
|
||||
invoice_ids = sorted(
|
||||
{
|
||||
str(item.invoice_id or "").strip()
|
||||
for item in claim.items
|
||||
if str(item.invoice_id or "").strip()
|
||||
}
|
||||
)
|
||||
actual_count = len(invoice_ids)
|
||||
if declared_count == actual_count:
|
||||
return None
|
||||
return {
|
||||
"declared_invoice_count": declared_count,
|
||||
"actual_invoice_count": actual_count,
|
||||
"invoice_ids": invoice_ids,
|
||||
}
|
||||
|
||||
|
||||
def _item_date_outside_claim_window(claim: RiskGraphClaimSnapshot) -> dict[str, Any] | None:
|
||||
occurred_date = _date_from_value(claim.occurred_at)
|
||||
if occurred_date is None or not claim.items:
|
||||
return None
|
||||
mismatches: list[dict[str, Any]] = []
|
||||
for item in claim.items:
|
||||
item_date = _date_from_value(item.item_date)
|
||||
if item_date is None:
|
||||
continue
|
||||
distance_days = abs((item_date - occurred_date).days)
|
||||
if distance_days <= 7:
|
||||
continue
|
||||
mismatches.append(
|
||||
{
|
||||
"item_id": item.item_id,
|
||||
"item_date": item_date.isoformat(),
|
||||
"occurred_at": occurred_date.isoformat(),
|
||||
"distance_days": distance_days,
|
||||
}
|
||||
)
|
||||
return {"mismatches": mismatches} if mismatches else None
|
||||
|
||||
|
||||
def _date_from_value(value: Any) -> date | None:
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, datetime):
|
||||
return value.date()
|
||||
if isinstance(value, date):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _canonical_key(value: Any) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value or "0"))
|
||||
except Exception:
|
||||
return ZERO
|
||||
77
server/src/app/algorithem/risk_graph/control_effect.py
Normal file
77
server/src/app/algorithem/risk_graph/control_effect.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Control effect analysis for risk rules, sampling, and digital employees."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
HIGH_LEVELS = {"high", "critical"}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ControlEffectSummary:
|
||||
before_count: int
|
||||
after_count: int
|
||||
risk_count_delta: int
|
||||
average_score_delta: float
|
||||
high_rate_delta: float
|
||||
confirmation_rate_delta: float
|
||||
false_positive_rate_delta: float
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"before_count": self.before_count,
|
||||
"after_count": self.after_count,
|
||||
"risk_count_delta": self.risk_count_delta,
|
||||
"average_score_delta": self.average_score_delta,
|
||||
"high_rate_delta": self.high_rate_delta,
|
||||
"confirmation_rate_delta": self.confirmation_rate_delta,
|
||||
"false_positive_rate_delta": self.false_positive_rate_delta,
|
||||
}
|
||||
|
||||
|
||||
class ControlEffectAnalyzer:
|
||||
def compare(
|
||||
self,
|
||||
before: list[dict[str, Any]],
|
||||
after: list[dict[str, Any]],
|
||||
) -> ControlEffectSummary:
|
||||
before_metrics = _metrics(before)
|
||||
after_metrics = _metrics(after)
|
||||
return ControlEffectSummary(
|
||||
before_count=before_metrics["count"],
|
||||
after_count=after_metrics["count"],
|
||||
risk_count_delta=after_metrics["count"] - before_metrics["count"],
|
||||
average_score_delta=round(after_metrics["average_score"] - before_metrics["average_score"], 4),
|
||||
high_rate_delta=round(after_metrics["high_rate"] - before_metrics["high_rate"], 4),
|
||||
confirmation_rate_delta=round(
|
||||
after_metrics["confirmation_rate"] - before_metrics["confirmation_rate"],
|
||||
4,
|
||||
),
|
||||
false_positive_rate_delta=round(
|
||||
after_metrics["false_positive_rate"] - before_metrics["false_positive_rate"],
|
||||
4,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _metrics(items: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
count = len(items)
|
||||
if count == 0:
|
||||
return {
|
||||
"count": 0,
|
||||
"average_score": 0.0,
|
||||
"high_rate": 0.0,
|
||||
"confirmation_rate": 0.0,
|
||||
"false_positive_rate": 0.0,
|
||||
}
|
||||
confirmed = sum(1 for item in items if item.get("feedback_status") == "confirmed")
|
||||
false_positive = sum(1 for item in items if item.get("feedback_status") == "false_positive")
|
||||
reviewed = confirmed + false_positive
|
||||
return {
|
||||
"count": count,
|
||||
"average_score": sum(int(item.get("risk_score") or 0) for item in items) / count,
|
||||
"high_rate": sum(1 for item in items if item.get("risk_level") in HIGH_LEVELS) / count,
|
||||
"confirmation_rate": confirmed / reviewed if reviewed else 0.0,
|
||||
"false_positive_rate": false_positive / reviewed if reviewed else 0.0,
|
||||
}
|
||||
82
server/src/app/algorithem/risk_graph/counterfactual.py
Normal file
82
server/src/app/algorithem/risk_graph/counterfactual.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Counterfactual recommendations for reducing financial risk scores."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class CounterfactualRiskAction:
|
||||
action_key: str
|
||||
title: str
|
||||
detail: str
|
||||
related_feature: str
|
||||
expected_score_delta: int
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"action_key": self.action_key,
|
||||
"title": self.title,
|
||||
"detail": self.detail,
|
||||
"related_feature": self.related_feature,
|
||||
"expected_score_delta": self.expected_score_delta,
|
||||
}
|
||||
|
||||
|
||||
class CounterfactualRiskAdvisor:
|
||||
def advise(self, observation: dict[str, Any]) -> list[CounterfactualRiskAction]:
|
||||
scores = dict(
|
||||
observation.get("contribution_scores")
|
||||
or observation.get("decision_trace", {}).get("input_scores")
|
||||
or {}
|
||||
)
|
||||
evidence_codes = {
|
||||
str(item.get("code") or "")
|
||||
for item in observation.get("evidence", [])
|
||||
if isinstance(item, dict)
|
||||
}
|
||||
trace = observation.get("decision_trace") or {}
|
||||
actions: list[CounterfactualRiskAction] = []
|
||||
|
||||
if int(scores.get("S_rule") or 0) >= 70:
|
||||
actions.append(
|
||||
CounterfactualRiskAction(
|
||||
action_key="complete_preapproval_or_required_attachment",
|
||||
title="Complete required approval evidence",
|
||||
detail="补齐事前申请、审批记录或制度要求的附件,可降低规则命中风险。",
|
||||
related_feature="S_rule",
|
||||
expected_score_delta=-20,
|
||||
)
|
||||
)
|
||||
if int(scores.get("S_anomaly") or 0) >= 70:
|
||||
actions.append(
|
||||
CounterfactualRiskAction(
|
||||
action_key="align_amount_with_peer_baseline",
|
||||
title="Align amount with peer baseline",
|
||||
detail="补充高金额原因或拆出不属于本次报销的费用,可降低基线偏离风险。",
|
||||
related_feature="S_anomaly",
|
||||
expected_score_delta=-18,
|
||||
)
|
||||
)
|
||||
if int(scores.get("S_graph") or 0) >= 70 or "duplicate_invoice_graph" in evidence_codes:
|
||||
actions.append(
|
||||
CounterfactualRiskAction(
|
||||
action_key="replace_duplicate_or_conflicting_invoice",
|
||||
title="Replace conflicting invoice",
|
||||
detail="替换重复票据、修正票据归属或说明跨单据复用原因,可降低图谱异常风险。",
|
||||
related_feature="S_graph",
|
||||
expected_score_delta=-25,
|
||||
)
|
||||
)
|
||||
if trace.get("data_quality_gate") not in {"", "passed", None}:
|
||||
actions.append(
|
||||
CounterfactualRiskAction(
|
||||
action_key="supplement_missing_risk_data",
|
||||
title="Supplement missing risk data",
|
||||
detail="补齐员工、金额、费用类型、票据明细等关键字段后再进入强风控判断。",
|
||||
related_feature="data_quality",
|
||||
expected_score_delta=-10,
|
||||
)
|
||||
)
|
||||
return actions
|
||||
132
server/src/app/algorithem/risk_graph/decisioning.py
Normal file
132
server/src/app/algorithem/risk_graph/decisioning.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""Decision trace and explanation helpers for risk graph observations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
from .models import PeerBaseline, RiskEvidence
|
||||
|
||||
RISK_SCORE_FORMULA = (
|
||||
"0.35*S_rule + 0.25*S_anomaly + "
|
||||
"0.20*S_graph + 0.15*S_policy + 0.05*S_history"
|
||||
)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DecisionTrace:
|
||||
formula: str
|
||||
algorithm_version: str
|
||||
input_scores: dict[str, int]
|
||||
output_score: int
|
||||
decision_row: str
|
||||
feature_contributions_json: list[dict[str, Any]]
|
||||
uncertainty_reasons_json: list[str]
|
||||
explanation_template_key: str
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"formula": self.formula,
|
||||
"algorithm_version": self.algorithm_version,
|
||||
"input_scores": dict(self.input_scores),
|
||||
"output_score": self.output_score,
|
||||
"decision_row": self.decision_row,
|
||||
"feature_contributions_json": list(self.feature_contributions_json),
|
||||
"uncertainty_reasons_json": list(self.uncertainty_reasons_json),
|
||||
"explanation_template_key": self.explanation_template_key,
|
||||
**self.metadata,
|
||||
}
|
||||
|
||||
|
||||
class DecisionTraceBuilder:
|
||||
def build(
|
||||
self,
|
||||
*,
|
||||
algorithm_version: str,
|
||||
risk_signal: str,
|
||||
risk_level: str,
|
||||
raw_risk_score: int,
|
||||
risk_score: int,
|
||||
contribution_scores: dict[str, int],
|
||||
evidence: list[RiskEvidence],
|
||||
baseline: PeerBaseline,
|
||||
confidence: Decimal,
|
||||
metadata: dict[str, Any],
|
||||
) -> DecisionTrace:
|
||||
return DecisionTrace(
|
||||
formula=RISK_SCORE_FORMULA,
|
||||
algorithm_version=algorithm_version,
|
||||
input_scores=contribution_scores,
|
||||
output_score=risk_score,
|
||||
decision_row=_decision_row(risk_score=risk_score, risk_level=risk_level),
|
||||
feature_contributions_json=_feature_contributions(contribution_scores),
|
||||
uncertainty_reasons_json=_uncertainty_reasons(
|
||||
raw_risk_score=raw_risk_score,
|
||||
risk_score=risk_score,
|
||||
evidence=evidence,
|
||||
baseline=baseline,
|
||||
confidence=confidence,
|
||||
metadata=metadata,
|
||||
),
|
||||
explanation_template_key=f"risk.{risk_signal}.{risk_level}",
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
def _decision_row(*, risk_score: int, risk_level: str) -> str:
|
||||
if risk_score >= 90:
|
||||
return f"{risk_level}:score>=90"
|
||||
if risk_score >= 70:
|
||||
return f"{risk_level}:70<=score<90"
|
||||
if risk_score >= 45:
|
||||
return f"{risk_level}:45<=score<70"
|
||||
return f"{risk_level}:score<45"
|
||||
|
||||
|
||||
def _feature_contributions(scores: dict[str, int]) -> list[dict[str, Any]]:
|
||||
weights = {
|
||||
"S_rule": Decimal("0.35"),
|
||||
"S_anomaly": Decimal("0.25"),
|
||||
"S_graph": Decimal("0.20"),
|
||||
"S_policy": Decimal("0.15"),
|
||||
"S_history": Decimal("0.05"),
|
||||
}
|
||||
rows = []
|
||||
for key, score in scores.items():
|
||||
weighted_score = Decimal(int(score or 0)) * weights.get(key, Decimal("0"))
|
||||
rows.append(
|
||||
{
|
||||
"feature": key,
|
||||
"score": int(score or 0),
|
||||
"weight": str(weights.get(key, Decimal("0"))),
|
||||
"weighted_score": float(weighted_score),
|
||||
}
|
||||
)
|
||||
return sorted(rows, key=lambda item: item["weighted_score"], reverse=True)
|
||||
|
||||
|
||||
def _uncertainty_reasons(
|
||||
*,
|
||||
raw_risk_score: int,
|
||||
risk_score: int,
|
||||
evidence: list[RiskEvidence],
|
||||
baseline: PeerBaseline,
|
||||
confidence: Decimal,
|
||||
metadata: dict[str, Any],
|
||||
) -> list[str]:
|
||||
reasons: list[str] = []
|
||||
if risk_score < raw_risk_score:
|
||||
reasons.append("score_capped_by_gate")
|
||||
if baseline.scope == "insufficient_sample" or baseline.sample_size <= 0:
|
||||
reasons.append("peer_baseline_insufficient")
|
||||
if confidence < Decimal("0.55"):
|
||||
reasons.append("low_confidence")
|
||||
if len({item.source for item in evidence if item.source}) < 2:
|
||||
reasons.append("single_evidence_source")
|
||||
if metadata.get("ontology_gate") == "candidate_only":
|
||||
reasons.append("ontology_candidate_only")
|
||||
if metadata.get("data_quality_gate") not in {"", "passed", None}:
|
||||
reasons.append("data_quality_gate_not_passed")
|
||||
return list(dict.fromkeys(reasons))
|
||||
794
server/src/app/algorithem/risk_graph/engine.py
Normal file
794
server/src/app/algorithem/risk_graph/engine.py
Normal file
@@ -0,0 +1,794 @@
|
||||
"""Financial behavior graph risk scoring engine."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from decimal import ROUND_CEILING, ROUND_FLOOR, ROUND_HALF_UP, Decimal
|
||||
from typing import Any
|
||||
|
||||
from .consistency import evaluate_claim_consistency
|
||||
from .decisioning import DecisionTraceBuilder
|
||||
from .graph import build_claim_graph, claim_node_key, employee_node_key
|
||||
from .models import (
|
||||
ALGORITHM_VERSION,
|
||||
AUTOMATION_ASSIST,
|
||||
AUTOMATION_AUTO_HOLD,
|
||||
AUTOMATION_MANUAL_REVIEW,
|
||||
AUTOMATION_SEMI_AUTO_REVIEW,
|
||||
LEVEL_CRITICAL,
|
||||
LEVEL_HIGH,
|
||||
LEVEL_LOW,
|
||||
LEVEL_MEDIUM,
|
||||
PeerBaseline,
|
||||
RiskEvidence,
|
||||
RiskGraphClaimSnapshot,
|
||||
RiskGraphEdge,
|
||||
RiskGraphEvaluationContext,
|
||||
RiskGraphEvaluationResult,
|
||||
RiskGraphNode,
|
||||
RiskHistoryStats,
|
||||
RiskObservationDraft,
|
||||
)
|
||||
from .ontology import map_ontology_to_risk_graph
|
||||
from .quality import RiskDataQualityGate
|
||||
from .sampling import RiskSamplingPlanner
|
||||
from .signals import (
|
||||
NormalizedRiskSignal,
|
||||
normalize_risk_signals,
|
||||
policy_refs_for_signal,
|
||||
)
|
||||
|
||||
ZERO = Decimal("0")
|
||||
ONE = Decimal("1")
|
||||
HUNDRED = Decimal("100")
|
||||
DATA_QUALITY_GATE = RiskDataQualityGate()
|
||||
SAMPLING_PLANNER = RiskSamplingPlanner()
|
||||
DECISION_TRACE_BUILDER = DecisionTraceBuilder()
|
||||
|
||||
|
||||
def evaluate_financial_risk_graph(
|
||||
context: RiskGraphEvaluationContext,
|
||||
) -> RiskGraphEvaluationResult:
|
||||
nodes, edges = build_claim_graph(context.claims)
|
||||
ontology_mapping = map_ontology_to_risk_graph(
|
||||
context.ontology_parse,
|
||||
ontology_parse_id=context.ontology_parse_id,
|
||||
ontology_version=context.ontology_version,
|
||||
)
|
||||
nodes = _merge_nodes(nodes, ontology_mapping.nodes)
|
||||
edges = _merge_edges(edges, ontology_mapping.edges)
|
||||
|
||||
target_ids = context.target_claim_ids or {claim.claim_id for claim in context.claims}
|
||||
target_claims = [claim for claim in context.claims if claim.claim_id in target_ids]
|
||||
observations: list[RiskObservationDraft] = []
|
||||
|
||||
for claim in target_claims:
|
||||
baseline = _resolve_peer_baseline(claim, context.claims, context.min_peer_sample_size)
|
||||
rule_score, rule_evidence, rule_signals = _score_rule_signals(claim)
|
||||
anomaly_score, anomaly_evidence = _score_amount_anomaly(claim, baseline)
|
||||
graph_score, graph_evidence, graph_signals = _score_graph_anomaly(claim, context)
|
||||
policy_score, policy_evidence, policy_refs = _score_policy_relevance(
|
||||
rule_signals + graph_signals + ontology_mapping.risk_signals,
|
||||
)
|
||||
history_score, history_evidence, history = _score_history(
|
||||
claim,
|
||||
rule_signals + graph_signals + ontology_mapping.risk_signals,
|
||||
context.history_stats,
|
||||
)
|
||||
|
||||
contribution_scores = {
|
||||
"S_rule": rule_score,
|
||||
"S_anomaly": anomaly_score,
|
||||
"S_graph": graph_score,
|
||||
"S_policy": policy_score,
|
||||
"S_history": history_score,
|
||||
}
|
||||
raw_risk_score = _weighted_risk_score(contribution_scores)
|
||||
quality_result = DATA_QUALITY_GATE.evaluate_claim(claim)
|
||||
evidence = [
|
||||
*rule_evidence,
|
||||
*anomaly_evidence,
|
||||
*graph_evidence,
|
||||
*policy_evidence,
|
||||
*history_evidence,
|
||||
]
|
||||
risk_score, evidence_source_gate = _apply_evidence_source_gate(
|
||||
raw_risk_score,
|
||||
evidence,
|
||||
)
|
||||
risk_score, data_quality_gate = DATA_QUALITY_GATE.apply_score_cap(
|
||||
risk_score,
|
||||
quality_result,
|
||||
)
|
||||
if risk_score < context.observation_threshold and ontology_mapping.gate != "candidate_only":
|
||||
continue
|
||||
if risk_score < context.observation_threshold and not ontology_mapping.risk_signals:
|
||||
continue
|
||||
|
||||
evidence_source_count = _evidence_source_count(evidence)
|
||||
primary_signal = _select_primary_signal(
|
||||
rule_signals + graph_signals + ontology_mapping.risk_signals,
|
||||
fallback_score=risk_score,
|
||||
)
|
||||
confidence = _calculate_confidence(
|
||||
evidence=evidence,
|
||||
baseline=baseline,
|
||||
ontology_confidence=ontology_mapping.confidence,
|
||||
history=history,
|
||||
data_quality_ok=quality_result.passed,
|
||||
)
|
||||
automation_mode = _resolve_automation_mode(
|
||||
risk_score=risk_score,
|
||||
confidence=confidence,
|
||||
evidence_count=len(evidence),
|
||||
history=history,
|
||||
)
|
||||
sampling_decision = SAMPLING_PLANNER.plan(
|
||||
risk_score=risk_score,
|
||||
confidence=confidence,
|
||||
evidence_source_count=evidence_source_count,
|
||||
data_quality_passed=quality_result.passed,
|
||||
data_quality_gate=data_quality_gate,
|
||||
history=history,
|
||||
)
|
||||
risk_level = _level_from_score(risk_score)
|
||||
decision_metadata = {
|
||||
"raw_risk_score": raw_risk_score,
|
||||
"evidence_source_count": evidence_source_count,
|
||||
"evidence_source_gate": evidence_source_gate,
|
||||
"data_quality_gate": data_quality_gate,
|
||||
"data_quality": quality_result.as_dict(),
|
||||
"sampling_strategy": sampling_decision.as_dict(),
|
||||
"contribution_scores": contribution_scores,
|
||||
"baseline_scope": baseline.scope,
|
||||
"ontology_gate": ontology_mapping.gate,
|
||||
}
|
||||
decision_trace = DECISION_TRACE_BUILDER.build(
|
||||
algorithm_version=ALGORITHM_VERSION,
|
||||
risk_signal=primary_signal.code,
|
||||
risk_level=risk_level,
|
||||
raw_risk_score=raw_risk_score,
|
||||
risk_score=risk_score,
|
||||
contribution_scores=contribution_scores,
|
||||
evidence=evidence,
|
||||
baseline=baseline,
|
||||
confidence=confidence,
|
||||
metadata=decision_metadata,
|
||||
)
|
||||
graph_node_keys = _claim_related_node_keys(claim, nodes)
|
||||
graph_edge_keys = _claim_related_edge_keys(claim, edges)
|
||||
similar_case_ids = _similar_case_ids(claim, context.claims)
|
||||
|
||||
observations.append(
|
||||
RiskObservationDraft(
|
||||
observation_key=f"risk:{claim.claim_id}:{primary_signal.code}",
|
||||
subject_type="expense_claim",
|
||||
subject_key=f"claim:{claim.claim_id}",
|
||||
subject_label=claim.claim_no or claim.claim_id,
|
||||
claim_id=claim.claim_id,
|
||||
claim_no=claim.claim_no,
|
||||
risk_type=primary_signal.code,
|
||||
risk_signal=primary_signal.code,
|
||||
title=f"{primary_signal.label} risk",
|
||||
description=_build_description(claim, primary_signal, risk_score, evidence),
|
||||
risk_score=risk_score,
|
||||
risk_level=risk_level,
|
||||
confidence_score=confidence,
|
||||
control_stage="reimbursement",
|
||||
control_mode="risk_observation",
|
||||
automation_mode=automation_mode,
|
||||
source="financial_risk_graph",
|
||||
algorithm_version=ALGORITHM_VERSION,
|
||||
contribution_scores=contribution_scores,
|
||||
baseline=baseline,
|
||||
evidence=evidence,
|
||||
graph_node_keys=graph_node_keys,
|
||||
graph_edge_keys=graph_edge_keys,
|
||||
policy_refs=policy_refs,
|
||||
similar_case_claim_ids=similar_case_ids,
|
||||
ontology_json=ontology_mapping.as_dict(),
|
||||
decision_trace=decision_trace.as_dict(),
|
||||
)
|
||||
)
|
||||
|
||||
return RiskGraphEvaluationResult(
|
||||
observations=sorted(observations, key=lambda item: item.risk_score, reverse=True),
|
||||
nodes=nodes,
|
||||
edges=edges,
|
||||
)
|
||||
|
||||
|
||||
def _score_rule_signals(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
) -> tuple[int, list[RiskEvidence], list[NormalizedRiskSignal]]:
|
||||
signals = normalize_risk_signals(claim.risk_flags, source="rule")
|
||||
if not signals:
|
||||
return 0, [], []
|
||||
score = min(100, max(item.score for item in signals) + max(0, len(signals) - 1) * 5)
|
||||
evidence = [
|
||||
RiskEvidence(
|
||||
code="rule_signal",
|
||||
title="Rule signal",
|
||||
detail=f"{signal.label}: {signal.severity}",
|
||||
source="rule",
|
||||
score=signal.score,
|
||||
metadata=signal.as_dict(),
|
||||
)
|
||||
for signal in signals
|
||||
]
|
||||
return score, evidence, signals
|
||||
|
||||
|
||||
def _score_amount_anomaly(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
baseline: PeerBaseline,
|
||||
) -> tuple[int, list[RiskEvidence]]:
|
||||
amount = _to_decimal(claim.amount)
|
||||
if baseline.sample_size <= 0 or baseline.p75_amount <= ZERO:
|
||||
return 0, [
|
||||
RiskEvidence(
|
||||
code="baseline_unavailable",
|
||||
title="Baseline unavailable",
|
||||
detail=baseline.fallback_reason or "No comparable peer sample.",
|
||||
source="baseline",
|
||||
)
|
||||
]
|
||||
|
||||
ratio = _safe_ratio(amount, baseline.p75_amount)
|
||||
score = _score_ratio(
|
||||
ratio,
|
||||
[
|
||||
(Decimal("1.00"), 0),
|
||||
(Decimal("1.25"), 30),
|
||||
(Decimal("1.50"), 55),
|
||||
(Decimal("2.00"), 75),
|
||||
(Decimal("3.00"), 95),
|
||||
],
|
||||
)
|
||||
if score <= 0:
|
||||
return 0, []
|
||||
return score, [
|
||||
RiskEvidence(
|
||||
code="peer_amount_deviation",
|
||||
title="Peer amount deviation",
|
||||
detail=(
|
||||
f"Claim amount {amount} is {ratio.quantize(Decimal('0.0001'))} "
|
||||
f"times peer p75 {baseline.p75_amount}."
|
||||
),
|
||||
source="baseline",
|
||||
score=score,
|
||||
metadata={"ratio": str(ratio), "baseline": baseline.as_dict()},
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def _score_graph_anomaly(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
context: RiskGraphEvaluationContext,
|
||||
) -> tuple[int, list[RiskEvidence], list[NormalizedRiskSignal]]:
|
||||
evidence: list[RiskEvidence] = []
|
||||
signals: list[NormalizedRiskSignal] = []
|
||||
|
||||
duplicate_claims = _duplicate_invoice_claims(claim, context.claims)
|
||||
if duplicate_claims:
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="duplicate_invoice_graph",
|
||||
title="Duplicate invoice graph",
|
||||
detail="Same invoice appears in multiple claims.",
|
||||
source="graph",
|
||||
score=95,
|
||||
related_entity_keys=[f"claim:{item.claim_id}" for item in duplicate_claims],
|
||||
)
|
||||
)
|
||||
signals.extend(normalize_risk_signals(["duplicate_invoice"], source="graph"))
|
||||
|
||||
split_claims = _split_billing_claims(claim, context.claims, context.near_threshold_amount)
|
||||
if len(split_claims) >= 3:
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="split_billing_graph",
|
||||
title="Split billing graph",
|
||||
detail="Same employee submitted several near-threshold claims in 7 days.",
|
||||
source="graph",
|
||||
score=78,
|
||||
related_entity_keys=[f"claim:{item.claim_id}" for item in split_claims],
|
||||
)
|
||||
)
|
||||
signals.extend(normalize_risk_signals(["split_billing"], source="graph"))
|
||||
|
||||
frequency_claims = _employee_frequency_claims(claim, context.claims)
|
||||
if len(frequency_claims) >= 4:
|
||||
score = min(88, 52 + len(frequency_claims) * 6)
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="frequency_graph",
|
||||
title="Frequency graph",
|
||||
detail="Same employee has dense claims under the same expense type.",
|
||||
source="graph",
|
||||
score=score,
|
||||
related_entity_keys=[f"claim:{item.claim_id}" for item in frequency_claims],
|
||||
)
|
||||
)
|
||||
signals.extend(normalize_risk_signals(["frequency_anomaly"], source="graph"))
|
||||
|
||||
consistency_evidence, consistency_signals = evaluate_claim_consistency(claim)
|
||||
evidence.extend(consistency_evidence)
|
||||
signals.extend(consistency_signals)
|
||||
|
||||
cluster_claims = _cross_department_cluster_claims(claim, context.claims)
|
||||
if len(cluster_claims) >= 3:
|
||||
evidence.append(
|
||||
RiskEvidence(
|
||||
code="cross_department_cluster",
|
||||
title="Cross-department cluster",
|
||||
detail="Multiple departments produced similar high-value claims together.",
|
||||
source="graph",
|
||||
score=74,
|
||||
related_entity_keys=[f"claim:{item.claim_id}" for item in cluster_claims],
|
||||
)
|
||||
)
|
||||
signals.extend(normalize_risk_signals(["cross_department_cluster"], source="graph"))
|
||||
|
||||
if not evidence:
|
||||
return 0, [], []
|
||||
score = min(100, max(item.score for item in evidence) + max(0, len(evidence) - 1) * 6)
|
||||
return score, evidence, _dedupe_signals(signals)
|
||||
|
||||
|
||||
def _score_policy_relevance(
|
||||
signals: list[NormalizedRiskSignal],
|
||||
) -> tuple[int, list[RiskEvidence], list[str]]:
|
||||
refs: list[str] = []
|
||||
for signal in signals:
|
||||
for ref in policy_refs_for_signal(signal.code):
|
||||
if ref not in refs:
|
||||
refs.append(ref)
|
||||
if not refs:
|
||||
return 0, [], []
|
||||
score = min(88, 45 + len(refs) * 12)
|
||||
return score, [
|
||||
RiskEvidence(
|
||||
code="policy_relevance",
|
||||
title="Policy relevance",
|
||||
detail="Risk signal is bound to policy or control clause.",
|
||||
source="policy",
|
||||
score=score,
|
||||
metadata={"policy_refs": refs},
|
||||
)
|
||||
], refs
|
||||
|
||||
|
||||
def _score_history(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
signals: list[NormalizedRiskSignal],
|
||||
history_stats: list[RiskHistoryStats],
|
||||
) -> tuple[int, list[RiskEvidence], RiskHistoryStats | None]:
|
||||
signal_codes = {item.code for item in signals}
|
||||
expense_type = _canonical_key(claim.expense_type)
|
||||
matched = [
|
||||
item
|
||||
for item in history_stats
|
||||
if item.risk_signal in signal_codes
|
||||
and (not item.expense_type or _canonical_key(item.expense_type) == expense_type)
|
||||
]
|
||||
if not matched:
|
||||
return 0, [], None
|
||||
history = max(matched, key=lambda item: item.similar_case_count)
|
||||
total = max(1, history.similar_case_count)
|
||||
confirmed_rate = Decimal(history.confirmed_count) / Decimal(total)
|
||||
returned_rate = Decimal(history.returned_count) / Decimal(total)
|
||||
false_positive_rate = Decimal(history.false_positive_count) / Decimal(total)
|
||||
score = _clamp_score(
|
||||
HUNDRED * (confirmed_rate * Decimal("0.65") + returned_rate * Decimal("0.35"))
|
||||
- HUNDRED * false_positive_rate * Decimal("0.50")
|
||||
)
|
||||
if score <= 0:
|
||||
return 0, [], history
|
||||
return score, [
|
||||
RiskEvidence(
|
||||
code="history_feedback",
|
||||
title="History feedback",
|
||||
detail="Similar historical cases contain confirmed or returned risks.",
|
||||
source="feedback",
|
||||
score=score,
|
||||
metadata=history.as_dict(),
|
||||
)
|
||||
], history
|
||||
|
||||
|
||||
def _resolve_peer_baseline(
|
||||
target: RiskGraphClaimSnapshot,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
min_sample_size: int,
|
||||
) -> PeerBaseline:
|
||||
candidates = [claim for claim in claims if claim.claim_id != target.claim_id]
|
||||
scopes = [
|
||||
(
|
||||
"department_grade_expense_type",
|
||||
[
|
||||
claim
|
||||
for claim in candidates
|
||||
if _same(claim.department_name, target.department_name)
|
||||
and _same(claim.employee_grade, target.employee_grade)
|
||||
and _same(claim.expense_type, target.expense_type)
|
||||
],
|
||||
),
|
||||
(
|
||||
"department_expense_type",
|
||||
[
|
||||
claim
|
||||
for claim in candidates
|
||||
if _same(claim.department_name, target.department_name)
|
||||
and _same(claim.expense_type, target.expense_type)
|
||||
],
|
||||
),
|
||||
(
|
||||
"expense_type",
|
||||
[claim for claim in candidates if _same(claim.expense_type, target.expense_type)],
|
||||
),
|
||||
("all_claims", candidates),
|
||||
]
|
||||
for scope, scoped_claims in scopes:
|
||||
amounts = [
|
||||
_to_decimal(claim.amount)
|
||||
for claim in scoped_claims
|
||||
if _to_decimal(claim.amount) > ZERO
|
||||
]
|
||||
if len(amounts) >= min_sample_size:
|
||||
return _build_baseline(scope, amounts)
|
||||
return PeerBaseline(
|
||||
scope="insufficient_sample",
|
||||
sample_size=0,
|
||||
fallback_reason="Peer sample is below minimum threshold.",
|
||||
)
|
||||
|
||||
|
||||
def _build_baseline(scope: str, amounts: list[Decimal]) -> PeerBaseline:
|
||||
return PeerBaseline(
|
||||
scope=scope,
|
||||
sample_size=len(amounts),
|
||||
median_amount=_percentile(amounts, 50),
|
||||
p75_amount=_percentile(amounts, 75),
|
||||
p90_amount=_percentile(amounts, 90),
|
||||
mean_amount=sum(amounts, ZERO) / Decimal(len(amounts)),
|
||||
)
|
||||
|
||||
|
||||
def _weighted_risk_score(scores: dict[str, int]) -> int:
|
||||
weighted = (
|
||||
Decimal(scores["S_rule"]) * Decimal("0.35")
|
||||
+ Decimal(scores["S_anomaly"]) * Decimal("0.25")
|
||||
+ Decimal(scores["S_graph"]) * Decimal("0.20")
|
||||
+ Decimal(scores["S_policy"]) * Decimal("0.15")
|
||||
+ Decimal(scores["S_history"]) * Decimal("0.05")
|
||||
)
|
||||
return _clamp_score(weighted)
|
||||
|
||||
|
||||
def _evidence_source_count(evidence: list[RiskEvidence]) -> int:
|
||||
return len(
|
||||
{
|
||||
str(item.source or "").strip()
|
||||
for item in evidence
|
||||
if str(item.source or "").strip()
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _apply_evidence_source_gate(
|
||||
risk_score: int,
|
||||
evidence: list[RiskEvidence],
|
||||
) -> tuple[int, str]:
|
||||
if risk_score >= 70 and _evidence_source_count(evidence) < 2:
|
||||
return 69, "capped_high_risk_single_source"
|
||||
return risk_score, "passed"
|
||||
|
||||
|
||||
def _select_primary_signal(
|
||||
signals: list[NormalizedRiskSignal],
|
||||
*,
|
||||
fallback_score: int,
|
||||
) -> NormalizedRiskSignal:
|
||||
deduped = _dedupe_signals(signals)
|
||||
if deduped:
|
||||
return max(deduped, key=lambda item: (item.score, item.confidence, item.code))
|
||||
fallback = normalize_risk_signals(
|
||||
[{"risk_signal": "amount_limit_exceeded", "score": fallback_score}],
|
||||
source="algorithm",
|
||||
)
|
||||
return fallback[0]
|
||||
|
||||
|
||||
def _calculate_confidence(
|
||||
*,
|
||||
evidence: list[RiskEvidence],
|
||||
baseline: PeerBaseline,
|
||||
ontology_confidence: Decimal,
|
||||
history: RiskHistoryStats | None,
|
||||
data_quality_ok: bool,
|
||||
) -> Decimal:
|
||||
source_count = len({item.source for item in evidence})
|
||||
confidence = Decimal("0.42") + min(Decimal("0.30"), Decimal(source_count) * Decimal("0.10"))
|
||||
confidence += min(Decimal("0.16"), Decimal(baseline.sample_size) / Decimal("50"))
|
||||
confidence += ontology_confidence * Decimal("0.08")
|
||||
if history and history.similar_case_count:
|
||||
false_positive_rate = Decimal(history.false_positive_count) / Decimal(
|
||||
history.similar_case_count
|
||||
)
|
||||
confidence -= min(Decimal("0.18"), false_positive_rate * Decimal("0.30"))
|
||||
if not data_quality_ok:
|
||||
confidence -= Decimal("0.20")
|
||||
return max(Decimal("0.05"), min(Decimal("0.98"), confidence.quantize(Decimal("0.0001"))))
|
||||
|
||||
|
||||
def _resolve_automation_mode(
|
||||
*,
|
||||
risk_score: int,
|
||||
confidence: Decimal,
|
||||
evidence_count: int,
|
||||
history: RiskHistoryStats | None,
|
||||
) -> str:
|
||||
false_positive_rate = Decimal("0")
|
||||
if history and history.similar_case_count:
|
||||
false_positive_rate = Decimal(history.false_positive_count) / Decimal(
|
||||
history.similar_case_count
|
||||
)
|
||||
if (
|
||||
risk_score >= 90
|
||||
and confidence >= Decimal("0.90")
|
||||
and evidence_count >= 3
|
||||
and false_positive_rate <= Decimal("0.10")
|
||||
):
|
||||
return AUTOMATION_AUTO_HOLD
|
||||
if risk_score >= 75 and confidence >= Decimal("0.72") and evidence_count >= 2:
|
||||
return AUTOMATION_SEMI_AUTO_REVIEW
|
||||
if risk_score >= 40:
|
||||
return AUTOMATION_MANUAL_REVIEW
|
||||
return AUTOMATION_ASSIST
|
||||
|
||||
|
||||
def _duplicate_invoice_claims(
|
||||
target: RiskGraphClaimSnapshot,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> list[RiskGraphClaimSnapshot]:
|
||||
invoice_ids = {item.invoice_id for item in target.items if item.invoice_id}
|
||||
if not invoice_ids:
|
||||
return []
|
||||
matched = []
|
||||
for claim in claims:
|
||||
if claim.claim_id == target.claim_id:
|
||||
continue
|
||||
if any(item.invoice_id in invoice_ids for item in claim.items if item.invoice_id):
|
||||
matched.append(claim)
|
||||
return matched
|
||||
|
||||
|
||||
def _split_billing_claims(
|
||||
target: RiskGraphClaimSnapshot,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
near_threshold_amount: Decimal,
|
||||
) -> list[RiskGraphClaimSnapshot]:
|
||||
if target.occurred_at is None:
|
||||
return []
|
||||
matched = [
|
||||
claim
|
||||
for claim in claims
|
||||
if _same_employee(claim, target)
|
||||
and _same(claim.expense_type, target.expense_type)
|
||||
and _same(claim.location, target.location)
|
||||
and claim.occurred_at is not None
|
||||
and abs((claim.occurred_at.date() - target.occurred_at.date()).days) <= 7
|
||||
and _to_decimal(claim.amount) <= near_threshold_amount
|
||||
and _to_decimal(claim.amount) >= near_threshold_amount * Decimal("0.55")
|
||||
]
|
||||
return matched
|
||||
|
||||
|
||||
def _employee_frequency_claims(
|
||||
target: RiskGraphClaimSnapshot,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> list[RiskGraphClaimSnapshot]:
|
||||
if target.occurred_at is None:
|
||||
return []
|
||||
return [
|
||||
claim
|
||||
for claim in claims
|
||||
if _same_employee(claim, target)
|
||||
and _same(claim.expense_type, target.expense_type)
|
||||
and claim.occurred_at is not None
|
||||
and abs((claim.occurred_at.date() - target.occurred_at.date()).days) <= 30
|
||||
]
|
||||
|
||||
|
||||
def _cross_department_cluster_claims(
|
||||
target: RiskGraphClaimSnapshot,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> list[RiskGraphClaimSnapshot]:
|
||||
if target.occurred_at is None or not target.location:
|
||||
return []
|
||||
matched = [
|
||||
claim
|
||||
for claim in claims
|
||||
if claim.occurred_at is not None
|
||||
and claim.occurred_at.date() == target.occurred_at.date()
|
||||
and _same(claim.location, target.location)
|
||||
and _same(claim.expense_type, target.expense_type)
|
||||
and _to_decimal(claim.amount) >= _to_decimal(target.amount) * Decimal("0.65")
|
||||
]
|
||||
departments = {
|
||||
_canonical_key(claim.department_name)
|
||||
for claim in matched
|
||||
if claim.department_name
|
||||
}
|
||||
return matched if len(departments) >= 2 else []
|
||||
|
||||
|
||||
def _similar_case_ids(
|
||||
target: RiskGraphClaimSnapshot,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> list[str]:
|
||||
return [
|
||||
claim.claim_id
|
||||
for claim in _employee_frequency_claims(target, claims)
|
||||
if claim.claim_id != target.claim_id
|
||||
][:8]
|
||||
|
||||
|
||||
def _claim_related_node_keys(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
nodes: list[RiskGraphNode],
|
||||
) -> list[str]:
|
||||
claim_key = claim_node_key(claim)
|
||||
employee_key = employee_node_key(claim)
|
||||
related = {claim_key}
|
||||
if employee_key:
|
||||
related.add(employee_key)
|
||||
for node in nodes:
|
||||
if str(node.key).startswith(("expense_type:", "department:", "location:")):
|
||||
if str(node.label or "").strip() in {
|
||||
claim.expense_type,
|
||||
claim.department_name,
|
||||
claim.location,
|
||||
}:
|
||||
related.add(node.key)
|
||||
return sorted(related)
|
||||
|
||||
|
||||
def _claim_related_edge_keys(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
edges: list[RiskGraphEdge],
|
||||
) -> list[dict[str, str]]:
|
||||
claim_key = claim_node_key(claim)
|
||||
return [
|
||||
{
|
||||
"source_key": edge.source_key,
|
||||
"target_key": edge.target_key,
|
||||
"edge_type": edge.edge_type,
|
||||
}
|
||||
for edge in edges
|
||||
if edge.source_key == claim_key or edge.target_key == claim_key
|
||||
]
|
||||
|
||||
|
||||
def _build_description(
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
signal: NormalizedRiskSignal,
|
||||
risk_score: int,
|
||||
evidence: list[RiskEvidence],
|
||||
) -> str:
|
||||
top_evidence = max(evidence, key=lambda item: item.score, default=None)
|
||||
if top_evidence is None:
|
||||
return (
|
||||
f"{claim.claim_no or claim.claim_id} produced "
|
||||
f"{signal.label} with score {risk_score}."
|
||||
)
|
||||
return (
|
||||
f"{claim.claim_no or claim.claim_id} produced {signal.label} "
|
||||
f"with score {risk_score}. Main evidence: {top_evidence.detail}"
|
||||
)
|
||||
|
||||
|
||||
def _level_from_score(score: int) -> str:
|
||||
if score >= 90:
|
||||
return LEVEL_CRITICAL
|
||||
if score >= 70:
|
||||
return LEVEL_HIGH
|
||||
if score >= 45:
|
||||
return LEVEL_MEDIUM
|
||||
return LEVEL_LOW
|
||||
|
||||
|
||||
def _score_ratio(value: Decimal, bands: list[tuple[Decimal, int]]) -> int:
|
||||
if not bands:
|
||||
return 0
|
||||
points = sorted(bands, key=lambda item: item[0])
|
||||
if value <= points[0][0]:
|
||||
return points[0][1]
|
||||
for index in range(1, len(points)):
|
||||
left_value, left_score = points[index - 1]
|
||||
right_value, right_score = points[index]
|
||||
if value > right_value:
|
||||
continue
|
||||
ratio = (value - left_value) / (right_value - left_value)
|
||||
return _clamp_score(Decimal(left_score) + ratio * Decimal(right_score - left_score))
|
||||
return points[-1][1]
|
||||
|
||||
|
||||
def _percentile(values: list[Decimal], percent: int) -> Decimal:
|
||||
normalized = sorted(value for value in values if value >= ZERO)
|
||||
if not normalized:
|
||||
return ZERO
|
||||
if len(normalized) == 1:
|
||||
return normalized[0]
|
||||
position = Decimal(len(normalized) - 1) * Decimal(percent) / HUNDRED
|
||||
lower = int(position.to_integral_value(rounding=ROUND_FLOOR))
|
||||
upper = int(position.to_integral_value(rounding=ROUND_CEILING))
|
||||
if lower == upper:
|
||||
return normalized[lower]
|
||||
fraction = position - Decimal(lower)
|
||||
return normalized[lower] + (normalized[upper] - normalized[lower]) * fraction
|
||||
|
||||
|
||||
def _safe_ratio(numerator: Any, denominator: Any) -> Decimal:
|
||||
denominator_value = _to_decimal(denominator)
|
||||
if denominator_value <= ZERO:
|
||||
return ZERO
|
||||
return (_to_decimal(numerator) / denominator_value).quantize(Decimal("0.0001"))
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value or "0"))
|
||||
except Exception:
|
||||
return ZERO
|
||||
|
||||
|
||||
def _clamp_score(value: Any) -> int:
|
||||
try:
|
||||
numeric = Decimal(str(value))
|
||||
except Exception:
|
||||
numeric = ZERO
|
||||
return max(0, min(100, int(numeric.quantize(ONE, rounding=ROUND_HALF_UP))))
|
||||
|
||||
|
||||
def _same(left: Any, right: Any) -> bool:
|
||||
return _canonical_key(left) == _canonical_key(right)
|
||||
|
||||
|
||||
def _same_employee(left: RiskGraphClaimSnapshot, right: RiskGraphClaimSnapshot) -> bool:
|
||||
left_key = left.employee_id or left.employee_name
|
||||
right_key = right.employee_id or right.employee_name
|
||||
return bool(left_key and _same(left_key, right_key))
|
||||
|
||||
|
||||
def _canonical_key(value: Any) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
|
||||
|
||||
def _dedupe_signals(signals: list[NormalizedRiskSignal]) -> list[NormalizedRiskSignal]:
|
||||
by_code: dict[str, NormalizedRiskSignal] = {}
|
||||
for signal in signals:
|
||||
current = by_code.get(signal.code)
|
||||
if current is None or signal.score > current.score:
|
||||
by_code[signal.code] = signal
|
||||
return list(by_code.values())
|
||||
|
||||
|
||||
def _merge_nodes(
|
||||
first: list[RiskGraphNode],
|
||||
second: list[RiskGraphNode],
|
||||
) -> list[RiskGraphNode]:
|
||||
by_key = {node.key: node for node in first}
|
||||
for node in second:
|
||||
by_key.setdefault(node.key, node)
|
||||
return list(by_key.values())
|
||||
|
||||
|
||||
def _merge_edges(
|
||||
first: list[RiskGraphEdge],
|
||||
second: list[RiskGraphEdge],
|
||||
) -> list[RiskGraphEdge]:
|
||||
by_key = {edge.edge_key(): edge for edge in first}
|
||||
for edge in second:
|
||||
by_key.setdefault(edge.edge_key(), edge)
|
||||
return list(by_key.values())
|
||||
113
server/src/app/algorithem/risk_graph/entity_resolution.py
Normal file
113
server/src/app/algorithem/risk_graph/entity_resolution.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""Canonical entity resolution for financial risk graph subjects."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
|
||||
ENTITY_TYPE_ALIASES = {
|
||||
"supplier": "vendor",
|
||||
"merchant": "vendor",
|
||||
"hotel": "vendor",
|
||||
"bank_account_name": "bank_account",
|
||||
"employee_name": "employee",
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class CanonicalEntity:
|
||||
canonical_id: str
|
||||
entity_type: str
|
||||
canonical_key: str
|
||||
label: str
|
||||
aliases: list[str] = field(default_factory=list)
|
||||
source: str = ""
|
||||
confirmed_by: str = ""
|
||||
confirmed_at: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"canonical_id": self.canonical_id,
|
||||
"entity_type": self.entity_type,
|
||||
"canonical_key": self.canonical_key,
|
||||
"label": self.label,
|
||||
"aliases": list(self.aliases),
|
||||
"source": self.source,
|
||||
"confirmed_by": self.confirmed_by,
|
||||
"confirmed_at": self.confirmed_at,
|
||||
"metadata": dict(self.metadata),
|
||||
}
|
||||
|
||||
|
||||
class FinancialEntityResolver:
|
||||
def resolve(
|
||||
self,
|
||||
entity_type: str,
|
||||
value: str,
|
||||
*,
|
||||
source: str = "",
|
||||
metadata: dict[str, Any] | None = None,
|
||||
) -> CanonicalEntity | None:
|
||||
canonical_type = ENTITY_TYPE_ALIASES.get(_canonical_token(entity_type), _canonical_token(entity_type))
|
||||
canonical_key = _canonical_value(value)
|
||||
if not canonical_type or not canonical_key:
|
||||
return None
|
||||
canonical_id = _canonical_id(canonical_type, canonical_key)
|
||||
return CanonicalEntity(
|
||||
canonical_id=canonical_id,
|
||||
entity_type=canonical_type,
|
||||
canonical_key=canonical_key,
|
||||
label=str(value or "").strip(),
|
||||
aliases=[str(value or "").strip()],
|
||||
source=source,
|
||||
metadata=metadata or {},
|
||||
)
|
||||
|
||||
|
||||
class CanonicalEntityRegistry:
|
||||
def __init__(self) -> None:
|
||||
self._entities: dict[str, CanonicalEntity] = {}
|
||||
|
||||
def upsert(self, entity: CanonicalEntity) -> CanonicalEntity:
|
||||
current = self._entities.get(entity.canonical_id)
|
||||
if current is None:
|
||||
self._entities[entity.canonical_id] = entity
|
||||
return entity
|
||||
aliases = list(dict.fromkeys([*current.aliases, *entity.aliases]))
|
||||
current.aliases = aliases
|
||||
current.metadata.update(entity.metadata)
|
||||
return current
|
||||
|
||||
def confirm(self, canonical_id: str, *, actor: str) -> CanonicalEntity | None:
|
||||
entity = self._entities.get(canonical_id)
|
||||
if entity is None:
|
||||
return None
|
||||
entity.confirmed_by = str(actor or "").strip()
|
||||
entity.confirmed_at = datetime.now(UTC).isoformat()
|
||||
return entity
|
||||
|
||||
def get(self, canonical_id: str) -> CanonicalEntity | None:
|
||||
return self._entities.get(canonical_id)
|
||||
|
||||
def all(self) -> list[CanonicalEntity]:
|
||||
return list(self._entities.values())
|
||||
|
||||
|
||||
def _canonical_id(entity_type: str, canonical_key: str) -> str:
|
||||
digest = hashlib.sha1(f"{entity_type}:{canonical_key}".encode("utf-8")).hexdigest()[:12]
|
||||
return f"{entity_type}:{digest}"
|
||||
|
||||
|
||||
def _canonical_token(value: str) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
|
||||
|
||||
def _canonical_value(value: str) -> str:
|
||||
normalized = str(value or "").strip().lower()
|
||||
normalized = re.sub(r"[\s\-_/,,.。()()【】\[\]]+", "", normalized)
|
||||
return normalized
|
||||
71
server/src/app/algorithem/risk_graph/evaluation_cases.py
Normal file
71
server/src/app/algorithem/risk_graph/evaluation_cases.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""Replayable evaluation cases for the financial risk graph algorithm."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class RiskEvaluationCase:
|
||||
case_id: str
|
||||
category: str
|
||||
expected_signal: str
|
||||
expected_level: str
|
||||
description: str
|
||||
payload: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"case_id": self.case_id,
|
||||
"category": self.category,
|
||||
"expected_signal": self.expected_signal,
|
||||
"expected_level": self.expected_level,
|
||||
"description": self.description,
|
||||
"payload": dict(self.payload),
|
||||
}
|
||||
|
||||
|
||||
def default_risk_evaluation_cases() -> list[RiskEvaluationCase]:
|
||||
return [
|
||||
RiskEvaluationCase(
|
||||
case_id="positive_duplicate_invoice_high",
|
||||
category="positive",
|
||||
expected_signal="duplicate_invoice",
|
||||
expected_level="high",
|
||||
description="重复发票叠加高金额偏离,应输出高风险观察。",
|
||||
payload={"risk_flags": ["duplicate_invoice"], "invoice_reuse": True},
|
||||
),
|
||||
RiskEvaluationCase(
|
||||
case_id="negative_clean_low_amount",
|
||||
category="negative",
|
||||
expected_signal="none",
|
||||
expected_level="none",
|
||||
description="低金额、无规则命中、无图谱异常,不应输出风险观察。",
|
||||
payload={"amount": 300, "risk_flags": []},
|
||||
),
|
||||
RiskEvaluationCase(
|
||||
case_id="counterfactual_invoice_corrected",
|
||||
category="counterfactual",
|
||||
expected_signal="none",
|
||||
expected_level="none",
|
||||
description="重复票据被替换为唯一票据后,风险应消失或降级。",
|
||||
payload={"remove_duplicate_invoice": True},
|
||||
),
|
||||
RiskEvaluationCase(
|
||||
case_id="noise_missing_employee",
|
||||
category="noise",
|
||||
expected_signal="preapproval_absent",
|
||||
expected_level="medium",
|
||||
description="缺失员工信息时允许候选观察,但不能输出强风控结论。",
|
||||
payload={"missing_fields": ["employee"], "score_cap": 69},
|
||||
),
|
||||
RiskEvaluationCase(
|
||||
case_id="historical_false_positive_calibration",
|
||||
category="historical_false_positive",
|
||||
expected_signal="duplicate_invoice",
|
||||
expected_level="medium",
|
||||
description="历史误报率较高时进入校准抽审,不直接强拦截。",
|
||||
payload={"false_positive_rate": 0.35},
|
||||
),
|
||||
]
|
||||
144
server/src/app/algorithem/risk_graph/features.py
Normal file
144
server/src/app/algorithem/risk_graph/features.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Feature extraction for heterogeneous financial risk graphs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter, defaultdict, deque
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskGraphEdge, RiskGraphNode
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphFeatureSet:
|
||||
node_type_counts: dict[str, int] = field(default_factory=dict)
|
||||
edge_type_counts: dict[str, int] = field(default_factory=dict)
|
||||
meta_path_counts: dict[str, int] = field(default_factory=dict)
|
||||
degree_centrality: dict[str, float] = field(default_factory=dict)
|
||||
clusters: list[dict[str, Any]] = field(default_factory=list)
|
||||
neighbor_risk_density: dict[str, float] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"node_type_counts": dict(self.node_type_counts),
|
||||
"edge_type_counts": dict(self.edge_type_counts),
|
||||
"meta_path_counts": dict(self.meta_path_counts),
|
||||
"degree_centrality": dict(self.degree_centrality),
|
||||
"clusters": list(self.clusters),
|
||||
"neighbor_risk_density": dict(self.neighbor_risk_density),
|
||||
}
|
||||
|
||||
|
||||
class HeterogeneousRiskGraphFeatureBuilder:
|
||||
def build(
|
||||
self,
|
||||
nodes: list[RiskGraphNode],
|
||||
edges: list[RiskGraphEdge],
|
||||
*,
|
||||
risk_node_keys: set[str] | None = None,
|
||||
) -> RiskGraphFeatureSet:
|
||||
node_by_key = {node.key: node for node in nodes}
|
||||
adjacency = _build_adjacency(edges)
|
||||
risk_keys = set(risk_node_keys or set())
|
||||
return RiskGraphFeatureSet(
|
||||
node_type_counts=dict(Counter(node.node_type for node in nodes)),
|
||||
edge_type_counts=dict(Counter(edge.edge_type for edge in edges)),
|
||||
meta_path_counts=_meta_path_counts(node_by_key, adjacency),
|
||||
degree_centrality=_degree_centrality(node_by_key, adjacency),
|
||||
clusters=_clusters(node_by_key, adjacency),
|
||||
neighbor_risk_density=_neighbor_risk_density(node_by_key, adjacency, risk_keys),
|
||||
)
|
||||
|
||||
|
||||
def _build_adjacency(edges: list[RiskGraphEdge]) -> dict[str, list[tuple[str, str]]]:
|
||||
adjacency: dict[str, list[tuple[str, str]]] = defaultdict(list)
|
||||
for edge in edges:
|
||||
adjacency[edge.source_key].append((edge.target_key, edge.edge_type))
|
||||
adjacency[edge.target_key].append((edge.source_key, edge.edge_type))
|
||||
return adjacency
|
||||
|
||||
|
||||
def _meta_path_counts(
|
||||
node_by_key: dict[str, RiskGraphNode],
|
||||
adjacency: dict[str, list[tuple[str, str]]],
|
||||
) -> dict[str, int]:
|
||||
counts: Counter[str] = Counter()
|
||||
for source_key, first_hops in adjacency.items():
|
||||
source = node_by_key.get(source_key)
|
||||
if source is None:
|
||||
continue
|
||||
for middle_key, first_edge_type in first_hops:
|
||||
middle = node_by_key.get(middle_key)
|
||||
if middle is None:
|
||||
continue
|
||||
for target_key, second_edge_type in adjacency.get(middle_key, []):
|
||||
if target_key == source_key:
|
||||
continue
|
||||
target = node_by_key.get(target_key)
|
||||
if target is None:
|
||||
continue
|
||||
key = (
|
||||
f"{source.node_type}->{first_edge_type}->{middle.node_type}"
|
||||
f"->{second_edge_type}->{target.node_type}"
|
||||
)
|
||||
counts[key] += 1
|
||||
return dict(counts)
|
||||
|
||||
|
||||
def _degree_centrality(
|
||||
node_by_key: dict[str, RiskGraphNode],
|
||||
adjacency: dict[str, list[tuple[str, str]]],
|
||||
) -> dict[str, float]:
|
||||
denominator = max(1, len(node_by_key) - 1)
|
||||
return {
|
||||
node_key: round(len(adjacency.get(node_key, [])) / denominator, 4)
|
||||
for node_key in node_by_key
|
||||
}
|
||||
|
||||
|
||||
def _clusters(
|
||||
node_by_key: dict[str, RiskGraphNode],
|
||||
adjacency: dict[str, list[tuple[str, str]]],
|
||||
) -> list[dict[str, Any]]:
|
||||
visited: set[str] = set()
|
||||
clusters: list[dict[str, Any]] = []
|
||||
for start_key in node_by_key:
|
||||
if start_key in visited:
|
||||
continue
|
||||
queue: deque[str] = deque([start_key])
|
||||
visited.add(start_key)
|
||||
members: list[str] = []
|
||||
type_counts: Counter[str] = Counter()
|
||||
while queue:
|
||||
node_key = queue.popleft()
|
||||
members.append(node_key)
|
||||
type_counts[node_by_key[node_key].node_type] += 1
|
||||
for next_key, _ in adjacency.get(node_key, []):
|
||||
if next_key in visited or next_key not in node_by_key:
|
||||
continue
|
||||
visited.add(next_key)
|
||||
queue.append(next_key)
|
||||
clusters.append(
|
||||
{
|
||||
"size": len(members),
|
||||
"node_keys": sorted(members),
|
||||
"node_type_counts": dict(type_counts),
|
||||
}
|
||||
)
|
||||
return sorted(clusters, key=lambda item: item["size"], reverse=True)
|
||||
|
||||
|
||||
def _neighbor_risk_density(
|
||||
node_by_key: dict[str, RiskGraphNode],
|
||||
adjacency: dict[str, list[tuple[str, str]]],
|
||||
risk_keys: set[str],
|
||||
) -> dict[str, float]:
|
||||
density: dict[str, float] = {}
|
||||
for node_key in node_by_key:
|
||||
neighbors = [target for target, _ in adjacency.get(node_key, [])]
|
||||
if not neighbors:
|
||||
density[node_key] = 0.0
|
||||
continue
|
||||
risk_neighbor_count = sum(1 for target in neighbors if target in risk_keys)
|
||||
density[node_key] = round(risk_neighbor_count / len(neighbors), 4)
|
||||
return density
|
||||
307
server/src/app/algorithem/risk_graph/graph.py
Normal file
307
server/src/app/algorithem/risk_graph/graph.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""Graph construction helpers for expense risk analysis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from .models import RiskGraphClaimSnapshot, RiskGraphEdge, RiskGraphNode
|
||||
|
||||
ALLOWED_EDGE_TYPES = {
|
||||
"department_has_employee",
|
||||
"employee_submits_claim",
|
||||
"claim_has_item",
|
||||
"claim_expense_type",
|
||||
"claim_location",
|
||||
"claim_invoice",
|
||||
"claim_has_risk_signal",
|
||||
"claim_similar_to",
|
||||
"claim_duplicate_invoice",
|
||||
"ontology_extracts",
|
||||
"ontology_constrains",
|
||||
"ontology_signals",
|
||||
}
|
||||
|
||||
|
||||
def build_claim_graph(
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> tuple[list[RiskGraphNode], list[RiskGraphEdge]]:
|
||||
nodes: dict[str, RiskGraphNode] = {}
|
||||
edges: dict[tuple[str, str, str], RiskGraphEdge] = {}
|
||||
|
||||
for claim in claims:
|
||||
claim_key = claim_node_key(claim)
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=claim_key,
|
||||
node_type="claim",
|
||||
label=claim.claim_no or claim.claim_id,
|
||||
canonical_key=claim_key,
|
||||
canonical_id=claim.claim_id or claim.claim_no,
|
||||
metadata={
|
||||
"claim_id": claim.claim_id,
|
||||
"amount": str(_to_decimal(claim.amount)),
|
||||
"expense_type": claim.expense_type,
|
||||
"status": claim.status,
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
employee_key = employee_node_key(claim)
|
||||
if employee_key:
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=employee_key,
|
||||
node_type="employee",
|
||||
label=claim.employee_name or claim.employee_id or "unknown",
|
||||
canonical_key=employee_key,
|
||||
canonical_id=claim.employee_id or claim.employee_name,
|
||||
metadata={"employee_id": claim.employee_id, "grade": claim.employee_grade},
|
||||
),
|
||||
)
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=employee_key,
|
||||
target_key=claim_key,
|
||||
edge_type="employee_submits_claim",
|
||||
metadata={"amount": str(_to_decimal(claim.amount))},
|
||||
),
|
||||
)
|
||||
|
||||
department_key = department_node_key(claim)
|
||||
if department_key:
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=department_key,
|
||||
node_type="department",
|
||||
label=claim.department_name or claim.department_id or "unknown",
|
||||
canonical_key=department_key,
|
||||
canonical_id=claim.department_id or claim.department_name,
|
||||
metadata={"department_id": claim.department_id},
|
||||
),
|
||||
)
|
||||
if employee_key:
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=department_key,
|
||||
target_key=employee_key,
|
||||
edge_type="department_has_employee",
|
||||
),
|
||||
)
|
||||
|
||||
expense_key = expense_type_node_key(claim.expense_type)
|
||||
if expense_key:
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=expense_key,
|
||||
node_type="expense_type",
|
||||
label=claim.expense_type,
|
||||
canonical_key=expense_key,
|
||||
canonical_id=claim.expense_type,
|
||||
),
|
||||
)
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_key,
|
||||
target_key=expense_key,
|
||||
edge_type="claim_expense_type",
|
||||
),
|
||||
)
|
||||
|
||||
location_key = location_node_key(claim.location)
|
||||
if location_key:
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=location_key,
|
||||
node_type="location",
|
||||
label=claim.location,
|
||||
canonical_key=location_key,
|
||||
canonical_id=claim.location,
|
||||
),
|
||||
)
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_key,
|
||||
target_key=location_key,
|
||||
edge_type="claim_location",
|
||||
),
|
||||
)
|
||||
|
||||
for item in claim.items:
|
||||
item_key = f"claim_item:{item.item_id}" if item.item_id else ""
|
||||
if item_key:
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=item_key,
|
||||
node_type="claim_item",
|
||||
label=item.item_type or item.item_id,
|
||||
canonical_key=item_key,
|
||||
canonical_id=item.item_id,
|
||||
metadata={
|
||||
"amount": str(_to_decimal(item.item_amount)),
|
||||
"location": item.item_location,
|
||||
"invoice_id": item.invoice_id,
|
||||
},
|
||||
),
|
||||
)
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_key,
|
||||
target_key=item_key,
|
||||
edge_type="claim_has_item",
|
||||
),
|
||||
)
|
||||
if item.invoice_id:
|
||||
invoice_key = invoice_node_key(item.invoice_id)
|
||||
_add_node(
|
||||
nodes,
|
||||
RiskGraphNode(
|
||||
key=invoice_key,
|
||||
node_type="invoice",
|
||||
label=item.invoice_id,
|
||||
canonical_key=invoice_key,
|
||||
canonical_id=item.invoice_id,
|
||||
),
|
||||
)
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_key,
|
||||
target_key=invoice_key,
|
||||
edge_type="claim_invoice",
|
||||
),
|
||||
)
|
||||
|
||||
_link_duplicate_invoices(claims, edges)
|
||||
_link_similar_claims(claims, edges)
|
||||
return list(nodes.values()), list(edges.values())
|
||||
|
||||
|
||||
def claim_node_key(claim: RiskGraphClaimSnapshot) -> str:
|
||||
return f"claim:{claim.claim_id or claim.claim_no}"
|
||||
|
||||
|
||||
def employee_node_key(claim: RiskGraphClaimSnapshot) -> str:
|
||||
identifier = claim.employee_id or claim.employee_name
|
||||
return f"employee:{_canonical_key(identifier)}" if identifier else ""
|
||||
|
||||
|
||||
def department_node_key(claim: RiskGraphClaimSnapshot) -> str:
|
||||
identifier = claim.department_id or claim.department_name
|
||||
return f"department:{_canonical_key(identifier)}" if identifier else ""
|
||||
|
||||
|
||||
def expense_type_node_key(expense_type: str) -> str:
|
||||
return f"expense_type:{_canonical_key(expense_type)}" if str(expense_type or "").strip() else ""
|
||||
|
||||
|
||||
def location_node_key(location: str) -> str:
|
||||
return f"location:{_canonical_key(location)}" if str(location or "").strip() else ""
|
||||
|
||||
|
||||
def invoice_node_key(invoice_id: str) -> str:
|
||||
return f"invoice:{_canonical_key(invoice_id)}"
|
||||
|
||||
|
||||
def _link_duplicate_invoices(
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
edges: dict[tuple[str, str, str], RiskGraphEdge],
|
||||
) -> None:
|
||||
by_invoice: dict[str, list[RiskGraphClaimSnapshot]] = {}
|
||||
for claim in claims:
|
||||
for item in claim.items:
|
||||
if item.invoice_id:
|
||||
by_invoice.setdefault(item.invoice_id, []).append(claim)
|
||||
|
||||
for invoice_id, invoice_claims in by_invoice.items():
|
||||
unique_claims = {claim.claim_id: claim for claim in invoice_claims}
|
||||
if len(unique_claims) < 2:
|
||||
continue
|
||||
claim_list = list(unique_claims.values())
|
||||
for source in claim_list:
|
||||
for target in claim_list:
|
||||
if source.claim_id == target.claim_id:
|
||||
continue
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_node_key(source),
|
||||
target_key=claim_node_key(target),
|
||||
edge_type="claim_duplicate_invoice",
|
||||
weight=Decimal("2"),
|
||||
evidence=f"invoice:{invoice_id}",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _link_similar_claims(
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
edges: dict[tuple[str, str, str], RiskGraphEdge],
|
||||
) -> None:
|
||||
for index, source in enumerate(claims):
|
||||
for target in claims[index + 1 :]:
|
||||
if not _is_similar_claim(source, target):
|
||||
continue
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_node_key(source),
|
||||
target_key=claim_node_key(target),
|
||||
edge_type="claim_similar_to",
|
||||
weight=Decimal("0.7"),
|
||||
metadata={"reason": "same employee and expense type"},
|
||||
),
|
||||
)
|
||||
_add_edge(
|
||||
edges,
|
||||
RiskGraphEdge(
|
||||
source_key=claim_node_key(target),
|
||||
target_key=claim_node_key(source),
|
||||
edge_type="claim_similar_to",
|
||||
weight=Decimal("0.7"),
|
||||
metadata={"reason": "same employee and expense type"},
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _is_similar_claim(source: RiskGraphClaimSnapshot, target: RiskGraphClaimSnapshot) -> bool:
|
||||
source_employee = source.employee_id or source.employee_name
|
||||
target_employee = target.employee_id or target.employee_name
|
||||
if not source_employee or _canonical_key(source_employee) != _canonical_key(target_employee):
|
||||
return False
|
||||
if _canonical_key(source.expense_type) != _canonical_key(target.expense_type):
|
||||
return False
|
||||
if source.occurred_at is None or target.occurred_at is None:
|
||||
return True
|
||||
return abs((source.occurred_at.date() - target.occurred_at.date()).days) <= 30
|
||||
|
||||
|
||||
def _add_node(nodes: dict[str, RiskGraphNode], node: RiskGraphNode) -> None:
|
||||
nodes.setdefault(node.key, node)
|
||||
|
||||
|
||||
def _add_edge(edges: dict[tuple[str, str, str], RiskGraphEdge], edge: RiskGraphEdge) -> None:
|
||||
if edge.edge_type not in ALLOWED_EDGE_TYPES:
|
||||
return
|
||||
edges.setdefault(edge.edge_key(), edge)
|
||||
|
||||
|
||||
def _canonical_key(value: str | None) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
|
||||
|
||||
def _to_decimal(value: object) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value or "0"))
|
||||
except Exception:
|
||||
return Decimal("0")
|
||||
103
server/src/app/algorithem/risk_graph/lineage.py
Normal file
103
server/src/app/algorithem/risk_graph/lineage.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""Data lineage contracts for risk graph observations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskDataLineage:
|
||||
observation_key: str
|
||||
data_tables: list[str] = field(default_factory=list)
|
||||
document_ids: list[str] = field(default_factory=list)
|
||||
ocr_job_ids: list[str] = field(default_factory=list)
|
||||
agent_run_ids: list[str] = field(default_factory=list)
|
||||
tool_call_ids: list[str] = field(default_factory=list)
|
||||
rule_versions: list[str] = field(default_factory=list)
|
||||
ontology_version: str = ""
|
||||
algorithm_version: str = ""
|
||||
source_event_ids: list[str] = field(default_factory=list)
|
||||
quality_gates: list[str] = field(default_factory=list)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"observation_key": self.observation_key,
|
||||
"data_tables": list(self.data_tables),
|
||||
"document_ids": list(self.document_ids),
|
||||
"ocr_job_ids": list(self.ocr_job_ids),
|
||||
"agent_run_ids": list(self.agent_run_ids),
|
||||
"tool_call_ids": list(self.tool_call_ids),
|
||||
"rule_versions": list(self.rule_versions),
|
||||
"ontology_version": self.ontology_version,
|
||||
"algorithm_version": self.algorithm_version,
|
||||
"source_event_ids": list(self.source_event_ids),
|
||||
"quality_gates": list(self.quality_gates),
|
||||
}
|
||||
|
||||
|
||||
class RiskDataLineageBuilder:
|
||||
def build_from_observation(
|
||||
self,
|
||||
observation: dict[str, Any],
|
||||
*,
|
||||
source_event_ids: list[str] | None = None,
|
||||
) -> RiskDataLineage:
|
||||
evidence = [item for item in observation.get("evidence", []) if isinstance(item, dict)]
|
||||
ontology_json = observation.get("ontology_json") or {}
|
||||
decision_trace = observation.get("decision_trace") or {}
|
||||
data_tables = ["risk_observations"]
|
||||
if observation.get("claim_id"):
|
||||
data_tables.extend(["expense_claims", "expense_claim_items"])
|
||||
if evidence:
|
||||
data_tables.append("risk_observation_evidence")
|
||||
|
||||
return RiskDataLineage(
|
||||
observation_key=str(observation.get("observation_key") or ""),
|
||||
data_tables=_unique(data_tables),
|
||||
document_ids=_evidence_values(evidence, ["document_id", "doc_id", "file_id"]),
|
||||
ocr_job_ids=_evidence_values(evidence, ["ocr_job_id", "ocr_run_id"]),
|
||||
agent_run_ids=_unique(
|
||||
[
|
||||
str(observation.get("run_id") or "").strip(),
|
||||
str(decision_trace.get("agent_run_id") or "").strip(),
|
||||
]
|
||||
),
|
||||
tool_call_ids=_evidence_values(evidence, ["tool_call_id"]),
|
||||
rule_versions=_unique(
|
||||
[
|
||||
*_evidence_values(evidence, ["rule_version"]),
|
||||
str(decision_trace.get("rule_version") or "").strip(),
|
||||
]
|
||||
),
|
||||
ontology_version=str(ontology_json.get("ontology_version") or "").strip(),
|
||||
algorithm_version=str(observation.get("algorithm_version") or "").strip(),
|
||||
source_event_ids=_unique(source_event_ids or []),
|
||||
quality_gates=_quality_gates(decision_trace),
|
||||
)
|
||||
|
||||
|
||||
def _evidence_values(evidence: list[dict[str, Any]], keys: list[str]) -> list[str]:
|
||||
values: list[str] = []
|
||||
for item in evidence:
|
||||
metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
|
||||
for key in keys:
|
||||
value = str(item.get(key) or metadata.get(key) or "").strip()
|
||||
if value:
|
||||
values.append(value)
|
||||
return _unique(values)
|
||||
|
||||
|
||||
def _quality_gates(decision_trace: dict[str, Any]) -> list[str]:
|
||||
gates = [
|
||||
str(decision_trace.get("evidence_source_gate") or "").strip(),
|
||||
str(decision_trace.get("data_quality_gate") or "").strip(),
|
||||
]
|
||||
sampling = decision_trace.get("sampling_strategy")
|
||||
if isinstance(sampling, dict):
|
||||
gates.append(str(sampling.get("strategy") or "").strip())
|
||||
return _unique([item for item in gates if item and item != "passed"])
|
||||
|
||||
|
||||
def _unique(values: list[str]) -> list[str]:
|
||||
return list(dict.fromkeys(str(item).strip() for item in values if str(item).strip()))
|
||||
365
server/src/app/algorithem/risk_graph/models.py
Normal file
365
server/src/app/algorithem/risk_graph/models.py
Normal file
@@ -0,0 +1,365 @@
|
||||
"""Data contracts for the financial risk graph algorithm."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
ALGORITHM_VERSION = "financial_risk_graph.v1"
|
||||
|
||||
LEVEL_LOW = "low"
|
||||
LEVEL_MEDIUM = "medium"
|
||||
LEVEL_HIGH = "high"
|
||||
LEVEL_CRITICAL = "critical"
|
||||
|
||||
AUTOMATION_ASSIST = "assist"
|
||||
AUTOMATION_MANUAL_REVIEW = "manual_review"
|
||||
AUTOMATION_SEMI_AUTO_REVIEW = "semi_auto_review"
|
||||
AUTOMATION_AUTO_HOLD = "auto_hold"
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphClaimItemSnapshot:
|
||||
item_id: str = ""
|
||||
item_type: str = ""
|
||||
item_amount: Any = Decimal("0")
|
||||
item_location: str = ""
|
||||
item_date: date | None = None
|
||||
invoice_id: str | None = None
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
@classmethod
|
||||
def from_orm(cls, item: Any) -> "RiskGraphClaimItemSnapshot":
|
||||
return cls(
|
||||
item_id=str(getattr(item, "id", "") or ""),
|
||||
item_type=str(getattr(item, "item_type", "") or ""),
|
||||
item_amount=getattr(item, "item_amount", Decimal("0")) or Decimal("0"),
|
||||
item_location=str(getattr(item, "item_location", "") or ""),
|
||||
item_date=getattr(item, "item_date", None),
|
||||
invoice_id=(
|
||||
str(getattr(item, "invoice_id", "") or "").strip()
|
||||
or None
|
||||
),
|
||||
metadata=_metadata_from_object(item),
|
||||
)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphClaimSnapshot:
|
||||
claim_id: str
|
||||
claim_no: str = ""
|
||||
employee_id: str | None = None
|
||||
employee_name: str = ""
|
||||
department_id: str | None = None
|
||||
department_name: str = ""
|
||||
employee_grade: str | None = None
|
||||
expense_type: str = ""
|
||||
amount: Any = Decimal("0")
|
||||
currency: str = "CNY"
|
||||
invoice_count: int = 0
|
||||
occurred_at: datetime | None = None
|
||||
submitted_at: datetime | None = None
|
||||
status: str = ""
|
||||
reason: str = ""
|
||||
location: str = ""
|
||||
risk_flags: list[Any] = field(default_factory=list)
|
||||
items: list[RiskGraphClaimItemSnapshot] = field(default_factory=list)
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
@classmethod
|
||||
def from_orm(cls, claim: Any) -> "RiskGraphClaimSnapshot":
|
||||
items = [
|
||||
RiskGraphClaimItemSnapshot.from_orm(item)
|
||||
for item in list(getattr(claim, "items", None) or [])
|
||||
]
|
||||
return cls(
|
||||
claim_id=str(getattr(claim, "id", "") or ""),
|
||||
claim_no=str(getattr(claim, "claim_no", "") or ""),
|
||||
employee_id=(
|
||||
str(getattr(claim, "employee_id", "") or "").strip()
|
||||
or None
|
||||
),
|
||||
employee_name=str(getattr(claim, "employee_name", "") or ""),
|
||||
department_id=(
|
||||
str(getattr(claim, "department_id", "") or "").strip()
|
||||
or None
|
||||
),
|
||||
department_name=str(getattr(claim, "department_name", "") or ""),
|
||||
employee_grade=(
|
||||
str(getattr(claim, "employee_grade", "") or "").strip()
|
||||
or None
|
||||
),
|
||||
expense_type=str(getattr(claim, "expense_type", "") or ""),
|
||||
amount=getattr(claim, "amount", Decimal("0")) or Decimal("0"),
|
||||
currency=str(getattr(claim, "currency", "CNY") or "CNY"),
|
||||
invoice_count=int(getattr(claim, "invoice_count", 0) or 0),
|
||||
occurred_at=getattr(claim, "occurred_at", None),
|
||||
submitted_at=getattr(claim, "submitted_at", None),
|
||||
status=str(getattr(claim, "status", "") or ""),
|
||||
reason=str(getattr(claim, "reason", "") or ""),
|
||||
location=str(getattr(claim, "location", "") or ""),
|
||||
risk_flags=list(getattr(claim, "risk_flags_json", None) or []),
|
||||
items=items,
|
||||
metadata=_metadata_from_object(claim),
|
||||
)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphNode:
|
||||
key: str
|
||||
node_type: str
|
||||
label: str
|
||||
canonical_key: str = ""
|
||||
canonical_id: str = ""
|
||||
ontology_type: str = ""
|
||||
ontology_parse_id: str = ""
|
||||
ontology_version: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"key": self.key,
|
||||
"node_type": self.node_type,
|
||||
"label": self.label,
|
||||
"canonical_key": self.canonical_key or self.key,
|
||||
"canonical_id": self.canonical_id or self.canonical_key or self.key,
|
||||
"ontology_type": self.ontology_type or self.node_type,
|
||||
"ontology_parse_id": self.ontology_parse_id,
|
||||
"ontology_version": self.ontology_version,
|
||||
"metadata": _json_safe(self.metadata),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphEdge:
|
||||
source_key: str
|
||||
target_key: str
|
||||
edge_type: str
|
||||
weight: Decimal = Decimal("1")
|
||||
source: str = "algorithm"
|
||||
evidence: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def edge_key(self) -> tuple[str, str, str]:
|
||||
return (self.source_key, self.target_key, self.edge_type)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"source_key": self.source_key,
|
||||
"target_key": self.target_key,
|
||||
"edge_type": self.edge_type,
|
||||
"weight": _format_decimal(self.weight),
|
||||
"source": self.source,
|
||||
"evidence": self.evidence,
|
||||
"metadata": _json_safe(self.metadata),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PeerBaseline:
|
||||
scope: str
|
||||
sample_size: int
|
||||
median_amount: Decimal = Decimal("0")
|
||||
p75_amount: Decimal = Decimal("0")
|
||||
p90_amount: Decimal = Decimal("0")
|
||||
mean_amount: Decimal = Decimal("0")
|
||||
fallback_reason: str = ""
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"scope": self.scope,
|
||||
"sample_size": self.sample_size,
|
||||
"median_amount": _format_decimal(self.median_amount),
|
||||
"p75_amount": _format_decimal(self.p75_amount),
|
||||
"p90_amount": _format_decimal(self.p90_amount),
|
||||
"mean_amount": _format_decimal(self.mean_amount),
|
||||
"fallback_reason": self.fallback_reason,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskEvidence:
|
||||
code: str
|
||||
title: str
|
||||
detail: str
|
||||
source: str
|
||||
score: int = 0
|
||||
related_entity_keys: list[str] = field(default_factory=list)
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"code": self.code,
|
||||
"title": self.title,
|
||||
"detail": self.detail,
|
||||
"source": self.source,
|
||||
"score": int(self.score),
|
||||
"related_entity_keys": list(self.related_entity_keys),
|
||||
"metadata": _json_safe(self.metadata),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskHistoryStats:
|
||||
risk_signal: str
|
||||
expense_type: str = ""
|
||||
similar_case_count: int = 0
|
||||
confirmed_count: int = 0
|
||||
false_positive_count: int = 0
|
||||
returned_count: int = 0
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"risk_signal": self.risk_signal,
|
||||
"expense_type": self.expense_type,
|
||||
"similar_case_count": self.similar_case_count,
|
||||
"confirmed_count": self.confirmed_count,
|
||||
"false_positive_count": self.false_positive_count,
|
||||
"returned_count": self.returned_count,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphEvaluationContext:
|
||||
claims: list[RiskGraphClaimSnapshot]
|
||||
target_claim_ids: set[str] | None = None
|
||||
ontology_parse: Any | None = None
|
||||
ontology_parse_id: str = ""
|
||||
ontology_version: str = "ontology.v1"
|
||||
history_stats: list[RiskHistoryStats] = field(default_factory=list)
|
||||
min_peer_sample_size: int = 3
|
||||
observation_threshold: int = 31
|
||||
near_threshold_amount: Decimal = Decimal("5000")
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskObservationDraft:
|
||||
observation_key: str
|
||||
subject_type: str
|
||||
subject_key: str
|
||||
subject_label: str
|
||||
claim_id: str
|
||||
claim_no: str
|
||||
risk_type: str
|
||||
risk_signal: str
|
||||
title: str
|
||||
description: str
|
||||
risk_score: int
|
||||
risk_level: str
|
||||
confidence_score: Decimal
|
||||
control_stage: str
|
||||
control_mode: str
|
||||
automation_mode: str
|
||||
source: str
|
||||
algorithm_version: str
|
||||
contribution_scores: dict[str, int]
|
||||
baseline: PeerBaseline
|
||||
evidence: list[RiskEvidence] = field(default_factory=list)
|
||||
graph_node_keys: list[str] = field(default_factory=list)
|
||||
graph_edge_keys: list[dict[str, str]] = field(default_factory=list)
|
||||
policy_refs: list[str] = field(default_factory=list)
|
||||
similar_case_claim_ids: list[str] = field(default_factory=list)
|
||||
ontology_json: dict[str, Any] = field(default_factory=dict)
|
||||
decision_trace: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"observation_key": self.observation_key,
|
||||
"subject_type": self.subject_type,
|
||||
"subject_key": self.subject_key,
|
||||
"subject_label": self.subject_label,
|
||||
"claim_id": self.claim_id,
|
||||
"claim_no": self.claim_no,
|
||||
"risk_type": self.risk_type,
|
||||
"risk_signal": self.risk_signal,
|
||||
"title": self.title,
|
||||
"description": self.description,
|
||||
"risk_score": self.risk_score,
|
||||
"risk_level": self.risk_level,
|
||||
"confidence_score": _format_decimal(self.confidence_score),
|
||||
"control_stage": self.control_stage,
|
||||
"control_mode": self.control_mode,
|
||||
"automation_mode": self.automation_mode,
|
||||
"source": self.source,
|
||||
"algorithm_version": self.algorithm_version,
|
||||
"contribution_scores": dict(self.contribution_scores),
|
||||
"baseline": self.baseline.as_dict(),
|
||||
"evidence": [item.as_dict() for item in self.evidence],
|
||||
"graph_node_keys": list(self.graph_node_keys),
|
||||
"graph_edge_keys": list(self.graph_edge_keys),
|
||||
"policy_refs": list(self.policy_refs),
|
||||
"similar_case_claim_ids": list(self.similar_case_claim_ids),
|
||||
"ontology_json": _json_safe(self.ontology_json),
|
||||
"decision_trace": _json_safe(self.decision_trace),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskGraphEvaluationResult:
|
||||
observations: list[RiskObservationDraft]
|
||||
nodes: list[RiskGraphNode]
|
||||
edges: list[RiskGraphEdge]
|
||||
algorithm_version: str = ALGORITHM_VERSION
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"algorithm_version": self.algorithm_version,
|
||||
"observations": [item.as_dict() for item in self.observations],
|
||||
"nodes": [item.as_dict() for item in self.nodes],
|
||||
"edges": [item.as_dict() for item in self.edges],
|
||||
"summary": {
|
||||
"observation_count": len(self.observations),
|
||||
"node_count": len(self.nodes),
|
||||
"edge_count": len(self.edges),
|
||||
"high_or_above_count": sum(
|
||||
1
|
||||
for item in self.observations
|
||||
if item.risk_level in {LEVEL_HIGH, LEVEL_CRITICAL}
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _format_decimal(value: Any, places: str = "0.0000") -> str:
|
||||
if value is None:
|
||||
return "0"
|
||||
if not isinstance(value, Decimal):
|
||||
value = Decimal(str(value or "0"))
|
||||
return format(value.quantize(Decimal(places)), "f").rstrip("0").rstrip(".") or "0"
|
||||
|
||||
|
||||
def _json_safe(value: Any) -> Any:
|
||||
if isinstance(value, Decimal):
|
||||
return _format_decimal(value)
|
||||
if isinstance(value, (datetime, date)):
|
||||
return value.isoformat()
|
||||
if isinstance(value, list):
|
||||
return [_json_safe(item) for item in value]
|
||||
if isinstance(value, tuple):
|
||||
return [_json_safe(item) for item in value]
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _json_safe(item) for key, item in value.items()}
|
||||
return value
|
||||
|
||||
|
||||
def _metadata_from_object(source: Any) -> dict[str, Any]:
|
||||
metadata: dict[str, Any] = {}
|
||||
for attr in (
|
||||
"metadata",
|
||||
"metadata_json",
|
||||
"extra_json",
|
||||
"supplier_id",
|
||||
"supplier_name",
|
||||
"vendor_id",
|
||||
"vendor_name",
|
||||
"merchant_id",
|
||||
"merchant_name",
|
||||
):
|
||||
value = getattr(source, attr, None)
|
||||
if isinstance(value, dict):
|
||||
metadata.update(value)
|
||||
elif attr != "metadata" and value not in (None, ""):
|
||||
metadata[attr] = value
|
||||
return metadata
|
||||
270
server/src/app/algorithem/risk_graph/ontology.py
Normal file
270
server/src/app/algorithem/risk_graph/ontology.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""Ontology-to-risk-graph mapping utilities."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskGraphEdge, RiskGraphNode
|
||||
from .signals import NormalizedRiskSignal, normalize_risk_signals
|
||||
|
||||
ONTOLOGY_NODE_TYPE_MAP = {
|
||||
"expense_type": "expense_type",
|
||||
"document_type": "document",
|
||||
"employee": "employee",
|
||||
"department": "department",
|
||||
"vendor": "vendor",
|
||||
"supplier": "vendor",
|
||||
"merchant": "vendor",
|
||||
"customer": "customer",
|
||||
"risk_signal": "risk_signal",
|
||||
"invoice": "invoice",
|
||||
"claim": "claim",
|
||||
}
|
||||
|
||||
ALLOWED_ONTOLOGY_EDGE_TYPES = {
|
||||
"ontology_extracts",
|
||||
"ontology_constrains",
|
||||
"ontology_signals",
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class OntologyRiskGraphMapping:
|
||||
ontology_parse_id: str
|
||||
ontology_version: str
|
||||
domain: str
|
||||
scenario: str
|
||||
intent: str
|
||||
confidence: Decimal
|
||||
gate: str
|
||||
nodes: list[RiskGraphNode] = field(default_factory=list)
|
||||
edges: list[RiskGraphEdge] = field(default_factory=list)
|
||||
risk_signals: list[NormalizedRiskSignal] = field(default_factory=list)
|
||||
canonical_subject_key: str = ""
|
||||
raw_payload: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"ontology_parse_id": self.ontology_parse_id,
|
||||
"ontology_version": self.ontology_version,
|
||||
"domain": self.domain,
|
||||
"scenario": self.scenario,
|
||||
"intent": self.intent,
|
||||
"confidence": str(self.confidence),
|
||||
"gate": self.gate,
|
||||
"canonical_subject_key": self.canonical_subject_key,
|
||||
"risk_signals": [item.as_dict() for item in self.risk_signals],
|
||||
}
|
||||
|
||||
|
||||
def map_ontology_to_risk_graph(
|
||||
ontology: Any,
|
||||
*,
|
||||
ontology_parse_id: str = "",
|
||||
ontology_version: str = "ontology.v1",
|
||||
) -> OntologyRiskGraphMapping:
|
||||
payload = _model_to_dict(ontology)
|
||||
if not payload:
|
||||
return OntologyRiskGraphMapping(
|
||||
ontology_parse_id=ontology_parse_id,
|
||||
ontology_version=ontology_version,
|
||||
domain="unknown",
|
||||
scenario="unknown",
|
||||
intent="query",
|
||||
confidence=Decimal("0"),
|
||||
gate="candidate_only",
|
||||
)
|
||||
|
||||
parse_id = str(
|
||||
ontology_parse_id
|
||||
or payload.get("ontology_parse_id")
|
||||
or payload.get("parse_id")
|
||||
or payload.get("run_id")
|
||||
or "ontology_parse"
|
||||
)
|
||||
scenario = str(payload.get("scenario") or "unknown")
|
||||
intent = str(payload.get("intent") or "query")
|
||||
domain = str(payload.get("domain") or scenario)
|
||||
confidence = _to_decimal(payload.get("confidence"))
|
||||
gate = _gate_from_confidence(confidence)
|
||||
|
||||
nodes: list[RiskGraphNode] = [
|
||||
RiskGraphNode(
|
||||
key=f"ontology:{parse_id}",
|
||||
node_type="ontology_parse",
|
||||
label=parse_id,
|
||||
canonical_key=f"ontology:{parse_id}",
|
||||
canonical_id=parse_id,
|
||||
ontology_type="ontology_parse",
|
||||
ontology_parse_id=parse_id,
|
||||
ontology_version=ontology_version,
|
||||
metadata={
|
||||
"scenario": scenario,
|
||||
"intent": intent,
|
||||
"domain": domain,
|
||||
"confidence": str(confidence),
|
||||
},
|
||||
)
|
||||
]
|
||||
edges: list[RiskGraphEdge] = []
|
||||
canonical_subject_key = ""
|
||||
|
||||
for entity in list(payload.get("entities") or []):
|
||||
entity_payload = _model_to_dict(entity)
|
||||
raw_type = str(entity_payload.get("type") or "").strip().lower()
|
||||
node_type = ONTOLOGY_NODE_TYPE_MAP.get(raw_type, raw_type or "entity")
|
||||
value = str(
|
||||
entity_payload.get("normalized_value")
|
||||
or entity_payload.get("value")
|
||||
or ""
|
||||
).strip()
|
||||
if not value:
|
||||
continue
|
||||
key = f"{node_type}:{_canonical_key(value)}"
|
||||
nodes.append(
|
||||
RiskGraphNode(
|
||||
key=key,
|
||||
node_type=node_type,
|
||||
label=value,
|
||||
canonical_key=key,
|
||||
canonical_id=_canonical_key(value),
|
||||
ontology_type=raw_type or node_type,
|
||||
ontology_parse_id=parse_id,
|
||||
ontology_version=ontology_version,
|
||||
metadata={
|
||||
"role": entity_payload.get("role") or "target",
|
||||
"confidence": entity_payload.get("confidence") or 0,
|
||||
},
|
||||
)
|
||||
)
|
||||
edges.append(
|
||||
RiskGraphEdge(
|
||||
source_key=f"ontology:{parse_id}",
|
||||
target_key=key,
|
||||
edge_type="ontology_extracts",
|
||||
source="ontology",
|
||||
metadata={"raw_type": raw_type},
|
||||
)
|
||||
)
|
||||
if not canonical_subject_key and node_type in {"employee", "claim", "vendor"}:
|
||||
canonical_subject_key = key
|
||||
|
||||
for constraint in list(payload.get("constraints") or []):
|
||||
constraint_payload = _model_to_dict(constraint)
|
||||
field = str(constraint_payload.get("field") or "").strip()
|
||||
operator = str(constraint_payload.get("operator") or "").strip()
|
||||
value = str(constraint_payload.get("value") or "").strip()
|
||||
if not field or not value:
|
||||
continue
|
||||
key = f"constraint:{_canonical_key(field)}:{_canonical_key(value)}"
|
||||
nodes.append(
|
||||
RiskGraphNode(
|
||||
key=key,
|
||||
node_type="constraint",
|
||||
label=f"{field} {operator} {value}".strip(),
|
||||
canonical_key=key,
|
||||
canonical_id=key,
|
||||
ontology_type="constraint",
|
||||
ontology_parse_id=parse_id,
|
||||
ontology_version=ontology_version,
|
||||
metadata=constraint_payload,
|
||||
)
|
||||
)
|
||||
edges.append(
|
||||
RiskGraphEdge(
|
||||
source_key=f"ontology:{parse_id}",
|
||||
target_key=key,
|
||||
edge_type="ontology_constrains",
|
||||
source="ontology",
|
||||
)
|
||||
)
|
||||
|
||||
risk_signals = normalize_risk_signals(list(payload.get("risk_flags") or []), source="ontology")
|
||||
for signal in risk_signals:
|
||||
key = f"risk_signal:{signal.code}"
|
||||
nodes.append(
|
||||
RiskGraphNode(
|
||||
key=key,
|
||||
node_type="risk_signal",
|
||||
label=signal.label,
|
||||
canonical_key=key,
|
||||
canonical_id=signal.code,
|
||||
ontology_type="risk_signal",
|
||||
ontology_parse_id=parse_id,
|
||||
ontology_version=ontology_version,
|
||||
metadata={"severity": signal.severity, "score": signal.score},
|
||||
)
|
||||
)
|
||||
edges.append(
|
||||
RiskGraphEdge(
|
||||
source_key=f"ontology:{parse_id}",
|
||||
target_key=key,
|
||||
edge_type="ontology_signals",
|
||||
source="ontology",
|
||||
metadata={"gate": gate},
|
||||
)
|
||||
)
|
||||
|
||||
return OntologyRiskGraphMapping(
|
||||
ontology_parse_id=parse_id,
|
||||
ontology_version=ontology_version,
|
||||
domain=domain,
|
||||
scenario=scenario,
|
||||
intent=intent,
|
||||
confidence=confidence,
|
||||
gate=gate,
|
||||
nodes=_dedupe_nodes(nodes),
|
||||
edges=_dedupe_edges(edges),
|
||||
risk_signals=risk_signals,
|
||||
canonical_subject_key=canonical_subject_key,
|
||||
raw_payload=payload,
|
||||
)
|
||||
|
||||
|
||||
def _model_to_dict(value: Any) -> dict[str, Any]:
|
||||
if value is None:
|
||||
return {}
|
||||
if isinstance(value, dict):
|
||||
return dict(value)
|
||||
if hasattr(value, "model_dump"):
|
||||
return dict(value.model_dump(mode="json"))
|
||||
if hasattr(value, "dict"):
|
||||
return dict(value.dict())
|
||||
return {}
|
||||
|
||||
|
||||
def _gate_from_confidence(confidence: Decimal) -> str:
|
||||
if confidence >= Decimal("0.78"):
|
||||
return "automatic"
|
||||
if confidence >= Decimal("0.55"):
|
||||
return "review"
|
||||
return "candidate_only"
|
||||
|
||||
|
||||
def _canonical_key(value: str) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value or "0"))
|
||||
except Exception:
|
||||
return Decimal("0")
|
||||
|
||||
|
||||
def _dedupe_nodes(nodes: list[RiskGraphNode]) -> list[RiskGraphNode]:
|
||||
by_key: dict[str, RiskGraphNode] = {}
|
||||
for node in nodes:
|
||||
by_key.setdefault(node.key, node)
|
||||
return list(by_key.values())
|
||||
|
||||
|
||||
def _dedupe_edges(edges: list[RiskGraphEdge]) -> list[RiskGraphEdge]:
|
||||
by_key: dict[tuple[str, str, str], RiskGraphEdge] = {}
|
||||
for edge in edges:
|
||||
if edge.edge_type not in ALLOWED_ONTOLOGY_EDGE_TYPES:
|
||||
continue
|
||||
by_key.setdefault(edge.edge_key(), edge)
|
||||
return list(by_key.values())
|
||||
@@ -0,0 +1,86 @@
|
||||
"""Output contract for finance policy knowledge organizing tasks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class PolicySourceRef:
|
||||
source_id: str
|
||||
title: str
|
||||
location: str = ""
|
||||
page: str = ""
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"source_id": self.source_id,
|
||||
"title": self.title,
|
||||
"location": self.location,
|
||||
"page": self.page,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class PolicyKnowledgeItem:
|
||||
policy_ref: str
|
||||
title: str
|
||||
summary: str
|
||||
expense_type: str = ""
|
||||
control_stage: str = ""
|
||||
trigger_conditions: list[str] = field(default_factory=list)
|
||||
source_refs: list[PolicySourceRef] = field(default_factory=list)
|
||||
review_status: str = "pending_review"
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"policy_ref": self.policy_ref,
|
||||
"title": self.title,
|
||||
"summary": self.summary,
|
||||
"expense_type": self.expense_type,
|
||||
"control_stage": self.control_stage,
|
||||
"trigger_conditions": list(self.trigger_conditions),
|
||||
"source_refs": [item.as_dict() for item in self.source_refs],
|
||||
"review_status": self.review_status,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PolicyKnowledgeOrganizingReport:
|
||||
summary: str
|
||||
categories: list[str] = field(default_factory=list)
|
||||
knowledge_items: list[PolicyKnowledgeItem] = field(default_factory=list)
|
||||
source_refs: list[PolicySourceRef] = field(default_factory=list)
|
||||
open_questions: list[str] = field(default_factory=list)
|
||||
next_actions: list[str] = field(default_factory=list)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"summary": self.summary,
|
||||
"categories": list(self.categories),
|
||||
"knowledge_items": [item.as_dict() for item in self.knowledge_items],
|
||||
"source_refs": [item.as_dict() for item in self.source_refs],
|
||||
"open_questions": list(self.open_questions),
|
||||
"next_actions": list(self.next_actions),
|
||||
"risk_policy_refs": self.risk_policy_refs(),
|
||||
}
|
||||
|
||||
def risk_policy_refs(self) -> list[str]:
|
||||
return list(
|
||||
dict.fromkeys(
|
||||
item.policy_ref
|
||||
for item in self.knowledge_items
|
||||
if item.policy_ref and item.review_status in {"pending_review", "confirmed"}
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def build_policy_ref(expense_type: str, signal: str, *, prefix: str = "policy") -> str:
|
||||
expense = _token(expense_type) or "general"
|
||||
risk_signal = _token(signal) or "control"
|
||||
return f"{prefix}.{expense}.{risk_signal}"
|
||||
|
||||
|
||||
def _token(value: str) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
325
server/src/app/algorithem/risk_graph/process_mining.py
Normal file
325
server/src/app/algorithem/risk_graph/process_mining.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""Object-centric process mining for financial risk events."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskGraphClaimSnapshot
|
||||
|
||||
APPROVAL_EVENTS = {"approval_approved", "finance_approved", "claim_approved"}
|
||||
PAYMENT_EVENTS = {"payment_requested", "payment_completed"}
|
||||
RETURN_EVENTS = {"claim_returned", "approval_returned", "supplement_required"}
|
||||
SUBMIT_EVENTS = {"claim_submitted", "application_submitted"}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ObjectCentricEvent:
|
||||
event_id: str
|
||||
event_type: str
|
||||
occurred_at: datetime
|
||||
object_refs: dict[str, list[str]]
|
||||
actor: str = ""
|
||||
source: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"event_id": self.event_id,
|
||||
"event_type": self.event_type,
|
||||
"occurred_at": self.occurred_at.isoformat(),
|
||||
"object_refs": {key: list(value) for key, value in self.object_refs.items()},
|
||||
"actor": self.actor,
|
||||
"source": self.source,
|
||||
"metadata": dict(self.metadata),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ConformanceRisk:
|
||||
risk_code: str
|
||||
title: str
|
||||
detail: str
|
||||
severity: str
|
||||
related_event_ids: list[str] = field(default_factory=list)
|
||||
object_refs: dict[str, list[str]] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"risk_code": self.risk_code,
|
||||
"title": self.title,
|
||||
"detail": self.detail,
|
||||
"severity": self.severity,
|
||||
"related_event_ids": list(self.related_event_ids),
|
||||
"object_refs": {key: list(value) for key, value in self.object_refs.items()},
|
||||
}
|
||||
|
||||
|
||||
class ObjectCentricProcessMiner:
|
||||
def build_from_claims(
|
||||
self,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> list[ObjectCentricEvent]:
|
||||
events: list[ObjectCentricEvent] = []
|
||||
for claim in claims:
|
||||
events.extend(self._claim_events(claim))
|
||||
return sorted(events, key=lambda item: (item.occurred_at, item.event_id))
|
||||
|
||||
def build_from_dicts(self, rows: list[dict[str, Any]]) -> list[ObjectCentricEvent]:
|
||||
events: list[ObjectCentricEvent] = []
|
||||
for index, row in enumerate(rows):
|
||||
occurred_at = _datetime_from_value(row.get("occurred_at"))
|
||||
if occurred_at is None:
|
||||
continue
|
||||
event_type = str(row.get("event_type") or "").strip()
|
||||
if not event_type:
|
||||
continue
|
||||
events.append(
|
||||
ObjectCentricEvent(
|
||||
event_id=str(row.get("event_id") or f"event:{index}:{event_type}"),
|
||||
event_type=event_type,
|
||||
occurred_at=occurred_at,
|
||||
object_refs=_normalize_object_refs(row.get("object_refs")),
|
||||
actor=str(row.get("actor") or "").strip(),
|
||||
source=str(row.get("source") or "").strip(),
|
||||
metadata=dict(row.get("metadata") or {}),
|
||||
)
|
||||
)
|
||||
return sorted(events, key=lambda item: (item.occurred_at, item.event_id))
|
||||
|
||||
def _claim_events(self, claim: RiskGraphClaimSnapshot) -> list[ObjectCentricEvent]:
|
||||
object_refs = _claim_object_refs(claim)
|
||||
events: list[ObjectCentricEvent] = []
|
||||
occurred_at = claim.occurred_at or claim.submitted_at
|
||||
if occurred_at:
|
||||
events.append(
|
||||
ObjectCentricEvent(
|
||||
event_id=f"{claim.claim_id}:expense_occurred",
|
||||
event_type="expense_occurred",
|
||||
occurred_at=occurred_at,
|
||||
object_refs=object_refs,
|
||||
actor=claim.employee_id or claim.employee_name,
|
||||
source="expense_claim",
|
||||
metadata={"amount": str(claim.amount), "expense_type": claim.expense_type},
|
||||
)
|
||||
)
|
||||
if claim.submitted_at:
|
||||
events.append(
|
||||
ObjectCentricEvent(
|
||||
event_id=f"{claim.claim_id}:claim_submitted",
|
||||
event_type="claim_submitted",
|
||||
occurred_at=claim.submitted_at,
|
||||
object_refs=object_refs,
|
||||
actor=claim.employee_id or claim.employee_name,
|
||||
source="expense_claim",
|
||||
metadata={"status": claim.status},
|
||||
)
|
||||
)
|
||||
for item in claim.items:
|
||||
item_time = _datetime_from_value(item.item_date) or occurred_at or datetime.now(UTC)
|
||||
item_refs = _merge_object_refs(
|
||||
object_refs,
|
||||
{
|
||||
"claim_item": [item.item_id] if item.item_id else [],
|
||||
"invoice": [item.invoice_id] if item.invoice_id else [],
|
||||
},
|
||||
)
|
||||
events.append(
|
||||
ObjectCentricEvent(
|
||||
event_id=f"{claim.claim_id}:item:{item.item_id or len(events)}",
|
||||
event_type="expense_item_recorded",
|
||||
occurred_at=item_time,
|
||||
object_refs=item_refs,
|
||||
actor=claim.employee_id or claim.employee_name,
|
||||
source="expense_item",
|
||||
metadata={
|
||||
"amount": str(item.item_amount),
|
||||
"item_type": item.item_type,
|
||||
"item_location": item.item_location,
|
||||
},
|
||||
)
|
||||
)
|
||||
if item.invoice_id:
|
||||
events.append(
|
||||
ObjectCentricEvent(
|
||||
event_id=f"{claim.claim_id}:invoice:{item.invoice_id}",
|
||||
event_type="invoice_attached",
|
||||
occurred_at=item_time,
|
||||
object_refs=item_refs,
|
||||
actor=claim.employee_id or claim.employee_name,
|
||||
source="invoice",
|
||||
)
|
||||
)
|
||||
for index, flag in enumerate(claim.risk_flags):
|
||||
signal = _risk_signal_from_flag(flag)
|
||||
if not signal:
|
||||
continue
|
||||
events.append(
|
||||
ObjectCentricEvent(
|
||||
event_id=f"{claim.claim_id}:risk_flag:{index}:{signal}",
|
||||
event_type="risk_flagged",
|
||||
occurred_at=claim.submitted_at or occurred_at or datetime.now(UTC),
|
||||
object_refs=object_refs,
|
||||
source="risk_rule",
|
||||
metadata={"risk_signal": signal, "raw": flag},
|
||||
)
|
||||
)
|
||||
return events
|
||||
|
||||
|
||||
class ConformanceRiskDetector:
|
||||
def detect(self, events: list[ObjectCentricEvent]) -> list[ConformanceRisk]:
|
||||
risks: list[ConformanceRisk] = []
|
||||
for claim_key, claim_events in _events_by_object(events, "claim").items():
|
||||
ordered = sorted(claim_events, key=lambda item: (item.occurred_at, item.event_id))
|
||||
risks.extend(self._detect_claim_risks(claim_key, ordered))
|
||||
return risks
|
||||
|
||||
def _detect_claim_risks(
|
||||
self,
|
||||
claim_key: str,
|
||||
events: list[ObjectCentricEvent],
|
||||
) -> list[ConformanceRisk]:
|
||||
risks: list[ConformanceRisk] = []
|
||||
event_types = [event.event_type for event in events]
|
||||
first_submit = _first_event(events, SUBMIT_EVENTS)
|
||||
first_approval = _first_event(events, APPROVAL_EVENTS)
|
||||
first_payment = _first_event(events, PAYMENT_EVENTS)
|
||||
|
||||
if first_payment and (not first_approval or first_payment.occurred_at < first_approval.occurred_at):
|
||||
related = [first_payment.event_id]
|
||||
if first_approval:
|
||||
related.append(first_approval.event_id)
|
||||
risks.append(
|
||||
ConformanceRisk(
|
||||
risk_code="payment_before_approval",
|
||||
title="Payment before approval",
|
||||
detail="Payment event appears before an approval event.",
|
||||
severity="critical",
|
||||
related_event_ids=related,
|
||||
object_refs={"claim": [claim_key]},
|
||||
)
|
||||
)
|
||||
if first_approval and (not first_submit or first_approval.occurred_at < first_submit.occurred_at):
|
||||
related = [first_approval.event_id]
|
||||
if first_submit:
|
||||
related.append(first_submit.event_id)
|
||||
risks.append(
|
||||
ConformanceRisk(
|
||||
risk_code="approval_bypass",
|
||||
title="Approval bypass",
|
||||
detail="Approval appears before submission or without submission.",
|
||||
severity="high",
|
||||
related_event_ids=related,
|
||||
object_refs={"claim": [claim_key]},
|
||||
)
|
||||
)
|
||||
return_count = sum(1 for event_type in event_types if event_type in RETURN_EVENTS)
|
||||
submit_count = sum(1 for event_type in event_types if event_type in SUBMIT_EVENTS)
|
||||
if return_count >= 2 or (return_count >= 1 and submit_count >= 2):
|
||||
risks.append(
|
||||
ConformanceRisk(
|
||||
risk_code="rework_loop",
|
||||
title="Rework loop",
|
||||
detail="Claim has repeated return and resubmission events.",
|
||||
severity="medium",
|
||||
related_event_ids=[
|
||||
event.event_id
|
||||
for event in events
|
||||
if event.event_type in RETURN_EVENTS | SUBMIT_EVENTS
|
||||
],
|
||||
object_refs={"claim": [claim_key]},
|
||||
)
|
||||
)
|
||||
if "invoice_attached" in event_types and not first_submit:
|
||||
risks.append(
|
||||
ConformanceRisk(
|
||||
risk_code="process_bypass",
|
||||
title="Process bypass",
|
||||
detail="Invoice exists without a claim submission event.",
|
||||
severity="medium",
|
||||
related_event_ids=[
|
||||
event.event_id for event in events if event.event_type == "invoice_attached"
|
||||
],
|
||||
object_refs={"claim": [claim_key]},
|
||||
)
|
||||
)
|
||||
return risks
|
||||
|
||||
|
||||
def _claim_object_refs(claim: RiskGraphClaimSnapshot) -> dict[str, list[str]]:
|
||||
return {
|
||||
"claim": [claim.claim_id] if claim.claim_id else [],
|
||||
"employee": [claim.employee_id or claim.employee_name]
|
||||
if claim.employee_id or claim.employee_name
|
||||
else [],
|
||||
"department": [claim.department_id or claim.department_name]
|
||||
if claim.department_id or claim.department_name
|
||||
else [],
|
||||
"expense_type": [claim.expense_type] if claim.expense_type else [],
|
||||
}
|
||||
|
||||
|
||||
def _normalize_object_refs(value: Any) -> dict[str, list[str]]:
|
||||
if not isinstance(value, dict):
|
||||
return {}
|
||||
normalized: dict[str, list[str]] = {}
|
||||
for key, raw_items in value.items():
|
||||
if isinstance(raw_items, list):
|
||||
items = [str(item).strip() for item in raw_items if str(item).strip()]
|
||||
else:
|
||||
items = [str(raw_items).strip()] if str(raw_items or "").strip() else []
|
||||
normalized[str(key).strip()] = list(dict.fromkeys(items))
|
||||
return normalized
|
||||
|
||||
|
||||
def _merge_object_refs(*refs: dict[str, list[str]]) -> dict[str, list[str]]:
|
||||
merged: dict[str, list[str]] = {}
|
||||
for ref in refs:
|
||||
for key, values in ref.items():
|
||||
bucket = merged.setdefault(key, [])
|
||||
bucket.extend(str(value).strip() for value in values if str(value).strip())
|
||||
return {key: list(dict.fromkeys(values)) for key, values in merged.items()}
|
||||
|
||||
|
||||
def _events_by_object(
|
||||
events: list[ObjectCentricEvent],
|
||||
object_type: str,
|
||||
) -> dict[str, list[ObjectCentricEvent]]:
|
||||
grouped: dict[str, list[ObjectCentricEvent]] = {}
|
||||
for event in events:
|
||||
for object_key in event.object_refs.get(object_type, []):
|
||||
grouped.setdefault(object_key, []).append(event)
|
||||
return grouped
|
||||
|
||||
|
||||
def _first_event(
|
||||
events: list[ObjectCentricEvent],
|
||||
event_types: set[str],
|
||||
) -> ObjectCentricEvent | None:
|
||||
for event in events:
|
||||
if event.event_type in event_types:
|
||||
return event
|
||||
return None
|
||||
|
||||
|
||||
def _risk_signal_from_flag(flag: Any) -> str:
|
||||
if isinstance(flag, dict):
|
||||
raw = flag.get("risk_signal") or flag.get("signal") or flag.get("rule_code") or flag.get("code")
|
||||
else:
|
||||
raw = flag
|
||||
return "_".join(str(raw or "").strip().lower().split())
|
||||
|
||||
|
||||
def _datetime_from_value(value: Any) -> datetime | None:
|
||||
if isinstance(value, datetime):
|
||||
return value
|
||||
if hasattr(value, "year") and hasattr(value, "month") and hasattr(value, "day"):
|
||||
return datetime(value.year, value.month, value.day, tzinfo=UTC)
|
||||
if isinstance(value, str) and value.strip():
|
||||
try:
|
||||
return datetime.fromisoformat(value)
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
259
server/src/app/algorithem/risk_graph/profile_baselines.py
Normal file
259
server/src/app/algorithem/risk_graph/profile_baselines.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""Profile baseline contracts for digital employee scans."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from decimal import ROUND_CEILING, ROUND_FLOOR, Decimal
|
||||
from typing import Any
|
||||
|
||||
from .models import ALGORITHM_VERSION, RiskGraphClaimSnapshot
|
||||
|
||||
ZERO = Decimal("0")
|
||||
HUNDRED = Decimal("100")
|
||||
|
||||
BASELINE_ALGORITHM_VERSION = f"{ALGORITHM_VERSION}.profile_baselines.v1"
|
||||
BASELINE_DIMENSIONS = ("employee", "department", "supplier", "expense_type")
|
||||
SUPPLIER_ID_KEYS = ("supplier_id", "vendor_id", "merchant_id", "supplier_code")
|
||||
SUPPLIER_NAME_KEYS = ("supplier_name", "vendor_name", "merchant_name", "supplier", "vendor", "merchant")
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ProfileBaselineBucket:
|
||||
dimension: str
|
||||
key: str
|
||||
label: str
|
||||
sample_size: int
|
||||
claim_count: int
|
||||
total_amount: Decimal
|
||||
average_amount: Decimal
|
||||
median_amount: Decimal
|
||||
p75_amount: Decimal
|
||||
p90_amount: Decimal
|
||||
claim_ids: list[str] = field(default_factory=list)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"dimension": self.dimension,
|
||||
"key": self.key,
|
||||
"label": self.label,
|
||||
"sample_size": self.sample_size,
|
||||
"claim_count": self.claim_count,
|
||||
"total_amount": _format_decimal(self.total_amount),
|
||||
"average_amount": _format_decimal(self.average_amount),
|
||||
"median_amount": _format_decimal(self.median_amount),
|
||||
"p75_amount": _format_decimal(self.p75_amount),
|
||||
"p90_amount": _format_decimal(self.p90_amount),
|
||||
"claim_ids": list(self.claim_ids),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ProfileBaselineSnapshot:
|
||||
algorithm_version: str
|
||||
buckets: list[ProfileBaselineBucket] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def dimension_counts(self) -> dict[str, int]:
|
||||
counts = {dimension: 0 for dimension in BASELINE_DIMENSIONS}
|
||||
for bucket in self.buckets:
|
||||
counts[bucket.dimension] = counts.get(bucket.dimension, 0) + 1
|
||||
return counts
|
||||
|
||||
def buckets_for(self, dimension: str) -> list[ProfileBaselineBucket]:
|
||||
return [bucket for bucket in self.buckets if bucket.dimension == dimension]
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"algorithm_version": self.algorithm_version,
|
||||
"dimension_counts": self.dimension_counts,
|
||||
"bucket_count": len(self.buckets),
|
||||
"buckets": [bucket.as_dict() for bucket in self.buckets],
|
||||
}
|
||||
|
||||
|
||||
class ProfileBaselineUpdater:
|
||||
def build_from_claims(
|
||||
self,
|
||||
claims: list[RiskGraphClaimSnapshot],
|
||||
) -> ProfileBaselineSnapshot:
|
||||
grouped: dict[tuple[str, str], list[tuple[Decimal, str]]] = defaultdict(list)
|
||||
labels: dict[tuple[str, str], str] = {}
|
||||
|
||||
for claim in claims:
|
||||
self._add_claim_rows(grouped, labels, claim)
|
||||
|
||||
buckets = [
|
||||
_build_bucket(dimension, key, labels[(dimension, key)], rows)
|
||||
for (dimension, key), rows in grouped.items()
|
||||
]
|
||||
buckets.sort(key=lambda item: (item.dimension, -item.total_amount, item.key))
|
||||
return ProfileBaselineSnapshot(
|
||||
algorithm_version=BASELINE_ALGORITHM_VERSION,
|
||||
buckets=buckets,
|
||||
)
|
||||
|
||||
def _add_claim_rows(
|
||||
self,
|
||||
grouped: dict[tuple[str, str], list[tuple[Decimal, str]]],
|
||||
labels: dict[tuple[str, str], str],
|
||||
claim: RiskGraphClaimSnapshot,
|
||||
) -> None:
|
||||
amount = _to_decimal(claim.amount)
|
||||
claim_id = claim.claim_id or claim.claim_no
|
||||
_add_row(
|
||||
grouped,
|
||||
labels,
|
||||
"employee",
|
||||
claim.employee_id or claim.employee_name,
|
||||
claim.employee_name or claim.employee_id,
|
||||
amount,
|
||||
claim_id,
|
||||
)
|
||||
_add_row(
|
||||
grouped,
|
||||
labels,
|
||||
"department",
|
||||
claim.department_id or claim.department_name,
|
||||
claim.department_name or claim.department_id,
|
||||
amount,
|
||||
claim_id,
|
||||
)
|
||||
_add_row(
|
||||
grouped,
|
||||
labels,
|
||||
"expense_type",
|
||||
claim.expense_type,
|
||||
claim.expense_type,
|
||||
amount,
|
||||
claim_id,
|
||||
)
|
||||
for supplier_key, supplier_label, supplier_amount in _supplier_rows(claim):
|
||||
_add_row(
|
||||
grouped,
|
||||
labels,
|
||||
"supplier",
|
||||
supplier_key,
|
||||
supplier_label,
|
||||
supplier_amount,
|
||||
claim_id,
|
||||
)
|
||||
|
||||
|
||||
def _build_bucket(
|
||||
dimension: str,
|
||||
key: str,
|
||||
label: str,
|
||||
rows: list[tuple[Decimal, str]],
|
||||
) -> ProfileBaselineBucket:
|
||||
amounts = [amount for amount, _claim_id in rows]
|
||||
total = sum(amounts, ZERO)
|
||||
sample_size = len(amounts)
|
||||
claim_ids = sorted({claim_id for _amount, claim_id in rows if claim_id})
|
||||
average = total / Decimal(sample_size) if sample_size else ZERO
|
||||
return ProfileBaselineBucket(
|
||||
dimension=dimension,
|
||||
key=key,
|
||||
label=label,
|
||||
sample_size=sample_size,
|
||||
claim_count=len(claim_ids),
|
||||
total_amount=total,
|
||||
average_amount=average,
|
||||
median_amount=_percentile(amounts, 50),
|
||||
p75_amount=_percentile(amounts, 75),
|
||||
p90_amount=_percentile(amounts, 90),
|
||||
claim_ids=claim_ids,
|
||||
)
|
||||
|
||||
|
||||
def _add_row(
|
||||
grouped: dict[tuple[str, str], list[tuple[Decimal, str]]],
|
||||
labels: dict[tuple[str, str], str],
|
||||
dimension: str,
|
||||
key_source: Any,
|
||||
label_source: Any,
|
||||
amount: Decimal,
|
||||
claim_id: str,
|
||||
) -> None:
|
||||
key = _canonical_key(key_source)
|
||||
if not key:
|
||||
return
|
||||
group_key = (dimension, key)
|
||||
labels.setdefault(group_key, str(label_source or key_source or key).strip() or key)
|
||||
grouped[group_key].append((amount, claim_id))
|
||||
|
||||
|
||||
def _supplier_rows(claim: RiskGraphClaimSnapshot) -> list[tuple[str, str, Decimal]]:
|
||||
item_rows: list[tuple[str, str, Decimal]] = []
|
||||
for item in claim.items:
|
||||
supplier = _extract_supplier(item.metadata)
|
||||
if supplier is not None:
|
||||
item_rows.append((*supplier, _to_decimal(item.item_amount)))
|
||||
if item_rows:
|
||||
return item_rows
|
||||
|
||||
supplier = _extract_supplier(claim.metadata) or _extract_supplier_from_flags(claim.risk_flags)
|
||||
if supplier is None:
|
||||
return []
|
||||
return [(*supplier, _to_decimal(claim.amount))]
|
||||
|
||||
|
||||
def _extract_supplier(metadata: Any) -> tuple[str, str] | None:
|
||||
if not isinstance(metadata, dict):
|
||||
return None
|
||||
supplier_id = _first_text(metadata, SUPPLIER_ID_KEYS)
|
||||
supplier_name = _first_text(metadata, SUPPLIER_NAME_KEYS)
|
||||
key = supplier_id or supplier_name
|
||||
if not key:
|
||||
return None
|
||||
return key, supplier_name or supplier_id or key
|
||||
|
||||
|
||||
def _extract_supplier_from_flags(flags: list[Any]) -> tuple[str, str] | None:
|
||||
for flag in flags or []:
|
||||
if not isinstance(flag, dict):
|
||||
continue
|
||||
supplier = _extract_supplier(flag) or _extract_supplier(flag.get("metadata"))
|
||||
if supplier is not None:
|
||||
return supplier
|
||||
return None
|
||||
|
||||
|
||||
def _first_text(source: dict[str, Any], keys: tuple[str, ...]) -> str:
|
||||
for key in keys:
|
||||
value = str(source.get(key) or "").strip()
|
||||
if value:
|
||||
return value
|
||||
return ""
|
||||
|
||||
|
||||
def _percentile(values: list[Decimal], percent: int) -> Decimal:
|
||||
normalized = sorted(value for value in values if value >= ZERO)
|
||||
if not normalized:
|
||||
return ZERO
|
||||
if len(normalized) == 1:
|
||||
return normalized[0]
|
||||
position = Decimal(len(normalized) - 1) * Decimal(percent) / HUNDRED
|
||||
lower = int(position.to_integral_value(rounding=ROUND_FLOOR))
|
||||
upper = int(position.to_integral_value(rounding=ROUND_CEILING))
|
||||
if lower == upper:
|
||||
return normalized[lower]
|
||||
fraction = position - Decimal(lower)
|
||||
return normalized[lower] + (normalized[upper] - normalized[lower]) * fraction
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value or "0"))
|
||||
except Exception:
|
||||
return ZERO
|
||||
|
||||
|
||||
def _format_decimal(value: Any) -> str:
|
||||
if not isinstance(value, Decimal):
|
||||
value = _to_decimal(value)
|
||||
return format(value.quantize(Decimal("0.0001")), "f").rstrip("0").rstrip(".") or "0"
|
||||
|
||||
|
||||
def _canonical_key(value: Any) -> str:
|
||||
return "_".join(str(value or "").strip().lower().split())
|
||||
84
server/src/app/algorithem/risk_graph/quality.py
Normal file
84
server/src/app/algorithem/risk_graph/quality.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Data quality gates for strong financial risk conclusions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskGraphClaimSnapshot
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskDataQualityResult:
|
||||
passed: bool
|
||||
gate: str
|
||||
max_risk_score: int
|
||||
missing_fields: list[str] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"passed": self.passed,
|
||||
"gate": self.gate,
|
||||
"max_risk_score": self.max_risk_score,
|
||||
"missing_fields": list(self.missing_fields),
|
||||
"warnings": list(self.warnings),
|
||||
}
|
||||
|
||||
|
||||
class RiskDataQualityGate:
|
||||
"""Prevent weak source data from becoming strong automated conclusions."""
|
||||
|
||||
def evaluate_claim(self, claim: RiskGraphClaimSnapshot) -> RiskDataQualityResult:
|
||||
missing_fields: list[str] = []
|
||||
warnings: list[str] = []
|
||||
|
||||
if not str(claim.claim_id or "").strip():
|
||||
missing_fields.append("claim_id")
|
||||
if not (str(claim.employee_id or "").strip() or str(claim.employee_name or "").strip()):
|
||||
missing_fields.append("employee")
|
||||
if _to_decimal(claim.amount) <= Decimal("0"):
|
||||
missing_fields.append("amount")
|
||||
if not str(claim.expense_type or "").strip():
|
||||
warnings.append("expense_type")
|
||||
if claim.invoice_count > 0 and not claim.items:
|
||||
warnings.append("invoice_items")
|
||||
|
||||
if missing_fields:
|
||||
return RiskDataQualityResult(
|
||||
passed=False,
|
||||
gate="capped_missing_required_fields",
|
||||
max_risk_score=69,
|
||||
missing_fields=missing_fields,
|
||||
warnings=warnings,
|
||||
)
|
||||
if len(warnings) >= 2:
|
||||
return RiskDataQualityResult(
|
||||
passed=False,
|
||||
gate="capped_low_context_quality",
|
||||
max_risk_score=69,
|
||||
warnings=warnings,
|
||||
)
|
||||
return RiskDataQualityResult(
|
||||
passed=True,
|
||||
gate="passed",
|
||||
max_risk_score=100,
|
||||
warnings=warnings,
|
||||
)
|
||||
|
||||
def apply_score_cap(
|
||||
self,
|
||||
risk_score: int,
|
||||
result: RiskDataQualityResult,
|
||||
) -> tuple[int, str]:
|
||||
if risk_score > result.max_risk_score:
|
||||
return result.max_risk_score, result.gate
|
||||
return risk_score, result.gate
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value or "0"))
|
||||
except Exception:
|
||||
return Decimal("0")
|
||||
93
server/src/app/algorithem/risk_graph/replay.py
Normal file
93
server/src/app/algorithem/risk_graph/replay.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Replay-set contracts for risk graph algorithm evaluation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class AlgorithmReplayCase:
|
||||
replay_case_id: str
|
||||
claim_id: str
|
||||
ontology_version: str
|
||||
rule_version: str
|
||||
algorithm_version: str
|
||||
feedback_label: str
|
||||
payload: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"replay_case_id": self.replay_case_id,
|
||||
"claim_id": self.claim_id,
|
||||
"ontology_version": self.ontology_version,
|
||||
"rule_version": self.rule_version,
|
||||
"algorithm_version": self.algorithm_version,
|
||||
"feedback_label": self.feedback_label,
|
||||
"payload": dict(self.payload),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class AlgorithmReplaySet:
|
||||
replay_set_id: str
|
||||
created_at: datetime
|
||||
cases: list[AlgorithmReplayCase] = field(default_factory=list)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"replay_set_id": self.replay_set_id,
|
||||
"created_at": self.created_at.isoformat(),
|
||||
"case_count": len(self.cases),
|
||||
"cases": [item.as_dict() for item in self.cases],
|
||||
}
|
||||
|
||||
|
||||
class AlgorithmReplaySetBuilder:
|
||||
def build_from_observations(
|
||||
self,
|
||||
replay_set_id: str,
|
||||
observations: list[dict[str, Any]],
|
||||
*,
|
||||
created_at: datetime,
|
||||
) -> AlgorithmReplaySet:
|
||||
cases = [
|
||||
self._case_from_observation(index, observation)
|
||||
for index, observation in enumerate(observations, start=1)
|
||||
]
|
||||
return AlgorithmReplaySet(
|
||||
replay_set_id=replay_set_id,
|
||||
created_at=created_at,
|
||||
cases=cases,
|
||||
)
|
||||
|
||||
def _case_from_observation(
|
||||
self,
|
||||
index: int,
|
||||
observation: dict[str, Any],
|
||||
) -> AlgorithmReplayCase:
|
||||
ontology = observation.get("ontology_json") or {}
|
||||
trace = observation.get("decision_trace") or {}
|
||||
return AlgorithmReplayCase(
|
||||
replay_case_id=str(
|
||||
observation.get("evaluation_case_id")
|
||||
or trace.get("evaluation_case_id")
|
||||
or f"replay:{index}:{observation.get('observation_key') or 'observation'}"
|
||||
),
|
||||
claim_id=str(observation.get("claim_id") or ""),
|
||||
ontology_version=str(ontology.get("ontology_version") or ""),
|
||||
rule_version=str(trace.get("rule_version") or ""),
|
||||
algorithm_version=str(observation.get("algorithm_version") or ""),
|
||||
feedback_label=str(
|
||||
observation.get("feedback_status")
|
||||
or observation.get("status")
|
||||
or "unreviewed"
|
||||
),
|
||||
payload={
|
||||
"risk_signal": observation.get("risk_signal"),
|
||||
"risk_score": observation.get("risk_score"),
|
||||
"risk_level": observation.get("risk_level"),
|
||||
"decision_trace": trace,
|
||||
},
|
||||
)
|
||||
106
server/src/app/algorithem/risk_graph/rule_discovery.py
Normal file
106
server/src/app/algorithem/risk_graph/rule_discovery.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Candidate risk rule discovery from reviewed risk observations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class CandidateRiskRule:
|
||||
candidate_id: str
|
||||
rule_code: str
|
||||
title: str
|
||||
risk_signal: str
|
||||
evidence: list[dict[str, Any]]
|
||||
source: str
|
||||
confidence_score: float
|
||||
status: str = "candidate_review"
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"candidate_id": self.candidate_id,
|
||||
"rule_code": self.rule_code,
|
||||
"title": self.title,
|
||||
"risk_signal": self.risk_signal,
|
||||
"evidence": list(self.evidence),
|
||||
"source": self.source,
|
||||
"confidence_score": self.confidence_score,
|
||||
"status": self.status,
|
||||
}
|
||||
|
||||
|
||||
class CandidateRiskRuleDiscovery:
|
||||
def discover_from_feedback(
|
||||
self,
|
||||
observations: list[dict[str, Any]],
|
||||
feedback_items: list[dict[str, Any]],
|
||||
) -> list[CandidateRiskRule]:
|
||||
observation_by_key = {
|
||||
str(item.get("observation_key") or item.get("id") or ""): item
|
||||
for item in observations
|
||||
}
|
||||
candidates: list[CandidateRiskRule] = []
|
||||
for feedback in feedback_items:
|
||||
source = str(feedback.get("candidate_rule_source") or "").strip()
|
||||
decision = str(feedback.get("decision") or feedback.get("feedback_type") or "").strip()
|
||||
if source != "risk_observation_feedback" and "candidate" not in decision:
|
||||
continue
|
||||
observation_key = str(feedback.get("observation_key") or "").strip()
|
||||
observation = observation_by_key.get(observation_key, {})
|
||||
risk_signal = str(
|
||||
feedback.get("risk_signal") or observation.get("risk_signal") or ""
|
||||
).strip()
|
||||
if not risk_signal:
|
||||
continue
|
||||
confidence = _confidence(feedback, observation)
|
||||
candidates.append(
|
||||
CandidateRiskRule(
|
||||
candidate_id=f"candidate:{observation_key or risk_signal}:{risk_signal}",
|
||||
rule_code=f"candidate.risk.{risk_signal}",
|
||||
title=f"{risk_signal} candidate rule",
|
||||
risk_signal=risk_signal,
|
||||
evidence=_candidate_evidence(observation, feedback),
|
||||
source=source or "risk_observation_feedback",
|
||||
confidence_score=confidence,
|
||||
)
|
||||
)
|
||||
return _dedupe_candidates(candidates)
|
||||
|
||||
|
||||
def _confidence(feedback: dict[str, Any], observation: dict[str, Any]) -> float:
|
||||
raw = feedback.get("confidence_score")
|
||||
if raw in (None, ""):
|
||||
raw = observation.get("confidence_score")
|
||||
try:
|
||||
return max(0.0, min(1.0, float(raw or 0.55)))
|
||||
except (TypeError, ValueError):
|
||||
return 0.55
|
||||
|
||||
|
||||
def _candidate_evidence(
|
||||
observation: dict[str, Any],
|
||||
feedback: dict[str, Any],
|
||||
) -> list[dict[str, Any]]:
|
||||
evidence: list[dict[str, Any]] = []
|
||||
for item in observation.get("evidence", []) or []:
|
||||
if isinstance(item, dict):
|
||||
evidence.append({"source": item.get("source") or "observation", **item})
|
||||
evidence.append(
|
||||
{
|
||||
"source": feedback.get("candidate_rule_source") or "risk_observation_feedback",
|
||||
"feedback_type": feedback.get("feedback_type"),
|
||||
"action": feedback.get("action"),
|
||||
"comment": feedback.get("comment"),
|
||||
}
|
||||
)
|
||||
return evidence
|
||||
|
||||
|
||||
def _dedupe_candidates(candidates: list[CandidateRiskRule]) -> list[CandidateRiskRule]:
|
||||
by_code: dict[str, CandidateRiskRule] = {}
|
||||
for candidate in candidates:
|
||||
current = by_code.get(candidate.rule_code)
|
||||
if current is None or candidate.confidence_score > current.confidence_score:
|
||||
by_code[candidate.rule_code] = candidate
|
||||
return list(by_code.values())
|
||||
94
server/src/app/algorithem/risk_graph/sampling.py
Normal file
94
server/src/app/algorithem/risk_graph/sampling.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""Risk-based sampling strategy for audit review and replay."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskHistoryStats
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RiskSamplingDecision:
|
||||
strategy: str
|
||||
threshold: int
|
||||
replay_bucket: str
|
||||
audit_required: bool
|
||||
reason: str
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"strategy": self.strategy,
|
||||
"threshold": self.threshold,
|
||||
"replay_bucket": self.replay_bucket,
|
||||
"audit_required": self.audit_required,
|
||||
"reason": self.reason,
|
||||
}
|
||||
|
||||
|
||||
class RiskSamplingPlanner:
|
||||
def plan(
|
||||
self,
|
||||
*,
|
||||
risk_score: int,
|
||||
confidence: Decimal,
|
||||
evidence_source_count: int,
|
||||
data_quality_passed: bool = True,
|
||||
data_quality_gate: str = "",
|
||||
history: RiskHistoryStats | None = None,
|
||||
) -> RiskSamplingDecision:
|
||||
false_positive_rate = _false_positive_rate(history)
|
||||
if not data_quality_passed:
|
||||
return RiskSamplingDecision(
|
||||
strategy="uncertainty_sample",
|
||||
threshold=45,
|
||||
replay_bucket="data_quality_gate",
|
||||
audit_required=True,
|
||||
reason=data_quality_gate or "data_quality_gate_not_passed",
|
||||
)
|
||||
if risk_score >= 90:
|
||||
return RiskSamplingDecision(
|
||||
strategy="mandatory_review",
|
||||
threshold=90,
|
||||
replay_bucket="critical_high_risk",
|
||||
audit_required=True,
|
||||
reason="risk_score_above_critical_threshold",
|
||||
)
|
||||
if risk_score >= 70:
|
||||
return RiskSamplingDecision(
|
||||
strategy="focused_review",
|
||||
threshold=70,
|
||||
replay_bucket="high_risk",
|
||||
audit_required=True,
|
||||
reason="risk_score_above_high_threshold",
|
||||
)
|
||||
if false_positive_rate >= Decimal("0.30"):
|
||||
return RiskSamplingDecision(
|
||||
strategy="calibration_sample",
|
||||
threshold=45,
|
||||
replay_bucket="false_positive_calibration",
|
||||
audit_required=True,
|
||||
reason="historical_false_positive_rate_high",
|
||||
)
|
||||
if confidence < Decimal("0.55") or evidence_source_count < 2:
|
||||
return RiskSamplingDecision(
|
||||
strategy="uncertainty_sample",
|
||||
threshold=45,
|
||||
replay_bucket="low_confidence",
|
||||
audit_required=True,
|
||||
reason="confidence_or_evidence_source_insufficient",
|
||||
)
|
||||
return RiskSamplingDecision(
|
||||
strategy="monitor",
|
||||
threshold=31,
|
||||
replay_bucket="routine_monitoring",
|
||||
audit_required=False,
|
||||
reason="below_review_threshold",
|
||||
)
|
||||
|
||||
|
||||
def _false_positive_rate(history: RiskHistoryStats | None) -> Decimal:
|
||||
if history is None or history.similar_case_count <= 0:
|
||||
return Decimal("0")
|
||||
return Decimal(history.false_positive_count) / Decimal(history.similar_case_count)
|
||||
230
server/src/app/algorithem/risk_graph/signals.py
Normal file
230
server/src/app/algorithem/risk_graph/signals.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""Risk signal normalization shared by rules, ontology, and graph scoring."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
SEVERITY_SCORE = {
|
||||
"info": 12,
|
||||
"low": 32,
|
||||
"medium": 58,
|
||||
"high": 82,
|
||||
"critical": 100,
|
||||
}
|
||||
|
||||
SIGNAL_ALIASES: dict[str, str] = {
|
||||
"amount_over_limit": "amount_limit_exceeded",
|
||||
"over_budget": "budget_overrun",
|
||||
"budget_exceeded": "budget_overrun",
|
||||
"duplicate_expense": "duplicate_invoice",
|
||||
"duplicate_ticket": "duplicate_invoice",
|
||||
"risk.invoice.duplicate_invoice": "duplicate_invoice",
|
||||
"location_mismatch": "location_mismatch",
|
||||
"city_mismatch": "location_mismatch",
|
||||
"hotel_itinerary_mismatch": "hotel_itinerary_mismatch",
|
||||
"date_outside_trip": "date_outside_trip",
|
||||
"preapproval_absent": "preapproval_absent",
|
||||
"application_fields_missing": "application_fields_missing",
|
||||
"attachment_ocr_missing": "attachment_missing",
|
||||
"missing_attachment": "attachment_missing",
|
||||
"reason_too_brief": "reason_too_brief",
|
||||
"vague_ticket_content": "vague_goods_description",
|
||||
"personal_purpose": "personal_purpose",
|
||||
"split_billing": "split_billing",
|
||||
"frequency_anomaly": "frequency_anomaly",
|
||||
"collusion": "cross_department_cluster",
|
||||
"cross_department_cluster": "cross_department_cluster",
|
||||
"buyer_name_mismatch": "buyer_name_mismatch",
|
||||
"document_expense_mismatch": "document_expense_mismatch",
|
||||
"void_or_red_invoice": "void_or_red_invoice",
|
||||
"cross_year_invoice": "cross_year_invoice",
|
||||
"entertainment_missing_detail": "entertainment_missing_detail",
|
||||
}
|
||||
|
||||
SIGNAL_LABELS: dict[str, str] = {
|
||||
"amount_limit_exceeded": "Amount limit exceeded",
|
||||
"budget_overrun": "Budget overrun",
|
||||
"duplicate_invoice": "Duplicate invoice",
|
||||
"location_mismatch": "Location mismatch",
|
||||
"hotel_itinerary_mismatch": "Hotel and itinerary mismatch",
|
||||
"date_outside_trip": "Date outside approved trip",
|
||||
"preapproval_absent": "Pre-approval missing",
|
||||
"application_fields_missing": "Application fields missing",
|
||||
"attachment_missing": "Attachment missing",
|
||||
"reason_too_brief": "Reason too brief",
|
||||
"vague_goods_description": "Vague goods description",
|
||||
"personal_purpose": "Possible personal purpose",
|
||||
"split_billing": "Split billing pattern",
|
||||
"frequency_anomaly": "Frequency anomaly",
|
||||
"cross_department_cluster": "Cross-department spending cluster",
|
||||
"buyer_name_mismatch": "Buyer name mismatch",
|
||||
"document_expense_mismatch": "Document and expense mismatch",
|
||||
"void_or_red_invoice": "Void or red invoice",
|
||||
"cross_year_invoice": "Cross-year invoice",
|
||||
"entertainment_missing_detail": "Entertainment detail missing",
|
||||
}
|
||||
|
||||
SIGNAL_DEFAULT_SEVERITY: dict[str, str] = {
|
||||
"duplicate_invoice": "critical",
|
||||
"personal_purpose": "high",
|
||||
"preapproval_absent": "high",
|
||||
"date_outside_trip": "high",
|
||||
"amount_limit_exceeded": "high",
|
||||
"budget_overrun": "high",
|
||||
"split_billing": "high",
|
||||
"cross_department_cluster": "high",
|
||||
"location_mismatch": "medium",
|
||||
"hotel_itinerary_mismatch": "medium",
|
||||
"frequency_anomaly": "medium",
|
||||
"buyer_name_mismatch": "medium",
|
||||
"document_expense_mismatch": "medium",
|
||||
"void_or_red_invoice": "high",
|
||||
"cross_year_invoice": "medium",
|
||||
"entertainment_missing_detail": "medium",
|
||||
"application_fields_missing": "low",
|
||||
"attachment_missing": "low",
|
||||
"reason_too_brief": "low",
|
||||
"vague_goods_description": "low",
|
||||
}
|
||||
|
||||
POLICY_BOUND_SIGNALS = {
|
||||
"amount_limit_exceeded",
|
||||
"budget_overrun",
|
||||
"preapproval_absent",
|
||||
"date_outside_trip",
|
||||
"hotel_itinerary_mismatch",
|
||||
"location_mismatch",
|
||||
"document_expense_mismatch",
|
||||
"buyer_name_mismatch",
|
||||
"entertainment_missing_detail",
|
||||
"application_fields_missing",
|
||||
"attachment_missing",
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class NormalizedRiskSignal:
|
||||
code: str
|
||||
raw_code: str
|
||||
label: str
|
||||
severity: str
|
||||
score: int
|
||||
confidence: Decimal = Decimal("1")
|
||||
source: str = "rule"
|
||||
metadata: dict[str, Any] | None = None
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"code": self.code,
|
||||
"raw_code": self.raw_code,
|
||||
"label": self.label,
|
||||
"severity": self.severity,
|
||||
"score": self.score,
|
||||
"confidence": str(self.confidence),
|
||||
"source": self.source,
|
||||
"metadata": self.metadata or {},
|
||||
}
|
||||
|
||||
|
||||
def normalize_risk_signal(value: Any, *, source: str = "rule") -> NormalizedRiskSignal | None:
|
||||
if isinstance(value, dict):
|
||||
raw_code = _first_present(
|
||||
value,
|
||||
"risk_signal",
|
||||
"signal",
|
||||
"code",
|
||||
"risk_type",
|
||||
"rule_code",
|
||||
"type",
|
||||
)
|
||||
severity = str(value.get("severity") or value.get("risk_level") or "").strip().lower()
|
||||
confidence = _to_decimal(value.get("confidence") or value.get("score_confidence") or 1)
|
||||
explicit_score = value.get("risk_score") or value.get("score")
|
||||
metadata = dict(value)
|
||||
else:
|
||||
raw_code = str(value or "").strip()
|
||||
severity = ""
|
||||
confidence = Decimal("1")
|
||||
explicit_score = None
|
||||
metadata = {}
|
||||
|
||||
if not raw_code:
|
||||
return None
|
||||
|
||||
canonical = SIGNAL_ALIASES.get(raw_code.strip().lower(), raw_code.strip().lower())
|
||||
canonical = canonical.replace(" ", "_")
|
||||
severity = severity or SIGNAL_DEFAULT_SEVERITY.get(canonical, "medium")
|
||||
score = _score_from_value(explicit_score, severity=severity)
|
||||
return NormalizedRiskSignal(
|
||||
code=canonical,
|
||||
raw_code=raw_code,
|
||||
label=SIGNAL_LABELS.get(canonical, canonical.replace("_", " ").title()),
|
||||
severity=severity,
|
||||
score=score,
|
||||
confidence=max(Decimal("0"), min(Decimal("1"), confidence)),
|
||||
source=source,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
def normalize_risk_signals(
|
||||
values: list[Any],
|
||||
*,
|
||||
source: str = "rule",
|
||||
) -> list[NormalizedRiskSignal]:
|
||||
by_code: dict[str, NormalizedRiskSignal] = {}
|
||||
for value in values:
|
||||
signal = normalize_risk_signal(value, source=source)
|
||||
if signal is None:
|
||||
continue
|
||||
current = by_code.get(signal.code)
|
||||
if current is None or signal.score > current.score:
|
||||
by_code[signal.code] = signal
|
||||
return sorted(by_code.values(), key=lambda item: (item.score, item.code), reverse=True)
|
||||
|
||||
|
||||
def policy_refs_for_signal(signal_code: str) -> list[str]:
|
||||
signal_code = SIGNAL_ALIASES.get(str(signal_code or "").strip().lower(), signal_code)
|
||||
if signal_code not in POLICY_BOUND_SIGNALS:
|
||||
return []
|
||||
return [f"policy.{signal_code}"]
|
||||
|
||||
|
||||
def severity_from_score(score: int) -> str:
|
||||
normalized = max(0, min(100, int(score or 0)))
|
||||
if normalized >= 90:
|
||||
return "critical"
|
||||
if normalized >= 70:
|
||||
return "high"
|
||||
if normalized >= 45:
|
||||
return "medium"
|
||||
return "low"
|
||||
|
||||
|
||||
def _first_present(value: dict[str, Any], *keys: str) -> str:
|
||||
for key in keys:
|
||||
candidate = str(value.get(key) or "").strip()
|
||||
if candidate:
|
||||
return candidate
|
||||
return ""
|
||||
|
||||
|
||||
def _score_from_value(value: Any, *, severity: str) -> int:
|
||||
if value is None or value == "":
|
||||
return SEVERITY_SCORE.get(severity, SEVERITY_SCORE["medium"])
|
||||
try:
|
||||
numeric = Decimal(str(value))
|
||||
except Exception:
|
||||
return SEVERITY_SCORE.get(severity, SEVERITY_SCORE["medium"])
|
||||
if numeric <= Decimal("1"):
|
||||
numeric *= Decimal("100")
|
||||
return max(0, min(100, int(numeric.to_integral_value())))
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Decimal:
|
||||
try:
|
||||
return Decimal(str(value))
|
||||
except Exception:
|
||||
return Decimal("0")
|
||||
162
server/src/app/algorithem/risk_graph/temporal.py
Normal file
162
server/src/app/algorithem/risk_graph/temporal.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Temporal monitoring for risk graph relationship changes."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter, defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from .models import RiskGraphEdge
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class TemporalRiskGraphChange:
|
||||
change_type: str
|
||||
source_key: str
|
||||
target_key: str
|
||||
edge_type: str
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"change_type": self.change_type,
|
||||
"source_key": self.source_key,
|
||||
"target_key": self.target_key,
|
||||
"edge_type": self.edge_type,
|
||||
"metadata": dict(self.metadata),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class TemporalRiskGraphSnapshotDiff:
|
||||
changes: list[TemporalRiskGraphChange]
|
||||
edge_type_delta: dict[str, int]
|
||||
|
||||
def as_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"changes": [item.as_dict() for item in self.changes],
|
||||
"edge_type_delta": dict(self.edge_type_delta),
|
||||
}
|
||||
|
||||
|
||||
class TemporalRiskGraphMonitor:
|
||||
def monitor(
|
||||
self,
|
||||
previous_edges: list[RiskGraphEdge],
|
||||
current_edges: list[RiskGraphEdge],
|
||||
*,
|
||||
risk_node_keys: set[str] | None = None,
|
||||
) -> TemporalRiskGraphSnapshotDiff:
|
||||
previous = {edge.edge_key(): edge for edge in previous_edges}
|
||||
current = {edge.edge_key(): edge for edge in current_edges}
|
||||
risk_keys = set(risk_node_keys or set())
|
||||
|
||||
changes: list[TemporalRiskGraphChange] = []
|
||||
for key, edge in current.items():
|
||||
if key not in previous:
|
||||
changes.append(_change("relationship_added", edge))
|
||||
if edge.source_key in risk_keys or edge.target_key in risk_keys:
|
||||
changes.append(_change("risk_propagation", edge))
|
||||
for key, edge in previous.items():
|
||||
if key not in current:
|
||||
changes.append(_change("relationship_removed", edge))
|
||||
|
||||
changes.extend(_relationship_volume_changes(previous_edges, current_edges))
|
||||
changes.extend(_target_migrations(previous_edges, current_edges))
|
||||
return TemporalRiskGraphSnapshotDiff(
|
||||
changes=changes,
|
||||
edge_type_delta=_edge_type_delta(previous_edges, current_edges),
|
||||
)
|
||||
|
||||
|
||||
def _change(change_type: str, edge: RiskGraphEdge, **metadata: Any) -> TemporalRiskGraphChange:
|
||||
return TemporalRiskGraphChange(
|
||||
change_type=change_type,
|
||||
source_key=edge.source_key,
|
||||
target_key=edge.target_key,
|
||||
edge_type=edge.edge_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
def _edge_type_delta(
|
||||
previous_edges: list[RiskGraphEdge],
|
||||
current_edges: list[RiskGraphEdge],
|
||||
) -> dict[str, int]:
|
||||
previous_counts = Counter(edge.edge_type for edge in previous_edges)
|
||||
current_counts = Counter(edge.edge_type for edge in current_edges)
|
||||
edge_types = set(previous_counts) | set(current_counts)
|
||||
return {
|
||||
edge_type: current_counts.get(edge_type, 0) - previous_counts.get(edge_type, 0)
|
||||
for edge_type in sorted(edge_types)
|
||||
}
|
||||
|
||||
|
||||
def _relationship_volume_changes(
|
||||
previous_edges: list[RiskGraphEdge],
|
||||
current_edges: list[RiskGraphEdge],
|
||||
) -> list[TemporalRiskGraphChange]:
|
||||
changes: list[TemporalRiskGraphChange] = []
|
||||
previous_counts = Counter(edge.edge_type for edge in previous_edges)
|
||||
current_by_type: dict[str, list[RiskGraphEdge]] = defaultdict(list)
|
||||
for edge in current_edges:
|
||||
current_by_type[edge.edge_type].append(edge)
|
||||
for edge_type, current_group in current_by_type.items():
|
||||
previous_count = previous_counts.get(edge_type, 0)
|
||||
current_count = len(current_group)
|
||||
if current_count >= 3 and current_count >= max(1, previous_count) * 2:
|
||||
changes.append(
|
||||
_change(
|
||||
"relationship_surge",
|
||||
current_group[0],
|
||||
previous_count=previous_count,
|
||||
current_count=current_count,
|
||||
)
|
||||
)
|
||||
previous_by_type: dict[str, list[RiskGraphEdge]] = defaultdict(list)
|
||||
for edge in previous_edges:
|
||||
previous_by_type[edge.edge_type].append(edge)
|
||||
current_counts = Counter(edge.edge_type for edge in current_edges)
|
||||
for edge_type, previous_group in previous_by_type.items():
|
||||
if len(previous_group) >= 3 and current_counts.get(edge_type, 0) == 0:
|
||||
changes.append(
|
||||
_change(
|
||||
"relationship_disappeared",
|
||||
previous_group[0],
|
||||
previous_count=len(previous_group),
|
||||
current_count=0,
|
||||
)
|
||||
)
|
||||
return changes
|
||||
|
||||
|
||||
def _target_migrations(
|
||||
previous_edges: list[RiskGraphEdge],
|
||||
current_edges: list[RiskGraphEdge],
|
||||
) -> list[TemporalRiskGraphChange]:
|
||||
previous_targets: dict[tuple[str, str], set[str]] = defaultdict(set)
|
||||
current_targets: dict[tuple[str, str], set[str]] = defaultdict(set)
|
||||
for edge in previous_edges:
|
||||
previous_targets[(edge.source_key, edge.edge_type)].add(edge.target_key)
|
||||
for edge in current_edges:
|
||||
current_targets[(edge.source_key, edge.edge_type)].add(edge.target_key)
|
||||
|
||||
changes: list[TemporalRiskGraphChange] = []
|
||||
for key, current_target_set in current_targets.items():
|
||||
previous_target_set = previous_targets.get(key, set())
|
||||
if previous_target_set and current_target_set != previous_target_set:
|
||||
source_key, edge_type = key
|
||||
target_key = sorted(current_target_set - previous_target_set or current_target_set)[0]
|
||||
changes.append(
|
||||
TemporalRiskGraphChange(
|
||||
change_type="target_migration",
|
||||
source_key=source_key,
|
||||
target_key=target_key,
|
||||
edge_type=edge_type,
|
||||
metadata={
|
||||
"previous_targets": sorted(previous_target_set),
|
||||
"current_targets": sorted(current_target_set),
|
||||
},
|
||||
)
|
||||
)
|
||||
return changes
|
||||
Reference in New Issue
Block a user