feat: 新增风险图谱算法与系统仪表盘及操作反馈体系

后端新增风险图谱算法模块、风险观察与反馈服务、规则 DSL
校验器和可解释性引擎,完善系统仪表盘和财务仪表盘统计,
优化 agent 运行和编排执行链路,清理旧开发文档,前端新增
系统趋势、负载热力图等多种仪表盘图表组件,完善操作反馈
对话框和工作台日期选择器,优化报销创建和审批详情交互,
补充单元测试覆盖。
This commit is contained in:
caoxiaozhu
2026-05-30 15:46:51 +08:00
parent 4c59941ec6
commit 7989f3a159
314 changed files with 30073 additions and 20626 deletions

View File

@@ -18,21 +18,45 @@ from .employee_behavior_profile import (
score_by_bands,
)
from .employee_behavior_profile_tags import build_profile_radar, build_profile_tags
from .risk_graph import (
ALGORITHM_VERSION as FINANCIAL_RISK_GRAPH_ALGORITHM_VERSION,
RiskGraphClaimItemSnapshot,
RiskGraphClaimSnapshot,
RiskGraphEvaluationContext,
RiskGraphEvaluationResult,
RiskHistoryStats,
RiskObservationDraft,
evaluate_financial_risk_graph,
map_ontology_to_risk_graph,
normalize_risk_signal,
normalize_risk_signals,
)
__all__ = [
"ApplicantExpenseProfileInput",
"ApplicantExpenseProfileResult",
"EMPLOYEE_BEHAVIOR_PROFILE_ALGORITHM_VERSION",
"FINANCIAL_RISK_GRAPH_ALGORITHM_VERSION",
"ProfileComponent",
"ProfileScoreResult",
"RiskGraphClaimItemSnapshot",
"RiskGraphClaimSnapshot",
"RiskGraphEvaluationContext",
"RiskGraphEvaluationResult",
"RiskHistoryStats",
"RiskObservationDraft",
"build_review_suggestions",
"build_profile_radar",
"build_profile_tags",
"calculate_review_priority_score",
"evaluate_applicant_expense_profile",
"evaluate_financial_risk_graph",
"evaluate_weighted_profile",
"map_ontology_to_risk_graph",
"employee_profile_level_from_score",
"normalize_by_peer_percentiles",
"normalize_risk_signal",
"normalize_risk_signals",
"percentile",
"score_by_bands",
]

View File

@@ -0,0 +1,33 @@
"""Financial behavior graph risk engine."""
from .engine import evaluate_financial_risk_graph
from .models import (
ALGORITHM_VERSION,
RiskGraphClaimItemSnapshot,
RiskGraphClaimSnapshot,
RiskGraphEvaluationContext,
RiskGraphEvaluationResult,
RiskHistoryStats,
RiskObservationDraft,
)
from .ontology import OntologyRiskGraphMapping, map_ontology_to_risk_graph
from .profile_baselines import ProfileBaselineSnapshot, ProfileBaselineUpdater
from .signals import NormalizedRiskSignal, normalize_risk_signal, normalize_risk_signals
__all__ = [
"ALGORITHM_VERSION",
"NormalizedRiskSignal",
"OntologyRiskGraphMapping",
"RiskGraphClaimItemSnapshot",
"RiskGraphClaimSnapshot",
"RiskGraphEvaluationContext",
"RiskGraphEvaluationResult",
"RiskHistoryStats",
"RiskObservationDraft",
"ProfileBaselineSnapshot",
"ProfileBaselineUpdater",
"evaluate_financial_risk_graph",
"map_ontology_to_risk_graph",
"normalize_risk_signal",
"normalize_risk_signals",
]

View File

@@ -0,0 +1,175 @@
"""Deterministic multi-model anomaly detection for risk graph features."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from decimal import Decimal
from statistics import median
from typing import Any
ZERO = Decimal("0")
@dataclass(frozen=True, slots=True)
class AnomalyPoint:
key: str
amount: Decimal
occurred_at: datetime | None = None
segment: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
@dataclass(slots=True)
class AnomalyModelSignal:
method: str
score: int
reason: str
related_keys: list[str] = field(default_factory=list)
def as_dict(self) -> dict[str, Any]:
return {
"method": self.method,
"score": self.score,
"reason": self.reason,
"related_keys": list(self.related_keys),
}
class MultiModelAnomalyDetector:
def detect(
self,
points: list[AnomalyPoint],
*,
target_key: str,
) -> list[AnomalyModelSignal]:
target = next((point for point in points if point.key == target_key), None)
if target is None:
return []
peers = [
point
for point in points
if point.key != target.key and (not target.segment or point.segment == target.segment)
]
if len(peers) < 3:
return []
signals = [
self._robust_statistical_signal(target, peers),
self._isolation_proxy_signal(target, peers),
self._local_outlier_signal(target, peers),
self._temporal_jump_signal(target, peers),
self._periodic_deviation_signal(target, peers),
]
return [signal for signal in signals if signal is not None]
def _robust_statistical_signal(
self,
target: AnomalyPoint,
peers: list[AnomalyPoint],
) -> AnomalyModelSignal | None:
values = [point.amount for point in peers if point.amount >= ZERO]
if len(values) < 3:
return None
center = Decimal(str(median(values)))
deviations = [abs(value - center) for value in values]
mad = Decimal(str(median(deviations))) or Decimal("1")
modified_z = abs(target.amount - center) / mad
if modified_z < Decimal("3"):
return None
return AnomalyModelSignal(
method="robust_statistics",
score=min(100, int(modified_z * Decimal("18"))),
reason="Target amount deviates from peer median by robust MAD.",
related_keys=[point.key for point in peers],
)
def _isolation_proxy_signal(
self,
target: AnomalyPoint,
peers: list[AnomalyPoint],
) -> AnomalyModelSignal | None:
values = sorted(point.amount for point in peers)
if target.amount <= values[-1] * Decimal("1.8"):
return None
return AnomalyModelSignal(
method="isolation_forest_proxy",
score=min(100, int((target.amount / max(values[-1], Decimal("1"))) * Decimal("45"))),
reason="Target amount is isolated beyond the peer maximum envelope.",
related_keys=[point.key for point in peers[-5:]],
)
def _local_outlier_signal(
self,
target: AnomalyPoint,
peers: list[AnomalyPoint],
) -> AnomalyModelSignal | None:
distances = sorted((abs(target.amount - point.amount), point.key) for point in peers)
nearest = distances[: min(3, len(distances))]
peer_distances = [
abs(left.amount - right.amount)
for index, left in enumerate(peers)
for right in peers[index + 1 :]
]
local_scale = Decimal(str(median(peer_distances))) if peer_distances else Decimal("1")
local_scale = max(local_scale, Decimal("1"))
target_distance = sum((distance for distance, _ in nearest), ZERO) / Decimal(len(nearest))
ratio = target_distance / local_scale
if ratio < Decimal("2.5"):
return None
return AnomalyModelSignal(
method="local_outlier_factor_proxy",
score=min(100, int(ratio * Decimal("24"))),
reason="Target is far away from its nearest peer neighborhood.",
related_keys=[key for _, key in nearest],
)
def _temporal_jump_signal(
self,
target: AnomalyPoint,
peers: list[AnomalyPoint],
) -> AnomalyModelSignal | None:
if target.occurred_at is None:
return None
previous = [
point
for point in peers
if point.occurred_at is not None and point.occurred_at < target.occurred_at
]
previous = sorted(previous, key=lambda item: item.occurred_at or datetime.min)[-3:]
if len(previous) < 3:
return None
average = sum((point.amount for point in previous), ZERO) / Decimal(len(previous))
if average <= ZERO or target.amount < average * Decimal("2.2"):
return None
return AnomalyModelSignal(
method="temporal_jump",
score=min(100, int((target.amount / average) * Decimal("32"))),
reason="Target amount jumps above the recent moving average.",
related_keys=[point.key for point in previous],
)
def _periodic_deviation_signal(
self,
target: AnomalyPoint,
peers: list[AnomalyPoint],
) -> AnomalyModelSignal | None:
if target.occurred_at is None:
return None
same_period = [
point
for point in peers
if point.occurred_at is not None
and point.occurred_at.weekday() == target.occurred_at.weekday()
]
if len(same_period) < 2:
return None
average = sum((point.amount for point in same_period), ZERO) / Decimal(len(same_period))
if average <= ZERO or target.amount < average * Decimal("2"):
return None
return AnomalyModelSignal(
method="periodic_deviation",
score=min(100, int((target.amount / average) * Decimal("30"))),
reason="Target deviates from same-weekday periodic peer behavior.",
related_keys=[point.key for point in same_period],
)

View File

@@ -0,0 +1,183 @@
"""Multi-evidence and spatiotemporal consistency checks."""
from __future__ import annotations
from datetime import date, datetime
from decimal import Decimal
from typing import Any
from .models import RiskEvidence, RiskGraphClaimSnapshot
from .signals import NormalizedRiskSignal, normalize_risk_signals
ZERO = Decimal("0")
def evaluate_claim_consistency(
claim: RiskGraphClaimSnapshot,
) -> tuple[list[RiskEvidence], list[NormalizedRiskSignal]]:
evidence: list[RiskEvidence] = []
signals: list[NormalizedRiskSignal] = []
if _has_location_mismatch(claim):
evidence.append(
RiskEvidence(
code="location_mismatch_graph",
title="Location mismatch graph",
detail="Claim location and item location are not aligned.",
source="spatiotemporal",
score=64,
)
)
signals.extend(normalize_risk_signals(["location_mismatch"], source="spatiotemporal"))
amount_mismatch = _document_amount_mismatch(claim)
if amount_mismatch:
evidence.append(
RiskEvidence(
code="document_amount_mismatch",
title="Document amount mismatch",
detail="Claim amount and item amount sum are not aligned.",
source="multi_evidence",
score=72,
metadata=amount_mismatch,
)
)
signals.extend(
normalize_risk_signals(
[{"risk_signal": "document_expense_mismatch", "score": 72}],
source="multi_evidence",
)
)
invoice_count_mismatch = _invoice_count_mismatch(claim)
if invoice_count_mismatch:
evidence.append(
RiskEvidence(
code="invoice_count_mismatch",
title="Invoice count mismatch",
detail="Declared invoice count and attached invoice count are not aligned.",
source="multi_evidence",
score=62,
metadata=invoice_count_mismatch,
)
)
signals.extend(
normalize_risk_signals(
[{"risk_signal": "document_expense_mismatch", "score": 62}],
source="multi_evidence",
)
)
date_mismatch = _item_date_outside_claim_window(claim)
if date_mismatch:
evidence.append(
RiskEvidence(
code="date_outside_claim_window",
title="Date outside claim window",
detail="Item date is too far away from the claim occurrence date.",
source="spatiotemporal",
score=78,
metadata=date_mismatch,
)
)
signals.extend(normalize_risk_signals(["date_outside_trip"], source="spatiotemporal"))
return evidence, signals
def _has_location_mismatch(claim: RiskGraphClaimSnapshot) -> bool:
claim_location = _canonical_key(claim.location)
if not claim_location or not claim.items:
return False
item_locations = {
_canonical_key(item.item_location)
for item in claim.items
if str(item.item_location or "").strip()
}
if not item_locations:
return False
return any(location and location != claim_location for location in item_locations)
def _document_amount_mismatch(claim: RiskGraphClaimSnapshot) -> dict[str, str] | None:
if not claim.items:
return None
claim_amount = _to_decimal(claim.amount)
item_amount_sum = sum((_to_decimal(item.item_amount) for item in claim.items), ZERO)
if claim_amount <= ZERO or item_amount_sum <= ZERO:
return None
difference = abs(claim_amount - item_amount_sum)
tolerance = max(Decimal("1"), claim_amount * Decimal("0.02"))
if difference <= tolerance:
return None
return {
"claim_amount": str(claim_amount),
"item_amount_sum": str(item_amount_sum),
"difference": str(difference),
"tolerance": str(tolerance),
}
def _invoice_count_mismatch(claim: RiskGraphClaimSnapshot) -> dict[str, Any] | None:
declared_count = int(claim.invoice_count or 0)
if declared_count <= 0:
return None
invoice_ids = sorted(
{
str(item.invoice_id or "").strip()
for item in claim.items
if str(item.invoice_id or "").strip()
}
)
actual_count = len(invoice_ids)
if declared_count == actual_count:
return None
return {
"declared_invoice_count": declared_count,
"actual_invoice_count": actual_count,
"invoice_ids": invoice_ids,
}
def _item_date_outside_claim_window(claim: RiskGraphClaimSnapshot) -> dict[str, Any] | None:
occurred_date = _date_from_value(claim.occurred_at)
if occurred_date is None or not claim.items:
return None
mismatches: list[dict[str, Any]] = []
for item in claim.items:
item_date = _date_from_value(item.item_date)
if item_date is None:
continue
distance_days = abs((item_date - occurred_date).days)
if distance_days <= 7:
continue
mismatches.append(
{
"item_id": item.item_id,
"item_date": item_date.isoformat(),
"occurred_at": occurred_date.isoformat(),
"distance_days": distance_days,
}
)
return {"mismatches": mismatches} if mismatches else None
def _date_from_value(value: Any) -> date | None:
if value is None:
return None
if isinstance(value, datetime):
return value.date()
if isinstance(value, date):
return value
return None
def _canonical_key(value: Any) -> str:
return "_".join(str(value or "").strip().lower().split())
def _to_decimal(value: Any) -> Decimal:
try:
return Decimal(str(value or "0"))
except Exception:
return ZERO

View File

@@ -0,0 +1,77 @@
"""Control effect analysis for risk rules, sampling, and digital employees."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
HIGH_LEVELS = {"high", "critical"}
@dataclass(slots=True)
class ControlEffectSummary:
before_count: int
after_count: int
risk_count_delta: int
average_score_delta: float
high_rate_delta: float
confirmation_rate_delta: float
false_positive_rate_delta: float
def as_dict(self) -> dict[str, Any]:
return {
"before_count": self.before_count,
"after_count": self.after_count,
"risk_count_delta": self.risk_count_delta,
"average_score_delta": self.average_score_delta,
"high_rate_delta": self.high_rate_delta,
"confirmation_rate_delta": self.confirmation_rate_delta,
"false_positive_rate_delta": self.false_positive_rate_delta,
}
class ControlEffectAnalyzer:
def compare(
self,
before: list[dict[str, Any]],
after: list[dict[str, Any]],
) -> ControlEffectSummary:
before_metrics = _metrics(before)
after_metrics = _metrics(after)
return ControlEffectSummary(
before_count=before_metrics["count"],
after_count=after_metrics["count"],
risk_count_delta=after_metrics["count"] - before_metrics["count"],
average_score_delta=round(after_metrics["average_score"] - before_metrics["average_score"], 4),
high_rate_delta=round(after_metrics["high_rate"] - before_metrics["high_rate"], 4),
confirmation_rate_delta=round(
after_metrics["confirmation_rate"] - before_metrics["confirmation_rate"],
4,
),
false_positive_rate_delta=round(
after_metrics["false_positive_rate"] - before_metrics["false_positive_rate"],
4,
),
)
def _metrics(items: list[dict[str, Any]]) -> dict[str, Any]:
count = len(items)
if count == 0:
return {
"count": 0,
"average_score": 0.0,
"high_rate": 0.0,
"confirmation_rate": 0.0,
"false_positive_rate": 0.0,
}
confirmed = sum(1 for item in items if item.get("feedback_status") == "confirmed")
false_positive = sum(1 for item in items if item.get("feedback_status") == "false_positive")
reviewed = confirmed + false_positive
return {
"count": count,
"average_score": sum(int(item.get("risk_score") or 0) for item in items) / count,
"high_rate": sum(1 for item in items if item.get("risk_level") in HIGH_LEVELS) / count,
"confirmation_rate": confirmed / reviewed if reviewed else 0.0,
"false_positive_rate": false_positive / reviewed if reviewed else 0.0,
}

View File

@@ -0,0 +1,82 @@
"""Counterfactual recommendations for reducing financial risk scores."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
@dataclass(slots=True)
class CounterfactualRiskAction:
action_key: str
title: str
detail: str
related_feature: str
expected_score_delta: int
def as_dict(self) -> dict[str, Any]:
return {
"action_key": self.action_key,
"title": self.title,
"detail": self.detail,
"related_feature": self.related_feature,
"expected_score_delta": self.expected_score_delta,
}
class CounterfactualRiskAdvisor:
def advise(self, observation: dict[str, Any]) -> list[CounterfactualRiskAction]:
scores = dict(
observation.get("contribution_scores")
or observation.get("decision_trace", {}).get("input_scores")
or {}
)
evidence_codes = {
str(item.get("code") or "")
for item in observation.get("evidence", [])
if isinstance(item, dict)
}
trace = observation.get("decision_trace") or {}
actions: list[CounterfactualRiskAction] = []
if int(scores.get("S_rule") or 0) >= 70:
actions.append(
CounterfactualRiskAction(
action_key="complete_preapproval_or_required_attachment",
title="Complete required approval evidence",
detail="补齐事前申请、审批记录或制度要求的附件,可降低规则命中风险。",
related_feature="S_rule",
expected_score_delta=-20,
)
)
if int(scores.get("S_anomaly") or 0) >= 70:
actions.append(
CounterfactualRiskAction(
action_key="align_amount_with_peer_baseline",
title="Align amount with peer baseline",
detail="补充高金额原因或拆出不属于本次报销的费用,可降低基线偏离风险。",
related_feature="S_anomaly",
expected_score_delta=-18,
)
)
if int(scores.get("S_graph") or 0) >= 70 or "duplicate_invoice_graph" in evidence_codes:
actions.append(
CounterfactualRiskAction(
action_key="replace_duplicate_or_conflicting_invoice",
title="Replace conflicting invoice",
detail="替换重复票据、修正票据归属或说明跨单据复用原因,可降低图谱异常风险。",
related_feature="S_graph",
expected_score_delta=-25,
)
)
if trace.get("data_quality_gate") not in {"", "passed", None}:
actions.append(
CounterfactualRiskAction(
action_key="supplement_missing_risk_data",
title="Supplement missing risk data",
detail="补齐员工、金额、费用类型、票据明细等关键字段后再进入强风控判断。",
related_feature="data_quality",
expected_score_delta=-10,
)
)
return actions

View File

@@ -0,0 +1,132 @@
"""Decision trace and explanation helpers for risk graph observations."""
from __future__ import annotations
from dataclasses import dataclass, field
from decimal import Decimal
from typing import Any
from .models import PeerBaseline, RiskEvidence
RISK_SCORE_FORMULA = (
"0.35*S_rule + 0.25*S_anomaly + "
"0.20*S_graph + 0.15*S_policy + 0.05*S_history"
)
@dataclass(slots=True)
class DecisionTrace:
formula: str
algorithm_version: str
input_scores: dict[str, int]
output_score: int
decision_row: str
feature_contributions_json: list[dict[str, Any]]
uncertainty_reasons_json: list[str]
explanation_template_key: str
metadata: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"formula": self.formula,
"algorithm_version": self.algorithm_version,
"input_scores": dict(self.input_scores),
"output_score": self.output_score,
"decision_row": self.decision_row,
"feature_contributions_json": list(self.feature_contributions_json),
"uncertainty_reasons_json": list(self.uncertainty_reasons_json),
"explanation_template_key": self.explanation_template_key,
**self.metadata,
}
class DecisionTraceBuilder:
def build(
self,
*,
algorithm_version: str,
risk_signal: str,
risk_level: str,
raw_risk_score: int,
risk_score: int,
contribution_scores: dict[str, int],
evidence: list[RiskEvidence],
baseline: PeerBaseline,
confidence: Decimal,
metadata: dict[str, Any],
) -> DecisionTrace:
return DecisionTrace(
formula=RISK_SCORE_FORMULA,
algorithm_version=algorithm_version,
input_scores=contribution_scores,
output_score=risk_score,
decision_row=_decision_row(risk_score=risk_score, risk_level=risk_level),
feature_contributions_json=_feature_contributions(contribution_scores),
uncertainty_reasons_json=_uncertainty_reasons(
raw_risk_score=raw_risk_score,
risk_score=risk_score,
evidence=evidence,
baseline=baseline,
confidence=confidence,
metadata=metadata,
),
explanation_template_key=f"risk.{risk_signal}.{risk_level}",
metadata=metadata,
)
def _decision_row(*, risk_score: int, risk_level: str) -> str:
if risk_score >= 90:
return f"{risk_level}:score>=90"
if risk_score >= 70:
return f"{risk_level}:70<=score<90"
if risk_score >= 45:
return f"{risk_level}:45<=score<70"
return f"{risk_level}:score<45"
def _feature_contributions(scores: dict[str, int]) -> list[dict[str, Any]]:
weights = {
"S_rule": Decimal("0.35"),
"S_anomaly": Decimal("0.25"),
"S_graph": Decimal("0.20"),
"S_policy": Decimal("0.15"),
"S_history": Decimal("0.05"),
}
rows = []
for key, score in scores.items():
weighted_score = Decimal(int(score or 0)) * weights.get(key, Decimal("0"))
rows.append(
{
"feature": key,
"score": int(score or 0),
"weight": str(weights.get(key, Decimal("0"))),
"weighted_score": float(weighted_score),
}
)
return sorted(rows, key=lambda item: item["weighted_score"], reverse=True)
def _uncertainty_reasons(
*,
raw_risk_score: int,
risk_score: int,
evidence: list[RiskEvidence],
baseline: PeerBaseline,
confidence: Decimal,
metadata: dict[str, Any],
) -> list[str]:
reasons: list[str] = []
if risk_score < raw_risk_score:
reasons.append("score_capped_by_gate")
if baseline.scope == "insufficient_sample" or baseline.sample_size <= 0:
reasons.append("peer_baseline_insufficient")
if confidence < Decimal("0.55"):
reasons.append("low_confidence")
if len({item.source for item in evidence if item.source}) < 2:
reasons.append("single_evidence_source")
if metadata.get("ontology_gate") == "candidate_only":
reasons.append("ontology_candidate_only")
if metadata.get("data_quality_gate") not in {"", "passed", None}:
reasons.append("data_quality_gate_not_passed")
return list(dict.fromkeys(reasons))

View File

@@ -0,0 +1,794 @@
"""Financial behavior graph risk scoring engine."""
from __future__ import annotations
from decimal import ROUND_CEILING, ROUND_FLOOR, ROUND_HALF_UP, Decimal
from typing import Any
from .consistency import evaluate_claim_consistency
from .decisioning import DecisionTraceBuilder
from .graph import build_claim_graph, claim_node_key, employee_node_key
from .models import (
ALGORITHM_VERSION,
AUTOMATION_ASSIST,
AUTOMATION_AUTO_HOLD,
AUTOMATION_MANUAL_REVIEW,
AUTOMATION_SEMI_AUTO_REVIEW,
LEVEL_CRITICAL,
LEVEL_HIGH,
LEVEL_LOW,
LEVEL_MEDIUM,
PeerBaseline,
RiskEvidence,
RiskGraphClaimSnapshot,
RiskGraphEdge,
RiskGraphEvaluationContext,
RiskGraphEvaluationResult,
RiskGraphNode,
RiskHistoryStats,
RiskObservationDraft,
)
from .ontology import map_ontology_to_risk_graph
from .quality import RiskDataQualityGate
from .sampling import RiskSamplingPlanner
from .signals import (
NormalizedRiskSignal,
normalize_risk_signals,
policy_refs_for_signal,
)
ZERO = Decimal("0")
ONE = Decimal("1")
HUNDRED = Decimal("100")
DATA_QUALITY_GATE = RiskDataQualityGate()
SAMPLING_PLANNER = RiskSamplingPlanner()
DECISION_TRACE_BUILDER = DecisionTraceBuilder()
def evaluate_financial_risk_graph(
context: RiskGraphEvaluationContext,
) -> RiskGraphEvaluationResult:
nodes, edges = build_claim_graph(context.claims)
ontology_mapping = map_ontology_to_risk_graph(
context.ontology_parse,
ontology_parse_id=context.ontology_parse_id,
ontology_version=context.ontology_version,
)
nodes = _merge_nodes(nodes, ontology_mapping.nodes)
edges = _merge_edges(edges, ontology_mapping.edges)
target_ids = context.target_claim_ids or {claim.claim_id for claim in context.claims}
target_claims = [claim for claim in context.claims if claim.claim_id in target_ids]
observations: list[RiskObservationDraft] = []
for claim in target_claims:
baseline = _resolve_peer_baseline(claim, context.claims, context.min_peer_sample_size)
rule_score, rule_evidence, rule_signals = _score_rule_signals(claim)
anomaly_score, anomaly_evidence = _score_amount_anomaly(claim, baseline)
graph_score, graph_evidence, graph_signals = _score_graph_anomaly(claim, context)
policy_score, policy_evidence, policy_refs = _score_policy_relevance(
rule_signals + graph_signals + ontology_mapping.risk_signals,
)
history_score, history_evidence, history = _score_history(
claim,
rule_signals + graph_signals + ontology_mapping.risk_signals,
context.history_stats,
)
contribution_scores = {
"S_rule": rule_score,
"S_anomaly": anomaly_score,
"S_graph": graph_score,
"S_policy": policy_score,
"S_history": history_score,
}
raw_risk_score = _weighted_risk_score(contribution_scores)
quality_result = DATA_QUALITY_GATE.evaluate_claim(claim)
evidence = [
*rule_evidence,
*anomaly_evidence,
*graph_evidence,
*policy_evidence,
*history_evidence,
]
risk_score, evidence_source_gate = _apply_evidence_source_gate(
raw_risk_score,
evidence,
)
risk_score, data_quality_gate = DATA_QUALITY_GATE.apply_score_cap(
risk_score,
quality_result,
)
if risk_score < context.observation_threshold and ontology_mapping.gate != "candidate_only":
continue
if risk_score < context.observation_threshold and not ontology_mapping.risk_signals:
continue
evidence_source_count = _evidence_source_count(evidence)
primary_signal = _select_primary_signal(
rule_signals + graph_signals + ontology_mapping.risk_signals,
fallback_score=risk_score,
)
confidence = _calculate_confidence(
evidence=evidence,
baseline=baseline,
ontology_confidence=ontology_mapping.confidence,
history=history,
data_quality_ok=quality_result.passed,
)
automation_mode = _resolve_automation_mode(
risk_score=risk_score,
confidence=confidence,
evidence_count=len(evidence),
history=history,
)
sampling_decision = SAMPLING_PLANNER.plan(
risk_score=risk_score,
confidence=confidence,
evidence_source_count=evidence_source_count,
data_quality_passed=quality_result.passed,
data_quality_gate=data_quality_gate,
history=history,
)
risk_level = _level_from_score(risk_score)
decision_metadata = {
"raw_risk_score": raw_risk_score,
"evidence_source_count": evidence_source_count,
"evidence_source_gate": evidence_source_gate,
"data_quality_gate": data_quality_gate,
"data_quality": quality_result.as_dict(),
"sampling_strategy": sampling_decision.as_dict(),
"contribution_scores": contribution_scores,
"baseline_scope": baseline.scope,
"ontology_gate": ontology_mapping.gate,
}
decision_trace = DECISION_TRACE_BUILDER.build(
algorithm_version=ALGORITHM_VERSION,
risk_signal=primary_signal.code,
risk_level=risk_level,
raw_risk_score=raw_risk_score,
risk_score=risk_score,
contribution_scores=contribution_scores,
evidence=evidence,
baseline=baseline,
confidence=confidence,
metadata=decision_metadata,
)
graph_node_keys = _claim_related_node_keys(claim, nodes)
graph_edge_keys = _claim_related_edge_keys(claim, edges)
similar_case_ids = _similar_case_ids(claim, context.claims)
observations.append(
RiskObservationDraft(
observation_key=f"risk:{claim.claim_id}:{primary_signal.code}",
subject_type="expense_claim",
subject_key=f"claim:{claim.claim_id}",
subject_label=claim.claim_no or claim.claim_id,
claim_id=claim.claim_id,
claim_no=claim.claim_no,
risk_type=primary_signal.code,
risk_signal=primary_signal.code,
title=f"{primary_signal.label} risk",
description=_build_description(claim, primary_signal, risk_score, evidence),
risk_score=risk_score,
risk_level=risk_level,
confidence_score=confidence,
control_stage="reimbursement",
control_mode="risk_observation",
automation_mode=automation_mode,
source="financial_risk_graph",
algorithm_version=ALGORITHM_VERSION,
contribution_scores=contribution_scores,
baseline=baseline,
evidence=evidence,
graph_node_keys=graph_node_keys,
graph_edge_keys=graph_edge_keys,
policy_refs=policy_refs,
similar_case_claim_ids=similar_case_ids,
ontology_json=ontology_mapping.as_dict(),
decision_trace=decision_trace.as_dict(),
)
)
return RiskGraphEvaluationResult(
observations=sorted(observations, key=lambda item: item.risk_score, reverse=True),
nodes=nodes,
edges=edges,
)
def _score_rule_signals(
claim: RiskGraphClaimSnapshot,
) -> tuple[int, list[RiskEvidence], list[NormalizedRiskSignal]]:
signals = normalize_risk_signals(claim.risk_flags, source="rule")
if not signals:
return 0, [], []
score = min(100, max(item.score for item in signals) + max(0, len(signals) - 1) * 5)
evidence = [
RiskEvidence(
code="rule_signal",
title="Rule signal",
detail=f"{signal.label}: {signal.severity}",
source="rule",
score=signal.score,
metadata=signal.as_dict(),
)
for signal in signals
]
return score, evidence, signals
def _score_amount_anomaly(
claim: RiskGraphClaimSnapshot,
baseline: PeerBaseline,
) -> tuple[int, list[RiskEvidence]]:
amount = _to_decimal(claim.amount)
if baseline.sample_size <= 0 or baseline.p75_amount <= ZERO:
return 0, [
RiskEvidence(
code="baseline_unavailable",
title="Baseline unavailable",
detail=baseline.fallback_reason or "No comparable peer sample.",
source="baseline",
)
]
ratio = _safe_ratio(amount, baseline.p75_amount)
score = _score_ratio(
ratio,
[
(Decimal("1.00"), 0),
(Decimal("1.25"), 30),
(Decimal("1.50"), 55),
(Decimal("2.00"), 75),
(Decimal("3.00"), 95),
],
)
if score <= 0:
return 0, []
return score, [
RiskEvidence(
code="peer_amount_deviation",
title="Peer amount deviation",
detail=(
f"Claim amount {amount} is {ratio.quantize(Decimal('0.0001'))} "
f"times peer p75 {baseline.p75_amount}."
),
source="baseline",
score=score,
metadata={"ratio": str(ratio), "baseline": baseline.as_dict()},
)
]
def _score_graph_anomaly(
claim: RiskGraphClaimSnapshot,
context: RiskGraphEvaluationContext,
) -> tuple[int, list[RiskEvidence], list[NormalizedRiskSignal]]:
evidence: list[RiskEvidence] = []
signals: list[NormalizedRiskSignal] = []
duplicate_claims = _duplicate_invoice_claims(claim, context.claims)
if duplicate_claims:
evidence.append(
RiskEvidence(
code="duplicate_invoice_graph",
title="Duplicate invoice graph",
detail="Same invoice appears in multiple claims.",
source="graph",
score=95,
related_entity_keys=[f"claim:{item.claim_id}" for item in duplicate_claims],
)
)
signals.extend(normalize_risk_signals(["duplicate_invoice"], source="graph"))
split_claims = _split_billing_claims(claim, context.claims, context.near_threshold_amount)
if len(split_claims) >= 3:
evidence.append(
RiskEvidence(
code="split_billing_graph",
title="Split billing graph",
detail="Same employee submitted several near-threshold claims in 7 days.",
source="graph",
score=78,
related_entity_keys=[f"claim:{item.claim_id}" for item in split_claims],
)
)
signals.extend(normalize_risk_signals(["split_billing"], source="graph"))
frequency_claims = _employee_frequency_claims(claim, context.claims)
if len(frequency_claims) >= 4:
score = min(88, 52 + len(frequency_claims) * 6)
evidence.append(
RiskEvidence(
code="frequency_graph",
title="Frequency graph",
detail="Same employee has dense claims under the same expense type.",
source="graph",
score=score,
related_entity_keys=[f"claim:{item.claim_id}" for item in frequency_claims],
)
)
signals.extend(normalize_risk_signals(["frequency_anomaly"], source="graph"))
consistency_evidence, consistency_signals = evaluate_claim_consistency(claim)
evidence.extend(consistency_evidence)
signals.extend(consistency_signals)
cluster_claims = _cross_department_cluster_claims(claim, context.claims)
if len(cluster_claims) >= 3:
evidence.append(
RiskEvidence(
code="cross_department_cluster",
title="Cross-department cluster",
detail="Multiple departments produced similar high-value claims together.",
source="graph",
score=74,
related_entity_keys=[f"claim:{item.claim_id}" for item in cluster_claims],
)
)
signals.extend(normalize_risk_signals(["cross_department_cluster"], source="graph"))
if not evidence:
return 0, [], []
score = min(100, max(item.score for item in evidence) + max(0, len(evidence) - 1) * 6)
return score, evidence, _dedupe_signals(signals)
def _score_policy_relevance(
signals: list[NormalizedRiskSignal],
) -> tuple[int, list[RiskEvidence], list[str]]:
refs: list[str] = []
for signal in signals:
for ref in policy_refs_for_signal(signal.code):
if ref not in refs:
refs.append(ref)
if not refs:
return 0, [], []
score = min(88, 45 + len(refs) * 12)
return score, [
RiskEvidence(
code="policy_relevance",
title="Policy relevance",
detail="Risk signal is bound to policy or control clause.",
source="policy",
score=score,
metadata={"policy_refs": refs},
)
], refs
def _score_history(
claim: RiskGraphClaimSnapshot,
signals: list[NormalizedRiskSignal],
history_stats: list[RiskHistoryStats],
) -> tuple[int, list[RiskEvidence], RiskHistoryStats | None]:
signal_codes = {item.code for item in signals}
expense_type = _canonical_key(claim.expense_type)
matched = [
item
for item in history_stats
if item.risk_signal in signal_codes
and (not item.expense_type or _canonical_key(item.expense_type) == expense_type)
]
if not matched:
return 0, [], None
history = max(matched, key=lambda item: item.similar_case_count)
total = max(1, history.similar_case_count)
confirmed_rate = Decimal(history.confirmed_count) / Decimal(total)
returned_rate = Decimal(history.returned_count) / Decimal(total)
false_positive_rate = Decimal(history.false_positive_count) / Decimal(total)
score = _clamp_score(
HUNDRED * (confirmed_rate * Decimal("0.65") + returned_rate * Decimal("0.35"))
- HUNDRED * false_positive_rate * Decimal("0.50")
)
if score <= 0:
return 0, [], history
return score, [
RiskEvidence(
code="history_feedback",
title="History feedback",
detail="Similar historical cases contain confirmed or returned risks.",
source="feedback",
score=score,
metadata=history.as_dict(),
)
], history
def _resolve_peer_baseline(
target: RiskGraphClaimSnapshot,
claims: list[RiskGraphClaimSnapshot],
min_sample_size: int,
) -> PeerBaseline:
candidates = [claim for claim in claims if claim.claim_id != target.claim_id]
scopes = [
(
"department_grade_expense_type",
[
claim
for claim in candidates
if _same(claim.department_name, target.department_name)
and _same(claim.employee_grade, target.employee_grade)
and _same(claim.expense_type, target.expense_type)
],
),
(
"department_expense_type",
[
claim
for claim in candidates
if _same(claim.department_name, target.department_name)
and _same(claim.expense_type, target.expense_type)
],
),
(
"expense_type",
[claim for claim in candidates if _same(claim.expense_type, target.expense_type)],
),
("all_claims", candidates),
]
for scope, scoped_claims in scopes:
amounts = [
_to_decimal(claim.amount)
for claim in scoped_claims
if _to_decimal(claim.amount) > ZERO
]
if len(amounts) >= min_sample_size:
return _build_baseline(scope, amounts)
return PeerBaseline(
scope="insufficient_sample",
sample_size=0,
fallback_reason="Peer sample is below minimum threshold.",
)
def _build_baseline(scope: str, amounts: list[Decimal]) -> PeerBaseline:
return PeerBaseline(
scope=scope,
sample_size=len(amounts),
median_amount=_percentile(amounts, 50),
p75_amount=_percentile(amounts, 75),
p90_amount=_percentile(amounts, 90),
mean_amount=sum(amounts, ZERO) / Decimal(len(amounts)),
)
def _weighted_risk_score(scores: dict[str, int]) -> int:
weighted = (
Decimal(scores["S_rule"]) * Decimal("0.35")
+ Decimal(scores["S_anomaly"]) * Decimal("0.25")
+ Decimal(scores["S_graph"]) * Decimal("0.20")
+ Decimal(scores["S_policy"]) * Decimal("0.15")
+ Decimal(scores["S_history"]) * Decimal("0.05")
)
return _clamp_score(weighted)
def _evidence_source_count(evidence: list[RiskEvidence]) -> int:
return len(
{
str(item.source or "").strip()
for item in evidence
if str(item.source or "").strip()
}
)
def _apply_evidence_source_gate(
risk_score: int,
evidence: list[RiskEvidence],
) -> tuple[int, str]:
if risk_score >= 70 and _evidence_source_count(evidence) < 2:
return 69, "capped_high_risk_single_source"
return risk_score, "passed"
def _select_primary_signal(
signals: list[NormalizedRiskSignal],
*,
fallback_score: int,
) -> NormalizedRiskSignal:
deduped = _dedupe_signals(signals)
if deduped:
return max(deduped, key=lambda item: (item.score, item.confidence, item.code))
fallback = normalize_risk_signals(
[{"risk_signal": "amount_limit_exceeded", "score": fallback_score}],
source="algorithm",
)
return fallback[0]
def _calculate_confidence(
*,
evidence: list[RiskEvidence],
baseline: PeerBaseline,
ontology_confidence: Decimal,
history: RiskHistoryStats | None,
data_quality_ok: bool,
) -> Decimal:
source_count = len({item.source for item in evidence})
confidence = Decimal("0.42") + min(Decimal("0.30"), Decimal(source_count) * Decimal("0.10"))
confidence += min(Decimal("0.16"), Decimal(baseline.sample_size) / Decimal("50"))
confidence += ontology_confidence * Decimal("0.08")
if history and history.similar_case_count:
false_positive_rate = Decimal(history.false_positive_count) / Decimal(
history.similar_case_count
)
confidence -= min(Decimal("0.18"), false_positive_rate * Decimal("0.30"))
if not data_quality_ok:
confidence -= Decimal("0.20")
return max(Decimal("0.05"), min(Decimal("0.98"), confidence.quantize(Decimal("0.0001"))))
def _resolve_automation_mode(
*,
risk_score: int,
confidence: Decimal,
evidence_count: int,
history: RiskHistoryStats | None,
) -> str:
false_positive_rate = Decimal("0")
if history and history.similar_case_count:
false_positive_rate = Decimal(history.false_positive_count) / Decimal(
history.similar_case_count
)
if (
risk_score >= 90
and confidence >= Decimal("0.90")
and evidence_count >= 3
and false_positive_rate <= Decimal("0.10")
):
return AUTOMATION_AUTO_HOLD
if risk_score >= 75 and confidence >= Decimal("0.72") and evidence_count >= 2:
return AUTOMATION_SEMI_AUTO_REVIEW
if risk_score >= 40:
return AUTOMATION_MANUAL_REVIEW
return AUTOMATION_ASSIST
def _duplicate_invoice_claims(
target: RiskGraphClaimSnapshot,
claims: list[RiskGraphClaimSnapshot],
) -> list[RiskGraphClaimSnapshot]:
invoice_ids = {item.invoice_id for item in target.items if item.invoice_id}
if not invoice_ids:
return []
matched = []
for claim in claims:
if claim.claim_id == target.claim_id:
continue
if any(item.invoice_id in invoice_ids for item in claim.items if item.invoice_id):
matched.append(claim)
return matched
def _split_billing_claims(
target: RiskGraphClaimSnapshot,
claims: list[RiskGraphClaimSnapshot],
near_threshold_amount: Decimal,
) -> list[RiskGraphClaimSnapshot]:
if target.occurred_at is None:
return []
matched = [
claim
for claim in claims
if _same_employee(claim, target)
and _same(claim.expense_type, target.expense_type)
and _same(claim.location, target.location)
and claim.occurred_at is not None
and abs((claim.occurred_at.date() - target.occurred_at.date()).days) <= 7
and _to_decimal(claim.amount) <= near_threshold_amount
and _to_decimal(claim.amount) >= near_threshold_amount * Decimal("0.55")
]
return matched
def _employee_frequency_claims(
target: RiskGraphClaimSnapshot,
claims: list[RiskGraphClaimSnapshot],
) -> list[RiskGraphClaimSnapshot]:
if target.occurred_at is None:
return []
return [
claim
for claim in claims
if _same_employee(claim, target)
and _same(claim.expense_type, target.expense_type)
and claim.occurred_at is not None
and abs((claim.occurred_at.date() - target.occurred_at.date()).days) <= 30
]
def _cross_department_cluster_claims(
target: RiskGraphClaimSnapshot,
claims: list[RiskGraphClaimSnapshot],
) -> list[RiskGraphClaimSnapshot]:
if target.occurred_at is None or not target.location:
return []
matched = [
claim
for claim in claims
if claim.occurred_at is not None
and claim.occurred_at.date() == target.occurred_at.date()
and _same(claim.location, target.location)
and _same(claim.expense_type, target.expense_type)
and _to_decimal(claim.amount) >= _to_decimal(target.amount) * Decimal("0.65")
]
departments = {
_canonical_key(claim.department_name)
for claim in matched
if claim.department_name
}
return matched if len(departments) >= 2 else []
def _similar_case_ids(
target: RiskGraphClaimSnapshot,
claims: list[RiskGraphClaimSnapshot],
) -> list[str]:
return [
claim.claim_id
for claim in _employee_frequency_claims(target, claims)
if claim.claim_id != target.claim_id
][:8]
def _claim_related_node_keys(
claim: RiskGraphClaimSnapshot,
nodes: list[RiskGraphNode],
) -> list[str]:
claim_key = claim_node_key(claim)
employee_key = employee_node_key(claim)
related = {claim_key}
if employee_key:
related.add(employee_key)
for node in nodes:
if str(node.key).startswith(("expense_type:", "department:", "location:")):
if str(node.label or "").strip() in {
claim.expense_type,
claim.department_name,
claim.location,
}:
related.add(node.key)
return sorted(related)
def _claim_related_edge_keys(
claim: RiskGraphClaimSnapshot,
edges: list[RiskGraphEdge],
) -> list[dict[str, str]]:
claim_key = claim_node_key(claim)
return [
{
"source_key": edge.source_key,
"target_key": edge.target_key,
"edge_type": edge.edge_type,
}
for edge in edges
if edge.source_key == claim_key or edge.target_key == claim_key
]
def _build_description(
claim: RiskGraphClaimSnapshot,
signal: NormalizedRiskSignal,
risk_score: int,
evidence: list[RiskEvidence],
) -> str:
top_evidence = max(evidence, key=lambda item: item.score, default=None)
if top_evidence is None:
return (
f"{claim.claim_no or claim.claim_id} produced "
f"{signal.label} with score {risk_score}."
)
return (
f"{claim.claim_no or claim.claim_id} produced {signal.label} "
f"with score {risk_score}. Main evidence: {top_evidence.detail}"
)
def _level_from_score(score: int) -> str:
if score >= 90:
return LEVEL_CRITICAL
if score >= 70:
return LEVEL_HIGH
if score >= 45:
return LEVEL_MEDIUM
return LEVEL_LOW
def _score_ratio(value: Decimal, bands: list[tuple[Decimal, int]]) -> int:
if not bands:
return 0
points = sorted(bands, key=lambda item: item[0])
if value <= points[0][0]:
return points[0][1]
for index in range(1, len(points)):
left_value, left_score = points[index - 1]
right_value, right_score = points[index]
if value > right_value:
continue
ratio = (value - left_value) / (right_value - left_value)
return _clamp_score(Decimal(left_score) + ratio * Decimal(right_score - left_score))
return points[-1][1]
def _percentile(values: list[Decimal], percent: int) -> Decimal:
normalized = sorted(value for value in values if value >= ZERO)
if not normalized:
return ZERO
if len(normalized) == 1:
return normalized[0]
position = Decimal(len(normalized) - 1) * Decimal(percent) / HUNDRED
lower = int(position.to_integral_value(rounding=ROUND_FLOOR))
upper = int(position.to_integral_value(rounding=ROUND_CEILING))
if lower == upper:
return normalized[lower]
fraction = position - Decimal(lower)
return normalized[lower] + (normalized[upper] - normalized[lower]) * fraction
def _safe_ratio(numerator: Any, denominator: Any) -> Decimal:
denominator_value = _to_decimal(denominator)
if denominator_value <= ZERO:
return ZERO
return (_to_decimal(numerator) / denominator_value).quantize(Decimal("0.0001"))
def _to_decimal(value: Any) -> Decimal:
try:
return Decimal(str(value or "0"))
except Exception:
return ZERO
def _clamp_score(value: Any) -> int:
try:
numeric = Decimal(str(value))
except Exception:
numeric = ZERO
return max(0, min(100, int(numeric.quantize(ONE, rounding=ROUND_HALF_UP))))
def _same(left: Any, right: Any) -> bool:
return _canonical_key(left) == _canonical_key(right)
def _same_employee(left: RiskGraphClaimSnapshot, right: RiskGraphClaimSnapshot) -> bool:
left_key = left.employee_id or left.employee_name
right_key = right.employee_id or right.employee_name
return bool(left_key and _same(left_key, right_key))
def _canonical_key(value: Any) -> str:
return "_".join(str(value or "").strip().lower().split())
def _dedupe_signals(signals: list[NormalizedRiskSignal]) -> list[NormalizedRiskSignal]:
by_code: dict[str, NormalizedRiskSignal] = {}
for signal in signals:
current = by_code.get(signal.code)
if current is None or signal.score > current.score:
by_code[signal.code] = signal
return list(by_code.values())
def _merge_nodes(
first: list[RiskGraphNode],
second: list[RiskGraphNode],
) -> list[RiskGraphNode]:
by_key = {node.key: node for node in first}
for node in second:
by_key.setdefault(node.key, node)
return list(by_key.values())
def _merge_edges(
first: list[RiskGraphEdge],
second: list[RiskGraphEdge],
) -> list[RiskGraphEdge]:
by_key = {edge.edge_key(): edge for edge in first}
for edge in second:
by_key.setdefault(edge.edge_key(), edge)
return list(by_key.values())

View File

@@ -0,0 +1,113 @@
"""Canonical entity resolution for financial risk graph subjects."""
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Any
ENTITY_TYPE_ALIASES = {
"supplier": "vendor",
"merchant": "vendor",
"hotel": "vendor",
"bank_account_name": "bank_account",
"employee_name": "employee",
}
@dataclass(slots=True)
class CanonicalEntity:
canonical_id: str
entity_type: str
canonical_key: str
label: str
aliases: list[str] = field(default_factory=list)
source: str = ""
confirmed_by: str = ""
confirmed_at: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"canonical_id": self.canonical_id,
"entity_type": self.entity_type,
"canonical_key": self.canonical_key,
"label": self.label,
"aliases": list(self.aliases),
"source": self.source,
"confirmed_by": self.confirmed_by,
"confirmed_at": self.confirmed_at,
"metadata": dict(self.metadata),
}
class FinancialEntityResolver:
def resolve(
self,
entity_type: str,
value: str,
*,
source: str = "",
metadata: dict[str, Any] | None = None,
) -> CanonicalEntity | None:
canonical_type = ENTITY_TYPE_ALIASES.get(_canonical_token(entity_type), _canonical_token(entity_type))
canonical_key = _canonical_value(value)
if not canonical_type or not canonical_key:
return None
canonical_id = _canonical_id(canonical_type, canonical_key)
return CanonicalEntity(
canonical_id=canonical_id,
entity_type=canonical_type,
canonical_key=canonical_key,
label=str(value or "").strip(),
aliases=[str(value or "").strip()],
source=source,
metadata=metadata or {},
)
class CanonicalEntityRegistry:
def __init__(self) -> None:
self._entities: dict[str, CanonicalEntity] = {}
def upsert(self, entity: CanonicalEntity) -> CanonicalEntity:
current = self._entities.get(entity.canonical_id)
if current is None:
self._entities[entity.canonical_id] = entity
return entity
aliases = list(dict.fromkeys([*current.aliases, *entity.aliases]))
current.aliases = aliases
current.metadata.update(entity.metadata)
return current
def confirm(self, canonical_id: str, *, actor: str) -> CanonicalEntity | None:
entity = self._entities.get(canonical_id)
if entity is None:
return None
entity.confirmed_by = str(actor or "").strip()
entity.confirmed_at = datetime.now(UTC).isoformat()
return entity
def get(self, canonical_id: str) -> CanonicalEntity | None:
return self._entities.get(canonical_id)
def all(self) -> list[CanonicalEntity]:
return list(self._entities.values())
def _canonical_id(entity_type: str, canonical_key: str) -> str:
digest = hashlib.sha1(f"{entity_type}:{canonical_key}".encode("utf-8")).hexdigest()[:12]
return f"{entity_type}:{digest}"
def _canonical_token(value: str) -> str:
return "_".join(str(value or "").strip().lower().split())
def _canonical_value(value: str) -> str:
normalized = str(value or "").strip().lower()
normalized = re.sub(r"[\s\-_/,.。()()【】\[\]]+", "", normalized)
return normalized

View File

@@ -0,0 +1,71 @@
"""Replayable evaluation cases for the financial risk graph algorithm."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True, slots=True)
class RiskEvaluationCase:
case_id: str
category: str
expected_signal: str
expected_level: str
description: str
payload: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"case_id": self.case_id,
"category": self.category,
"expected_signal": self.expected_signal,
"expected_level": self.expected_level,
"description": self.description,
"payload": dict(self.payload),
}
def default_risk_evaluation_cases() -> list[RiskEvaluationCase]:
return [
RiskEvaluationCase(
case_id="positive_duplicate_invoice_high",
category="positive",
expected_signal="duplicate_invoice",
expected_level="high",
description="重复发票叠加高金额偏离,应输出高风险观察。",
payload={"risk_flags": ["duplicate_invoice"], "invoice_reuse": True},
),
RiskEvaluationCase(
case_id="negative_clean_low_amount",
category="negative",
expected_signal="none",
expected_level="none",
description="低金额、无规则命中、无图谱异常,不应输出风险观察。",
payload={"amount": 300, "risk_flags": []},
),
RiskEvaluationCase(
case_id="counterfactual_invoice_corrected",
category="counterfactual",
expected_signal="none",
expected_level="none",
description="重复票据被替换为唯一票据后,风险应消失或降级。",
payload={"remove_duplicate_invoice": True},
),
RiskEvaluationCase(
case_id="noise_missing_employee",
category="noise",
expected_signal="preapproval_absent",
expected_level="medium",
description="缺失员工信息时允许候选观察,但不能输出强风控结论。",
payload={"missing_fields": ["employee"], "score_cap": 69},
),
RiskEvaluationCase(
case_id="historical_false_positive_calibration",
category="historical_false_positive",
expected_signal="duplicate_invoice",
expected_level="medium",
description="历史误报率较高时进入校准抽审,不直接强拦截。",
payload={"false_positive_rate": 0.35},
),
]

View File

@@ -0,0 +1,144 @@
"""Feature extraction for heterogeneous financial risk graphs."""
from __future__ import annotations
from collections import Counter, defaultdict, deque
from dataclasses import dataclass, field
from typing import Any
from .models import RiskGraphEdge, RiskGraphNode
@dataclass(slots=True)
class RiskGraphFeatureSet:
node_type_counts: dict[str, int] = field(default_factory=dict)
edge_type_counts: dict[str, int] = field(default_factory=dict)
meta_path_counts: dict[str, int] = field(default_factory=dict)
degree_centrality: dict[str, float] = field(default_factory=dict)
clusters: list[dict[str, Any]] = field(default_factory=list)
neighbor_risk_density: dict[str, float] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"node_type_counts": dict(self.node_type_counts),
"edge_type_counts": dict(self.edge_type_counts),
"meta_path_counts": dict(self.meta_path_counts),
"degree_centrality": dict(self.degree_centrality),
"clusters": list(self.clusters),
"neighbor_risk_density": dict(self.neighbor_risk_density),
}
class HeterogeneousRiskGraphFeatureBuilder:
def build(
self,
nodes: list[RiskGraphNode],
edges: list[RiskGraphEdge],
*,
risk_node_keys: set[str] | None = None,
) -> RiskGraphFeatureSet:
node_by_key = {node.key: node for node in nodes}
adjacency = _build_adjacency(edges)
risk_keys = set(risk_node_keys or set())
return RiskGraphFeatureSet(
node_type_counts=dict(Counter(node.node_type for node in nodes)),
edge_type_counts=dict(Counter(edge.edge_type for edge in edges)),
meta_path_counts=_meta_path_counts(node_by_key, adjacency),
degree_centrality=_degree_centrality(node_by_key, adjacency),
clusters=_clusters(node_by_key, adjacency),
neighbor_risk_density=_neighbor_risk_density(node_by_key, adjacency, risk_keys),
)
def _build_adjacency(edges: list[RiskGraphEdge]) -> dict[str, list[tuple[str, str]]]:
adjacency: dict[str, list[tuple[str, str]]] = defaultdict(list)
for edge in edges:
adjacency[edge.source_key].append((edge.target_key, edge.edge_type))
adjacency[edge.target_key].append((edge.source_key, edge.edge_type))
return adjacency
def _meta_path_counts(
node_by_key: dict[str, RiskGraphNode],
adjacency: dict[str, list[tuple[str, str]]],
) -> dict[str, int]:
counts: Counter[str] = Counter()
for source_key, first_hops in adjacency.items():
source = node_by_key.get(source_key)
if source is None:
continue
for middle_key, first_edge_type in first_hops:
middle = node_by_key.get(middle_key)
if middle is None:
continue
for target_key, second_edge_type in adjacency.get(middle_key, []):
if target_key == source_key:
continue
target = node_by_key.get(target_key)
if target is None:
continue
key = (
f"{source.node_type}->{first_edge_type}->{middle.node_type}"
f"->{second_edge_type}->{target.node_type}"
)
counts[key] += 1
return dict(counts)
def _degree_centrality(
node_by_key: dict[str, RiskGraphNode],
adjacency: dict[str, list[tuple[str, str]]],
) -> dict[str, float]:
denominator = max(1, len(node_by_key) - 1)
return {
node_key: round(len(adjacency.get(node_key, [])) / denominator, 4)
for node_key in node_by_key
}
def _clusters(
node_by_key: dict[str, RiskGraphNode],
adjacency: dict[str, list[tuple[str, str]]],
) -> list[dict[str, Any]]:
visited: set[str] = set()
clusters: list[dict[str, Any]] = []
for start_key in node_by_key:
if start_key in visited:
continue
queue: deque[str] = deque([start_key])
visited.add(start_key)
members: list[str] = []
type_counts: Counter[str] = Counter()
while queue:
node_key = queue.popleft()
members.append(node_key)
type_counts[node_by_key[node_key].node_type] += 1
for next_key, _ in adjacency.get(node_key, []):
if next_key in visited or next_key not in node_by_key:
continue
visited.add(next_key)
queue.append(next_key)
clusters.append(
{
"size": len(members),
"node_keys": sorted(members),
"node_type_counts": dict(type_counts),
}
)
return sorted(clusters, key=lambda item: item["size"], reverse=True)
def _neighbor_risk_density(
node_by_key: dict[str, RiskGraphNode],
adjacency: dict[str, list[tuple[str, str]]],
risk_keys: set[str],
) -> dict[str, float]:
density: dict[str, float] = {}
for node_key in node_by_key:
neighbors = [target for target, _ in adjacency.get(node_key, [])]
if not neighbors:
density[node_key] = 0.0
continue
risk_neighbor_count = sum(1 for target in neighbors if target in risk_keys)
density[node_key] = round(risk_neighbor_count / len(neighbors), 4)
return density

View File

@@ -0,0 +1,307 @@
"""Graph construction helpers for expense risk analysis."""
from __future__ import annotations
from decimal import Decimal
from .models import RiskGraphClaimSnapshot, RiskGraphEdge, RiskGraphNode
ALLOWED_EDGE_TYPES = {
"department_has_employee",
"employee_submits_claim",
"claim_has_item",
"claim_expense_type",
"claim_location",
"claim_invoice",
"claim_has_risk_signal",
"claim_similar_to",
"claim_duplicate_invoice",
"ontology_extracts",
"ontology_constrains",
"ontology_signals",
}
def build_claim_graph(
claims: list[RiskGraphClaimSnapshot],
) -> tuple[list[RiskGraphNode], list[RiskGraphEdge]]:
nodes: dict[str, RiskGraphNode] = {}
edges: dict[tuple[str, str, str], RiskGraphEdge] = {}
for claim in claims:
claim_key = claim_node_key(claim)
_add_node(
nodes,
RiskGraphNode(
key=claim_key,
node_type="claim",
label=claim.claim_no or claim.claim_id,
canonical_key=claim_key,
canonical_id=claim.claim_id or claim.claim_no,
metadata={
"claim_id": claim.claim_id,
"amount": str(_to_decimal(claim.amount)),
"expense_type": claim.expense_type,
"status": claim.status,
},
),
)
employee_key = employee_node_key(claim)
if employee_key:
_add_node(
nodes,
RiskGraphNode(
key=employee_key,
node_type="employee",
label=claim.employee_name or claim.employee_id or "unknown",
canonical_key=employee_key,
canonical_id=claim.employee_id or claim.employee_name,
metadata={"employee_id": claim.employee_id, "grade": claim.employee_grade},
),
)
_add_edge(
edges,
RiskGraphEdge(
source_key=employee_key,
target_key=claim_key,
edge_type="employee_submits_claim",
metadata={"amount": str(_to_decimal(claim.amount))},
),
)
department_key = department_node_key(claim)
if department_key:
_add_node(
nodes,
RiskGraphNode(
key=department_key,
node_type="department",
label=claim.department_name or claim.department_id or "unknown",
canonical_key=department_key,
canonical_id=claim.department_id or claim.department_name,
metadata={"department_id": claim.department_id},
),
)
if employee_key:
_add_edge(
edges,
RiskGraphEdge(
source_key=department_key,
target_key=employee_key,
edge_type="department_has_employee",
),
)
expense_key = expense_type_node_key(claim.expense_type)
if expense_key:
_add_node(
nodes,
RiskGraphNode(
key=expense_key,
node_type="expense_type",
label=claim.expense_type,
canonical_key=expense_key,
canonical_id=claim.expense_type,
),
)
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_key,
target_key=expense_key,
edge_type="claim_expense_type",
),
)
location_key = location_node_key(claim.location)
if location_key:
_add_node(
nodes,
RiskGraphNode(
key=location_key,
node_type="location",
label=claim.location,
canonical_key=location_key,
canonical_id=claim.location,
),
)
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_key,
target_key=location_key,
edge_type="claim_location",
),
)
for item in claim.items:
item_key = f"claim_item:{item.item_id}" if item.item_id else ""
if item_key:
_add_node(
nodes,
RiskGraphNode(
key=item_key,
node_type="claim_item",
label=item.item_type or item.item_id,
canonical_key=item_key,
canonical_id=item.item_id,
metadata={
"amount": str(_to_decimal(item.item_amount)),
"location": item.item_location,
"invoice_id": item.invoice_id,
},
),
)
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_key,
target_key=item_key,
edge_type="claim_has_item",
),
)
if item.invoice_id:
invoice_key = invoice_node_key(item.invoice_id)
_add_node(
nodes,
RiskGraphNode(
key=invoice_key,
node_type="invoice",
label=item.invoice_id,
canonical_key=invoice_key,
canonical_id=item.invoice_id,
),
)
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_key,
target_key=invoice_key,
edge_type="claim_invoice",
),
)
_link_duplicate_invoices(claims, edges)
_link_similar_claims(claims, edges)
return list(nodes.values()), list(edges.values())
def claim_node_key(claim: RiskGraphClaimSnapshot) -> str:
return f"claim:{claim.claim_id or claim.claim_no}"
def employee_node_key(claim: RiskGraphClaimSnapshot) -> str:
identifier = claim.employee_id or claim.employee_name
return f"employee:{_canonical_key(identifier)}" if identifier else ""
def department_node_key(claim: RiskGraphClaimSnapshot) -> str:
identifier = claim.department_id or claim.department_name
return f"department:{_canonical_key(identifier)}" if identifier else ""
def expense_type_node_key(expense_type: str) -> str:
return f"expense_type:{_canonical_key(expense_type)}" if str(expense_type or "").strip() else ""
def location_node_key(location: str) -> str:
return f"location:{_canonical_key(location)}" if str(location or "").strip() else ""
def invoice_node_key(invoice_id: str) -> str:
return f"invoice:{_canonical_key(invoice_id)}"
def _link_duplicate_invoices(
claims: list[RiskGraphClaimSnapshot],
edges: dict[tuple[str, str, str], RiskGraphEdge],
) -> None:
by_invoice: dict[str, list[RiskGraphClaimSnapshot]] = {}
for claim in claims:
for item in claim.items:
if item.invoice_id:
by_invoice.setdefault(item.invoice_id, []).append(claim)
for invoice_id, invoice_claims in by_invoice.items():
unique_claims = {claim.claim_id: claim for claim in invoice_claims}
if len(unique_claims) < 2:
continue
claim_list = list(unique_claims.values())
for source in claim_list:
for target in claim_list:
if source.claim_id == target.claim_id:
continue
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_node_key(source),
target_key=claim_node_key(target),
edge_type="claim_duplicate_invoice",
weight=Decimal("2"),
evidence=f"invoice:{invoice_id}",
),
)
def _link_similar_claims(
claims: list[RiskGraphClaimSnapshot],
edges: dict[tuple[str, str, str], RiskGraphEdge],
) -> None:
for index, source in enumerate(claims):
for target in claims[index + 1 :]:
if not _is_similar_claim(source, target):
continue
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_node_key(source),
target_key=claim_node_key(target),
edge_type="claim_similar_to",
weight=Decimal("0.7"),
metadata={"reason": "same employee and expense type"},
),
)
_add_edge(
edges,
RiskGraphEdge(
source_key=claim_node_key(target),
target_key=claim_node_key(source),
edge_type="claim_similar_to",
weight=Decimal("0.7"),
metadata={"reason": "same employee and expense type"},
),
)
def _is_similar_claim(source: RiskGraphClaimSnapshot, target: RiskGraphClaimSnapshot) -> bool:
source_employee = source.employee_id or source.employee_name
target_employee = target.employee_id or target.employee_name
if not source_employee or _canonical_key(source_employee) != _canonical_key(target_employee):
return False
if _canonical_key(source.expense_type) != _canonical_key(target.expense_type):
return False
if source.occurred_at is None or target.occurred_at is None:
return True
return abs((source.occurred_at.date() - target.occurred_at.date()).days) <= 30
def _add_node(nodes: dict[str, RiskGraphNode], node: RiskGraphNode) -> None:
nodes.setdefault(node.key, node)
def _add_edge(edges: dict[tuple[str, str, str], RiskGraphEdge], edge: RiskGraphEdge) -> None:
if edge.edge_type not in ALLOWED_EDGE_TYPES:
return
edges.setdefault(edge.edge_key(), edge)
def _canonical_key(value: str | None) -> str:
return "_".join(str(value or "").strip().lower().split())
def _to_decimal(value: object) -> Decimal:
try:
return Decimal(str(value or "0"))
except Exception:
return Decimal("0")

View File

@@ -0,0 +1,103 @@
"""Data lineage contracts for risk graph observations."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass(slots=True)
class RiskDataLineage:
observation_key: str
data_tables: list[str] = field(default_factory=list)
document_ids: list[str] = field(default_factory=list)
ocr_job_ids: list[str] = field(default_factory=list)
agent_run_ids: list[str] = field(default_factory=list)
tool_call_ids: list[str] = field(default_factory=list)
rule_versions: list[str] = field(default_factory=list)
ontology_version: str = ""
algorithm_version: str = ""
source_event_ids: list[str] = field(default_factory=list)
quality_gates: list[str] = field(default_factory=list)
def as_dict(self) -> dict[str, Any]:
return {
"observation_key": self.observation_key,
"data_tables": list(self.data_tables),
"document_ids": list(self.document_ids),
"ocr_job_ids": list(self.ocr_job_ids),
"agent_run_ids": list(self.agent_run_ids),
"tool_call_ids": list(self.tool_call_ids),
"rule_versions": list(self.rule_versions),
"ontology_version": self.ontology_version,
"algorithm_version": self.algorithm_version,
"source_event_ids": list(self.source_event_ids),
"quality_gates": list(self.quality_gates),
}
class RiskDataLineageBuilder:
def build_from_observation(
self,
observation: dict[str, Any],
*,
source_event_ids: list[str] | None = None,
) -> RiskDataLineage:
evidence = [item for item in observation.get("evidence", []) if isinstance(item, dict)]
ontology_json = observation.get("ontology_json") or {}
decision_trace = observation.get("decision_trace") or {}
data_tables = ["risk_observations"]
if observation.get("claim_id"):
data_tables.extend(["expense_claims", "expense_claim_items"])
if evidence:
data_tables.append("risk_observation_evidence")
return RiskDataLineage(
observation_key=str(observation.get("observation_key") or ""),
data_tables=_unique(data_tables),
document_ids=_evidence_values(evidence, ["document_id", "doc_id", "file_id"]),
ocr_job_ids=_evidence_values(evidence, ["ocr_job_id", "ocr_run_id"]),
agent_run_ids=_unique(
[
str(observation.get("run_id") or "").strip(),
str(decision_trace.get("agent_run_id") or "").strip(),
]
),
tool_call_ids=_evidence_values(evidence, ["tool_call_id"]),
rule_versions=_unique(
[
*_evidence_values(evidence, ["rule_version"]),
str(decision_trace.get("rule_version") or "").strip(),
]
),
ontology_version=str(ontology_json.get("ontology_version") or "").strip(),
algorithm_version=str(observation.get("algorithm_version") or "").strip(),
source_event_ids=_unique(source_event_ids or []),
quality_gates=_quality_gates(decision_trace),
)
def _evidence_values(evidence: list[dict[str, Any]], keys: list[str]) -> list[str]:
values: list[str] = []
for item in evidence:
metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
for key in keys:
value = str(item.get(key) or metadata.get(key) or "").strip()
if value:
values.append(value)
return _unique(values)
def _quality_gates(decision_trace: dict[str, Any]) -> list[str]:
gates = [
str(decision_trace.get("evidence_source_gate") or "").strip(),
str(decision_trace.get("data_quality_gate") or "").strip(),
]
sampling = decision_trace.get("sampling_strategy")
if isinstance(sampling, dict):
gates.append(str(sampling.get("strategy") or "").strip())
return _unique([item for item in gates if item and item != "passed"])
def _unique(values: list[str]) -> list[str]:
return list(dict.fromkeys(str(item).strip() for item in values if str(item).strip()))

View File

@@ -0,0 +1,365 @@
"""Data contracts for the financial risk graph algorithm."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date, datetime
from decimal import Decimal
from typing import Any
ALGORITHM_VERSION = "financial_risk_graph.v1"
LEVEL_LOW = "low"
LEVEL_MEDIUM = "medium"
LEVEL_HIGH = "high"
LEVEL_CRITICAL = "critical"
AUTOMATION_ASSIST = "assist"
AUTOMATION_MANUAL_REVIEW = "manual_review"
AUTOMATION_SEMI_AUTO_REVIEW = "semi_auto_review"
AUTOMATION_AUTO_HOLD = "auto_hold"
@dataclass(slots=True)
class RiskGraphClaimItemSnapshot:
item_id: str = ""
item_type: str = ""
item_amount: Any = Decimal("0")
item_location: str = ""
item_date: date | None = None
invoice_id: str | None = None
metadata: dict[str, Any] = field(default_factory=dict)
@classmethod
def from_orm(cls, item: Any) -> "RiskGraphClaimItemSnapshot":
return cls(
item_id=str(getattr(item, "id", "") or ""),
item_type=str(getattr(item, "item_type", "") or ""),
item_amount=getattr(item, "item_amount", Decimal("0")) or Decimal("0"),
item_location=str(getattr(item, "item_location", "") or ""),
item_date=getattr(item, "item_date", None),
invoice_id=(
str(getattr(item, "invoice_id", "") or "").strip()
or None
),
metadata=_metadata_from_object(item),
)
@dataclass(slots=True)
class RiskGraphClaimSnapshot:
claim_id: str
claim_no: str = ""
employee_id: str | None = None
employee_name: str = ""
department_id: str | None = None
department_name: str = ""
employee_grade: str | None = None
expense_type: str = ""
amount: Any = Decimal("0")
currency: str = "CNY"
invoice_count: int = 0
occurred_at: datetime | None = None
submitted_at: datetime | None = None
status: str = ""
reason: str = ""
location: str = ""
risk_flags: list[Any] = field(default_factory=list)
items: list[RiskGraphClaimItemSnapshot] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict)
@classmethod
def from_orm(cls, claim: Any) -> "RiskGraphClaimSnapshot":
items = [
RiskGraphClaimItemSnapshot.from_orm(item)
for item in list(getattr(claim, "items", None) or [])
]
return cls(
claim_id=str(getattr(claim, "id", "") or ""),
claim_no=str(getattr(claim, "claim_no", "") or ""),
employee_id=(
str(getattr(claim, "employee_id", "") or "").strip()
or None
),
employee_name=str(getattr(claim, "employee_name", "") or ""),
department_id=(
str(getattr(claim, "department_id", "") or "").strip()
or None
),
department_name=str(getattr(claim, "department_name", "") or ""),
employee_grade=(
str(getattr(claim, "employee_grade", "") or "").strip()
or None
),
expense_type=str(getattr(claim, "expense_type", "") or ""),
amount=getattr(claim, "amount", Decimal("0")) or Decimal("0"),
currency=str(getattr(claim, "currency", "CNY") or "CNY"),
invoice_count=int(getattr(claim, "invoice_count", 0) or 0),
occurred_at=getattr(claim, "occurred_at", None),
submitted_at=getattr(claim, "submitted_at", None),
status=str(getattr(claim, "status", "") or ""),
reason=str(getattr(claim, "reason", "") or ""),
location=str(getattr(claim, "location", "") or ""),
risk_flags=list(getattr(claim, "risk_flags_json", None) or []),
items=items,
metadata=_metadata_from_object(claim),
)
@dataclass(slots=True)
class RiskGraphNode:
key: str
node_type: str
label: str
canonical_key: str = ""
canonical_id: str = ""
ontology_type: str = ""
ontology_parse_id: str = ""
ontology_version: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"key": self.key,
"node_type": self.node_type,
"label": self.label,
"canonical_key": self.canonical_key or self.key,
"canonical_id": self.canonical_id or self.canonical_key or self.key,
"ontology_type": self.ontology_type or self.node_type,
"ontology_parse_id": self.ontology_parse_id,
"ontology_version": self.ontology_version,
"metadata": _json_safe(self.metadata),
}
@dataclass(slots=True)
class RiskGraphEdge:
source_key: str
target_key: str
edge_type: str
weight: Decimal = Decimal("1")
source: str = "algorithm"
evidence: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
def edge_key(self) -> tuple[str, str, str]:
return (self.source_key, self.target_key, self.edge_type)
def as_dict(self) -> dict[str, Any]:
return {
"source_key": self.source_key,
"target_key": self.target_key,
"edge_type": self.edge_type,
"weight": _format_decimal(self.weight),
"source": self.source,
"evidence": self.evidence,
"metadata": _json_safe(self.metadata),
}
@dataclass(slots=True)
class PeerBaseline:
scope: str
sample_size: int
median_amount: Decimal = Decimal("0")
p75_amount: Decimal = Decimal("0")
p90_amount: Decimal = Decimal("0")
mean_amount: Decimal = Decimal("0")
fallback_reason: str = ""
def as_dict(self) -> dict[str, Any]:
return {
"scope": self.scope,
"sample_size": self.sample_size,
"median_amount": _format_decimal(self.median_amount),
"p75_amount": _format_decimal(self.p75_amount),
"p90_amount": _format_decimal(self.p90_amount),
"mean_amount": _format_decimal(self.mean_amount),
"fallback_reason": self.fallback_reason,
}
@dataclass(slots=True)
class RiskEvidence:
code: str
title: str
detail: str
source: str
score: int = 0
related_entity_keys: list[str] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"code": self.code,
"title": self.title,
"detail": self.detail,
"source": self.source,
"score": int(self.score),
"related_entity_keys": list(self.related_entity_keys),
"metadata": _json_safe(self.metadata),
}
@dataclass(slots=True)
class RiskHistoryStats:
risk_signal: str
expense_type: str = ""
similar_case_count: int = 0
confirmed_count: int = 0
false_positive_count: int = 0
returned_count: int = 0
def as_dict(self) -> dict[str, Any]:
return {
"risk_signal": self.risk_signal,
"expense_type": self.expense_type,
"similar_case_count": self.similar_case_count,
"confirmed_count": self.confirmed_count,
"false_positive_count": self.false_positive_count,
"returned_count": self.returned_count,
}
@dataclass(slots=True)
class RiskGraphEvaluationContext:
claims: list[RiskGraphClaimSnapshot]
target_claim_ids: set[str] | None = None
ontology_parse: Any | None = None
ontology_parse_id: str = ""
ontology_version: str = "ontology.v1"
history_stats: list[RiskHistoryStats] = field(default_factory=list)
min_peer_sample_size: int = 3
observation_threshold: int = 31
near_threshold_amount: Decimal = Decimal("5000")
@dataclass(slots=True)
class RiskObservationDraft:
observation_key: str
subject_type: str
subject_key: str
subject_label: str
claim_id: str
claim_no: str
risk_type: str
risk_signal: str
title: str
description: str
risk_score: int
risk_level: str
confidence_score: Decimal
control_stage: str
control_mode: str
automation_mode: str
source: str
algorithm_version: str
contribution_scores: dict[str, int]
baseline: PeerBaseline
evidence: list[RiskEvidence] = field(default_factory=list)
graph_node_keys: list[str] = field(default_factory=list)
graph_edge_keys: list[dict[str, str]] = field(default_factory=list)
policy_refs: list[str] = field(default_factory=list)
similar_case_claim_ids: list[str] = field(default_factory=list)
ontology_json: dict[str, Any] = field(default_factory=dict)
decision_trace: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"observation_key": self.observation_key,
"subject_type": self.subject_type,
"subject_key": self.subject_key,
"subject_label": self.subject_label,
"claim_id": self.claim_id,
"claim_no": self.claim_no,
"risk_type": self.risk_type,
"risk_signal": self.risk_signal,
"title": self.title,
"description": self.description,
"risk_score": self.risk_score,
"risk_level": self.risk_level,
"confidence_score": _format_decimal(self.confidence_score),
"control_stage": self.control_stage,
"control_mode": self.control_mode,
"automation_mode": self.automation_mode,
"source": self.source,
"algorithm_version": self.algorithm_version,
"contribution_scores": dict(self.contribution_scores),
"baseline": self.baseline.as_dict(),
"evidence": [item.as_dict() for item in self.evidence],
"graph_node_keys": list(self.graph_node_keys),
"graph_edge_keys": list(self.graph_edge_keys),
"policy_refs": list(self.policy_refs),
"similar_case_claim_ids": list(self.similar_case_claim_ids),
"ontology_json": _json_safe(self.ontology_json),
"decision_trace": _json_safe(self.decision_trace),
}
@dataclass(slots=True)
class RiskGraphEvaluationResult:
observations: list[RiskObservationDraft]
nodes: list[RiskGraphNode]
edges: list[RiskGraphEdge]
algorithm_version: str = ALGORITHM_VERSION
def as_dict(self) -> dict[str, Any]:
return {
"algorithm_version": self.algorithm_version,
"observations": [item.as_dict() for item in self.observations],
"nodes": [item.as_dict() for item in self.nodes],
"edges": [item.as_dict() for item in self.edges],
"summary": {
"observation_count": len(self.observations),
"node_count": len(self.nodes),
"edge_count": len(self.edges),
"high_or_above_count": sum(
1
for item in self.observations
if item.risk_level in {LEVEL_HIGH, LEVEL_CRITICAL}
),
},
}
def _format_decimal(value: Any, places: str = "0.0000") -> str:
if value is None:
return "0"
if not isinstance(value, Decimal):
value = Decimal(str(value or "0"))
return format(value.quantize(Decimal(places)), "f").rstrip("0").rstrip(".") or "0"
def _json_safe(value: Any) -> Any:
if isinstance(value, Decimal):
return _format_decimal(value)
if isinstance(value, (datetime, date)):
return value.isoformat()
if isinstance(value, list):
return [_json_safe(item) for item in value]
if isinstance(value, tuple):
return [_json_safe(item) for item in value]
if isinstance(value, dict):
return {str(key): _json_safe(item) for key, item in value.items()}
return value
def _metadata_from_object(source: Any) -> dict[str, Any]:
metadata: dict[str, Any] = {}
for attr in (
"metadata",
"metadata_json",
"extra_json",
"supplier_id",
"supplier_name",
"vendor_id",
"vendor_name",
"merchant_id",
"merchant_name",
):
value = getattr(source, attr, None)
if isinstance(value, dict):
metadata.update(value)
elif attr != "metadata" and value not in (None, ""):
metadata[attr] = value
return metadata

View File

@@ -0,0 +1,270 @@
"""Ontology-to-risk-graph mapping utilities."""
from __future__ import annotations
from dataclasses import dataclass, field
from decimal import Decimal
from typing import Any
from .models import RiskGraphEdge, RiskGraphNode
from .signals import NormalizedRiskSignal, normalize_risk_signals
ONTOLOGY_NODE_TYPE_MAP = {
"expense_type": "expense_type",
"document_type": "document",
"employee": "employee",
"department": "department",
"vendor": "vendor",
"supplier": "vendor",
"merchant": "vendor",
"customer": "customer",
"risk_signal": "risk_signal",
"invoice": "invoice",
"claim": "claim",
}
ALLOWED_ONTOLOGY_EDGE_TYPES = {
"ontology_extracts",
"ontology_constrains",
"ontology_signals",
}
@dataclass(slots=True)
class OntologyRiskGraphMapping:
ontology_parse_id: str
ontology_version: str
domain: str
scenario: str
intent: str
confidence: Decimal
gate: str
nodes: list[RiskGraphNode] = field(default_factory=list)
edges: list[RiskGraphEdge] = field(default_factory=list)
risk_signals: list[NormalizedRiskSignal] = field(default_factory=list)
canonical_subject_key: str = ""
raw_payload: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"ontology_parse_id": self.ontology_parse_id,
"ontology_version": self.ontology_version,
"domain": self.domain,
"scenario": self.scenario,
"intent": self.intent,
"confidence": str(self.confidence),
"gate": self.gate,
"canonical_subject_key": self.canonical_subject_key,
"risk_signals": [item.as_dict() for item in self.risk_signals],
}
def map_ontology_to_risk_graph(
ontology: Any,
*,
ontology_parse_id: str = "",
ontology_version: str = "ontology.v1",
) -> OntologyRiskGraphMapping:
payload = _model_to_dict(ontology)
if not payload:
return OntologyRiskGraphMapping(
ontology_parse_id=ontology_parse_id,
ontology_version=ontology_version,
domain="unknown",
scenario="unknown",
intent="query",
confidence=Decimal("0"),
gate="candidate_only",
)
parse_id = str(
ontology_parse_id
or payload.get("ontology_parse_id")
or payload.get("parse_id")
or payload.get("run_id")
or "ontology_parse"
)
scenario = str(payload.get("scenario") or "unknown")
intent = str(payload.get("intent") or "query")
domain = str(payload.get("domain") or scenario)
confidence = _to_decimal(payload.get("confidence"))
gate = _gate_from_confidence(confidence)
nodes: list[RiskGraphNode] = [
RiskGraphNode(
key=f"ontology:{parse_id}",
node_type="ontology_parse",
label=parse_id,
canonical_key=f"ontology:{parse_id}",
canonical_id=parse_id,
ontology_type="ontology_parse",
ontology_parse_id=parse_id,
ontology_version=ontology_version,
metadata={
"scenario": scenario,
"intent": intent,
"domain": domain,
"confidence": str(confidence),
},
)
]
edges: list[RiskGraphEdge] = []
canonical_subject_key = ""
for entity in list(payload.get("entities") or []):
entity_payload = _model_to_dict(entity)
raw_type = str(entity_payload.get("type") or "").strip().lower()
node_type = ONTOLOGY_NODE_TYPE_MAP.get(raw_type, raw_type or "entity")
value = str(
entity_payload.get("normalized_value")
or entity_payload.get("value")
or ""
).strip()
if not value:
continue
key = f"{node_type}:{_canonical_key(value)}"
nodes.append(
RiskGraphNode(
key=key,
node_type=node_type,
label=value,
canonical_key=key,
canonical_id=_canonical_key(value),
ontology_type=raw_type or node_type,
ontology_parse_id=parse_id,
ontology_version=ontology_version,
metadata={
"role": entity_payload.get("role") or "target",
"confidence": entity_payload.get("confidence") or 0,
},
)
)
edges.append(
RiskGraphEdge(
source_key=f"ontology:{parse_id}",
target_key=key,
edge_type="ontology_extracts",
source="ontology",
metadata={"raw_type": raw_type},
)
)
if not canonical_subject_key and node_type in {"employee", "claim", "vendor"}:
canonical_subject_key = key
for constraint in list(payload.get("constraints") or []):
constraint_payload = _model_to_dict(constraint)
field = str(constraint_payload.get("field") or "").strip()
operator = str(constraint_payload.get("operator") or "").strip()
value = str(constraint_payload.get("value") or "").strip()
if not field or not value:
continue
key = f"constraint:{_canonical_key(field)}:{_canonical_key(value)}"
nodes.append(
RiskGraphNode(
key=key,
node_type="constraint",
label=f"{field} {operator} {value}".strip(),
canonical_key=key,
canonical_id=key,
ontology_type="constraint",
ontology_parse_id=parse_id,
ontology_version=ontology_version,
metadata=constraint_payload,
)
)
edges.append(
RiskGraphEdge(
source_key=f"ontology:{parse_id}",
target_key=key,
edge_type="ontology_constrains",
source="ontology",
)
)
risk_signals = normalize_risk_signals(list(payload.get("risk_flags") or []), source="ontology")
for signal in risk_signals:
key = f"risk_signal:{signal.code}"
nodes.append(
RiskGraphNode(
key=key,
node_type="risk_signal",
label=signal.label,
canonical_key=key,
canonical_id=signal.code,
ontology_type="risk_signal",
ontology_parse_id=parse_id,
ontology_version=ontology_version,
metadata={"severity": signal.severity, "score": signal.score},
)
)
edges.append(
RiskGraphEdge(
source_key=f"ontology:{parse_id}",
target_key=key,
edge_type="ontology_signals",
source="ontology",
metadata={"gate": gate},
)
)
return OntologyRiskGraphMapping(
ontology_parse_id=parse_id,
ontology_version=ontology_version,
domain=domain,
scenario=scenario,
intent=intent,
confidence=confidence,
gate=gate,
nodes=_dedupe_nodes(nodes),
edges=_dedupe_edges(edges),
risk_signals=risk_signals,
canonical_subject_key=canonical_subject_key,
raw_payload=payload,
)
def _model_to_dict(value: Any) -> dict[str, Any]:
if value is None:
return {}
if isinstance(value, dict):
return dict(value)
if hasattr(value, "model_dump"):
return dict(value.model_dump(mode="json"))
if hasattr(value, "dict"):
return dict(value.dict())
return {}
def _gate_from_confidence(confidence: Decimal) -> str:
if confidence >= Decimal("0.78"):
return "automatic"
if confidence >= Decimal("0.55"):
return "review"
return "candidate_only"
def _canonical_key(value: str) -> str:
return "_".join(str(value or "").strip().lower().split())
def _to_decimal(value: Any) -> Decimal:
try:
return Decimal(str(value or "0"))
except Exception:
return Decimal("0")
def _dedupe_nodes(nodes: list[RiskGraphNode]) -> list[RiskGraphNode]:
by_key: dict[str, RiskGraphNode] = {}
for node in nodes:
by_key.setdefault(node.key, node)
return list(by_key.values())
def _dedupe_edges(edges: list[RiskGraphEdge]) -> list[RiskGraphEdge]:
by_key: dict[tuple[str, str, str], RiskGraphEdge] = {}
for edge in edges:
if edge.edge_type not in ALLOWED_ONTOLOGY_EDGE_TYPES:
continue
by_key.setdefault(edge.edge_key(), edge)
return list(by_key.values())

View File

@@ -0,0 +1,86 @@
"""Output contract for finance policy knowledge organizing tasks."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True, slots=True)
class PolicySourceRef:
source_id: str
title: str
location: str = ""
page: str = ""
def as_dict(self) -> dict[str, Any]:
return {
"source_id": self.source_id,
"title": self.title,
"location": self.location,
"page": self.page,
}
@dataclass(frozen=True, slots=True)
class PolicyKnowledgeItem:
policy_ref: str
title: str
summary: str
expense_type: str = ""
control_stage: str = ""
trigger_conditions: list[str] = field(default_factory=list)
source_refs: list[PolicySourceRef] = field(default_factory=list)
review_status: str = "pending_review"
def as_dict(self) -> dict[str, Any]:
return {
"policy_ref": self.policy_ref,
"title": self.title,
"summary": self.summary,
"expense_type": self.expense_type,
"control_stage": self.control_stage,
"trigger_conditions": list(self.trigger_conditions),
"source_refs": [item.as_dict() for item in self.source_refs],
"review_status": self.review_status,
}
@dataclass(slots=True)
class PolicyKnowledgeOrganizingReport:
summary: str
categories: list[str] = field(default_factory=list)
knowledge_items: list[PolicyKnowledgeItem] = field(default_factory=list)
source_refs: list[PolicySourceRef] = field(default_factory=list)
open_questions: list[str] = field(default_factory=list)
next_actions: list[str] = field(default_factory=list)
def as_dict(self) -> dict[str, Any]:
return {
"summary": self.summary,
"categories": list(self.categories),
"knowledge_items": [item.as_dict() for item in self.knowledge_items],
"source_refs": [item.as_dict() for item in self.source_refs],
"open_questions": list(self.open_questions),
"next_actions": list(self.next_actions),
"risk_policy_refs": self.risk_policy_refs(),
}
def risk_policy_refs(self) -> list[str]:
return list(
dict.fromkeys(
item.policy_ref
for item in self.knowledge_items
if item.policy_ref and item.review_status in {"pending_review", "confirmed"}
)
)
def build_policy_ref(expense_type: str, signal: str, *, prefix: str = "policy") -> str:
expense = _token(expense_type) or "general"
risk_signal = _token(signal) or "control"
return f"{prefix}.{expense}.{risk_signal}"
def _token(value: str) -> str:
return "_".join(str(value or "").strip().lower().split())

View File

@@ -0,0 +1,325 @@
"""Object-centric process mining for financial risk events."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Any
from .models import RiskGraphClaimSnapshot
APPROVAL_EVENTS = {"approval_approved", "finance_approved", "claim_approved"}
PAYMENT_EVENTS = {"payment_requested", "payment_completed"}
RETURN_EVENTS = {"claim_returned", "approval_returned", "supplement_required"}
SUBMIT_EVENTS = {"claim_submitted", "application_submitted"}
@dataclass(slots=True)
class ObjectCentricEvent:
event_id: str
event_type: str
occurred_at: datetime
object_refs: dict[str, list[str]]
actor: str = ""
source: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"event_id": self.event_id,
"event_type": self.event_type,
"occurred_at": self.occurred_at.isoformat(),
"object_refs": {key: list(value) for key, value in self.object_refs.items()},
"actor": self.actor,
"source": self.source,
"metadata": dict(self.metadata),
}
@dataclass(slots=True)
class ConformanceRisk:
risk_code: str
title: str
detail: str
severity: str
related_event_ids: list[str] = field(default_factory=list)
object_refs: dict[str, list[str]] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"risk_code": self.risk_code,
"title": self.title,
"detail": self.detail,
"severity": self.severity,
"related_event_ids": list(self.related_event_ids),
"object_refs": {key: list(value) for key, value in self.object_refs.items()},
}
class ObjectCentricProcessMiner:
def build_from_claims(
self,
claims: list[RiskGraphClaimSnapshot],
) -> list[ObjectCentricEvent]:
events: list[ObjectCentricEvent] = []
for claim in claims:
events.extend(self._claim_events(claim))
return sorted(events, key=lambda item: (item.occurred_at, item.event_id))
def build_from_dicts(self, rows: list[dict[str, Any]]) -> list[ObjectCentricEvent]:
events: list[ObjectCentricEvent] = []
for index, row in enumerate(rows):
occurred_at = _datetime_from_value(row.get("occurred_at"))
if occurred_at is None:
continue
event_type = str(row.get("event_type") or "").strip()
if not event_type:
continue
events.append(
ObjectCentricEvent(
event_id=str(row.get("event_id") or f"event:{index}:{event_type}"),
event_type=event_type,
occurred_at=occurred_at,
object_refs=_normalize_object_refs(row.get("object_refs")),
actor=str(row.get("actor") or "").strip(),
source=str(row.get("source") or "").strip(),
metadata=dict(row.get("metadata") or {}),
)
)
return sorted(events, key=lambda item: (item.occurred_at, item.event_id))
def _claim_events(self, claim: RiskGraphClaimSnapshot) -> list[ObjectCentricEvent]:
object_refs = _claim_object_refs(claim)
events: list[ObjectCentricEvent] = []
occurred_at = claim.occurred_at or claim.submitted_at
if occurred_at:
events.append(
ObjectCentricEvent(
event_id=f"{claim.claim_id}:expense_occurred",
event_type="expense_occurred",
occurred_at=occurred_at,
object_refs=object_refs,
actor=claim.employee_id or claim.employee_name,
source="expense_claim",
metadata={"amount": str(claim.amount), "expense_type": claim.expense_type},
)
)
if claim.submitted_at:
events.append(
ObjectCentricEvent(
event_id=f"{claim.claim_id}:claim_submitted",
event_type="claim_submitted",
occurred_at=claim.submitted_at,
object_refs=object_refs,
actor=claim.employee_id or claim.employee_name,
source="expense_claim",
metadata={"status": claim.status},
)
)
for item in claim.items:
item_time = _datetime_from_value(item.item_date) or occurred_at or datetime.now(UTC)
item_refs = _merge_object_refs(
object_refs,
{
"claim_item": [item.item_id] if item.item_id else [],
"invoice": [item.invoice_id] if item.invoice_id else [],
},
)
events.append(
ObjectCentricEvent(
event_id=f"{claim.claim_id}:item:{item.item_id or len(events)}",
event_type="expense_item_recorded",
occurred_at=item_time,
object_refs=item_refs,
actor=claim.employee_id or claim.employee_name,
source="expense_item",
metadata={
"amount": str(item.item_amount),
"item_type": item.item_type,
"item_location": item.item_location,
},
)
)
if item.invoice_id:
events.append(
ObjectCentricEvent(
event_id=f"{claim.claim_id}:invoice:{item.invoice_id}",
event_type="invoice_attached",
occurred_at=item_time,
object_refs=item_refs,
actor=claim.employee_id or claim.employee_name,
source="invoice",
)
)
for index, flag in enumerate(claim.risk_flags):
signal = _risk_signal_from_flag(flag)
if not signal:
continue
events.append(
ObjectCentricEvent(
event_id=f"{claim.claim_id}:risk_flag:{index}:{signal}",
event_type="risk_flagged",
occurred_at=claim.submitted_at or occurred_at or datetime.now(UTC),
object_refs=object_refs,
source="risk_rule",
metadata={"risk_signal": signal, "raw": flag},
)
)
return events
class ConformanceRiskDetector:
def detect(self, events: list[ObjectCentricEvent]) -> list[ConformanceRisk]:
risks: list[ConformanceRisk] = []
for claim_key, claim_events in _events_by_object(events, "claim").items():
ordered = sorted(claim_events, key=lambda item: (item.occurred_at, item.event_id))
risks.extend(self._detect_claim_risks(claim_key, ordered))
return risks
def _detect_claim_risks(
self,
claim_key: str,
events: list[ObjectCentricEvent],
) -> list[ConformanceRisk]:
risks: list[ConformanceRisk] = []
event_types = [event.event_type for event in events]
first_submit = _first_event(events, SUBMIT_EVENTS)
first_approval = _first_event(events, APPROVAL_EVENTS)
first_payment = _first_event(events, PAYMENT_EVENTS)
if first_payment and (not first_approval or first_payment.occurred_at < first_approval.occurred_at):
related = [first_payment.event_id]
if first_approval:
related.append(first_approval.event_id)
risks.append(
ConformanceRisk(
risk_code="payment_before_approval",
title="Payment before approval",
detail="Payment event appears before an approval event.",
severity="critical",
related_event_ids=related,
object_refs={"claim": [claim_key]},
)
)
if first_approval and (not first_submit or first_approval.occurred_at < first_submit.occurred_at):
related = [first_approval.event_id]
if first_submit:
related.append(first_submit.event_id)
risks.append(
ConformanceRisk(
risk_code="approval_bypass",
title="Approval bypass",
detail="Approval appears before submission or without submission.",
severity="high",
related_event_ids=related,
object_refs={"claim": [claim_key]},
)
)
return_count = sum(1 for event_type in event_types if event_type in RETURN_EVENTS)
submit_count = sum(1 for event_type in event_types if event_type in SUBMIT_EVENTS)
if return_count >= 2 or (return_count >= 1 and submit_count >= 2):
risks.append(
ConformanceRisk(
risk_code="rework_loop",
title="Rework loop",
detail="Claim has repeated return and resubmission events.",
severity="medium",
related_event_ids=[
event.event_id
for event in events
if event.event_type in RETURN_EVENTS | SUBMIT_EVENTS
],
object_refs={"claim": [claim_key]},
)
)
if "invoice_attached" in event_types and not first_submit:
risks.append(
ConformanceRisk(
risk_code="process_bypass",
title="Process bypass",
detail="Invoice exists without a claim submission event.",
severity="medium",
related_event_ids=[
event.event_id for event in events if event.event_type == "invoice_attached"
],
object_refs={"claim": [claim_key]},
)
)
return risks
def _claim_object_refs(claim: RiskGraphClaimSnapshot) -> dict[str, list[str]]:
return {
"claim": [claim.claim_id] if claim.claim_id else [],
"employee": [claim.employee_id or claim.employee_name]
if claim.employee_id or claim.employee_name
else [],
"department": [claim.department_id or claim.department_name]
if claim.department_id or claim.department_name
else [],
"expense_type": [claim.expense_type] if claim.expense_type else [],
}
def _normalize_object_refs(value: Any) -> dict[str, list[str]]:
if not isinstance(value, dict):
return {}
normalized: dict[str, list[str]] = {}
for key, raw_items in value.items():
if isinstance(raw_items, list):
items = [str(item).strip() for item in raw_items if str(item).strip()]
else:
items = [str(raw_items).strip()] if str(raw_items or "").strip() else []
normalized[str(key).strip()] = list(dict.fromkeys(items))
return normalized
def _merge_object_refs(*refs: dict[str, list[str]]) -> dict[str, list[str]]:
merged: dict[str, list[str]] = {}
for ref in refs:
for key, values in ref.items():
bucket = merged.setdefault(key, [])
bucket.extend(str(value).strip() for value in values if str(value).strip())
return {key: list(dict.fromkeys(values)) for key, values in merged.items()}
def _events_by_object(
events: list[ObjectCentricEvent],
object_type: str,
) -> dict[str, list[ObjectCentricEvent]]:
grouped: dict[str, list[ObjectCentricEvent]] = {}
for event in events:
for object_key in event.object_refs.get(object_type, []):
grouped.setdefault(object_key, []).append(event)
return grouped
def _first_event(
events: list[ObjectCentricEvent],
event_types: set[str],
) -> ObjectCentricEvent | None:
for event in events:
if event.event_type in event_types:
return event
return None
def _risk_signal_from_flag(flag: Any) -> str:
if isinstance(flag, dict):
raw = flag.get("risk_signal") or flag.get("signal") or flag.get("rule_code") or flag.get("code")
else:
raw = flag
return "_".join(str(raw or "").strip().lower().split())
def _datetime_from_value(value: Any) -> datetime | None:
if isinstance(value, datetime):
return value
if hasattr(value, "year") and hasattr(value, "month") and hasattr(value, "day"):
return datetime(value.year, value.month, value.day, tzinfo=UTC)
if isinstance(value, str) and value.strip():
try:
return datetime.fromisoformat(value)
except ValueError:
return None
return None

View File

@@ -0,0 +1,259 @@
"""Profile baseline contracts for digital employee scans."""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass, field
from decimal import ROUND_CEILING, ROUND_FLOOR, Decimal
from typing import Any
from .models import ALGORITHM_VERSION, RiskGraphClaimSnapshot
ZERO = Decimal("0")
HUNDRED = Decimal("100")
BASELINE_ALGORITHM_VERSION = f"{ALGORITHM_VERSION}.profile_baselines.v1"
BASELINE_DIMENSIONS = ("employee", "department", "supplier", "expense_type")
SUPPLIER_ID_KEYS = ("supplier_id", "vendor_id", "merchant_id", "supplier_code")
SUPPLIER_NAME_KEYS = ("supplier_name", "vendor_name", "merchant_name", "supplier", "vendor", "merchant")
@dataclass(frozen=True, slots=True)
class ProfileBaselineBucket:
dimension: str
key: str
label: str
sample_size: int
claim_count: int
total_amount: Decimal
average_amount: Decimal
median_amount: Decimal
p75_amount: Decimal
p90_amount: Decimal
claim_ids: list[str] = field(default_factory=list)
def as_dict(self) -> dict[str, Any]:
return {
"dimension": self.dimension,
"key": self.key,
"label": self.label,
"sample_size": self.sample_size,
"claim_count": self.claim_count,
"total_amount": _format_decimal(self.total_amount),
"average_amount": _format_decimal(self.average_amount),
"median_amount": _format_decimal(self.median_amount),
"p75_amount": _format_decimal(self.p75_amount),
"p90_amount": _format_decimal(self.p90_amount),
"claim_ids": list(self.claim_ids),
}
@dataclass(frozen=True, slots=True)
class ProfileBaselineSnapshot:
algorithm_version: str
buckets: list[ProfileBaselineBucket] = field(default_factory=list)
@property
def dimension_counts(self) -> dict[str, int]:
counts = {dimension: 0 for dimension in BASELINE_DIMENSIONS}
for bucket in self.buckets:
counts[bucket.dimension] = counts.get(bucket.dimension, 0) + 1
return counts
def buckets_for(self, dimension: str) -> list[ProfileBaselineBucket]:
return [bucket for bucket in self.buckets if bucket.dimension == dimension]
def as_dict(self) -> dict[str, Any]:
return {
"algorithm_version": self.algorithm_version,
"dimension_counts": self.dimension_counts,
"bucket_count": len(self.buckets),
"buckets": [bucket.as_dict() for bucket in self.buckets],
}
class ProfileBaselineUpdater:
def build_from_claims(
self,
claims: list[RiskGraphClaimSnapshot],
) -> ProfileBaselineSnapshot:
grouped: dict[tuple[str, str], list[tuple[Decimal, str]]] = defaultdict(list)
labels: dict[tuple[str, str], str] = {}
for claim in claims:
self._add_claim_rows(grouped, labels, claim)
buckets = [
_build_bucket(dimension, key, labels[(dimension, key)], rows)
for (dimension, key), rows in grouped.items()
]
buckets.sort(key=lambda item: (item.dimension, -item.total_amount, item.key))
return ProfileBaselineSnapshot(
algorithm_version=BASELINE_ALGORITHM_VERSION,
buckets=buckets,
)
def _add_claim_rows(
self,
grouped: dict[tuple[str, str], list[tuple[Decimal, str]]],
labels: dict[tuple[str, str], str],
claim: RiskGraphClaimSnapshot,
) -> None:
amount = _to_decimal(claim.amount)
claim_id = claim.claim_id or claim.claim_no
_add_row(
grouped,
labels,
"employee",
claim.employee_id or claim.employee_name,
claim.employee_name or claim.employee_id,
amount,
claim_id,
)
_add_row(
grouped,
labels,
"department",
claim.department_id or claim.department_name,
claim.department_name or claim.department_id,
amount,
claim_id,
)
_add_row(
grouped,
labels,
"expense_type",
claim.expense_type,
claim.expense_type,
amount,
claim_id,
)
for supplier_key, supplier_label, supplier_amount in _supplier_rows(claim):
_add_row(
grouped,
labels,
"supplier",
supplier_key,
supplier_label,
supplier_amount,
claim_id,
)
def _build_bucket(
dimension: str,
key: str,
label: str,
rows: list[tuple[Decimal, str]],
) -> ProfileBaselineBucket:
amounts = [amount for amount, _claim_id in rows]
total = sum(amounts, ZERO)
sample_size = len(amounts)
claim_ids = sorted({claim_id for _amount, claim_id in rows if claim_id})
average = total / Decimal(sample_size) if sample_size else ZERO
return ProfileBaselineBucket(
dimension=dimension,
key=key,
label=label,
sample_size=sample_size,
claim_count=len(claim_ids),
total_amount=total,
average_amount=average,
median_amount=_percentile(amounts, 50),
p75_amount=_percentile(amounts, 75),
p90_amount=_percentile(amounts, 90),
claim_ids=claim_ids,
)
def _add_row(
grouped: dict[tuple[str, str], list[tuple[Decimal, str]]],
labels: dict[tuple[str, str], str],
dimension: str,
key_source: Any,
label_source: Any,
amount: Decimal,
claim_id: str,
) -> None:
key = _canonical_key(key_source)
if not key:
return
group_key = (dimension, key)
labels.setdefault(group_key, str(label_source or key_source or key).strip() or key)
grouped[group_key].append((amount, claim_id))
def _supplier_rows(claim: RiskGraphClaimSnapshot) -> list[tuple[str, str, Decimal]]:
item_rows: list[tuple[str, str, Decimal]] = []
for item in claim.items:
supplier = _extract_supplier(item.metadata)
if supplier is not None:
item_rows.append((*supplier, _to_decimal(item.item_amount)))
if item_rows:
return item_rows
supplier = _extract_supplier(claim.metadata) or _extract_supplier_from_flags(claim.risk_flags)
if supplier is None:
return []
return [(*supplier, _to_decimal(claim.amount))]
def _extract_supplier(metadata: Any) -> tuple[str, str] | None:
if not isinstance(metadata, dict):
return None
supplier_id = _first_text(metadata, SUPPLIER_ID_KEYS)
supplier_name = _first_text(metadata, SUPPLIER_NAME_KEYS)
key = supplier_id or supplier_name
if not key:
return None
return key, supplier_name or supplier_id or key
def _extract_supplier_from_flags(flags: list[Any]) -> tuple[str, str] | None:
for flag in flags or []:
if not isinstance(flag, dict):
continue
supplier = _extract_supplier(flag) or _extract_supplier(flag.get("metadata"))
if supplier is not None:
return supplier
return None
def _first_text(source: dict[str, Any], keys: tuple[str, ...]) -> str:
for key in keys:
value = str(source.get(key) or "").strip()
if value:
return value
return ""
def _percentile(values: list[Decimal], percent: int) -> Decimal:
normalized = sorted(value for value in values if value >= ZERO)
if not normalized:
return ZERO
if len(normalized) == 1:
return normalized[0]
position = Decimal(len(normalized) - 1) * Decimal(percent) / HUNDRED
lower = int(position.to_integral_value(rounding=ROUND_FLOOR))
upper = int(position.to_integral_value(rounding=ROUND_CEILING))
if lower == upper:
return normalized[lower]
fraction = position - Decimal(lower)
return normalized[lower] + (normalized[upper] - normalized[lower]) * fraction
def _to_decimal(value: Any) -> Decimal:
try:
return Decimal(str(value or "0"))
except Exception:
return ZERO
def _format_decimal(value: Any) -> str:
if not isinstance(value, Decimal):
value = _to_decimal(value)
return format(value.quantize(Decimal("0.0001")), "f").rstrip("0").rstrip(".") or "0"
def _canonical_key(value: Any) -> str:
return "_".join(str(value or "").strip().lower().split())

View File

@@ -0,0 +1,84 @@
"""Data quality gates for strong financial risk conclusions."""
from __future__ import annotations
from dataclasses import dataclass, field
from decimal import Decimal
from typing import Any
from .models import RiskGraphClaimSnapshot
@dataclass(slots=True)
class RiskDataQualityResult:
passed: bool
gate: str
max_risk_score: int
missing_fields: list[str] = field(default_factory=list)
warnings: list[str] = field(default_factory=list)
def as_dict(self) -> dict[str, Any]:
return {
"passed": self.passed,
"gate": self.gate,
"max_risk_score": self.max_risk_score,
"missing_fields": list(self.missing_fields),
"warnings": list(self.warnings),
}
class RiskDataQualityGate:
"""Prevent weak source data from becoming strong automated conclusions."""
def evaluate_claim(self, claim: RiskGraphClaimSnapshot) -> RiskDataQualityResult:
missing_fields: list[str] = []
warnings: list[str] = []
if not str(claim.claim_id or "").strip():
missing_fields.append("claim_id")
if not (str(claim.employee_id or "").strip() or str(claim.employee_name or "").strip()):
missing_fields.append("employee")
if _to_decimal(claim.amount) <= Decimal("0"):
missing_fields.append("amount")
if not str(claim.expense_type or "").strip():
warnings.append("expense_type")
if claim.invoice_count > 0 and not claim.items:
warnings.append("invoice_items")
if missing_fields:
return RiskDataQualityResult(
passed=False,
gate="capped_missing_required_fields",
max_risk_score=69,
missing_fields=missing_fields,
warnings=warnings,
)
if len(warnings) >= 2:
return RiskDataQualityResult(
passed=False,
gate="capped_low_context_quality",
max_risk_score=69,
warnings=warnings,
)
return RiskDataQualityResult(
passed=True,
gate="passed",
max_risk_score=100,
warnings=warnings,
)
def apply_score_cap(
self,
risk_score: int,
result: RiskDataQualityResult,
) -> tuple[int, str]:
if risk_score > result.max_risk_score:
return result.max_risk_score, result.gate
return risk_score, result.gate
def _to_decimal(value: Any) -> Decimal:
try:
return Decimal(str(value or "0"))
except Exception:
return Decimal("0")

View File

@@ -0,0 +1,93 @@
"""Replay-set contracts for risk graph algorithm evaluation."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
@dataclass(frozen=True, slots=True)
class AlgorithmReplayCase:
replay_case_id: str
claim_id: str
ontology_version: str
rule_version: str
algorithm_version: str
feedback_label: str
payload: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"replay_case_id": self.replay_case_id,
"claim_id": self.claim_id,
"ontology_version": self.ontology_version,
"rule_version": self.rule_version,
"algorithm_version": self.algorithm_version,
"feedback_label": self.feedback_label,
"payload": dict(self.payload),
}
@dataclass(slots=True)
class AlgorithmReplaySet:
replay_set_id: str
created_at: datetime
cases: list[AlgorithmReplayCase] = field(default_factory=list)
def as_dict(self) -> dict[str, Any]:
return {
"replay_set_id": self.replay_set_id,
"created_at": self.created_at.isoformat(),
"case_count": len(self.cases),
"cases": [item.as_dict() for item in self.cases],
}
class AlgorithmReplaySetBuilder:
def build_from_observations(
self,
replay_set_id: str,
observations: list[dict[str, Any]],
*,
created_at: datetime,
) -> AlgorithmReplaySet:
cases = [
self._case_from_observation(index, observation)
for index, observation in enumerate(observations, start=1)
]
return AlgorithmReplaySet(
replay_set_id=replay_set_id,
created_at=created_at,
cases=cases,
)
def _case_from_observation(
self,
index: int,
observation: dict[str, Any],
) -> AlgorithmReplayCase:
ontology = observation.get("ontology_json") or {}
trace = observation.get("decision_trace") or {}
return AlgorithmReplayCase(
replay_case_id=str(
observation.get("evaluation_case_id")
or trace.get("evaluation_case_id")
or f"replay:{index}:{observation.get('observation_key') or 'observation'}"
),
claim_id=str(observation.get("claim_id") or ""),
ontology_version=str(ontology.get("ontology_version") or ""),
rule_version=str(trace.get("rule_version") or ""),
algorithm_version=str(observation.get("algorithm_version") or ""),
feedback_label=str(
observation.get("feedback_status")
or observation.get("status")
or "unreviewed"
),
payload={
"risk_signal": observation.get("risk_signal"),
"risk_score": observation.get("risk_score"),
"risk_level": observation.get("risk_level"),
"decision_trace": trace,
},
)

View File

@@ -0,0 +1,106 @@
"""Candidate risk rule discovery from reviewed risk observations."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True, slots=True)
class CandidateRiskRule:
candidate_id: str
rule_code: str
title: str
risk_signal: str
evidence: list[dict[str, Any]]
source: str
confidence_score: float
status: str = "candidate_review"
def as_dict(self) -> dict[str, Any]:
return {
"candidate_id": self.candidate_id,
"rule_code": self.rule_code,
"title": self.title,
"risk_signal": self.risk_signal,
"evidence": list(self.evidence),
"source": self.source,
"confidence_score": self.confidence_score,
"status": self.status,
}
class CandidateRiskRuleDiscovery:
def discover_from_feedback(
self,
observations: list[dict[str, Any]],
feedback_items: list[dict[str, Any]],
) -> list[CandidateRiskRule]:
observation_by_key = {
str(item.get("observation_key") or item.get("id") or ""): item
for item in observations
}
candidates: list[CandidateRiskRule] = []
for feedback in feedback_items:
source = str(feedback.get("candidate_rule_source") or "").strip()
decision = str(feedback.get("decision") or feedback.get("feedback_type") or "").strip()
if source != "risk_observation_feedback" and "candidate" not in decision:
continue
observation_key = str(feedback.get("observation_key") or "").strip()
observation = observation_by_key.get(observation_key, {})
risk_signal = str(
feedback.get("risk_signal") or observation.get("risk_signal") or ""
).strip()
if not risk_signal:
continue
confidence = _confidence(feedback, observation)
candidates.append(
CandidateRiskRule(
candidate_id=f"candidate:{observation_key or risk_signal}:{risk_signal}",
rule_code=f"candidate.risk.{risk_signal}",
title=f"{risk_signal} candidate rule",
risk_signal=risk_signal,
evidence=_candidate_evidence(observation, feedback),
source=source or "risk_observation_feedback",
confidence_score=confidence,
)
)
return _dedupe_candidates(candidates)
def _confidence(feedback: dict[str, Any], observation: dict[str, Any]) -> float:
raw = feedback.get("confidence_score")
if raw in (None, ""):
raw = observation.get("confidence_score")
try:
return max(0.0, min(1.0, float(raw or 0.55)))
except (TypeError, ValueError):
return 0.55
def _candidate_evidence(
observation: dict[str, Any],
feedback: dict[str, Any],
) -> list[dict[str, Any]]:
evidence: list[dict[str, Any]] = []
for item in observation.get("evidence", []) or []:
if isinstance(item, dict):
evidence.append({"source": item.get("source") or "observation", **item})
evidence.append(
{
"source": feedback.get("candidate_rule_source") or "risk_observation_feedback",
"feedback_type": feedback.get("feedback_type"),
"action": feedback.get("action"),
"comment": feedback.get("comment"),
}
)
return evidence
def _dedupe_candidates(candidates: list[CandidateRiskRule]) -> list[CandidateRiskRule]:
by_code: dict[str, CandidateRiskRule] = {}
for candidate in candidates:
current = by_code.get(candidate.rule_code)
if current is None or candidate.confidence_score > current.confidence_score:
by_code[candidate.rule_code] = candidate
return list(by_code.values())

View File

@@ -0,0 +1,94 @@
"""Risk-based sampling strategy for audit review and replay."""
from __future__ import annotations
from dataclasses import dataclass
from decimal import Decimal
from typing import Any
from .models import RiskHistoryStats
@dataclass(slots=True)
class RiskSamplingDecision:
strategy: str
threshold: int
replay_bucket: str
audit_required: bool
reason: str
def as_dict(self) -> dict[str, Any]:
return {
"strategy": self.strategy,
"threshold": self.threshold,
"replay_bucket": self.replay_bucket,
"audit_required": self.audit_required,
"reason": self.reason,
}
class RiskSamplingPlanner:
def plan(
self,
*,
risk_score: int,
confidence: Decimal,
evidence_source_count: int,
data_quality_passed: bool = True,
data_quality_gate: str = "",
history: RiskHistoryStats | None = None,
) -> RiskSamplingDecision:
false_positive_rate = _false_positive_rate(history)
if not data_quality_passed:
return RiskSamplingDecision(
strategy="uncertainty_sample",
threshold=45,
replay_bucket="data_quality_gate",
audit_required=True,
reason=data_quality_gate or "data_quality_gate_not_passed",
)
if risk_score >= 90:
return RiskSamplingDecision(
strategy="mandatory_review",
threshold=90,
replay_bucket="critical_high_risk",
audit_required=True,
reason="risk_score_above_critical_threshold",
)
if risk_score >= 70:
return RiskSamplingDecision(
strategy="focused_review",
threshold=70,
replay_bucket="high_risk",
audit_required=True,
reason="risk_score_above_high_threshold",
)
if false_positive_rate >= Decimal("0.30"):
return RiskSamplingDecision(
strategy="calibration_sample",
threshold=45,
replay_bucket="false_positive_calibration",
audit_required=True,
reason="historical_false_positive_rate_high",
)
if confidence < Decimal("0.55") or evidence_source_count < 2:
return RiskSamplingDecision(
strategy="uncertainty_sample",
threshold=45,
replay_bucket="low_confidence",
audit_required=True,
reason="confidence_or_evidence_source_insufficient",
)
return RiskSamplingDecision(
strategy="monitor",
threshold=31,
replay_bucket="routine_monitoring",
audit_required=False,
reason="below_review_threshold",
)
def _false_positive_rate(history: RiskHistoryStats | None) -> Decimal:
if history is None or history.similar_case_count <= 0:
return Decimal("0")
return Decimal(history.false_positive_count) / Decimal(history.similar_case_count)

View File

@@ -0,0 +1,230 @@
"""Risk signal normalization shared by rules, ontology, and graph scoring."""
from __future__ import annotations
from dataclasses import dataclass
from decimal import Decimal
from typing import Any
SEVERITY_SCORE = {
"info": 12,
"low": 32,
"medium": 58,
"high": 82,
"critical": 100,
}
SIGNAL_ALIASES: dict[str, str] = {
"amount_over_limit": "amount_limit_exceeded",
"over_budget": "budget_overrun",
"budget_exceeded": "budget_overrun",
"duplicate_expense": "duplicate_invoice",
"duplicate_ticket": "duplicate_invoice",
"risk.invoice.duplicate_invoice": "duplicate_invoice",
"location_mismatch": "location_mismatch",
"city_mismatch": "location_mismatch",
"hotel_itinerary_mismatch": "hotel_itinerary_mismatch",
"date_outside_trip": "date_outside_trip",
"preapproval_absent": "preapproval_absent",
"application_fields_missing": "application_fields_missing",
"attachment_ocr_missing": "attachment_missing",
"missing_attachment": "attachment_missing",
"reason_too_brief": "reason_too_brief",
"vague_ticket_content": "vague_goods_description",
"personal_purpose": "personal_purpose",
"split_billing": "split_billing",
"frequency_anomaly": "frequency_anomaly",
"collusion": "cross_department_cluster",
"cross_department_cluster": "cross_department_cluster",
"buyer_name_mismatch": "buyer_name_mismatch",
"document_expense_mismatch": "document_expense_mismatch",
"void_or_red_invoice": "void_or_red_invoice",
"cross_year_invoice": "cross_year_invoice",
"entertainment_missing_detail": "entertainment_missing_detail",
}
SIGNAL_LABELS: dict[str, str] = {
"amount_limit_exceeded": "Amount limit exceeded",
"budget_overrun": "Budget overrun",
"duplicate_invoice": "Duplicate invoice",
"location_mismatch": "Location mismatch",
"hotel_itinerary_mismatch": "Hotel and itinerary mismatch",
"date_outside_trip": "Date outside approved trip",
"preapproval_absent": "Pre-approval missing",
"application_fields_missing": "Application fields missing",
"attachment_missing": "Attachment missing",
"reason_too_brief": "Reason too brief",
"vague_goods_description": "Vague goods description",
"personal_purpose": "Possible personal purpose",
"split_billing": "Split billing pattern",
"frequency_anomaly": "Frequency anomaly",
"cross_department_cluster": "Cross-department spending cluster",
"buyer_name_mismatch": "Buyer name mismatch",
"document_expense_mismatch": "Document and expense mismatch",
"void_or_red_invoice": "Void or red invoice",
"cross_year_invoice": "Cross-year invoice",
"entertainment_missing_detail": "Entertainment detail missing",
}
SIGNAL_DEFAULT_SEVERITY: dict[str, str] = {
"duplicate_invoice": "critical",
"personal_purpose": "high",
"preapproval_absent": "high",
"date_outside_trip": "high",
"amount_limit_exceeded": "high",
"budget_overrun": "high",
"split_billing": "high",
"cross_department_cluster": "high",
"location_mismatch": "medium",
"hotel_itinerary_mismatch": "medium",
"frequency_anomaly": "medium",
"buyer_name_mismatch": "medium",
"document_expense_mismatch": "medium",
"void_or_red_invoice": "high",
"cross_year_invoice": "medium",
"entertainment_missing_detail": "medium",
"application_fields_missing": "low",
"attachment_missing": "low",
"reason_too_brief": "low",
"vague_goods_description": "low",
}
POLICY_BOUND_SIGNALS = {
"amount_limit_exceeded",
"budget_overrun",
"preapproval_absent",
"date_outside_trip",
"hotel_itinerary_mismatch",
"location_mismatch",
"document_expense_mismatch",
"buyer_name_mismatch",
"entertainment_missing_detail",
"application_fields_missing",
"attachment_missing",
}
@dataclass(slots=True)
class NormalizedRiskSignal:
code: str
raw_code: str
label: str
severity: str
score: int
confidence: Decimal = Decimal("1")
source: str = "rule"
metadata: dict[str, Any] | None = None
def as_dict(self) -> dict[str, Any]:
return {
"code": self.code,
"raw_code": self.raw_code,
"label": self.label,
"severity": self.severity,
"score": self.score,
"confidence": str(self.confidence),
"source": self.source,
"metadata": self.metadata or {},
}
def normalize_risk_signal(value: Any, *, source: str = "rule") -> NormalizedRiskSignal | None:
if isinstance(value, dict):
raw_code = _first_present(
value,
"risk_signal",
"signal",
"code",
"risk_type",
"rule_code",
"type",
)
severity = str(value.get("severity") or value.get("risk_level") or "").strip().lower()
confidence = _to_decimal(value.get("confidence") or value.get("score_confidence") or 1)
explicit_score = value.get("risk_score") or value.get("score")
metadata = dict(value)
else:
raw_code = str(value or "").strip()
severity = ""
confidence = Decimal("1")
explicit_score = None
metadata = {}
if not raw_code:
return None
canonical = SIGNAL_ALIASES.get(raw_code.strip().lower(), raw_code.strip().lower())
canonical = canonical.replace(" ", "_")
severity = severity or SIGNAL_DEFAULT_SEVERITY.get(canonical, "medium")
score = _score_from_value(explicit_score, severity=severity)
return NormalizedRiskSignal(
code=canonical,
raw_code=raw_code,
label=SIGNAL_LABELS.get(canonical, canonical.replace("_", " ").title()),
severity=severity,
score=score,
confidence=max(Decimal("0"), min(Decimal("1"), confidence)),
source=source,
metadata=metadata,
)
def normalize_risk_signals(
values: list[Any],
*,
source: str = "rule",
) -> list[NormalizedRiskSignal]:
by_code: dict[str, NormalizedRiskSignal] = {}
for value in values:
signal = normalize_risk_signal(value, source=source)
if signal is None:
continue
current = by_code.get(signal.code)
if current is None or signal.score > current.score:
by_code[signal.code] = signal
return sorted(by_code.values(), key=lambda item: (item.score, item.code), reverse=True)
def policy_refs_for_signal(signal_code: str) -> list[str]:
signal_code = SIGNAL_ALIASES.get(str(signal_code or "").strip().lower(), signal_code)
if signal_code not in POLICY_BOUND_SIGNALS:
return []
return [f"policy.{signal_code}"]
def severity_from_score(score: int) -> str:
normalized = max(0, min(100, int(score or 0)))
if normalized >= 90:
return "critical"
if normalized >= 70:
return "high"
if normalized >= 45:
return "medium"
return "low"
def _first_present(value: dict[str, Any], *keys: str) -> str:
for key in keys:
candidate = str(value.get(key) or "").strip()
if candidate:
return candidate
return ""
def _score_from_value(value: Any, *, severity: str) -> int:
if value is None or value == "":
return SEVERITY_SCORE.get(severity, SEVERITY_SCORE["medium"])
try:
numeric = Decimal(str(value))
except Exception:
return SEVERITY_SCORE.get(severity, SEVERITY_SCORE["medium"])
if numeric <= Decimal("1"):
numeric *= Decimal("100")
return max(0, min(100, int(numeric.to_integral_value())))
def _to_decimal(value: Any) -> Decimal:
try:
return Decimal(str(value))
except Exception:
return Decimal("0")

View File

@@ -0,0 +1,162 @@
"""Temporal monitoring for risk graph relationship changes."""
from __future__ import annotations
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Any
from .models import RiskGraphEdge
@dataclass(slots=True)
class TemporalRiskGraphChange:
change_type: str
source_key: str
target_key: str
edge_type: str
metadata: dict[str, Any] = field(default_factory=dict)
def as_dict(self) -> dict[str, Any]:
return {
"change_type": self.change_type,
"source_key": self.source_key,
"target_key": self.target_key,
"edge_type": self.edge_type,
"metadata": dict(self.metadata),
}
@dataclass(slots=True)
class TemporalRiskGraphSnapshotDiff:
changes: list[TemporalRiskGraphChange]
edge_type_delta: dict[str, int]
def as_dict(self) -> dict[str, Any]:
return {
"changes": [item.as_dict() for item in self.changes],
"edge_type_delta": dict(self.edge_type_delta),
}
class TemporalRiskGraphMonitor:
def monitor(
self,
previous_edges: list[RiskGraphEdge],
current_edges: list[RiskGraphEdge],
*,
risk_node_keys: set[str] | None = None,
) -> TemporalRiskGraphSnapshotDiff:
previous = {edge.edge_key(): edge for edge in previous_edges}
current = {edge.edge_key(): edge for edge in current_edges}
risk_keys = set(risk_node_keys or set())
changes: list[TemporalRiskGraphChange] = []
for key, edge in current.items():
if key not in previous:
changes.append(_change("relationship_added", edge))
if edge.source_key in risk_keys or edge.target_key in risk_keys:
changes.append(_change("risk_propagation", edge))
for key, edge in previous.items():
if key not in current:
changes.append(_change("relationship_removed", edge))
changes.extend(_relationship_volume_changes(previous_edges, current_edges))
changes.extend(_target_migrations(previous_edges, current_edges))
return TemporalRiskGraphSnapshotDiff(
changes=changes,
edge_type_delta=_edge_type_delta(previous_edges, current_edges),
)
def _change(change_type: str, edge: RiskGraphEdge, **metadata: Any) -> TemporalRiskGraphChange:
return TemporalRiskGraphChange(
change_type=change_type,
source_key=edge.source_key,
target_key=edge.target_key,
edge_type=edge.edge_type,
metadata=metadata,
)
def _edge_type_delta(
previous_edges: list[RiskGraphEdge],
current_edges: list[RiskGraphEdge],
) -> dict[str, int]:
previous_counts = Counter(edge.edge_type for edge in previous_edges)
current_counts = Counter(edge.edge_type for edge in current_edges)
edge_types = set(previous_counts) | set(current_counts)
return {
edge_type: current_counts.get(edge_type, 0) - previous_counts.get(edge_type, 0)
for edge_type in sorted(edge_types)
}
def _relationship_volume_changes(
previous_edges: list[RiskGraphEdge],
current_edges: list[RiskGraphEdge],
) -> list[TemporalRiskGraphChange]:
changes: list[TemporalRiskGraphChange] = []
previous_counts = Counter(edge.edge_type for edge in previous_edges)
current_by_type: dict[str, list[RiskGraphEdge]] = defaultdict(list)
for edge in current_edges:
current_by_type[edge.edge_type].append(edge)
for edge_type, current_group in current_by_type.items():
previous_count = previous_counts.get(edge_type, 0)
current_count = len(current_group)
if current_count >= 3 and current_count >= max(1, previous_count) * 2:
changes.append(
_change(
"relationship_surge",
current_group[0],
previous_count=previous_count,
current_count=current_count,
)
)
previous_by_type: dict[str, list[RiskGraphEdge]] = defaultdict(list)
for edge in previous_edges:
previous_by_type[edge.edge_type].append(edge)
current_counts = Counter(edge.edge_type for edge in current_edges)
for edge_type, previous_group in previous_by_type.items():
if len(previous_group) >= 3 and current_counts.get(edge_type, 0) == 0:
changes.append(
_change(
"relationship_disappeared",
previous_group[0],
previous_count=len(previous_group),
current_count=0,
)
)
return changes
def _target_migrations(
previous_edges: list[RiskGraphEdge],
current_edges: list[RiskGraphEdge],
) -> list[TemporalRiskGraphChange]:
previous_targets: dict[tuple[str, str], set[str]] = defaultdict(set)
current_targets: dict[tuple[str, str], set[str]] = defaultdict(set)
for edge in previous_edges:
previous_targets[(edge.source_key, edge.edge_type)].add(edge.target_key)
for edge in current_edges:
current_targets[(edge.source_key, edge.edge_type)].add(edge.target_key)
changes: list[TemporalRiskGraphChange] = []
for key, current_target_set in current_targets.items():
previous_target_set = previous_targets.get(key, set())
if previous_target_set and current_target_set != previous_target_set:
source_key, edge_type = key
target_key = sorted(current_target_set - previous_target_set or current_target_set)[0]
changes.append(
TemporalRiskGraphChange(
change_type="target_migration",
source_key=source_key,
target_key=target_key,
edge_type=edge_type,
metadata={
"previous_targets": sorted(previous_target_set),
"current_targets": sorted(current_target_set),
},
)
)
return changes