Files
X-Financial/server/src/app/services/ontology_detection.py
caoxiaozhu 7989f3a159 feat: 新增风险图谱算法与系统仪表盘及操作反馈体系
后端新增风险图谱算法模块、风险观察与反馈服务、规则 DSL
校验器和可解释性引擎,完善系统仪表盘和财务仪表盘统计,
优化 agent 运行和编排执行链路,清理旧开发文档,前端新增
系统趋势、负载热力图等多种仪表盘图表组件,完善操作反馈
对话框和工作台日期选择器,优化报销创建和审批详情交互,
补充单元测试覆盖。
2026-05-30 15:46:51 +08:00

587 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import json
import re
from typing import Any
from pydantic import ValidationError
from app.core.logging import get_logger
from app.schemas.ontology import (
OntologyConstraint,
OntologyEntity,
OntologyMetric,
OntologyParseRequest,
OntologyTimeRange,
)
from app.services.ontology_rules import (
AP_CORE_KEYWORDS,
AR_CORE_KEYWORDS,
BUDGET_DRAFT_KEYWORDS,
BUDGET_OPERATE_KEYWORDS,
COMPARE_KEYWORDS,
DRAFT_FOLLOW_UP_KEYWORDS,
DRAFT_KEYWORDS,
EXPENSE_APPLICATION_CONTEXT_TYPES,
EXPENSE_NARRATIVE_KEYWORDS,
EXPENSE_REVIEW_ACTIONS,
EXPLAIN_KEYWORDS,
GENERIC_EXPENSE_PROMPTS,
KNOWLEDGE_INTENTS,
looks_like_expense_application_signal,
OPERATE_KEYWORDS,
QUERY_KEYWORDS,
RISK_KEYWORDS,
SCENARIO_KEYWORDS,
STATUS_KEYWORDS,
LlmOntologyEntityHint,
LlmOntologyParseResult,
)
logger = get_logger("app.services.ontology")
TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS = (
"打车",
"网约车",
"出租车票",
"出租车",
"的士票",
"的士",
"滴滴",
"市内交通",
"乘车",
"乘车费",
"用车",
"叫车",
"车费",
"车资",
"机场",
)
EXPLICIT_ENTERTAINMENT_KEYWORDS = (
"业务招待",
"招待费",
"招待",
"宴请",
"请客",
"请客户吃饭",
"客户吃饭",
"客户用餐",
"客户餐",
"商务接待",
"商务宴请",
"接待餐",
)
class OntologyDetectionMixin:
@staticmethod
def _is_expense_application_context(context_json: dict[str, Any]) -> bool:
document_type = str(context_json.get("document_type") or "").strip()
application_stage = str(context_json.get("application_stage") or "").strip()
entry_source = str(context_json.get("entry_source") or "").strip()
session_type = str(context_json.get("session_type") or "").strip()
return (
document_type in EXPENSE_APPLICATION_CONTEXT_TYPES
or application_stage in EXPENSE_APPLICATION_CONTEXT_TYPES
or session_type in EXPENSE_APPLICATION_CONTEXT_TYPES
or entry_source in {"application", "documents_application", "expense_application"}
)
@staticmethod
def _looks_like_expense_application(compact_query: str) -> bool:
return looks_like_expense_application_signal(compact_query)
def _detect_scenario(self, compact_query: str) -> tuple[str, float]:
scores = {key: 0.0 for key in SCENARIO_KEYWORDS}
for scenario, keywords in SCENARIO_KEYWORDS.items():
for keyword, weight in keywords:
if keyword in compact_query:
scores[scenario] += weight
best_scenario = max(scores, key=scores.get)
best_score = scores[best_scenario]
if scores.get("budget", 0.0) > 0 and scores["budget"] >= best_score:
best_scenario = "budget"
best_score = scores["budget"]
if best_score <= 0:
if "单据" in compact_query and any(
keyword in compact_query for keyword in STATUS_KEYWORDS
):
return "expense", 0.14
return "unknown", 0.0
if best_scenario == "knowledge":
business_scores = [
scores["expense"],
scores["accounts_receivable"],
scores["accounts_payable"],
scores["budget"],
]
if max(business_scores) > 0:
best_scenario = ("expense", "accounts_receivable", "accounts_payable", "budget")[
business_scores.index(max(business_scores))
]
best_score = max(business_scores)
return best_scenario, round(min(best_score, 0.34), 2)
def _detect_intent(
self,
compact_query: str,
*,
scenario: str,
entities: list[OntologyEntity],
time_range: OntologyTimeRange,
) -> tuple[str, float]:
if any(keyword in compact_query for keyword in OPERATE_KEYWORDS):
return "operate", 0.30
if scenario == "budget" and any(
keyword in compact_query for keyword in BUDGET_OPERATE_KEYWORDS
):
return "operate", 0.30
if scenario == "budget" and any(
keyword in compact_query for keyword in BUDGET_DRAFT_KEYWORDS
):
return "draft", 0.28
status_document_query = (
"单据" in compact_query
and any(keyword in compact_query for keyword in STATUS_KEYWORDS)
and not any(keyword in compact_query for keyword in DRAFT_KEYWORDS if keyword != "草稿")
)
historical_document_query = any(
keyword in compact_query
for keyword in ("报销的单据", "报销单据", "报销过的单据", "报销记录")
)
if scenario == "expense" and any(
keyword in compact_query
for keyword in (
"报销了吗",
"报销了么",
"报销了没",
"报销了没有",
"报销没",
"单据状态",
"审批状态",
"报销进度",
"到哪了",
"到了哪",
"有没有报销",
"是否报销",
"进行中的单据",
"草稿单据",
"草稿的单据",
"待补充单据",
"审批中的单据",
"已提交单据",
"已入账单据",
)
) or (scenario == "expense" and (status_document_query or historical_document_query)):
return "query", 0.24
if any(keyword in compact_query for keyword in DRAFT_KEYWORDS):
return "draft", 0.26
if scenario == "expense" and "报销" in compact_query and any(
item.type == "expense_type"
and str(item.normalized_value or item.value or "").strip()
for item in entities
) and not any(
keyword in compact_query
for keyword in (
*QUERY_KEYWORDS,
*COMPARE_KEYWORDS,
*EXPLAIN_KEYWORDS,
*RISK_KEYWORDS,
)
):
return "draft", 0.25
if scenario == "expense" and self._is_generic_expense_prompt(compact_query):
return "draft", 0.24
if any(keyword in compact_query for keyword in COMPARE_KEYWORDS):
return "compare", 0.24
if any(keyword in compact_query for keyword in EXPLAIN_KEYWORDS):
return "explain", 0.22
if any(keyword in compact_query for keyword in RISK_KEYWORDS):
return "risk_check", 0.24
if any(keyword in compact_query for keyword in QUERY_KEYWORDS):
return "query", 0.20
if self._looks_like_expense_narrative(
compact_query,
scenario=scenario,
entities=entities,
time_range=time_range,
):
return "draft", 0.22
return "query", 0.10
@staticmethod
def _looks_like_follow_up_message(compact_query: str) -> bool:
if not compact_query:
return False
if any(keyword in compact_query for keyword in DRAFT_FOLLOW_UP_KEYWORDS):
return True
if compact_query.startswith(("", "", "", "这个", "那个")):
return True
has_domain_keyword = any(
keyword in compact_query
for keyword, _weight in (
*SCENARIO_KEYWORDS["expense"],
*SCENARIO_KEYWORDS["accounts_receivable"],
*SCENARIO_KEYWORDS["accounts_payable"],
*SCENARIO_KEYWORDS["knowledge"],
)
)
return len(compact_query) <= 12 and not has_domain_keyword
def _should_inherit_expense_draft(
self,
compact_query: str,
*,
scenario: str,
entities: list[OntologyEntity],
time_range: OntologyTimeRange,
context_json: dict[str, Any],
) -> bool:
context_scenario = self._resolve_context_scenario(context_json)
draft_claim_id = str(context_json.get("draft_claim_id") or "").strip()
review_action = str(context_json.get("review_action") or "").strip()
if review_action in EXPENSE_REVIEW_ACTIONS:
return True
if context_scenario != "expense" and not draft_claim_id:
return False
if any(keyword in compact_query for keyword in DRAFT_FOLLOW_UP_KEYWORDS):
return True
if self._looks_like_expense_narrative(
compact_query,
scenario="expense",
entities=entities,
time_range=time_range,
):
return True
if self._looks_like_follow_up_message(compact_query):
return True
if any(keyword in compact_query for keyword in OPERATE_KEYWORDS):
return False
if any(keyword in compact_query for keyword in COMPARE_KEYWORDS + RISK_KEYWORDS):
return False
if any(keyword in compact_query for keyword in QUERY_KEYWORDS):
return False
return bool(
draft_claim_id
and any(
item.type
in {"amount", "customer", "employee", "expense_type", "project", "invoice"}
for item in entities
)
)
@staticmethod
def _is_generic_expense_prompt(compact_query: str) -> bool:
return compact_query in GENERIC_EXPENSE_PROMPTS
@staticmethod
def _looks_like_expense_narrative(
compact_query: str,
*,
scenario: str,
entities: list[OntologyEntity],
time_range: OntologyTimeRange,
) -> bool:
if scenario not in {"expense", "accounts_receivable", "accounts_payable", "unknown"}:
return False
if any(keyword in compact_query for keyword in AR_CORE_KEYWORDS + AP_CORE_KEYWORDS):
return False
entity_types = {item.type for item in entities}
has_expense_signal = any(
keyword in compact_query for keyword in EXPENSE_NARRATIVE_KEYWORDS
) or "expense_type" in entity_types
has_context_signal = (
bool(time_range.start_date)
or "amount" in entity_types
or ("报销" in compact_query and "expense_type" in entity_types)
)
return has_expense_signal and has_context_signal
def _parse_with_model(
self,
*,
payload: OntologyParseRequest,
query: str,
compact_query: str,
fallback_scenario: str,
fallback_intent: str,
entities: list[OntologyEntity],
time_range: OntologyTimeRange,
metrics: list[OntologyMetric],
constraints: list[OntologyConstraint],
) -> tuple[LlmOntologyParseResult | None, list[dict[str, Any]], str | None]:
messages = self._build_model_messages(
payload=payload,
query=query,
compact_query=compact_query,
fallback_scenario=fallback_scenario,
fallback_intent=fallback_intent,
entities=entities,
time_range=time_range,
metrics=metrics,
constraints=constraints,
)
chat_result = self.runtime_chat_service.complete_with_trace(
messages,
max_tokens=600,
temperature=0.0,
)
response_text = chat_result.text
traces = chat_result.calls_as_dicts()
payload_json = self._extract_json_payload(response_text)
if payload_json is None:
return None, traces, "model_output_empty_or_invalid_json"
try:
return LlmOntologyParseResult.model_validate(payload_json), traces, None
except ValidationError as exc:
logger.warning("Semantic model output validation failed: %s", exc)
return None, traces, "model_output_validation_failed"
@staticmethod
def _build_model_messages(
*,
payload: OntologyParseRequest,
query: str,
compact_query: str,
fallback_scenario: str,
fallback_intent: str,
entities: list[OntologyEntity],
time_range: OntologyTimeRange,
metrics: list[OntologyMetric],
constraints: list[OntologyConstraint],
) -> list[dict[str, str]]:
facts = {
"query": query,
"compact_query": compact_query,
"context": {
"entry_source": payload.context_json.get("entry_source"),
"attachment_names": payload.context_json.get("attachment_names", []),
"attachment_count": payload.context_json.get("attachment_count", 0),
"ocr_summary": payload.context_json.get("ocr_summary", ""),
"ocr_documents": payload.context_json.get("ocr_documents", []),
"request_context": payload.context_json.get("request_context"),
"role_codes": payload.context_json.get("role_codes", []),
"conversation_id": payload.context_json.get("conversation_id"),
"conversation_scenario": payload.context_json.get("conversation_scenario"),
"conversation_intent": payload.context_json.get("conversation_intent"),
"document_type": payload.context_json.get("document_type"),
"application_stage": payload.context_json.get("application_stage"),
"application_fields": payload.context_json.get("application_fields"),
"draft_claim_id": payload.context_json.get("draft_claim_id"),
"review_action": payload.context_json.get("review_action"),
"review_form_values": payload.context_json.get("review_form_values"),
"conversation_history": payload.context_json.get("conversation_history", []),
},
"rule_candidates": {
"scenario": fallback_scenario,
"intent": fallback_intent,
"entities": [item.model_dump(mode="json") for item in entities],
"time_range": time_range.model_dump(mode="json"),
"metrics": [item.model_dump(mode="json") for item in metrics],
"constraints": [item.model_dump(mode="json") for item in constraints],
},
}
system_prompt = (
"你是企业财务共享平台的语义解析器。"
"你的任务是把用户输入解析为固定 JSON用于后续路由、追问和权限判断。"
"只输出 JSON 对象,不要输出 Markdown、代码块、解释、标题或 <think>。"
"场景 scenario 只能是expense, accounts_receivable, "
"accounts_payable, budget, knowledge, unknown。"
"意图 intent 只能是query, explain, compare, risk_check, draft, operate。"
"如果用户是在描述一笔待处理费用、待报销事项、上传票据或希望整理报销,"
"即使没有明确说“生成草稿”,也优先使用 expense + draft。"
"如果提供了 conversation_history必须把最近轮次作为当前追问的上下文"
"正确理解“这个”“那笔”“改成 800”“继续补充”这类省略表达。"
"出现“客户”不等于应收,出现“供应商”不等于应付,必须结合动作词和业务目标判断。"
"预算编制、预算金额、成本中心、预算科目、预算预警、预算占用、"
"剩余预算、可用预算、超预算、预算不足等问题必须使用 budget 场景。"
"只有明确查询、统计、列出、多少、明细、对比时才优先使用 query 或 compare。"
"附件名称和 OCR 摘要只作为辅助证据,不能编造未出现的事实。"
"如果用户明确提到打车、的士票、出租车票、网约车、乘车费、车费等交通票据,"
"即使句子里出现“客户”,也必须优先识别为 transport不要推断为 entertainment。"
"不要输出用户原文未出现、且与规则候选冲突的费用类型。"
"信息不足时 clarification_required=true并给出一句简短中文追问。"
"missing_slots 使用简短 snake_case例如 expense_type, amount, "
"customer_name, participants, attachments, budget_period, "
"budget_subject, budget_amount。"
"entity_hints 只填写你比较确定的业务对象;如果不确定,可以返回空数组。"
"费用申请场景下,建议把干净的申请事由放入 type=reason"
"把出行方式放入 type=transport_mode取值优先为飞机、火车、轮船。"
"reason 只能保留真实业务目的,例如“服务美团业务部署”,"
"不要把发生时间、地点、出差天数、交通方式混进 reason。"
)
user_prompt = (
"请根据以下事实输出 JSON\n"
f"{json.dumps(facts, ensure_ascii=False, indent=2, default=str)}\n\n"
"输出格式:\n"
"{\n"
' "scenario": "expense",\n'
' "intent": "draft",\n'
' "confidence": 0.88,\n'
' "clarification_required": true,\n'
' "clarification_question": "请补充发生时间、金额和票据附件。",\n'
' "missing_slots": ["time_range", "amount", "attachments"],\n'
' "ambiguity": [],\n'
' "entity_hints": [\n'
' {"type": "expense_type", "value": "交通费", '
'"normalized_value": "transport", "role": "filter", '
'"confidence": 0.86},\n'
' {"type": "reason", "value": "服务客户业务部署", '
'"normalized_value": "服务客户业务部署", "role": "target", '
'"confidence": 0.86},\n'
' {"type": "budget_subject", "value": "差旅费", '
'"normalized_value": "travel", "role": "filter", '
'"confidence": 0.86}\n'
" ]\n"
"}"
)
return [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
]
@staticmethod
def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None:
if not response_text:
return None
cleaned = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL | re.IGNORECASE)
cleaned = cleaned.strip()
if not cleaned:
return None
fenced_match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", cleaned, flags=re.DOTALL)
candidates = [fenced_match.group(1)] if fenced_match else []
candidates.extend([cleaned])
start = cleaned.find("{")
end = cleaned.rfind("}")
if start != -1 and end != -1 and end > start:
candidates.append(cleaned[start : end + 1])
for candidate in candidates:
try:
parsed = json.loads(candidate)
except json.JSONDecodeError:
continue
if isinstance(parsed, dict):
return parsed
return None
@staticmethod
def _resolve_scenario(
fallback_scenario: str,
model_parse: LlmOntologyParseResult | None,
) -> str:
if model_parse is None:
return fallback_scenario
if model_parse.scenario == "unknown" and fallback_scenario != "unknown":
return fallback_scenario
return model_parse.scenario
def _resolve_intent(
self,
compact_query: str,
*,
fallback_intent: str,
scenario: str,
entities: list[OntologyEntity],
time_range: OntologyTimeRange,
model_parse: LlmOntologyParseResult | None,
) -> str:
candidate = model_parse.intent if model_parse is not None else fallback_intent
if scenario == "knowledge":
if candidate in KNOWLEDGE_INTENTS:
return candidate
if fallback_intent in KNOWLEDGE_INTENTS:
return fallback_intent
return "query"
if candidate == "query" and scenario == "expense":
if self._is_generic_expense_prompt(compact_query) or fallback_intent == "draft":
return "draft"
return candidate
@staticmethod
def _merge_entities(
base_entities: list[OntologyEntity],
entity_hints: list[LlmOntologyEntityHint],
compact_query: str = "",
) -> list[OntologyEntity]:
merged: dict[tuple[str, str], OntologyEntity] = {
(item.type, item.normalized_value): item for item in base_entities
}
for hint in entity_hints:
value = str(hint.value or "").strip()
if not value:
continue
normalized_value = str(hint.normalized_value or value).strip()
key = (str(hint.type).strip(), normalized_value)
candidate = OntologyEntity(
type=str(hint.type).strip(),
value=value,
normalized_value=normalized_value,
role=str(hint.role or "target").strip() or "target",
confidence=float(hint.confidence),
)
existing = merged.get(key)
if existing is None or existing.confidence < candidate.confidence:
merged[key] = candidate
items = list(merged.values())
if OntologyDetectionMixin._should_transport_override_entertainment(
compact_query,
items,
):
items = [
item
for item in items
if not (
item.type == "expense_type"
and item.normalized_value == "entertainment"
)
]
return items
@staticmethod
def _should_transport_override_entertainment(
compact_query: str,
entities: list[OntologyEntity],
) -> bool:
expense_types = {
str(item.normalized_value or item.value or "").strip()
for item in entities
if item.type == "expense_type"
}
if not {"transport", "entertainment"}.issubset(expense_types):
return False
if not any(keyword in compact_query for keyword in TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS):
return False
return not any(keyword in compact_query for keyword in EXPLICIT_ENTERTAINMENT_KEYWORDS)
@staticmethod
def _normalize_short_text_list(values: list[str]) -> list[str]:
normalized: list[str] = []
seen: set[str] = set()
for value in values:
cleaned = str(value or "").strip()
if not cleaned or cleaned in seen:
continue
normalized.append(cleaned)
seen.add(cleaned)
return normalized[:6]