from __future__ import annotations import json import re from typing import Any from pydantic import ValidationError from app.core.logging import get_logger from app.schemas.ontology import ( OntologyConstraint, OntologyEntity, OntologyMetric, OntologyParseRequest, OntologyTimeRange, ) from app.services.ontology_rules import ( AR_CORE_KEYWORDS, AP_CORE_KEYWORDS, COMPARE_KEYWORDS, DRAFT_FOLLOW_UP_KEYWORDS, DRAFT_KEYWORDS, EXPENSE_APPLICATION_CONTEXT_TYPES, EXPENSE_APPLICATION_KEYWORDS, EXPENSE_NARRATIVE_KEYWORDS, EXPENSE_REVIEW_ACTIONS, EXPLAIN_KEYWORDS, GENERIC_EXPENSE_PROMPTS, KNOWLEDGE_INTENTS, LlmOntologyEntityHint, LlmOntologyParseResult, OPERATE_KEYWORDS, QUERY_KEYWORDS, RISK_KEYWORDS, SCENARIO_KEYWORDS, STATUS_KEYWORDS, ) logger = get_logger("app.services.ontology") TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS = ( "打车", "网约车", "出租车票", "出租车", "的士票", "的士", "滴滴", "市内交通", "乘车", "乘车费", "用车", "叫车", "车费", "车资", "机场", ) EXPLICIT_ENTERTAINMENT_KEYWORDS = ( "业务招待", "招待费", "招待", "宴请", "请客", "请客户吃饭", "客户吃饭", "客户用餐", "客户餐", "商务接待", "商务宴请", "接待餐", ) class OntologyDetectionMixin: @staticmethod def _is_expense_application_context(context_json: dict[str, Any]) -> bool: document_type = str(context_json.get("document_type") or "").strip() application_stage = str(context_json.get("application_stage") or "").strip() entry_source = str(context_json.get("entry_source") or "").strip() session_type = str(context_json.get("session_type") or "").strip() return ( document_type in EXPENSE_APPLICATION_CONTEXT_TYPES or application_stage in EXPENSE_APPLICATION_CONTEXT_TYPES or session_type in EXPENSE_APPLICATION_CONTEXT_TYPES or entry_source in {"application", "documents_application", "expense_application"} ) @staticmethod def _looks_like_expense_application(compact_query: str) -> bool: return any(keyword in compact_query for keyword in EXPENSE_APPLICATION_KEYWORDS) def _detect_scenario(self, compact_query: str) -> tuple[str, float]: scores = {key: 0.0 for key in SCENARIO_KEYWORDS} for scenario, keywords in SCENARIO_KEYWORDS.items(): for keyword, weight in keywords: if keyword in compact_query: scores[scenario] += weight best_scenario = max(scores, key=scores.get) best_score = scores[best_scenario] if best_score <= 0: if "单据" in compact_query and any( keyword in compact_query for keyword in STATUS_KEYWORDS ): return "expense", 0.14 return "unknown", 0.0 if best_scenario == "knowledge": business_scores = [ scores["expense"], scores["accounts_receivable"], scores["accounts_payable"], ] if max(business_scores) > 0: best_scenario = ("expense", "accounts_receivable", "accounts_payable")[ business_scores.index(max(business_scores)) ] best_score = max(business_scores) return best_scenario, round(min(best_score, 0.34), 2) def _detect_intent( self, compact_query: str, *, scenario: str, entities: list[OntologyEntity], time_range: OntologyTimeRange, ) -> tuple[str, float]: if any(keyword in compact_query for keyword in OPERATE_KEYWORDS): return "operate", 0.30 status_document_query = ( "单据" in compact_query and any(keyword in compact_query for keyword in STATUS_KEYWORDS) and not any(keyword in compact_query for keyword in DRAFT_KEYWORDS if keyword != "草稿") ) historical_document_query = any( keyword in compact_query for keyword in ("报销的单据", "报销单据", "报销过的单据", "报销记录") ) if scenario == "expense" and any( keyword in compact_query for keyword in ( "报销了吗", "报销了么", "报销了没", "报销了没有", "报销没", "单据状态", "审批状态", "报销进度", "到哪了", "到了哪", "有没有报销", "是否报销", "进行中的单据", "草稿单据", "草稿的单据", "待补充单据", "审批中的单据", "已提交单据", "已入账单据", ) ) or (scenario == "expense" and (status_document_query or historical_document_query)): return "query", 0.24 if any(keyword in compact_query for keyword in DRAFT_KEYWORDS): return "draft", 0.26 if scenario == "expense" and "报销" in compact_query and any( item.type == "expense_type" and str(item.normalized_value or item.value or "").strip() for item in entities ) and not any( keyword in compact_query for keyword in ( *QUERY_KEYWORDS, *COMPARE_KEYWORDS, *EXPLAIN_KEYWORDS, *RISK_KEYWORDS, ) ): return "draft", 0.25 if scenario == "expense" and self._is_generic_expense_prompt(compact_query): return "draft", 0.24 if any(keyword in compact_query for keyword in COMPARE_KEYWORDS): return "compare", 0.24 if any(keyword in compact_query for keyword in EXPLAIN_KEYWORDS): return "explain", 0.22 if any(keyword in compact_query for keyword in RISK_KEYWORDS): return "risk_check", 0.24 if any(keyword in compact_query for keyword in QUERY_KEYWORDS): return "query", 0.20 if self._looks_like_expense_narrative( compact_query, scenario=scenario, entities=entities, time_range=time_range, ): return "draft", 0.22 return "query", 0.10 @staticmethod def _looks_like_follow_up_message(compact_query: str) -> bool: if not compact_query: return False if any(keyword in compact_query for keyword in DRAFT_FOLLOW_UP_KEYWORDS): return True if compact_query.startswith(("那", "这", "它", "这个", "那个")): return True has_domain_keyword = any( keyword in compact_query for keyword, _weight in ( *SCENARIO_KEYWORDS["expense"], *SCENARIO_KEYWORDS["accounts_receivable"], *SCENARIO_KEYWORDS["accounts_payable"], *SCENARIO_KEYWORDS["knowledge"], ) ) return len(compact_query) <= 12 and not has_domain_keyword def _should_inherit_expense_draft( self, compact_query: str, *, scenario: str, entities: list[OntologyEntity], time_range: OntologyTimeRange, context_json: dict[str, Any], ) -> bool: context_scenario = self._resolve_context_scenario(context_json) draft_claim_id = str(context_json.get("draft_claim_id") or "").strip() review_action = str(context_json.get("review_action") or "").strip() if review_action in EXPENSE_REVIEW_ACTIONS: return True if context_scenario != "expense" and not draft_claim_id: return False if any(keyword in compact_query for keyword in DRAFT_FOLLOW_UP_KEYWORDS): return True if self._looks_like_expense_narrative( compact_query, scenario="expense", entities=entities, time_range=time_range, ): return True if self._looks_like_follow_up_message(compact_query): return True if any(keyword in compact_query for keyword in OPERATE_KEYWORDS): return False if any(keyword in compact_query for keyword in COMPARE_KEYWORDS + RISK_KEYWORDS): return False if any(keyword in compact_query for keyword in QUERY_KEYWORDS): return False return bool( draft_claim_id and any( item.type in {"amount", "customer", "employee", "expense_type", "project", "invoice"} for item in entities ) ) @staticmethod def _is_generic_expense_prompt(compact_query: str) -> bool: return compact_query in GENERIC_EXPENSE_PROMPTS @staticmethod def _looks_like_expense_narrative( compact_query: str, *, scenario: str, entities: list[OntologyEntity], time_range: OntologyTimeRange, ) -> bool: if scenario not in {"expense", "accounts_receivable", "accounts_payable", "unknown"}: return False if any(keyword in compact_query for keyword in AR_CORE_KEYWORDS + AP_CORE_KEYWORDS): return False entity_types = {item.type for item in entities} has_expense_signal = any( keyword in compact_query for keyword in EXPENSE_NARRATIVE_KEYWORDS ) or "expense_type" in entity_types has_context_signal = ( bool(time_range.start_date) or "amount" in entity_types or ("报销" in compact_query and "expense_type" in entity_types) ) return has_expense_signal and has_context_signal def _parse_with_model( self, *, payload: OntologyParseRequest, query: str, compact_query: str, fallback_scenario: str, fallback_intent: str, entities: list[OntologyEntity], time_range: OntologyTimeRange, metrics: list[OntologyMetric], constraints: list[OntologyConstraint], ) -> LlmOntologyParseResult | None: messages = self._build_model_messages( payload=payload, query=query, compact_query=compact_query, fallback_scenario=fallback_scenario, fallback_intent=fallback_intent, entities=entities, time_range=time_range, metrics=metrics, constraints=constraints, ) response_text = self.runtime_chat_service.complete( messages, max_tokens=600, temperature=0.0, ) payload_json = self._extract_json_payload(response_text) if payload_json is None: return None try: return LlmOntologyParseResult.model_validate(payload_json) except ValidationError as exc: logger.warning("Semantic model output validation failed: %s", exc) return None @staticmethod def _build_model_messages( *, payload: OntologyParseRequest, query: str, compact_query: str, fallback_scenario: str, fallback_intent: str, entities: list[OntologyEntity], time_range: OntologyTimeRange, metrics: list[OntologyMetric], constraints: list[OntologyConstraint], ) -> list[dict[str, str]]: facts = { "query": query, "compact_query": compact_query, "context": { "entry_source": payload.context_json.get("entry_source"), "attachment_names": payload.context_json.get("attachment_names", []), "attachment_count": payload.context_json.get("attachment_count", 0), "ocr_summary": payload.context_json.get("ocr_summary", ""), "ocr_documents": payload.context_json.get("ocr_documents", []), "request_context": payload.context_json.get("request_context"), "role_codes": payload.context_json.get("role_codes", []), "conversation_id": payload.context_json.get("conversation_id"), "conversation_scenario": payload.context_json.get("conversation_scenario"), "conversation_intent": payload.context_json.get("conversation_intent"), "document_type": payload.context_json.get("document_type"), "application_stage": payload.context_json.get("application_stage"), "application_fields": payload.context_json.get("application_fields"), "draft_claim_id": payload.context_json.get("draft_claim_id"), "review_action": payload.context_json.get("review_action"), "review_form_values": payload.context_json.get("review_form_values"), "conversation_history": payload.context_json.get("conversation_history", []), }, "rule_candidates": { "scenario": fallback_scenario, "intent": fallback_intent, "entities": [item.model_dump(mode="json") for item in entities], "time_range": time_range.model_dump(mode="json"), "metrics": [item.model_dump(mode="json") for item in metrics], "constraints": [item.model_dump(mode="json") for item in constraints], }, } system_prompt = ( "你是企业财务共享平台的语义解析器。" "你的任务是把用户输入解析为固定 JSON,用于后续路由、追问和权限判断。" "只输出 JSON 对象,不要输出 Markdown、代码块、解释、标题或 。" "场景 scenario 只能是:expense, accounts_receivable, " "accounts_payable, knowledge, unknown。" "意图 intent 只能是:query, explain, compare, risk_check, draft, operate。" "如果用户是在描述一笔待处理费用、待报销事项、上传票据或希望整理报销," "即使没有明确说“生成草稿”,也优先使用 expense + draft。" "如果提供了 conversation_history,必须把最近轮次作为当前追问的上下文," "正确理解“这个”“那笔”“改成 800”“继续补充”这类省略表达。" "出现“客户”不等于应收,出现“供应商”不等于应付,必须结合动作词和业务目标判断。" "只有明确查询、统计、列出、多少、明细、对比时才优先使用 query 或 compare。" "附件名称和 OCR 摘要只作为辅助证据,不能编造未出现的事实。" "如果用户明确提到打车、的士票、出租车票、网约车、乘车费、车费等交通票据," "即使句子里出现“客户”,也必须优先识别为 transport,不要推断为 entertainment。" "不要输出用户原文未出现、且与规则候选冲突的费用类型。" "信息不足时 clarification_required=true,并给出一句简短中文追问。" "missing_slots 使用简短 snake_case,例如 expense_type, amount, " "customer_name, participants, attachments。" "entity_hints 只填写你比较确定的业务对象;如果不确定,可以返回空数组。" ) user_prompt = ( "请根据以下事实输出 JSON:\n" f"{json.dumps(facts, ensure_ascii=False, indent=2, default=str)}\n\n" "输出格式:\n" "{\n" ' "scenario": "expense",\n' ' "intent": "draft",\n' ' "confidence": 0.88,\n' ' "clarification_required": true,\n' ' "clarification_question": "请补充发生时间、金额和票据附件。",\n' ' "missing_slots": ["time_range", "amount", "attachments"],\n' ' "ambiguity": [],\n' ' "entity_hints": [\n' ' {"type": "expense_type", "value": "交通费", ' '"normalized_value": "transport", "role": "filter", ' '"confidence": 0.86}\n' " ]\n" "}" ) return [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ] @staticmethod def _extract_json_payload(response_text: str | None) -> dict[str, Any] | None: if not response_text: return None cleaned = re.sub(r".*?", "", response_text, flags=re.DOTALL | re.IGNORECASE) cleaned = cleaned.strip() if not cleaned: return None fenced_match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", cleaned, flags=re.DOTALL) candidates = [fenced_match.group(1)] if fenced_match else [] candidates.extend([cleaned]) start = cleaned.find("{") end = cleaned.rfind("}") if start != -1 and end != -1 and end > start: candidates.append(cleaned[start : end + 1]) for candidate in candidates: try: parsed = json.loads(candidate) except json.JSONDecodeError: continue if isinstance(parsed, dict): return parsed return None @staticmethod def _resolve_scenario( fallback_scenario: str, model_parse: LlmOntologyParseResult | None, ) -> str: if model_parse is None: return fallback_scenario if model_parse.scenario == "unknown" and fallback_scenario != "unknown": return fallback_scenario return model_parse.scenario def _resolve_intent( self, compact_query: str, *, fallback_intent: str, scenario: str, entities: list[OntologyEntity], time_range: OntologyTimeRange, model_parse: LlmOntologyParseResult | None, ) -> str: candidate = model_parse.intent if model_parse is not None else fallback_intent if scenario == "knowledge": if candidate in KNOWLEDGE_INTENTS: return candidate if fallback_intent in KNOWLEDGE_INTENTS: return fallback_intent return "query" if candidate == "query" and scenario == "expense": if self._is_generic_expense_prompt(compact_query) or fallback_intent == "draft": return "draft" return candidate @staticmethod def _merge_entities( base_entities: list[OntologyEntity], entity_hints: list[LlmOntologyEntityHint], compact_query: str = "", ) -> list[OntologyEntity]: merged: dict[tuple[str, str], OntologyEntity] = { (item.type, item.normalized_value): item for item in base_entities } for hint in entity_hints: value = str(hint.value or "").strip() if not value: continue normalized_value = str(hint.normalized_value or value).strip() key = (str(hint.type).strip(), normalized_value) candidate = OntologyEntity( type=str(hint.type).strip(), value=value, normalized_value=normalized_value, role=str(hint.role or "target").strip() or "target", confidence=float(hint.confidence), ) existing = merged.get(key) if existing is None or existing.confidence < candidate.confidence: merged[key] = candidate items = list(merged.values()) if OntologyDetectionMixin._should_transport_override_entertainment( compact_query, items, ): items = [ item for item in items if not ( item.type == "expense_type" and item.normalized_value == "entertainment" ) ] return items @staticmethod def _should_transport_override_entertainment( compact_query: str, entities: list[OntologyEntity], ) -> bool: expense_types = { str(item.normalized_value or item.value or "").strip() for item in entities if item.type == "expense_type" } if not {"transport", "entertainment"}.issubset(expense_types): return False if not any(keyword in compact_query for keyword in TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS): return False return not any(keyword in compact_query for keyword in EXPLICIT_ENTERTAINMENT_KEYWORDS) @staticmethod def _normalize_short_text_list(values: list[str]) -> list[str]: normalized: list[str] = [] seen: set[str] = set() for value in values: cleaned = str(value or "").strip() if not cleaned or cleaned in seen: continue normalized.append(cleaned) seen.add(cleaned) return normalized[:6]