feat: 增强知识库索引与设置页面模块化拆分

扩展知识库索引任务和 RAG 检索支持增量入库和文档去重，优化本体检测和规则匹配精度，前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式，新增日志详情组件和知识入库日志模型，补充单元测试覆盖。
2026-05-22 23:47:28 +08:00
parent 88ff04bef8
commit 5b388d08c0
84 changed files with 10170 additions and 2599 deletions
--- a/server/src/app/services/ontology_detection.py
+++ b/server/src/app/services/ontology_detection.py
@@ -37,6 +37,39 @@ from app.services.ontology_rules import (
 logger = get_logger("app.services.ontology")


+TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS = (
+    "打车",
+    "网约车",
+    "出租车票",
+    "出租车",
+    "的士票",
+    "的士",
+    "滴滴",
+    "市内交通",
+    "乘车",
+    "乘车费",
+    "用车",
+    "叫车",
+    "车费",
+    "车资",
+    "机场",
+)
+EXPLICIT_ENTERTAINMENT_KEYWORDS = (
+    "业务招待",
+    "招待费",
+    "招待",
+    "宴请",
+    "请客",
+    "请客户吃饭",
+    "客户吃饭",
+    "客户用餐",
+    "客户餐",
+    "商务接待",
+    "商务宴请",
+    "接待餐",
+)
+
+
 class OntologyDetectionMixin:
    def _detect_scenario(self, compact_query: str) -> tuple[str, float]:
        scores = {key: 0.0 for key in SCENARIO_KEYWORDS}
@@ -337,6 +370,9 @@ class OntologyDetectionMixin:
            "出现“客户”不等于应收，出现“供应商”不等于应付，必须结合动作词和业务目标判断。"
            "只有明确查询、统计、列出、多少、明细、对比时才优先使用 query 或 compare。"
            "附件名称和 OCR 摘要只作为辅助证据，不能编造未出现的事实。"
+            "如果用户明确提到打车、的士票、出租车票、网约车、乘车费、车费等交通票据，"
+            "即使句子里出现“客户”，也必须优先识别为 transport，不要推断为 entertainment。"
+            "不要输出用户原文未出现、且与规则候选冲突的费用类型。"
            "信息不足时 clarification_required=true，并给出一句简短中文追问。"
            "missing_slots 使用简短 snake_case，例如 expense_type, amount, "
            "customer_name, participants, attachments。"
@@ -351,12 +387,12 @@ class OntologyDetectionMixin:
            '  "intent": "draft",\n'
            '  "confidence": 0.88,\n'
            '  "clarification_required": true,\n'
-            '  "clarification_question": "请补充客户单位、参与人员和票据附件。",\n'
-            '  "missing_slots": ["customer_name", "participants", "attachments"],\n'
+            '  "clarification_question": "请补充发生时间、金额和票据附件。",\n'
+            '  "missing_slots": ["time_range", "amount", "attachments"],\n'
            '  "ambiguity": [],\n'
            '  "entity_hints": [\n'
-            '    {"type": "expense_type", "value": "招待", '
-            '"normalized_value": "entertainment", "role": "filter", '
+            '    {"type": "expense_type", "value": "交通费", '
+            '"normalized_value": "transport", "role": "filter", '
            '"confidence": 0.86}\n'
            "  ]\n"
            "}"
@@ -432,6 +468,7 @@ class OntologyDetectionMixin:
    def _merge_entities(
        base_entities: list[OntologyEntity],
        entity_hints: list[LlmOntologyEntityHint],
+        compact_query: str = "",
    ) -> list[OntologyEntity]:
        merged: dict[tuple[str, str], OntologyEntity] = {
            (item.type, item.normalized_value): item for item in base_entities
@@ -454,7 +491,51 @@ class OntologyDetectionMixin:
             if existing is None or existing.confidence < candidate.confidence:
                 merged[key] = candidate

-        return list(merged.values())
+        items = list(merged.values())
+        if OntologyDetectionMixin._should_transport_override_entertainment(
+            compact_query,
+            items,
+        ):
+            # Normalize the expense type exactly as the detector does, so the
+            # entity that made the override fire is the one removed here.
+            items = [
+                item
+                for item in items
+                if not (
+                    item.type == "expense_type"
+                    and str(item.normalized_value or item.value or "").strip()
+                    == "entertainment"
+                )
+            ]
+        return items
+
+    @staticmethod
+    def _should_transport_override_entertainment(
+        compact_query: str,
+        entities: list[OntologyEntity],
+    ) -> bool:
+        """Decide whether transport should suppress an entertainment entity.
+
+        Returns True only when *entities* contains both the ``transport`` and
+        ``entertainment`` expense types, *compact_query* contains a transport
+        override keyword, and it contains no explicit entertainment keyword.
+        """
+        expense_types = {
+            str(item.normalized_value or item.value or "").strip()
+            for item in entities
+            if item.type == "expense_type"
+        }
+        if not {"transport", "entertainment"}.issubset(expense_types):
+            return False
+        if not any(
+            keyword in compact_query
+            for keyword in TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS
+        ):
+            return False
+        return not any(
+            keyword in compact_query
+            for keyword in EXPLICIT_ENTERTAINMENT_KEYWORDS
+        )

     @staticmethod
     def _normalize_short_text_list(values: list[str]) -> list[str]: