feat: 集成Hermes智能体系统，增强聊天和差旅报销功能

2026-05-16 06:14:08 +00:00
parent 763afa0ee2
commit 212c935308
46 changed files with 8802 additions and 5372 deletions
--- a/server/src/app/services/knowledge.py
+++ b/server/src/app/services/knowledge.py
@@ -72,6 +72,23 @@ STRUCTURED_PREVIEW_EXTENSIONS = {"docx", "xlsx", "pptx"} | TEXT_EXTENSIONS
 INLINE_PREVIEW_EXTENSIONS = {"pdf"} | IMAGE_EXTENSIONS
 ONLYOFFICE_EDITABLE_EXTENSIONS = {"docx", "xlsx", "pptx"}
 KNOWLEDGE_INGEST_SYNC_STALE_SECONDS = 90
+# Default maximum number of hits returned by KnowledgeService.search_llm_wiki.
+KNOWLEDGE_SEARCH_RESULT_LIMIT = 3
+# Filler words (question words, politeness phrases, demonstratives) that
+# KnowledgeService._extract_search_terms refuses to emit as search terms.
+KNOWLEDGE_SEARCH_STOP_TERMS = {
+    "什么",
+    "怎么",
+    "如何",
+    "多少",
+    "是否",
+    "可以",
+    "一下",
+    "请问",
+    "帮我",
+    "一下子",
+    "这个",
+    "那个",
+    "哪些",
+    "一下吧",
+}

 KNOWLEDGE_INGEST_STATUS_PUBLISHED = 1
 KNOWLEDGE_INGEST_STATUS_SYNCING = 2
@@ -346,6 +363,156 @@ class KnowledgeService:
        self.ensure_library_ready()
        return self.llm_wiki_root

+    def search_llm_wiki(self, query: str, *, limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT) -> dict[str, Any]:
+        """Search the LLM Wiki knowledge candidates for entries matching *query*.
+
+        For every wiki-index document that has a library entry with a matching
+        wiki artifact and whose ``quality_status`` is not ``"failed"``, each
+        dict entry of its ``knowledge_candidates.json`` is scored with
+        ``_score_knowledge_search_match``. Hits with a positive score are
+        collected, boosted by ``_boost_title_family_hits`` and ranked by score
+        (descending), then ``formal`` quality first, then title.
+
+        Args:
+            query: Free-text question or keywords.
+            limit: Maximum number of hits to return; values below 1 are
+                treated as 1.
+
+        Returns:
+            A dict with ``result_type`` (always ``"knowledge_search"``),
+            ``query``, ``record_count``, ``hits``, ``references`` (the hit
+            codes) and a user-facing ``message``. When the query normalizes
+            to an empty string, or nothing matches, ``hits`` is empty.
+        """
+        self.ensure_library_ready()
+        normalized_query = self._normalize_search_text(query)
+        if not normalized_query:
+            return {
+                "result_type": "knowledge_search",
+                "query": "",
+                "record_count": 0,
+                "hits": [],
+                "references": [],
+                "message": "请先输入要检索的制度或规则问题。",
+            }
+
+        library_index = self._load_index()
+        # Persist the index only when reconciliation reports a change.
+        if self._reconcile_document_ingest_statuses(library_index):
+            self._save_index(library_index)
+        entry_by_id = {
+            entry_id: item
+            for item in library_index.get("documents") or []
+            if (entry_id := str(item.get("id") or "").strip())
+        }
+        wiki_index = self._load_llm_wiki_index()
+        query_terms = self._extract_search_terms(query)
+        query_label = str(query).strip()
+        hits: list[dict[str, Any]] = []
+
+        for wiki_document in wiki_index.get("documents") or []:
+            document_id = str(wiki_document.get("document_id") or "").strip()
+            if not document_id:
+                continue
+            entry = entry_by_id.get(document_id)
+            if entry is None or not self._has_matching_llm_wiki_artifact(entry, wiki_document):
+                continue
+
+            quality_status = str(wiki_document.get("quality_status") or "").strip()
+            if quality_status == "failed":
+                continue
+
+            document_name = str(wiki_document.get("document_name") or entry.get("original_name") or "").strip()
+            document_dir = self.llm_wiki_documents_root / document_id
+            candidates = self._load_json_file(document_dir / "knowledge_candidates.json", default=[])
+            if not isinstance(candidates, list):
+                # A file holding e.g. ``null`` or a bare number cannot be
+                # enumerated; treat it as "no candidates" instead of letting
+                # one malformed document abort the whole search.
+                continue
+
+            for position, candidate in enumerate(candidates, start=1):
+                if not isinstance(candidate, dict):
+                    continue
+                title = str(candidate.get("title") or "").strip()
+                content = str(candidate.get("content") or "").strip()
+                tags = [str(item).strip() for item in list(candidate.get("tags") or []) if str(item).strip()]
+                evidence = [
+                    str(item).strip() for item in list(candidate.get("evidence") or []) if str(item).strip()
+                ]
+                score, matched_terms = self._score_knowledge_search_match(
+                    query_text=normalized_query,
+                    query_terms=query_terms,
+                    title=title,
+                    content=content,
+                    tags=tags,
+                    document_name=document_name,
+                    evidence=evidence,
+                )
+                if score <= 0:
+                    continue
+
+                # Fall back to a positional id when the candidate carries none.
+                candidate_id = str(candidate.get("candidate_id") or f"candidate_{position}").strip()
+                hits.append(
+                    {
+                        "code": f"knowledge.{document_id}.{candidate_id}",
+                        "candidate_id": candidate_id,
+                        "title": title or document_name or "制度知识条目",
+                        "content": content,
+                        "excerpt": self._build_search_excerpt(content or title, query_terms),
+                        "document_id": document_id,
+                        "document_name": document_name,
+                        "version": str(wiki_document.get("document_version") or "").strip() or None,
+                        "updated_at": self._format_search_timestamp(wiki_document.get("updated_at")),
+                        "quality_status": quality_status,
+                        "tags": tags,
+                        "evidence": evidence,
+                        "score": score,
+                        "matched_terms": matched_terms,
+                    }
+                )
+
+        self._boost_title_family_hits(hits)
+        ranked_hits = sorted(
+            hits,
+            key=lambda item: (
+                -int(item.get("score") or 0),
+                str(item.get("quality_status") or "") != "formal",
+                str(item.get("title") or ""),
+            ),
+        )[: max(1, limit)]
+
+        if ranked_hits:
+            # Only the two best titles are named in the summary message.
+            titles = "、".join(str(item.get("title") or "") for item in ranked_hits[:2] if str(item.get("title") or "").strip())
+            return {
+                "result_type": "knowledge_search",
+                "query": query_label,
+                "record_count": len(ranked_hits),
+                "hits": ranked_hits,
+                "references": [str(item.get("code") or "").strip() for item in ranked_hits if str(item.get("code") or "").strip()],
+                "message": (
+                    f"已从已归纳制度知识中检索到 {len(ranked_hits)} 条相关内容。"
+                    f"{f'优先参考：{titles}。' if titles else ''}"
+                ),
+            }
+
+        return {
+            "result_type": "knowledge_search",
+            "query": query_label,
+            "record_count": 0,
+            "hits": [],
+            "references": [],
+            "message": (
+                f"当前未在已归纳制度知识中检索到与“{query_label}”直接匹配的内容。"
+                "知识问答仅基于 LLM Wiki 已形成的知识条目回答；当前依据不足，不能继续扩展回答。"
+            ),
+        }
+
+    @staticmethod
+    def _boost_title_family_hits(hits: list[dict[str, Any]]) -> None:
+        """Add 20 points, in place, to the top hit and its same-document siblings.
+
+        The top hit is the one that ranks first by score (descending), then
+        ``formal`` quality, then title. Siblings are hits from the same
+        document whose title starts with the first three characters of the
+        top hit's title. Nothing changes unless at least two hits (the top
+        hit included) belong to that family.
+        """
+        if len(hits) < 2:
+            return
+
+        def rank_key(hit: dict[str, Any]) -> tuple[int, bool, str]:
+            return (
+                -int(hit.get("score") or 0),
+                str(hit.get("quality_status") or "") != "formal",
+                str(hit.get("title") or ""),
+            )
+
+        # min() returns the first minimal element, i.e. the head of a stable sort.
+        leader = min(hits, key=rank_key)
+        leader_title = str(leader.get("title") or "").strip()
+        leader_document = str(leader.get("document_id") or "").strip()
+        if not leader_document or len(leader_title) < 3:
+            return
+
+        prefix = leader_title[:3]
+        siblings = []
+        for hit in hits:
+            if str(hit.get("document_id") or "").strip() != leader_document:
+                continue
+            if str(hit.get("title") or "").strip().startswith(prefix):
+                siblings.append(hit)
+
+        if len(siblings) >= 2:
+            for hit in siblings:
+                hit["score"] = int(hit.get("score") or 0) + 20
+
+
    def extract_document_text(self, document_id: str) -> str:
        self.ensure_library_ready()
        entry = self.get_document_entry(document_id)
@@ -830,6 +997,151 @@ class KnowledgeService:
            if str(item.get("document_id") or "").strip()
        }

+    @staticmethod
+    def _load_json_file(path: Path, *, default: Any) -> Any:
+        """Return the parsed JSON content of *path*, or *default* if unusable.
+
+        *default* is returned when the file does not exist, is not valid
+        UTF-8, or does not contain valid JSON. Other I/O errors propagate.
+        """
+        try:
+            return json.loads(path.read_text(encoding="utf-8"))
+        except (FileNotFoundError, UnicodeDecodeError, json.JSONDecodeError):
+            # An undecodable file is as unusable as a syntactically broken one.
+            return default
+
+    @staticmethod
+    def _load_text_file(path: Path) -> str:
+        """Return the UTF-8 text of *path* with surrounding whitespace removed.
+
+        A missing file yields an empty string.
+        """
+        try:
+            raw_text = path.read_text(encoding="utf-8")
+        except FileNotFoundError:
+            return ""
+        return raw_text.strip()
+
+    @staticmethod
+    def _normalize_search_text(value: Any) -> str:
+        """Reduce *value* to a lower-cased key for substring matching.
+
+        Falsy values become an empty string. Everything except ASCII digits,
+        ASCII lowercase letters and characters in U+4E00..U+9FFF is removed,
+        including whitespace and punctuation.
+        """
+        lowered = str(value or "").strip().lower()
+        searchable = re.sub(r"[^0-9a-z\u4e00-\u9fff]+", "", lowered)
+        return searchable
+
+    @staticmethod
+    def _extract_search_terms(query: str) -> list[str]:
+        """Split *query* into search terms, longest first.
+
+        The query is normalized, then cut into ASCII alphanumeric runs and
+        CJK runs. Runs of two or more characters are kept whole unless they
+        are stop terms. CJK runs additionally contribute every 2- to
+        4-character n-gram that is not a stop term. The result is
+        de-duplicated and sorted by descending length, then lexicographically.
+        """
+        cleaned = KnowledgeService._normalize_search_text(query)
+        if not cleaned:
+            return []
+
+        collected: set[str] = set()
+        for segment in re.findall(r"[0-9a-z]+|[\u4e00-\u9fff]+", cleaned):
+            if len(segment) < 2:
+                continue
+            if segment not in KNOWLEDGE_SEARCH_STOP_TERMS:
+                collected.add(segment)
+            # Only CJK runs are broken down into n-grams.
+            if re.fullmatch(r"[\u4e00-\u9fff]+", segment) is None:
+                continue
+            widest = min(4, len(segment))
+            collected.update(
+                gram
+                for width in range(2, widest + 1)
+                for offset in range(len(segment) - width + 1)
+                if (gram := segment[offset : offset + width]) not in KNOWLEDGE_SEARCH_STOP_TERMS
+            )
+
+        return sorted(collected, key=lambda term: (-len(term), term))
+
+
+    @staticmethod
+    def _score_knowledge_search_match(
+        *,
+        query_text: str,
+        query_terms: list[str],
+        title: str,
+        content: str,
+        tags: list[str],
+        document_name: str,
+        evidence: list[str],
+    ) -> tuple[int, list[str]]:
+        """Score one knowledge candidate against a query.
+
+        Three contributions are summed on the normalized fields:
+
+        * a whole-query bonus: 140 if the query is inside the title, else 120
+          if inside a tag, else 88 if inside the content;
+        * per phrase (title, each tag, document name): 24 plus up to 18 when
+          the phrase is contained in the query, else 16 when the query is
+          contained in the phrase;
+        * per query term longer than one character: the first matching field
+          among title, tags, content, then document name or evidence.
+
+        A positive total gets 4 extra points per distinct matched term,
+        capped at 24.
+
+        Returns:
+            ``(score, matched_terms)`` with at most six distinct matched
+            terms in first-seen order, or ``(0, [])`` when nothing matched.
+        """
+        normalize = KnowledgeService._normalize_search_text
+        title_key = normalize(title)
+        content_key = normalize(content)
+        tag_keys = [normalize(tag) for tag in tags]
+        document_key = normalize(document_name)
+        evidence_keys = [normalize(line) for line in evidence]
+
+        total = 0
+        matches: list[str] = []
+
+        # Whole-query bonus: only the strongest field counts.
+        if query_text:
+            if query_text in title_key:
+                total += 140
+            elif any(query_text in tag for tag in tag_keys):
+                total += 120
+            elif query_text in content_key:
+                total += 88
+
+        # Phrase containment in either direction.
+        for phrase in (title_key, *tag_keys, document_key):
+            if not phrase:
+                continue
+            if phrase in query_text:
+                total += 24 + min(18, 2 * len(phrase))
+                matches.append(phrase)
+            elif query_text and query_text in phrase:
+                total += 16
+
+        # Individual terms: the first field that contains the term wins.
+        for term in query_terms:
+            if len(term) <= 1:
+                continue
+            long_term = len(term) >= 4
+            if term in title_key:
+                gain = 18 if long_term else 14
+            elif any(term in tag for tag in tag_keys):
+                gain = 16 if long_term else 12
+            elif term in content_key:
+                gain = 10 if long_term else 8
+            elif term in document_key or any(term in line for line in evidence_keys):
+                gain = 6
+            else:
+                continue
+            total += gain
+            matches.append(term)
+
+        if total <= 0:
+            return 0, []
+
+        # dict.fromkeys de-duplicates while keeping first-seen order.
+        unique_matches = [item for item in dict.fromkeys(matches) if item]
+        total += min(24, 4 * len(unique_matches))
+        return total, unique_matches[:6]
+
+
+    @staticmethod
+    def _build_search_excerpt(text: str, query_terms: list[str], *, max_length: int = 140) -> str:
+        """Build a short plain-text excerpt of *text* around the first matching term.
+
+        Markdown punctuation is replaced by spaces and whitespace is
+        collapsed. The excerpt is cut around the first term in *query_terms*
+        that can be located in the cleaned text, starting up to 36 characters
+        before it, with ``...`` marking either truncated side. Terms are
+        located case-insensitively, because query terms are lower-cased
+        while the text keeps its original casing. When no term can be
+        located, the head of the text is returned, truncated to *max_length*.
+
+        Args:
+            text: Raw candidate content or title.
+            query_terms: Normalized (lower-cased) search terms, tried in order.
+            max_length: Window size of the excerpt before ellipses are added.
+
+        Returns:
+            The excerpt, or an empty string when *text* has no content.
+        """
+        plain_text = re.sub(r"[#*_`>\-\[\]]+", " ", str(text or ""))
+        plain_text = re.sub(r"\s+", " ", plain_text).strip()
+        if not plain_text:
+            return ""
+
+        normalized_text = KnowledgeService._normalize_search_text(plain_text)
+        for term in query_terms:
+            if not term or term not in normalized_text:
+                continue
+            # A regex search keeps offsets valid for the original-case text.
+            located = re.search(re.escape(term), plain_text, flags=re.IGNORECASE)
+            if located is None:
+                # The term only matches across punctuation that normalization
+                # removed; try the next term.
+                continue
+            raw_index = located.start()
+            start = max(0, raw_index - 36)
+            end = min(len(plain_text), raw_index + max_length - 36)
+            snippet = plain_text[start:end].strip(" ，。；：")
+            if start > 0:
+                snippet = f"...{snippet}"
+            if end < len(plain_text):
+                snippet = f"{snippet}..."
+            return snippet
+
+        if len(plain_text) <= max_length:
+            return plain_text
+        return f"{plain_text[: max_length - 3].rstrip()}..."
+
+
+    @staticmethod
+    def _format_search_timestamp(value: Any) -> str | None:
+        """Render a timestamp as a UTC calendar date (``YYYY-MM-DD``).
+
+        Falsy or blank input yields ``None``. Text that is not ISO-8601 is
+        returned stripped but otherwise unchanged. Naive timestamps are
+        treated as UTC.
+        """
+        stamp_text = str(value or "").strip()
+        if not stamp_text:
+            return None
+        try:
+            moment = datetime.fromisoformat(stamp_text)
+        except ValueError:
+            # Not parseable: hand the raw text back as-is.
+            return stamp_text
+        aware = moment if moment.tzinfo is not None else moment.replace(tzinfo=UTC)
+        return aware.astimezone(UTC).date().isoformat()
+
+
    def _has_ingested_llm_wiki_document(
        self,
        entry: dict[str, Any],