feat: 重构知识库系统，移除Hermes集成，增强RAG和同步功能

主要变更：
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务，增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
2026-05-17 08:38:41 +00:00
parent 212c935308
commit 68f663f2f4
308 changed files with 83729 additions and 13588 deletions
--- a/server/src/app/services/runtime_chat.py
+++ b/server/src/app/services/runtime_chat.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from http import HTTPStatus
+from time import monotonic, sleep
 from typing import Any

 from sqlalchemy.orm import Session
@@ -18,6 +19,12 @@ from app.services.model_connectivity import (
 from app.services.settings import SettingsService

 logger = get_logger("app.services.runtime_chat")
+DEFAULT_RUNTIME_CHAT_TIMEOUT_SECONDS = 45
+DEFAULT_RUNTIME_CHAT_RETRY_ATTEMPTS = 2
+DEFAULT_RUNTIME_CHAT_RETRY_DELAY_SECONDS = 0.6
+DEFAULT_RUNTIME_CHAT_FAILURE_COOLDOWN_SECONDS = 90
+
+_slot_failure_until: dict[str, float] = {}


 class RuntimeChatService:
@@ -32,33 +39,71 @@ class RuntimeChatService:
        slot_priority: tuple[str, ...] = ("main", "backup"),
        max_tokens: int = 500,
        temperature: float = 0.2,
+        timeout_seconds: int | None = None,
+        slot_timeouts: dict[str, int] | None = None,
+        max_attempts: int | None = None,
    ) -> str | None:
-        for slot in slot_priority:
-            config = self._load_chat_slot(slot)
-            if config is None:
-                continue
+        configs = [
+            config
+            for slot in slot_priority
+            if (config := self._load_chat_slot(slot)) is not None
+        ]
+        resolved_timeout_seconds = timeout_seconds or DEFAULT_RUNTIME_CHAT_TIMEOUT_SECONDS
+        resolved_slot_timeouts = dict(slot_timeouts or {})
+        resolved_max_attempts = max_attempts or DEFAULT_RUNTIME_CHAT_RETRY_ATTEMPTS

-            try:
-                response_text = self._request_chat_completion(
-                    config,
-                    messages,
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                )
-            except Exception as exc:
-                logger.warning(
-                    "Runtime chat request failed slot=%s provider=%s: %s",
-                    slot,
-                    config["provider"],
-                    exc,
-                )
-                continue
-
-            if response_text:
-                return response_text.strip()
+        for attempt in range(1, resolved_max_attempts + 1):
+            for config in configs:
+                cache_key = self._build_slot_cache_key(config)
+                if _slot_failure_until.get(cache_key, 0.0) > monotonic():
+                    logger.info(
+                        "Skip runtime chat slot=%s provider=%s because it is in cooldown",
+                        config["slot"],
+                        config["provider"],
+                    )
+                    continue
+                try:
+                    response_text = self._request_chat_completion(
+                        config,
+                        messages,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        timeout_seconds=resolved_slot_timeouts.get(
+                            config["slot"],
+                            resolved_timeout_seconds,
+                        ),
+                    )
+                    if response_text:
+                        _slot_failure_until.pop(cache_key, None)
+                        return response_text.strip()
+                except Exception as exc:
+                    _slot_failure_until[cache_key] = (
+                        monotonic() + DEFAULT_RUNTIME_CHAT_FAILURE_COOLDOWN_SECONDS
+                    )
+                    logger.warning(
+                        "Runtime chat request failed slot=%s provider=%s attempt=%s/%s: %s",
+                        config["slot"],
+                        config["provider"],
+                        attempt,
+                        resolved_max_attempts,
+                        exc,
+                    )
+            if attempt < resolved_max_attempts:
+                sleep(DEFAULT_RUNTIME_CHAT_RETRY_DELAY_SECONDS)

        return None

+    @staticmethod
+    def _build_slot_cache_key(config: dict[str, str]) -> str:
+        return "|".join(
+            [
+                str(config.get("slot") or ""),
+                str(config.get("provider") or ""),
+                str(config.get("endpoint") or ""),
+                str(config.get("model") or ""),
+            ]
+        )
+
    def _load_chat_slot(self, slot: str) -> dict[str, str] | None:
        try:
            config = self.settings_service.get_runtime_model_config(slot)
@@ -95,6 +140,7 @@ class RuntimeChatService:
        *,
        max_tokens: int,
        temperature: float,
+        timeout_seconds: int,
    ) -> str:
        provider = config["provider"]
        endpoint = config["endpoint"]
@@ -109,6 +155,7 @@ class RuntimeChatService:
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
+                timeout_seconds=timeout_seconds,
            )

        if provider == "Ollama":
@@ -119,38 +166,48 @@ class RuntimeChatService:
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
+                timeout_seconds=timeout_seconds,
            )

        return self._request_openai_compatible(
+            provider=provider,
            endpoint=endpoint,
            model=model,
            api_key=api_key,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
+            timeout_seconds=timeout_seconds,
        )

    def _request_openai_compatible(
        self,
        *,
+        provider: str,
        endpoint: str,
        model: str,
        api_key: str,
        messages: list[dict[str, Any]],
        max_tokens: int,
        temperature: float,
+        timeout_seconds: int,
    ) -> str:
        url = _ensure_path(_normalize_endpoint(endpoint), "chat/completions")
+        request_payload: dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+        }
+        if provider == "GLM":
+            request_payload["thinking"] = {"type": "disabled"}
+
        status_code, payload = _send_json_request(
            "POST",
            url,
            headers=_build_headers(api_key=api_key, use_bearer=True),
-            payload={
-                "model": model,
-                "messages": messages,
-                "max_tokens": max_tokens,
-                "temperature": temperature,
-            },
+            payload=request_payload,
+            timeout_seconds=timeout_seconds,
        )
        if status_code >= HTTPStatus.BAD_REQUEST:
            raise ConnectivityCheckError(
@@ -168,6 +225,7 @@ class RuntimeChatService:
        messages: list[dict[str, Any]],
        max_tokens: int,
        temperature: float,
+        timeout_seconds: int,
    ) -> str:
        url = _ensure_path(_normalize_endpoint(endpoint), "api/chat")
        status_code, payload = _send_json_request(
@@ -183,6 +241,7 @@ class RuntimeChatService:
                    "temperature": temperature,
                },
            },
+            timeout_seconds=timeout_seconds,
        )
        if status_code >= HTTPStatus.BAD_REQUEST:
            raise ConnectivityCheckError(
@@ -200,6 +259,7 @@ class RuntimeChatService:
        messages: list[dict[str, Any]],
        max_tokens: int,
        temperature: float,
+        timeout_seconds: int,
    ) -> str:
        deployment_base = _build_azure_deployment_base(endpoint, model)
        url = f"{deployment_base}/chat/completions?api-version={AZURE_API_VERSION}"
@@ -212,6 +272,7 @@ class RuntimeChatService:
                "max_tokens": max_tokens,
                "temperature": temperature,
            },
+            timeout_seconds=timeout_seconds,
        )
        if status_code >= HTTPStatus.BAD_REQUEST:
            raise ConnectivityCheckError(