feat: 重构知识库系统,移除Hermes集成,增强RAG和同步功能

主要变更:
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务,增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
This commit is contained in:
caoxiaozhu
2026-05-17 08:38:41 +00:00
parent 212c935308
commit 68f663f2f4
308 changed files with 83729 additions and 13588 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
from http import HTTPStatus
from time import monotonic, sleep
from typing import Any
from sqlalchemy.orm import Session
@@ -18,6 +19,12 @@ from app.services.model_connectivity import (
from app.services.settings import SettingsService
logger = get_logger("app.services.runtime_chat")
# Request timeout (seconds) used when the caller passes no (or a falsy)
# timeout_seconds and no per-slot override exists in slot_timeouts.
DEFAULT_RUNTIME_CHAT_TIMEOUT_SECONDS = 45
# Number of passes made over the configured slots when max_attempts is not
# given (or is falsy).
DEFAULT_RUNTIME_CHAT_RETRY_ATTEMPTS = 2
# Pause (seconds) between one pass over the slots and the next.
DEFAULT_RUNTIME_CHAT_RETRY_DELAY_SECONDS = 0.6
# How long (seconds) a slot is skipped after one of its requests raises.
# NOTE(review): this is much longer than the retry delay above, so a slot that
# raised on one pass looks like it will still be in cooldown on the next pass
# of the same call -- confirm that is intended.
DEFAULT_RUNTIME_CHAT_FAILURE_COOLDOWN_SECONDS = 90
# Module-level map: slot cache key -> time.monotonic() value until which the
# slot is skipped. Entries are set on failure and removed on a successful reply.
_slot_failure_until: dict[str, float] = {}
class RuntimeChatService:
@@ -32,33 +39,71 @@ class RuntimeChatService:
slot_priority: tuple[str, ...] = ("main", "backup"),
max_tokens: int = 500,
temperature: float = 0.2,
timeout_seconds: int | None = None,
slot_timeouts: dict[str, int] | None = None,
max_attempts: int | None = None,
) -> str | None:
for slot in slot_priority:
config = self._load_chat_slot(slot)
if config is None:
continue
configs = [
config
for slot in slot_priority
if (config := self._load_chat_slot(slot)) is not None
]
resolved_timeout_seconds = timeout_seconds or DEFAULT_RUNTIME_CHAT_TIMEOUT_SECONDS
resolved_slot_timeouts = dict(slot_timeouts or {})
resolved_max_attempts = max_attempts or DEFAULT_RUNTIME_CHAT_RETRY_ATTEMPTS
try:
response_text = self._request_chat_completion(
config,
messages,
max_tokens=max_tokens,
temperature=temperature,
)
except Exception as exc:
logger.warning(
"Runtime chat request failed slot=%s provider=%s: %s",
slot,
config["provider"],
exc,
)
continue
if response_text:
return response_text.strip()
for attempt in range(1, resolved_max_attempts + 1):
for config in configs:
cache_key = self._build_slot_cache_key(config)
if _slot_failure_until.get(cache_key, 0.0) > monotonic():
logger.info(
"Skip runtime chat slot=%s provider=%s because it is in cooldown",
config["slot"],
config["provider"],
)
continue
try:
response_text = self._request_chat_completion(
config,
messages,
max_tokens=max_tokens,
temperature=temperature,
timeout_seconds=resolved_slot_timeouts.get(
config["slot"],
resolved_timeout_seconds,
),
)
if response_text:
_slot_failure_until.pop(cache_key, None)
return response_text.strip()
except Exception as exc:
_slot_failure_until[cache_key] = (
monotonic() + DEFAULT_RUNTIME_CHAT_FAILURE_COOLDOWN_SECONDS
)
logger.warning(
"Runtime chat request failed slot=%s provider=%s attempt=%s/%s: %s",
config["slot"],
config["provider"],
attempt,
resolved_max_attempts,
exc,
)
if attempt < resolved_max_attempts:
sleep(DEFAULT_RUNTIME_CHAT_RETRY_DELAY_SECONDS)
return None
@staticmethod
def _build_slot_cache_key(config: dict[str, str]) -> str:
return "|".join(
[
str(config.get("slot") or ""),
str(config.get("provider") or ""),
str(config.get("endpoint") or ""),
str(config.get("model") or ""),
]
)
def _load_chat_slot(self, slot: str) -> dict[str, str] | None:
try:
config = self.settings_service.get_runtime_model_config(slot)
@@ -95,6 +140,7 @@ class RuntimeChatService:
*,
max_tokens: int,
temperature: float,
timeout_seconds: int,
) -> str:
provider = config["provider"]
endpoint = config["endpoint"]
@@ -109,6 +155,7 @@ class RuntimeChatService:
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
timeout_seconds=timeout_seconds,
)
if provider == "Ollama":
@@ -119,38 +166,48 @@ class RuntimeChatService:
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
timeout_seconds=timeout_seconds,
)
return self._request_openai_compatible(
provider=provider,
endpoint=endpoint,
model=model,
api_key=api_key,
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
timeout_seconds=timeout_seconds,
)
def _request_openai_compatible(
self,
*,
provider: str,
endpoint: str,
model: str,
api_key: str,
messages: list[dict[str, Any]],
max_tokens: int,
temperature: float,
timeout_seconds: int,
) -> str:
url = _ensure_path(_normalize_endpoint(endpoint), "chat/completions")
request_payload: dict[str, Any] = {
"model": model,
"messages": messages,
"max_tokens": max_tokens,
"temperature": temperature,
}
if provider == "GLM":
request_payload["thinking"] = {"type": "disabled"}
status_code, payload = _send_json_request(
"POST",
url,
headers=_build_headers(api_key=api_key, use_bearer=True),
payload={
"model": model,
"messages": messages,
"max_tokens": max_tokens,
"temperature": temperature,
},
payload=request_payload,
timeout_seconds=timeout_seconds,
)
if status_code >= HTTPStatus.BAD_REQUEST:
raise ConnectivityCheckError(
@@ -168,6 +225,7 @@ class RuntimeChatService:
messages: list[dict[str, Any]],
max_tokens: int,
temperature: float,
timeout_seconds: int,
) -> str:
url = _ensure_path(_normalize_endpoint(endpoint), "api/chat")
status_code, payload = _send_json_request(
@@ -183,6 +241,7 @@ class RuntimeChatService:
"temperature": temperature,
},
},
timeout_seconds=timeout_seconds,
)
if status_code >= HTTPStatus.BAD_REQUEST:
raise ConnectivityCheckError(
@@ -200,6 +259,7 @@ class RuntimeChatService:
messages: list[dict[str, Any]],
max_tokens: int,
temperature: float,
timeout_seconds: int,
) -> str:
deployment_base = _build_azure_deployment_base(endpoint, model)
url = f"{deployment_base}/chat/completions?api-version={AZURE_API_VERSION}"
@@ -212,6 +272,7 @@ class RuntimeChatService:
"max_tokens": max_tokens,
"temperature": temperature,
},
timeout_seconds=timeout_seconds,
)
if status_code >= HTTPStatus.BAD_REQUEST:
raise ConnectivityCheckError(