feat: 重构知识库系统,移除Hermes集成,增强RAG和同步功能
主要变更:
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务,增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from http import HTTPStatus
|
||||
from time import monotonic, sleep
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
@@ -18,6 +19,12 @@ from app.services.model_connectivity import (
|
||||
from app.services.settings import SettingsService
|
||||
|
||||
logger = get_logger("app.services.runtime_chat")
|
||||
# Per-request timeout, in seconds, used when the caller supplies no timeout.
DEFAULT_RUNTIME_CHAT_TIMEOUT_SECONDS = 45
# Number of attempts made when the caller supplies no attempt limit.
DEFAULT_RUNTIME_CHAT_RETRY_ATTEMPTS = 2
# Pause, in seconds, taken while further attempts still remain.
DEFAULT_RUNTIME_CHAT_RETRY_DELAY_SECONDS = 0.6
# How long, in seconds, a slot is skipped after one of its requests raises.
DEFAULT_RUNTIME_CHAT_FAILURE_COOLDOWN_SECONDS = 90

# Slot cache key -> time.monotonic() deadline before which the slot is skipped.
# Module-level, so the cooldown state is shared across the whole process.
_slot_failure_until: dict[str, float] = {}
|
||||
|
||||
|
||||
class RuntimeChatService:
|
||||
@@ -32,33 +39,71 @@ class RuntimeChatService:
|
||||
slot_priority: tuple[str, ...] = ("main", "backup"),
|
||||
max_tokens: int = 500,
|
||||
temperature: float = 0.2,
|
||||
timeout_seconds: int | None = None,
|
||||
slot_timeouts: dict[str, int] | None = None,
|
||||
max_attempts: int | None = None,
|
||||
) -> str | None:
|
||||
for slot in slot_priority:
|
||||
config = self._load_chat_slot(slot)
|
||||
if config is None:
|
||||
continue
|
||||
configs = [
|
||||
config
|
||||
for slot in slot_priority
|
||||
if (config := self._load_chat_slot(slot)) is not None
|
||||
]
|
||||
resolved_timeout_seconds = timeout_seconds or DEFAULT_RUNTIME_CHAT_TIMEOUT_SECONDS
|
||||
resolved_slot_timeouts = dict(slot_timeouts or {})
|
||||
resolved_max_attempts = max_attempts or DEFAULT_RUNTIME_CHAT_RETRY_ATTEMPTS
|
||||
|
||||
try:
|
||||
response_text = self._request_chat_completion(
|
||||
config,
|
||||
messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Runtime chat request failed slot=%s provider=%s: %s",
|
||||
slot,
|
||||
config["provider"],
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
|
||||
if response_text:
|
||||
return response_text.strip()
|
||||
for attempt in range(1, resolved_max_attempts + 1):
|
||||
for config in configs:
|
||||
cache_key = self._build_slot_cache_key(config)
|
||||
if _slot_failure_until.get(cache_key, 0.0) > monotonic():
|
||||
logger.info(
|
||||
"Skip runtime chat slot=%s provider=%s because it is in cooldown",
|
||||
config["slot"],
|
||||
config["provider"],
|
||||
)
|
||||
continue
|
||||
try:
|
||||
response_text = self._request_chat_completion(
|
||||
config,
|
||||
messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
timeout_seconds=resolved_slot_timeouts.get(
|
||||
config["slot"],
|
||||
resolved_timeout_seconds,
|
||||
),
|
||||
)
|
||||
if response_text:
|
||||
_slot_failure_until.pop(cache_key, None)
|
||||
return response_text.strip()
|
||||
except Exception as exc:
|
||||
_slot_failure_until[cache_key] = (
|
||||
monotonic() + DEFAULT_RUNTIME_CHAT_FAILURE_COOLDOWN_SECONDS
|
||||
)
|
||||
logger.warning(
|
||||
"Runtime chat request failed slot=%s provider=%s attempt=%s/%s: %s",
|
||||
config["slot"],
|
||||
config["provider"],
|
||||
attempt,
|
||||
resolved_max_attempts,
|
||||
exc,
|
||||
)
|
||||
if attempt < resolved_max_attempts:
|
||||
sleep(DEFAULT_RUNTIME_CHAT_RETRY_DELAY_SECONDS)
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _build_slot_cache_key(config: dict[str, str]) -> str:
|
||||
return "|".join(
|
||||
[
|
||||
str(config.get("slot") or ""),
|
||||
str(config.get("provider") or ""),
|
||||
str(config.get("endpoint") or ""),
|
||||
str(config.get("model") or ""),
|
||||
]
|
||||
)
|
||||
|
||||
def _load_chat_slot(self, slot: str) -> dict[str, str] | None:
|
||||
try:
|
||||
config = self.settings_service.get_runtime_model_config(slot)
|
||||
@@ -95,6 +140,7 @@ class RuntimeChatService:
|
||||
*,
|
||||
max_tokens: int,
|
||||
temperature: float,
|
||||
timeout_seconds: int,
|
||||
) -> str:
|
||||
provider = config["provider"]
|
||||
endpoint = config["endpoint"]
|
||||
@@ -109,6 +155,7 @@ class RuntimeChatService:
|
||||
messages=messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
|
||||
if provider == "Ollama":
|
||||
@@ -119,38 +166,48 @@ class RuntimeChatService:
|
||||
messages=messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
|
||||
return self._request_openai_compatible(
|
||||
provider=provider,
|
||||
endpoint=endpoint,
|
||||
model=model,
|
||||
api_key=api_key,
|
||||
messages=messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
|
||||
def _request_openai_compatible(
|
||||
self,
|
||||
*,
|
||||
provider: str,
|
||||
endpoint: str,
|
||||
model: str,
|
||||
api_key: str,
|
||||
messages: list[dict[str, Any]],
|
||||
max_tokens: int,
|
||||
temperature: float,
|
||||
timeout_seconds: int,
|
||||
) -> str:
|
||||
url = _ensure_path(_normalize_endpoint(endpoint), "chat/completions")
|
||||
request_payload: dict[str, Any] = {
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature,
|
||||
}
|
||||
if provider == "GLM":
|
||||
request_payload["thinking"] = {"type": "disabled"}
|
||||
|
||||
status_code, payload = _send_json_request(
|
||||
"POST",
|
||||
url,
|
||||
headers=_build_headers(api_key=api_key, use_bearer=True),
|
||||
payload={
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature,
|
||||
},
|
||||
payload=request_payload,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
if status_code >= HTTPStatus.BAD_REQUEST:
|
||||
raise ConnectivityCheckError(
|
||||
@@ -168,6 +225,7 @@ class RuntimeChatService:
|
||||
messages: list[dict[str, Any]],
|
||||
max_tokens: int,
|
||||
temperature: float,
|
||||
timeout_seconds: int,
|
||||
) -> str:
|
||||
url = _ensure_path(_normalize_endpoint(endpoint), "api/chat")
|
||||
status_code, payload = _send_json_request(
|
||||
@@ -183,6 +241,7 @@ class RuntimeChatService:
|
||||
"temperature": temperature,
|
||||
},
|
||||
},
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
if status_code >= HTTPStatus.BAD_REQUEST:
|
||||
raise ConnectivityCheckError(
|
||||
@@ -200,6 +259,7 @@ class RuntimeChatService:
|
||||
messages: list[dict[str, Any]],
|
||||
max_tokens: int,
|
||||
temperature: float,
|
||||
timeout_seconds: int,
|
||||
) -> str:
|
||||
deployment_base = _build_azure_deployment_base(endpoint, model)
|
||||
url = f"{deployment_base}/chat/completions?api-version={AZURE_API_VERSION}"
|
||||
@@ -212,6 +272,7 @@ class RuntimeChatService:
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature,
|
||||
},
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
if status_code >= HTTPStatus.BAD_REQUEST:
|
||||
raise ConnectivityCheckError(
|
||||
|
||||
Reference in New Issue
Block a user