Files
X-Financial/server/src/app/services/knowledge_sync.py
caoxiaozhu 68f663f2f4 feat: 重构知识库系统,移除Hermes集成,增强RAG和同步功能
主要变更:
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务,增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
2026-05-17 08:38:41 +00:00

245 lines
9.2 KiB
Python

from __future__ import annotations
from dataclasses import dataclass, field
from datetime import UTC, datetime, timedelta
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentName, AgentPermissionLevel, AgentRunSource, AgentRunStatus
from app.models.agent_asset import AgentAsset
from app.services.agent_runs import AgentRunService
from app.services.knowledge import (
KNOWLEDGE_INGEST_STATUS_FAILED,
KNOWLEDGE_INGEST_STATUS_SYNCING,
KnowledgeService,
)
from app.services.knowledge_index_tasks import knowledge_index_task_manager
# Folder label used when a sync request names no specific folder (Chinese for
# "all knowledge bases"); it is stored on the run and compared during reuse checks.
ALL_KNOWLEDGE_FOLDERS_LABEL = "全部知识库"
@dataclass(slots=True)
class KnowledgeSyncDispatchResult:
    """Outcome of a request to queue a knowledge-index sync.

    Attributes:
        ok: Success flag; defaults to ``True``.
        agent_run_id: Identifier of the agent run handling the sync, or an
            empty string when no run is associated with the result.
        folder: Label of the folder the request covered.
        document_ids: Identifiers of the documents covered by the run.
        queued_at: Timestamp for the request; defaults to the current UTC
            time at construction.
        status: Run status value; defaults to the "succeeded" status.
        summary: Human-readable description of what happened.
        reused: ``True`` when an already-running job was returned instead of
            a new one being queued.
    """

    ok: bool = True
    agent_run_id: str = ""
    folder: str = ""
    # A factory is required so instances never share one list object.
    document_ids: list[str] = field(default_factory=list)
    # Evaluated per instance, so each result carries its own creation time.
    queued_at: datetime = field(default_factory=lambda: datetime.now(tz=UTC))
    status: str = AgentRunStatus.SUCCEEDED.value
    summary: str = ""
    reused: bool = False
class KnowledgeSyncDispatchService:
    """Queue knowledge-index sync jobs, reusing a running job when one matches.

    A sync request is resolved to a list of target documents. If a running
    sync job already covers them it is returned as-is; otherwise a new agent
    run is recorded, the documents are flagged as syncing, and the job is
    handed to ``knowledge_index_task_manager``.
    """

    # Value stored under ``route_json["job_type"]`` for runs created here and
    # used to recognise them again when scanning running runs.
    _JOB_TYPE = "knowledge_index_sync"
    # A running job whose last heartbeat (or start time, when no heartbeat was
    # recorded) is older than this is marked failed instead of being reused.
    STALE_RUN_TIMEOUT = timedelta(minutes=30)
    # Maximum number of running runs inspected when looking for a reusable job.
    ACTIVE_RUN_SCAN_LIMIT = 100

    def __init__(self, db: Session) -> None:
        """Bind the service and its collaborators to one database session."""
        self.db = db
        self.run_service = AgentRunService(db)
        self.knowledge_service = KnowledgeService(db=db)

    def queue_sync(
        self,
        *,
        current_user: CurrentUserContext,
        folder: str | None = None,
        document_ids: list[str] | None = None,
        source: str = AgentRunSource.USER_MESSAGE.value,
        force: bool = False,
        changed_only: bool = True,
    ) -> KnowledgeSyncDispatchResult:
        """Queue a background knowledge-index sync, or reuse a running one.

        Args:
            current_user: Requesting user; the username and name are recorded
                on the run and the context is passed to the task manager.
            folder: Folder to sync. ``None`` or a blank string means no folder
                filter; the run is then labelled ``ALL_KNOWLEDGE_FOLDERS_LABEL``.
            document_ids: Optional document identifiers forwarded to
                ``KnowledgeService.list_documents_for_ingest``; blank entries
                are dropped first.
            source: Run source value recorded on the new agent run.
            force: When true, ``changed_only`` is ignored (treated as false)
                and the flag is forwarded to the background task.
            changed_only: Forwarded to ``list_documents_for_ingest`` when
                selecting the documents to sync.

        Returns:
            A result describing the new run, the reused run, or a no-op when
            there is nothing to sync.

        Raises:
            Exception: Any error raised while flagging documents or submitting
                the task is re-raised after the run and the documents have
                been marked failed.
        """
        normalized_folder = str(folder or "").strip() or None
        folder_label = normalized_folder or ALL_KNOWLEDGE_FOLDERS_LABEL
        requested_ids = self._normalize_ids(document_ids)
        # ``force`` overrides the incremental filter.
        effective_changed_only = False if force else changed_only

        # The unfiltered listing distinguishes "nothing in scope" from
        # "nothing changed" in the summaries below.
        all_documents = self.knowledge_service.list_documents_for_ingest(
            folder=normalized_folder,
            document_ids=requested_ids,
            changed_only=False,
        )
        target_documents = self.knowledge_service.list_documents_for_ingest(
            folder=normalized_folder,
            document_ids=requested_ids,
            changed_only=effective_changed_only,
        )
        target_document_ids = self._normalize_ids(
            [item.get("id") or "" for item in target_documents]
        )

        if not all_documents:
            return KnowledgeSyncDispatchResult(
                folder=folder_label,
                document_ids=[],
                status=AgentRunStatus.SUCCEEDED.value,
                summary="当前目录暂无可归纳的知识文档。",
            )
        if not target_document_ids:
            return KnowledgeSyncDispatchResult(
                folder=folder_label,
                document_ids=[],
                status=AgentRunStatus.SUCCEEDED.value,
                summary="当前目录没有需要增量归纳的文档。",
            )

        active_run = self._find_active_run(
            folder=folder_label,
            requested_document_ids=target_document_ids,
        )
        if active_run is not None:
            # Guard against a run persisted without route metadata.
            active_route = active_run.route_json or {}
            active_document_ids = self._normalize_ids(
                active_route.get("requested_document_ids") or target_document_ids
            )
            return KnowledgeSyncDispatchResult(
                agent_run_id=active_run.run_id,
                folder=folder_label,
                document_ids=active_document_ids,
                queued_at=active_run.started_at,
                status=active_run.status,
                summary="已有知识归纳任务正在执行,系统已复用当前任务。",
                reused=True,
            )

        task_asset = self.db.scalar(
            select(AgentAsset).where(AgentAsset.code == "task.hermes.knowledge_index_sync")
        )
        run = self.run_service.create_run(
            agent=AgentName.HERMES.value,
            source=source,
            user_id=current_user.username,
            task_id=task_asset.id if task_asset is not None else None,
            permission_level=AgentPermissionLevel.READ.value,
            status=AgentRunStatus.RUNNING.value,
            result_summary="知识归纳任务已入队,等待后台执行。",
            route_json={
                "job_type": self._JOB_TYPE,
                "phase": "queued",
                "folder": folder_label,
                "force": force,
                "changed_only": effective_changed_only,
                "requested_document_ids": target_document_ids,
                "requested_by_username": current_user.username,
                "requested_by_name": current_user.name,
                "progress": {
                    "total_documents": len(target_document_ids),
                    "completed_documents": 0,
                    "failed_documents": 0,
                    "skipped_documents": 0,
                    "percent": 0,
                },
            },
        )
        try:
            self.knowledge_service.set_document_ingest_statuses(
                target_document_ids,
                status_code=KNOWLEDGE_INGEST_STATUS_SYNCING,
                agent_run_id=run.run_id,
            )
            knowledge_index_task_manager.submit_sync(
                agent_run_id=run.run_id,
                folder=folder_label,
                current_user=current_user,
                document_ids=target_document_ids,
                force=force,
            )
            return KnowledgeSyncDispatchResult(
                agent_run_id=run.run_id,
                folder=folder_label,
                document_ids=target_document_ids,
                queued_at=run.started_at,
                status=run.status,
                summary="知识归纳任务已进入后台执行,可在日志管理中查看进度。",
            )
        except Exception as exc:
            # Record the failure on the run, and still mark the documents as
            # failed if that update itself raises, so they are not left in
            # the syncing status.
            try:
                self.run_service.update_run(
                    run.run_id,
                    status=AgentRunStatus.FAILED.value,
                    error_message=str(exc),
                    result_summary=str(exc),
                    finished_at=datetime.now(UTC),
                )
            finally:
                self.knowledge_service.set_document_ingest_statuses(
                    target_document_ids,
                    status_code=KNOWLEDGE_INGEST_STATUS_FAILED,
                    agent_run_id=run.run_id,
                )
            raise

    def _find_active_run(
        self,
        *,
        folder: str,
        requested_document_ids: list[str],
    ):
        """Return a running sync job that already covers this request, if any.

        Running runs whose last heartbeat (or start time) is older than
        ``STALE_RUN_TIMEOUT`` are marked failed along the way and skipped.

        Args:
            folder: Folder label of the incoming request.
            requested_document_ids: Document identifiers the request targets.

        Returns:
            The matching run object from ``AgentRunService.list_runs``, or
            ``None`` when no running job matches.
        """
        requested_set = set(self._normalize_ids(requested_document_ids))
        running_runs = self.run_service.list_runs(
            agent=AgentName.HERMES.value,
            status=AgentRunStatus.RUNNING.value,
            limit=self.ACTIVE_RUN_SCAN_LIMIT,
        )
        # One reference instant so every run is judged against the same clock.
        now = datetime.now(UTC)
        for item in running_runs:
            # Tolerate runs persisted without route metadata.
            route = item.route_json or {}
            if str(route.get("job_type") or "").strip() != self._JOB_TYPE:
                continue
            run_document_ids = self._normalize_ids(route.get("requested_document_ids"))
            if now - self._last_seen_at(item, route) > self.STALE_RUN_TIMEOUT:
                self._fail_stale_run(item, run_document_ids)
                continue
            active_ids = set(run_document_ids)
            active_folder = str(route.get("folder") or "").strip()
            if active_folder == ALL_KNOWLEDGE_FOLDERS_LABEL:
                # An all-folders job is reused only when it shares documents
                # with the request (or the request names none).
                if not requested_set or active_ids & requested_set:
                    return item
                continue
            if active_folder == folder:
                # A same-folder job that recorded no document list also matches.
                if not requested_set or not active_ids or active_ids & requested_set:
                    return item
        return None

    @staticmethod
    def _normalize_ids(values: list[object] | None) -> list[str]:
        """Return each value as a stripped string, dropping blank entries."""
        stripped = (str(value).strip() for value in values or ())
        return [value for value in stripped if value]

    @staticmethod
    def _last_seen_at(run, route: dict) -> datetime:
        """Return the run's last heartbeat, or its start time, as an aware datetime."""
        heartbeat_raw = str(route.get("heartbeat_at") or "").strip()
        heartbeat_at = None
        if heartbeat_raw:
            try:
                heartbeat_at = datetime.fromisoformat(heartbeat_raw)
            except ValueError:
                # An unparseable heartbeat falls back to the start time.
                heartbeat_at = None
        last_seen_at = heartbeat_at or run.started_at
        if last_seen_at.tzinfo is None:
            # Naive timestamps are interpreted as UTC so they can be compared
            # with the aware "now".
            last_seen_at = last_seen_at.replace(tzinfo=UTC)
        return last_seen_at

    def _fail_stale_run(self, run, document_ids: list[str]) -> None:
        """Mark a timed-out run and the documents it listed as failed."""
        if document_ids:
            self.knowledge_service.set_document_ingest_statuses(
                document_ids,
                status_code=KNOWLEDGE_INGEST_STATUS_FAILED,
                agent_run_id=run.run_id,
            )
        # A single timestamp keeps the recorded heartbeat and finish time equal.
        now = datetime.now(UTC)
        self.run_service.merge_route_json(
            run.run_id,
            {
                "phase": "stale_failed",
                "heartbeat_at": now.isoformat(),
            },
            status=AgentRunStatus.FAILED.value,
            result_summary="知识归纳任务长时间无心跳,系统已自动标记失败。",
            error_message="Knowledge index heartbeat timed out.",
            finished_at=now,
        )