Files
X-Agents/server/internal/service/knowledge_service.go
DESKTOP-72TV0V4\caoxiaozhu 4a7199de93 feat: 完善后端知识库服务和配置
- 优化 AI-Core 客户端调用
- 添加更多知识库配置选项
- 完善文档解析逻辑

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 15:42:42 +08:00

528 lines
15 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"bytes"
"encoding/json"
"log"
"mime/multipart"
"net/http"
"os"
"strings"
"time"
"github.com/google/uuid"
"x-agents/server/internal/model"
"x-agents/server/internal/repository"
)
// debugLog 专用调试日志
var knowledgeDebugLog *log.Logger
func init() {
debugFile, _ := os.OpenFile("logs/debug.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
knowledgeDebugLog = log.New(debugFile, "", log.Ldate|log.Ltime)
}
type KnowledgeService struct {
repo *repository.KnowledgeRepository
modelRepo *repository.ModelRepository
uploadService *UploadService
pythonServiceURL string
aiCoreClient *AICoreClient
markdownLocalPath string // Markdown 本地存储路径
}
func NewKnowledgeService(repo *repository.KnowledgeRepository, modelRepo *repository.ModelRepository, uploadService *UploadService, pythonServiceURL, aiCoreServiceAddr, markdownLocalPath string) *KnowledgeService {
aiCoreClient, _ := NewAICoreClient(aiCoreServiceAddr)
return &KnowledgeService{
repo: repo,
modelRepo: modelRepo,
uploadService: uploadService,
pythonServiceURL: pythonServiceURL,
aiCoreClient: aiCoreClient,
markdownLocalPath: markdownLocalPath,
}
}
// Create 创建知识库
func (s *KnowledgeService) Create(req model.CreateKnowledgeRequest) (*model.KnowledgeBase, error) {
// 验证 LLM 模型存在
if _, err := s.modelRepo.FindByID(req.LLMModelID); err != nil {
return nil, err
}
// 验证 Embedding 模型存在
if _, err := s.modelRepo.FindByID(req.EmbeddingModelID); err != nil {
return nil, err
}
kb := &model.KnowledgeBase{
ID: uuid.New().String(),
Name: req.Name,
Description: req.Description,
LLMModelID: req.LLMModelID,
EmbeddingModelID: req.EmbeddingModelID,
ParsingConfig: req.ParsingConfig,
StorageConfig: req.StorageConfig,
Status: "active",
DocumentCount: 0,
ChunkCount: 0,
}
// 设置默认值
if kb.ParsingConfig.Engine == "" {
kb.ParsingConfig.Engine = "markitdown"
}
if kb.ParsingConfig.EnablePDF != false {
kb.ParsingConfig.EnablePDF = true
}
if kb.ParsingConfig.Pandoc != false {
kb.ParsingConfig.Pandoc = true
}
if err := s.repo.Create(kb); err != nil {
return nil, err
}
return kb, nil
}
// List 获取知识库列表
func (s *KnowledgeService) List() ([]model.KnowledgeBase, error) {
return s.repo.FindAll()
}
// GetByID 获取知识库详情
func (s *KnowledgeService) GetByID(id string) (*model.KnowledgeBase, error) {
return s.repo.FindByID(id)
}
// Update 更新知识库
func (s *KnowledgeService) Update(id string, req model.UpdateKnowledgeRequest) error {
updates := make(map[string]interface{})
if req.Name != "" {
updates["name"] = req.Name
}
if req.Description != "" {
updates["description"] = req.Description
}
if req.LLMModelID != "" {
// 验证模型存在
if _, err := s.modelRepo.FindByID(req.LLMModelID); err != nil {
return err
}
updates["llm_model_id"] = req.LLMModelID
}
if req.EmbeddingModelID != "" {
// 验证模型存在
if _, err := s.modelRepo.FindByID(req.EmbeddingModelID); err != nil {
return err
}
updates["embedding_model_id"] = req.EmbeddingModelID
}
if req.ParsingConfig.Engine != "" {
updates["parsing_config"] = req.ParsingConfig
}
if req.StorageConfig.Type != "" {
updates["storage_config"] = req.StorageConfig
}
if req.Status != "" {
updates["status"] = req.Status
}
return s.repo.Update(id, updates)
}
// Delete 删除知识库
func (s *KnowledgeService) Delete(id string) error {
// 先删除关联的文档
if err := s.repo.DeleteDocumentsByKBID(id); err != nil {
return err
}
return s.repo.Delete(id)
}
// ListDocuments 获取知识库下的文档列表
func (s *KnowledgeService) ListDocuments(kbID string, status string) ([]model.KnowledgeDocument, error) {
docs, err := s.repo.FindDocumentsByKBID(kbID, status)
if err != nil {
knowledgeDebugLog.Printf("[Knowledge ListDocuments] 错误: %v", err)
return nil, err
}
knowledgeDebugLog.Printf("[Knowledge ListDocuments] kbID=%s, count=%d", kbID, len(docs))
for i, doc := range docs {
knowledgeDebugLog.Printf("[Knowledge ListDocuments] doc[%d]: id=%s, name=%s, file_url=%q", i, doc.ID, doc.Name, doc.FileURL)
// 如果是 MinIO 内网地址,转换为代理 URL
if doc.FileURL != "" && (strings.Contains(doc.FileURL, "10.10.10.189") || strings.Contains(doc.FileURL, ":9768")) {
// 从 URL 中提取 fileKey包含扩展名
parts := strings.Split(doc.FileURL, "/")
if len(parts) > 0 {
fileName := parts[len(parts)-1]
fileKey := strings.Split(fileName, "?")[0]
doc.FileURL = "/api/file_proxy?kb_id=" + kbID + "&key=" + fileKey
}
}
}
return docs, nil
}
// UploadDocument 上传文档到知识库
func (s *KnowledgeService) UploadDocument(kbID string, file *multipart.FileHeader) (*model.KnowledgeDocument, string, error) {
knowledgeDebugLog.Printf("[Knowledge Upload] 开始上传文件: kbID=%s, filename=%s, size=%d", kbID, file.Filename, file.Size)
// 验证知识库存在
kb, err := s.repo.FindByID(kbID)
if err != nil {
knowledgeDebugLog.Printf("[Knowledge Upload] 错误: 知识库不存在, kbID=%s, err=%v", kbID, err)
return nil, "", err
}
knowledgeDebugLog.Printf("[Knowledge Upload] 知识库配置: kbID=%s, storage_config.Type=%q, storage_config=%+v",
kbID, kb.StorageConfig.Type, kb.StorageConfig)
// 上传文件(根据知识库的 storage_config 选择存储方式)
var result *UploadResponse
if kb.StorageConfig.Type != "" {
// 使用知识库的存储配置
knowledgeDebugLog.Printf("[Knowledge Upload] 使用知识库存储配置: type=%s, endpoint=%s, bucket=%s",
kb.StorageConfig.Type, kb.StorageConfig.Endpoint, kb.StorageConfig.Bucket)
result, err = s.uploadService.UploadWithConfig(file, kb.StorageConfig)
} else {
// 使用全局配置
knowledgeDebugLog.Printf("[Knowledge Upload] 使用全局存储配置")
result, err = s.uploadService.Upload(file)
}
if err != nil {
knowledgeDebugLog.Printf("[Knowledge Upload] 错误: 上传失败, err=%v", err)
return nil, "", err
}
knowledgeDebugLog.Printf("[Knowledge Upload] 上传结果: success=%v, url=%s, message=%s",
result.Success, result.URL, result.Message)
if !result.Success {
knowledgeDebugLog.Printf("[Knowledge Upload] 错误: 上传返回失败, message=%s", result.Message)
return nil, "", nil
}
// 获取文件扩展名
ext := getFileExt(file.Filename)
knowledgeDebugLog.Printf("[Knowledge Upload] 准备创建文档记录: result.URL=%q, fileKey=%s, ext=%s",
result.URL, result.FileKey, ext)
// 创建文档记录
doc := &model.KnowledgeDocument{
ID: uuid.New().String(),
KnowledgeBaseID: kbID,
Name: file.Filename,
FileKey: result.FileKey + ext,
FileURL: result.URL,
FileSize: file.Size,
Status: "parsing",
UploadedAt: time.Now(),
}
knowledgeDebugLog.Printf("[Knowledge Upload] 文档创建完成: doc.FileURL=%q", doc.FileURL)
if err := s.repo.CreateDocument(doc); err != nil {
return nil, "", err
}
// 更新知识库文档数
s.updateDocumentCount(kbID)
// 异步调用 Python 服务解析文档
go s.parseDocument(kbID, doc.ID, result.URL, kb.ParsingConfig)
// 异步调用 AI-Core gRPC 服务解析文档(获取 Markdown
go s.parseDocumentWithAICore(doc.ID, result.URL, doc.Name)
return doc, result.URL, nil
}
// parseDocument 调用 Python 服务解析文档
func (s *KnowledgeService) parseDocument(kbID, docID, fileURL string, config model.ParsingConfig) {
// 构建请求
reqBody := map[string]interface{}{
"file_url": fileURL,
"engine": config.Engine,
}
if config.Engine == "docling" && config.DoclingURL != "" {
reqBody["docling_url"] = config.DoclingURL
}
body, _ := json.Marshal(reqBody)
resp, err := http.Post(s.pythonServiceURL+"/parse", "application/json", bytes.NewBuffer(body))
if err != nil {
s.repo.UpdateDocument(docID, map[string]interface{}{"status": "failed"})
return
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
s.repo.UpdateDocument(docID, map[string]interface{}{"status": "failed"})
return
}
// 解析响应
var result map[string]interface{}
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
s.repo.UpdateDocument(docID, map[string]interface{}{"status": "failed"})
return
}
if success, ok := result["success"].(bool); ok && success {
// 解析成功,更新状态
chunks := []string{}
if c, ok := result["chunks"].([]interface{}); ok {
for _, chunk := range c {
if c, ok := chunk.(string); ok {
chunks = append(chunks, c)
}
}
}
s.repo.UpdateDocument(docID, map[string]interface{}{
"status": "parsed",
"chunk_count": len(chunks),
})
// 更新知识库的 chunk_count
s.updateChunkCount(kbID)
} else {
s.repo.UpdateDocument(docID, map[string]interface{}{"status": "failed"})
}
}
// parseDocumentWithAICore 调用 AI-Core gRPC 服务解析文档
func (s *KnowledgeService) parseDocumentWithAICore(docID, fileURL, fileName string) {
if s.aiCoreClient == nil {
knowledgeDebugLog.Printf("[AICore] AI-Core 客户端未初始化")
return
}
knowledgeDebugLog.Printf("[AICore] 开始解析文档: docID=%s, fileURL=%s, fileName=%s", docID, fileURL, fileName)
result, err := s.aiCoreClient.ParseDocument(fileURL, fileName, "")
if err != nil {
knowledgeDebugLog.Printf("[AICore] 解析失败: docID=%s, err=%v", docID, err)
return
}
if result.Success && result.Content != "" {
knowledgeDebugLog.Printf("[AICore] 解析成功: docID=%s, contentLength=%d", docID, len(result.Content))
// 保存到本地文件
markdownPath := s.saveMarkdownToFile(docID, fileName, result.Content)
if markdownPath != "" {
knowledgeDebugLog.Printf("[AICore] Markdown 保存到本地: docID=%s, path=%s", docID, markdownPath)
}
// 更新文档的 Content 字段
s.repo.UpdateDocument(docID, map[string]interface{}{
"content": result.Content,
})
} else {
knowledgeDebugLog.Printf("[AICore] 解析返回失败: docID=%s, message=%s", docID, result.Message)
}
}
// saveMarkdownToFile 保存 Markdown 内容到本地文件
func (s *KnowledgeService) saveMarkdownToFile(docID, fileName, content string) string {
if s.markdownLocalPath == "" {
s.markdownLocalPath = "resource/markdown"
}
// 创建目录
if err := os.MkdirAll(s.markdownLocalPath, 0755); err != nil {
knowledgeDebugLog.Printf("[AICore] 创建目录失败: path=%s, err=%v", s.markdownLocalPath, err)
return ""
}
// 生成文件名(用 docID + .md
markdownFileName := docID + ".md"
markdownPath := s.markdownLocalPath + "/" + markdownFileName
// 写入文件
if err := os.WriteFile(markdownPath, []byte(content), 0644); err != nil {
knowledgeDebugLog.Printf("[AICore] 保存 Markdown 失败: path=%s, err=%v", markdownPath, err)
return ""
}
return markdownPath
}
// DeleteDocument 删除文档
func (s *KnowledgeService) DeleteDocument(kbID, docID string) error {
// 验证文档存在
doc, err := s.repo.FindDocumentByID(docID)
if err != nil {
return err
}
if doc.KnowledgeBaseID != kbID {
return nil
}
// 获取知识库配置
kb, err := s.repo.FindByID(kbID)
if err != nil {
return err
}
// 删除文件
if doc.FileKey != "" {
knowledgeDebugLog.Printf("[Knowledge DeleteDocument] 删除文件: kbID=%s, docID=%s, fileKey=%s, storageType=%s",
kbID, docID, doc.FileKey, kb.StorageConfig.Type)
if kb.StorageConfig.Type != "" {
// 使用知识库的存储配置删除
s.uploadService.DeleteFileWithConfig(doc.FileKey, kb.StorageConfig)
} else {
// 使用全局配置删除
s.uploadService.DeleteFile(doc.FileKey)
}
}
// 删除文档记录
if err := s.repo.DeleteDocument(docID); err != nil {
return err
}
// 更新知识库文档数
s.updateDocumentCount(kbID)
return nil
}
// ReparseDocument 重新解析文档
func (s *KnowledgeService) ReparseDocument(kbID, docID string) error {
// 验证文档存在
doc, err := s.repo.FindDocumentByID(docID)
if err != nil {
return err
}
if doc.KnowledgeBaseID != kbID {
return nil
}
// 获取知识库配置
kb, err := s.repo.FindByID(kbID)
if err != nil {
return err
}
// 获取文件 URL
fileURL, err := s.uploadService.GetFileURL(doc.FileKey)
if err != nil {
return err
}
// 重置状态为 parsing
s.repo.UpdateDocument(docID, map[string]interface{}{"status": "parsing"})
// 异步重新解析
go s.parseDocument(kbID, docID, fileURL, kb.ParsingConfig)
return nil
}
// GetDocumentPreview 获取文档预览
func (s *KnowledgeService) GetDocumentPreview(kbID, docID string, page int) (*model.DocumentPreviewResponse, error) {
// 验证文档存在
doc, err := s.repo.FindDocumentByID(docID)
if err != nil {
return nil, err
}
if doc.KnowledgeBaseID != kbID {
return nil, nil
}
// 获取文件URL
fileURL, _ := s.uploadService.GetFileURL(doc.FileKey)
// 根据文件类型决定预览方式
fileName := doc.Name
isPDF := strings.HasSuffix(strings.ToLower(fileName), ".pdf")
isOffice := false
officeExts := []string{".csv", ".xlsx", ".xls", ".docx", ".doc", ".pptx", ".ppt", ".txt", ".md"}
for _, ext := range officeExts {
if strings.HasSuffix(strings.ToLower(fileName), ext) {
isOffice = true
break
}
}
// PDF文件返回文件URL
if isPDF {
return &model.DocumentPreviewResponse{
TotalPages: 1,
CurrentPage: page,
Content: fileURL,
ContentType: "url",
}, nil
}
// Office文件调用解析服务转换为HTML
if isOffice && s.aiCoreClient != nil {
knowledgeDebugLog.Printf("[Preview] Parsing office file: %s, URL: %s", fileName, fileURL)
result, err := s.aiCoreClient.ParseDocument(fileURL, fileName, "")
if err != nil {
// 解析失败返回文件URL
knowledgeDebugLog.Printf("[Preview] Parse document failed: %v", err)
return &model.DocumentPreviewResponse{
TotalPages: 1,
CurrentPage: page,
Content: fileURL,
ContentType: "url",
}, nil
}
knowledgeDebugLog.Printf("[Preview] Parse result: success=%v, content_length=%d", result.Success, len(result.Content))
// 返回HTML内容
if result.Success && result.Content != "" {
knowledgeDebugLog.Printf("[Preview] Returning HTML content, length: %d", len(result.Content))
return &model.DocumentPreviewResponse{
TotalPages: 1,
CurrentPage: page,
Content: result.Content,
ContentType: "html",
}, nil
}
}
// 其他情况返回文件URL
return &model.DocumentPreviewResponse{
TotalPages: 1,
CurrentPage: page,
Content: fileURL,
ContentType: "url",
}, nil
}
// updateDocumentCount 更新知识库文档数
func (s *KnowledgeService) updateDocumentCount(kbID string) {
count, _ := s.repo.CountDocumentsByKBID(kbID)
s.repo.Update(kbID, map[string]interface{}{"document_count": int(count)})
}
// updateChunkCount 更新知识库 chunk 数
func (s *KnowledgeService) updateChunkCount(kbID string) {
docs, _ := s.repo.FindDocumentsByKBID(kbID, "parsed")
totalChunks := 0
for _, doc := range docs {
totalChunks += doc.ChunkCount
}
s.repo.Update(kbID, map[string]interface{}{"chunk_count": totalChunks})
}
func getFileExt(filename string) string {
exts := []string{".pdf", ".docx", ".xlsx", ".pptx", ".txt", ".md", ".html"}
for _, ext := range exts {
if len(filename) >= len(ext) && filename[len(filename)-len(ext):] == ext {
return ext
}
}
if len(filename) > 4 {
return filename[len(filename)-4:]
}
return ""
}