feat: 添加算法目录和知识库 API 文档

- 新增 algorithm/ 目录 - 添加知识库 API 需求文档 - 添加相关截图 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 20:34:41 +08:00
parent 11c9ff2428
commit 3a1d9ed676
9 changed files with 619 additions and 0 deletions
--- a/algorithm/README.md
+++ b/algorithm/README.md
@@ -0,0 +1,112 @@
+# Algorithm Service
+
+Python 算法服务，提供文档解析、Embedding、LLM 调用等功能。
+
+## 环境要求
+
+- Python 3.10+（main.py 中使用了 `str | List[str]` 联合类型语法，3.9 无法运行）
+- FastAPI
+- Uvicorn
+
+## 安装依赖
+
+```bash
+pip install -r requirements.txt
+```
+
+## 运行服务
+
+```bash
+# 开发模式
+uvicorn main:app --reload --port 8081
+
+# 生产模式
+uvicorn main:app --host 0.0.0.0 --port 8081
+```
+
+## 接口列表
+
+### 1. 文档解析
+
+**请求**
+
+```
+POST /parse
+Content-Type: application/json
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| file_url | String | 是 | 文件 URL |
+| engine | String | 是 | 解析引擎：markitdown / docling |
+| docling_url | String | 否 | Docling 服务 URL |
+
+**响应**
+
+```json
+{
+  "success": true,
+  "content": "解析后的文本内容...",
+  "chunks": ["chunk1", "chunk2"],
+  "total_pages": 10,
+  "metadata": {
+    "filename": "document.pdf",
+    "file_size": 1234567
+  }
+}
+```
+
+### 2. 生成 Embedding
+
+**请求**
+
+```
+POST /embedding
+Content-Type: application/json
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| input | String/Array | 是 | 要 embedding 的文本 |
+| model | String | 是 | 模型名称 |
+
+**响应**
+
+```json
+{
+  "success": true,
+  "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
+  "model": "text-embedding-3-small"
+}
+```
+
+### 3. LLM 对话
+
+**请求**
+
+```
+POST /chat
+Content-Type: application/json
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| messages | Array | 是 | 消息列表 |
+| model | String | 是 | 模型名称 |
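+| temperature | Float | 否 | 温度参数（默认 0.7） |
+| api_key | String | 否 | LLM 服务的 API Key（当前为模拟实现，尚未使用） |
+| base_url | String | 否 | LLM 服务的 Base URL（当前为模拟实现，尚未使用） |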
+
+**响应**
+
+```json
+{
+  "success": true,
+  "message": {
+    "role": "assistant",
+    "content": "回复内容..."
+  },
+  "usage": {
+    "prompt_tokens": 100,
+    "completion_tokens": 50
+  }
+}
+```
--- a/algorithm/main.py
+++ b/algorithm/main.py
@@ -0,0 +1,175 @@
+"""
+Algorithm Service - 文档解析、Embedding、LLM 调用服务
+"""
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional, List, Dict, Any
+import requests
+import os
+import json
+
+app = FastAPI(title="Algorithm Service")
+
+
+# ========== Models ==========
+
+class ParseRequest(BaseModel):
+    # Request body of POST /parse.
+    # Location of the file to parse; passed unchanged to the selected engine.
+    file_url: str
+    # Engine selector; parse_document accepts "markitdown" or "docling".
+    engine: str  # markitdown / docling
+    # Base URL of the docling service; parse_with_docling rejects the request
+    # with a 400 when this is missing or empty.
+    docling_url: Optional[str] = None
+
+
+class EmbeddingRequest(BaseModel):
+    # Request body of POST /embedding.
+    # A single text or a list of texts; generate_embedding wraps a single
+    # string into a one-element list before processing.
+    input: str | List[str]
+    # Model name; echoed back unchanged in the response.
+    model: str
+
+
+class ChatMessage(BaseModel):
+    # One entry of ChatRequest.messages.
+    # Speaker role; not validated here (the chat handler replies with "assistant").
+    role: str
+    # Message text.
+    content: str
+
+
+class ChatRequest(BaseModel):
+    # Request body of POST /chat.
+    # Conversation history; the chat handler only reads the last entry.
+    messages: List[ChatMessage]
+    # Model name.
+    model: str
+    # Sampling temperature; defaults to 0.7 when omitted.
+    temperature: Optional[float] = 0.7
+    # Credentials / endpoint of the LLM provider. Accepted by the schema but
+    # not read by the current chat handler.
+    api_key: Optional[str] = None
+    base_url: Optional[str] = None
+
+
+# ========== 文档解析 ==========
+
+@app.post("/parse")
+async def parse_document(req: ParseRequest):
+    """解析文档，支持 markitdown 和 docling"""
+    try:
+        if req.engine == "markitdown":
+            return await parse_with_markitdown(req.file_url)
+        elif req.engine == "docling":
+            return await parse_with_docling(req.file_url, req.docling_url)
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}")
+    except Exception as e:
+        return {"success": False, "error": str(e)}
+
+
+async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:
+    """使用 markitdown 解析文档"""
+    try:
+        from markitdown import MarkItDown
+
+        md = MarkItDown()
+        result = md.convert(file_url)
+
+        # 简单分块（按段落分割）
+        content = result.text_content if hasattr(result, 'text_content') else str(result)
+        chunks = [c.strip() for c in content.split('\n\n') if c.strip()]
+
+        return {
+            "success": True,
+            "content": content,
+            "chunks": chunks[:100],  # 限制 chunk 数量
+            "total_pages": 1,
+            "metadata": {
+                "filename": file_url.split('/')[-1]
+            }
+        }
+    except ImportError:
+        raise HTTPException(status_code=500, detail="markitdown not installed. Run: pip install markitdown")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to parse with markitdown: {str(e)}")
+
+
+async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:
+    """使用 docling 解析文档"""
+    if not docling_url:
+        raise HTTPException(status_code=400, detail="docling_url is required for docling engine")
+
+    try:
+        # 调用 docling 服务
+        response = requests.post(
+            f"{docling_url}/convert",
+            json={"url": file_url},
+            timeout=60
+        )
+
+        if response.status_code != 200:
+            raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")
+
+        result = response.json()
+
+        content = result.get("text", "")
+        chunks = [c.strip() for c in content.split('\n\n') if c.strip()]
+
+        return {
+            "success": True,
+            "content": content,
+            "chunks": chunks[:100],
+            "total_pages": result.get("num_pages", 1),
+            "metadata": {
+                "filename": file_url.split('/')[-1]
+            }
+        }
+    except requests.exceptions.RequestException as e:
+        raise HTTPException(status_code=500, detail=f"Failed to connect docling service: {str(e)}")
+
+
+# ========== Embedding ==========
+
+@app.post("/embedding")
+async def generate_embedding(req: EmbeddingRequest):
+    """生成 Embedding"""
+    try:
+        # TODO: 根据不同 provider 调用不同的 embedding 服务
+        # 目前返回模拟数据
+
+        texts = [req.input] if isinstance(req.input, str) else req.input
+
+        # 模拟 embedding 返回
+        embeddings = [[0.1] * 1536 for _ in texts]  # 1536 维向量
+
+        return {
+            "success": True,
+            "embeddings": embeddings,
+            "model": req.model
+        }
+    except Exception as e:
+        return {"success": False, "error": str(e)}
+
+
+# ========== Chat ==========
+
+@app.post("/chat")
+async def chat(req: ChatRequest):
+    """LLM 对话"""
+    try:
+        # TODO: 根据 model 和 base_url 调用实际的 LLM 服务
+        # 目前返回模拟数据
+
+        last_message = req.messages[-1].content if req.messages else ""
+
+        return {
+            "success": True,
+            "message": {
+                "role": "assistant",
+                "content": f"Echo: {last_message}"
+            },
+            "usage": {
+                "prompt_tokens": len(last_message),
+                "completion_tokens": 10
+            }
+        }
+    except Exception as e:
+        return {"success": False, "error": str(e)}
+
+
+# ========== Health Check ==========
+
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8081)
--- a/algorithm/requirements.txt
+++ b/algorithm/requirements.txt
@@ -0,0 +1,17 @@
+# FastAPI
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+
+# HTTP 请求
+requests>=2.31.0
+
+# 文档解析
+markitdown>=0.0.1
+
+# Pydantic
+pydantic>=2.0.0
+
+# 可选：其他解析库
+# docling>=0.1.0
+# pypdf>=3.0.0
+# python-docx>=0.8.11
--- a/algorithm/start.bat
+++ b/algorithm/start.bat
@@ -0,0 +1,30 @@
+@echo off
+rem Windows launcher for the Algorithm service: creates the virtual
+rem environment on first run, installs dependencies, then starts uvicorn.
+rem Code page 65001 is UTF-8, so the Chinese messages below display correctly.
+chcp 65001 >nul
+title Algorithm Service
+
+echo ========================================
+echo   启动 Algorithm 服务
+echo ========================================
+
+rem Quote the script directory so paths containing spaces or "&" still work.
+cd /d "%~dp0"
+
+echo.
+echo 检查虚拟环境...
+if not exist venv (
+    echo [INFO] 创建虚拟环境...
+    python -m venv venv
+    if errorlevel 1 (
+        echo [ERROR] 创建虚拟环境失败
+        pause
+        exit /b 1
+    )
+)
+
+echo.
+echo 安装/更新依赖...
+call venv\Scripts\pip install -r requirements.txt -q
+rem Do not start the server on top of a broken or incomplete environment.
+if errorlevel 1 (
+    echo [ERROR] 依赖安装失败
+    pause
+    exit /b 1
+)
+
+echo.
+echo 启动服务...
+echo 访问 http://localhost:8081/docs 查看 API 文档
+echo 按 Ctrl+C 停止服务
+echo.
+
+call venv\Scripts\uvicorn main:app --reload --port 8081 --host 0.0.0.0
+
+pause
--- a/algorithm/start.sh
+++ b/algorithm/start.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+echo "========================================"
+echo "  启动 Algorithm 服务"
+echo "========================================"
+
+cd "$(dirname "$0")"
+
+# 检查虚拟环境
+if [ ! -d "venv" ]; then
+    echo "[INFO] 创建虚拟环境..."
+    python3 -m venv venv
+fi
+
+echo ""
+echo "安装/更新依赖..."
+source venv/bin/activate
+pip install -r requirements.txt -q
+
+echo ""
+echo "启动服务..."
+echo "访问 http://localhost:8081/docs 查看 API 文档"
+echo "按 Ctrl+C 停止服务"
+echo ""
+
+uvicorn main:app --reload --port 8081 --host 0.0.0.0
--- a/screenshots/创建文件夹.png
+++ b/screenshots/创建文件夹.png
--- a/screenshots/文件解析失败.png
+++ b/screenshots/文件解析失败.png
--- a/screenshots/窗口bug.png
+++ b/screenshots/窗口bug.png
--- a/team-require/api/knowledge-api.md
+++ b/team-require/api/knowledge-api.md
@@ -0,0 +1,259 @@
+# 知识库 API
+
+## 基础信息
+
+| 项目 | 说明 |
+|------|------|
+| 基础URL | `http://localhost:8082` |
+
+## 接口列表
+
+### 1. 创建知识库
+
+**请求**
+
+```
+POST /api/knowledge/create
+Content-Type: application/json
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| name | String | 是 | 知识库名称 |
+| description | String | 否 | 知识库描述 |
+| llm_model_id | String | 是 | LLM 模型 ID |
+| embedding_model_id | String | 是 | Embedding 模型 ID |
+| parsing_config | Object | 是 | 解析配置 |
+| - engine | String | 是 | 解析引擎：markitdown / docling |
+| - docling_url | String | 条件 | Docling URL（engine=docling 时必填） |
+| - enable_pdf | Boolean | 否 | 是否启用 PDF 解析 |
+| - pandoc | Boolean | 否 | 是否启用 Pandoc |
+
+**响应**
+
+```json
+{
+  "success": true,
+  "id": "kb_xxx",
+  "message": "Knowledge base created successfully"
+}
+```
+
+---
+
+### 2. 获取知识库列表
+
+**请求**
+
+```
+GET /api/knowledge/list
+```
+
+**响应**
+
+```json
+{
+  "success": true,
+  "data": [
+    {
+      "id": "kb_001",
+      "name": "产品文档知识库",
+      "description": "用于存储产品手册",
+      "llm_model_id": "model_001",
+      "embedding_model_id": "model_002",
+      "status": "active",
+      "document_count": 15,
+      "chunk_count": 156,
+      "created_at": "2024-01-15T10:30:00Z",
+      "updated_at": "2024-01-15T10:30:00Z"
+    }
+  ]
+}
+```
+
+---
+
+### 3. 获取知识库详情
+
+**请求**
+
+```
+GET /api/knowledge/:id
+```
+
+**响应**
+
+```json
+{
+  "success": true,
+  "data": {
+    "id": "kb_001",
+    "name": "产品文档知识库",
+    "description": "用于存储产品手册",
+    "llm_model_id": "model_001",
+    "embedding_model_id": "model_002",
+    "parsing_config": {
+      "engine": "markitdown",
+      "enable_pdf": true,
+      "pandoc": true
+    },
+    "status": "active",
+    "document_count": 15,
+    "chunk_count": 156,
+    "created_at": "2024-01-15T10:30:00Z",
+    "updated_at": "2024-01-15T10:30:00Z"
+  }
+}
+```
+
+---
+
+### 4. 删除知识库
+
+**请求**
+
+```
+DELETE /api/knowledge/:id
+```
+
+**响应**
+
+```json
+{
+  "success": true,
+  "message": "Knowledge base deleted"
+}
+```
+
+---
+
+### 5. 获取知识库下的文档列表
+
+**请求**
+
+```
+GET /api/knowledge/:id/documents
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| status | String | 否 | 过滤状态：all / parsed / parsing / failed |
+
+**响应**
+
+```json
+{
+  "success": true,
+  "data": [
+    {
+      "id": "doc_001",
+      "knowledge_base_id": "kb_001",
+      "name": "产品手册_v2.0.pdf",
+      "file_size": 2516582,
+      "status": "parsed",
+      "chunk_count": 156,
+      "uploaded_at": "2024-01-15T10:30:00Z"
+    }
+  ]
+}
+```
+
+---
+
+### 6. 上传文档到知识库
+
+**请求**
+
+```
+POST /api/knowledge/:id/documents
+Content-Type: multipart/form-data
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| file | File | 是 | 要上传的文件 |
+
+**响应**
+
+```json
+{
+  "success": true,
+  "id": "doc_001",
+  "url": "http://localhost:8082/files/abc123.pdf",
+  "document": {
+    "id": "doc_001",
+    "knowledge_base_id": "kb_001",
+    "name": "产品手册_v2.0.pdf",
+    "file_size": 2516582,
+    "status": "parsing",
+    "chunk_count": 0,
+    "uploaded_at": "2024-01-15T10:30:00Z"
+  },
+  "message": "Document uploaded"
+}
+```
+
+---
+
+### 7. 删除知识库文档
+
+**请求**
+
+```
+DELETE /api/knowledge/:id/documents/:doc_id
+```
+
+**响应**
+
+```json
+{
+  "success": true,
+  "message": "Document deleted"
+}
+```
+
+---
+
+### 8. 重新解析文档
+
+**请求**
+
+```
+POST /api/knowledge/:id/documents/:doc_id/reparse
+```
+
+**响应**
+
+```json
+{
+  "success": true,
+  "message": "Document reparse started"
+}
+```
+
+---
+
+### 9. 获取文档预览内容
+
+**请求**
+
+```
+GET /api/knowledge/:id/documents/:doc_id/preview
+```
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| page | Number | 否 | 页码（默认 1） |
+
+**响应**
+
+```json
+{
+  "success": true,
+  "data": {
+    "total_pages": 3,
+    "current_page": 1,
+    "content": "第一章 产品介绍..."
+  }
+}
+```