feat: 添加算法目录和知识库 API 文档
- 新增 algorithm/ 目录 - 添加知识库 API 需求文档 - 添加相关截图 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
112
algorithm/README.md
Normal file
112
algorithm/README.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Algorithm Service
|
||||
|
||||
Python 算法服务,提供文档解析、Embedding、LLM 调用等功能。
|
||||
|
||||
## 环境要求
|
||||
|
||||
- Python 3.9+
|
||||
- FastAPI
|
||||
- Uvicorn
|
||||
|
||||
## 安装依赖
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 运行服务
|
||||
|
||||
```bash
|
||||
# 开发模式
|
||||
uvicorn main:app --reload --port 8081
|
||||
|
||||
# 生产模式
|
||||
uvicorn main:app --host 0.0.0.0 --port 8081
|
||||
```
|
||||
|
||||
## 接口列表
|
||||
|
||||
### 1. 文档解析
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /parse
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| file_url | String | 是 | 文件 URL |
|
||||
| engine | String | 是 | 解析引擎:markitdown / docling |
|
||||
| docling_url | String | 否 | Docling 服务 URL |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"content": "解析后的文本内容...",
|
||||
"chunks": ["chunk1", "chunk2"],
|
||||
"total_pages": 10,
|
||||
"metadata": {
|
||||
"filename": "document.pdf",
|
||||
"file_size": 1234567
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. 生成 Embedding
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /embedding
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| input | String/Array | 是 | 要 embedding 的文本 |
|
||||
| model | String | 是 | 模型名称 |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
|
||||
"model": "text-embedding-3-small"
|
||||
}
|
||||
```
|
||||
|
||||
### 3. LLM 对话
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /chat
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| messages | Array | 是 | 消息列表 |
|
||||
| model | String | 是 | 模型名称 |
|
||||
| temperature | Float | 否 | 温度参数（默认 0.7） |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "回复内容..."
|
||||
},
|
||||
"usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50
|
||||
}
|
||||
}
|
||||
```
|
||||
175
algorithm/main.py
Normal file
175
algorithm/main.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Algorithm Service - 文档解析、Embedding、LLM 调用服务
|
||||
"""
|
||||
import asyncio
import json
import os
from typing import Any, Dict, List, Optional, Union

import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
|
||||
|
||||
# ASGI application object; the route decorators in this module register their
# endpoints on it, and it is what `uvicorn main:app` serves.
app = FastAPI(title="Algorithm Service")
|
||||
|
||||
|
||||
# ========== Models ==========
|
||||
|
||||
class ParseRequest(BaseModel):
    """Request body for POST /parse."""

    # Location of the document to parse; handed to the selected engine as-is.
    file_url: str
    # Engine name; the /parse handler accepts "markitdown" or "docling".
    engine: str  # markitdown / docling
    # Base URL of the Docling service; the docling engine rejects requests
    # that do not provide it.
    docling_url: Optional[str] = None
|
||||
|
||||
|
||||
class EmbeddingRequest(BaseModel):
    """Request body for POST /embedding."""

    # A single text or a list of texts. Spelled with typing.Union because the
    # PEP 604 form (`str | List[str]`) is evaluated when the class is defined
    # and raises TypeError on Python 3.9.
    input: Union[str, List[str]]
    # Model name; echoed back unchanged in the response.
    model: str
|
||||
|
||||
|
||||
class ChatMessage(BaseModel):
    """A single message in a chat conversation."""

    # Speaker of the message; the /chat handler itself replies as "assistant".
    role: str
    # Text of the message.
    content: str
|
||||
|
||||
|
||||
class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # Conversation history; the /chat handler reads only the last entry.
    messages: List[ChatMessage]
    # Model name.
    model: str
    # Sampling temperature; defaults to 0.7 when omitted.
    temperature: Optional[float] = 0.7
    # Accepted but not read by the current (mock) /chat handler.
    api_key: Optional[str] = None
    # Accepted but not read by the current (mock) /chat handler.
    base_url: Optional[str] = None
|
||||
|
||||
|
||||
# ========== 文档解析 ==========
|
||||
|
||||
@app.post("/parse")
async def parse_document(req: ParseRequest):
    """Parse a document with the engine named in the request.

    Supported engines are "markitdown" and "docling".

    Returns:
        The selected engine's result dict on success, or
        ``{"success": False, "error": <message>}`` when an unexpected
        (non-HTTP) error occurs.

    Raises:
        HTTPException: 400 for an unsupported engine. HTTP errors raised by
            the engine helpers propagate unchanged.
    """
    try:
        if req.engine == "markitdown":
            return await parse_with_markitdown(req.file_url)
        if req.engine == "docling":
            return await parse_with_docling(req.file_url, req.docling_url)
        raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}")
    except HTTPException:
        # Deliberate HTTP errors must reach the client with their status code;
        # without this clause the generic handler below would flatten them
        # into a 200 response.
        raise
    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:
    """Parse a document with markitdown.

    Args:
        file_url: Location of the document; passed to ``MarkItDown.convert``.

    Returns:
        A dict with ``success``, the full ``content``, at most 100 paragraph
        ``chunks``, ``total_pages`` (always 1 for this engine) and
        ``metadata.filename`` (the last ``/``-separated segment of
        ``file_url``).

    Raises:
        HTTPException: 500 if markitdown is not installed or the conversion
            fails.
    """
    # Only the import is guarded by ImportError, so an ImportError raised
    # during conversion is not misreported as "markitdown not installed".
    try:
        from markitdown import MarkItDown
    except ImportError as e:
        raise HTTPException(
            status_code=500,
            detail="markitdown not installed. Run: pip install markitdown",
        ) from e

    try:
        md = MarkItDown()
        # convert() is synchronous; run it in a worker thread so it does not
        # block the event loop while the document is fetched and parsed.
        result = await asyncio.to_thread(md.convert, file_url)

        # Simple chunking: split on blank lines (paragraphs).
        content = result.text_content if hasattr(result, 'text_content') else str(result)
        chunks = [c.strip() for c in content.split('\n\n') if c.strip()]

        return {
            "success": True,
            "content": content,
            "chunks": chunks[:100],  # cap the number of chunks returned
            "total_pages": 1,
            "metadata": {
                "filename": file_url.split('/')[-1]
            }
        }
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to parse with markitdown: {str(e)}",
        ) from e
|
||||
|
||||
|
||||
async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:
    """Parse a document by calling a Docling service over HTTP.

    Args:
        file_url: Location of the document; sent to the service as ``url``.
        docling_url: Base URL of the Docling service. Required.

    Returns:
        A dict with ``success``, the full ``content`` (the service's ``text``
        field), at most 100 paragraph ``chunks``, ``total_pages`` (the
        service's ``num_pages``, defaulting to 1) and ``metadata.filename``
        (the last ``/``-separated segment of ``file_url``).

    Raises:
        HTTPException: 400 if ``docling_url`` is missing; 500 if the service
            responds with a non-200 status or the request itself fails.
    """
    if not docling_url:
        raise HTTPException(status_code=400, detail="docling_url is required for docling engine")

    try:
        # requests is synchronous; run the call in a worker thread so the
        # event loop is not blocked for up to the 60 s timeout. A trailing
        # slash on the base URL is dropped to avoid requesting "//convert".
        response = await asyncio.to_thread(
            requests.post,
            f"{docling_url.rstrip('/')}/convert",
            json={"url": file_url},
            timeout=60,
        )

        if response.status_code != 200:
            raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")

        result = response.json()

        content = result.get("text", "")
        chunks = [c.strip() for c in content.split('\n\n') if c.strip()]

        return {
            "success": True,
            "content": content,
            "chunks": chunks[:100],
            "total_pages": result.get("num_pages", 1),
            "metadata": {
                "filename": file_url.split('/')[-1]
            }
        }
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to connect docling service: {str(e)}",
        ) from e
|
||||
|
||||
|
||||
# ========== Embedding ==========
|
||||
|
||||
@app.post("/embedding")
async def generate_embedding(req: EmbeddingRequest):
    """Generate embeddings for the request input (currently mock data).

    A single string is treated as a one-element batch. Each input text gets a
    constant 1536-dimensional vector, and the requested model name is echoed
    back.
    """
    try:
        # TODO: call a real embedding service depending on the provider;
        # for now this returns placeholder vectors.
        if isinstance(req.input, str):
            batch = [req.input]
        else:
            batch = req.input

        # One placeholder 1536-dimensional vector per input text.
        vectors = [[0.1] * 1536 for _ in batch]

        payload = {
            "success": True,
            "embeddings": vectors,
            "model": req.model,
        }
        return payload
    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# ========== Chat ==========
|
||||
|
||||
@app.post("/chat")
async def chat(req: ChatRequest):
    """LLM chat endpoint (currently a mock that echoes the last message).

    Replies with "Echo: <content of the last message>", or an empty echo when
    no messages were sent. The usage figures are placeholders:
    ``prompt_tokens`` is the character length of that last message and
    ``completion_tokens`` is a constant.
    """
    try:
        # TODO: call the real LLM service based on model and base_url;
        # for now this returns placeholder data.
        prompt = ""
        if req.messages:
            prompt = req.messages[-1].content

        reply = {"role": "assistant", "content": f"Echo: {prompt}"}
        usage = {"prompt_tokens": len(prompt), "completion_tokens": 10}
        return {"success": True, "message": reply, "usage": usage}
    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# ========== Health Check ==========
|
||||
|
||||
@app.get("/health")
async def health():
    """Health-check endpoint: always reports an "ok" status."""
    status_payload = {"status": "ok"}
    return status_payload
|
||||
|
||||
|
||||
# Allow `python main.py` as an alternative to the `uvicorn main:app` CLI:
# serves the app on all interfaces, port 8081.
if __name__ == "__main__":
    # Imported here so uvicorn is only required when run as a script.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8081)
|
||||
17
algorithm/requirements.txt
Normal file
17
algorithm/requirements.txt
Normal file
@@ -0,0 +1,17 @@
|
||||
# FastAPI
|
||||
fastapi>=0.100.0
|
||||
uvicorn[standard]>=0.23.0
|
||||
|
||||
# HTTP 请求
|
||||
requests>=2.31.0
|
||||
|
||||
# 文档解析
|
||||
markitdown>=0.0.1
|
||||
|
||||
# Pydantic
|
||||
pydantic>=2.0.0
|
||||
|
||||
# 可选:其他解析库
|
||||
# docling>=0.1.0
|
||||
# pypdf>=3.0.0
|
||||
# python-docx>=0.8.11
|
||||
30
algorithm/start.bat
Normal file
30
algorithm/start.bat
Normal file
@@ -0,0 +1,30 @@
|
||||
@echo off
rem Bootstrap and launch the Algorithm service on Windows: create the virtual
rem environment on first run, install dependencies, then start uvicorn.

rem Switch the console to UTF-8 so the Chinese messages below render correctly.
chcp 65001 >nul
title Algorithm Service

echo ========================================
echo 启动 Algorithm 服务
echo ========================================

rem Work from this script's own directory. The path is quoted so directories
rem containing spaces or characters such as "&" are handled.
cd /d "%~dp0"

echo.
echo 检查虚拟环境...
if not exist venv (
    echo [INFO] 创建虚拟环境...
    python -m venv venv
    rem Stop here if the venv could not be created; the venv\Scripts tools
    rem used below would not exist.
    if errorlevel 1 (
        echo [ERROR] 创建虚拟环境失败
        pause
        exit /b 1
    )
)

echo.
echo 安装/更新依赖...
call venv\Scripts\pip install -r requirements.txt -q

echo.
echo 启动服务...
echo 访问 http://localhost:8081/docs 查看 API 文档
echo 按 Ctrl+C 停止服务
echo.

call venv\Scripts\uvicorn main:app --reload --port 8081 --host 0.0.0.0

rem Keep the window open after the server exits so its output can be read.
pause
|
||||
26
algorithm/start.sh
Normal file
26
algorithm/start.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Bootstrap and launch the Algorithm service: create the virtual environment
# on first run, install dependencies into it, then start uvicorn.

echo "========================================"
echo " 启动 Algorithm 服务"
echo "========================================"

# Work from the script's own directory so venv/ and requirements.txt resolve.
# Abort if that fails rather than creating a venv in an unexpected place.
cd "$(dirname "$0")" || exit 1

# Create the virtual environment on first run.
if [ ! -d "venv" ]; then
    echo "[INFO] 创建虚拟环境..."
    python3 -m venv venv || exit 1
fi

echo ""
echo "安装/更新依赖..."
# Abort if activation fails; otherwise pip would install into whichever
# interpreter is first on PATH instead of the venv.
source venv/bin/activate || exit 1
pip install -r requirements.txt -q

echo ""
echo "启动服务..."
echo "访问 http://localhost:8081/docs 查看 API 文档"
echo "按 Ctrl+C 停止服务"
echo ""

uvicorn main:app --reload --port 8081 --host 0.0.0.0
|
||||
BIN
screenshots/创建文件夹.png
Normal file
BIN
screenshots/创建文件夹.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 15 KiB |
BIN
screenshots/文件解析失败.png
Normal file
BIN
screenshots/文件解析失败.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
BIN
screenshots/窗口bug.png
Normal file
BIN
screenshots/窗口bug.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 87 KiB |
259
team-require/api/knowledge-api.md
Normal file
259
team-require/api/knowledge-api.md
Normal file
@@ -0,0 +1,259 @@
|
||||
# 知识库 API
|
||||
|
||||
## 基础信息
|
||||
|
||||
| 项目 | 说明 |
|
||||
|------|------|
|
||||
| 基础URL | `http://localhost:8082` |
|
||||
|
||||
## 接口列表
|
||||
|
||||
### 1. 创建知识库
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /api/knowledge/create
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| name | String | 是 | 知识库名称 |
|
||||
| description | String | 否 | 知识库描述 |
|
||||
| llm_model_id | String | 是 | LLM 模型 ID |
|
||||
| embedding_model_id | String | 是 | Embedding 模型 ID |
|
||||
| parsing_config | Object | 是 | 解析配置 |
|
||||
| - engine | String | 是 | 解析引擎:markitdown / docling |
|
||||
| - docling_url | String | 条件 | Docling URL(engine=docling 时必填) |
|
||||
| - enable_pdf | Boolean | 否 | 是否启用 PDF 解析 |
|
||||
| - pandoc | Boolean | 否 | 是否启用 Pandoc |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"id": "kb_xxx",
|
||||
"message": "Knowledge base created successfully"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. 获取知识库列表
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
GET /api/knowledge/list
|
||||
```
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": [
|
||||
{
|
||||
"id": "kb_001",
|
||||
"name": "产品文档知识库",
|
||||
"description": "用于存储产品手册",
|
||||
"llm_model_id": "model_001",
|
||||
"embedding_model_id": "model_002",
|
||||
"status": "active",
|
||||
"document_count": 15,
|
||||
"chunk_count": 156,
|
||||
"created_at": "2024-01-15T10:30:00Z",
|
||||
"updated_at": "2024-01-15T10:30:00Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. 获取知识库详情
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
GET /api/knowledge/:id
|
||||
```
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"id": "kb_001",
|
||||
"name": "产品文档知识库",
|
||||
"description": "用于存储产品手册",
|
||||
"llm_model_id": "model_001",
|
||||
"embedding_model_id": "model_002",
|
||||
"parsing_config": {
|
||||
"engine": "markitdown",
|
||||
"enable_pdf": true,
|
||||
"pandoc": true
|
||||
},
|
||||
"status": "active",
|
||||
"document_count": 15,
|
||||
"chunk_count": 156,
|
||||
"created_at": "2024-01-15T10:30:00Z",
|
||||
"updated_at": "2024-01-15T10:30:00Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. 删除知识库
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
DELETE /api/knowledge/:id
|
||||
```
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"message": "Knowledge base deleted"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. 获取知识库下的文档列表
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
GET /api/knowledge/:id/documents
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| status | String | 否 | 过滤状态:all / parsed / parsing / failed |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": [
|
||||
{
|
||||
"id": "doc_001",
|
||||
"knowledge_base_id": "kb_001",
|
||||
"name": "产品手册_v2.0.pdf",
|
||||
"file_size": 2516582,
|
||||
"status": "parsed",
|
||||
"chunk_count": 156,
|
||||
"uploaded_at": "2024-01-15T10:30:00Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 6. 上传文档到知识库
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /api/knowledge/:id/documents
|
||||
Content-Type: multipart/form-data
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| file | File | 是 | 要上传的文件 |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"id": "doc_001",
|
||||
"url": "http://localhost:8082/files/abc123.pdf",
|
||||
"document": {
|
||||
"id": "doc_001",
|
||||
"knowledge_base_id": "kb_001",
|
||||
"name": "产品手册_v2.0.pdf",
|
||||
"file_size": 2516582,
|
||||
"status": "parsing",
|
||||
"chunk_count": 0,
|
||||
"uploaded_at": "2024-01-15T10:30:00Z"
|
||||
},
|
||||
"message": "Document uploaded"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 7. 删除知识库文档
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
DELETE /api/knowledge/:id/documents/:doc_id
|
||||
```
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"message": "Document deleted"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 8. 重新解析文档
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /api/knowledge/:id/documents/:doc_id/reparse
|
||||
```
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"message": "Document reparse started"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 9. 获取文档预览内容
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
GET /api/knowledge/:id/documents/:doc_id/preview
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| page | Number | 否 | 页码(默认 1) |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"total_pages": 3,
|
||||
"current_page": 1,
|
||||
"content": "第一章 产品介绍..."
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user