diff --git a/algorithm/README.md b/algorithm/README.md new file mode 100644 index 0000000..eee8baa --- /dev/null +++ b/algorithm/README.md @@ -0,0 +1,112 @@ +# Algorithm Service + +Python 算法服务,提供文档解析、Embedding、LLM 调用等功能。 + +## 环境要求 + +- Python 3.9+ +- FastAPI +- Uvicorn + +## 安装依赖 + +```bash +pip install -r requirements.txt +``` + +## 运行服务 + +```bash +# 开发模式 +uvicorn main:app --reload --port 8081 + +# 生产模式 +uvicorn main:app --host 0.0.0.0 --port 8081 +``` + +## 接口列表 + +### 1. 文档解析 + +**请求** + +``` +POST /parse +Content-Type: application/json +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| file_url | String | 是 | 文件 URL | +| engine | String | 是 | 解析引擎:markitdown / docling | +| docling_url | String | 否 | Docling 服务 URL | + +**响应** + +```json +{ + "success": true, + "content": "解析后的文本内容...", + "chunks": ["chunk1", "chunk2"], + "total_pages": 10, + "metadata": { + "filename": "document.pdf", + "file_size": 1234567 + } +} +``` + +### 2. 生成 Embedding + +**请求** + +``` +POST /embedding +Content-Type: application/json +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| input | String/Array | 是 | 要 embedding 的文本 | +| model | String | 是 | 模型名称 | + +**响应** + +```json +{ + "success": true, + "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]], + "model": "text-embedding-3-small" +} +``` + +### 3. LLM 对话 + +**请求** + +``` +POST /chat +Content-Type: application/json +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| messages | Array | 是 | 消息列表 | +| model | String | 是 | 模型名称 | +| temperature | Float | 否 | 温度参数 | + +**响应** + +```json +{ + "success": true, + "message": { + "role": "assistant", + "content": "回复内容..." 
"""
Algorithm Service - document parsing, embedding and LLM invocation service.
"""
import asyncio
import json
import logging
import os
import posixpath
from typing import Any, Dict, List, Optional, Union
from urllib.parse import unquote, urlparse

import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

logger = logging.getLogger(__name__)

app = FastAPI(title="Algorithm Service")

# Upper bound on the number of chunks returned by /parse.
MAX_CHUNKS = 100
# Seconds to wait for the docling service before giving up.
DOCLING_TIMEOUT_SECONDS = 60
# Vector size of the placeholder embeddings returned by /embedding.
MOCK_EMBEDDING_DIM = 1536


# ========== Models ==========

class ParseRequest(BaseModel):
    """Request body of ``POST /parse``."""

    file_url: str
    engine: str  # "markitdown" or "docling"
    docling_url: Optional[str] = None  # required when engine == "docling"


class EmbeddingRequest(BaseModel):
    """Request body of ``POST /embedding``."""

    # ``Union`` instead of ``str | List[str]``: the PEP 604 form is evaluated
    # when the class is created and raises TypeError on Python 3.9.
    input: Union[str, List[str]]
    model: str


class ChatMessage(BaseModel):
    """A single chat turn."""

    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body of ``POST /chat``."""

    messages: List[ChatMessage]
    model: str
    temperature: Optional[float] = 0.7
    api_key: Optional[str] = None
    base_url: Optional[str] = None


# ========== Shared helpers ==========

def _failure(message: str) -> Dict[str, Any]:
    """Build the in-band error envelope shared by all endpoints."""
    return {"success": False, "error": message}


def _filename_from_url(file_url: str) -> str:
    """Return the last path segment of *file_url*.

    For http(s)/file URLs the query string and fragment are dropped and
    percent-escapes are decoded; any other input (e.g. a plain path) keeps the
    simple "text after the last slash" behaviour.
    """
    parsed = urlparse(file_url)
    if parsed.scheme in ("http", "https", "file"):
        return unquote(posixpath.basename(parsed.path))
    return file_url.split("/")[-1]


def _split_into_chunks(content: str, limit: int = MAX_CHUNKS) -> List[str]:
    """Split *content* on blank lines and return at most *limit* non-empty chunks."""
    paragraphs = (part.strip() for part in content.split("\n\n"))
    return [part for part in paragraphs if part][:limit]


def _build_parse_result(content: str, file_url: str, total_pages: int) -> Dict[str, Any]:
    """Assemble the success payload returned by both parsing engines."""
    return {
        "success": True,
        "content": content,
        "chunks": _split_into_chunks(content),
        "total_pages": total_pages,
        "metadata": {
            "filename": _filename_from_url(file_url),
        },
    }


# ========== Document parsing ==========

# NOTE(security): file_url and docling_url come straight from the request body
# and are fetched server-side. If this service is reachable by untrusted
# clients, restrict the allowed schemes/hosts before fetching.

@app.post("/parse")
async def parse_document(req: ParseRequest):
    """Parse a document with the markitdown or docling engine.

    Failures are reported in-band as ``{"success": false, "error": ...}``
    (HTTP 200), the same envelope used by /embedding and /chat.
    """
    try:
        if req.engine == "markitdown":
            return await parse_with_markitdown(req.file_url)
        if req.engine == "docling":
            return await parse_with_docling(req.file_url, req.docling_url)
        return _failure(f"Unsupported engine: {req.engine}")
    except HTTPException as exc:
        # The engine helpers signal errors with HTTPException; report the
        # human-readable detail rather than the exception's str() form.
        logger.warning("Document parsing failed: %s", exc.detail)
        return _failure(str(exc.detail))
    except Exception as exc:
        logger.exception("Unexpected error while parsing %s", req.file_url)
        return _failure(str(exc))


async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:
    """Parse *file_url* with the markitdown library.

    Returns:
        The parse payload (``success``, ``content``, ``chunks``,
        ``total_pages``, ``metadata``). ``total_pages`` is always 1.

    Raises:
        HTTPException: 500 if markitdown is not installed or conversion fails.
    """
    # Only the import is guarded by ImportError, so a missing optional
    # dependency raised during conversion is not misreported as
    # "markitdown not installed".
    try:
        from markitdown import MarkItDown
    except ImportError as exc:
        raise HTTPException(
            status_code=500,
            detail="markitdown not installed. Run: pip install markitdown",
        ) from exc

    try:
        converter = MarkItDown()
        # convert() is a synchronous call; run it in a worker thread so the
        # event loop keeps serving other requests meanwhile.
        result = await asyncio.to_thread(converter.convert, file_url)

        # Simple chunking: the text is later split on blank lines.
        content = result.text_content if hasattr(result, "text_content") else str(result)
        return _build_parse_result(content, file_url, total_pages=1)
    except Exception as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to parse with markitdown: {str(exc)}",
        ) from exc


async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:
    """Parse *file_url* by calling ``POST {docling_url}/convert``.

    Returns:
        The parse payload; ``content`` is the response's ``text`` field and
        ``total_pages`` its ``num_pages`` field (default 1).

    Raises:
        HTTPException: 400 if *docling_url* is missing; 500 if the service is
            unreachable, answers with a non-200 status, or returns a body
            that is not a JSON object.
    """
    if not docling_url:
        raise HTTPException(status_code=400, detail="docling_url is required for docling engine")

    # Tolerate a trailing slash in the configured base URL.
    endpoint = f"{docling_url.rstrip('/')}/convert"

    try:
        # requests is blocking; run it in a worker thread so a slow docling
        # call does not stall the event loop for up to the full timeout.
        response = await asyncio.to_thread(
            requests.post,
            endpoint,
            json={"url": file_url},
            timeout=DOCLING_TIMEOUT_SECONDS,
        )
    except requests.exceptions.RequestException as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to connect docling service: {str(exc)}",
        ) from exc

    if response.status_code != 200:
        raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")

    try:
        result = response.json()
    except ValueError as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Docling service returned invalid JSON: {str(exc)}",
        ) from exc

    if not isinstance(result, dict):
        raise HTTPException(status_code=500, detail="Docling service returned an unexpected payload")

    # "text" may be absent or null; treat both as empty content.
    content = result.get("text") or ""
    return _build_parse_result(content, file_url, total_pages=result.get("num_pages", 1))


# ========== Embedding ==========

@app.post("/embedding")
async def generate_embedding(req: EmbeddingRequest):
    """Generate embeddings (currently placeholder vectors)."""
    try:
        # TODO: dispatch to the real embedding provider selected by req.model.
        # For now every input text gets the same constant vector.
        texts = [req.input] if isinstance(req.input, str) else req.input

        embeddings = [[0.1] * MOCK_EMBEDDING_DIM for _ in texts]

        return {
            "success": True,
            "embeddings": embeddings,
            "model": req.model,
        }
    except Exception as exc:
        logger.exception("Embedding generation failed")
        return _failure(str(exc))


# ========== Chat ==========

@app.post("/chat")
async def chat(req: ChatRequest):
    """LLM chat (currently echoes the last message)."""
    try:
        # TODO: call the real LLM service selected by req.model / req.base_url.
        last_message = req.messages[-1].content if req.messages else ""

        return {
            "success": True,
            "message": {
                "role": "assistant",
                "content": f"Echo: {last_message}",
            },
            "usage": {
                # Placeholder: character count, not a real token count.
                "prompt_tokens": len(last_message),
                "completion_tokens": 10,
            },
        }
    except Exception as exc:
        logger.exception("Chat request failed")
        return _failure(str(exc))


# ========== Health Check ==========

@app.get("/health")
async def health():
    """Liveness probe."""
    return {"status": "ok"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8081)
+source venv/bin/activate +pip install -r requirements.txt -q + +echo "" +echo "启动服务..." +echo "访问 http://localhost:8081/docs 查看 API 文档" +echo "按 Ctrl+C 停止服务" +echo "" + +uvicorn main:app --reload --port 8081 --host 0.0.0.0 diff --git a/screenshots/创建文件夹.png b/screenshots/创建文件夹.png new file mode 100644 index 0000000..37c5f51 Binary files /dev/null and b/screenshots/创建文件夹.png differ diff --git a/screenshots/文件解析失败.png b/screenshots/文件解析失败.png new file mode 100644 index 0000000..9f4821a Binary files /dev/null and b/screenshots/文件解析失败.png differ diff --git a/screenshots/窗口bug.png b/screenshots/窗口bug.png new file mode 100644 index 0000000..c21170a Binary files /dev/null and b/screenshots/窗口bug.png differ diff --git a/team-require/api/knowledge-api.md b/team-require/api/knowledge-api.md new file mode 100644 index 0000000..962a23f --- /dev/null +++ b/team-require/api/knowledge-api.md @@ -0,0 +1,259 @@ +# 知识库 API + +## 基础信息 + +| 项目 | 说明 | +|------|------| +| 基础URL | `http://localhost:8082` | + +## 接口列表 + +### 1. 创建知识库 + +**请求** + +``` +POST /api/knowledge/create +Content-Type: application/json +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| name | String | 是 | 知识库名称 | +| description | String | 否 | 知识库描述 | +| llm_model_id | String | 是 | LLM 模型 ID | +| embedding_model_id | String | 是 | Embedding 模型 ID | +| parsing_config | Object | 是 | 解析配置 | +| - engine | String | 是 | 解析引擎:markitdown / docling | +| - docling_url | String | 条件 | Docling URL(engine=docling 时必填) | +| - enable_pdf | Boolean | 否 | 是否启用 PDF 解析 | +| - pandoc | Boolean | 否 | 是否启用 Pandoc | + +**响应** + +```json +{ + "success": true, + "id": "kb_xxx", + "message": "Knowledge base created successfully" +} +``` + +--- + +### 2. 
获取知识库列表 + +**请求** + +``` +GET /api/knowledge/list +``` + +**响应** + +```json +{ + "success": true, + "data": [ + { + "id": "kb_001", + "name": "产品文档知识库", + "description": "用于存储产品手册", + "llm_model_id": "model_001", + "embedding_model_id": "model_002", + "status": "active", + "document_count": 15, + "chunk_count": 156, + "created_at": "2024-01-15T10:30:00Z", + "updated_at": "2024-01-15T10:30:00Z" + } + ] +} +``` + +--- + +### 3. 获取知识库详情 + +**请求** + +``` +GET /api/knowledge/:id +``` + +**响应** + +```json +{ + "success": true, + "data": { + "id": "kb_001", + "name": "产品文档知识库", + "description": "用于存储产品手册", + "llm_model_id": "model_001", + "embedding_model_id": "model_002", + "parsing_config": { + "engine": "markitdown", + "enable_pdf": true, + "pandoc": true + }, + "status": "active", + "document_count": 15, + "chunk_count": 156, + "created_at": "2024-01-15T10:30:00Z", + "updated_at": "2024-01-15T10:30:00Z" + } +} +``` + +--- + +### 4. 删除知识库 + +**请求** + +``` +DELETE /api/knowledge/:id +``` + +**响应** + +```json +{ + "success": true, + "message": "Knowledge base deleted" +} +``` + +--- + +### 5. 获取知识库下的文档列表 + +**请求** + +``` +GET /api/knowledge/:id/documents +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| status | String | 否 | 过滤状态:all / parsed / parsing / failed | + +**响应** + +```json +{ + "success": true, + "data": [ + { + "id": "doc_001", + "knowledge_base_id": "kb_001", + "name": "产品手册_v2.0.pdf", + "file_size": 2516582, + "status": "parsed", + "chunk_count": 156, + "uploaded_at": "2024-01-15T10:30:00Z" + } + ] +} +``` + +--- + +### 6. 
上传文档到知识库 + +**请求** + +``` +POST /api/knowledge/:id/documents +Content-Type: multipart/form-data +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| file | File | 是 | 要上传的文件 | + +**响应** + +```json +{ + "success": true, + "id": "doc_001", + "url": "http://localhost:8082/files/abc123.pdf", + "document": { + "id": "doc_001", + "knowledge_base_id": "kb_001", + "name": "产品手册_v2.0.pdf", + "file_size": 2516582, + "status": "parsing", + "chunk_count": 0, + "uploaded_at": "2024-01-15T10:30:00Z" + }, + "message": "Document uploaded" +} +``` + +--- + +### 7. 删除知识库文档 + +**请求** + +``` +DELETE /api/knowledge/:id/documents/:doc_id +``` + +**响应** + +```json +{ + "success": true, + "message": "Document deleted" +} +``` + +--- + +### 8. 重新解析文档 + +**请求** + +``` +POST /api/knowledge/:id/documents/:doc_id/reparse +``` + +**响应** + +```json +{ + "success": true, + "message": "Document reparse started" +} +``` + +--- + +### 9. 获取文档预览内容 + +**请求** + +``` +GET /api/knowledge/:id/documents/:doc_id/preview +``` + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| page | Number | 否 | 页码(默认 1) | + +**响应** + +```json +{ + "success": true, + "data": { + "total_pages": 3, + "current_page": 1, + "content": "第一章 产品介绍..." + } +} +```