diff --git a/algorithm/README.md b/algorithm/README.md deleted file mode 100644 index eee8baa..0000000 --- a/algorithm/README.md +++ /dev/null @@ -1,112 +0,0 @@ -# Algorithm Service - -Python 算法服务,提供文档解析、Embedding、LLM 调用等功能。 - -## 环境要求 - -- Python 3.9+ -- FastAPI -- Uvicorn - -## 安装依赖 - -```bash -pip install -r requirements.txt -``` - -## 运行服务 - -```bash -# 开发模式 -uvicorn main:app --reload --port 8081 - -# 生产模式 -uvicorn main:app --host 0.0.0.0 --port 8081 -``` - -## 接口列表 - -### 1. 文档解析 - -**请求** - -``` -POST /parse -Content-Type: application/json -``` - -| 参数 | 类型 | 必填 | 说明 | -|------|------|------|------| -| file_url | String | 是 | 文件 URL | -| engine | String | 是 | 解析引擎:markitdown / docling | -| docling_url | String | 否 | Docling 服务 URL | - -**响应** - -```json -{ - "success": true, - "content": "解析后的文本内容...", - "chunks": ["chunk1", "chunk2"], - "total_pages": 10, - "metadata": { - "filename": "document.pdf", - "file_size": 1234567 - } -} -``` - -### 2. 生成 Embedding - -**请求** - -``` -POST /embedding -Content-Type: application/json -``` - -| 参数 | 类型 | 必填 | 说明 | -|------|------|------|------| -| input | String/Array | 是 | 要 embedding 的文本 | -| model | String | 是 | 模型名称 | - -**响应** - -```json -{ - "success": true, - "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]], - "model": "text-embedding-3-small" -} -``` - -### 3. LLM 对话 - -**请求** - -``` -POST /chat -Content-Type: application/json -``` - -| 参数 | 类型 | 必填 | 说明 | -|------|------|------|------| -| messages | Array | 是 | 消息列表 | -| model | String | 是 | 模型名称 | -| temperature | Float | 否 | 温度参数 | - -**响应** - -```json -{ - "success": true, - "message": { - "role": "assistant", - "content": "回复内容..." - }, - "usage": { - "prompt_tokens": 100, - "completion_tokens": 50 - } -} -``` diff --git a/algorithm/main.py b/algorithm/main.py deleted file mode 100644 index af59362..0000000 --- a/algorithm/main.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -Algorithm Service - 文档解析、Embedding、LLM 调用服务 -""" -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel -from typing import Optional, List, Dict, Any -import requests -import os -import json - -app = FastAPI(title="Algorithm Service") - - -# ========== Models ========== - -class ParseRequest(BaseModel): - file_url: str - engine: str # markitdown / docling - docling_url: Optional[str] = None - - -class EmbeddingRequest(BaseModel): - input: str | List[str] - model: str - - -class ChatMessage(BaseModel): - role: str - content: str - - -class ChatRequest(BaseModel): - messages: List[ChatMessage] - model: str - temperature: Optional[float] = 0.7 - api_key: Optional[str] = None - base_url: Optional[str] = None - - -# ========== 文档解析 ========== - -@app.post("/parse") -async def parse_document(req: ParseRequest): - """解析文档,支持 markitdown 和 docling""" - try: - if req.engine == "markitdown": - return await parse_with_markitdown(req.file_url) - elif req.engine == "docling": - return await parse_with_docling(req.file_url, req.docling_url) - else: - raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}") - except Exception as e: - return {"success": False, "error": str(e)} - - -async def parse_with_markitdown(file_url: str) -> Dict[str, Any]: - """使用 markitdown 解析文档""" - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_url) - - # 简单分块(按段落分割) - content = result.text_content if hasattr(result, 'text_content') else str(result) - chunks = [c.strip() for c in content.split('\n\n') if c.strip()] - - return { - "success": True, - "content": content, - "chunks": chunks[:100], # 限制 chunk 数量 - "total_pages": 1, - "metadata": { - "filename": file_url.split('/')[-1] - } - } - except ImportError: - raise HTTPException(status_code=500, detail="markitdown not installed. Run: pip install markitdown") - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to parse with markitdown: {str(e)}") - - -async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]: - """使用 docling 解析文档""" - if not docling_url: - raise HTTPException(status_code=400, detail="docling_url is required for docling engine") - - try: - # 调用 docling 服务 - response = requests.post( - f"{docling_url}/convert", - json={"url": file_url}, - timeout=60 - ) - - if response.status_code != 200: - raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}") - - result = response.json() - - content = result.get("text", "") - chunks = [c.strip() for c in content.split('\n\n') if c.strip()] - - return { - "success": True, - "content": content, - "chunks": chunks[:100], - "total_pages": result.get("num_pages", 1), - "metadata": { - "filename": file_url.split('/')[-1] - } - } - except requests.exceptions.RequestException as e: - raise HTTPException(status_code=500, detail=f"Failed to connect docling service: {str(e)}") - - -# ========== Embedding ========== - -@app.post("/embedding") -async def generate_embedding(req: EmbeddingRequest): - """生成 Embedding""" - try: - # TODO: 根据不同 provider 调用不同的 embedding 服务 - # 目前返回模拟数据 - - texts = [req.input] if isinstance(req.input, str) else req.input - - # 模拟 embedding 返回 - embeddings = [[0.1] * 1536 for _ in texts] # 1536 维向量 - - return { - "success": True, - "embeddings": embeddings, - "model": req.model - } - except Exception as e: - return {"success": False, "error": str(e)} - - -# ========== Chat ========== - -@app.post("/chat") -async def chat(req: ChatRequest): - """LLM 对话""" - try: - # TODO: 根据 model 和 base_url 调用实际的 LLM 服务 - # 目前返回模拟数据 - - last_message = req.messages[-1].content if req.messages else "" - - return { - "success": True, - "message": { - "role": "assistant", - "content": f"Echo: {last_message}" - }, - "usage": { - "prompt_tokens": len(last_message), - "completion_tokens": 10 - } - } - except Exception as e: - return {"success": False, "error": str(e)} - - -# ========== Health Check ========== - -@app.get("/health") -async def health(): - return {"status": "ok"} - - -if __name__ == "__main__": - import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8081) diff --git a/algorithm/requirements.txt b/algorithm/requirements.txt deleted file mode 100644 index e32d43d..0000000 --- a/algorithm/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -# FastAPI -fastapi>=0.100.0 -uvicorn[standard]>=0.23.0 - -# HTTP 请求 -requests>=2.31.0 - -# 文档解析 -markitdown>=0.0.1 - -# Pydantic -pydantic>=2.0.0 - -# 可选:其他解析库 -# docling>=0.1.0 -# pypdf>=3.0.0 -# python-docx>=0.8.11 diff --git a/algorithm/start.bat b/algorithm/start.bat deleted file mode 100644 index 0a374e2..0000000 --- a/algorithm/start.bat +++ /dev/null @@ -1,30 +0,0 @@ -@echo off -chcp 65001 >nul -title Algorithm Service - -echo ======================================== -echo 启动 Algorithm 服务 -echo ======================================== - -cd /d %~dp0 - -echo. -echo 检查虚拟环境... -if not exist venv ( - echo [INFO] 创建虚拟环境... - python -m venv venv -) - -echo. -echo 安装/更新依赖... -call venv\Scripts\pip install -r requirements.txt -q - -echo. -echo 启动服务... -echo 访问 http://localhost:8081/docs 查看 API 文档 -echo 按 Ctrl+C 停止服务 -echo. - -call venv\Scripts\uvicorn main:app --reload --port 8081 --host 0.0.0.0 - -pause diff --git a/algorithm/start.sh b/algorithm/start.sh deleted file mode 100644 index f60d9ad..0000000 --- a/algorithm/start.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -echo "========================================" -echo " 启动 Algorithm 服务" -echo "========================================" - -cd "$(dirname "$0")" - -# 检查虚拟环境 -if [ ! -d "venv" ]; then - echo "[INFO] 创建虚拟环境..." - python3 -m venv venv -fi - -echo "" -echo "安装/更新依赖..." -source venv/bin/activate -pip install -r requirements.txt -q - -echo "" -echo "启动服务..." -echo "访问 http://localhost:8081/docs 查看 API 文档" -echo "按 Ctrl+C 停止服务" -echo "" - -uvicorn main:app --reload --port 8081 --host 0.0.0.0