- 新增 algorithm/ 目录 - 添加知识库 API 需求文档 - 添加相关截图 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
176 lines
4.9 KiB
Python
176 lines
4.9 KiB
Python
"""
|
|
Algorithm Service - 文档解析、Embedding、LLM 调用服务
|
|
"""
|
|
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel
|
|
from typing import Optional, List, Dict, Any
|
|
import requests
|
|
import os
|
|
import json
|
|
|
|
app = FastAPI(title="Algorithm Service")
|
|
|
|
|
|
# ========== Models ==========
|
|
|
|
# Request body for POST /parse.
class ParseRequest(BaseModel):
    # Location of the document to parse; handed to the selected engine as-is.
    file_url: str
    # Parsing backend to use: "markitdown" or "docling".
    engine: str
    # Base URL of the docling service; only read when engine is "docling".
    docling_url: str | None = None
|
|
|
|
|
|
# Request body for POST /embedding.
class EmbeddingRequest(BaseModel):
    # Either one text or a batch of texts to embed.
    input: str | List[str]
    # Name of the embedding model; echoed back in the response.
    model: str
|
|
|
|
|
|
# A single turn of a chat conversation.
class ChatMessage(BaseModel):
    # Speaker of the message (free-form string; not validated here).
    role: str
    # Text of the message.
    content: str
|
|
|
|
|
|
# Request body for POST /chat.
class ChatRequest(BaseModel):
    # Conversation history, oldest message first; the last entry is the prompt.
    messages: list[ChatMessage]
    # Name of the LLM to use.
    model: str
    # Sampling temperature; defaults to 0.7 when omitted.
    temperature: float | None = 0.7
    # Optional credentials / endpoint override for the upstream LLM provider.
    api_key: str | None = None
    base_url: str | None = None
|
|
|
|
|
|
# ========== 文档解析 ==========
|
|
|
|
@app.post("/parse")
async def parse_document(req: ParseRequest):
    """Parse a document with the engine named in the request.

    Supported engines are ``"markitdown"`` and ``"docling"``. Failures are
    never propagated as HTTP errors from this endpoint: every exception is
    converted into a ``{"success": False, "error": <message>}`` body so the
    response shape matches the other endpoints in this service.

    Args:
        req: Parsed request body carrying ``file_url``, ``engine`` and, for
            docling, ``docling_url``.

    Returns:
        The engine's result dict on success, otherwise the failure envelope
        described above.
    """
    try:
        if req.engine == "markitdown":
            return await parse_with_markitdown(req.file_url)
        if req.engine == "docling":
            return await parse_with_docling(req.file_url, req.docling_url)
        raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}")
    except HTTPException as exc:
        # str(exc) on an HTTPException is not the bare detail text (current
        # Starlette renders it as "<status_code>: <detail>"), so read .detail
        # directly to give the caller the intended message.
        return {"success": False, "error": str(exc.detail)}
    except Exception as exc:
        # Anything else: keep the best-effort envelope rather than a 500 page.
        return {"success": False, "error": str(exc)}
|
|
|
|
|
|
async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:
    """Convert a document to text with MarkItDown and split it into chunks.

    Args:
        file_url: Location of the document; passed straight to
            ``MarkItDown.convert``.

    Returns:
        A dict with ``success`` (True), ``content`` (the full text),
        ``chunks`` (the first 100 non-empty paragraphs, split on blank
        lines), ``total_pages`` (always 1) and ``metadata.filename`` (the
        last ``/``-separated segment of ``file_url``).

    Raises:
        HTTPException: 500 when the ``markitdown`` package cannot be
            imported, or when conversion fails for any reason.
    """
    import asyncio  # local import: keeps this block self-contained

    # Import lazily so the service can start without markitdown installed.
    # Only a failure of this import is reported as "not installed"; an
    # ImportError raised later, during conversion, is a parse failure.
    try:
        from markitdown import MarkItDown
    except ImportError as e:
        raise HTTPException(
            status_code=500,
            detail="markitdown not installed. Run: pip install markitdown",
        ) from e

    try:
        md = MarkItDown()
        # convert() is synchronous; run it in a worker thread so a slow
        # conversion does not block the event loop for other requests.
        result = await asyncio.to_thread(md.convert, file_url)

        # Simple chunking: one chunk per blank-line-separated paragraph.
        content = result.text_content if hasattr(result, 'text_content') else str(result)
        chunks = [c.strip() for c in content.split('\n\n') if c.strip()]

        return {
            "success": True,
            "content": content,
            "chunks": chunks[:100],  # cap the number of chunks returned
            "total_pages": 1,
            "metadata": {
                "filename": file_url.split('/')[-1]
            }
        }
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to parse with markitdown: {str(e)}",
        ) from e
|
|
|
|
|
|
async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:
    """Parse a document by delegating to a remote docling service.

    POSTs ``{"url": file_url}`` to ``<docling_url>/convert`` and splits the
    ``text`` field of the JSON reply into paragraph chunks on blank lines.

    Args:
        file_url: Location of the document; forwarded to docling unchanged.
        docling_url: Base URL of the docling service. Required.

    Returns:
        A dict with ``success`` (True), ``content``, ``chunks`` (at most
        100), ``total_pages`` (the reply's ``num_pages``, default 1) and
        ``metadata.filename`` (last ``/``-separated segment of ``file_url``).

    Raises:
        HTTPException: 400 when ``docling_url`` is missing; 500 when the
            request fails or the service answers with a non-200 status.
    """
    import asyncio  # local import: keeps this block self-contained

    if not docling_url:
        raise HTTPException(status_code=400, detail="docling_url is required for docling engine")

    # Tolerate a trailing slash on the base URL so we never request "//convert".
    endpoint = f"{docling_url.rstrip('/')}/convert"

    try:
        # requests is synchronous; run the call in a worker thread so a slow
        # docling reply (up to the 60 s timeout) does not stall the event loop.
        response = await asyncio.to_thread(
            requests.post,
            endpoint,
            json={"url": file_url},
            timeout=60,
        )

        if response.status_code != 200:
            raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")

        result = response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to connect docling service: {str(e)}",
        ) from e

    # Guard against a missing or null "text" field so split() always gets a str.
    content = result.get("text") or ""
    chunks = [c.strip() for c in content.split('\n\n') if c.strip()]

    return {
        "success": True,
        "content": content,
        "chunks": chunks[:100],  # cap the number of chunks returned
        "total_pages": result.get("num_pages", 1),
        "metadata": {
            "filename": file_url.split('/')[-1]
        }
    }
|
|
|
|
|
|
# ========== Embedding ==========
|
|
|
|
@app.post("/embedding")
async def generate_embedding(req: EmbeddingRequest):
    """Return placeholder embeddings for the submitted text(s).

    A single string is treated as a batch of one. Each text currently maps
    to the same mock vector of 1536 values of 0.1. Any exception is reported
    as ``{"success": False, "error": <message>}``.
    """
    try:
        # TODO: call the real embedding service for the selected provider.
        # Until then the endpoint answers with mock data.
        if isinstance(req.input, str):
            batch = [req.input]
        else:
            batch = req.input

        vectors = []
        for _ in batch:
            vectors.append([0.1] * 1536)  # 1536-dimensional mock vector

        reply = {
            "success": True,
            "embeddings": vectors,
            "model": req.model,
        }
        return reply
    except Exception as err:
        return {"success": False, "error": str(err)}
|
|
|
|
|
|
# ========== Chat ==========
|
|
|
|
@app.post("/chat")
async def chat(req: ChatRequest):
    """Answer a chat request with a mock echo of the last message.

    The reply content is ``"Echo: "`` followed by the content of the final
    message (empty when no messages were sent). ``prompt_tokens`` is the
    character length of that content and ``completion_tokens`` is fixed at
    10. Any exception is reported as
    ``{"success": False, "error": <message>}``.
    """
    try:
        # TODO: call the real LLM service selected by model and base_url.
        # Until then the endpoint answers with mock data.
        if req.messages:
            last_message = req.messages[-1].content
        else:
            last_message = ""

        assistant_reply = {
            "role": "assistant",
            "content": f"Echo: {last_message}",
        }
        token_usage = {
            "prompt_tokens": len(last_message),
            "completion_tokens": 10,
        }
        return {
            "success": True,
            "message": assistant_reply,
            "usage": token_usage,
        }
    except Exception as err:
        return {"success": False, "error": str(err)}
|
|
|
|
|
|
# ========== Health Check ==========
|
|
|
|
@app.get("/health")
async def health():
    """Liveness probe: always answers ``{"status": "ok"}``."""
    status_report = {"status": "ok"}
    return status_report
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 8081. uvicorn is imported here so importing this module as a
    # library does not require it.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8081)
|