algorithm/main.py

"""
Algorithm Service - 文档解析、Embedding、LLM 调用服务
"""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, List, Dict, Any
import requests
import os
import json

# Application object; the route decorators further down register their handlers on it.
app = FastAPI(title="Algorithm Service")


# ========== Models ==========

class ParseRequest(BaseModel):
    # Request body accepted by the /parse endpoint.

    # Location of the document; forwarded unchanged to the selected engine.
    file_url: str
    # Parsing backend: the /parse handler recognises "markitdown" and "docling".
    engine: str
    # Base URL of the docling service; the docling path rejects a missing value.
    docling_url: str | None = None


class EmbeddingRequest(BaseModel):
    # Request body accepted by the /embedding endpoint.

    # Either one text or a batch of texts to embed.
    input: str | list[str]
    # Model name; the handler echoes it back in the response.
    model: str


class ChatMessage(BaseModel):
    # A single conversation message; ChatRequest.messages is a list of these.

    # Author of the message; any string is accepted (no validation here).
    role: str
    # Text of the message.
    content: str


class ChatRequest(BaseModel):
    # Request body accepted by the /chat endpoint.

    # Conversation so far; the handler reads the content of the last entry.
    messages: list[ChatMessage]
    # Model name supplied by the caller.
    model: str
    # Sampling temperature; defaults to 0.7 when omitted.
    temperature: float | None = 0.7
    # Optional credentials / endpoint override supplied by the caller.
    api_key: str | None = None
    base_url: str | None = None


# ========== 文档解析 ==========

@app.post("/parse")
async def parse_document(req: ParseRequest):
    """Parse a document with either the markitdown or the docling engine."""
    # NOTE(review): the broad except below also catches the HTTPException
    # instances raised here and inside the engine helpers, so callers always
    # receive HTTP 200 with {"success": False, ...} rather than the 400/500
    # status carried by the exception. Confirm this envelope is intentional.
    try:
        if req.engine == "docling":
            return await parse_with_docling(req.file_url, req.docling_url)
        if req.engine == "markitdown":
            return await parse_with_markitdown(req.file_url)
        # Anything else is not a known backend.
        raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}")
    except Exception as exc:
        failure = {"success": False, "error": str(exc)}
        return failure


async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:
    """Convert a document to text with markitdown and split it into chunks.

    Args:
        file_url: Location of the document; passed as-is to
            ``MarkItDown.convert``.

    Returns:
        A dict with ``success`` (True), the full ``content``, at most the
        first 100 paragraph ``chunks`` (split on blank lines), a fixed
        ``total_pages`` of 1, and ``metadata.filename`` taken from the last
        ``/``-separated segment of ``file_url``.

    Raises:
        HTTPException: status 500 when markitdown cannot be imported, or
            when conversion or chunking fails.
    """
    import asyncio

    # Guard only the import: an ImportError raised later (for example inside
    # convert()) must not be reported as "markitdown not installed".
    try:
        from markitdown import MarkItDown
    except ImportError as exc:
        raise HTTPException(
            status_code=500,
            detail="markitdown not installed. Run: pip install markitdown",
        ) from exc

    try:
        # NOTE(review): file_url comes straight from the request body and is
        # not validated before being handed to convert(); confirm whether
        # local paths or internal hosts need to be rejected here.
        converter = MarkItDown()
        # convert() is synchronous; run it in a worker thread so this
        # coroutine does not block the event loop while it works.
        result = await asyncio.to_thread(converter.convert, file_url)

        # Simple chunking: one chunk per non-empty, blank-line-separated paragraph.
        content = result.text_content if hasattr(result, 'text_content') else str(result)
        chunks = [c.strip() for c in content.split('\n\n') if c.strip()]

        return {
            "success": True,
            "content": content,
            "chunks": chunks[:100],  # cap the number of chunks returned
            "total_pages": 1,
            "metadata": {
                "filename": file_url.split('/')[-1]
            }
        }
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to parse with markitdown: {str(e)}",
        ) from e


async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:
    """Parse a document by delegating to a remote docling service.

    Sends ``{"url": file_url}`` as JSON to ``<docling_url>/convert`` and
    chunks the ``text`` field of the JSON reply on blank lines.

    Args:
        file_url: Location of the document, forwarded to the docling service.
        docling_url: Base URL of the docling service. A trailing ``/`` is
            ignored.

    Returns:
        A dict with ``success`` (True), the full ``content``, at most the
        first 100 ``chunks``, ``total_pages`` (the reply's ``num_pages``,
        default 1), and ``metadata.filename`` taken from the last
        ``/``-separated segment of ``file_url``.

    Raises:
        HTTPException: status 400 when ``docling_url`` is missing; status 500
            when the request fails, the service answers with a non-200
            status, or the reply body is not valid JSON.
    """
    import asyncio

    if not docling_url:
        raise HTTPException(status_code=400, detail="docling_url is required for docling engine")

    # Strip a trailing slash so "http://host/" does not become "http://host//convert".
    endpoint = f"{docling_url.rstrip('/')}/convert"

    try:
        # requests is synchronous; run the call in a worker thread so the
        # event loop stays responsive for up to the 60 s timeout.
        response = await asyncio.to_thread(
            requests.post,
            endpoint,
            json={"url": file_url},
            timeout=60,
        )
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Failed to connect docling service: {str(e)}") from e

    if response.status_code != 200:
        raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")

    try:
        result = response.json()
    except ValueError as e:
        # A 200 reply with a non-JSON body is a service fault, not a
        # connection failure; report it as such.
        raise HTTPException(status_code=500, detail=f"Docling service returned invalid JSON: {str(e)}") from e

    # "text" may be absent or null; treat both as an empty document.
    content = result.get("text") or ""
    chunks = [c.strip() for c in content.split('\n\n') if c.strip()]

    return {
        "success": True,
        "content": content,
        "chunks": chunks[:100],
        "total_pages": result.get("num_pages", 1),
        "metadata": {
            "filename": file_url.split('/')[-1]
        }
    }


# ========== Embedding ==========

@app.post("/embedding")
async def generate_embedding(req: EmbeddingRequest):
    """Generate embeddings (currently returns placeholder vectors)."""
    try:
        # TODO: dispatch to the real embedding service of each provider.
        # Until then every input text gets the same mock vector.

        # Normalise the input to a list so single texts and batches share one path.
        if isinstance(req.input, str):
            batch = [req.input]
        else:
            batch = req.input

        vector_size = 1536  # dimensionality of the mock vectors
        vectors = [[0.1] * vector_size for _ in batch]

        response = {"success": True, "embeddings": vectors, "model": req.model}
        return response
    except Exception as exc:
        return {"success": False, "error": str(exc)}


# ========== Chat ==========

@app.post("/chat")
async def chat(req: ChatRequest):
    """LLM chat (currently echoes the last message back)."""
    try:
        # TODO: call the actual LLM service selected by model and base_url.
        # Until then the reply is a mock built from the last message.

        last_message = ""
        if req.messages:
            last_message = req.messages[-1].content

        reply = {"role": "assistant", "content": f"Echo: {last_message}"}
        # Mock usage figures: character count of the last message and a constant.
        usage = {"prompt_tokens": len(last_message), "completion_tokens": 10}

        return {"success": True, "message": reply, "usage": usage}
    except Exception as exc:
        return {"success": False, "error": str(exc)}


# ========== Health Check ==========

@app.get("/health")
async def health():
    # Health probe: always answers with a static OK payload.
    payload = {"status": "ok"}
    return payload


if __name__ == "__main__":
    # Direct-execution entry point: serve the app on all interfaces, port 8081.
    # uvicorn is imported here so that merely importing this module does not need it.
    from uvicorn import run as run_server

    run_server(app, host="0.0.0.0", port=8081)
feat: 添加算法目录和知识库 API 文档 - 新增 algorithm/ 目录 - 添加知识库 API 需求文档 - 添加相关截图 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-08 20:34:41 +08:00			`"""`
			`Algorithm Service - 文档解析、Embedding、LLM 调用服务`
			`"""`
			`from fastapi import FastAPI, HTTPException`
			`from pydantic import BaseModel`
			`from typing import Optional, List, Dict, Any`
			`import requests`
			`import os`
			`import json`

			`app = FastAPI(title="Algorithm Service")`


			`# ========== Models ==========`

			`class ParseRequest(BaseModel):`
			`file_url: str`
			`engine: str # markitdown / docling`
			`docling_url: Optional[str] = None`


			`class EmbeddingRequest(BaseModel):`
			`input: str \| List[str]`
			`model: str`


			`class ChatMessage(BaseModel):`
			`role: str`
			`content: str`


			`class ChatRequest(BaseModel):`
			`messages: List[ChatMessage]`
			`model: str`
			`temperature: Optional[float] = 0.7`
			`api_key: Optional[str] = None`
			`base_url: Optional[str] = None`


			`# ========== 文档解析 ==========`

			`@app.post("/parse")`
			`async def parse_document(req: ParseRequest):`
			`"""解析文档，支持 markitdown 和 docling"""`
			`try:`
			`if req.engine == "markitdown":`
			`return await parse_with_markitdown(req.file_url)`
			`elif req.engine == "docling":`
			`return await parse_with_docling(req.file_url, req.docling_url)`
			`else:`
			`raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}")`
			`except Exception as e:`
			`return {"success": False, "error": str(e)}`


			`async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:`
			`"""使用 markitdown 解析文档"""`
			`try:`
			`from markitdown import MarkItDown`

			`md = MarkItDown()`
			`result = md.convert(file_url)`

			`# 简单分块（按段落分割）`
			`content = result.text_content if hasattr(result, 'text_content') else str(result)`
			`chunks = [c.strip() for c in content.split('\n\n') if c.strip()]`

			`return {`
			`"success": True,`
			`"content": content,`
			`"chunks": chunks[:100], # 限制 chunk 数量`
			`"total_pages": 1,`
			`"metadata": {`
			`"filename": file_url.split('/')[-1]`
			`}`
			`}`
			`except ImportError:`
			`raise HTTPException(status_code=500, detail="markitdown not installed. Run: pip install markitdown")`
			`except Exception as e:`
			`raise HTTPException(status_code=500, detail=f"Failed to parse with markitdown: {str(e)}")`


			`async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:`
			`"""使用 docling 解析文档"""`
			`if not docling_url:`
			`raise HTTPException(status_code=400, detail="docling_url is required for docling engine")`

			`try:`
			`# 调用 docling 服务`
			`response = requests.post(`
			`f"{docling_url}/convert",`
			`json={"url": file_url},`
			`timeout=60`
			`)`

			`if response.status_code != 200:`
			`raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")`

			`result = response.json()`

			`content = result.get("text", "")`
			`chunks = [c.strip() for c in content.split('\n\n') if c.strip()]`

			`return {`
			`"success": True,`
			`"content": content,`
			`"chunks": chunks[:100],`
			`"total_pages": result.get("num_pages", 1),`
			`"metadata": {`
			`"filename": file_url.split('/')[-1]`
			`}`
			`}`
			`except requests.exceptions.RequestException as e:`
			`raise HTTPException(status_code=500, detail=f"Failed to connect docling service: {str(e)}")`


			`# ========== Embedding ==========`

			`@app.post("/embedding")`
			`async def generate_embedding(req: EmbeddingRequest):`
			`"""生成 Embedding"""`
			`try:`
			`# TODO: 根据不同 provider 调用不同的 embedding 服务`
			`# 目前返回模拟数据`

			`texts = [req.input] if isinstance(req.input, str) else req.input`

			`# 模拟 embedding 返回`
			`embeddings = [[0.1] * 1536 for _ in texts] # 1536 维向量`

			`return {`
			`"success": True,`
			`"embeddings": embeddings,`
			`"model": req.model`
			`}`
			`except Exception as e:`
			`return {"success": False, "error": str(e)}`


			`# ========== Chat ==========`

			`@app.post("/chat")`
			`async def chat(req: ChatRequest):`
			`"""LLM 对话"""`
			`try:`
			`# TODO: 根据 model 和 base_url 调用实际的 LLM 服务`
			`# 目前返回模拟数据`

			`last_message = req.messages[-1].content if req.messages else ""`

			`return {`
			`"success": True,`
			`"message": {`
			`"role": "assistant",`
			`"content": f"Echo: {last_message}"`
			`},`
			`"usage": {`
			`"prompt_tokens": len(last_message),`
			`"completion_tokens": 10`
			`}`
			`}`
			`except Exception as e:`
			`return {"success": False, "error": str(e)}`


			`# ========== Health Check ==========`

			`@app.get("/health")`
			`async def health():`
			`return {"status": "ok"}`


			`if __name__ == "__main__":`
			`import uvicorn`
			`uvicorn.run(app, host="0.0.0.0", port=8081)`