refactor: 重构 ai-core 代码结构
- 移除旧的 parser 和 grpc_server 实现
- 保留必要的配置和 proto 文件
- 删除 docker-compose.dev.yml

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,33 +1,31 @@
|
||||
# AI-Core 文档解析服务
|
||||
|
||||
基于 Python 和 Microsoft MarkItDown 的 gRPC 文档解析服务,支持多种文件格式转换为 Markdown。
|
||||
基于 Python 的 gRPC 文档解析服务,支持多种文件格式转换为 Markdown。
|
||||
|
||||
## 特性
|
||||
## 功能特性
|
||||
|
||||
- **统一解析引擎** - 使用 Microsoft MarkItDown,一个库支持所有格式
|
||||
- **支持格式广泛** - PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页等
|
||||
- **gRPC 接口** - 高性能、类型安全的 RPC 通信
|
||||
- **依赖简单** - 只需安装 3 个核心包
|
||||
- **易于部署** - 一键启动,开箱即用
|
||||
- 支持多种文件格式:PDF、DOCX、DOC、XLSX、XLS、CSV、Markdown、图片等
|
||||
- 多解析引擎支持(builtin、markitdown)
|
||||
- gRPC 接口,高性能通信
|
||||
- 支持通过 URL 下载文件并解析
|
||||
- 可配置的解析引擎和参数
|
||||
|
||||
## 项目结构
|
||||
|
||||
```
|
||||
ai-core/
|
||||
├── main.py # 服务启动入口
|
||||
├── requirements.txt # Python 依赖(仅 3 个包)
|
||||
├── generate_grpc.py # gRPC 代码生成脚本
|
||||
├── start.sh # Linux/Mac 启动脚本
|
||||
├── start.ps1 # Windows 启动脚本
|
||||
├── requirements.txt # Python 依赖
|
||||
├── proto/ # gRPC 协议定义
|
||||
│ ├── document_parser.proto # Protocol Buffers 定义
|
||||
│ ├── document_parser_pb2.py # 生成的 Python 代码
|
||||
│ └── document_parser_pb2_grpc.py
|
||||
│ └── document_parser.proto # Protocol Buffers 定义
|
||||
├── parser/ # 文档解析器模块
|
||||
│ ├── __init__.py
|
||||
│ └── parser.py # MarkItDown 解析器
|
||||
│ ├── base_parser.py # 基础解析器接口
|
||||
│ ├── parser.py # 解析器门面
|
||||
│ ├── registry.py # 解析器注册表
|
||||
│ ├── docx_parser.py # DOCX 解析器
|
||||
│ ├── pdf_parser.py # PDF 解析器
|
||||
│ └── ...
|
||||
└── service/ # gRPC 服务实现
|
||||
├── __init__.py
|
||||
└── grpc_server.py # gRPC 服务器
|
||||
```
|
||||
|
||||
@@ -39,39 +37,19 @@ ai-core/
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
依赖包:
|
||||
- `grpcio` - gRPC 框架
|
||||
- `grpcio-tools` - gRPC 工具
|
||||
- `grpcio-reflection` - gRPC 反射
|
||||
- `protobuf` - Protocol Buffers
|
||||
- `requests` - HTTP 请求
|
||||
- `markitdown` - Microsoft 文档解析引擎
|
||||
|
||||
### 2. 生成 gRPC 代码
|
||||
|
||||
```bash
|
||||
python generate_grpc.py
|
||||
python -m grpc_tools.protoc \
|
||||
--proto_path=proto \
|
||||
--python_out=proto \
|
||||
--grpc_python_out=proto \
|
||||
proto/document_parser.proto
|
||||
```
|
||||
|
||||
这会在 `proto` 目录下生成两个文件:
|
||||
- `document_parser_pb2.py`
|
||||
- `document_parser_pb2_grpc.py`
|
||||
|
||||
## 使用
|
||||
|
||||
### 方式 1: 使用启动脚本(推荐)
|
||||
|
||||
**Windows:**
|
||||
```powershell
|
||||
.\start.ps1
|
||||
```
|
||||
|
||||
**Linux/Mac:**
|
||||
```bash
|
||||
bash start.sh
|
||||
```
|
||||
|
||||
### 方式 2: 直接运行
|
||||
### 启动服务
|
||||
|
||||
```bash
|
||||
python main.py --port 50051 --max-workers 10
|
||||
@@ -82,9 +60,9 @@ python main.py --port 50051 --max-workers 10
|
||||
- `--max-workers`: 最大工作线程数(默认 10)
|
||||
- `--log-level`: 日志级别(DEBUG/INFO/WARNING/ERROR,默认 INFO)
|
||||
|
||||
## gRPC 接口
|
||||
### gRPC 接口
|
||||
|
||||
### ParseDocument
|
||||
#### ParseDocument
|
||||
|
||||
解析文档为 Markdown
|
||||
|
||||
@@ -92,129 +70,80 @@ python main.py --port 50051 --max-workers 10
|
||||
message ParseRequest {
|
||||
string file_url = 1; // 文件 URL(必填)
|
||||
string file_name = 2; // 文件名(必填)
|
||||
string file_type = 3; // 文件类型(可选)
|
||||
string parser_engine = 4; // 解析引擎(可选)
|
||||
string file_type = 3; // 文件类型(必填,如 pdf、docx)
|
||||
string parser_engine = 4; // 解析引擎(可选,默认 builtin)
|
||||
map<string, string> engine_overrides = 5;// 引擎参数覆盖(可选)
|
||||
}
|
||||
|
||||
message ParseResponse {
|
||||
bool success = 1; // 是否成功
|
||||
string content = 2; // Markdown 内容
|
||||
string message = 3; // 消息
|
||||
string message = 3; // 消息
|
||||
int32 content_length = 4; // 内容长度
|
||||
string file_type = 5; // 文件类型
|
||||
string parser_engine = 6; // 使用的解析引擎
|
||||
}
|
||||
```
|
||||
|
||||
### GetSupportedFormats
|
||||
#### GetSupportedFormats
|
||||
|
||||
获取支持的文件格式列表
|
||||
|
||||
### GetEngines
|
||||
#### GetEngines
|
||||
|
||||
获取可用的解析引擎列表
|
||||
|
||||
## Go 客户端调用示例
|
||||
|
||||
```go
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
||||
func main() {
|
||||
conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to connect: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
client := docparser.NewDocumentParserClient(conn)
|
||||
|
||||
resp, err := client.ParseDocument(context.Background(), &docparser.ParseRequest{
|
||||
FileUrl: "http://localhost:8082/files/abc123.pdf",
|
||||
FileName: "example.pdf",
|
||||
FileType: "pdf",
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to parse: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Success: %v", resp.Success)
|
||||
log.Printf("Content length: %d", resp.ContentLength)
|
||||
log.Printf("Markdown content:\n%s", resp.Content)
|
||||
conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to connect: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
client := docparser.NewDocumentParserClient(conn)
|
||||
|
||||
resp, err := client.ParseDocument(context.Background(), &docparser.ParseRequest{
|
||||
FileUrl: "http://localhost:8082/files/abc123.pdf",
|
||||
FileName: "example.pdf",
|
||||
FileType: "pdf",
|
||||
ParserEngine: "builtin",
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to parse: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println("Markdown content:")
|
||||
fmt.Println(resp.Content)
|
||||
```
|
||||
|
||||
## 支持的文件格式
|
||||
|
||||
| 类别 | 支持的扩展名 |
|
||||
|------|-------------|
|
||||
| **文档** | pdf, docx, doc, pptx, ppt |
|
||||
| **表格** | xlsx, xls, csv |
|
||||
| **文本** | md, markdown, txt |
|
||||
| **图片** | jpg, jpeg, png, gif, bmp, tiff, webp |
|
||||
| **网页** | html, htm |
|
||||
|
||||
## 为什么选择 MarkItDown?
|
||||
|
||||
1. **微软官方支持** - Microsoft 开发,持续维护
|
||||
2. **格式覆盖全** - 一个库支持所有常见格式
|
||||
3. **统一接口** - 无需为每种格式单独实现
|
||||
4. **安装简单** - 只需 `pip install markitdown`
|
||||
5. **性能优秀** - 基于优化的解析算法
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 端口已被占用
|
||||
|
||||
如果提示端口 50051 已被占用,可以更换端口:
|
||||
|
||||
```bash
|
||||
python main.py --port 50052
|
||||
```
|
||||
|
||||
### gRPC 代码未生成
|
||||
|
||||
如果提示找不到 `document_parser_pb2`,运行:
|
||||
|
||||
```bash
|
||||
python generate_grpc.py
|
||||
```
|
||||
|
||||
### 依赖安装失败
|
||||
|
||||
确保使用 Python 3.10+(MarkItDown 要求 Python 3.10 及以上):
|
||||
|
||||
```bash
|
||||
python --version
|
||||
pip --version
|
||||
```
|
||||
| 格式 | 扩展名 | 说明 |
|
||||
|------|--------|------|
|
||||
| PDF | pdf | PDF 文档 |
|
||||
| Word | docx, doc | Microsoft Word 文档 |
|
||||
| Excel | xlsx, xls | Microsoft Excel 表格 |
|
||||
| CSV | csv | 逗号分隔值文件 |
|
||||
| Markdown | md, markdown | Markdown 文件 |
|
||||
| 图片 | jpg, jpeg, png, gif, bmp, tiff, webp | 常见图片格式 |
|
||||
| PowerPoint | pptx, ppt | PowerPoint 演示文稿 |
|
||||
|
||||
## 开发
|
||||
|
||||
### 测试解析器
|
||||
### 添加新的解析器
|
||||
|
||||
```python
|
||||
from parser import Parser
|
||||
1. 继承 `BaseParser` 类
|
||||
2. 实现 `parse_into_text` 方法
|
||||
3. 在 `registry.py` 中注册
|
||||
|
||||
parser = Parser()
|
||||
### 添加新的解析引擎
|
||||
|
||||
# 解析文件
|
||||
result = parser.parse("path/to/file.pdf")
|
||||
print(result["content"])
|
||||
|
||||
# 解析字节内容
|
||||
with open("file.pdf", "rb") as f:
|
||||
content = f.read()
|
||||
result = parser.parse_bytes(content, "file.pdf")
|
||||
print(result["content"])
|
||||
```
|
||||
1. 在 `registry.py` 中使用 `register()` 方法注册
|
||||
2. 提供 `check_available` 函数检查依赖
|
||||
3. 添加对应的解析器类
|
||||
|
||||
## 许可证
|
||||
|
||||
|
||||
@@ -1,59 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from service.grpc_server import serve
|
||||
|
||||
DEFAULT_PORT = 50051
|
||||
DEFAULT_MAX_WORKERS = 10
|
||||
|
||||
def main():
    """Parse command-line options, configure logging, and run the gRPC server.

    Command-line options:
        --port: port to listen on (default ``DEFAULT_PORT``).
        --max-workers: maximum number of worker threads
            (default ``DEFAULT_MAX_WORKERS``).
        --log-level: one of DEBUG, INFO, WARNING, ERROR (default INFO).

    The call to ``serve`` is wrapped so that Ctrl-C is logged as a requested
    shutdown, while any other exception is logged with its traceback and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Document Parser gRPC Server (MarkItDown)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--port",
        type=int,
        default=DEFAULT_PORT,
        help="Port to listen on",
    )
    parser.add_argument(
        "--max-workers",
        type=int,
        default=DEFAULT_MAX_WORKERS,
        help="Maximum number of worker threads",
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Log level",
    )

    args = parser.parse_args()

    # ``choices`` above guarantees the name maps to a real logging level.
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    logger = logging.getLogger(__name__)
    logger.info("Starting Document Parser gRPC Server (MarkItDown)")
    logger.info("Port: %d", args.port)
    logger.info("Max workers: %d", args.max_workers)

    try:
        serve(port=args.port, max_workers=args.max_workers)
    except KeyboardInterrupt:
        logger.info("Server shutdown requested")
    except Exception as e:
        logger.error("Server error: %s", str(e), exc_info=True)
        sys.exit(1)
|
||||
|
||||
# Run the server only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -1,9 +1,38 @@
|
||||
"""
|
||||
Parser module for AI-Core document processing system.
|
||||
Parser module for WeKnora document processing system.
|
||||
|
||||
This module provides document parsing using Microsoft MarkItDown.
|
||||
This module provides document parsers for various file formats including:
|
||||
- Microsoft Word documents (.doc, .docx)
|
||||
- PDF documents
|
||||
- Markdown files
|
||||
- Plain text files
|
||||
- Images with text content
|
||||
- Web pages
|
||||
|
||||
The parsers extract content from documents and can split them into
|
||||
meaningful chunks for further processing and indexing.
|
||||
"""
|
||||
|
||||
from .doc_parser import DocParser
|
||||
from .docx2_parser import Docx2Parser
|
||||
from .excel_parser import ExcelParser
|
||||
from .image_parser import ImageParser
|
||||
from .markdown_parser import MarkdownParser
|
||||
from .parser import Parser
|
||||
from .pdf_parser import PDFParser
|
||||
from .registry import ParserEngineRegistry, registry
|
||||
from .web_parser import WebParser
|
||||
|
||||
__all__ = ["Parser"]
|
||||
# Export public classes and modules
|
||||
__all__ = [
|
||||
"Docx2Parser",
|
||||
"DocParser",
|
||||
"PDFParser",
|
||||
"MarkdownParser",
|
||||
"ImageParser",
|
||||
"WebParser",
|
||||
"Parser",
|
||||
"ExcelParser",
|
||||
"ParserEngineRegistry",
|
||||
"registry",
|
||||
]
|
||||
|
||||
@@ -1,199 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Optional, Dict, Any
|
||||
from markitdown import MarkItDown
|
||||
|
||||
from .vlm_client import VLMClient
|
||||
from .config import get_vlm_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Parser:
    """Unified document parser built on MarkItDown, with optional VLM image parsing.

    MarkItDown handles every input. When a VLM client is available, image
    files passed to ``parse_bytes`` are sent to the VLM instead.

    A VLM can be configured in two ways:
      1. At start-up, from whatever ``get_vlm_config()`` returns.
      2. Per call, via the ``vlm_config`` argument of ``parse_bytes``; this
         takes precedence over the start-up configuration for that call only.

    Per-call configuration is never stored on the instance. One ``Parser`` is
    shared by all gRPC worker threads, so persisting a caller's config (which
    carries an API key) would leak it into other requests.
    """

    # Image extensions routed to the VLM, with the MIME type sent alongside the bytes.
    _IMAGE_MIME_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.tiff': 'image/tiff',
    }

    def __init__(self):
        """Create the MarkItDown converter and, if configured, the default VLM client."""
        self.markitdown = MarkItDown()
        self.vlm_client: Optional[VLMClient] = None

        # Try to load the globally configured VLM.
        vlm_config = get_vlm_config()
        if vlm_config:
            self.vlm_client = VLMClient(vlm_config)
            logger.info(f"VLM enabled: provider={vlm_config.get('provider')}, model={vlm_config.get('model')}")
        else:
            logger.info("VLM not configured, using MarkItDown only")

    def set_vlm_config(self, config: Dict[str, Any]) -> None:
        """Replace the instance-wide default VLM client.

        The client is created only when ``config`` is truthy, has a truthy
        ``"enabled"`` value and a truthy ``"api_key"``; otherwise the default
        client is cleared. This affects every later call on this instance, so
        it must not be used for per-request settings.
        """
        if config and config.get("enabled") and config.get("api_key"):
            self.vlm_client = VLMClient(config)
            logger.info(f"VLM enabled: provider={config.get('provider')}, model={config.get('model')}")
        else:
            self.vlm_client = None
            logger.info("VLM disabled")

    def parse(self, file_path: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """Convert a document to Markdown with MarkItDown.

        Args:
            file_path: File path or URL handed to ``MarkItDown.convert``.
            file_type: Accepted for interface compatibility; not used here.
            vlm_config: Accepted for interface compatibility. This method never
                calls the VLM, and the value is intentionally not stored.

        Returns:
            dict: On success ``success``, ``content``, ``content_length`` and
            ``metadata``; on failure ``success=False``, empty ``content``,
            ``content_length=0`` and ``error``. Exceptions are not propagated.
        """
        try:
            logger.info(f"Parsing file: {file_path}")
            return self._success(self.markitdown.convert(file_path))
        except Exception as e:
            logger.error(f"Parse error: {e}", exc_info=True)
            return self._failure(str(e))

    def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """Convert in-memory file content to Markdown.

        Image files go to the VLM when a client is available for this call;
        everything else is written to a temporary file and converted with
        MarkItDown.

        Args:
            content: Raw file bytes.
            file_name: File name; only its extension is used (routing and
                temporary-file suffix).
            file_type: Accepted for interface compatibility; not used here.
            vlm_config: Optional VLM configuration for this call only. When
                given it overrides the instance default; a config without
                ``enabled`` and ``api_key`` means no VLM for this call.

        Returns:
            dict: Same shape as ``parse``. Exceptions are not propagated.
        """
        try:
            logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes")

            # Resolve the client locally instead of mutating shared state.
            vlm_client = self._client_for_call(vlm_config)
            if vlm_client and self._is_image(file_name):
                logger.info("Using VLM for parsing")
                return self._analyze_with(vlm_client, content, file_name)

            # Otherwise use MarkItDown, which is given a path here; keep the
            # extension on the temporary file.
            suffix = os.path.splitext(file_name)[1]
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
            temp_path = temp_file.name
            try:
                # Close the handle before converting so the file can be reopened by path.
                with temp_file:
                    temp_file.write(content)
                return self._success(self.markitdown.convert(temp_path))
            finally:
                # Runs even if the write above fails, so the file never leaks.
                os.unlink(temp_path)
        except Exception as e:
            logger.error(f"Parse bytes error: {e}", exc_info=True)
            return self._failure(str(e))

    def _should_use_vlm(self, file_name: str) -> bool:
        """Return True when a default VLM client exists and ``file_name`` is an image."""
        return bool(self.vlm_client) and self._is_image(file_name)

    def _parse_with_vlm(self, content: bytes, file_name: str) -> dict:
        """Analyse image bytes with the instance's default VLM client."""
        if not self.vlm_client:
            return self._failure("VLM not configured")
        return self._analyze_with(self.vlm_client, content, file_name)

    def _client_for_call(self, vlm_config: Optional[Dict[str, Any]]) -> Optional[VLMClient]:
        """Pick the VLM client for a single call without touching instance state."""
        if not vlm_config:
            return self.vlm_client
        if vlm_config.get("enabled") and vlm_config.get("api_key"):
            logger.info(f"VLM enabled: provider={vlm_config.get('provider')}, model={vlm_config.get('model')}")
            return VLMClient(vlm_config)
        logger.info("VLM disabled")
        return None

    def _is_image(self, file_name: str) -> bool:
        """Return True when the file extension is one of the VLM-handled image types."""
        return os.path.splitext(file_name)[1].lower() in self._IMAGE_MIME_TYPES

    def _analyze_with(self, vlm_client: VLMClient, content: bytes, file_name: str) -> dict:
        """Send image bytes to ``vlm_client`` and convert its reply to a result dict."""
        ext = os.path.splitext(file_name)[1].lower()
        # Unknown extensions fall back to PNG.
        mime_type = self._IMAGE_MIME_TYPES.get(ext, 'image/png')

        try:
            result = vlm_client.analyze_image(content, mime_type)

            if result.get("success"):
                return {
                    "success": True,
                    "content": result["content"],
                    "content_length": len(result["content"]),
                    "metadata": {"vlm_used": True}
                }
            return self._failure(result.get("error", "VLM parsing failed"))
        except Exception as e:
            logger.error(f"VLM parsing error: {e}")
            return self._failure(str(e))

    @staticmethod
    def _success(result) -> dict:
        """Build the success dict from a MarkItDown conversion result."""
        logger.info(f"Parse successful: {len(result.text_content)} characters")
        return {
            "success": True,
            "content": result.text_content,
            "content_length": len(result.text_content),
            "metadata": result.metadata if hasattr(result, 'metadata') else {}
        }

    @staticmethod
    def _failure(error: str) -> dict:
        """Build the failure dict returned by every parsing path."""
        return {
            "success": False,
            "content": "",
            "content_length": 0,
            "error": error
        }
|
||||
|
||||
|
||||
# Manual smoke test: converts a public web page and prints the outcome.
# NOTE: this module uses relative imports (``from .vlm_client import ...``), so
# it has to be run as a module inside its package rather than as a bare script.
if __name__ == "__main__":
    parser = Parser()

    test_url = "https://example.com"
    result = parser.parse(test_url)
    print(f"Success: {result['success']}")
    print(f"Content length: {result['content_length']}")
|
||||
@@ -1,14 +0,0 @@
|
||||
# AI-Core Document Parser - 基于 MarkItDown
|
||||
|
||||
# gRPC 框架
|
||||
grpcio>=1.60.0
|
||||
grpcio-tools>=1.60.0
|
||||
grpcio-reflection>=1.60.0
|
||||
protobuf>=4.25.0
|
||||
|
||||
# 配置文件解析
|
||||
pyyaml>=6.0
|
||||
requests>=2.31.0
|
||||
|
||||
# 文档解析 - markitdown 及其所有依赖
|
||||
markitdown[pdf,docx,pptx,xlsx,all]>=0.0.1
|
||||
@@ -1,259 +0,0 @@
|
||||
import logging
|
||||
import requests
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
from grpc_reflection.v1alpha import reflection
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto"))
|
||||
|
||||
from parser.parser import Parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
docparser_pb2 = None
|
||||
docparser_pb2_grpc = None
|
||||
|
||||
def _import_grpc_protobuf():
    """Load the generated protobuf modules into the module-level globals.

    Imports ``document_parser_pb2`` and ``document_parser_pb2_grpc`` and binds
    them to ``docparser_pb2`` / ``docparser_pb2_grpc``. Does nothing when both
    globals are already set, so it is safe to call more than once.

    Raises:
        ImportError: if either generated module cannot be imported; the
            original error is chained as the cause.
    """
    global docparser_pb2, docparser_pb2_grpc

    if docparser_pb2 is not None and docparser_pb2_grpc is not None:
        return

    try:
        import document_parser_pb2 as dpb2
        import document_parser_pb2_grpc as dpb2_grpc
    except ImportError as e:
        logger.error(f"Failed to import gRPC protobuf: {e}")
        raise ImportError(
            "gRPC protobuf files not found. Please run: python generate_grpc.py"
        ) from e

    docparser_pb2 = dpb2
    docparser_pb2_grpc = dpb2_grpc
    logger.info("Successfully imported gRPC protobuf modules")
|
||||
|
||||
|
||||
class DocumentParserServicer:
    """gRPC service implementation backed by the MarkItDown-based ``Parser``.

    Provides the ``ParseDocument``, ``GetSupportedFormats`` and ``GetEngines``
    RPC handlers. A single ``Parser`` instance is created per servicer and
    used for every request.
    """

    # Supported file types and their descriptions. Insertion order is the order
    # reported to clients by GetSupportedFormats and GetEngines.
    _FORMAT_DESCRIPTIONS = {
        "pdf": "PDF Document",
        "docx": "Microsoft Word Document",
        "doc": "Microsoft Word Document (Legacy)",
        "pptx": "Microsoft PowerPoint Presentation",
        "ppt": "Microsoft PowerPoint Presentation (Legacy)",
        "xlsx": "Microsoft Excel Spreadsheet",
        "xls": "Microsoft Excel Spreadsheet (Legacy)",
        "csv": "Comma-Separated Values",
        "md": "Markdown File",
        "markdown": "Markdown File",
        "jpg": "JPEG Image",
        "jpeg": "JPEG Image",
        "png": "PNG Image",
        "gif": "GIF Image",
        "bmp": "BMP Image",
        "tiff": "TIFF Image",
        "webp": "WebP Image",
        "html": "HTML Document",
        "htm": "HTML Document",
        "txt": "Plain Text File",
    }

    def __init__(self, max_workers: int = 10):
        """Load the protobuf modules and create the parser.

        Args:
            max_workers: Stored on the instance; not otherwise used by this class.
        """
        _import_grpc_protobuf()
        self.parser = Parser()
        self.max_workers = max_workers
        logger.info("DocumentParserServicer initialized")

    def ParseDocument(self, request, context):
        """Download the file at ``request.file_url`` and convert it to Markdown.

        Validation, download and parse failures are all reported in-band as a
        ``ParseResponse`` with ``success=False`` and an explanatory
        ``message``; the gRPC status code is not changed.
        """
        try:
            logger.info(
                "ParseDocument request: file_url=%s, file_name=%s, file_type=%s",
                request.file_url,
                request.file_name,
                request.file_type,
            )

            file_url = request.file_url
            file_name = request.file_name

            if not file_url:
                return self._failure("file_url is required")
            if not file_name:
                return self._failure("file_name is required")

            vlm_config = self._extract_vlm_config(request)

            logger.info("Downloading file from URL: %s", file_url)
            try:
                content = self._download(file_url)
            except requests.RequestException as e:
                logger.error("Failed to download file: %s", str(e))
                return self._failure(f"Failed to download file: {str(e)}")

            logger.info("Parsing file with MarkItDown + VLM")
            result = self.parser.parse_bytes(content, file_name, vlm_config=vlm_config)

            if not result.get("success", False):
                logger.warning("Parser returned failure: %s", result.get("error", "Unknown error"))
                return self._failure(result.get("error", "Parse failed"))

            markdown_content = result.get("content", "")
            logger.info(
                "Parse successful: content_length=%d",
                len(markdown_content),
            )

            return docparser_pb2.ParseResponse(
                success=True,
                content=markdown_content,
                message="Parse successful",
                content_length=len(markdown_content),
                file_type=request.file_type or "auto",
                parser_engine="markitdown",
            )
        except Exception as e:
            logger.error("ParseDocument error: %s", str(e), exc_info=True)
            return self._failure(f"Parse error: {str(e)}")

    def GetSupportedFormats(self, request, context):
        """Return the supported file types together with their descriptions."""
        try:
            logger.info("GetSupportedFormats request")
            return docparser_pb2.SupportedFormatsResponse(
                file_types=list(self._FORMAT_DESCRIPTIONS),
                file_type_descriptions=dict(self._FORMAT_DESCRIPTIONS),
            )
        except Exception as e:
            logger.error("GetSupportedFormats error: %s", str(e), exc_info=True)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return docparser_pb2.SupportedFormatsResponse()

    def GetEngines(self, request, context):
        """Return the list of parsing engines; only ``markitdown`` is reported."""
        try:
            logger.info("GetEngines request")

            engine_info = docparser_pb2.EngineInfo(
                name="markitdown",
                description="Microsoft MarkItDown - 统一文档解析引擎",
                supported_file_types=list(self._FORMAT_DESCRIPTIONS),
                available=True,
                unavailable_reason="",
            )

            return docparser_pb2.EnginesResponse(engines=[engine_info])
        except Exception as e:
            logger.error("GetEngines error: %s", str(e), exc_info=True)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return docparser_pb2.EnginesResponse()

    @staticmethod
    def _failure(message: str):
        """Build the ``ParseResponse`` used for every unsuccessful outcome."""
        return docparser_pb2.ParseResponse(
            success=False,
            content="",
            message=message,
            content_length=0,
        )

    @staticmethod
    def _extract_vlm_config(request):
        """Return the request's VLM settings as a dict, or None when absent or disabled."""
        # ``hasattr`` guards against request messages that have no vlm_config field.
        if not (hasattr(request, 'vlm_config') and request.vlm_config):
            return None

        vlm_cfg = request.vlm_config
        if not vlm_cfg.enabled:
            return None

        logger.info(f"VLM config: provider={vlm_cfg.provider}, model={vlm_cfg.model}")
        return {
            "enabled": vlm_cfg.enabled,
            "provider": vlm_cfg.provider,
            "model": vlm_cfg.model,
            "api_key": vlm_cfg.api_key,
            "base_url": vlm_cfg.base_url,
            "prompt": vlm_cfg.prompt,
        }

    @staticmethod
    def _download(file_url: str) -> bytes:
        """Fetch ``file_url`` and return the response body.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-success HTTP status.
        """
        response = requests.get(
            file_url,
            timeout=60,
            headers={"User-Agent": "DocParser/1.0"},
        )
        response.raise_for_status()
        content = response.content
        logger.info("Downloaded %d bytes", len(content))
        return content
|
||||
|
||||
|
||||
def serve(port: int = 50051, max_workers: int = 10):
    """Start the gRPC server and block until it terminates.

    Registers ``DocumentParserServicer`` and server reflection, then listens
    on all interfaces without transport security.

    Args:
        port: Port to listen on.
        max_workers: Size of the thread pool that handles requests.
    """
    _import_grpc_protobuf()

    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))

    servicer = DocumentParserServicer(max_workers=max_workers)
    docparser_pb2_grpc.add_DocumentParserServicer_to_server(servicer, server)

    # Advertise both the parser service and the reflection service itself.
    reflection.enable_server_reflection(
        service_names=[
            docparser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name,
            reflection.SERVICE_NAME,
        ],
        server=server,
    )

    server.add_insecure_port(f"0.0.0.0:{port}")
    server.start()

    logger.info("DocumentParser gRPC server (MarkItDown) started on port %d", port)
    logger.info("gRPC reflection enabled")

    try:
        server.wait_for_termination()
    except KeyboardInterrupt:
        logger.info("Shutting down server...")
        # A grace period of 0 stops the server without waiting for in-flight calls.
        server.stop(0)
        logger.info("Server stopped")
|
||||
|
||||
|
||||
# Direct execution: configure INFO-level logging and serve with the defaults.
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    serve()
|
||||
@@ -1,33 +0,0 @@
|
||||
services:
|
||||
# MySQL 数据库
|
||||
x-agent-mysql:
|
||||
image: mysql:8.0
|
||||
container_name: x-agents-mysql
|
||||
environment:
|
||||
MYSQL_ROOT_PASSWORD: root
|
||||
MYSQL_DATABASE: x_agents
|
||||
volumes:
|
||||
- mysql-data:/var/lib/mysql
|
||||
ports:
|
||||
- "6036:3306"
|
||||
healthcheck:
|
||||
test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
restart: unless-stopped
|
||||
command: --default-authentication-plugin=mysql_native_password
|
||||
|
||||
# Redis
|
||||
x-agent-redis:
|
||||
image: redis:7-alpine
|
||||
container_name: x-agents-redis
|
||||
ports:
|
||||
- "6037:6379"
|
||||
volumes:
|
||||
- redis-data:/data
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
mysql-data:
|
||||
redis-data:
|
||||
Reference in New Issue
Block a user