- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等) - 添加基础解析器和链式解析器 - 添加存储和注册机制 - 添加 gRPC 服务实现 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
209 lines
6.8 KiB
Python
209 lines
6.8 KiB
Python
"""
|
|
gRPC Server for Document Parser
|
|
"""
|
|
import logging
|
|
import requests
|
|
from concurrent import futures
|
|
import grpc
|
|
from grpc_reflection.v1alpha import reflection
|
|
import sys
|
|
import os
|
|
import io
|
|
|
|
# Put the parent directory and its "proto" subdirectory at the front of the
# module search path. The "proto" entry comes first, followed by the parent
# directory, which matches inserting the parent and then "proto" at index 0.
sys.path[:0] = [
    os.path.join(os.path.dirname(__file__), "..", "proto"),
    os.path.join(os.path.dirname(__file__), ".."),
]

from parser import Parser

logger = logging.getLogger(__name__)

# Import the modules generated from the proto definition. When they are
# missing, the module still imports and PROTO_AVAILABLE records the failure.
try:
    import document_parser_pb2
    import document_parser_pb2_grpc
except ImportError:
    logger.warning("Proto files not found, please run: python generate_grpc.py")
    PROTO_AVAILABLE = False
else:
    PROTO_AVAILABLE = True
|
|
|
|
|
|
class DocumentParserServicer:
    """gRPC service implementation for the document parser.

    Exposes three RPC handlers: ``ParseDocument`` downloads a file from a URL
    and converts it with ``Parser``; ``GetSupportedFormats`` and
    ``GetEngines`` return static capability lists.
    """

    def __init__(self, max_workers: int = 10):
        """Create the servicer.

        Args:
            max_workers: Stored on the instance; not otherwise used by this
                class.
        """
        self.parser = Parser()
        self.max_workers = max_workers
        logger.info("DocumentParserServicer initialized")

    @staticmethod
    def _set_status(context, code, details):
        """Record a non-OK gRPC status on *context* when one was supplied.

        The handlers' return values are left untouched; this only makes the
        failure reason available on the RPC context. A ``None`` context
        (direct, non-gRPC call) is tolerated.
        """
        if context is not None:
            context.set_code(code)
            context.set_details(details)

    @staticmethod
    def _failure(message):
        """Build a ``ParseResponse`` that reports failure with *message*."""
        return document_parser_pb2.ParseResponse(
            success=False,
            content="",
            message=message,
            content_length=0,
        )

    @staticmethod
    def _extract_vlm_config(request):
        """Return the request's VLM settings as a dict, or ``None``.

        ``None`` is returned when the request has no ``vlm_config`` or when
        that config is not enabled.
        """
        if not (hasattr(request, "vlm_config") and request.vlm_config):
            return None

        vlm_cfg = request.vlm_config
        if not vlm_cfg.enabled:
            return None

        vlm_config = {
            "enabled": vlm_cfg.enabled,
            "provider": vlm_cfg.provider,
            "model": vlm_cfg.model,
            "api_key": vlm_cfg.api_key,
            "base_url": vlm_cfg.base_url,
            "prompt": vlm_cfg.prompt,
        }
        # Only provider and model are logged; the API key is deliberately
        # kept out of the log line.
        logger.info(
            "VLM config: provider=%s, model=%s", vlm_cfg.provider, vlm_cfg.model
        )
        return vlm_config

    @staticmethod
    def _download(file_url):
        """Fetch *file_url* over HTTP and return the response body as bytes.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status.
        """
        # NOTE(review): file_url comes straight from the request and is
        # fetched without any host restriction or size cap; the whole body
        # is held in memory. Consider restricting both.
        response = requests.get(
            file_url,
            timeout=60,
            headers={"User-Agent": "DocParser/1.0"},
        )
        response.raise_for_status()
        return response.content

    def ParseDocument(self, request, context):
        """Download the document named in *request* and parse it.

        Args:
            request: Message exposing ``file_url`` and ``file_name`` and,
                optionally, ``vlm_config``.
            context: gRPC servicer context; may be ``None`` for direct calls.

        Returns:
            A ``ParseResponse``. ``success`` is False, with an explanatory
            ``message``, when a required field is missing, the download
            fails, parsing yields no content, or any exception is raised.
            When the generated proto modules are unavailable a plain dict is
            returned instead.
        """
        if not PROTO_AVAILABLE:
            self._set_status(
                context, grpc.StatusCode.FAILED_PRECONDITION, "Proto not available"
            )
            # No ParseResponse class exists in this state, so the dict form
            # is kept for callers that invoke the method directly.
            return {"success": False, "message": "Proto not available"}

        try:
            logger.info(
                "ParseDocument request: file_url=%s, file_name=%s",
                request.file_url,
                request.file_name,
            )

            file_url = request.file_url
            file_name = request.file_name

            if not file_url:
                return self._failure("file_url is required")
            if not file_name:
                return self._failure("file_name is required")

            vlm_config = self._extract_vlm_config(request)

            logger.info("Downloading file from URL: %s", file_url)
            try:
                content = self._download(file_url)
                logger.info("Downloaded %d bytes", len(content))
            except requests.RequestException as e:
                logger.error("Failed to download file: %s", str(e))
                return self._failure(f"Failed to download file: {str(e)}")

            logger.info("Parsing file")
            # Extension of file_name without the leading dot; empty when the
            # name has no extension.
            file_type = os.path.splitext(file_name)[1][1:]

            result = self.parser.parse_file(
                file_name=file_name,
                file_type=file_type,
                content=content,
                vlm_config=vlm_config,
            )

            if not result.content:
                return self._failure("Parse failed or empty content")

            markdown_content = result.content
            logger.info("Parse successful: content_length=%d", len(markdown_content))

            return document_parser_pb2.ParseResponse(
                success=True,
                content=markdown_content,
                message="Parse successful",
                content_length=len(markdown_content),
                file_type=file_type or "auto",
                parser_engine="markitdown",
            )

        except Exception as e:
            # Top-level RPC boundary: report the error in the response
            # rather than letting it propagate.
            logger.error("ParseDocument error: %s", str(e), exc_info=True)
            return self._failure(f"Parse error: {str(e)}")

    def GetSupportedFormats(self, request, context):
        """Return the list of supported file extensions.

        Returns:
            A ``SupportedFormatsResponse``, or ``None`` when the proto
            modules are unavailable or building the response fails. In both
            ``None`` cases a non-OK status is recorded on *context*.
        """
        if not PROTO_AVAILABLE:
            self._set_status(
                context, grpc.StatusCode.FAILED_PRECONDITION, "Proto not available"
            )
            return None

        try:
            file_types = [
                "pdf", "docx", "doc", "pptx", "ppt",
                "xlsx", "xls", "csv",
                "md", "markdown",
                "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp",
                "html", "htm", "txt",
            ]
            return document_parser_pb2.SupportedFormatsResponse(
                file_types=file_types,
            )
        except Exception as e:
            logger.error("GetSupportedFormats error: %s", str(e), exc_info=True)
            self._set_status(
                context,
                grpc.StatusCode.INTERNAL,
                f"GetSupportedFormats error: {str(e)}",
            )
            return None

    def GetEngines(self, request, context):
        """Return the available parsing engines.

        Returns:
            An ``EnginesResponse`` describing the single "markitdown"
            engine, or ``None`` when the proto modules are unavailable or
            building the response fails. In both ``None`` cases a non-OK
            status is recorded on *context*.
        """
        if not PROTO_AVAILABLE:
            self._set_status(
                context, grpc.StatusCode.FAILED_PRECONDITION, "Proto not available"
            )
            return None

        try:
            engines = [
                document_parser_pb2.EngineInfo(
                    name="markitdown",
                    description="MarkItDown parser - supports various document formats",
                    supported_file_types=["pdf", "docx", "pptx", "xlsx", "md", "html", "txt"],
                    available=True,
                )
            ]
            return document_parser_pb2.EnginesResponse(engines=engines)
        except Exception as e:
            logger.error("GetEngines error: %s", str(e), exc_info=True)
            self._set_status(
                context, grpc.StatusCode.INTERNAL, f"GetEngines error: {str(e)}"
            )
            return None
|
|
|
|
|
|
def serve(port: int = 50051, max_workers: int = 10):
    """Start the DocumentParser gRPC server and block until it terminates.

    Args:
        port: TCP port to listen on; the server binds to all interfaces
            without transport security.
        max_workers: Size of the thread pool that runs RPC handlers; also
            passed to the servicer.

    Returns:
        None. Returns immediately, after logging an error, when the
        generated proto modules are unavailable.
    """
    if not PROTO_AVAILABLE:
        logger.error("Proto files not available, cannot start server")
        return

    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
    servicer = DocumentParserServicer(max_workers=max_workers)

    # Register the service handlers on the server.
    document_parser_pb2_grpc.add_DocumentParserServicer_to_server(
        servicer, server
    )

    # Enable reflection. The reflection API takes fully-qualified service
    # names (strings), not descriptor objects, and the reflection service
    # itself is listed alongside the application service.
    service_names = (
        document_parser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name,
        reflection.SERVICE_NAME,
    )
    reflection.enable_server_reflection(service_names, server)

    server.add_insecure_port(f"0.0.0.0:{port}")
    server.start()
    logger.info("DocumentParser gRPC server started on port %s", port)
    logger.info("gRPC reflection enabled")
    server.wait_for_termination()
|