- 添加 VLM 客户端支持 - 优化解析器配置 - 添加配置示例文件 - 生成新的 gRPC protobuf 文件 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
260 lines
8.9 KiB
Python
260 lines
8.9 KiB
Python
import logging
|
|
import requests
|
|
from concurrent import futures
|
|
|
|
import grpc
|
|
from grpc_reflection.v1alpha import reflection
|
|
|
|
import sys
|
|
import os
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto"))
|
|
|
|
from parser.parser import Parser
|
|
|
|
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Placeholders for the generated gRPC modules. Both stay None until
# _import_grpc_protobuf() binds them on first use.
docparser_pb2 = docparser_pb2_grpc = None
|
|
|
|
def _import_grpc_protobuf():
    """Load the generated gRPC protobuf modules into the module globals.

    Binds ``document_parser_pb2`` to ``docparser_pb2`` and
    ``document_parser_pb2_grpc`` to ``docparser_pb2_grpc``. Returns
    immediately when both globals are already populated, so repeated calls
    are cheap.

    Raises:
        ImportError: If either generated module cannot be imported. The
            original import error is chained as the cause.
    """
    global docparser_pb2, docparser_pb2_grpc

    if docparser_pb2 is not None and docparser_pb2_grpc is not None:
        return

    # Keep the try body limited to the statements that can raise ImportError.
    try:
        import document_parser_pb2 as dpb2
        import document_parser_pb2_grpc as dpb2_grpc
    except ImportError as e:
        logger.error("Failed to import gRPC protobuf: %s", e)
        raise ImportError(
            "gRPC protobuf files not found. Please run: python generate_grpc.py"
        ) from e

    docparser_pb2 = dpb2
    docparser_pb2_grpc = dpb2_grpc
    logger.info("Successfully imported gRPC protobuf modules")
|
|
|
|
|
|
class DocumentParserServicer:
    """gRPC service implementation that parses documents via ``Parser``.

    Successful parse responses report ``parser_engine="markitdown"``. The
    class exposes three RPC handlers: ``ParseDocument``,
    ``GetSupportedFormats`` and ``GetEngines``.
    """

    # Single source of truth for the supported extensions and their
    # human-readable descriptions. Insertion order is the order returned to
    # clients by GetSupportedFormats and GetEngines.
    _FILE_TYPE_DESCRIPTIONS = {
        "pdf": "PDF Document",
        "docx": "Microsoft Word Document",
        "doc": "Microsoft Word Document (Legacy)",
        "pptx": "Microsoft PowerPoint Presentation",
        "ppt": "Microsoft PowerPoint Presentation (Legacy)",
        "xlsx": "Microsoft Excel Spreadsheet",
        "xls": "Microsoft Excel Spreadsheet (Legacy)",
        "csv": "Comma-Separated Values",
        "md": "Markdown File",
        "markdown": "Markdown File",
        "jpg": "JPEG Image",
        "jpeg": "JPEG Image",
        "png": "PNG Image",
        "gif": "GIF Image",
        "bmp": "BMP Image",
        "tiff": "TIFF Image",
        "webp": "WebP Image",
        "html": "HTML Document",
        "htm": "HTML Document",
        "txt": "Plain Text File",
    }
    _SUPPORTED_FILE_TYPES = tuple(_FILE_TYPE_DESCRIPTIONS)

    # Seconds allowed for the source-file download before requests gives up.
    _DOWNLOAD_TIMEOUT_SECONDS = 60

    def __init__(self, max_workers: int = 10):
        """Load the protobuf modules and create the underlying parser.

        Args:
            max_workers: Stored on the instance as ``self.max_workers``; it
                is not otherwise used inside this class.

        Raises:
            ImportError: If the generated gRPC protobuf modules are missing.
        """
        _import_grpc_protobuf()
        self.parser = Parser()
        self.max_workers = max_workers
        logger.info("DocumentParserServicer initialized")

    @staticmethod
    def _parse_failure(message):
        """Build the ``ParseResponse`` used for every failed parse request."""
        return docparser_pb2.ParseResponse(
            success=False,
            content="",
            message=message,
            content_length=0,
        )

    @staticmethod
    def _extract_vlm_config(request) -> "dict | None":
        """Return the request's VLM settings as a dict, or ``None``.

        ``None`` is returned when the request has no ``vlm_config``
        attribute, when that attribute is falsy, or when its ``enabled``
        flag is falsy.
        """
        if not (hasattr(request, "vlm_config") and request.vlm_config):
            return None

        vlm_cfg = request.vlm_config
        if not vlm_cfg.enabled:
            return None

        return {
            "enabled": vlm_cfg.enabled,
            "provider": vlm_cfg.provider,
            "model": vlm_cfg.model,
            "api_key": vlm_cfg.api_key,
            "base_url": vlm_cfg.base_url,
            "prompt": vlm_cfg.prompt,
        }

    def _download(self, file_url):
        """Fetch ``file_url`` with ``requests.get`` and return the body bytes.

        Raises:
            requests.RequestException: On request failure or when
                ``raise_for_status`` rejects the HTTP status.
        """
        # NOTE(review): file_url comes straight from the RPC request and the
        # whole body is buffered in memory. Consider a host allow-list and a
        # size cap if untrusted clients can reach this service.
        response = requests.get(
            file_url,
            timeout=self._DOWNLOAD_TIMEOUT_SECONDS,
            headers={"User-Agent": "DocParser/1.0"},
        )
        response.raise_for_status()
        return response.content

    def ParseDocument(self, request, context):
        """Download the file named by the request and parse it.

        Failures are reported in-band: the returned ``ParseResponse`` has
        ``success=False`` and an explanatory ``message``. This handler does
        not set a gRPC status code on ``context``.

        Args:
            request: Message providing ``file_url``, ``file_name``,
                ``file_type`` and optionally ``vlm_config``.
            context: gRPC servicer context (unused here).

        Returns:
            A ``ParseResponse``. On success it carries the parsed content,
            its length, the requested file type (``"auto"`` when empty) and
            ``parser_engine="markitdown"``.
        """
        try:
            logger.info(
                "ParseDocument request: file_url=%s, file_name=%s, file_type=%s",
                request.file_url,
                request.file_name,
                request.file_type,
            )

            file_url = request.file_url
            file_name = request.file_name

            if not file_url:
                return self._parse_failure("file_url is required")
            if not file_name:
                return self._parse_failure("file_name is required")

            vlm_config = self._extract_vlm_config(request)
            if vlm_config is not None:
                # The API key is deliberately kept out of the log line.
                logger.info(
                    "VLM config: provider=%s, model=%s",
                    vlm_config["provider"],
                    vlm_config["model"],
                )

            logger.info("Downloading file from URL: %s", file_url)
            try:
                content = self._download(file_url)
            except requests.RequestException as e:
                logger.error("Failed to download file: %s", str(e))
                return self._parse_failure(f"Failed to download file: {str(e)}")
            else:
                logger.info("Downloaded %d bytes", len(content))

            logger.info("Parsing file with MarkItDown + VLM")
            result = self.parser.parse_bytes(
                content, file_name, vlm_config=vlm_config
            )

            if not result.get("success", False):
                logger.warning(
                    "Parser returned failure: %s",
                    result.get("error", "Unknown error"),
                )
                return self._parse_failure(result.get("error", "Parse failed"))

            # Treat a missing or None content value as an empty document so
            # len() below cannot turn a successful parse into an error.
            markdown_content = result.get("content") or ""
            logger.info(
                "Parse successful: content_length=%d",
                len(markdown_content),
            )

            return docparser_pb2.ParseResponse(
                success=True,
                content=markdown_content,
                message="Parse successful",
                content_length=len(markdown_content),
                file_type=request.file_type or "auto",
                parser_engine="markitdown",
            )

        except Exception as e:
            # RPC boundary: log with traceback and report the error in-band.
            logger.error("ParseDocument error: %s", str(e), exc_info=True)
            return self._parse_failure(f"Parse error: {str(e)}")

    def GetSupportedFormats(self, request, context):
        """Return the supported file extensions and their descriptions.

        On an unexpected error the gRPC status is set to ``INTERNAL`` with
        the error text as details, and an empty response is returned.
        """
        try:
            logger.info("GetSupportedFormats request")

            return docparser_pb2.SupportedFormatsResponse(
                file_types=list(self._SUPPORTED_FILE_TYPES),
                file_type_descriptions=dict(self._FILE_TYPE_DESCRIPTIONS),
            )
        except Exception as e:
            logger.error("GetSupportedFormats error: %s", str(e), exc_info=True)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return docparser_pb2.SupportedFormatsResponse()

    def GetEngines(self, request, context):
        """Return the list of available parsing engines.

        The list holds one entry, ``markitdown``, marked as available. On an
        unexpected error the gRPC status is set to ``INTERNAL`` with the
        error text as details, and an empty response is returned.
        """
        try:
            logger.info("GetEngines request")

            engine_info = docparser_pb2.EngineInfo(
                name="markitdown",
                description="Microsoft MarkItDown - 统一文档解析引擎",
                supported_file_types=list(self._SUPPORTED_FILE_TYPES),
                available=True,
                unavailable_reason="",
            )

            return docparser_pb2.EnginesResponse(engines=[engine_info])
        except Exception as e:
            logger.error("GetEngines error: %s", str(e), exc_info=True)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return docparser_pb2.EnginesResponse()
|
|
|
|
|
|
def serve(port: int = 50051, max_workers: int = 10):
    """Start the DocumentParser gRPC server and block until it terminates.

    The server listens without TLS on all interfaces (``0.0.0.0``) and has
    gRPC reflection enabled for the ``DocumentParser`` service.

    Args:
        port: TCP port to listen on.
        max_workers: Size of the thread pool that executes RPC handlers;
            also forwarded to ``DocumentParserServicer``.

    Raises:
        ImportError: If the generated gRPC protobuf modules are missing.
        RuntimeError: If the listening port could not be bound.
    """
    _import_grpc_protobuf()

    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))

    servicer = DocumentParserServicer(max_workers=max_workers)
    docparser_pb2_grpc.add_DocumentParserServicer_to_server(servicer, server)

    service_names = [
        docparser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name,
        reflection.SERVICE_NAME,
    ]
    reflection.enable_server_reflection(
        service_names=service_names,
        server=server,
    )

    # add_insecure_port returns the bound port. A zero result means the bind
    # failed, so fail loudly instead of logging a misleading "started".
    bound_port = server.add_insecure_port(f"0.0.0.0:{port}")
    if not bound_port:
        raise RuntimeError(f"Failed to bind gRPC server to 0.0.0.0:{port}")
    server.start()

    logger.info(
        "DocumentParser gRPC server (MarkItDown) started on port %d", bound_port
    )
    logger.info("gRPC reflection enabled")

    try:
        server.wait_for_termination()
    except KeyboardInterrupt:
        logger.info("Shutting down server...")
        # Grace period of 0: in-flight RPCs are aborted immediately.
        server.stop(0)
        logger.info("Server stopped")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: send INFO-and-above records to the root handler
    # with timestamp, logger name and level, then run the server with its
    # default port and worker count.
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )
    serve()
|