""" gRPC Server for Document Parser """ import logging import requests from concurrent import futures import grpc from grpc_reflection.v1alpha import reflection import sys import os import io sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto")) from parser import Parser logger = logging.getLogger(__name__) # 导入 proto 生成的文件 try: import document_parser_pb2 import document_parser_pb2_grpc PROTO_AVAILABLE = True except ImportError: logger.warning("Proto files not found, please run: python generate_grpc.py") PROTO_AVAILABLE = False class DocumentParserServicer: """gRPC 服务实现""" def __init__(self, max_workers: int = 10): self.parser = Parser() self.max_workers = max_workers logger.info("DocumentParserServicer initialized") def ParseDocument(self, request, context): """解析文档""" if not PROTO_AVAILABLE: return {"success": False, "message": "Proto not available"} try: logger.info( "ParseDocument request: file_url=%s, file_name=%s", request.file_url, request.file_name, ) file_url = request.file_url file_name = request.file_name if not file_url: return document_parser_pb2.ParseResponse( success=False, content="", message="file_url is required", content_length=0, ) if not file_name: return document_parser_pb2.ParseResponse( success=False, content="", message="file_name is required", content_length=0, ) # 提取 VLM 配置 vlm_config = None if hasattr(request, 'vlm_config') and request.vlm_config: vlm_cfg = request.vlm_config if vlm_cfg.enabled: vlm_config = { "enabled": vlm_cfg.enabled, "provider": vlm_cfg.provider, "model": vlm_cfg.model, "api_key": vlm_cfg.api_key, "base_url": vlm_cfg.base_url, "prompt": vlm_cfg.prompt, } logger.info(f"VLM config: provider={vlm_cfg.provider}, model={vlm_cfg.model}") # 下载文件 logger.info("Downloading file from URL: %s", file_url) try: response = requests.get( file_url, timeout=60, headers={"User-Agent": "DocParser/1.0"}, ) response.raise_for_status() content = response.content logger.info("Downloaded %d bytes", len(content)) except requests.RequestException as e: logger.error("Failed to download file: %s", str(e)) return document_parser_pb2.ParseResponse( success=False, content="", message=f"Failed to download file: {str(e)}", content_length=0, ) # 解析 logger.info("Parsing file") file_type = os.path.splitext(file_name)[1][1:] # 去掉点的扩展名 result = self.parser.parse_file( file_name=file_name, file_type=file_type, content=content, vlm_config=vlm_config, ) if not result.content: return document_parser_pb2.ParseResponse( success=False, content="", message="Parse failed or empty content", content_length=0, ) markdown_content = result.content logger.info("Parse successful: content_length=%d", len(markdown_content)) return document_parser_pb2.ParseResponse( success=True, content=markdown_content, message="Parse successful", content_length=len(markdown_content), file_type=file_type or "auto", parser_engine="markitdown", ) except Exception as e: logger.error("ParseDocument error: %s", str(e), exc_info=True) return document_parser_pb2.ParseResponse( success=False, content="", message=f"Parse error: {str(e)}", content_length=0, ) def GetSupportedFormats(self, request, context): """获取支持的格式""" if not PROTO_AVAILABLE: return None try: file_types = [ "pdf", "docx", "doc", "pptx", "ppt", "xlsx", "xls", "csv", "md", "markdown", "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", "html", "htm", "txt", ] return document_parser_pb2.SupportedFormatsResponse( file_types=file_types, ) except Exception as e: logger.error("GetSupportedFormats error: %s", str(e)) return None def GetEngines(self, request, context): """获取解析引擎""" if not PROTO_AVAILABLE: return None try: engines = [ document_parser_pb2.EngineInfo( name="markitdown", description="MarkItDown parser - supports various document formats", supported_file_types=["pdf", "docx", "pptx", "xlsx", "md", "html", "txt"], available=True, ) ] return document_parser_pb2.EnginesResponse(engines=engines) except Exception as e: logger.error("GetEngines error: %s", str(e)) return None def serve(port: int = 50051, max_workers: int = 10): """启动 gRPC 服务""" if not PROTO_AVAILABLE: logger.error("Proto files not available, cannot start server") return server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers)) servicer = DocumentParserServicer(max_workers=max_workers) # 注册服务 document_parser_pb2_grpc.add_DocumentParserServicer_to_server( servicer, server ) # 启用反射 reflection.enable_server_reflection( [document_parser_pb2.DESCRIPTOR.services_by_name['DocumentParser']], server ) server.add_insecure_port(f"0.0.0.0:{port}") server.start() logger.info(f"DocumentParser gRPC server started on port {port}") logger.info("gRPC reflection enabled") server.wait_for_termination()