"""gRPC server that downloads a document by URL and converts it with MarkItDown."""

import logging
import os
import sys
from concurrent import futures

import grpc
import requests
from grpc_reflection.v1alpha import reflection

# Put the parent directory and its ``proto/`` subdirectory on sys.path; the
# project-local imports below rely on this (presumably ``proto/`` holds the
# generated ``document_parser_pb2*`` modules -- confirm against the repo layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto"))

from parser.parser import Parser  # noqa: E402  (needs the sys.path entries above)

logger = logging.getLogger(__name__)

# Generated protobuf modules, populated lazily by _import_grpc_protobuf().
docparser_pb2 = None
docparser_pb2_grpc = None

# Seconds allowed for the upstream HTTP download in ParseDocument.
_DOWNLOAD_TIMEOUT_SECONDS = 60

# Single source of truth for the advertised file types: extension -> description.
# Insertion order is the order reported to clients by GetSupportedFormats and
# GetEngines, so append new entries rather than re-sorting.
_SUPPORTED_FORMATS = {
    "pdf": "PDF Document",
    "docx": "Microsoft Word Document",
    "doc": "Microsoft Word Document (Legacy)",
    "pptx": "Microsoft PowerPoint Presentation",
    "ppt": "Microsoft PowerPoint Presentation (Legacy)",
    "xlsx": "Microsoft Excel Spreadsheet",
    "xls": "Microsoft Excel Spreadsheet (Legacy)",
    "csv": "Comma-Separated Values",
    "md": "Markdown File",
    "markdown": "Markdown File",
    "jpg": "JPEG Image",
    "jpeg": "JPEG Image",
    "png": "PNG Image",
    "gif": "GIF Image",
    "bmp": "BMP Image",
    "tiff": "TIFF Image",
    "webp": "WebP Image",
    "html": "HTML Document",
    "htm": "HTML Document",
    "txt": "Plain Text File",
}


def _import_grpc_protobuf():
    """Import the generated gRPC protobuf modules into the module globals.

    The import is performed at most once; later calls return immediately.

    Raises:
        ImportError: If ``document_parser_pb2`` or ``document_parser_pb2_grpc``
            cannot be imported.
    """
    global docparser_pb2, docparser_pb2_grpc

    if docparser_pb2 is not None and docparser_pb2_grpc is not None:
        return

    try:
        import document_parser_pb2 as dpb2
        import document_parser_pb2_grpc as dpb2_grpc
    except ImportError as e:
        logger.error("Failed to import gRPC protobuf: %s", e)
        raise ImportError(
            "gRPC protobuf files not found. Please run: python generate_grpc.py"
        ) from e

    docparser_pb2 = dpb2
    docparser_pb2_grpc = dpb2_grpc
    logger.info("Successfully imported gRPC protobuf modules")


class DocumentParserServicer:
    """gRPC service implementation backed by MarkItDown."""

    def __init__(self, max_workers: int = 10):
        """Load the protobuf modules and create the parser.

        Args:
            max_workers: Stored on the instance as ``max_workers``; not
                otherwise used by this class.
        """
        _import_grpc_protobuf()
        self.parser = Parser()
        self.max_workers = max_workers
        logger.info("DocumentParserServicer initialized")

    @staticmethod
    def _failure(message):
        """Build a ``ParseResponse`` describing a failed parse."""
        return docparser_pb2.ParseResponse(
            success=False,
            content="",
            message=message,
            content_length=0,
        )

    def ParseDocument(self, request, context):
        """Download the file at ``request.file_url`` and parse it.

        Failures are reported in-band: the RPC itself completes normally and
        the response carries ``success=False`` plus a ``message``; the gRPC
        status code on ``context`` is not modified.

        Args:
            request: Request message; ``file_url``, ``file_name`` and
                ``file_type`` are read.
            context: gRPC servicer context (unused).

        Returns:
            A ``ParseResponse`` with the parsed content on success, or a
            failure response when an input is missing, the download fails,
            the parser reports failure, or an unexpected exception occurs.
        """
        try:
            logger.info(
                "ParseDocument request: file_url=%s, file_name=%s, file_type=%s",
                request.file_url,
                request.file_name,
                request.file_type,
            )

            file_url = request.file_url
            file_name = request.file_name

            if not file_url:
                return self._failure("file_url is required")
            if not file_name:
                return self._failure("file_name is required")

            # NOTE(review): file_url is caller-controlled and is fetched
            # server-side without host restrictions or a size cap; consider
            # an allow-list / download limit if clients are untrusted.
            logger.info("Downloading file from URL: %s", file_url)
            try:
                response = requests.get(
                    file_url,
                    timeout=_DOWNLOAD_TIMEOUT_SECONDS,
                    headers={"User-Agent": "DocParser/1.0"},
                )
                response.raise_for_status()
                content = response.content
                logger.info("Downloaded %d bytes", len(content))
            except requests.RequestException as e:
                logger.error("Failed to download file: %s", e)
                return self._failure(f"Failed to download file: {e}")

            logger.info("Parsing file with MarkItDown")
            result = self.parser.parse_bytes(content, file_name)

            if not result.get("success", False):
                # Fall back to the defaults when the key is absent, None or
                # empty, so the client always receives a usable message.
                error = result.get("error")
                logger.warning(
                    "Parser returned failure: %s", error or "Unknown error"
                )
                return self._failure(str(error) if error else "Parse failed")

            # Treat an explicit None the same as a missing key so len() below
            # cannot fail on a "successful" result.
            markdown_content = result.get("content") or ""
            logger.info(
                "Parse successful: content_length=%d",
                len(markdown_content),
            )

            return docparser_pb2.ParseResponse(
                success=True,
                content=markdown_content,
                message="Parse successful",
                content_length=len(markdown_content),
                file_type=request.file_type or "auto",
                parser_engine="markitdown",
            )

        except Exception as e:
            # Top-level RPC boundary: log the traceback and answer in-band.
            logger.exception("ParseDocument error: %s", e)
            return self._failure(f"Parse error: {e}")

    def GetSupportedFormats(self, request, context):
        """Return the supported file types and their descriptions.

        Args:
            request: Request message (unused).
            context: gRPC servicer context; set to ``INTERNAL`` with the
                error text if building the response fails.

        Returns:
            A ``SupportedFormatsResponse``; empty if an error occurred.
        """
        try:
            logger.info("GetSupportedFormats request")
            return docparser_pb2.SupportedFormatsResponse(
                file_types=list(_SUPPORTED_FORMATS),
                file_type_descriptions=dict(_SUPPORTED_FORMATS),
            )
        except Exception as e:
            logger.exception("GetSupportedFormats error: %s", e)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return docparser_pb2.SupportedFormatsResponse()

    def GetEngines(self, request, context):
        """Return the list of available parsing engines.

        Only the ``markitdown`` engine is reported, always as available.

        Args:
            request: Request message (unused).
            context: gRPC servicer context; set to ``INTERNAL`` with the
                error text if building the response fails.

        Returns:
            An ``EnginesResponse``; empty if an error occurred.
        """
        try:
            logger.info("GetEngines request")
            engine_info = docparser_pb2.EngineInfo(
                name="markitdown",
                description="Microsoft MarkItDown - 统一文档解析引擎",
                supported_file_types=list(_SUPPORTED_FORMATS),
                available=True,
                unavailable_reason="",
            )
            return docparser_pb2.EnginesResponse(engines=[engine_info])
        except Exception as e:
            logger.exception("GetEngines error: %s", e)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return docparser_pb2.EnginesResponse()


def serve(port: int = 50051, max_workers: int = 10):
    """Start the gRPC server and block until it terminates.

    Args:
        port: TCP port to listen on (all interfaces, insecure channel).
        max_workers: Size of the thread pool that handles RPCs.

    Raises:
        ImportError: If the generated protobuf modules are missing.
        RuntimeError: If the server cannot bind to ``port``.
    """
    _import_grpc_protobuf()

    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
    servicer = DocumentParserServicer(max_workers=max_workers)
    docparser_pb2_grpc.add_DocumentParserServicer_to_server(servicer, server)

    reflection.enable_server_reflection(
        service_names=[
            docparser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name,
            reflection.SERVICE_NAME,
        ],
        server=server,
    )

    # Some grpcio versions signal a failed bind by returning 0 instead of
    # raising; without this check the server would start but listen nowhere.
    bound_port = server.add_insecure_port(f"[::]:{port}")
    if bound_port == 0:
        raise RuntimeError(f"Failed to bind gRPC server to port {port}")

    server.start()
    logger.info("DocumentParser gRPC server (MarkItDown) started on port %d", port)
    logger.info("gRPC reflection enabled")

    try:
        server.wait_for_termination()
    except KeyboardInterrupt:
        logger.info("Shutting down server...")
        server.stop(0)
        logger.info("Server stopped")


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    serve()