feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等) - 添加基础解析器和链式解析器 - 添加存储和注册机制 - 添加 gRPC 服务实现 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
208
ai-core/service/grpc_server.py
Normal file
208
ai-core/service/grpc_server.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""
|
||||
gRPC Server for Document Parser
|
||||
"""
|
||||
import logging
|
||||
import requests
|
||||
from concurrent import futures
|
||||
import grpc
|
||||
from grpc_reflection.v1alpha import reflection
|
||||
import sys
|
||||
import os
|
||||
import io
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto"))
|
||||
|
||||
from parser import Parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 导入 proto 生成的文件
|
||||
try:
|
||||
import document_parser_pb2
|
||||
import document_parser_pb2_grpc
|
||||
PROTO_AVAILABLE = True
|
||||
except ImportError:
|
||||
logger.warning("Proto files not found, please run: python generate_grpc.py")
|
||||
PROTO_AVAILABLE = False
|
||||
|
||||
|
||||
class DocumentParserServicer:
|
||||
"""gRPC 服务实现"""
|
||||
|
||||
def __init__(self, max_workers: int = 10):
|
||||
self.parser = Parser()
|
||||
self.max_workers = max_workers
|
||||
logger.info("DocumentParserServicer initialized")
|
||||
|
||||
def ParseDocument(self, request, context):
|
||||
"""解析文档"""
|
||||
if not PROTO_AVAILABLE:
|
||||
return {"success": False, "message": "Proto not available"}
|
||||
|
||||
try:
|
||||
logger.info(
|
||||
"ParseDocument request: file_url=%s, file_name=%s",
|
||||
request.file_url,
|
||||
request.file_name,
|
||||
)
|
||||
|
||||
file_url = request.file_url
|
||||
file_name = request.file_name
|
||||
|
||||
if not file_url:
|
||||
return document_parser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message="file_url is required",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
if not file_name:
|
||||
return document_parser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message="file_name is required",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
# 提取 VLM 配置
|
||||
vlm_config = None
|
||||
if hasattr(request, 'vlm_config') and request.vlm_config:
|
||||
vlm_cfg = request.vlm_config
|
||||
if vlm_cfg.enabled:
|
||||
vlm_config = {
|
||||
"enabled": vlm_cfg.enabled,
|
||||
"provider": vlm_cfg.provider,
|
||||
"model": vlm_cfg.model,
|
||||
"api_key": vlm_cfg.api_key,
|
||||
"base_url": vlm_cfg.base_url,
|
||||
"prompt": vlm_cfg.prompt,
|
||||
}
|
||||
logger.info(f"VLM config: provider={vlm_cfg.provider}, model={vlm_cfg.model}")
|
||||
|
||||
# 下载文件
|
||||
logger.info("Downloading file from URL: %s", file_url)
|
||||
try:
|
||||
response = requests.get(
|
||||
file_url,
|
||||
timeout=60,
|
||||
headers={"User-Agent": "DocParser/1.0"},
|
||||
)
|
||||
response.raise_for_status()
|
||||
content = response.content
|
||||
logger.info("Downloaded %d bytes", len(content))
|
||||
except requests.RequestException as e:
|
||||
logger.error("Failed to download file: %s", str(e))
|
||||
return document_parser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message=f"Failed to download file: {str(e)}",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
# 解析
|
||||
logger.info("Parsing file")
|
||||
file_type = os.path.splitext(file_name)[1][1:] # 去掉点的扩展名
|
||||
|
||||
result = self.parser.parse_file(
|
||||
file_name=file_name,
|
||||
file_type=file_type,
|
||||
content=content,
|
||||
vlm_config=vlm_config,
|
||||
)
|
||||
|
||||
if not result.content:
|
||||
return document_parser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message="Parse failed or empty content",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
markdown_content = result.content
|
||||
logger.info("Parse successful: content_length=%d", len(markdown_content))
|
||||
|
||||
return document_parser_pb2.ParseResponse(
|
||||
success=True,
|
||||
content=markdown_content,
|
||||
message="Parse successful",
|
||||
content_length=len(markdown_content),
|
||||
file_type=file_type or "auto",
|
||||
parser_engine="markitdown",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("ParseDocument error: %s", str(e), exc_info=True)
|
||||
return document_parser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message=f"Parse error: {str(e)}",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
def GetSupportedFormats(self, request, context):
|
||||
"""获取支持的格式"""
|
||||
if not PROTO_AVAILABLE:
|
||||
return None
|
||||
|
||||
try:
|
||||
file_types = [
|
||||
"pdf", "docx", "doc", "pptx", "ppt",
|
||||
"xlsx", "xls", "csv",
|
||||
"md", "markdown",
|
||||
"jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp",
|
||||
"html", "htm", "txt",
|
||||
]
|
||||
return document_parser_pb2.SupportedFormatsResponse(
|
||||
file_types=file_types,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("GetSupportedFormats error: %s", str(e))
|
||||
return None
|
||||
|
||||
def GetEngines(self, request, context):
|
||||
"""获取解析引擎"""
|
||||
if not PROTO_AVAILABLE:
|
||||
return None
|
||||
|
||||
try:
|
||||
engines = [
|
||||
document_parser_pb2.EngineInfo(
|
||||
name="markitdown",
|
||||
description="MarkItDown parser - supports various document formats",
|
||||
supported_file_types=["pdf", "docx", "pptx", "xlsx", "md", "html", "txt"],
|
||||
available=True,
|
||||
)
|
||||
]
|
||||
return document_parser_pb2.EnginesResponse(engines=engines)
|
||||
except Exception as e:
|
||||
logger.error("GetEngines error: %s", str(e))
|
||||
return None
|
||||
|
||||
|
||||
def serve(port: int = 50051, max_workers: int = 10):
|
||||
"""启动 gRPC 服务"""
|
||||
if not PROTO_AVAILABLE:
|
||||
logger.error("Proto files not available, cannot start server")
|
||||
return
|
||||
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
|
||||
servicer = DocumentParserServicer(max_workers=max_workers)
|
||||
|
||||
# 注册服务
|
||||
document_parser_pb2_grpc.add_DocumentParserServicer_to_server(
|
||||
servicer, server
|
||||
)
|
||||
|
||||
# 启用反射
|
||||
reflection.enable_server_reflection(
|
||||
[document_parser_pb2.DESCRIPTOR.services_by_name['DocumentParser']],
|
||||
server
|
||||
)
|
||||
|
||||
server.add_insecure_port(f"0.0.0.0:{port}")
|
||||
server.start()
|
||||
logger.info(f"DocumentParser gRPC server started on port {port}")
|
||||
logger.info("gRPC reflection enabled")
|
||||
server.wait_for_termination()
|
||||
Reference in New Issue
Block a user