diff --git a/ai-core/README.md b/ai-core/README.md index 7596d67..3e6f371 100644 --- a/ai-core/README.md +++ b/ai-core/README.md @@ -1,33 +1,31 @@ # AI-Core 文档解析服务 -基于 Python 和 Microsoft MarkItDown 的 gRPC 文档解析服务,支持多种文件格式转换为 Markdown。 +基于 Python 的 gRPC 文档解析服务,支持多种文件格式转换为 Markdown。 -## 特性 +## 功能特性 -- **统一解析引擎** - 使用 Microsoft MarkItDown,一个库支持所有格式 -- **支持格式广泛** - PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页等 -- **gRPC 接口** - 高性能、类型安全的 RPC 通信 -- **依赖简单** - 只需安装 3 个核心包 -- **易于部署** - 一键启动,开箱即用 +- 支持多种文件格式:PDF、DOCX、DOC、XLSX、XLS、CSV、Markdown、图片等 +- 多解析引擎支持(builtin、markitdown) +- gRPC 接口,高性能通信 +- 支持通过 URL 下载文件并解析 +- 可配置的解析引擎和参数 ## 项目结构 ``` ai-core/ ├── main.py # 服务启动入口 -├── requirements.txt # Python 依赖(仅 3 个包) -├── generate_grpc.py # gRPC 代码生成脚本 -├── start.sh # Linux/Mac 启动脚本 -├── start.ps1 # Windows 启动脚本 +├── requirements.txt # Python 依赖 ├── proto/ # gRPC 协议定义 -│ ├── document_parser.proto # Protocol Buffers 定义 -│ ├── document_parser_pb2.py # 生成的 Python 代码 -│ └── document_parser_pb2_grpc.py +│ └── document_parser.proto # Protocol Buffers 定义 ├── parser/ # 文档解析器模块 -│ ├── __init__.py -│ └── parser.py # MarkItDown 解析器 +│ ├── base_parser.py # 基础解析器接口 +│ ├── parser.py # 解析器门面 +│ ├── registry.py # 解析器注册表 +│ ├── docx_parser.py # DOCX 解析器 +│ ├── pdf_parser.py # PDF 解析器 +│ └── ... └── service/ # gRPC 服务实现 - ├── __init__.py └── grpc_server.py # gRPC 服务器 ``` @@ -39,39 +37,19 @@ ai-core/ pip install -r requirements.txt ``` -依赖包: -- `grpcio` - gRPC 框架 -- `grpcio-tools` - gRPC 工具 -- `grpcio-reflection` - gRPC 反射 -- `protobuf` - Protocol Buffers -- `requests` - HTTP 请求 -- `markitdown` - Microsoft 文档解析引擎 - ### 2. 生成 gRPC 代码 ```bash -python generate_grpc.py +python -m grpc_tools.protoc \ + --proto_path=proto \ + --python_out=proto \ + --grpc_python_out=proto \ + proto/document_parser.proto ``` -这会在 `proto` 目录下生成两个文件: -- `document_parser_pb2.py` -- `document_parser_pb2_grpc.py` - ## 使用 -### 方式 1: 使用启动脚本(推荐) - -**Windows:** -```powershell -.\start.ps1 -``` - -**Linux/Mac:** -```bash -bash start.sh -``` - -### 方式 2: 直接运行 +### 启动服务 ```bash python main.py --port 50051 --max-workers 10 @@ -82,9 +60,9 @@ python main.py --port 50051 --max-workers 10 - `--max-workers`: 最大工作线程数(默认 10) - `--log-level`: 日志级别(DEBUG/INFO/WARNING/ERROR,默认 INFO) -## gRPC 接口 +### gRPC 接口 -### ParseDocument +#### ParseDocument 解析文档为 Markdown @@ -92,129 +70,80 @@ python main.py --port 50051 --max-workers 10 message ParseRequest { string file_url = 1; // 文件 URL(必填) string file_name = 2; // 文件名(必填) - string file_type = 3; // 文件类型(可选) - string parser_engine = 4; // 解析引擎(可选) + string file_type = 3; // 文件类型(必填,如 pdf、docx) + string parser_engine = 4; // 解析引擎(可选,默认 builtin) map engine_overrides = 5;// 引擎参数覆盖(可选) } message ParseResponse { bool success = 1; // 是否成功 string content = 2; // Markdown 内容 - string message = 3; // 消息 + string message = 3; // 消息 int32 content_length = 4; // 内容长度 string file_type = 5; // 文件类型 string parser_engine = 6; // 使用的解析引擎 } ``` -### GetSupportedFormats +#### GetSupportedFormats 获取支持的文件格式列表 -### GetEngines +#### GetEngines 获取可用的解析引擎列表 ## Go 客户端调用示例 ```go -import ( - "context" - "log" - - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" -) - -func main() { - conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - log.Fatalf("Failed to connect: %v", err) - } - defer conn.Close() - - client := docparser.NewDocumentParserClient(conn) - - resp, err := client.ParseDocument(context.Background(), &docparser.ParseRequest{ - FileUrl: "http://localhost:8082/files/abc123.pdf", - FileName: "example.pdf", - FileType: "pdf", - }) - - if err != nil { - log.Fatalf("Failed to parse: %v", err) - } - - log.Printf("Success: %v", resp.Success) - log.Printf("Content length: %d", resp.ContentLength) - log.Printf("Markdown content:\n%s", resp.Content) +conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials())) +if err != nil { + log.Fatalf("Failed to connect: %v", err) } +defer conn.Close() + +client := docparser.NewDocumentParserClient(conn) + +resp, err := client.ParseDocument(context.Background(), &docparser.ParseRequest{ + FileUrl: "http://localhost:8082/files/abc123.pdf", + FileName: "example.pdf", + FileType: "pdf", + ParserEngine: "builtin", +}) + +if err != nil { + log.Fatalf("Failed to parse: %v", err) +} + +fmt.Println("Markdown content:") +fmt.Println(resp.Content) ``` ## 支持的文件格式 -| 类别 | 支持的扩展名 | -|------|-------------| -| **文档** | pdf, docx, doc, pptx, ppt | -| **表格** | xlsx, xls, csv | -| **文本** | md, markdown, txt | -| **图片** | jpg, jpeg, png, gif, bmp, tiff, webp | -| **网页** | html, htm | - -## 为什么选择 MarkItDown? - -1. **微软官方支持** - Microsoft 开发,持续维护 -2. **格式覆盖全** - 一个库支持所有常见格式 -3. **统一接口** - 无需为每种格式单独实现 -4. **安装简单** - 只需 `pip install markitdown` -5. **性能优秀** - 基于优化的解析算法 - -## 故障排查 - -### 端口已被占用 - -如果提示端口 50051 已被占用,可以更换端口: - -```bash -python main.py --port 50052 -``` - -### gRPC 代码未生成 - -如果提示找不到 `docparser_pb2`,运行: - -```bash -python generate_grpc.py -``` - -### 依赖安装失败 - -确保使用 Python 3.8+: - -```bash -python --version -pip --version -``` +| 格式 | 扩展名 | 说明 | +|------|--------|------| +| PDF | pdf | PDF 文档 | +| Word | docx, doc | Microsoft Word 文档 | +| Excel | xlsx, xls | Microsoft Excel 表格 | +| CSV | csv | 逗号分隔值文件 | +| Markdown | md, markdown | Markdown 文件 | +| 图片 | jpg, jpeg, png, gif, bmp, tiff, webp | 常见图片格式 | +| PowerPoint | pptx, ppt | PowerPoint 演示文稿 | ## 开发 -### 测试解析器 +### 添加新的解析器 -```python -from parser import Parser +1. 继承 `BaseParser` 类 +2. 实现 `parse_into_text` 方法 +3. 在 `registry.py` 中注册 -parser = Parser() +### 添加新的解析引擎 -# 解析文件 -result = parser.parse("path/to/file.pdf") -print(result["content"]) - -# 解析字节内容 -with open("file.pdf", "rb") as f: - content = f.read() -result = parser.parse_bytes(content, "file.pdf") -print(result["content"]) -``` +1. 在 `registry.py` 中使用 `register()` 方法注册 +2. 提供 `check_available` 函数检查依赖 +3. 添加对应的解析器类 ## 许可证 diff --git a/ai-core/main.py b/ai-core/main.py deleted file mode 100644 index 45f29de..0000000 --- a/ai-core/main.py +++ /dev/null @@ -1,59 +0,0 @@ -import argparse -import logging -import os -import sys - -sys.path.insert(0, os.path.dirname(__file__)) - -from service.grpc_server import serve - -DEFAULT_PORT = 50051 -DEFAULT_MAX_WORKERS = 10 - -def main(): - parser = argparse.ArgumentParser( - description="Document Parser gRPC Server (MarkItDown)", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--port", - type=int, - default=DEFAULT_PORT, - help="Port to listen on", - ) - parser.add_argument( - "--max-workers", - type=int, - default=DEFAULT_MAX_WORKERS, - help="Maximum number of worker threads", - ) - parser.add_argument( - "--log-level", - type=str, - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR"], - help="Log level", - ) - - args = parser.parse_args() - - logging.basicConfig( - level=getattr(logging, args.log_level), - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - - logger = logging.getLogger(__name__) - logger.info("Starting Document Parser gRPC Server (MarkItDown)") - logger.info("Port: %d", args.port) - logger.info("Max workers: %d", args.max_workers) - - try: - serve(port=args.port, max_workers=args.max_workers) - except KeyboardInterrupt: - logger.info("Server shutdown requested") - except Exception as e: - logger.error("Server error: %s", str(e), exc_info=True) - sys.exit(1) - -if __name__ == "__main__": - main() diff --git a/ai-core/parser/__init__.py b/ai-core/parser/__init__.py index fca4512..f37075d 100644 --- a/ai-core/parser/__init__.py +++ b/ai-core/parser/__init__.py @@ -1,9 +1,38 @@ """ -Parser module for AI-Core document processing system. +Parser module for WeKnora document processing system. -This module provides document parsing using Microsoft MarkItDown. +This module provides document parsers for various file formats including: +- Microsoft Word documents (.doc, .docx) +- PDF documents +- Markdown files +- Plain text files +- Images with text content +- Web pages + +The parsers extract content from documents and can split them into +meaningful chunks for further processing and indexing. """ +from .doc_parser import DocParser +from .docx2_parser import Docx2Parser +from .excel_parser import ExcelParser +from .image_parser import ImageParser +from .markdown_parser import MarkdownParser from .parser import Parser +from .pdf_parser import PDFParser +from .registry import ParserEngineRegistry, registry +from .web_parser import WebParser -__all__ = ["Parser"] +# Export public classes and modules +__all__ = [ + "Docx2Parser", + "DocParser", + "PDFParser", + "MarkdownParser", + "ImageParser", + "WebParser", + "Parser", + "ExcelParser", + "ParserEngineRegistry", + "registry", +] diff --git a/ai-core/parser/parser.py b/ai-core/parser/parser.py deleted file mode 100644 index a5d1271..0000000 --- a/ai-core/parser/parser.py +++ /dev/null @@ -1,199 +0,0 @@ -import logging -import os -import tempfile -from typing import Optional, Dict, Any -from markitdown import MarkItDown - -from .vlm_client import VLMClient -from .config import get_vlm_config - -logger = logging.getLogger(__name__) - - -class Parser: - """基于 MarkItDown + VLM 的统一文档解析器 - - 支持格式:PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页、Markdown 等 - - VLM 解析: - - 方式一:启动时配置(config.yaml 或环境变量) - - 方式二:gRPC 请求时传入 VLM 配置(优先级更高) - """ - - def __init__(self): - self.markitdown = MarkItDown() - self.vlm_client: Optional[VLMClient] = None - - # 尝试加载配置的 VLM - vlm_config = get_vlm_config() - if vlm_config: - self.vlm_client = VLMClient(vlm_config) - logger.info(f"VLM enabled: provider={vlm_config.get('provider')}, model={vlm_config.get('model')}") - else: - logger.info("VLM not configured, using MarkItDown only") - - def set_vlm_config(self, config: Dict[str, Any]) -> None: - """手动设置 VLM 配置(优先级高于全局配置)""" - if config and config.get("enabled") and config.get("api_key"): - self.vlm_client = VLMClient(config) - logger.info(f"VLM enabled: provider={config.get('provider')}, model={config.get('model')}") - else: - self.vlm_client = None - logger.info("VLM disabled") - - def parse(self, file_path: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict: - """解析文档为 Markdown - - Args: - file_path: 文件路径或 URL - file_type: 文件类型(可选,MarkItDown 会自动检测) - vlm_config: VLM 配置(可选,优先级高于全局配置) - - Returns: - dict: 包含 markdown 内容和元数据 - """ - # 如果有 VLM 配置,覆盖全局配置 - if vlm_config: - self.set_vlm_config(vlm_config) - - try: - logger.info(f"Parsing file: {file_path}") - - result = self.markitdown.convert(file_path) - - logger.info(f"Parse successful: {len(result.text_content)} characters") - - return { - "success": True, - "content": result.text_content, - "content_length": len(result.text_content), - "metadata": result.metadata if hasattr(result, 'metadata') else {} - } - except Exception as e: - logger.error(f"Parse error: {e}", exc_info=True) - return { - "success": False, - "content": "", - "content_length": 0, - "error": str(e) - } - - def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict: - """解析字节内容为 Markdown - - Args: - content: 文件字节内容 - file_name: 文件名 - file_type: 文件类型(可选) - vlm_config: VLM 配置(可选,优先级高于全局配置) - - Returns: - dict: 包含 markdown 内容和元数据 - """ - # 如果有 VLM 配置,覆盖全局配置 - if vlm_config: - self.set_vlm_config(vlm_config) - - try: - logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes") - - # 检查是否应该使用 VLM(根据文件名自动判断) - if self._should_use_vlm(file_name): - logger.info("Using VLM for parsing") - return self._parse_with_vlm(content, file_name) - - # 否则使用 MarkItDown - with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1] or '') as temp_file: - temp_file.write(content) - temp_path = temp_file.name - - try: - result = self.markitdown.convert(temp_path) - - logger.info(f"Parse successful: {len(result.text_content)} characters") - - return { - "success": True, - "content": result.text_content, - "content_length": len(result.text_content), - "metadata": result.metadata if hasattr(result, 'metadata') else {} - } - finally: - os.unlink(temp_path) - except Exception as e: - logger.error(f"Parse bytes error: {e}", exc_info=True) - return { - "success": False, - "content": "", - "content_length": 0, - "error": str(e) - } - - def _should_use_vlm(self, file_name: str) -> bool: - """判断是否应该使用 VLM""" - if not self.vlm_client: - return False - - # 图片文件使用 VLM - image_exts = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff'] - ext = os.path.splitext(file_name)[1].lower() - return ext in image_exts - - def _parse_with_vlm(self, content: bytes, file_name: str) -> dict: - """使用 VLM 解析""" - if not self.vlm_client: - return { - "success": False, - "content": "", - "content_length": 0, - "error": "VLM not configured" - } - - # 确定 MIME 类型 - ext = os.path.splitext(file_name)[1].lower() - mime_types = { - '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.png': 'image/png', - '.gif': 'image/gif', - '.bmp': 'image/bmp', - '.webp': 'image/webp', - '.tiff': 'image/tiff', - } - mime_type = mime_types.get(ext, 'image/png') - - try: - result = self.vlm_client.analyze_image(content, mime_type) - - if result.get("success"): - return { - "success": True, - "content": result["content"], - "content_length": len(result["content"]), - "metadata": {"vlm_used": True} - } - else: - return { - "success": False, - "content": "", - "content_length": 0, - "error": result.get("error", "VLM parsing failed") - } - except Exception as e: - logger.error(f"VLM parsing error: {e}") - return { - "success": False, - "content": "", - "content_length": 0, - "error": str(e) - } - - -if __name__ == "__main__": - parser = Parser() - - # 测试 - test_url = "https://example.com" - result = parser.parse(test_url) - print(f"Success: {result['success']}") - print(f"Content length: {result['content_length']}") diff --git a/ai-core/requirements.txt b/ai-core/requirements.txt deleted file mode 100644 index a348f9f..0000000 --- a/ai-core/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -# AI-Core Document Parser - 基于 MarkItDown - -# gRPC 框架 -grpcio>=1.60.0 -grpcio-tools>=1.60.0 -grpcio-reflection>=1.60.0 -protobuf>=4.25.0 - -# 配置文件解析 -pyyaml>=6.0 -requests>=2.31.0 - -# 文档解析 - markitdown 及其所有依赖 -markitdown[pdf,docx,pptx,xlsx,all]>=0.0.1 \ No newline at end of file diff --git a/ai-core/service/grpc_server.py b/ai-core/service/grpc_server.py deleted file mode 100644 index 2d8a88c..0000000 --- a/ai-core/service/grpc_server.py +++ /dev/null @@ -1,259 +0,0 @@ -import logging -import requests -from concurrent import futures - -import grpc -from grpc_reflection.v1alpha import reflection - -import sys -import os - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto")) - -from parser.parser import Parser - -logger = logging.getLogger(__name__) - -docparser_pb2 = None -docparser_pb2_grpc = None - -def _import_grpc_protobuf(): - """Import gRPC protobuf modules""" - global docparser_pb2, docparser_pb2_grpc - if docparser_pb2 is not None and docparser_pb2_grpc is not None: - return - - try: - import document_parser_pb2 as dpb2 - import document_parser_pb2_grpc as dpb2_grpc - docparser_pb2 = dpb2 - docparser_pb2_grpc = dpb2_grpc - logger.info("Successfully imported gRPC protobuf modules") - except ImportError as e: - logger.error(f"Failed to import gRPC protobuf: {e}") - raise ImportError( - "gRPC protobuf files not found. Please run: python generate_grpc.py" - ) from e - - -class DocumentParserServicer: - """gRPC 服务实现,使用 MarkItDown""" - - def __init__(self, max_workers: int = 10): - _import_grpc_protobuf() - self.parser = Parser() - self.max_workers = max_workers - logger.info("DocumentParserServicer initialized") - - def ParseDocument(self, request, context): - """解析文档""" - try: - logger.info( - "ParseDocument request: file_url=%s, file_name=%s, file_type=%s", - request.file_url, - request.file_name, - request.file_type, - ) - - file_url = request.file_url - file_name = request.file_name - - if not file_url: - return docparser_pb2.ParseResponse( - success=False, - content="", - message="file_url is required", - content_length=0, - ) - - if not file_name: - return docparser_pb2.ParseResponse( - success=False, - content="", - message="file_name is required", - content_length=0, - ) - - # 提取 VLM 配置 - vlm_config = None - if hasattr(request, 'vlm_config') and request.vlm_config: - vlm_cfg = request.vlm_config - if vlm_cfg.enabled: - vlm_config = { - "enabled": vlm_cfg.enabled, - "provider": vlm_cfg.provider, - "model": vlm_cfg.model, - "api_key": vlm_cfg.api_key, - "base_url": vlm_cfg.base_url, - "prompt": vlm_cfg.prompt, - } - logger.info(f"VLM config: provider={vlm_cfg.provider}, model={vlm_cfg.model}") - - logger.info("Downloading file from URL: %s", file_url) - - try: - response = requests.get( - file_url, - timeout=60, - headers={"User-Agent": "DocParser/1.0"}, - ) - response.raise_for_status() - content = response.content - logger.info("Downloaded %d bytes", len(content)) - except requests.RequestException as e: - logger.error("Failed to download file: %s", str(e)) - return docparser_pb2.ParseResponse( - success=False, - content="", - message=f"Failed to download file: {str(e)}", - content_length=0, - ) - - logger.info("Parsing file with MarkItDown + VLM") - - result = self.parser.parse_bytes(content, file_name, vlm_config=vlm_config) - - if not result.get("success", False): - logger.warning("Parser returned failure: %s", result.get("error", "Unknown error")) - return docparser_pb2.ParseResponse( - success=False, - content="", - message=result.get("error", "Parse failed"), - content_length=0, - ) - - markdown_content = result.get("content", "") - logger.info( - "Parse successful: content_length=%d", - len(markdown_content), - ) - - return docparser_pb2.ParseResponse( - success=True, - content=markdown_content, - message="Parse successful", - content_length=len(markdown_content), - file_type=request.file_type or "auto", - parser_engine="markitdown", - ) - - except Exception as e: - logger.error("ParseDocument error: %s", str(e), exc_info=True) - return docparser_pb2.ParseResponse( - success=False, - content="", - message=f"Parse error: {str(e)}", - content_length=0, - ) - - def GetSupportedFormats(self, request, context): - """获取支持的文件格式""" - try: - logger.info("GetSupportedFormats request") - - file_types = [ - "pdf", "docx", "doc", "pptx", "ppt", - "xlsx", "xls", "csv", - "md", "markdown", - "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", - "html", "htm", "txt", - ] - - file_type_descriptions = { - "pdf": "PDF Document", - "docx": "Microsoft Word Document", - "doc": "Microsoft Word Document (Legacy)", - "pptx": "Microsoft PowerPoint Presentation", - "ppt": "Microsoft PowerPoint Presentation (Legacy)", - "xlsx": "Microsoft Excel Spreadsheet", - "xls": "Microsoft Excel Spreadsheet (Legacy)", - "csv": "Comma-Separated Values", - "md": "Markdown File", - "markdown": "Markdown File", - "jpg": "JPEG Image", - "jpeg": "JPEG Image", - "png": "PNG Image", - "gif": "GIF Image", - "bmp": "BMP Image", - "tiff": "TIFF Image", - "webp": "WebP Image", - "html": "HTML Document", - "htm": "HTML Document", - "txt": "Plain Text File", - } - - return docparser_pb2.SupportedFormatsResponse( - file_types=file_types, - file_type_descriptions=file_type_descriptions, - ) - except Exception as e: - logger.error("GetSupportedFormats error: %s", str(e), exc_info=True) - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(str(e)) - return docparser_pb2.SupportedFormatsResponse() - - def GetEngines(self, request, context): - """获取可用的解析引擎列表""" - try: - logger.info("GetEngines request") - - engine_info = docparser_pb2.EngineInfo( - name="markitdown", - description="Microsoft MarkItDown - 统一文档解析引擎", - supported_file_types=[ - "pdf", "docx", "doc", "pptx", "ppt", - "xlsx", "xls", "csv", - "md", "markdown", - "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", - "html", "htm", "txt", - ], - available=True, - unavailable_reason="", - ) - - return docparser_pb2.EnginesResponse(engines=[engine_info]) - except Exception as e: - logger.error("GetEngines error: %s", str(e), exc_info=True) - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(str(e)) - return docparser_pb2.EnginesResponse() - - -def serve(port: int = 50051, max_workers: int = 10): - """启动 gRPC 服务""" - _import_grpc_protobuf() - - server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers)) - - servicer = DocumentParserServicer(max_workers=max_workers) - docparser_pb2_grpc.add_DocumentParserServicer_to_server(servicer, server) - - reflection.enable_server_reflection( - service_names=[ - docparser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name, - reflection.SERVICE_NAME, - ], - server=server, - ) - - server.add_insecure_port(f"0.0.0.0:{port}") - server.start() - - logger.info("DocumentParser gRPC server (MarkItDown) started on port %d", port) - logger.info("gRPC reflection enabled") - - try: - server.wait_for_termination() - except KeyboardInterrupt: - logger.info("Shutting down server...") - server.stop(0) - logger.info("Server stopped") - - -if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - serve() diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml deleted file mode 100644 index 340dc1f..0000000 --- a/docker-compose.dev.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - # MySQL 数据库 - x-agent-mysql: - image: mysql:8.0 - container_name: x-agents-mysql - environment: - MYSQL_ROOT_PASSWORD: root - MYSQL_DATABASE: x_agents - volumes: - - mysql-data:/var/lib/mysql - ports: - - "6036:3306" - healthcheck: - test: ["CMD", "mysqladmin", "ping", "-h", "localhost"] - interval: 10s - timeout: 5s - retries: 5 - restart: unless-stopped - command: --default-authentication-plugin=mysql_native_password - - # Redis - x-agent-redis: - image: redis:7-alpine - container_name: x-agents-redis - ports: - - "6037:6379" - volumes: - - redis-data:/data - restart: unless-stopped - -volumes: - mysql-data: - redis-data: