From 797518ec769bdfcf4ce0f2c7d715fcfd1524d3d5 Mon Sep 17 00:00:00 2001 From: "DESKTOP-72TV0V4\\caoxiaozhu" Date: Mon, 9 Mar 2026 10:27:08 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E9=87=8D=E6=9E=84=20algorithm=20?= =?UTF-8?q?=E4=B8=BA=20ai-core=20=E4=BB=A3=E7=A0=81=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ai-core 目录,包含代码解析核心服务 - 添加 proto 定义、parser、service 模块 - 添加启动脚本和依赖配置 Co-Authored-By: Claude Opus 4.6 --- ai-core/.gitignore | 50 +++++ ai-core/README.md | 221 ++++++++++++++++++++ ai-core/generate_grpc.py | 46 ++++ ai-core/main.py | 59 ++++++ ai-core/parser/__init__.py | 9 + ai-core/parser/parser.py | 100 +++++++++ ai-core/proto/document_parser.proto | 47 +++++ ai-core/proto/document_parser_pb2.py | 57 +++++ ai-core/proto/document_parser_pb2_grpc.py | 183 ++++++++++++++++ ai-core/requirements.txt | 13 ++ ai-core/service/__init__.py | 0 ai-core/service/grpc_server.py | 244 ++++++++++++++++++++++ ai-core/start.bat | 13 ++ ai-core/start.ps1 | 35 ++++ ai-core/start.sh | 86 ++++++++ 15 files changed, 1163 insertions(+) create mode 100644 ai-core/.gitignore create mode 100644 ai-core/README.md create mode 100644 ai-core/generate_grpc.py create mode 100644 ai-core/main.py create mode 100644 ai-core/parser/__init__.py create mode 100644 ai-core/parser/parser.py create mode 100644 ai-core/proto/document_parser.proto create mode 100644 ai-core/proto/document_parser_pb2.py create mode 100644 ai-core/proto/document_parser_pb2_grpc.py create mode 100644 ai-core/requirements.txt create mode 100644 ai-core/service/__init__.py create mode 100644 ai-core/service/grpc_server.py create mode 100644 ai-core/start.bat create mode 100644 ai-core/start.ps1 create mode 100644 ai-core/start.sh diff --git a/ai-core/.gitignore b/ai-core/.gitignore new file mode 100644 index 0000000..531de6c --- /dev/null +++ b/ai-core/.gitignore @@ -0,0 +1,50 @@ +# Python +__pycache__/ +*.py[cod] 
+*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Generated gRPC files (optional - uncomment if you want to exclude them) +# proto/*_pb2.py +# proto/*_pb2_grpc.py + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Environment variables +.env +.env.local + +# Temporary files +*.tmp +*.bak diff --git a/ai-core/README.md b/ai-core/README.md new file mode 100644 index 0000000..7596d67 --- /dev/null +++ b/ai-core/README.md @@ -0,0 +1,221 @@ +# AI-Core 文档解析服务 + +基于 Python 和 Microsoft MarkItDown 的 gRPC 文档解析服务,支持多种文件格式转换为 Markdown。 + +## 特性 + +- **统一解析引擎** - 使用 Microsoft MarkItDown,一个库支持所有格式 +- **支持格式广泛** - PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页等 +- **gRPC 接口** - 高性能、类型安全的 RPC 通信 +- **依赖简单** - 只需安装 6 个依赖包 +- **易于部署** - 一键启动,开箱即用 + +## 项目结构 + +``` +ai-core/ +├── main.py # 服务启动入口 +├── requirements.txt # Python 依赖(共 6 个包) +├── generate_grpc.py # gRPC 代码生成脚本 +├── start.sh # Linux/Mac 启动脚本 +├── start.ps1 # Windows 启动脚本 +├── proto/ # gRPC 协议定义 +│ ├── document_parser.proto # Protocol Buffers 定义 +│ ├── document_parser_pb2.py # 生成的 Python 代码 +│ └── document_parser_pb2_grpc.py +├── parser/ # 文档解析器模块 +│ ├── __init__.py +│ └── parser.py # MarkItDown 解析器 +└── service/ # gRPC 服务实现 + ├── __init__.py + └── grpc_server.py # gRPC 服务器 +``` + +## 安装 + +### 1. 安装依赖 + +```bash +pip install -r requirements.txt +``` + +依赖包: +- `grpcio` - gRPC 框架 +- `grpcio-tools` - gRPC 工具 +- `grpcio-reflection` - gRPC 反射 +- `protobuf` - Protocol Buffers +- `requests` - HTTP 请求 +- `markitdown` - Microsoft 文档解析引擎 + +### 2. 
生成 gRPC 代码 + +```bash +python generate_grpc.py +``` + +这会在 `proto` 目录下生成两个文件: +- `document_parser_pb2.py` +- `document_parser_pb2_grpc.py` + +## 使用 + +### 方式 1: 使用启动脚本(推荐) + +**Windows:** +```powershell +.\start.ps1 +``` + +**Linux/Mac:** +```bash +bash start.sh +``` + +### 方式 2: 直接运行 + +```bash +python main.py --port 50051 --max-workers 10 +``` + +参数说明: +- `--port`: gRPC 服务端口(默认 50051) +- `--max-workers`: 最大工作线程数(默认 10) +- `--log-level`: 日志级别(DEBUG/INFO/WARNING/ERROR,默认 INFO) + +## gRPC 接口 + +### ParseDocument + +解析文档为 Markdown + +```protobuf +message ParseRequest { + string file_url = 1; // 文件 URL(必填) + string file_name = 2; // 文件名(必填) + string file_type = 3; // 文件类型(可选) + string parser_engine = 4; // 解析引擎(可选) + map<string, string> engine_overrides = 5;// 引擎参数覆盖(可选) +} + +message ParseResponse { + bool success = 1; // 是否成功 + string content = 2; // Markdown 内容 + string message = 3; // 消息 + int32 content_length = 4; // 内容长度 + string file_type = 5; // 文件类型 + string parser_engine = 6; // 使用的解析引擎 +} +``` + +### GetSupportedFormats + +获取支持的文件格式列表 + +### GetEngines + +获取可用的解析引擎列表 + +## Go 客户端调用示例 + +```go +import ( + "context" + "log" + + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +func main() { + conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + log.Fatalf("Failed to connect: %v", err) + } + defer conn.Close() + + client := docparser.NewDocumentParserClient(conn) + + resp, err := client.ParseDocument(context.Background(), &docparser.ParseRequest{ + FileUrl: "http://localhost:8082/files/abc123.pdf", + FileName: "example.pdf", + FileType: "pdf", + }) + + if err != nil { + log.Fatalf("Failed to parse: %v", err) + } + + log.Printf("Success: %v", resp.Success) + log.Printf("Content length: %d", resp.ContentLength) + log.Printf("Markdown content:\n%s", resp.Content) +} +``` + +## 支持的文件格式 + +| 类别 | 支持的扩展名 | +|------|-------------| +| **文档** | pdf, docx, doc, pptx, ppt | +| **表格** | xlsx, 
xls, csv | +| **文本** | md, markdown, txt | +| **图片** | jpg, jpeg, png, gif, bmp, tiff, webp | +| **网页** | html, htm | + +## 为什么选择 MarkItDown? + +1. **微软官方支持** - Microsoft 开发,持续维护 +2. **格式覆盖全** - 一个库支持所有常见格式 +3. **统一接口** - 无需为每种格式单独实现 +4. **安装简单** - 只需 `pip install markitdown` +5. **性能优秀** - 基于优化的解析算法 + +## 故障排查 + +### 端口已被占用 + +如果提示端口 50051 已被占用,可以更换端口: + +```bash +python main.py --port 50052 +``` + +### gRPC 代码未生成 + +如果提示找不到 `document_parser_pb2`,运行: + +```bash +python generate_grpc.py +``` + +### 依赖安装失败 + +确保使用 Python 3.8+: + +```bash +python --version +pip --version +``` + +## 开发 + +### 测试解析器 + +```python +from parser import Parser + +parser = Parser() + +# 解析文件 +result = parser.parse("path/to/file.pdf") +print(result["content"]) + +# 解析字节内容 +with open("file.pdf", "rb") as f: + content = f.read() +result = parser.parse_bytes(content, "file.pdf") +print(result["content"]) +``` + +## 许可证 + +MIT License diff --git a/ai-core/generate_grpc.py b/ai-core/generate_grpc.py new file mode 100644 index 0000000..6933f19 --- /dev/null +++ b/ai-core/generate_grpc.py @@ -0,0 +1,46 @@ +import subprocess +import sys +import os + +proto_file = "proto/document_parser.proto" +proto_path = "proto" +python_out = "proto" +grpc_python_out = "proto" + +def generate_grpc(): + """Generate gRPC Python code from proto file""" + print(f"Generating gRPC code from {proto_file}...") + + cmd = [ + sys.executable, + "-m", + "grpc_tools.protoc", + f"--proto_path={proto_path}", + f"--python_out={python_out}", + f"--grpc_python_out={grpc_python_out}", + proto_file, + ] + + try: + subprocess.run(cmd, check=True) + print("gRPC code generated successfully!") + + pb2_file = os.path.join(python_out, "document_parser_pb2.py") + pb2_grpc_file = os.path.join(python_out, "document_parser_pb2_grpc.py") + + if os.path.exists(pb2_file) and os.path.exists(pb2_grpc_file): + print(f"Generated files:") + print(f" - {pb2_file}") + print(f" - {pb2_grpc_file}") + else: + print("Warning: Expected files not found") + + except 
subprocess.CalledProcessError as e: + print(f"Error generating gRPC code: {e}") + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}") + sys.exit(1) + +if __name__ == "__main__": + generate_grpc() diff --git a/ai-core/main.py b/ai-core/main.py new file mode 100644 index 0000000..45f29de --- /dev/null +++ b/ai-core/main.py @@ -0,0 +1,59 @@ +import argparse +import logging +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +from service.grpc_server import serve + +DEFAULT_PORT = 50051 +DEFAULT_MAX_WORKERS = 10 + +def main(): + parser = argparse.ArgumentParser( + description="Document Parser gRPC Server (MarkItDown)", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--port", + type=int, + default=DEFAULT_PORT, + help="Port to listen on", + ) + parser.add_argument( + "--max-workers", + type=int, + default=DEFAULT_MAX_WORKERS, + help="Maximum number of worker threads", + ) + parser.add_argument( + "--log-level", + type=str, + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Log level", + ) + + args = parser.parse_args() + + logging.basicConfig( + level=getattr(logging, args.log_level), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + logger = logging.getLogger(__name__) + logger.info("Starting Document Parser gRPC Server (MarkItDown)") + logger.info("Port: %d", args.port) + logger.info("Max workers: %d", args.max_workers) + + try: + serve(port=args.port, max_workers=args.max_workers) + except KeyboardInterrupt: + logger.info("Server shutdown requested") + except Exception as e: + logger.error("Server error: %s", str(e), exc_info=True) + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/ai-core/parser/__init__.py b/ai-core/parser/__init__.py new file mode 100644 index 0000000..fca4512 --- /dev/null +++ b/ai-core/parser/__init__.py @@ -0,0 +1,9 @@ +""" +Parser module for AI-Core document processing system. 
+ +This module provides document parsing using Microsoft MarkItDown. +""" + +from .parser import Parser + +__all__ = ["Parser"] diff --git a/ai-core/parser/parser.py b/ai-core/parser/parser.py new file mode 100644 index 0000000..54838fe --- /dev/null +++ b/ai-core/parser/parser.py @@ -0,0 +1,100 @@ +import logging +import os +import tempfile +from typing import Optional +from markitdown import MarkItDown + +logger = logging.getLogger(__name__) + + +class Parser: + """基于 MarkItDown 的统一文档解析器 + + 支持格式:PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页、Markdown 等 + """ + + def __init__(self): + self.markitdown = MarkItDown() + logger.info("Parser initialized with MarkItDown") + + def parse(self, file_path: str, file_type: Optional[str] = None) -> dict: + """解析文档为 Markdown + + Args: + file_path: 文件路径或 URL + file_type: 文件类型(可选,MarkItDown 会自动检测) + + Returns: + dict: 包含 markdown 内容和元数据 + """ + try: + logger.info(f"Parsing file: {file_path}") + + result = self.markitdown.convert(file_path) + + logger.info(f"Parse successful: {len(result.text_content)} characters") + + return { + "success": True, + "content": result.text_content, + "content_length": len(result.text_content), + "metadata": result.metadata if hasattr(result, 'metadata') else {} + } + except Exception as e: + logger.error(f"Parse error: {e}", exc_info=True) + return { + "success": False, + "content": "", + "content_length": 0, + "error": str(e) + } + + def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None) -> dict: + """解析字节内容为 Markdown + + Args: + content: 文件字节内容 + file_name: 文件名 + file_type: 文件类型(可选) + + Returns: + dict: 包含 markdown 内容和元数据 + """ + try: + logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes") + + with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1] or '') as temp_file: + temp_file.write(content) + temp_path = temp_file.name + + try: + result = self.markitdown.convert(temp_path) + + logger.info(f"Parse successful: 
{len(result.text_content)} characters") + + return { + "success": True, + "content": result.text_content, + "content_length": len(result.text_content), + "metadata": result.metadata if hasattr(result, 'metadata') else {} + } + finally: + os.unlink(temp_path) + except Exception as e: + logger.error(f"Parse bytes error: {e}", exc_info=True) + return { + "success": False, + "content": "", + "content_length": 0, + "error": str(e) + } + + +if __name__ == "__main__": + parser = Parser() + + # Test + test_url = "https://example.com" + result = parser.parse(test_url) + print(f"Success: {result['success']}") + print(f"Content length: {result['content_length']}") diff --git a/ai-core/proto/document_parser.proto b/ai-core/proto/document_parser.proto new file mode 100644 index 0000000..9024679 --- /dev/null +++ b/ai-core/proto/document_parser.proto @@ -0,0 +1,47 @@ +syntax = "proto3"; + +package docparser; + +option go_package = "x-agents/proto/docparser"; + +service DocumentParser { + rpc ParseDocument(ParseRequest) returns (ParseResponse); + rpc GetSupportedFormats(Empty) returns (SupportedFormatsResponse); + rpc GetEngines(Empty) returns (EnginesResponse); +} + +message ParseRequest { + string file_url = 1; // URL the server downloads the document from (required) + string file_name = 2; // File name; its extension becomes the temp-file suffix (required) + string file_type = 3; // Optional; echoed back in the response ("auto" when empty) + string parser_engine = 4; // Optional parser engine name + map<string, string> engine_overrides = 5; // Optional engine parameter overrides +} + +message ParseResponse { + bool success = 1; // False when validation, download or parsing failed + string content = 2; // Markdown text (empty on failure) + string message = 3; // Status or error message + int32 content_length = 4; // Length of content in characters + string file_type = 5; // Set on success only + string parser_engine = 6; // Engine that produced the result; set on success only +} + +message Empty {} + +message SupportedFormatsResponse { + repeated string file_types = 1; // Supported file extensions, without the dot + map<string, string> file_type_descriptions = 2; // Extension -> human-readable description +} + +message EnginesResponse { + repeated EngineInfo engines = 1; +} + +message EngineInfo { + string name = 1; + string description = 2; + repeated string supported_file_types = 3; + bool available = 4; + string unavailable_reason = 5; // Empty when available is true +} diff --git a/ai-core/proto/document_parser_pb2.py b/ai-core/proto/document_parser_pb2.py new file mode 100644 index 0000000..898fb6f 
--- /dev/null +++ b/ai-core/proto/document_parser_pb2.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: document_parser.proto +# Protobuf Python Version: 6.31.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'document_parser.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x64ocument_parser.proto\x12\tdocparser\"\xdd\x01\n\x0cParseRequest\x12\x10\n\x08\x66ile_url\x18\x01 \x01(\t\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12\x15\n\rparser_engine\x18\x04 \x01(\t\x12\x46\n\x10\x65ngine_overrides\x18\x05 \x03(\x0b\x32,.docparser.ParseRequest.EngineOverridesEntry\x1a\x36\n\x14\x45ngineOverridesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x84\x01\n\rParseResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\x12\x16\n\x0e\x63ontent_length\x18\x04 \x01(\x05\x12\x11\n\tfile_type\x18\x05 \x01(\t\x12\x15\n\rparser_engine\x18\x06 \x01(\t\"\x07\n\x05\x45mpty\"\xca\x01\n\x18SupportedFormatsResponse\x12\x12\n\nfile_types\x18\x01 \x03(\t\x12]\n\x16\x66ile_type_descriptions\x18\x02 \x03(\x0b\x32=.docparser.SupportedFormatsResponse.FileTypeDescriptionsEntry\x1a;\n\x19\x46ileTypeDescriptionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"9\n\x0f\x45nginesResponse\x12&\n\x07\x65ngines\x18\x01 
\x03(\x0b\x32\x15.docparser.EngineInfo\"|\n\nEngineInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x1c\n\x14supported_file_types\x18\x03 \x03(\t\x12\x11\n\tavailable\x18\x04 \x01(\x08\x12\x1a\n\x12unavailable_reason\x18\x05 \x01(\t2\xde\x01\n\x0e\x44ocumentParser\x12\x42\n\rParseDocument\x12\x17.docparser.ParseRequest\x1a\x18.docparser.ParseResponse\x12L\n\x13GetSupportedFormats\x12\x10.docparser.Empty\x1a#.docparser.SupportedFormatsResponse\x12:\n\nGetEngines\x12\x10.docparser.Empty\x1a\x1a.docparser.EnginesResponseB\x1aZ\x18x-agents/proto/docparserb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'document_parser_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'Z\030x-agents/proto/docparser' + _globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._loaded_options = None + _globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._serialized_options = b'8\001' + _globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._loaded_options = None + _globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._serialized_options = b'8\001' + _globals['_PARSEREQUEST']._serialized_start=37 + _globals['_PARSEREQUEST']._serialized_end=258 + _globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._serialized_start=204 + _globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._serialized_end=258 + _globals['_PARSERESPONSE']._serialized_start=261 + _globals['_PARSERESPONSE']._serialized_end=393 + _globals['_EMPTY']._serialized_start=395 + _globals['_EMPTY']._serialized_end=402 + _globals['_SUPPORTEDFORMATSRESPONSE']._serialized_start=405 + _globals['_SUPPORTEDFORMATSRESPONSE']._serialized_end=607 + _globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._serialized_start=548 + 
_globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._serialized_end=607 + _globals['_ENGINESRESPONSE']._serialized_start=609 + _globals['_ENGINESRESPONSE']._serialized_end=666 + _globals['_ENGINEINFO']._serialized_start=668 + _globals['_ENGINEINFO']._serialized_end=792 + _globals['_DOCUMENTPARSER']._serialized_start=795 + _globals['_DOCUMENTPARSER']._serialized_end=1017 +# @@protoc_insertion_point(module_scope) diff --git a/ai-core/proto/document_parser_pb2_grpc.py b/ai-core/proto/document_parser_pb2_grpc.py new file mode 100644 index 0000000..e84526a --- /dev/null +++ b/ai-core/proto/document_parser_pb2_grpc.py @@ -0,0 +1,183 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc +import warnings + +import document_parser_pb2 as document__parser__pb2 + +GRPC_GENERATED_VERSION = '1.78.0' +GRPC_VERSION = grpc.__version__ +_version_not_supported = False + +try: + from grpc._utilities import first_version_is_lower + _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) +except ImportError: + _version_not_supported = True + +if _version_not_supported: + raise RuntimeError( + f'The grpc package installed is at version {GRPC_VERSION},' + + ' but the generated code in document_parser_pb2_grpc.py depends on' + + f' grpcio>={GRPC_GENERATED_VERSION}.' + + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' + ) + + +class DocumentParserStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.ParseDocument = channel.unary_unary( + '/docparser.DocumentParser/ParseDocument', + request_serializer=document__parser__pb2.ParseRequest.SerializeToString, + response_deserializer=document__parser__pb2.ParseResponse.FromString, + _registered_method=True) + self.GetSupportedFormats = channel.unary_unary( + '/docparser.DocumentParser/GetSupportedFormats', + request_serializer=document__parser__pb2.Empty.SerializeToString, + response_deserializer=document__parser__pb2.SupportedFormatsResponse.FromString, + _registered_method=True) + self.GetEngines = channel.unary_unary( + '/docparser.DocumentParser/GetEngines', + request_serializer=document__parser__pb2.Empty.SerializeToString, + response_deserializer=document__parser__pb2.EnginesResponse.FromString, + _registered_method=True) + + +class DocumentParserServicer(object): + """Missing associated documentation comment in .proto file.""" + + def ParseDocument(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def GetSupportedFormats(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def GetEngines(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_DocumentParserServicer_to_server(servicer, server): + rpc_method_handlers = { + 'ParseDocument': grpc.unary_unary_rpc_method_handler( + servicer.ParseDocument, + request_deserializer=document__parser__pb2.ParseRequest.FromString, + 
response_serializer=document__parser__pb2.ParseResponse.SerializeToString, + ), + 'GetSupportedFormats': grpc.unary_unary_rpc_method_handler( + servicer.GetSupportedFormats, + request_deserializer=document__parser__pb2.Empty.FromString, + response_serializer=document__parser__pb2.SupportedFormatsResponse.SerializeToString, + ), + 'GetEngines': grpc.unary_unary_rpc_method_handler( + servicer.GetEngines, + request_deserializer=document__parser__pb2.Empty.FromString, + response_serializer=document__parser__pb2.EnginesResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'docparser.DocumentParser', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + server.add_registered_method_handlers('docparser.DocumentParser', rpc_method_handlers) + + + # This class is part of an EXPERIMENTAL API. +class DocumentParser(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def ParseDocument(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/docparser.DocumentParser/ParseDocument', + document__parser__pb2.ParseRequest.SerializeToString, + document__parser__pb2.ParseResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def GetSupportedFormats(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/docparser.DocumentParser/GetSupportedFormats', + document__parser__pb2.Empty.SerializeToString, + document__parser__pb2.SupportedFormatsResponse.FromString, + options, + 
channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def GetEngines(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/docparser.DocumentParser/GetEngines', + document__parser__pb2.Empty.SerializeToString, + document__parser__pb2.EnginesResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) diff --git a/ai-core/requirements.txt b/ai-core/requirements.txt new file mode 100644 index 0000000..5872a22 --- /dev/null +++ b/ai-core/requirements.txt @@ -0,0 +1,13 @@ +# AI-Core Document Parser - 基于 MarkItDown + +# gRPC 框架 +grpcio>=1.60.0 +grpcio-tools>=1.60.0 +grpcio-reflection>=1.60.0 +protobuf>=4.25.0 + +# HTTP 请求 +requests>=2.31.0 + +# 文档解析 +markitdown>=0.0.1 diff --git a/ai-core/service/__init__.py b/ai-core/service/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ai-core/service/grpc_server.py b/ai-core/service/grpc_server.py new file mode 100644 index 0000000..9a5f49c --- /dev/null +++ b/ai-core/service/grpc_server.py @@ -0,0 +1,244 @@ +import logging +import requests +from concurrent import futures + +import grpc +from grpc_reflection.v1alpha import reflection + +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto")) + +from parser.parser import Parser + +logger = logging.getLogger(__name__) + +docparser_pb2 = None +docparser_pb2_grpc = None + +def _import_grpc_protobuf(): + """Import gRPC protobuf modules""" + global docparser_pb2, docparser_pb2_grpc + if docparser_pb2 is not None and docparser_pb2_grpc is not None: + return + + 
try: + import document_parser_pb2 as dpb2 + import document_parser_pb2_grpc as dpb2_grpc + docparser_pb2 = dpb2 + docparser_pb2_grpc = dpb2_grpc + logger.info("Successfully imported gRPC protobuf modules") + except ImportError as e: + logger.error(f"Failed to import gRPC protobuf: {e}") + raise ImportError( + "gRPC protobuf files not found. Please run: python generate_grpc.py" + ) from e + + +class DocumentParserServicer: + """gRPC 服务实现,使用 MarkItDown""" + + def __init__(self, max_workers: int = 10): + _import_grpc_protobuf() + self.parser = Parser() + self.max_workers = max_workers + logger.info("DocumentParserServicer initialized") + + def ParseDocument(self, request, context): + """解析文档""" + try: + logger.info( + "ParseDocument request: file_url=%s, file_name=%s, file_type=%s", + request.file_url, + request.file_name, + request.file_type, + ) + + file_url = request.file_url + file_name = request.file_name + + if not file_url: + return docparser_pb2.ParseResponse( + success=False, + content="", + message="file_url is required", + content_length=0, + ) + + if not file_name: + return docparser_pb2.ParseResponse( + success=False, + content="", + message="file_name is required", + content_length=0, + ) + + logger.info("Downloading file from URL: %s", file_url) + + try: + response = requests.get( + file_url, + timeout=60, + headers={"User-Agent": "DocParser/1.0"}, + ) + response.raise_for_status() + content = response.content + logger.info("Downloaded %d bytes", len(content)) + except requests.RequestException as e: + logger.error("Failed to download file: %s", str(e)) + return docparser_pb2.ParseResponse( + success=False, + content="", + message=f"Failed to download file: {str(e)}", + content_length=0, + ) + + logger.info("Parsing file with MarkItDown") + + result = self.parser.parse_bytes(content, file_name) + + if not result.get("success", False): + logger.warning("Parser returned failure: %s", result.get("error", "Unknown error")) + return 
docparser_pb2.ParseResponse( + success=False, + content="", + message=result.get("error", "Parse failed"), + content_length=0, + ) + + markdown_content = result.get("content", "") + logger.info( + "Parse successful: content_length=%d", + len(markdown_content), + ) + + return docparser_pb2.ParseResponse( + success=True, + content=markdown_content, + message="Parse successful", + content_length=len(markdown_content), + file_type=request.file_type or "auto", + parser_engine="markitdown", + ) + + except Exception as e: + logger.error("ParseDocument error: %s", str(e), exc_info=True) + return docparser_pb2.ParseResponse( + success=False, + content="", + message=f"Parse error: {str(e)}", + content_length=0, + ) + + def GetSupportedFormats(self, request, context): + """获取支持的文件格式""" + try: + logger.info("GetSupportedFormats request") + + file_types = [ + "pdf", "docx", "doc", "pptx", "ppt", + "xlsx", "xls", "csv", + "md", "markdown", + "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", + "html", "htm", "txt", + ] + + file_type_descriptions = { + "pdf": "PDF Document", + "docx": "Microsoft Word Document", + "doc": "Microsoft Word Document (Legacy)", + "pptx": "Microsoft PowerPoint Presentation", + "ppt": "Microsoft PowerPoint Presentation (Legacy)", + "xlsx": "Microsoft Excel Spreadsheet", + "xls": "Microsoft Excel Spreadsheet (Legacy)", + "csv": "Comma-Separated Values", + "md": "Markdown File", + "markdown": "Markdown File", + "jpg": "JPEG Image", + "jpeg": "JPEG Image", + "png": "PNG Image", + "gif": "GIF Image", + "bmp": "BMP Image", + "tiff": "TIFF Image", + "webp": "WebP Image", + "html": "HTML Document", + "htm": "HTML Document", + "txt": "Plain Text File", + } + + return docparser_pb2.SupportedFormatsResponse( + file_types=file_types, + file_type_descriptions=file_type_descriptions, + ) + except Exception as e: + logger.error("GetSupportedFormats error: %s", str(e), exc_info=True) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(str(e)) + return 
docparser_pb2.SupportedFormatsResponse() + + def GetEngines(self, request, context): + """获取可用的解析引擎列表""" + try: + logger.info("GetEngines request") + + engine_info = docparser_pb2.EngineInfo( + name="markitdown", + description="Microsoft MarkItDown - 统一文档解析引擎", + supported_file_types=[ + "pdf", "docx", "doc", "pptx", "ppt", + "xlsx", "xls", "csv", + "md", "markdown", + "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", + "html", "htm", "txt", + ], + available=True, + unavailable_reason="", + ) + + return docparser_pb2.EnginesResponse(engines=[engine_info]) + except Exception as e: + logger.error("GetEngines error: %s", str(e), exc_info=True) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(str(e)) + return docparser_pb2.EnginesResponse() + + +def serve(port: int = 50051, max_workers: int = 10): + """启动 gRPC 服务""" + _import_grpc_protobuf() + + server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers)) + + servicer = DocumentParserServicer(max_workers=max_workers) + docparser_pb2_grpc.add_DocumentParserServicer_to_server(servicer, server) + + reflection.enable_server_reflection( + service_names=[ + docparser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name, + reflection.SERVICE_NAME, + ], + server=server, + ) + + server.add_insecure_port(f"[::]:{port}") + server.start() + + logger.info("DocumentParser gRPC server (MarkItDown) started on port %d", port) + logger.info("gRPC reflection enabled") + + try: + server.wait_for_termination() + except KeyboardInterrupt: + logger.info("Shutting down server...") + server.stop(0) + logger.info("Server stopped") + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + serve() diff --git a/ai-core/start.bat b/ai-core/start.bat new file mode 100644 index 0000000..74a83fb --- /dev/null +++ b/ai-core/start.bat @@ -0,0 +1,13 @@ +@echo off +cd /d %~dp0 +echo Starting AI-Core Service... +echo. 
+ +REM Activate the virtual environment +call venv\Scripts\activate.bat + +REM Start the service +python main.py %* + +REM Wait for a key press before closing the window +pause diff --git a/ai-core/start.ps1 b/ai-core/start.ps1 new file mode 100644 index 0000000..52e69f7 --- /dev/null +++ b/ai-core/start.ps1 @@ -0,0 +1,35 @@ +# AI-Core gRPC Server Startup Script + +Write-Host "Starting AI-Core Document Parser gRPC Server..." -ForegroundColor Green + +# Check if Python is installed +if (-not (Get-Command python -ErrorAction SilentlyContinue)) { + Write-Host "Error: Python is not installed or not in PATH" -ForegroundColor Red + exit 1 +} + +# Check if requirements are installed (the grpcio distribution is imported as "grpc") +$requirementsInstalled = python -c "import grpc" 2>$null +if (-not $?) { + Write-Host "Installing Python dependencies..." -ForegroundColor Yellow + pip install -r requirements.txt + if ($LASTEXITCODE -ne 0) { + Write-Host "Error: Failed to install dependencies" -ForegroundColor Red + exit 1 + } +} + +# Generate gRPC code if needed +$pb2File = "proto\document_parser_pb2.py" +if (-not (Test-Path $pb2File)) { + Write-Host "Generating gRPC code..." -ForegroundColor Yellow + python generate_grpc.py + if ($LASTEXITCODE -ne 0) { + Write-Host "Error: Failed to generate gRPC code" -ForegroundColor Red + exit 1 + } +} + +# Start the server +Write-Host "Starting server on port 50051..." -ForegroundColor Green +python main.py --port 50051 --max-workers 10 --log-level INFO diff --git a/ai-core/start.sh b/ai-core/start.sh new file mode 100644 index 0000000..6a1e530 --- /dev/null +++ b/ai-core/start.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# AI-Core gRPC Server Startup Script + +echo "Starting AI-Core Document Parser gRPC Server..." + +# Configuration: the first argument overrides the default port +PORT=${1:-50051} + +# Work from the script's own directory so relative paths resolve +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +cd "$SCRIPT_DIR" + +# On Windows use the venv interpreter if present, otherwise the py launcher +if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "win32" || -f "venv/Scripts/python.exe" ]]; then + if [ -f "venv/Scripts/python.exe" ]; then + echo "Using virtual environment Python..." 
+ PYTHON_CMD="$SCRIPT_DIR/venv/Scripts/python.exe" + elif command -v py &> /dev/null; then + echo "Using py launcher..." + PYTHON_CMD="py" + else + echo "Error: Python not found" + exit 1 + fi +else + # Linux/Mac + if [ -d "venv" ]; then + echo "Activating virtual environment..." + source venv/bin/activate + PYTHON_CMD="python" + else + PYTHON_CMD="python3" + fi +fi + +echo "Using Python: $PYTHON_CMD" +$PYTHON_CMD --version + +# Check if requirements are installed (the grpcio distribution is imported as "grpc") +$PYTHON_CMD -c "import grpc" 2>/dev/null +if [ $? -ne 0 ]; then + echo "Installing Python dependencies..." + $PYTHON_CMD -m pip install -r requirements.txt + if [ $? -ne 0 ]; then + echo "Error: Failed to install dependencies" + exit 1 + fi +fi + +# Generate gRPC code if needed +if [ ! -f "proto/document_parser_pb2.py" ]; then + echo "Generating gRPC code..." + $PYTHON_CMD generate_grpc.py + if [ $? -ne 0 ]; then + echo "Error: Failed to generate gRPC code" + exit 1 + fi +fi + +# Check whether the port is in use and, if so, kill the process holding it +echo "Checking port $PORT..." +if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "win32" ]]; then + # Windows + NETSTAT_OUTPUT=$(netstat -ano 2>/dev/null | grep ":$PORT" | grep LISTENING) + if [ -n "$NETSTAT_OUTPUT" ]; then + echo "Port $PORT is in use, killing process..." + PID=$(echo "$NETSTAT_OUTPUT" | awk '{print $NF}' | head -1) + if [ -n "$PID" ]; then + taskkill //F //PID $PID 2>/dev/null + sleep 1 + fi + fi +else + # Linux/Mac + PID=$(lsof -ti:$PORT 2>/dev/null) + if [ -n "$PID" ]; then + echo "Port $PORT is in use, killing process $PID..." + kill $PID 2>/dev/null + sleep 1 + fi +fi + +# Start the server +echo "Starting server on port $PORT..." +$PYTHON_CMD main.py --port $PORT --max-workers 10 --log-level INFO