Compare commits
7 Commits
b8a683ce67
...
ab7131eb05
| Author | SHA1 | Date | |
|---|---|---|---|
| ab7131eb05 | |||
| 6a27451a6e | |||
| 16b6aa0004 | |||
| dc1c825d2e | |||
| 0d4fd6b425 | |||
| 797518ec76 | |||
| f22f823a4a |
50
ai-core/.gitignore
vendored
Normal file
50
ai-core/.gitignore
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Generated gRPC files (optional - uncomment if you want to exclude them)
|
||||
# proto/*_pb2.py
|
||||
# proto/*_pb2_grpc.py
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# Temporary files
|
||||
*.tmp
|
||||
*.bak
|
||||
221
ai-core/README.md
Normal file
221
ai-core/README.md
Normal file
@@ -0,0 +1,221 @@
|
||||
# AI-Core 文档解析服务
|
||||
|
||||
基于 Python 和 Microsoft MarkItDown 的 gRPC 文档解析服务,支持多种文件格式转换为 Markdown。
|
||||
|
||||
## 特性
|
||||
|
||||
- **统一解析引擎** - 使用 Microsoft MarkItDown,一个库支持所有格式
|
||||
- **支持格式广泛** - PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页等
|
||||
- **gRPC 接口** - 高性能、类型安全的 RPC 通信
|
||||
- **依赖简单** - 只需安装 3 个核心包
|
||||
- **易于部署** - 一键启动,开箱即用
|
||||
|
||||
## 项目结构
|
||||
|
||||
```
|
||||
ai-core/
|
||||
├── main.py # 服务启动入口
|
||||
├── requirements.txt # Python 依赖(仅 3 个包)
|
||||
├── generate_grpc.py # gRPC 代码生成脚本
|
||||
├── start.sh # Linux/Mac 启动脚本
|
||||
├── start.ps1 # Windows 启动脚本
|
||||
├── proto/ # gRPC 协议定义
|
||||
│ ├── document_parser.proto # Protocol Buffers 定义
|
||||
│ ├── document_parser_pb2.py # 生成的 Python 代码
|
||||
│ └── document_parser_pb2_grpc.py
|
||||
├── parser/ # 文档解析器模块
|
||||
│ ├── __init__.py
|
||||
│ └── parser.py # MarkItDown 解析器
|
||||
└── service/ # gRPC 服务实现
|
||||
├── __init__.py
|
||||
└── grpc_server.py # gRPC 服务器
|
||||
```
|
||||
|
||||
## 安装
|
||||
|
||||
### 1. 安装依赖
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
依赖包:
|
||||
- `grpcio` - gRPC 框架
|
||||
- `grpcio-tools` - gRPC 工具
|
||||
- `grpcio-reflection` - gRPC 反射
|
||||
- `protobuf` - Protocol Buffers
|
||||
- `requests` - HTTP 请求
|
||||
- `markitdown` - Microsoft 文档解析引擎
|
||||
|
||||
### 2. 生成 gRPC 代码
|
||||
|
||||
```bash
|
||||
python generate_grpc.py
|
||||
```
|
||||
|
||||
这会在 `proto` 目录下生成两个文件:
|
||||
- `document_parser_pb2.py`
|
||||
- `document_parser_pb2_grpc.py`
|
||||
|
||||
## 使用
|
||||
|
||||
### 方式 1: 使用启动脚本(推荐)
|
||||
|
||||
**Windows:**
|
||||
```powershell
|
||||
.\start.ps1
|
||||
```
|
||||
|
||||
**Linux/Mac:**
|
||||
```bash
|
||||
bash start.sh
|
||||
```
|
||||
|
||||
### 方式 2: 直接运行
|
||||
|
||||
```bash
|
||||
python main.py --port 50051 --max-workers 10
|
||||
```
|
||||
|
||||
参数说明:
|
||||
- `--port`: gRPC 服务端口(默认 50051)
|
||||
- `--max-workers`: 最大工作线程数(默认 10)
|
||||
- `--log-level`: 日志级别(DEBUG/INFO/WARNING/ERROR,默认 INFO)
|
||||
|
||||
## gRPC 接口
|
||||
|
||||
### ParseDocument
|
||||
|
||||
解析文档为 Markdown
|
||||
|
||||
```protobuf
|
||||
message ParseRequest {
|
||||
string file_url = 1; // 文件 URL(必填)
|
||||
string file_name = 2; // 文件名(必填)
|
||||
string file_type = 3; // 文件类型(可选)
|
||||
string parser_engine = 4; // 解析引擎(可选)
|
||||
map<string, string> engine_overrides = 5;// 引擎参数覆盖(可选)
|
||||
}
|
||||
|
||||
message ParseResponse {
|
||||
bool success = 1; // 是否成功
|
||||
string content = 2; // Markdown 内容
|
||||
string message = 3; // 消息
|
||||
int32 content_length = 4; // 内容长度
|
||||
string file_type = 5; // 文件类型
|
||||
string parser_engine = 6; // 使用的解析引擎
|
||||
}
|
||||
```
|
||||
|
||||
### GetSupportedFormats
|
||||
|
||||
获取支持的文件格式列表
|
||||
|
||||
### GetEngines
|
||||
|
||||
获取可用的解析引擎列表
|
||||
|
||||
## Go 客户端调用示例
|
||||
|
||||
```go
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
||||
func main() {
|
||||
conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to connect: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
client := docparser.NewDocumentParserClient(conn)
|
||||
|
||||
resp, err := client.ParseDocument(context.Background(), &docparser.ParseRequest{
|
||||
FileUrl: "http://localhost:8082/files/abc123.pdf",
|
||||
FileName: "example.pdf",
|
||||
FileType: "pdf",
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to parse: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Success: %v", resp.Success)
|
||||
log.Printf("Content length: %d", resp.ContentLength)
|
||||
log.Printf("Markdown content:\n%s", resp.Content)
|
||||
}
|
||||
```
|
||||
|
||||
## 支持的文件格式
|
||||
|
||||
| 类别 | 支持的扩展名 |
|
||||
|------|-------------|
|
||||
| **文档** | pdf, docx, doc, pptx, ppt |
|
||||
| **表格** | xlsx, xls, csv |
|
||||
| **文本** | md, markdown, txt |
|
||||
| **图片** | jpg, jpeg, png, gif, bmp, tiff, webp |
|
||||
| **网页** | html, htm |
|
||||
|
||||
## 为什么选择 MarkItDown?
|
||||
|
||||
1. **微软官方支持** - Microsoft 开发,持续维护
|
||||
2. **格式覆盖全** - 一个库支持所有常见格式
|
||||
3. **统一接口** - 无需为每种格式单独实现
|
||||
4. **安装简单** - 只需 `pip install markitdown`
|
||||
5. **性能优秀** - 基于优化的解析算法
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 端口已被占用
|
||||
|
||||
如果提示端口 50051 已被占用,可以更换端口:
|
||||
|
||||
```bash
|
||||
python main.py --port 50052
|
||||
```
|
||||
|
||||
### gRPC 代码未生成
|
||||
|
||||
如果提示找不到 `docparser_pb2`,运行:
|
||||
|
||||
```bash
|
||||
python generate_grpc.py
|
||||
```
|
||||
|
||||
### 依赖安装失败
|
||||
|
||||
确保使用 Python 3.8+:
|
||||
|
||||
```bash
|
||||
python --version
|
||||
pip --version
|
||||
```
|
||||
|
||||
## 开发
|
||||
|
||||
### 测试解析器
|
||||
|
||||
```python
|
||||
from parser import Parser
|
||||
|
||||
parser = Parser()
|
||||
|
||||
# 解析文件
|
||||
result = parser.parse("path/to/file.pdf")
|
||||
print(result["content"])
|
||||
|
||||
# 解析字节内容
|
||||
with open("file.pdf", "rb") as f:
|
||||
content = f.read()
|
||||
result = parser.parse_bytes(content, "file.pdf")
|
||||
print(result["content"])
|
||||
```
|
||||
|
||||
## 许可证
|
||||
|
||||
MIT License
|
||||
46
ai-core/generate_grpc.py
Normal file
46
ai-core/generate_grpc.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
|
||||
proto_file = "proto/document_parser.proto"
|
||||
proto_path = "proto"
|
||||
python_out = "proto"
|
||||
grpc_python_out = "proto"
|
||||
|
||||
def generate_grpc():
|
||||
"""Generate gRPC Python code from proto file"""
|
||||
print(f"Generating gRPC code from {proto_file}...")
|
||||
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"grpc_tools.protoc",
|
||||
f"--proto_path={proto_path}",
|
||||
f"--python_out={python_out}",
|
||||
f"--grpc_python_out={grpc_python_out}",
|
||||
proto_file,
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True)
|
||||
print("gRPC code generated successfully!")
|
||||
|
||||
pb2_file = os.path.join(python_out, "document_parser_pb2.py")
|
||||
pb2_grpc_file = os.path.join(python_out, "document_parser_pb2_grpc.py")
|
||||
|
||||
if os.path.exists(pb2_file) and os.path.exists(pb2_grpc_file):
|
||||
print(f"Generated files:")
|
||||
print(f" - {pb2_file}")
|
||||
print(f" - {pb2_grpc_file}")
|
||||
else:
|
||||
print("Warning: Expected files not found")
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error generating gRPC code: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Unexpected error: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
generate_grpc()
|
||||
59
ai-core/main.py
Normal file
59
ai-core/main.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from service.grpc_server import serve
|
||||
|
||||
DEFAULT_PORT = 50051
|
||||
DEFAULT_MAX_WORKERS = 10
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Document Parser gRPC Server (MarkItDown)",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=DEFAULT_PORT,
|
||||
help="Port to listen on",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-workers",
|
||||
type=int,
|
||||
default=DEFAULT_MAX_WORKERS,
|
||||
help="Maximum number of worker threads",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-level",
|
||||
type=str,
|
||||
default="INFO",
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Log level",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info("Starting Document Parser gRPC Server (MarkItDown)")
|
||||
logger.info("Port: %d", args.port)
|
||||
logger.info("Max workers: %d", args.max_workers)
|
||||
|
||||
try:
|
||||
serve(port=args.port, max_workers=args.max_workers)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Server shutdown requested")
|
||||
except Exception as e:
|
||||
logger.error("Server error: %s", str(e), exc_info=True)
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
9
ai-core/parser/__init__.py
Normal file
9
ai-core/parser/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
Parser module for AI-Core document processing system.
|
||||
|
||||
This module provides document parsing using Microsoft MarkItDown.
|
||||
"""
|
||||
|
||||
from .parser import Parser
|
||||
|
||||
__all__ = ["Parser"]
|
||||
100
ai-core/parser/parser.py
Normal file
100
ai-core/parser/parser.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
from markitdown import MarkItDown
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Parser:
|
||||
"""基于 MarkItDown 的统一文档解析器
|
||||
|
||||
支持格式:PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页、Markdown 等
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.markitdown = MarkItDown()
|
||||
logger.info("Parser initialized with MarkItDown")
|
||||
|
||||
def parse(self, file_path: str, file_type: Optional[str] = None) -> dict:
|
||||
"""解析文档为 Markdown
|
||||
|
||||
Args:
|
||||
file_path: 文件路径或 URL
|
||||
file_type: 文件类型(可选,MarkItDown 会自动检测)
|
||||
|
||||
Returns:
|
||||
dict: 包含 markdown 内容和元数据
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Parsing file: {file_path}")
|
||||
|
||||
result = self.markitdown.convert(file_path)
|
||||
|
||||
logger.info(f"Parse successful: {len(result.text_content)} characters")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"content": result.text_content,
|
||||
"content_length": len(result.text_content),
|
||||
"metadata": result.metadata if hasattr(result, 'metadata') else {}
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Parse error: {e}", exc_info=True)
|
||||
return {
|
||||
"success": False,
|
||||
"content": "",
|
||||
"content_length": 0,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None) -> dict:
|
||||
"""解析字节内容为 Markdown
|
||||
|
||||
Args:
|
||||
content: 文件字节内容
|
||||
file_name: 文件名
|
||||
file_type: 文件类型(可选)
|
||||
|
||||
Returns:
|
||||
dict: 包含 markdown 内容和元数据
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes")
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1] or '') as temp_file:
|
||||
temp_file.write(content)
|
||||
temp_path = temp_file.name
|
||||
|
||||
try:
|
||||
result = self.markitdown.convert(temp_path)
|
||||
|
||||
logger.info(f"Parse successful: {len(result.text_content)} characters")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"content": result.text_content,
|
||||
"content_length": len(result.text_content),
|
||||
"metadata": result.metadata if hasattr(result, 'metadata') else {}
|
||||
}
|
||||
finally:
|
||||
os.unlink(temp_path)
|
||||
except Exception as e:
|
||||
logger.error(f"Parse bytes error: {e}", exc_info=True)
|
||||
return {
|
||||
"success": False,
|
||||
"content": "",
|
||||
"content_length": 0,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = Parser()
|
||||
|
||||
# 测试
|
||||
test_url = "https://example.com"
|
||||
result = parser.parse(test_url)
|
||||
print(f"Success: {result['success']}")
|
||||
print(f"Content length: {result['content_length']}")
|
||||
47
ai-core/proto/document_parser.proto
Normal file
47
ai-core/proto/document_parser.proto
Normal file
@@ -0,0 +1,47 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package docparser;
|
||||
|
||||
option go_package = "x-agents/proto/docparser";
|
||||
|
||||
service DocumentParser {
|
||||
rpc ParseDocument(ParseRequest) returns (ParseResponse);
|
||||
rpc GetSupportedFormats(Empty) returns (SupportedFormatsResponse);
|
||||
rpc GetEngines(Empty) returns (EnginesResponse);
|
||||
}
|
||||
|
||||
message ParseRequest {
|
||||
string file_url = 1;
|
||||
string file_name = 2;
|
||||
string file_type = 3;
|
||||
string parser_engine = 4;
|
||||
map<string, string> engine_overrides = 5;
|
||||
}
|
||||
|
||||
message ParseResponse {
|
||||
bool success = 1;
|
||||
string content = 2;
|
||||
string message = 3;
|
||||
int32 content_length = 4;
|
||||
string file_type = 5;
|
||||
string parser_engine = 6;
|
||||
}
|
||||
|
||||
message Empty {}
|
||||
|
||||
message SupportedFormatsResponse {
|
||||
repeated string file_types = 1;
|
||||
map<string, string> file_type_descriptions = 2;
|
||||
}
|
||||
|
||||
message EnginesResponse {
|
||||
repeated EngineInfo engines = 1;
|
||||
}
|
||||
|
||||
message EngineInfo {
|
||||
string name = 1;
|
||||
string description = 2;
|
||||
repeated string supported_file_types = 3;
|
||||
bool available = 4;
|
||||
string unavailable_reason = 5;
|
||||
}
|
||||
57
ai-core/proto/document_parser_pb2.py
Normal file
57
ai-core/proto/document_parser_pb2.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# NO CHECKED-IN PROTOBUF GENCODE
|
||||
# source: document_parser.proto
|
||||
# Protobuf Python Version: 6.31.1
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import runtime_version as _runtime_version
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
_runtime_version.ValidateProtobufRuntimeVersion(
|
||||
_runtime_version.Domain.PUBLIC,
|
||||
6,
|
||||
31,
|
||||
1,
|
||||
'',
|
||||
'document_parser.proto'
|
||||
)
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x64ocument_parser.proto\x12\tdocparser\"\xdd\x01\n\x0cParseRequest\x12\x10\n\x08\x66ile_url\x18\x01 \x01(\t\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12\x15\n\rparser_engine\x18\x04 \x01(\t\x12\x46\n\x10\x65ngine_overrides\x18\x05 \x03(\x0b\x32,.docparser.ParseRequest.EngineOverridesEntry\x1a\x36\n\x14\x45ngineOverridesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x84\x01\n\rParseResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\x12\x16\n\x0e\x63ontent_length\x18\x04 \x01(\x05\x12\x11\n\tfile_type\x18\x05 \x01(\t\x12\x15\n\rparser_engine\x18\x06 \x01(\t\"\x07\n\x05\x45mpty\"\xca\x01\n\x18SupportedFormatsResponse\x12\x12\n\nfile_types\x18\x01 \x03(\t\x12]\n\x16\x66ile_type_descriptions\x18\x02 \x03(\x0b\x32=.docparser.SupportedFormatsResponse.FileTypeDescriptionsEntry\x1a;\n\x19\x46ileTypeDescriptionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"9\n\x0f\x45nginesResponse\x12&\n\x07\x65ngines\x18\x01 \x03(\x0b\x32\x15.docparser.EngineInfo\"|\n\nEngineInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x1c\n\x14supported_file_types\x18\x03 \x03(\t\x12\x11\n\tavailable\x18\x04 \x01(\x08\x12\x1a\n\x12unavailable_reason\x18\x05 \x01(\t2\xde\x01\n\x0e\x44ocumentParser\x12\x42\n\rParseDocument\x12\x17.docparser.ParseRequest\x1a\x18.docparser.ParseResponse\x12L\n\x13GetSupportedFormats\x12\x10.docparser.Empty\x1a#.docparser.SupportedFormatsResponse\x12:\n\nGetEngines\x12\x10.docparser.Empty\x1a\x1a.docparser.EnginesResponseB\x1aZ\x18x-agents/proto/docparserb\x06proto3')
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'document_parser_pb2', _globals)
|
||||
if not _descriptor._USE_C_DESCRIPTORS:
|
||||
_globals['DESCRIPTOR']._loaded_options = None
|
||||
_globals['DESCRIPTOR']._serialized_options = b'Z\030x-agents/proto/docparser'
|
||||
_globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._loaded_options = None
|
||||
_globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._serialized_options = b'8\001'
|
||||
_globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._loaded_options = None
|
||||
_globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._serialized_options = b'8\001'
|
||||
_globals['_PARSEREQUEST']._serialized_start=37
|
||||
_globals['_PARSEREQUEST']._serialized_end=258
|
||||
_globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._serialized_start=204
|
||||
_globals['_PARSEREQUEST_ENGINEOVERRIDESENTRY']._serialized_end=258
|
||||
_globals['_PARSERESPONSE']._serialized_start=261
|
||||
_globals['_PARSERESPONSE']._serialized_end=393
|
||||
_globals['_EMPTY']._serialized_start=395
|
||||
_globals['_EMPTY']._serialized_end=402
|
||||
_globals['_SUPPORTEDFORMATSRESPONSE']._serialized_start=405
|
||||
_globals['_SUPPORTEDFORMATSRESPONSE']._serialized_end=607
|
||||
_globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._serialized_start=548
|
||||
_globals['_SUPPORTEDFORMATSRESPONSE_FILETYPEDESCRIPTIONSENTRY']._serialized_end=607
|
||||
_globals['_ENGINESRESPONSE']._serialized_start=609
|
||||
_globals['_ENGINESRESPONSE']._serialized_end=666
|
||||
_globals['_ENGINEINFO']._serialized_start=668
|
||||
_globals['_ENGINEINFO']._serialized_end=792
|
||||
_globals['_DOCUMENTPARSER']._serialized_start=795
|
||||
_globals['_DOCUMENTPARSER']._serialized_end=1017
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
183
ai-core/proto/document_parser_pb2_grpc.py
Normal file
183
ai-core/proto/document_parser_pb2_grpc.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||
"""Client and server classes corresponding to protobuf-defined services."""
|
||||
import grpc
|
||||
import warnings
|
||||
|
||||
import document_parser_pb2 as document__parser__pb2
|
||||
|
||||
GRPC_GENERATED_VERSION = '1.78.0'
|
||||
GRPC_VERSION = grpc.__version__
|
||||
_version_not_supported = False
|
||||
|
||||
try:
|
||||
from grpc._utilities import first_version_is_lower
|
||||
_version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
|
||||
except ImportError:
|
||||
_version_not_supported = True
|
||||
|
||||
if _version_not_supported:
|
||||
raise RuntimeError(
|
||||
f'The grpc package installed is at version {GRPC_VERSION},'
|
||||
+ ' but the generated code in document_parser_pb2_grpc.py depends on'
|
||||
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
|
||||
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
|
||||
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
|
||||
)
|
||||
|
||||
|
||||
class DocumentParserStub(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
def __init__(self, channel):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
channel: A grpc.Channel.
|
||||
"""
|
||||
self.ParseDocument = channel.unary_unary(
|
||||
'/docparser.DocumentParser/ParseDocument',
|
||||
request_serializer=document__parser__pb2.ParseRequest.SerializeToString,
|
||||
response_deserializer=document__parser__pb2.ParseResponse.FromString,
|
||||
_registered_method=True)
|
||||
self.GetSupportedFormats = channel.unary_unary(
|
||||
'/docparser.DocumentParser/GetSupportedFormats',
|
||||
request_serializer=document__parser__pb2.Empty.SerializeToString,
|
||||
response_deserializer=document__parser__pb2.SupportedFormatsResponse.FromString,
|
||||
_registered_method=True)
|
||||
self.GetEngines = channel.unary_unary(
|
||||
'/docparser.DocumentParser/GetEngines',
|
||||
request_serializer=document__parser__pb2.Empty.SerializeToString,
|
||||
response_deserializer=document__parser__pb2.EnginesResponse.FromString,
|
||||
_registered_method=True)
|
||||
|
||||
|
||||
class DocumentParserServicer(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
def ParseDocument(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def GetSupportedFormats(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def GetEngines(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
|
||||
def add_DocumentParserServicer_to_server(servicer, server):
|
||||
rpc_method_handlers = {
|
||||
'ParseDocument': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.ParseDocument,
|
||||
request_deserializer=document__parser__pb2.ParseRequest.FromString,
|
||||
response_serializer=document__parser__pb2.ParseResponse.SerializeToString,
|
||||
),
|
||||
'GetSupportedFormats': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.GetSupportedFormats,
|
||||
request_deserializer=document__parser__pb2.Empty.FromString,
|
||||
response_serializer=document__parser__pb2.SupportedFormatsResponse.SerializeToString,
|
||||
),
|
||||
'GetEngines': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.GetEngines,
|
||||
request_deserializer=document__parser__pb2.Empty.FromString,
|
||||
response_serializer=document__parser__pb2.EnginesResponse.SerializeToString,
|
||||
),
|
||||
}
|
||||
generic_handler = grpc.method_handlers_generic_handler(
|
||||
'docparser.DocumentParser', rpc_method_handlers)
|
||||
server.add_generic_rpc_handlers((generic_handler,))
|
||||
server.add_registered_method_handlers('docparser.DocumentParser', rpc_method_handlers)
|
||||
|
||||
|
||||
# This class is part of an EXPERIMENTAL API.
|
||||
class DocumentParser(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
@staticmethod
|
||||
def ParseDocument(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(
|
||||
request,
|
||||
target,
|
||||
'/docparser.DocumentParser/ParseDocument',
|
||||
document__parser__pb2.ParseRequest.SerializeToString,
|
||||
document__parser__pb2.ParseResponse.FromString,
|
||||
options,
|
||||
channel_credentials,
|
||||
insecure,
|
||||
call_credentials,
|
||||
compression,
|
||||
wait_for_ready,
|
||||
timeout,
|
||||
metadata,
|
||||
_registered_method=True)
|
||||
|
||||
@staticmethod
|
||||
def GetSupportedFormats(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(
|
||||
request,
|
||||
target,
|
||||
'/docparser.DocumentParser/GetSupportedFormats',
|
||||
document__parser__pb2.Empty.SerializeToString,
|
||||
document__parser__pb2.SupportedFormatsResponse.FromString,
|
||||
options,
|
||||
channel_credentials,
|
||||
insecure,
|
||||
call_credentials,
|
||||
compression,
|
||||
wait_for_ready,
|
||||
timeout,
|
||||
metadata,
|
||||
_registered_method=True)
|
||||
|
||||
@staticmethod
|
||||
def GetEngines(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(
|
||||
request,
|
||||
target,
|
||||
'/docparser.DocumentParser/GetEngines',
|
||||
document__parser__pb2.Empty.SerializeToString,
|
||||
document__parser__pb2.EnginesResponse.FromString,
|
||||
options,
|
||||
channel_credentials,
|
||||
insecure,
|
||||
call_credentials,
|
||||
compression,
|
||||
wait_for_ready,
|
||||
timeout,
|
||||
metadata,
|
||||
_registered_method=True)
|
||||
13
ai-core/requirements.txt
Normal file
13
ai-core/requirements.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
# AI-Core Document Parser - 基于 MarkItDown
|
||||
|
||||
# gRPC 框架
|
||||
grpcio>=1.60.0
|
||||
grpcio-tools>=1.60.0
|
||||
grpcio-reflection>=1.60.0
|
||||
protobuf>=4.25.0
|
||||
|
||||
# HTTP 请求
|
||||
requests>=2.31.0
|
||||
|
||||
# 文档解析
|
||||
markitdown>=0.0.1
|
||||
0
ai-core/service/__init__.py
Normal file
0
ai-core/service/__init__.py
Normal file
244
ai-core/service/grpc_server.py
Normal file
244
ai-core/service/grpc_server.py
Normal file
@@ -0,0 +1,244 @@
|
||||
import logging
|
||||
import requests
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
from grpc_reflection.v1alpha import reflection
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "proto"))
|
||||
|
||||
from parser.parser import Parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
docparser_pb2 = None
|
||||
docparser_pb2_grpc = None
|
||||
|
||||
def _import_grpc_protobuf():
|
||||
"""Import gRPC protobuf modules"""
|
||||
global docparser_pb2, docparser_pb2_grpc
|
||||
if docparser_pb2 is not None and docparser_pb2_grpc is not None:
|
||||
return
|
||||
|
||||
try:
|
||||
import document_parser_pb2 as dpb2
|
||||
import document_parser_pb2_grpc as dpb2_grpc
|
||||
docparser_pb2 = dpb2
|
||||
docparser_pb2_grpc = dpb2_grpc
|
||||
logger.info("Successfully imported gRPC protobuf modules")
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import gRPC protobuf: {e}")
|
||||
raise ImportError(
|
||||
"gRPC protobuf files not found. Please run: python generate_grpc.py"
|
||||
) from e
|
||||
|
||||
|
||||
class DocumentParserServicer:
|
||||
"""gRPC 服务实现,使用 MarkItDown"""
|
||||
|
||||
def __init__(self, max_workers: int = 10):
|
||||
_import_grpc_protobuf()
|
||||
self.parser = Parser()
|
||||
self.max_workers = max_workers
|
||||
logger.info("DocumentParserServicer initialized")
|
||||
|
||||
def ParseDocument(self, request, context):
|
||||
"""解析文档"""
|
||||
try:
|
||||
logger.info(
|
||||
"ParseDocument request: file_url=%s, file_name=%s, file_type=%s",
|
||||
request.file_url,
|
||||
request.file_name,
|
||||
request.file_type,
|
||||
)
|
||||
|
||||
file_url = request.file_url
|
||||
file_name = request.file_name
|
||||
|
||||
if not file_url:
|
||||
return docparser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message="file_url is required",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
if not file_name:
|
||||
return docparser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message="file_name is required",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
logger.info("Downloading file from URL: %s", file_url)
|
||||
|
||||
try:
|
||||
response = requests.get(
|
||||
file_url,
|
||||
timeout=60,
|
||||
headers={"User-Agent": "DocParser/1.0"},
|
||||
)
|
||||
response.raise_for_status()
|
||||
content = response.content
|
||||
logger.info("Downloaded %d bytes", len(content))
|
||||
except requests.RequestException as e:
|
||||
logger.error("Failed to download file: %s", str(e))
|
||||
return docparser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message=f"Failed to download file: {str(e)}",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
logger.info("Parsing file with MarkItDown")
|
||||
|
||||
result = self.parser.parse_bytes(content, file_name)
|
||||
|
||||
if not result.get("success", False):
|
||||
logger.warning("Parser returned failure: %s", result.get("error", "Unknown error"))
|
||||
return docparser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message=result.get("error", "Parse failed"),
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
markdown_content = result.get("content", "")
|
||||
logger.info(
|
||||
"Parse successful: content_length=%d",
|
||||
len(markdown_content),
|
||||
)
|
||||
|
||||
return docparser_pb2.ParseResponse(
|
||||
success=True,
|
||||
content=markdown_content,
|
||||
message="Parse successful",
|
||||
content_length=len(markdown_content),
|
||||
file_type=request.file_type or "auto",
|
||||
parser_engine="markitdown",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("ParseDocument error: %s", str(e), exc_info=True)
|
||||
return docparser_pb2.ParseResponse(
|
||||
success=False,
|
||||
content="",
|
||||
message=f"Parse error: {str(e)}",
|
||||
content_length=0,
|
||||
)
|
||||
|
||||
def GetSupportedFormats(self, request, context):
|
||||
"""获取支持的文件格式"""
|
||||
try:
|
||||
logger.info("GetSupportedFormats request")
|
||||
|
||||
file_types = [
|
||||
"pdf", "docx", "doc", "pptx", "ppt",
|
||||
"xlsx", "xls", "csv",
|
||||
"md", "markdown",
|
||||
"jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp",
|
||||
"html", "htm", "txt",
|
||||
]
|
||||
|
||||
file_type_descriptions = {
|
||||
"pdf": "PDF Document",
|
||||
"docx": "Microsoft Word Document",
|
||||
"doc": "Microsoft Word Document (Legacy)",
|
||||
"pptx": "Microsoft PowerPoint Presentation",
|
||||
"ppt": "Microsoft PowerPoint Presentation (Legacy)",
|
||||
"xlsx": "Microsoft Excel Spreadsheet",
|
||||
"xls": "Microsoft Excel Spreadsheet (Legacy)",
|
||||
"csv": "Comma-Separated Values",
|
||||
"md": "Markdown File",
|
||||
"markdown": "Markdown File",
|
||||
"jpg": "JPEG Image",
|
||||
"jpeg": "JPEG Image",
|
||||
"png": "PNG Image",
|
||||
"gif": "GIF Image",
|
||||
"bmp": "BMP Image",
|
||||
"tiff": "TIFF Image",
|
||||
"webp": "WebP Image",
|
||||
"html": "HTML Document",
|
||||
"htm": "HTML Document",
|
||||
"txt": "Plain Text File",
|
||||
}
|
||||
|
||||
return docparser_pb2.SupportedFormatsResponse(
|
||||
file_types=file_types,
|
||||
file_type_descriptions=file_type_descriptions,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("GetSupportedFormats error: %s", str(e), exc_info=True)
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return docparser_pb2.SupportedFormatsResponse()
|
||||
|
||||
def GetEngines(self, request, context):
|
||||
"""获取可用的解析引擎列表"""
|
||||
try:
|
||||
logger.info("GetEngines request")
|
||||
|
||||
engine_info = docparser_pb2.EngineInfo(
|
||||
name="markitdown",
|
||||
description="Microsoft MarkItDown - 统一文档解析引擎",
|
||||
supported_file_types=[
|
||||
"pdf", "docx", "doc", "pptx", "ppt",
|
||||
"xlsx", "xls", "csv",
|
||||
"md", "markdown",
|
||||
"jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp",
|
||||
"html", "htm", "txt",
|
||||
],
|
||||
available=True,
|
||||
unavailable_reason="",
|
||||
)
|
||||
|
||||
return docparser_pb2.EnginesResponse(engines=[engine_info])
|
||||
except Exception as e:
|
||||
logger.error("GetEngines error: %s", str(e), exc_info=True)
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return docparser_pb2.EnginesResponse()
|
||||
|
||||
|
||||
def serve(port: int = 50051, max_workers: int = 10):
|
||||
"""启动 gRPC 服务"""
|
||||
_import_grpc_protobuf()
|
||||
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
|
||||
|
||||
servicer = DocumentParserServicer(max_workers=max_workers)
|
||||
docparser_pb2_grpc.add_DocumentParserServicer_to_server(servicer, server)
|
||||
|
||||
reflection.enable_server_reflection(
|
||||
service_names=[
|
||||
docparser_pb2.DESCRIPTOR.services_by_name["DocumentParser"].full_name,
|
||||
reflection.SERVICE_NAME,
|
||||
],
|
||||
server=server,
|
||||
)
|
||||
|
||||
server.add_insecure_port(f"0.0.0.0:{port}")
|
||||
server.start()
|
||||
|
||||
logger.info("DocumentParser gRPC server (MarkItDown) started on port %d", port)
|
||||
logger.info("gRPC reflection enabled")
|
||||
|
||||
try:
|
||||
server.wait_for_termination()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Shutting down server...")
|
||||
server.stop(0)
|
||||
logger.info("Server stopped")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
serve()
|
||||
36
ai-core/start.bat
Normal file
36
ai-core/start.bat
Normal file
@@ -0,0 +1,36 @@
|
||||
@echo off
|
||||
chcp 65001 >nul
|
||||
echo Starting AI-Core Document Parser gRPC Server...
|
||||
|
||||
set PORT=50051
|
||||
|
||||
echo Checking and cleaning up port %PORT%...
|
||||
for /f "tokens=5" %%a in ('netstat -ano ^| findstr :%PORT% ^| findstr LISTENING') do (
|
||||
echo Killing process %%a on port %PORT%...
|
||||
taskkill /F /PID %%a 2>nul
|
||||
)
|
||||
timeout /t 2 /nobreak >nul
|
||||
|
||||
cd /d %~dp0
|
||||
|
||||
echo Using virtual environment Python...
|
||||
if exist "venv\Scripts\python.exe" (
|
||||
set PYTHON_CMD=%~dp0venv\Scripts\python.exe
|
||||
) else (
|
||||
set PYTHON_CMD=py
|
||||
)
|
||||
|
||||
echo Using Python: %PYTHON_CMD%
|
||||
%PYTHON_CMD% --version
|
||||
|
||||
echo Checking port %PORT%...
|
||||
%PYTHON_CMD% -c "import socket; s=socket.socket(); s.settimeout(1); r=s.connect_ex(('127.0.0.1',%PORT%)); s.close(); exit(0 if r!=0 else 1)" 2>nul
|
||||
if %ERRORLEVEL% NEQ 0 (
|
||||
echo Port %PORT% is free, starting server...
|
||||
) else (
|
||||
echo Port %PORT% is still in use, please check manually
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
echo Starting server on port %PORT%...
|
||||
%PYTHON_CMD% main.py --port %PORT% --max-workers 10 --log-level INFO
|
||||
110
ai-core/start.sh
Normal file
110
ai-core/start.sh
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/bin/bash
|
||||
|
||||
# AI-Core gRPC Server Startup Script
|
||||
|
||||
echo "Starting AI-Core Document Parser gRPC Server..."
|
||||
|
||||
# 配置
|
||||
PORT=${1:-50051}
|
||||
|
||||
# 使用虚拟环境
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
cd "$SCRIPT_DIR"
|
||||
|
||||
# Windows 下使用 PowerShell 的 py 命令或者直接用 venv
|
||||
if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "win32" || -f "venv/Scripts/python.exe" ]]; then
|
||||
if [ -f "venv/Scripts/python.exe" ]; then
|
||||
echo "Using virtual environment Python..."
|
||||
PYTHON_CMD="$SCRIPT_DIR/venv/Scripts/python.exe"
|
||||
elif command -v py &> /dev/null; then
|
||||
echo "Using py launcher..."
|
||||
PYTHON_CMD="py"
|
||||
else
|
||||
echo "Error: Python not found"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
# Linux/Mac
|
||||
if [ -d "venv" ]; then
|
||||
echo "Activating virtual environment..."
|
||||
source venv/bin/activate
|
||||
PYTHON_CMD="python"
|
||||
else
|
||||
PYTHON_CMD="python3"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Using Python: $PYTHON_CMD"
|
||||
$PYTHON_CMD --version
|
||||
|
||||
# Check if requirements are installed
|
||||
$PYTHON_CMD -c "import grpcio" 2>/dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Installing Python dependencies..."
|
||||
$PYTHON_CMD -m pip install -r requirements.txt
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error: Failed to install dependencies"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate gRPC code if needed
|
||||
if [ ! -f "proto/document_parser_pb2.py" ]; then
|
||||
echo "Generating gRPC code..."
|
||||
$PYTHON_CMD generate_grpc.py
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error: Failed to generate gRPC code"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# 用 Python 来检测和杀死占用端口的进程(跨平台更可靠)
|
||||
echo "Checking and cleaning up port $PORT..."
|
||||
|
||||
# 先尝试直接用 Windows 命令杀死(更可靠)
|
||||
if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "win32" || "$(uname)" == "MINGW"* ]]; then
|
||||
# 直接用 cmd /c 执行
|
||||
cmd //c "for /f \"tokens=5\" %a in ('netstat -ano ^| findstr :$PORT ^| findstr LISTENING') do taskkill /F /PID %a"
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
# 再用 Python 检测
|
||||
$PYTHON_CMD -c "
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import os
|
||||
|
||||
port = $PORT
|
||||
print(f'Checking port {port}...')
|
||||
|
||||
# 检查端口是否被占用
|
||||
try:
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
s.settimeout(1)
|
||||
result = s.connect_ex(('127.0.0.1', port))
|
||||
s.close()
|
||||
if result != 0:
|
||||
print(f'Port {port} is free (not listening)')
|
||||
else:
|
||||
print(f'Port {port} is still in use!')
|
||||
# 尝试杀死
|
||||
try:
|
||||
result = subprocess.run(['netstat', '-ano'], capture_output=True, text=True, shell=True)
|
||||
for line in result.stdout.split('\n'):
|
||||
if f':{port}' in line and 'LISTENING' in line:
|
||||
parts = line.split()
|
||||
pid = parts[-1]
|
||||
print(f'Found process {pid}, killing...')
|
||||
os.system(f'taskkill /F /PID {pid}')
|
||||
time.sleep(2)
|
||||
except Exception as e:
|
||||
print(f'Error: {e}')
|
||||
except Exception as e:
|
||||
print(f'Check error: {e}')
|
||||
"
|
||||
|
||||
# Start the server
|
||||
echo "Starting server on port $PORT..."
|
||||
$PYTHON_CMD main.py --port $PORT --max-workers 10 --log-level INFO
|
||||
@@ -1,112 +0,0 @@
|
||||
# Algorithm Service
|
||||
|
||||
Python 算法服务,提供文档解析、Embedding、LLM 调用等功能。
|
||||
|
||||
## 环境要求
|
||||
|
||||
- Python 3.9+
|
||||
- FastAPI
|
||||
- Uvicorn
|
||||
|
||||
## 安装依赖
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 运行服务
|
||||
|
||||
```bash
|
||||
# 开发模式
|
||||
uvicorn main:app --reload --port 8081
|
||||
|
||||
# 生产模式
|
||||
uvicorn main:app --host 0.0.0.0 --port 8081
|
||||
```
|
||||
|
||||
## 接口列表
|
||||
|
||||
### 1. 文档解析
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /parse
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| file_url | String | 是 | 文件 URL |
|
||||
| engine | String | 是 | 解析引擎:markitdown / docling |
|
||||
| docling_url | String | 否 | Docling 服务 URL |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"content": "解析后的文本内容...",
|
||||
"chunks": ["chunk1", "chunk2"],
|
||||
"total_pages": 10,
|
||||
"metadata": {
|
||||
"filename": "document.pdf",
|
||||
"file_size": 1234567
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. 生成 Embedding
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /embedding
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| input | String/Array | 是 | 要 embedding 的文本 |
|
||||
| model | String | 是 | 模型名称 |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
|
||||
"model": "text-embedding-3-small"
|
||||
}
|
||||
```
|
||||
|
||||
### 3. LLM 对话
|
||||
|
||||
**请求**
|
||||
|
||||
```
|
||||
POST /chat
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| messages | Array | 是 | 消息列表 |
|
||||
| model | String | 是 | 模型名称 |
|
||||
| temperature | Float | 否 | 温度参数 |
|
||||
|
||||
**响应**
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "回复内容..."
|
||||
},
|
||||
"usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -1,175 +0,0 @@
|
||||
"""
|
||||
Algorithm Service - 文档解析、Embedding、LLM 调用服务
|
||||
"""
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict, Any
|
||||
import requests
|
||||
import os
|
||||
import json
|
||||
|
||||
app = FastAPI(title="Algorithm Service")
|
||||
|
||||
|
||||
# ========== Models ==========
|
||||
|
||||
class ParseRequest(BaseModel):
|
||||
file_url: str
|
||||
engine: str # markitdown / docling
|
||||
docling_url: Optional[str] = None
|
||||
|
||||
|
||||
class EmbeddingRequest(BaseModel):
|
||||
input: str | List[str]
|
||||
model: str
|
||||
|
||||
|
||||
class ChatMessage(BaseModel):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class ChatRequest(BaseModel):
|
||||
messages: List[ChatMessage]
|
||||
model: str
|
||||
temperature: Optional[float] = 0.7
|
||||
api_key: Optional[str] = None
|
||||
base_url: Optional[str] = None
|
||||
|
||||
|
||||
# ========== 文档解析 ==========
|
||||
|
||||
@app.post("/parse")
|
||||
async def parse_document(req: ParseRequest):
|
||||
"""解析文档,支持 markitdown 和 docling"""
|
||||
try:
|
||||
if req.engine == "markitdown":
|
||||
return await parse_with_markitdown(req.file_url)
|
||||
elif req.engine == "docling":
|
||||
return await parse_with_docling(req.file_url, req.docling_url)
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail=f"Unsupported engine: {req.engine}")
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
async def parse_with_markitdown(file_url: str) -> Dict[str, Any]:
|
||||
"""使用 markitdown 解析文档"""
|
||||
try:
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert(file_url)
|
||||
|
||||
# 简单分块(按段落分割)
|
||||
content = result.text_content if hasattr(result, 'text_content') else str(result)
|
||||
chunks = [c.strip() for c in content.split('\n\n') if c.strip()]
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"content": content,
|
||||
"chunks": chunks[:100], # 限制 chunk 数量
|
||||
"total_pages": 1,
|
||||
"metadata": {
|
||||
"filename": file_url.split('/')[-1]
|
||||
}
|
||||
}
|
||||
except ImportError:
|
||||
raise HTTPException(status_code=500, detail="markitdown not installed. Run: pip install markitdown")
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to parse with markitdown: {str(e)}")
|
||||
|
||||
|
||||
async def parse_with_docling(file_url: str, docling_url: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""使用 docling 解析文档"""
|
||||
if not docling_url:
|
||||
raise HTTPException(status_code=400, detail="docling_url is required for docling engine")
|
||||
|
||||
try:
|
||||
# 调用 docling 服务
|
||||
response = requests.post(
|
||||
f"{docling_url}/convert",
|
||||
json={"url": file_url},
|
||||
timeout=60
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise HTTPException(status_code=500, detail=f"Docling service error: {response.text}")
|
||||
|
||||
result = response.json()
|
||||
|
||||
content = result.get("text", "")
|
||||
chunks = [c.strip() for c in content.split('\n\n') if c.strip()]
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"content": content,
|
||||
"chunks": chunks[:100],
|
||||
"total_pages": result.get("num_pages", 1),
|
||||
"metadata": {
|
||||
"filename": file_url.split('/')[-1]
|
||||
}
|
||||
}
|
||||
except requests.exceptions.RequestException as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to connect docling service: {str(e)}")
|
||||
|
||||
|
||||
# ========== Embedding ==========
|
||||
|
||||
@app.post("/embedding")
|
||||
async def generate_embedding(req: EmbeddingRequest):
|
||||
"""生成 Embedding"""
|
||||
try:
|
||||
# TODO: 根据不同 provider 调用不同的 embedding 服务
|
||||
# 目前返回模拟数据
|
||||
|
||||
texts = [req.input] if isinstance(req.input, str) else req.input
|
||||
|
||||
# 模拟 embedding 返回
|
||||
embeddings = [[0.1] * 1536 for _ in texts] # 1536 维向量
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"embeddings": embeddings,
|
||||
"model": req.model
|
||||
}
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# ========== Chat ==========
|
||||
|
||||
@app.post("/chat")
|
||||
async def chat(req: ChatRequest):
|
||||
"""LLM 对话"""
|
||||
try:
|
||||
# TODO: 根据 model 和 base_url 调用实际的 LLM 服务
|
||||
# 目前返回模拟数据
|
||||
|
||||
last_message = req.messages[-1].content if req.messages else ""
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": f"Echo: {last_message}"
|
||||
},
|
||||
"usage": {
|
||||
"prompt_tokens": len(last_message),
|
||||
"completion_tokens": 10
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# ========== Health Check ==========
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8081)
|
||||
@@ -1,17 +0,0 @@
|
||||
# FastAPI
|
||||
fastapi>=0.100.0
|
||||
uvicorn[standard]>=0.23.0
|
||||
|
||||
# HTTP 请求
|
||||
requests>=2.31.0
|
||||
|
||||
# 文档解析
|
||||
markitdown>=0.0.1
|
||||
|
||||
# Pydantic
|
||||
pydantic>=2.0.0
|
||||
|
||||
# 可选:其他解析库
|
||||
# docling>=0.1.0
|
||||
# pypdf>=3.0.0
|
||||
# python-docx>=0.8.11
|
||||
@@ -1,30 +0,0 @@
|
||||
@echo off
|
||||
chcp 65001 >nul
|
||||
title Algorithm Service
|
||||
|
||||
echo ========================================
|
||||
echo 启动 Algorithm 服务
|
||||
echo ========================================
|
||||
|
||||
cd /d %~dp0
|
||||
|
||||
echo.
|
||||
echo 检查虚拟环境...
|
||||
if not exist venv (
|
||||
echo [INFO] 创建虚拟环境...
|
||||
python -m venv venv
|
||||
)
|
||||
|
||||
echo.
|
||||
echo 安装/更新依赖...
|
||||
call venv\Scripts\pip install -r requirements.txt -q
|
||||
|
||||
echo.
|
||||
echo 启动服务...
|
||||
echo 访问 http://localhost:8081/docs 查看 API 文档
|
||||
echo 按 Ctrl+C 停止服务
|
||||
echo.
|
||||
|
||||
call venv\Scripts\uvicorn main:app --reload --port 8081 --host 0.0.0.0
|
||||
|
||||
pause
|
||||
@@ -1,26 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "========================================"
|
||||
echo " 启动 Algorithm 服务"
|
||||
echo "========================================"
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
# 检查虚拟环境
|
||||
if [ ! -d "venv" ]; then
|
||||
echo "[INFO] 创建虚拟环境..."
|
||||
python3 -m venv venv
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "安装/更新依赖..."
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt -q
|
||||
|
||||
echo ""
|
||||
echo "启动服务..."
|
||||
echo "访问 http://localhost:8081/docs 查看 API 文档"
|
||||
echo "按 Ctrl+C 停止服务"
|
||||
echo ""
|
||||
|
||||
uvicorn main:app --reload --port 8081 --host 0.0.0.0
|
||||
@@ -87,7 +87,7 @@ func main() {
|
||||
if err != nil {
|
||||
log.Printf("Warning: Failed to initialize upload service: %v (files will not be available)", err)
|
||||
}
|
||||
knowledgeService := service.NewKnowledgeService(knowledgeRepo, modelRepo, uploadService, cfg.PythonServiceURL)
|
||||
knowledgeService := service.NewKnowledgeService(knowledgeRepo, modelRepo, uploadService, cfg.PythonServiceURL, cfg.AICoreServiceAddr)
|
||||
|
||||
// 6. 初始化 Handler
|
||||
dbHandler := handler.NewDatabaseHandler(dbService)
|
||||
|
||||
@@ -1,9 +1,17 @@
|
||||
# 本地开发配置
|
||||
port: "8082"
|
||||
jwt_secret: "dev-secret-key"
|
||||
# Docker 内访问用 db:3306,本地访问用 localhost:6036
|
||||
database_url: "root:root@tcp(localhost:6036)/x_agents?charset=utf8mb4&parseTime=True&loc=Local"
|
||||
|
||||
# 数据库配置
|
||||
database_host: "localhost"
|
||||
database_port: "6036"
|
||||
database_user: "root"
|
||||
database_password: "root"
|
||||
database_name: "x_agents"
|
||||
|
||||
# AI 服务配置
|
||||
python_service_url: "http://localhost:8081"
|
||||
ai_core_service_addr: "localhost:50051"
|
||||
|
||||
# 文件上传配置 (local 或 minio)
|
||||
upload_mode: "local"
|
||||
|
||||
@@ -70,7 +70,9 @@ require (
|
||||
golang.org/x/net v0.48.0 // indirect
|
||||
golang.org/x/sys v0.39.0 // indirect
|
||||
golang.org/x/text v0.32.0 // indirect
|
||||
google.golang.org/protobuf v1.31.0 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
|
||||
google.golang.org/grpc v1.79.2 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
gopkg.in/ini.v1 v1.67.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
||||
@@ -44,6 +44,7 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
|
||||
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
|
||||
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU=
|
||||
github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
@@ -185,9 +186,16 @@ golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
|
||||
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/genproto v0.0.0-20231106174013-bbf56f31fb17 h1:wpZ8pe2x1Q3f2KyT5f8oP/fa9rHAKgFPr/HZdNuS+PQ=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
|
||||
google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU=
|
||||
google.golang.org/grpc v1.79.2/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
|
||||
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
||||
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
|
||||
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
|
||||
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
|
||||
@@ -14,8 +14,14 @@ import (
|
||||
type Config struct {
|
||||
Port string
|
||||
JWTSecret string
|
||||
DatabaseURL string
|
||||
DatabaseHost string
|
||||
DatabasePort string
|
||||
DatabaseUser string
|
||||
DatabasePassword string
|
||||
DatabaseName string
|
||||
DatabaseURL string // 拼接后的完整连接字符串
|
||||
PythonServiceURL string
|
||||
AICoreServiceAddr string // AI-Core gRPC 服务地址,如 "localhost:50051"
|
||||
// 文件上传配置
|
||||
UploadMode string // "local" 或 "minio"
|
||||
UploadLocalPath string // 本地存储路径,如 "resource/files"
|
||||
@@ -39,7 +45,13 @@ func Load() *Config {
|
||||
viper.SetDefault("port", "8080")
|
||||
viper.SetDefault("jwt_secret", "your-secret-key-change-in-production")
|
||||
viper.SetDefault("python_service_url", "http://localhost:8081")
|
||||
viper.SetDefault("database_url", "root:root@tcp(localhost:3306)/x_agents?charset=utf8mb4&parseTime=True&loc=Local")
|
||||
viper.SetDefault("ai_core_service_addr", "localhost:50051")
|
||||
// 数据库默认配置
|
||||
viper.SetDefault("database_host", "localhost")
|
||||
viper.SetDefault("database_port", "3306")
|
||||
viper.SetDefault("database_user", "root")
|
||||
viper.SetDefault("database_password", "root")
|
||||
viper.SetDefault("database_name", "x_agents")
|
||||
// 文件上传默认配置
|
||||
viper.SetDefault("upload_mode", "local")
|
||||
viper.SetDefault("upload_local_path", "resource/files")
|
||||
@@ -54,11 +66,26 @@ func Load() *Config {
|
||||
log.Printf("Using default config: %v", err)
|
||||
}
|
||||
|
||||
// 拼接数据库连接字符串
|
||||
dbHost := viper.GetString("database_host")
|
||||
dbPort := viper.GetString("database_port")
|
||||
dbUser := viper.GetString("database_user")
|
||||
dbPassword := viper.GetString("database_password")
|
||||
dbName := viper.GetString("database_name")
|
||||
databaseURL := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?charset=utf8mb4&parseTime=True&loc=Local",
|
||||
dbUser, dbPassword, dbHost, dbPort, dbName)
|
||||
|
||||
return &Config{
|
||||
Port: viper.GetString("port"),
|
||||
JWTSecret: viper.GetString("jwt_secret"),
|
||||
DatabaseURL: viper.GetString("database_url"),
|
||||
PythonServiceURL: viper.GetString("python_service_url"),
|
||||
Port: viper.GetString("port"),
|
||||
JWTSecret: viper.GetString("jwt_secret"),
|
||||
DatabaseURL: databaseURL,
|
||||
DatabaseHost: dbHost,
|
||||
DatabasePort: dbPort,
|
||||
DatabaseUser: dbUser,
|
||||
DatabasePassword: dbPassword,
|
||||
DatabaseName: dbName,
|
||||
PythonServiceURL: viper.GetString("python_service_url"),
|
||||
AICoreServiceAddr: viper.GetString("ai_core_service_addr"),
|
||||
// 文件上传配置
|
||||
UploadMode: viper.GetString("upload_mode"),
|
||||
UploadLocalPath: viper.GetString("upload_local_path"),
|
||||
|
||||
@@ -87,6 +87,7 @@ type KnowledgeDocument struct {
|
||||
FileKey string `json:"file_key" gorm:"type:varchar(500)"`
|
||||
FileURL string `json:"file_url" gorm:"type:varchar(500)"` // 文件访问 URL
|
||||
FileSize int64 `json:"file_size" gorm:"type:bigint;default:0"`
|
||||
Content string `json:"content" gorm:"type:longtext"` // Markdown 内容(AI-Core 解析结果)
|
||||
Status string `json:"status" gorm:"type:varchar(20);default:parsing"` // parsing / parsed / failed
|
||||
ChunkCount int `json:"chunk_count" gorm:"default:0"`
|
||||
UploadedAt time.Time `json:"uploaded_at"`
|
||||
|
||||
132
server/internal/service/document_parser_client.go
Normal file
132
server/internal/service/document_parser_client.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
||||
// AICoreClient AI-Core 文档解析服务客户端
|
||||
type AICoreClient struct {
|
||||
conn *grpc.ClientConn
|
||||
address string
|
||||
}
|
||||
|
||||
// ParseResult 解析结果
|
||||
type ParseResult struct {
|
||||
Success bool
|
||||
Content string
|
||||
Message string
|
||||
ContentLength int32
|
||||
FileType string
|
||||
ParserEngine string
|
||||
}
|
||||
|
||||
// NewAICoreClient 创建 AI-Core 客户端
|
||||
func NewAICoreClient(address string) (*AICoreClient, error) {
|
||||
return &AICoreClient{address: address}, nil
|
||||
}
|
||||
|
||||
// Connect 连接到 gRPC 服务
|
||||
func (c *AICoreClient) Connect() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
conn, err := grpc.DialContext(ctx, c.address,
|
||||
grpc.WithTransportCredentials(insecure.NewCredentials()),
|
||||
grpc.WithBlock(),
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to connect to AI-Core service: %w", err)
|
||||
}
|
||||
c.conn = conn
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close 关闭连接
|
||||
func (c *AICoreClient) Close() {
|
||||
if c.conn != nil {
|
||||
c.conn.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// ParseDocument 解析文档
|
||||
func (c *AICoreClient) ParseDocument(fileURL, fileName, fileType string) (*ParseResult, error) {
|
||||
if c.conn == nil {
|
||||
if err := c.Connect(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// 使用 gRPC raw bytes 调用
|
||||
// 由于没有生成 protobuf 代码,使用 raw bytes 方式调用
|
||||
client := NewDocumentParserClient(c.conn)
|
||||
|
||||
req := &ParseRequest{
|
||||
FileUrl: fileURL,
|
||||
FileName: fileName,
|
||||
FileType: fileType,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
resp, err := client.ParseDocument(ctx, req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse document: %w", err)
|
||||
}
|
||||
|
||||
return &ParseResult{
|
||||
Success: resp.Success,
|
||||
Content: resp.Content,
|
||||
Message: resp.Message,
|
||||
ContentLength: resp.ContentLength,
|
||||
FileType: resp.FileType,
|
||||
ParserEngine: resp.ParserEngine,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// 以下是手动定义的 protobuf messages(与 proto 文件一致)
|
||||
// 不需要生成 .pb.go 文件,直接手动定义
|
||||
|
||||
type ParseRequest struct {
|
||||
FileUrl string
|
||||
FileName string
|
||||
FileType string
|
||||
ParserEngine string
|
||||
}
|
||||
|
||||
type ParseResponse struct {
|
||||
Success bool
|
||||
Content string
|
||||
Message string
|
||||
ContentLength int32
|
||||
FileType string
|
||||
ParserEngine string
|
||||
}
|
||||
|
||||
// DocumentParserClient gRPC 客户端接口(手动实现)
|
||||
type DocumentParserClient interface {
|
||||
ParseDocument(ctx context.Context, in *ParseRequest, opts ...grpc.CallOption) (*ParseResponse, error)
|
||||
}
|
||||
|
||||
type documentParserClient struct {
|
||||
cc grpc.ClientConnInterface
|
||||
}
|
||||
|
||||
// NewDocumentParserClient 创建 DocumentParser 客户端
|
||||
func NewDocumentParserClient(cc grpc.ClientConnInterface) DocumentParserClient {
|
||||
return &documentParserClient{cc: cc}
|
||||
}
|
||||
|
||||
func (c *documentParserClient) ParseDocument(ctx context.Context, in *ParseRequest, opts ...grpc.CallOption) (*ParseResponse, error) {
|
||||
out := new(ParseResponse)
|
||||
err := c.cc.Invoke(ctx, "/docparser.DocumentParser/ParseDocument", in, out, opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
@@ -24,18 +24,21 @@ func init() {
|
||||
}
|
||||
|
||||
type KnowledgeService struct {
|
||||
repo *repository.KnowledgeRepository
|
||||
modelRepo *repository.ModelRepository
|
||||
uploadService *UploadService
|
||||
repo *repository.KnowledgeRepository
|
||||
modelRepo *repository.ModelRepository
|
||||
uploadService *UploadService
|
||||
pythonServiceURL string
|
||||
aiCoreClient *AICoreClient
|
||||
}
|
||||
|
||||
func NewKnowledgeService(repo *repository.KnowledgeRepository, modelRepo *repository.ModelRepository, uploadService *UploadService, pythonServiceURL string) *KnowledgeService {
|
||||
func NewKnowledgeService(repo *repository.KnowledgeRepository, modelRepo *repository.ModelRepository, uploadService *UploadService, pythonServiceURL, aiCoreServiceAddr string) *KnowledgeService {
|
||||
aiCoreClient, _ := NewAICoreClient(aiCoreServiceAddr)
|
||||
return &KnowledgeService{
|
||||
repo: repo,
|
||||
modelRepo: modelRepo,
|
||||
uploadService: uploadService,
|
||||
repo: repo,
|
||||
modelRepo: modelRepo,
|
||||
uploadService: uploadService,
|
||||
pythonServiceURL: pythonServiceURL,
|
||||
aiCoreClient: aiCoreClient,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,6 +230,9 @@ func (s *KnowledgeService) UploadDocument(kbID string, file *multipart.FileHeade
|
||||
// 异步调用 Python 服务解析文档
|
||||
go s.parseDocument(kbID, doc.ID, result.URL, kb.ParsingConfig)
|
||||
|
||||
// 异步调用 AI-Core gRPC 服务解析文档(获取 Markdown)
|
||||
go s.parseDocumentWithAICore(doc.ID, result.URL, doc.Name)
|
||||
|
||||
return doc, result.URL, nil
|
||||
}
|
||||
|
||||
@@ -284,6 +290,32 @@ func (s *KnowledgeService) parseDocument(kbID, docID, fileURL string, config mod
|
||||
}
|
||||
}
|
||||
|
||||
// parseDocumentWithAICore 调用 AI-Core gRPC 服务解析文档
|
||||
func (s *KnowledgeService) parseDocumentWithAICore(docID, fileURL, fileName string) {
|
||||
if s.aiCoreClient == nil {
|
||||
knowledgeDebugLog.Printf("[AICore] AI-Core 客户端未初始化")
|
||||
return
|
||||
}
|
||||
|
||||
knowledgeDebugLog.Printf("[AICore] 开始解析文档: docID=%s, fileURL=%s, fileName=%s", docID, fileURL, fileName)
|
||||
|
||||
result, err := s.aiCoreClient.ParseDocument(fileURL, fileName, "")
|
||||
if err != nil {
|
||||
knowledgeDebugLog.Printf("[AICore] 解析失败: docID=%s, err=%v", docID, err)
|
||||
return
|
||||
}
|
||||
|
||||
if result.Success && result.Content != "" {
|
||||
knowledgeDebugLog.Printf("[AICore] 解析成功: docID=%s, contentLength=%d", docID, len(result.Content))
|
||||
// 更新文档的 Content 字段
|
||||
s.repo.UpdateDocument(docID, map[string]interface{}{
|
||||
"content": result.Content,
|
||||
})
|
||||
} else {
|
||||
knowledgeDebugLog.Printf("[AICore] 解析返回失败: docID=%s, message=%s", docID, result.Message)
|
||||
}
|
||||
}
|
||||
|
||||
// DeleteDocument 删除文档
|
||||
func (s *KnowledgeService) DeleteDocument(kbID, docID string) error {
|
||||
// 验证文档存在
|
||||
|
||||
143
team-require/ai/ai-core-api.md
Normal file
143
team-require/ai/ai-core-api.md
Normal file
@@ -0,0 +1,143 @@
|
||||
# AI-Core 文档解析服务 API 对接文档
|
||||
|
||||
## 服务地址
|
||||
|
||||
```
|
||||
localhost:50051
|
||||
```
|
||||
|
||||
## gRPC API 定义
|
||||
|
||||
### 1. ParseDocument - 解析文档
|
||||
|
||||
**请求 (ParseRequest)**
|
||||
```protobuf
|
||||
message ParseRequest {
|
||||
string file_url = 1; // 文件 URL(必填)
|
||||
string file_name = 2; // 文件名,带扩展名(必填)
|
||||
string file_type = 3; // 文件类型(可选,自动检测)
|
||||
map<string, string> engine_overrides = 4; // 引擎配置
|
||||
}
|
||||
```
|
||||
|
||||
**响应 (ParseResponse)**
|
||||
```protobuf
|
||||
message ParseResponse {
|
||||
bool success = 1; // 是否成功
|
||||
string content = 2; // Markdown 内容
|
||||
string message = 3; // 状态消息
|
||||
int32 content_length = 4; // 内容长度
|
||||
string file_type = 5; // 文件类型
|
||||
string parser_engine = 6; // 解析引擎 (markitdown)
|
||||
}
|
||||
```
|
||||
|
||||
### 2. GetSupportedFormats - 获取支持的格式
|
||||
|
||||
**请求**: 空消息
|
||||
|
||||
**响应**
|
||||
- `file_types`: string[] - 支持的扩展名列表
|
||||
- `file_type_descriptions`: map<string, string> - 格式描述
|
||||
|
||||
---
|
||||
|
||||
## Golang 对接示例
|
||||
|
||||
### 1. 安装依赖
|
||||
|
||||
```bash
|
||||
go get google.golang.org/grpc
|
||||
go get google.golang.org/grpc/credentials/insecure
|
||||
```
|
||||
|
||||
### 2. 生成 Go Proto 代码
|
||||
|
||||
需要先将 `proto/document_parser.proto` 生成 Go 代码:
|
||||
|
||||
```bash
|
||||
# 方法一:使用 grpc_tools
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
|
||||
protoc --go_out=. --go_opt=paths=source_relative \
|
||||
--go-grpc_out=. --go-grpc_opt=paths=source_relative \
|
||||
proto/document_parser.proto
|
||||
```
|
||||
|
||||
### 3. 完整调用代码
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
|
||||
pb "your-project/proto" // 替换为你的 proto 包路径
|
||||
)
|
||||
|
||||
func main() {
|
||||
// 连接 gRPC 服务
|
||||
conn, err := grpc.Dial(
|
||||
"localhost:50051",
|
||||
grpc.WithTransportCredentials(insecure.NewCredentials()),
|
||||
grpc.WithBlock(),
|
||||
)
|
||||
if err != nil {
|
||||
log.Fatalf("连接失败: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
client := pb.NewDocumentParserClient(conn)
|
||||
ctx := context.Background()
|
||||
|
||||
// 调用 ParseDocument
|
||||
req := &pb.ParseRequest{
|
||||
FileUrl: "https://example.com/document.pdf",
|
||||
FileName: "document.pdf",
|
||||
}
|
||||
|
||||
resp, err := client.ParseDocument(ctx, req)
|
||||
if err != nil {
|
||||
log.Fatalf("解析失败: %v", err)
|
||||
}
|
||||
|
||||
// 处理响应
|
||||
if resp.Success {
|
||||
fmt.Printf("解析成功!\n")
|
||||
fmt.Printf("内容长度: %d 字符\n", resp.ContentLength)
|
||||
fmt.Printf("Markdown 内容:\n%s\n", resp.Content)
|
||||
} else {
|
||||
fmt.Printf("解析失败: %s\n", resp.Message)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. 获取支持的格式
|
||||
|
||||
```go
|
||||
// 获取支持的文件格式
|
||||
formatsReq := &pb.Empty{}
|
||||
formatsResp, err := client.GetSupportedFormats(ctx, formatsReq)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Println("支持的格式:")
|
||||
for _, ft := range formatsResp.FileTypes {
|
||||
desc := formatsResp.FileTypeDescriptions[ft]
|
||||
fmt.Printf(" - %s: %s\n", ft, desc)
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. **文件 URL**: 必须是可直接访问的 URL,服务会下载文件到内存解析
|
||||
2. **文件名**: 必须带扩展名(如 `.pdf`, `.docx`),用于自动识别文件类型
|
||||
3. **返回内容**: 直接返回 Markdown 格式文本,可用于向量检索或 LLM 处理
|
||||
15
team-require/ai/todo.md
Normal file
15
team-require/ai/todo.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# AI 服务需求 TODO
|
||||
|
||||
## 2026年3月
|
||||
|
||||
### 2026-03-09
|
||||
|
||||
- [ ] **AI-Core 文档解析服务对接**
|
||||
- 服务:ai-core (gRPC, 端口 50051)
|
||||
- 功能:将文档(PDF/DOCX/PPTX 等)转换为 Markdown
|
||||
- 对接方式:gRPC 调用
|
||||
- 详细需求:[ai-core-api.md](./ai-core-api.md)
|
||||
|
||||
---
|
||||
|
||||
> 需求完成后请完成者打 ✔
|
||||
@@ -863,15 +863,12 @@ const deleteDocument = async (docId: string) => {
|
||||
width="calc(100vw - 40px)"
|
||||
top="20px"
|
||||
:close-on-click-modal="false"
|
||||
:show-close="false"
|
||||
class="kb-dialog file-upload-dialog"
|
||||
>
|
||||
<div class="file-upload-layout">
|
||||
<!-- 顶部导航 -->
|
||||
<div class="file-header">
|
||||
<button class="back-btn" @click="showFileUploadDialog = false">
|
||||
<i class="fa-solid fa-arrow-left"></i>
|
||||
</button>
|
||||
<h2 class="file-title">{{ selectedKnowledge?.name || 'Knowledge Base' }}</h2>
|
||||
<input
|
||||
type="file"
|
||||
ref="fileInput"
|
||||
@@ -883,6 +880,10 @@ const deleteDocument = async (docId: string) => {
|
||||
<i class="fa-solid fa-upload"></i>
|
||||
Upload
|
||||
</button>
|
||||
<h2 class="file-title">{{ selectedKnowledge?.name || 'Knowledge Base' }}</h2>
|
||||
<button class="back-btn" @click="showFileUploadDialog = false">
|
||||
<i class="fa-solid fa-xmark"></i>
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- 标签栏 -->
|
||||
|
||||
@@ -645,6 +645,8 @@
|
||||
}
|
||||
|
||||
.file-title {
|
||||
flex: 1;
|
||||
text-align: center;
|
||||
font-size: 16px;
|
||||
font-weight: 600;
|
||||
color: #ffffff;
|
||||
|
||||
Reference in New Issue
Block a user