Files
X-Agents/ai-core/parser/parser.py
DESKTOP-72TV0V4\caoxiaozhu 5012a25f99 feat: 增强 AI-Core 文档解析器
- 添加 VLM 客户端支持
- 优化解析器配置
- 添加配置示例文件
- 生成新的 gRPC protobuf 文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 15:42:35 +08:00

200 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
import os
import tempfile
from typing import Optional, Dict, Any
from markitdown import MarkItDown
from .vlm_client import VLMClient
from .config import get_vlm_config
logger = logging.getLogger(__name__)


class Parser:
    """Unified document parser built on MarkItDown with optional VLM support.

    According to the original notes, supported inputs include PDF, DOCX, DOC,
    PPTX, PPT, XLSX, XLS, CSV, images, web pages and Markdown.

    A VLM can be supplied in two ways:

    - at start-up, via ``get_vlm_config()`` (described by the original notes
      as config.yaml or environment variables);
    - per call, via the ``vlm_config`` argument of ``parse_bytes``, which
      takes precedence over the start-up configuration for that call only.
    """

    # Single source of truth for which file extensions are routed to the VLM
    # and which MIME type is reported for each of them.
    _IMAGE_MIME_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
    }
    # MIME type used by the VLM path when the extension is not in the table.
    _DEFAULT_IMAGE_MIME_TYPE = "image/png"

    def __init__(self):
        """Create the MarkItDown converter and, if configured, the VLM client."""
        self.markitdown = MarkItDown()
        self.vlm_client: Optional[VLMClient] = None

        # Try to load the VLM configured at start-up.
        vlm_config = get_vlm_config()
        if vlm_config:
            self.vlm_client = VLMClient(vlm_config)
            logger.info(
                "VLM enabled: provider=%s, model=%s",
                vlm_config.get("provider"),
                vlm_config.get("model"),
            )
        else:
            logger.info("VLM not configured, using MarkItDown only")

    def set_vlm_config(self, config: Dict[str, Any]) -> None:
        """Persistently replace the parser's VLM client.

        Args:
            config: VLM settings. The client is created only when both
                ``enabled`` and ``api_key`` are truthy; any other value
                (including ``None`` or an empty dict) disables the VLM.
        """
        if self._is_usable_vlm_config(config):
            self.vlm_client = VLMClient(config)
            logger.info(
                "VLM enabled: provider=%s, model=%s",
                config.get("provider"),
                config.get("model"),
            )
        else:
            self.vlm_client = None
            logger.info("VLM disabled")

    def parse(
        self,
        file_path: str,
        file_type: Optional[str] = None,
        vlm_config: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Convert a file path or URL to Markdown with MarkItDown.

        Args:
            file_path: Local path or URL handed to ``MarkItDown.convert``.
            file_type: Optional type hint. Not consulted here; the original
                notes state that MarkItDown detects the type itself.
            vlm_config: Accepted for signature parity with ``parse_bytes``.
                This entry point always converts with MarkItDown, so the
                value is not used and, importantly, no longer overwrites the
                parser's shared VLM client as a side effect.

        Returns:
            dict: ``success``, ``content``, ``content_length`` and
            ``metadata`` on success; ``success``, empty ``content``,
            ``content_length`` of 0 and ``error`` on failure.
        """
        try:
            logger.info("Parsing file: %s", file_path)
            return self._converted(self.markitdown.convert(file_path))
        except Exception as e:
            # Boundary of the parser API: report the failure as data.
            logger.error("Parse error: %s", e, exc_info=True)
            return self._failure(str(e))

    def parse_bytes(
        self,
        content: bytes,
        file_name: str,
        file_type: Optional[str] = None,
        vlm_config: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Convert in-memory file content to Markdown.

        Image files (judged by the extension of ``file_name``) are sent to
        the VLM when one is in effect for this call; everything else is
        written to a temporary file and converted with MarkItDown.

        Args:
            content: Raw bytes of the file.
            file_name: Original file name; only its extension is used.
            file_type: Optional type hint. Not consulted here.
            vlm_config: Optional per-call VLM settings. When given they take
                precedence over the parser's own client for this call only;
                a config without truthy ``enabled`` and ``api_key`` turns
                the VLM off for this call.

        Returns:
            dict: Same shape as the result of ``parse``.
        """
        # Resolved outside the try block so that a client-construction error
        # still propagates to the caller, as it did before.
        client = self._client_for_request(vlm_config)
        try:
            logger.info("Parsing bytes: %s, size: %s bytes", file_name, len(content))

            if client and self._is_image(file_name):
                logger.info("Using VLM for parsing")
                return self._run_vlm(client, content, file_name)

            return self._converted(self._convert_bytes(content, file_name))
        except Exception as e:
            # Boundary of the parser API: report the failure as data.
            logger.error("Parse bytes error: %s", e, exc_info=True)
            return self._failure(str(e))

    def _should_use_vlm(self, file_name: str) -> bool:
        """Return True when a VLM client is set and ``file_name`` is an image."""
        if not self.vlm_client:
            return False
        return self._is_image(file_name)

    def _parse_with_vlm(self, content: bytes, file_name: str) -> dict:
        """Analyze image bytes with the parser's own VLM client."""
        return self._run_vlm(self.vlm_client, content, file_name)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_usable_vlm_config(config: Optional[Dict[str, Any]]) -> bool:
        """Return True when ``config`` is enabled and carries an API key."""
        return bool(config and config.get("enabled") and config.get("api_key"))

    @classmethod
    def _is_image(cls, file_name: str) -> bool:
        """Return True when the extension of ``file_name`` is a known image type."""
        return os.path.splitext(file_name)[1].lower() in cls._IMAGE_MIME_TYPES

    @staticmethod
    def _success(content: str, metadata: Any) -> dict:
        """Build the result dict for a successful parse."""
        return {
            "success": True,
            "content": content,
            "content_length": len(content),
            "metadata": metadata,
        }

    @staticmethod
    def _failure(error: str) -> dict:
        """Build the result dict for a failed parse."""
        return {
            "success": False,
            "content": "",
            "content_length": 0,
            "error": error,
        }

    def _converted(self, result: Any) -> dict:
        """Log and wrap a MarkItDown conversion result."""
        text = result.text_content
        logger.info("Parse successful: %s characters", len(text))
        # Not every result object is guaranteed to expose ``metadata``.
        return self._success(text, getattr(result, "metadata", {}))

    def _client_for_request(
        self, vlm_config: Optional[Dict[str, Any]]
    ) -> Optional["VLMClient"]:
        """Pick the VLM client for one call without mutating the parser.

        No per-call config means the parser's own client is used. A usable
        per-call config gets its own client; an unusable one disables the
        VLM for this call.
        """
        if not vlm_config:
            return self.vlm_client
        if self._is_usable_vlm_config(vlm_config):
            logger.info(
                "VLM enabled: provider=%s, model=%s",
                vlm_config.get("provider"),
                vlm_config.get("model"),
            )
            return VLMClient(vlm_config)
        logger.info("VLM disabled")
        return None

    def _convert_bytes(self, content: bytes, file_name: str) -> Any:
        """Write ``content`` to a temp file, convert it, and always clean up."""
        suffix = os.path.splitext(file_name)[1]
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        temp_path = temp_file.name
        try:
            # Close the file before converting so it can be reopened by name.
            with temp_file:
                temp_file.write(content)
            return self.markitdown.convert(temp_path)
        finally:
            # Covers a failed write as well as a failed conversion. A cleanup
            # error must not mask the outcome of the conversion itself.
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove temp file %s: %s", temp_path, cleanup_error
                )

    def _run_vlm(self, client: Any, content: bytes, file_name: str) -> dict:
        """Send image bytes to ``client`` and wrap its answer as a result dict."""
        if not client:
            return self._failure("VLM not configured")

        # Determine the MIME type from the file extension.
        ext = os.path.splitext(file_name)[1].lower()
        mime_type = self._IMAGE_MIME_TYPES.get(ext, self._DEFAULT_IMAGE_MIME_TYPE)

        try:
            result = client.analyze_image(content, mime_type)
            if result.get("success"):
                return self._success(result["content"], {"vlm_used": True})
            return self._failure(result.get("error", "VLM parsing failed"))
        except Exception as e:
            logger.error("VLM parsing error: %s", e, exc_info=True)
            return self._failure(str(e))
if __name__ == "__main__":
    # Manual smoke test: convert a public web page and print the outcome.
    demo_parser = Parser()
    outcome = demo_parser.parse("https://example.com")
    print(f"Success: {outcome['success']}")
    print(f"Content length: {outcome['content_length']}")