2026-03-09 10:27:08 +08:00
|
|
|
|
import logging
|
|
|
|
|
|
import os
|
|
|
|
|
|
import tempfile
|
2026-03-09 15:42:35 +08:00
|
|
|
|
from typing import Optional, Dict, Any
|
2026-03-09 10:27:08 +08:00
|
|
|
|
from markitdown import MarkItDown
|
|
|
|
|
|
|
2026-03-09 15:42:35 +08:00
|
|
|
|
from .vlm_client import VLMClient
|
|
|
|
|
|
from .config import get_vlm_config
|
|
|
|
|
|
|
2026-03-09 10:27:08 +08:00
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Parser:
    """Unified document parser built on MarkItDown with an optional VLM.

    Supported formats: PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, images,
    web pages, Markdown, etc.

    VLM parsing can be configured in two ways:
    - at startup (config.yaml or environment variables), or
    - per gRPC request by passing a VLM config (takes priority).
    """

    # Single source of truth for the image extensions that are routed to the
    # VLM and the MIME type reported for each of them. Keeping one table
    # prevents the "is it an image?" check and the MIME lookup from drifting.
    _IMAGE_MIME_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.tiff': 'image/tiff',
        '.tif': 'image/tiff',
    }

    # MIME type used when the extension is not in the table above.
    _DEFAULT_IMAGE_MIME_TYPE = 'image/png'

    def __init__(self):
        """Create the MarkItDown converter and, if configured, the VLM client."""
        self.markitdown = MarkItDown()
        self.vlm_client: Optional[VLMClient] = None

        # Try to load the startup VLM configuration.
        vlm_config = get_vlm_config()
        if vlm_config:
            self.vlm_client = VLMClient(vlm_config)
            logger.info(f"VLM enabled: provider={vlm_config.get('provider')}, model={vlm_config.get('model')}")
        else:
            logger.info("VLM not configured, using MarkItDown only")

    def set_vlm_config(self, config: Dict[str, Any]) -> None:
        """Set the VLM configuration manually (overrides the startup config).

        A client is created only when ``config`` is truthy and has truthy
        ``enabled`` and ``api_key`` entries; any other value disables the VLM
        by resetting the client to ``None``.

        Args:
            config: VLM settings; ``enabled``, ``api_key``, ``provider`` and
                ``model`` are the keys read here.
        """
        if config and config.get("enabled") and config.get("api_key"):
            self.vlm_client = VLMClient(config)
            logger.info(f"VLM enabled: provider={config.get('provider')}, model={config.get('model')}")
        else:
            self.vlm_client = None
            logger.info("VLM disabled")

    def parse(self, file_path: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """Convert the document at a path or URL to Markdown.

        Args:
            file_path: File path or URL, handed to MarkItDown as-is.
            file_type: File type (optional; unused here, format detection is
                left to MarkItDown).
            vlm_config: VLM configuration (optional); when truthy it is
                applied through ``set_vlm_config`` before parsing.

        Returns:
            dict: ``success``, ``content`` and ``content_length``, plus
            ``metadata`` on success or ``error`` on failure.
        """
        if vlm_config:
            # NOTE(review): this replaces self.vlm_client for every later
            # call on this instance, not just this one -- confirm that a
            # per-request config is meant to persist.
            self.set_vlm_config(vlm_config)

        # NOTE(review): this method always uses MarkItDown; only parse_bytes
        # routes images to the VLM -- confirm that is intended.
        try:
            logger.info(f"Parsing file: {file_path}")
            result = self.markitdown.convert(file_path)
            return self._success(result)
        except Exception as e:
            logger.error(f"Parse error: {e}", exc_info=True)
            return self._failure(str(e))

    def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """Convert in-memory file content to Markdown.

        Images are sent to the VLM when a client is configured; everything
        else is written to a temporary file and converted by MarkItDown.

        Args:
            content: Raw bytes of the file.
            file_name: File name; its extension selects the VLM path and is
                reused as the temporary file suffix.
            file_type: File type (optional; unused here).
            vlm_config: VLM configuration (optional); when truthy it is
                applied through ``set_vlm_config`` before parsing.

        Returns:
            dict: ``success``, ``content`` and ``content_length``, plus
            ``metadata`` on success or ``error`` on failure.
        """
        if vlm_config:
            # NOTE(review): this replaces self.vlm_client for every later
            # call on this instance, not just this one -- confirm that a
            # per-request config is meant to persist.
            self.set_vlm_config(vlm_config)

        try:
            logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes")

            # Decide from the file name whether the VLM should handle it.
            if self._should_use_vlm(file_name):
                logger.info("Using VLM for parsing")
                return self._parse_with_vlm(content, file_name)

            # Otherwise fall back to MarkItDown, which needs a file on disk.
            suffix = os.path.splitext(file_name)[1]
            temp_path = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                    # Record the path before writing so that a failed write
                    # still gets cleaned up below.
                    temp_path = temp_file.name
                    temp_file.write(content)
                result = self.markitdown.convert(temp_path)
                return self._success(result)
            finally:
                if temp_path is not None:
                    self._remove_temp_file(temp_path)
        except Exception as e:
            logger.error(f"Parse bytes error: {e}", exc_info=True)
            return self._failure(str(e))

    def _should_use_vlm(self, file_name: str) -> bool:
        """Return True when a VLM client is set and the name is an image.

        The check is case-insensitive and based only on the extension.
        """
        if not self.vlm_client:
            return False
        ext = os.path.splitext(file_name)[1].lower()
        return ext in self._IMAGE_MIME_TYPES

    def _parse_with_vlm(self, content: bytes, file_name: str) -> dict:
        """Describe an image with the VLM and wrap the answer as a result dict.

        Args:
            content: Raw image bytes.
            file_name: File name used to pick the MIME type.

        Returns:
            dict: A success dict with ``metadata == {"vlm_used": True}``, or
            a failure dict carrying the error message.
        """
        if not self.vlm_client:
            return self._failure("VLM not configured")

        # Determine the MIME type from the extension.
        ext = os.path.splitext(file_name)[1].lower()
        mime_type = self._IMAGE_MIME_TYPES.get(ext, self._DEFAULT_IMAGE_MIME_TYPE)

        try:
            result = self.vlm_client.analyze_image(content, mime_type)
            if not result.get("success"):
                return self._failure(result.get("error", "VLM parsing failed"))
            text = result["content"]
            return {
                "success": True,
                "content": text,
                "content_length": len(text),
                "metadata": {"vlm_used": True},
            }
        except Exception as e:
            # Keep the traceback, matching parse() and parse_bytes().
            logger.error(f"VLM parsing error: {e}", exc_info=True)
            return self._failure(str(e))

    @staticmethod
    def _success(result) -> dict:
        """Log and build the success dict for a MarkItDown conversion result."""
        text = result.text_content
        logger.info(f"Parse successful: {len(text)} characters")
        return {
            "success": True,
            "content": text,
            "content_length": len(text),
            "metadata": getattr(result, 'metadata', {}),
        }

    @staticmethod
    def _failure(error: str) -> dict:
        """Build the failure dict shared by every parsing path."""
        return {
            "success": False,
            "content": "",
            "content_length": 0,
            "error": error,
        }

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Delete a temporary file without letting cleanup mask the result."""
        try:
            os.unlink(path)
        except OSError as e:
            logger.warning(f"Failed to remove temp file {path}: {e}")
|
|
|
|
|
|
|
2026-03-09 10:27:08 +08:00
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: convert a public web page and print a short summary.
    demo_parser = Parser()
    outcome = demo_parser.parse("https://example.com")
    print(f"Success: {outcome['success']}")
    print(f"Content length: {outcome['content_length']}")
|