import logging
import os
import tempfile
from typing import Optional, Dict, Any

from markitdown import MarkItDown

from .vlm_client import VLMClient
from .config import get_vlm_config

logger = logging.getLogger(__name__)

# Single source of truth for the image extensions routed to the VLM and the
# MIME type sent along with each of them.
_IMAGE_MIME_TYPES: Dict[str, str] = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.bmp': 'image/bmp',
    '.webp': 'image/webp',
    '.tiff': 'image/tiff',
    '.tif': 'image/tiff',
}

# MIME type used when the extension is not present in _IMAGE_MIME_TYPES.
_DEFAULT_IMAGE_MIME_TYPE = 'image/png'


class Parser:
    """Unified document parser built on MarkItDown plus an optional VLM.

    Supported formats: PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, images,
    web pages, Markdown, etc.

    VLM parsing can be enabled in two ways:
      - Option 1: configured at startup (config.yaml or environment variables).
      - Option 2: a VLM config passed in with the gRPC request (higher
        priority).
    """

    def __init__(self):
        """Create the MarkItDown converter and load the configured VLM, if any."""
        self.markitdown = MarkItDown()
        self.vlm_client: Optional[VLMClient] = None

        # Try to load the globally configured VLM.
        vlm_config = get_vlm_config()
        if vlm_config:
            self.vlm_client = VLMClient(vlm_config)
            logger.info(
                "VLM enabled: provider=%s, model=%s",
                vlm_config.get('provider'),
                vlm_config.get('model'),
            )
        else:
            logger.info("VLM not configured, using MarkItDown only")

    def set_vlm_config(self, config: Dict[str, Any]) -> None:
        """Set the VLM configuration manually (overrides the global config).

        Args:
            config: VLM settings. The VLM is enabled only when both the
                "enabled" and "api_key" entries are truthy; any other value
                (including an empty or missing config) disables the VLM.
        """
        if config and config.get("enabled") and config.get("api_key"):
            self.vlm_client = VLMClient(config)
            logger.info(
                "VLM enabled: provider=%s, model=%s",
                config.get('provider'),
                config.get('model'),
            )
        else:
            self.vlm_client = None
            logger.info("VLM disabled")

    @staticmethod
    def _success_result(content: str, metadata: Any) -> Dict[str, Any]:
        """Build the result dict returned for a successful parse."""
        return {
            "success": True,
            "content": content,
            "content_length": len(content),
            "metadata": metadata,
        }

    @staticmethod
    def _failure_result(error: str) -> Dict[str, Any]:
        """Build the result dict returned for a failed parse."""
        return {
            "success": False,
            "content": "",
            "content_length": 0,
            "error": error,
        }

    def _convert_with_markitdown(self, source: str) -> Dict[str, Any]:
        """Run MarkItDown on a path/URL and wrap its output in a result dict."""
        result = self.markitdown.convert(source)
        logger.info("Parse successful: %s characters", len(result.text_content))
        # Not every result object is guaranteed to expose `metadata`.
        return self._success_result(
            result.text_content, getattr(result, 'metadata', {})
        )

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete a temporary file; a cleanup failure is logged, not raised."""
        try:
            os.unlink(path)
        except OSError as exc:
            logger.warning("Failed to remove temporary file %s: %s", path, exc)

    def parse(self, file_path: str, file_type: Optional[str] = None,
              vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """Parse a document into Markdown.

        Args:
            file_path: File path or URL.
            file_type: File type (optional and currently unused; detection is
                left to MarkItDown).
            vlm_config: VLM configuration (optional). When given it is applied
                through set_vlm_config(), so it replaces the instance's VLM
                client and stays in effect for later calls.

        Returns:
            dict: "success", "content" and "content_length", plus "metadata"
            on success or "error" on failure. Errors are reported through the
            dict rather than raised.
        """
        # NOTE(review): this path only calls MarkItDown; the VLM client set
        # here is not consulted during the conversion itself.
        if vlm_config:
            self.set_vlm_config(vlm_config)

        try:
            logger.info("Parsing file: %s", file_path)
            return self._convert_with_markitdown(file_path)
        except Exception as e:
            logger.exception("Parse error: %s", e)
            return self._failure_result(str(e))

    def parse_bytes(self, content: bytes, file_name: str,
                    file_type: Optional[str] = None,
                    vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """Parse in-memory file bytes into Markdown.

        Image files are sent to the VLM when one is configured; everything
        else is written to a temporary file and converted by MarkItDown.

        Args:
            content: Raw file bytes.
            file_name: File name; its extension selects the VLM route and is
                used as the temporary file's suffix.
            file_type: File type (optional and currently unused).
            vlm_config: VLM configuration (optional). When given it is applied
                through set_vlm_config(), so it replaces the instance's VLM
                client and stays in effect for later calls.

        Returns:
            dict: "success", "content" and "content_length", plus "metadata"
            on success or "error" on failure. Errors are reported through the
            dict rather than raised.
        """
        if vlm_config:
            self.set_vlm_config(vlm_config)

        try:
            logger.info(
                "Parsing bytes: %s, size: %s bytes", file_name, len(content)
            )

            # Decide from the file name whether the VLM should handle it.
            if self._should_use_vlm(file_name):
                logger.info("Using VLM for parsing")
                return self._parse_with_vlm(content, file_name)

            # Otherwise fall back to MarkItDown, which needs a file on disk.
            suffix = os.path.splitext(file_name)[1]
            temp_path: Optional[str] = None
            try:
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=suffix
                ) as temp_file:
                    # Record the path before writing so the file is still
                    # removed if the write itself fails.
                    temp_path = temp_file.name
                    temp_file.write(content)
                return self._convert_with_markitdown(temp_path)
            finally:
                if temp_path is not None:
                    # A cleanup failure must not turn a successful parse
                    # into a reported failure.
                    self._remove_quietly(temp_path)
        except Exception as e:
            logger.exception("Parse bytes error: %s", e)
            return self._failure_result(str(e))

    def _should_use_vlm(self, file_name: str) -> bool:
        """Return True when a VLM client is set and the file is an image."""
        if not self.vlm_client:
            return False

        ext = os.path.splitext(file_name)[1].lower()
        return ext in _IMAGE_MIME_TYPES

    def _parse_with_vlm(self, content: bytes, file_name: str) -> dict:
        """Parse image bytes with the VLM client.

        Args:
            content: Raw image bytes.
            file_name: File name; its extension selects the MIME type
                (unknown extensions fall back to image/png).

        Returns:
            dict: Same shape as parse(); successful results carry
            {"vlm_used": True} as metadata.
        """
        if not self.vlm_client:
            return self._failure_result("VLM not configured")

        ext = os.path.splitext(file_name)[1].lower()
        mime_type = _IMAGE_MIME_TYPES.get(ext, _DEFAULT_IMAGE_MIME_TYPE)

        try:
            result = self.vlm_client.analyze_image(content, mime_type)
            if result.get("success"):
                return self._success_result(
                    result["content"], {"vlm_used": True}
                )
            return self._failure_result(
                result.get("error", "VLM parsing failed")
            )
        except Exception as e:
            # Keep the traceback, like the other parse error handlers do.
            logger.exception("VLM parsing error: %s", e)
            return self._failure_result(str(e))


if __name__ == "__main__":
    parser = Parser()

    # Smoke test
    test_url = "https://example.com"
    result = parser.parse(test_url)
    print(f"Success: {result['success']}")
    print(f"Content length: {result['content_length']}")