import base64
import io
import logging
import re

from markitdown import MarkItDown

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser

# VLMClient is optional: if it cannot be imported, VLM image analysis is
# simply disabled and the parser falls back to plain MarkItDown output.
try:
    from parser.vlm_client import VLMClient
except ImportError:
    VLMClient = None

logger = logging.getLogger(__name__)


class StdMarkitdownParser(BaseParser):
    """
    Standard MarkItDown Parser Wrapper

    This parser uses the markitdown library to convert various document
    formats (docx, pptx, pdf, etc.) into text/markdown. Optionally uses a
    VLM client to replace embedded Base64 images with a textual analysis.
    """

    # Markdown image whose target is an inline Base64 ``data:image/...`` URI.
    # Compiled once at class creation instead of on every call.
    _INLINE_IMAGE_RE = re.compile(
        r"!\[(?P<alt>[^\]]*)\]"
        r"\(data:image/(?P<subtype>[^;]+);base64,(?P<data>[A-Za-z0-9+/=]+)\)"
    )

    def __init__(self, *args, vlm_config=None, **kwargs):
        """
        Create the parser and, when configured, its VLM client.

        Args:
            *args: Positional arguments forwarded to ``BaseParser``.
            vlm_config: Optional mapping. A VLM client is created only when
                it is truthy, its ``"enabled"`` entry is truthy, and
                ``VLMClient`` was importable. ``"provider"`` and ``"model"``
                are read for logging only.
            **kwargs: Keyword arguments forwarded to ``BaseParser``.
        """
        # BaseParser.__init__ is expected to populate self.file_type, which
        # parse_into_text() reads later.
        super().__init__(*args, **kwargs)
        self.markitdown = MarkItDown()
        self.vlm_config = vlm_config
        self.vlm_client = None

        if vlm_config and vlm_config.get("enabled") and VLMClient:
            try:
                self.vlm_client = VLMClient(vlm_config)
            except Exception as e:
                # Best effort: a broken VLM setup must not prevent parsing.
                logger.warning("Failed to initialize VLM client: %s", e)
            else:
                logger.info(
                    "VLM client initialized: provider=%s, model=%s",
                    vlm_config.get("provider"),
                    vlm_config.get("model"),
                )

    def parse_into_text(self, content: bytes) -> Document:
        """
        Parse raw document bytes into a ``Document`` using MarkItDown.

        ``self.file_type`` is passed to MarkItDown as the file extension hint
        (a leading dot is added when missing).

        Args:
            content: Raw bytes of the document to convert.

        Returns:
            A ``Document`` whose content is the converted markdown, with
            inline images replaced by VLM output when a VLM client is active.
        """
        ext = self.file_type
        if ext and not ext.startswith("."):
            ext = "." + ext

        # Deliberately no try/except here: conversion errors propagate so the
        # caller (e.g. the surrounding pipeline parser) can handle them.
        result = self.markitdown.convert(
            io.BytesIO(content),
            file_extension=ext,
            keep_data_uris=True,
        )
        markdown_content = result.text_content

        if self.vlm_client and markdown_content:
            markdown_content = self._process_images_with_vlm(markdown_content)

        return Document(content=markdown_content)

    def _process_images_with_vlm(self, content: str) -> str:
        """
        Replace inline Base64 images in markdown with VLM analysis text.

        Only images written as ``![alt](data:image/<subtype>;base64,<data>)``
        are considered. An image is left untouched when the VLM reports
        failure, returns no content, or any error occurs while handling it.

        Args:
            content: Markdown text that may contain inline Base64 images.

        Returns:
            The markdown with each successfully analysed image replaced by
            the VLM content surrounded by newlines.
        """
        return self._INLINE_IMAGE_RE.sub(self._replace_image, content)

    def _replace_image(self, match):
        """Return the replacement text for one inline-image regex match."""
        original = match.group(0)
        alt_text = match.group("alt")
        # The pattern captures only the subtype (e.g. "png"). The previous
        # code passed that bare subtype as ``mime_type`` while its fallback
        # value was "image/png", so a full MIME type is rebuilt here.
        # NOTE(review): confirm VLMClient.analyze_image expects "image/<subtype>".
        mime_type = f"image/{match.group('subtype')}"

        try:
            image_bytes = base64.b64decode(match.group("data"))

            logger.info("Processing image with VLM: %s", alt_text or "unnamed")
            vlm_result = self.vlm_client.analyze_image(image_bytes, mime_type)

            if not vlm_result.get("success"):
                logger.warning(
                    "VLM failed for image: %s", vlm_result.get("error")
                )
                return original

            vlm_content = vlm_result.get("content", "")
            if not vlm_content:
                # Do not silently drop the image when there is nothing to
                # put in its place.
                logger.warning(
                    "VLM returned empty content for image; "
                    "keeping original reference"
                )
                return original

            logger.info(
                "VLM processed image successfully, content length: %d",
                len(vlm_content),
            )
            return f"\n{vlm_content}\n"
        except Exception as e:
            # Best effort per image: any decoding or client error keeps the
            # original image reference instead of failing the whole parse.
            logger.error("Error processing image with VLM: %s", e)
            return original


class MarkitdownParser(PipelineParser):
    """Pipeline that chains ``StdMarkitdownParser`` and ``MarkdownParser``."""

    _parser_cls = (StdMarkitdownParser, MarkdownParser)