# ai-core/parser/markitdown_parser.py

import io
import logging
import re
import base64

from markitdown import MarkItDown

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser

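# VLMClient is optional: if it cannot be imported, fall back to None so the
# parser still works without VLM image analysis.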
try:
    from parser.vlm_client import VLMClient
except ImportError:
    VLMClient = None

logger = logging.getLogger(__name__)


class StdMarkitdownParser(BaseParser):
    """
    Standard MarkItDown parser wrapper.

    Converts raw document bytes into Markdown text with the ``markitdown``
    library, using ``self.file_type`` as the format hint. When a VLM
    configuration with a truthy ``"enabled"`` entry is supplied and
    ``VLMClient`` could be imported, Base64 images embedded in the converted
    Markdown are sent to the VLM and replaced by the content it returns.
    """

    # Matches Markdown images embedded as Base64 data URIs:
    #   ![alt](data:image/<subtype>;base64,<payload>)
    # ``mime`` captures the full media type (e.g. "image/png"), not just the
    # subtype, because that value is forwarded as the image's MIME type.
    _DATA_URI_IMAGE_RE = re.compile(
        r'!\[(?P<alt>[^\]]*)\]'
        r'\(data:(?P<mime>image/[^;]+);base64,(?P<data>[A-Za-z0-9+/=]+)\)'
    )

    def __init__(self, *args, vlm_config=None, **kwargs):
        """
        Create the parser and, if configured, the VLM client.

        Args:
            *args: Positional arguments forwarded to ``BaseParser.__init__``.
            vlm_config: Optional mapping. A client is only created when its
                ``"enabled"`` entry is truthy and ``VLMClient`` is importable.
                The ``"provider"`` and ``"model"`` entries are read for logging.
            **kwargs: Keyword arguments forwarded to ``BaseParser.__init__``.
        """
        # BaseParser.__init__ is relied upon to populate self.file_type,
        # which parse_into_text reads.
        super().__init__(*args, **kwargs)
        self.markitdown = MarkItDown()
        self.vlm_config = vlm_config
        self.vlm_client = None

        if vlm_config and vlm_config.get("enabled") and VLMClient:
            try:
                self.vlm_client = VLMClient(vlm_config)
            except Exception as e:
                # Best effort: a broken VLM setup must not prevent plain
                # document conversion, so only warn and continue without it.
                logger.warning("Failed to initialize VLM client: %s", e)
            else:
                logger.info(
                    "VLM client initialized: provider=%s, model=%s",
                    vlm_config.get("provider"),
                    vlm_config.get("model"),
                )

    def parse_into_text(self, content: bytes) -> Document:
        """
        Convert raw document bytes into a ``Document`` of Markdown text.

        ``self.file_type`` is normalized to a dotted extension (".docx") and
        passed to MarkItDown as the stream format hint. Conversion errors are
        deliberately not caught here; they propagate to the caller.

        Args:
            content: Raw bytes of the document to convert.

        Returns:
            A ``Document`` whose content is the converted Markdown, with
            embedded images replaced by VLM output when a client is available.
        """
        ext = self.file_type
        if ext and not ext.startswith('.'):
            ext = '.' + ext

        # keep_data_uris=True keeps embedded images as Base64 data URIs in the
        # Markdown so that _process_images_with_vlm can find and decode them.
        result = self.markitdown.convert(
            io.BytesIO(content),
            file_extension=ext,
            keep_data_uris=True
        )

        markdown_content = result.text_content

        if self.vlm_client and markdown_content:
            markdown_content = self._process_images_with_vlm(markdown_content)

        return Document(content=markdown_content)

    def _process_images_with_vlm(self, content: str) -> str:
        """
        Replace Base64 data-URI images in Markdown with VLM analysis output.

        Args:
            content: Markdown text that may contain ``![alt](data:image/...)``
                image references.

        Returns:
            The Markdown with each successfully analyzed image replaced by an
            HTML-comment-delimited block holding the VLM content. Images whose
            analysis fails are left untouched.
        """
        return self._DATA_URI_IMAGE_RE.sub(self._replace_image, content)

    def _replace_image(self, match) -> str:
        """
        Build the replacement text for one matched data-URI image.

        Args:
            match: A match object produced by ``_DATA_URI_IMAGE_RE``.

        Returns:
            The VLM content wrapped in ``<!-- Image: ... -->`` markers on
            success, otherwise the original matched image reference.
        """
        alt_text = match.group("alt")
        mime_type = match.group("mime")

        try:
            image_bytes = base64.b64decode(match.group("data"))

            logger.info("Processing image with VLM: %s", alt_text or 'unnamed')
            vlm_result = self.vlm_client.analyze_image(image_bytes, mime_type)

            if vlm_result.get("success"):
                vlm_content = vlm_result.get("content", "")
                logger.info(
                    "VLM processed image successfully, content length: %s",
                    len(vlm_content),
                )
                return f"<!-- Image: {alt_text} -->\n{vlm_content}\n<!-- End Image -->"

            logger.warning("VLM failed for image: %s", vlm_result.get('error'))
        except Exception as e:
            # Best effort: any decoding or VLM failure falls through to
            # keeping the original image reference instead of aborting.
            logger.error("Error processing image with VLM: %s", e)

        # Keep the original image reference when analysis did not succeed.
        return match.group(0)


class MarkitdownParser(PipelineParser):
    """
    Pipeline parser built from ``StdMarkitdownParser`` and ``MarkdownParser``.

    NOTE(review): presumably ``PipelineParser`` applies the classes listed in
    ``_parser_cls`` in order (MarkItDown conversion first, then Markdown
    post-processing) -- confirm against ``PipelineParser``.
    """

    _parser_cls = (
        StdMarkitdownParser,
        MarkdownParser,
    )