Files
X-Agents/ai-core/parser/markitdown_parser.py
DESKTOP-72TV0V4\caoxiaozhu d24b29afe4 feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 15:01:52 +08:00

107 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import io
import logging
import re
import base64
from markitdown import MarkItDown
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
# 尝试导入 VLMClient
try:
from parser.vlm_client import VLMClient
except ImportError:
VLMClient = None
logger = logging.getLogger(__name__)
class StdMarkitdownParser(BaseParser):
    """Standard MarkItDown parser wrapper.

    Uses the ``markitdown`` library to convert a document's raw bytes into
    Markdown text. When a VLM client could be created from ``vlm_config``,
    inline Base64 images in the resulting Markdown are sent to the VLM and
    replaced by the text it returns.
    """

    # Inline Markdown image whose target is a Base64 ``data:`` URI.
    #   group 1 -> alt text (may be empty)
    #   group 2 -> full MIME type, e.g. "image/png"
    #   group 3 -> Base64 payload
    # Compiled once at class level instead of on every call.
    _DATA_URI_IMAGE_RE = re.compile(
        r'!\[([^\]]*)\]\(data:(image/[^;]+);base64,([A-Za-z0-9+/=]+)\)'
    )

    def __init__(self, *args, vlm_config=None, **kwargs):
        """Create the MarkItDown converter and, if configured, a VLM client.

        Args:
            *args: Positional arguments forwarded unchanged to ``BaseParser``.
            vlm_config: Optional mapping read via ``.get``. The key
                ``"enabled"`` must be truthy for a VLM client to be created;
                ``"provider"`` and ``"model"`` are only used for logging. The
                whole mapping is handed to ``VLMClient``.
            **kwargs: Keyword arguments forwarded unchanged to ``BaseParser``.
        """
        # Per the original author's note, BaseParser.__init__ is what assigns
        # self.file_type, which parse_into_text relies on.
        super().__init__(*args, **kwargs)
        self.markitdown = MarkItDown()
        self.vlm_config = vlm_config
        self.vlm_client = None

        # Only attempt VLM setup when it is explicitly enabled.
        if vlm_config and vlm_config.get("enabled"):
            if not VLMClient:
                # The optional import at module level failed; make the
                # degraded mode visible instead of ignoring it silently.
                logger.warning(
                    "VLM is enabled in the configuration but VLMClient is "
                    "unavailable; images will be left unprocessed"
                )
            else:
                try:
                    self.vlm_client = VLMClient(vlm_config)
                    logger.info(
                        "VLM client initialized: provider=%s, model=%s",
                        vlm_config.get('provider'),
                        vlm_config.get('model'),
                    )
                except Exception as e:
                    # Best effort: a broken VLM setup must not prevent plain
                    # document parsing, so fall back to no client.
                    logger.warning("Failed to initialize VLM client: %s", e)

    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw document bytes to a ``Document`` holding Markdown.

        ``self.file_type`` is normalised to a dot-prefixed extension and
        passed to MarkItDown as a format hint.

        Args:
            content: Raw bytes of the document to convert.

        Returns:
            A ``Document`` whose ``content`` is the converted Markdown, with
            inline images replaced by VLM output when a VLM client exists.

        Raises:
            Any exception raised by ``MarkItDown.convert`` propagates; per the
            original author's note it is meant to be handled by the
            surrounding PipelineParser.
        """
        ext = self.file_type
        if ext and not ext.startswith('.'):
            ext = '.' + ext

        # Deliberately no try/except here (see "Raises" above).
        result = self.markitdown.convert(
            io.BytesIO(content),
            file_extension=ext,
            keep_data_uris=True,  # keep Base64 images so the VLM step can see them
        )
        markdown_content = result.text_content

        # Image post-processing only runs when there is a client and some text.
        if self.vlm_client and markdown_content:
            markdown_content = self._process_images_with_vlm(markdown_content)

        return Document(content=markdown_content)

    def _process_images_with_vlm(self, content: str) -> str:
        """Replace inline Base64 images in Markdown with VLM-generated text.

        Every ``![alt](data:image/...;base64,...)`` occurrence is decoded and
        passed to ``self.vlm_client.analyze_image``. The result is read as a
        mapping with the keys ``"success"``, ``"content"`` and ``"error"``.
        On success the image is replaced by the returned content wrapped in
        ``<!-- Image: ... -->`` / ``<!-- End Image -->`` markers; on failure
        or on any exception the original image markup is kept.

        Args:
            content: Markdown text that may contain inline data-URI images.

        Returns:
            The Markdown text with successfully analysed images replaced.
        """
        if not self.vlm_client:
            # Nothing can be analysed without a client; avoid raising (and
            # logging) one AttributeError per image.
            return content

        def replace_image(match):
            alt_text = match.group(1)
            # Full MIME type such as "image/png". The previous pattern captured
            # only the subtype ("png") even though its fallback was "image/png".
            # NOTE(review): assumes VLMClient.analyze_image expects a full MIME
            # type - confirm against VLMClient.
            mime_type = match.group(2)
            base64_data = match.group(3)

            try:
                # Decode the Base64 payload back into raw image bytes.
                image_bytes = base64.b64decode(base64_data)

                logger.info("Processing image with VLM: %s", alt_text or 'unnamed')
                vlm_result = self.vlm_client.analyze_image(image_bytes, mime_type)

                if vlm_result.get("success"):
                    vlm_content = vlm_result.get("content", "")
                    logger.info(
                        "VLM processed image successfully, content length: %s",
                        len(vlm_content),
                    )
                    # Substitute the image with the text produced by the VLM.
                    return f"<!-- Image: {alt_text} -->\n{vlm_content}\n<!-- End Image -->"

                logger.warning("VLM failed for image: %s", vlm_result.get('error'))
                return match.group(0)  # keep the original image reference
            except Exception as e:
                # Best effort per image: one bad image must not abort the
                # whole document. The traceback is logged for diagnosis.
                logger.exception("Error processing image with VLM: %s", e)
                return match.group(0)  # keep the original image reference

        return self._DATA_URI_IMAGE_RE.sub(replace_image, content)
class MarkitdownParser(PipelineParser):
    """Pipeline parser composed of StdMarkitdownParser and MarkdownParser.

    NOTE(review): presumably PipelineParser runs the listed parsers in order,
    feeding the MarkItDown output into MarkdownParser - confirm against
    PipelineParser.
    """

    # Parser classes handed to PipelineParser, in their original order.
    _parser_cls = (
        StdMarkitdownParser,
        MarkdownParser,
    )