feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
107
ai-core/parser/markitdown_parser.py
Normal file
107
ai-core/parser/markitdown_parser.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import base64
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
from docreader.parser.markdown_parser import MarkdownParser
|
||||
|
||||
# 尝试导入 VLMClient
|
||||
try:
|
||||
from parser.vlm_client import VLMClient
|
||||
except ImportError:
|
||||
VLMClient = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StdMarkitdownParser(BaseParser):
    """Standard MarkItDown parser wrapper.

    Uses the ``markitdown`` library to convert a document's raw bytes
    (docx, pptx, pdf, etc.) into Markdown text. When a VLM client is
    configured and available, inline Base64 images in the converted
    Markdown are replaced by the VLM's textual analysis of each image.
    """

    # Markdown image whose target is an inline Base64 ``data:image/...`` URI.
    # Groups: ``alt`` (alt text), ``subtype`` (the part after ``image/``,
    # e.g. ``png``) and ``data`` (the Base64 payload).
    _DATA_URI_IMAGE_RE = re.compile(
        r'!\[(?P<alt>[^\]]*)\]'
        r'\(data:image/(?P<subtype>[^;]+);base64,(?P<data>[A-Za-z0-9+/=]+)\)'
    )

    def __init__(self, *args, vlm_config=None, **kwargs):
        """Create the parser and, if requested, a VLM client.

        Args:
            *args: Forwarded unchanged to ``BaseParser.__init__``.
            vlm_config: Optional mapping. A VLM client is created only when
                its ``"enabled"`` entry is truthy; ``"provider"`` and
                ``"model"`` are read for logging.
            **kwargs: Forwarded unchanged to ``BaseParser.__init__``.
        """
        # NOTE(review): per the original author's note, BaseParser.__init__
        # is what assigns self.file_type — confirm against BaseParser.
        super().__init__(*args, **kwargs)
        self.markitdown = MarkItDown()
        self.vlm_config = vlm_config
        self.vlm_client = None

        if not (vlm_config and vlm_config.get("enabled")):
            return

        if VLMClient is None:
            # The optional import failed; say so instead of silently
            # ignoring an explicitly enabled feature.
            logger.warning(
                "VLM is enabled in the config but VLMClient could not be "
                "imported; images will not be analyzed"
            )
            return

        # Best effort: a broken VLM setup must not prevent plain parsing.
        try:
            self.vlm_client = VLMClient(vlm_config)
        except Exception as e:
            logger.warning("Failed to initialize VLM client: %s", e)
        else:
            logger.info(
                "VLM client initialized: provider=%s, model=%s",
                vlm_config.get("provider"),
                vlm_config.get("model"),
            )

    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw document bytes to a ``Document`` via MarkItDown.

        ``self.file_type`` is passed to MarkItDown as a file-extension hint,
        normalized to start with a dot.

        Args:
            content: Raw bytes of the document to convert.

        Returns:
            A ``Document`` whose content is the converted Markdown, with
            inline images replaced by VLM output when a client is active.

        Raises:
            Any exception raised by ``MarkItDown.convert`` propagates; the
            original author's note says the enclosing PipelineParser is
            expected to handle it.
        """
        ext = self.file_type
        if ext and not ext.startswith('.'):
            ext = '.' + ext

        # keep_data_uris=True keeps embedded images as full data URIs so
        # that _process_images_with_vlm can find and decode them.
        result = self.markitdown.convert(
            io.BytesIO(content),
            file_extension=ext,
            keep_data_uris=True,
        )
        markdown_content = result.text_content

        if self.vlm_client and markdown_content:
            markdown_content = self._process_images_with_vlm(markdown_content)

        return Document(content=markdown_content)

    def _process_images_with_vlm(self, content: str) -> str:
        """Replace inline Base64 images in Markdown with VLM analysis text.

        Each ``![alt](data:image/<subtype>;base64,<payload>)`` occurrence is
        decoded and sent to ``self.vlm_client.analyze_image``. On success the
        image is replaced by the returned content wrapped in
        ``<!-- Image: alt -->`` / ``<!-- End Image -->`` comments. On a
        reported failure or any exception the original image Markdown is
        kept unchanged.

        Args:
            content: Markdown text that may contain data-URI images.

        Returns:
            The Markdown with successfully analyzed images replaced.
        """

        def replace_image(match):
            alt_text = match.group("alt")
            # The regex captures only the subtype ("png"), so rebuild the
            # full MIME type ("image/png") before handing it to the client.
            mime_type = f"image/{match.group('subtype')}"

            try:
                image_bytes = base64.b64decode(match.group("data"))

                logger.info(
                    "Processing image with VLM: %s", alt_text or "unnamed"
                )
                vlm_result = self.vlm_client.analyze_image(image_bytes, mime_type)

                if vlm_result.get("success"):
                    vlm_content = vlm_result.get("content", "")
                    logger.info(
                        "VLM processed image successfully, content length: %s",
                        len(vlm_content),
                    )
                    return f"<!-- Image: {alt_text} -->\n{vlm_content}\n<!-- End Image -->"

                logger.warning(
                    "VLM failed for image: %s", vlm_result.get("error")
                )
            except Exception as e:
                # Best effort per image: one bad image must not abort the
                # whole document.
                logger.error("Error processing image with VLM: %s", e)

            # Failure paths fall through here: keep the original image.
            return match.group(0)

        return self._DATA_URI_IMAGE_RE.sub(replace_image, content)
|
||||
|
||||
|
||||
class MarkitdownParser(PipelineParser):
    """Pipeline parser built from StdMarkitdownParser and MarkdownParser.

    The parser classes are listed in ``_parser_cls`` in that order.
    NOTE(review): how PipelineParser consumes ``_parser_cls`` is not visible
    here — presumably each stage runs in sequence; confirm in chain_parser.
    """

    _parser_cls = (
        StdMarkitdownParser,
        MarkdownParser,
    )
|
||||
Reference in New Issue
Block a user