feat: 增强 AI-Core 文档解析器

- 添加 VLM 客户端支持
- 优化解析器配置
- 添加配置示例文件
- 生成新的 gRPC protobuf 文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 15:42:35 +08:00
parent ab7131eb05
commit 5012a25f99
10 changed files with 1177 additions and 42 deletions
--- a/ai-core/parser/parser.py
+++ b/ai-core/parser/parser.py
@@ -1,39 +1,68 @@
 import logging
 import os
 import tempfile
-from typing import Optional
+from typing import Optional, Dict, Any
 from markitdown import MarkItDown

+from .vlm_client import VLMClient
+from .config import get_vlm_config
+
 logger = logging.getLogger(__name__)


 class Parser:
-    """基于 MarkItDown 的统一文档解析器
-    
+    """基于 MarkItDown + VLM 的统一文档解析器
+
    支持格式：PDF、DOCX、DOC、PPTX、PPT、XLSX、XLS、CSV、图片、网页、Markdown 等
+
+    VLM 解析：
+    - 方式一：启动时配置（config.yaml 或环境变量）
+    - 方式二：gRPC 请求时传入 VLM 配置（优先级更高）
    """

    def __init__(self):
        self.markitdown = MarkItDown()
-        logger.info("Parser initialized with MarkItDown")
+        self.vlm_client: Optional[VLMClient] = None

-    def parse(self, file_path: str, file_type: Optional[str] = None) -> dict:
+        # 尝试加载配置的 VLM
+        vlm_config = get_vlm_config()
+        if vlm_config:
+            self.vlm_client = VLMClient(vlm_config)
+            logger.info(f"VLM enabled: provider={vlm_config.get('provider')}, model={vlm_config.get('model')}")
+        else:
+            logger.info("VLM not configured, using MarkItDown only")
+
+    def set_vlm_config(self, config: Dict[str, Any]) -> None:
+        """Replace this parser's VLM client from ``config``; clear it unless ``config`` has truthy "enabled" and "api_key" entries."""
+        if config and config.get("enabled") and config.get("api_key"):
+            self.vlm_client = VLMClient(config)  # overwrites whatever client __init__ built from get_vlm_config()
+            logger.info(f"VLM enabled: provider={config.get('provider')}, model={config.get('model')}")
+        else:
+            self.vlm_client = None  # NOTE(review): parse()/parse_bytes() call this with per-request config, so the change outlives that call
+            logger.info("VLM disabled")
+
+    def parse(self, file_path: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """解析文档为 Markdown
-        
+
        Args:
            file_path: 文件路径或 URL
            file_type: 文件类型（可选，MarkItDown 会自动检测）
-            
+            vlm_config: VLM 配置（可选，优先级高于全局配置）
+
        Returns:
            dict: 包含 markdown 内容和元数据
        """
+        # 如果有 VLM 配置，覆盖全局配置
+        if vlm_config:
+            self.set_vlm_config(vlm_config)
+
        try:
            logger.info(f"Parsing file: {file_path}")
-            
+
            result = self.markitdown.convert(file_path)
-            
+
            logger.info(f"Parse successful: {len(result.text_content)} characters")
-            
+
            return {
                "success": True,
                "content": result.text_content,
@@ -49,29 +78,40 @@ class Parser:
                "error": str(e)
            }

-    def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None) -> dict:
+    def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None, vlm_config: Optional[Dict[str, Any]] = None) -> dict:
        """解析字节内容为 Markdown
-        
+
        Args:
            content: 文件字节内容
            file_name: 文件名
            file_type: 文件类型（可选）
-            
+            vlm_config: VLM 配置（可选，优先级高于全局配置）
+
        Returns:
            dict: 包含 markdown 内容和元数据
        """
+        # 如果有 VLM 配置，覆盖全局配置
+        if vlm_config:
+            self.set_vlm_config(vlm_config)
+
        try:
            logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes")
-            
+
+            # 检查是否应该使用 VLM（根据文件名自动判断）
+            if self._should_use_vlm(file_name):
+                logger.info("Using VLM for parsing")
+                return self._parse_with_vlm(content, file_name)
+
+            # 否则使用 MarkItDown
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1] or '') as temp_file:
                temp_file.write(content)
                temp_path = temp_file.name
-            
+
            try:
                result = self.markitdown.convert(temp_path)
-                
+
                logger.info(f"Parse successful: {len(result.text_content)} characters")
-                
+
                return {
                    "success": True,
                    "content": result.text_content,
@@ -89,10 +129,69 @@ class Parser:
                "error": str(e)
            }

+    def _should_use_vlm(self, file_name: str) -> bool:
+        """Return True when a VLM client is set and ``file_name`` ends in one of the listed image extensions."""
+        if not self.vlm_client:
+            return False  # without a client, parse_bytes() continues on its MarkItDown path
+
+        # Only image files are routed to the VLM; the extension is lower-cased before the lookup.
+        image_exts = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff']  # NOTE(review): '.tif' is not listed
+        ext = os.path.splitext(file_name)[1].lower()
+        return ext in image_exts
+
+    def _parse_with_vlm(self, content: bytes, file_name: str) -> dict:
+        """Pass image bytes to the VLM client and return a dict with "success", "content", "content_length" and either "metadata" or "error"."""
+        if not self.vlm_client:
+            return {
+                "success": False,
+                "content": "",
+                "content_length": 0,
+                "error": "VLM not configured"
+            }
+
+        # Derive the MIME type from the lower-cased file extension.
+        ext = os.path.splitext(file_name)[1].lower()
+        mime_types = {
+            '.jpg': 'image/jpeg',
+            '.jpeg': 'image/jpeg',
+            '.png': 'image/png',
+            '.gif': 'image/gif',
+            '.bmp': 'image/bmp',
+            '.webp': 'image/webp',
+            '.tiff': 'image/tiff',
+        }
+        mime_type = mime_types.get(ext, 'image/png')  # extensions missing from the table are sent as image/png
+
+        try:
+            result = self.vlm_client.analyze_image(content, mime_type)  # presumably a dict with "success"/"content"/"error" keys; confirm against VLMClient
+
+            if result.get("success"):
+                return {
+                    "success": True,
+                    "content": result["content"],
+                    "content_length": len(result["content"]),
+                    "metadata": {"vlm_used": True}
+                }
+            else:
+                return {
+                    "success": False,
+                    "content": "",
+                    "content_length": 0,
+                    "error": result.get("error", "VLM parsing failed")
+                }
+        except Exception as e:
+            logger.error(f"VLM parsing error: {e}")  # the failure is logged and reported in the result dict, not re-raised
+            return {
+                "success": False,
+                "content": "",
+                "content_length": 0,
+                "error": str(e)
+            }
+

 if __name__ == "__main__":
    parser = Parser()
-    
+
    # 测试
    test_url = "https://example.com"
    result = parser.parse(test_url)