refactor: 重构 algorithm 为 ai-core 代码解析服务

- 新增 ai-core 目录，包含代码解析核心服务
- 添加 proto 定义、parser、service 模块
- 添加启动脚本和依赖配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 10:27:08 +08:00
parent f22f823a4a
commit 797518ec76
15 changed files with 1163 additions and 0 deletions
--- a/ai-core/parser/parser.py
+++ b/ai-core/parser/parser.py
@@ -0,0 +1,100 @@
+import logging
+import os
+import tempfile
+from typing import Optional
+from markitdown import MarkItDown
+
+logger = logging.getLogger(__name__)
+
+
+class Parser:
+    """Unified document parser built on MarkItDown.
+
+    Converts either a path/URL or an in-memory byte payload to Markdown.
+    Both public methods catch ``Exception`` and report the outcome as a
+    plain dict with the same keys on success and on failure, so callers
+    can inspect ``result["success"]`` instead of handling exceptions.
+
+    Formats named by the original author: PDF, DOCX, DOC, PPTX, PPT, XLSX,
+    XLS, CSV, images, web pages, Markdown, etc.
+    NOTE(review): real coverage presumably depends on the installed
+    MarkItDown converters -- confirm against the deployment.
+    """
+
+    def __init__(self):
+        """Create the MarkItDown converter shared by all parse calls."""
+        self.markitdown = MarkItDown()
+        logger.info("Parser initialized with MarkItDown")
+
+    @staticmethod
+    def _success(result) -> dict:
+        """Build the success payload from a MarkItDown conversion result."""
+        text = result.text_content
+        length = len(text)
+        logger.info("Parse successful: %d characters", length)
+        return {
+            "success": True,
+            "content": text,
+            "content_length": length,
+            # The result object may not expose ``metadata``; default to {}.
+            "metadata": getattr(result, "metadata", {}),
+        }
+
+    @staticmethod
+    def _failure(error: Exception) -> dict:
+        """Build the failure payload; it mirrors the success keys plus ``error``."""
+        return {
+            "success": False,
+            "content": "",
+            "content_length": 0,
+            # Present on failure too, so result["metadata"] never raises.
+            "metadata": {},
+            "error": str(error),
+        }
+
+    @staticmethod
+    def _remove_temp_file(path: str) -> None:
+        """Delete ``path``, logging (not raising) if the removal fails."""
+        try:
+            os.unlink(path)
+        except OSError as e:
+            # A cleanup problem must not turn a finished parse into an error.
+            logger.warning("Failed to remove temp file %s: %s", path, e)
+
+    def parse(self, file_path: str, file_type: Optional[str] = None) -> dict:
+        """Convert a document to Markdown.
+
+        Args:
+            file_path: File path or URL handed to ``MarkItDown.convert``.
+            file_type: Accepted for interface compatibility; this method
+                does not read it (the original note says MarkItDown
+                detects the type itself).
+
+        Returns:
+            dict: ``success``, ``content``, ``content_length`` and
+            ``metadata``; on failure ``success`` is False, ``content`` is
+            empty and an ``error`` message string is added.
+        """
+        try:
+            logger.info("Parsing file: %s", file_path)
+            return self._success(self.markitdown.convert(file_path))
+        except Exception as e:
+            logger.error("Parse error: %s", e, exc_info=True)
+            return self._failure(e)
+
+    def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None) -> dict:
+        """Convert in-memory file content to Markdown.
+
+        The bytes are written to a temporary file that keeps the extension
+        of ``file_name``; that file is converted by path and then removed.
+
+        Args:
+            content: Raw file bytes.
+            file_name: Original file name; only its extension is used, as
+                the temporary file's suffix.
+            file_type: Accepted for interface compatibility; this method
+                does not read it.
+
+        Returns:
+            dict: Same shape as the result of ``parse``.
+        """
+        temp_path = None
+        try:
+            logger.info("Parsing bytes: %s, size: %d bytes", file_name, len(content))
+
+            suffix = os.path.splitext(file_name)[1]
+            # delete=False: the file is closed before conversion and removed
+            # explicitly in the ``finally`` clause below.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+                # Record the path before writing so a failed write is still
+                # cleaned up.
+                temp_path = temp_file.name
+                temp_file.write(content)
+
+            return self._success(self.markitdown.convert(temp_path))
+        except Exception as e:
+            logger.error("Parse bytes error: %s", e, exc_info=True)
+            return self._failure(e)
+        finally:
+            if temp_path is not None:
+                self._remove_temp_file(temp_path)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: convert a public web page and print a short summary.
+    sample_url = "https://example.com"
+    doc_parser = Parser()
+    outcome = doc_parser.parse(sample_url)
+
+    succeeded = outcome['success']
+    print(f"Success: {succeeded}")
+    length = outcome['content_length']
+    print(f"Content length: {length}")