refactor: 重构 algorithm 为 ai-core 代码解析服务
- 新增 ai-core 目录,包含代码解析核心服务
- 添加 proto 定义、parser、service 模块
- 添加启动脚本和依赖配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
100
ai-core/parser/parser.py
Normal file
100
ai-core/parser/parser.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
from markitdown import MarkItDown
|
||||
|
||||
logger = logging.getLogger(__name__)


class Parser:
    """Unified document parser built on MarkItDown.

    Supported formats: PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, images,
    web pages, Markdown, and others.

    Both public methods report their outcome as a dict instead of raising:
    every result has ``success``, ``content`` and ``content_length``; a
    successful result adds ``metadata`` and a failed one adds ``error``.
    """

    def __init__(self):
        self.markitdown = MarkItDown()
        logger.info("Parser initialized with MarkItDown")

    @staticmethod
    def _success_payload(converted) -> dict:
        """Build the success dict from a converter result object."""
        text = converted.text_content
        return {
            "success": True,
            "content": text,
            "content_length": len(text),
            # Not every result object is assumed to carry metadata.
            "metadata": getattr(converted, "metadata", {}),
        }

    @staticmethod
    def _failure_payload(exc: Exception) -> dict:
        """Build the failure dict that carries the error message of *exc*."""
        return {
            "success": False,
            "content": "",
            "content_length": 0,
            "error": str(exc),
        }

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Delete *path*, logging (not raising) if the removal fails."""
        try:
            os.unlink(path)
        except OSError as cleanup_error:
            # A cleanup problem must not overwrite the parse outcome.
            logger.warning(f"Failed to remove temp file {path}: {cleanup_error}")

    def parse(self, file_path: str, file_type: Optional[str] = None) -> dict:
        """Convert a document to Markdown.

        Args:
            file_path: File path or URL handed to the converter.
            file_type: Optional file type. Currently unused by this method;
                the converter is expected to detect the type itself.

        Returns:
            dict: Markdown content and metadata on success, or an ``error``
            message on failure (see the class docstring for the keys).
        """
        try:
            logger.info(f"Parsing file: {file_path}")

            converted = self.markitdown.convert(file_path)

            logger.info(f"Parse successful: {len(converted.text_content)} characters")

            return self._success_payload(converted)
        except Exception as e:
            # Boundary of the service: report the failure to the caller
            # as data, with the traceback kept in the log.
            logger.error(f"Parse error: {e}", exc_info=True)
            return self._failure_payload(e)

    def parse_bytes(self, content: bytes, file_name: str, file_type: Optional[str] = None) -> dict:
        """Convert in-memory file content to Markdown.

        The bytes are written to a temporary file whose suffix is taken from
        *file_name*, the temporary file is converted, and it is removed
        afterwards whether or not any step failed.

        Args:
            content: Raw bytes of the file.
            file_name: Original file name; only its extension is used.
            file_type: Optional file type. Currently unused by this method.

        Returns:
            dict: Markdown content and metadata on success, or an ``error``
            message on failure (see the class docstring for the keys).
        """
        temp_path = None
        try:
            logger.info(f"Parsing bytes: {file_name}, size: {len(content)} bytes")

            suffix = os.path.splitext(file_name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                # Record the path before writing so that a failed write
                # still gets cleaned up in the ``finally`` below.
                temp_path = temp_file.name
                temp_file.write(content)

            # The file is closed here, so the converter can reopen it.
            converted = self.markitdown.convert(temp_path)

            logger.info(f"Parse successful: {len(converted.text_content)} characters")

            return self._success_payload(converted)
        except Exception as e:
            logger.error(f"Parse bytes error: {e}", exc_info=True)
            return self._failure_payload(e)
        finally:
            if temp_path is not None:
                self._remove_temp_file(temp_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: convert a public web page and print a short summary
    # of the result dict returned by Parser.parse.
    outcome = Parser().parse("https://example.com")
    for label, key in (("Success", "success"), ("Content length", "content_length")):
        print(f"{label}: {outcome[key]}")
|
||||
Reference in New Issue
Block a user