feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等) - 添加基础解析器和链式解析器 - 添加存储和注册机制 - 添加 gRPC 服务实现 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
28
ai-core/parser/docx2_parser.py
Normal file
28
ai-core/parser/docx2_parser.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import logging
|
||||
|
||||
from docreader.parser.chain_parser import FirstParser
|
||||
from docreader.parser.docx_parser import DocxParser
|
||||
from docreader.parser.markitdown_parser import MarkitdownParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Docx2Parser(FirstParser):
|
||||
_parser_cls = (MarkitdownParser, DocxParser)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
your_file = "/path/to/your/file.docx"
|
||||
parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
|
||||
with open(your_file, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
document = parser.parse(content)
|
||||
for cc in document.chunks:
|
||||
logger.info(f"chunk: {cc}")
|
||||
|
||||
# document = parser.parse_into_text(content)
|
||||
# logger.info(f"docx content: {document.content}")
|
||||
# logger.info(f"find images {document.images.keys()}")
|
||||
Reference in New Issue
Block a user