feat: 完善 AI-Core 文档解析器

- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 15:01:52 +08:00
parent 54473bc378
commit d24b29afe4
19 changed files with 4056 additions and 31 deletions
--- a/ai-core/parser/docx2_parser.py
+++ b/ai-core/parser/docx2_parser.py
@@ -0,0 +1,28 @@
+import logging
+
+from docreader.parser.chain_parser import FirstParser
+from docreader.parser.docx_parser import DocxParser
+from docreader.parser.markitdown_parser import MarkitdownParser
+
+logger = logging.getLogger(__name__)
+
+
+class Docx2Parser(FirstParser):  # .docx parser built on FirstParser
+    _parser_cls = (MarkitdownParser, DocxParser)  # presumably tried in order — TODO confirm
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    your_file = "/path/to/your/file.docx"
+    parser = Docx2Parser(separators=[".", "?", "!", "。", "？", "！"])
+    with open(your_file, "rb") as f:
+        content = f.read()
+
+        document = parser.parse(content)
+        for cc in document.chunks:
+            logger.info(f"chunk: {cc}")
+
+        # document = parser.parse_into_text(content)
+        # logger.info(f"docx content: {document.content}")
+        # logger.info(f"find images {document.images.keys()}")