feat: 完善 AI-Core 文档解析器

- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 15:01:52 +08:00
parent 54473bc378
commit d24b29afe4
19 changed files with 4056 additions and 31 deletions

View File

@@ -0,0 +1,28 @@
import logging
from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser
logger = logging.getLogger(__name__)
class Docx2Parser(FirstParser):
_parser_cls = (MarkitdownParser, DocxParser)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.docx"
parser = Docx2Parser(separators=[".", "?", "!", "", "", ""])
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse(content)
for cc in document.chunks:
logger.info(f"chunk: {cc}")
# document = parser.parse_into_text(content)
# logger.info(f"docx content: {document.content}")
# logger.info(f"find images {document.images.keys()}")