2026-03-09 10:27:08 +08:00
|
|
|
"""
|
2026-03-09 16:08:44 +08:00
|
|
|
Parser module for the WeKnora document processing system.
|
2026-03-09 10:27:08 +08:00
|
|
|
|
2026-03-09 16:08:44 +08:00
|
|
|
This module provides document parsers for various file formats including:
|
|
|
|
|
- Microsoft Word documents (.doc, .docx)
- Excel spreadsheets
|
|
|
|
|
- PDF documents
|
|
|
|
|
- Markdown files
|
|
|
|
|
- Plain text files
|
|
|
|
|
- Images with text content
|
|
|
|
|
- Web pages
|
|
|
|
|
|
|
|
|
|
The parsers extract content from documents and can split them into
|
|
|
|
|
meaningful chunks for further processing and indexing.
|
2026-03-09 10:27:08 +08:00
|
|
|
"""
|
|
|
|
|
|
2026-03-09 16:08:44 +08:00
|
|
|
from .doc_parser import DocParser
|
|
|
|
|
from .docx2_parser import Docx2Parser
|
|
|
|
|
from .excel_parser import ExcelParser
|
|
|
|
|
from .image_parser import ImageParser
|
|
|
|
|
from .markdown_parser import MarkdownParser
|
2026-03-09 10:27:08 +08:00
|
|
|
from .parser import Parser
|
2026-03-09 16:08:44 +08:00
|
|
|
from .pdf_parser import PDFParser
|
|
|
|
|
from .registry import ParserEngineRegistry, registry
|
|
|
|
|
from .web_parser import WebParser
|
2026-03-09 10:27:08 +08:00
|
|
|
|
2026-03-09 16:08:44 +08:00
|
|
|
# Names re-exported as this package's public API
|
|
|
|
|
# Every entry must match a name bound by the relative imports in this file;
# the declared order is kept exactly as it was.
__all__ = [
    # Parser classes
    "Docx2Parser",
    "DocParser",
    "PDFParser",
    "MarkdownParser",
    "ImageParser",
    "WebParser",
    "Parser",
    "ExcelParser",
    # Names imported from .registry
    "ParserEngineRegistry",
    "registry",
]
|