- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等) - 添加基础解析器和链式解析器 - 添加存储和注册机制 - 添加 gRPC 服务实现 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
16 lines
576 B
Python
16 lines
576 B
Python
from docreader.parser.chain_parser import FirstParser
|
|
from docreader.parser.markitdown_parser import MarkitdownParser
|
|
|
|
|
|
class PDFParser(FirstParser):
    """Parse PDF documents through an ordered chain of parser backends.

    The backends to try are declared in ``_parser_cls``. The chain currently
    contains a single entry:

    1. MarkitdownParser - the only backend attempted for PDF documents.

    NOTE(review): MinerUParser is not part of this chain; it is not listed
    in ``_parser_cls``. If MinerU is meant to be the primary backend with
    MarkitdownParser as its fallback, it must be imported and placed ahead
    of MarkitdownParser in the tuple.

    The chain-walking logic is inherited from FirstParser and is not visible
    here; presumably the result of the first backend that succeeds is
    returned - confirm against docreader.parser.chain_parser.
    """

    # Backend classes in the order they should be tried. Keep the class
    # docstring in sync with this tuple whenever a backend is added or removed.
    _parser_cls = (MarkitdownParser,)
|