Files
X-Agents/ai-core/parser/registry.py
DESKTOP-72TV0V4\caoxiaozhu d24b29afe4 feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 15:01:52 +08:00

161 lines
5.4 KiB
Python

import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
from docreader.parser.base_parser import BaseParser
from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.pdf_parser import PDFParser
logger = logging.getLogger(__name__)
BUILTIN_ENGINE = "builtin"
class ParserEngineRegistry:
"""Registry for parser engines.
Each engine maps file extensions to parser classes.
When a requested engine doesn't support a file type, the registry
falls back to the builtin engine automatically.
"""
def __init__(self):
self._engines: Dict[str, Dict[str, Type[BaseParser]]] = {}
self._descriptions: Dict[str, str] = {}
self._check_available: Dict[str, Callable[..., Tuple[bool, str]]] = {}
self._unavailable_hint: Dict[str, str] = {}
def register(
self,
name: str,
file_types: Dict[str, Type[BaseParser]],
description: str = "",
check_available: Callable[..., Tuple[bool, str]] | None = None,
unavailable_hint: str = "",
):
self._engines[name] = file_types
self._descriptions[name] = description
if check_available is not None:
self._check_available[name] = check_available
self._unavailable_hint[name] = unavailable_hint
logger.info(
"Registered parser engine '%s' with file types: %s",
name,
", ".join(file_types.keys()),
)
def get_parser_class(self, engine: str, file_type: str) -> Type[BaseParser]:
"""Resolve parser class for the given engine and file type.
Falls back to builtin engine when the requested engine doesn't
support the file type.
"""
ft = file_type.lower()
if engine and engine in self._engines:
cls = self._engines[engine].get(ft)
if cls:
logger.info("Using engine '%s' for file type '%s'", engine, ft)
return cls
logger.info(
"Engine '%s' does not support '%s', falling back to builtin",
engine,
ft,
)
builtin = self._engines.get(BUILTIN_ENGINE, {})
cls = builtin.get(ft)
if cls:
return cls
raise ValueError(f"Unsupported file type: {file_type}")
def list_engines(self, overrides: Optional[Dict[str, str]] = None) -> List[Dict]:
"""Return metadata for all registered engines, including availability.
Args:
overrides: tenant-level config overrides (e.g. mineru_endpoint, mineru_api_key)
forwarded to each engine's check_available function.
"""
result = []
for name, parsers in self._engines.items():
available = True
unavailable_reason = ""
check = self._check_available.get(name)
if check is not None:
try:
available, unavailable_reason = check(overrides)
except Exception as e:
available = False
unavailable_reason = str(e) or self._unavailable_hint.get(name, "")
if not available and not unavailable_reason:
unavailable_reason = self._unavailable_hint.get(name, "不可用")
result.append(
{
"name": name,
"description": self._descriptions.get(name, ""),
"file_types": sorted(parsers.keys()),
"available": available,
"unavailable_reason": unavailable_reason,
}
)
return result
def get_engine_names(self) -> List[str]:
return list(self._engines.keys())
def _build_default_registry() -> ParserEngineRegistry:
"""Create and populate the default registry with all known engines."""
reg = ParserEngineRegistry()
_image_types = {
ext: ImageParser for ext in ("jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp")
}
reg.register(
BUILTIN_ENGINE,
{
"docx": Docx2Parser,
"doc": DocParser,
"pdf": PDFParser,
"md": MarkdownParser,
"markdown": MarkdownParser,
"xlsx": ExcelParser,
"xls": ExcelParser,
**_image_types,
},
description="内置解析引擎",
)
reg.register(
"markitdown",
{
"md": MarkitdownParser,
"markdown": MarkitdownParser,
"pdf": MarkitdownParser,
"docx": MarkitdownParser,
"doc": MarkitdownParser,
"pptx": MarkitdownParser,
"ppt": MarkitdownParser,
"xlsx": MarkitdownParser,
"xls": MarkitdownParser,
"csv": MarkitdownParser,
},
description="MarkItDown 解析引擎(微软 MarkItDown 库)",
)
# NOTE: Engine listing is managed by Go-side engine registry
# (docparser.ListAllEngines). The Python list_engines method is kept for
# backward compatibility with the gRPC ListEngines RPC but the Go app
# no longer calls it. MinerU engines are handled natively by Go.
return reg
registry = _build_default_registry()