Files
X-Agents/ai-core/parser/base_parser.py
DESKTOP-72TV0V4\caoxiaozhu d24b29afe4 feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 15:01:52 +08:00

62 lines
1.7 KiB
Python

# -*- coding: utf-8 -*-
import logging
import os
from abc import ABC, abstractmethod
from typing import Optional
from docreader.models.document import Document
# Module-scoped logger, named after this module; its threshold is set to INFO.
logger = logging.getLogger(name=__name__)
logger.setLevel(level=logging.INFO)
class BaseParser(ABC):
    """Abstract base class for document parsers.

    Following the lightweight refactoring, a parser is responsible only for
    producing markdown text plus raw image references. Chunking, image
    storage, OCR, and VLM captioning are handled by the Go App module.
    """

    def __init__(
        self,
        file_name: str = "",
        file_type: Optional[str] = None,
        **kwargs,
    ):
        """Record the file name and resolve the file type.

        Args:
            file_name: Name of the file being parsed; also the source of the
                fallback file type.
            file_type: Explicit file type. When falsy (``None`` or ``""``),
                the extension of ``file_name`` with leading dots removed is
                used instead.
            **kwargs: Accepted but not used by this base class.
        """
        self.file_name = file_name
        if not file_type:
            # No explicit type given: fall back to the file name's extension.
            _, extension = os.path.splitext(file_name)
            file_type = extension.lstrip(".")
        self.file_type = file_type
        logger.info(
            "Initializing parser for file=%s, type=%s", file_name, self.file_type
        )

    @abstractmethod
    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw document bytes into markdown text.

        Subclasses must implement this method.

        Args:
            content: Raw bytes of the document.

        Returns:
            A Document whose ``content`` is the markdown string, optionally
            carrying an ``images`` dict that maps storage-relative paths to
            base64 data.
        """

    def parse(self, content: bytes) -> Document:
        """Run ``parse_into_text`` on ``content`` and log progress around it.

        Only markdown and image references are produced here; there is no
        chunking, OCR, or VLM captioning at this stage (those are done in Go).

        Args:
            content: Raw bytes of the document.

        Returns:
            The Document returned by ``parse_into_text``, unchanged.
        """
        parser_name = self.__class__.__name__
        byte_count = len(content)
        logger.info("Parsing document with %s, bytes: %d", parser_name, byte_count)

        parsed = self.parse_into_text(content)

        char_count = len(parsed.content)
        logger.info("Extracted %d characters from %s", char_count, self.file_name)
        return parsed