feat: 完善 AI-Core 文档解析器
- 添加多种文档解析器 (PDF, Word, Excel, Markdown 等)
- 添加基础解析器和链式解析器
- 添加存储和注册机制
- 添加 gRPC 服务实现

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
61
ai-core/parser/base_parser.py
Normal file
61
ai-core/parser/base_parser.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import logging
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from docreader.models.document import Document
|
||||
|
||||
# Module-level logger, named after this module so records can be traced back to it.
logger = logging.getLogger(__name__)
|
||||
# Set this logger's own threshold to INFO so the info-level messages emitted
# by the parser are not dropped at this logger.
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """Abstract base class shared by all document parsers.

    Following the lightweight refactoring, a parser is only responsible for
    extracting markdown text and raw image references from a document.
    Chunking, image storage, OCR, and VLM captioning are handled by the
    Go App module.
    """

    def __init__(
        self,
        file_name: str = "",
        file_type: Optional[str] = None,
        **kwargs,
    ):
        """Record the file name and resolve the file type.

        Args:
            file_name: Name of the file being parsed.
            file_type: Explicit file type. When falsy (``None`` or empty),
                the type is taken from the extension of ``file_name`` with
                its leading dot(s) stripped.
            **kwargs: Accepted but not used by this base class.
        """
        self.file_name = file_name

        if file_type:
            self.file_type = file_type
        else:
            # Fall back to the file name's extension, without the dot.
            _, extension = os.path.splitext(file_name)
            self.file_type = extension.lstrip(".")

        logger.info(
            "Initializing parser for file=%s, type=%s",
            file_name,
            self.file_type,
        )

    @abstractmethod
    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw document bytes into markdown text.

        Subclasses must implement this method.

        Args:
            content: Raw bytes of the document.

        Returns:
            Document with ``content`` (markdown string) and optional
            ``images`` dict mapping storage-relative paths to base64 data.
        """

    def parse(self, content: bytes) -> Document:
        """Parse a document and return its markdown and image references.

        Delegates to ``parse_into_text`` and logs the input size before and
        the extracted character count after. No chunking, OCR, or VLM
        captioning happens here; those are done in Go.

        Args:
            content: Raw bytes of the document.

        Returns:
            The Document produced by ``parse_into_text``.
        """
        parser_name = self.__class__.__name__
        logger.info(
            "Parsing document with %s, bytes: %d",
            parser_name,
            len(content),
        )

        result = self.parse_into_text(content)

        extracted_chars = len(result.content)
        logger.info(
            "Extracted %d characters from %s",
            extracted_chars,
            self.file_name,
        )
        return result
|
||||
Reference in New Issue
Block a user