# backend/app/services/file_processor/pdf_processor.py

"""
PDF Text Extractor
"""
import asyncio
from typing import Dict, List
import pdfplumber


class PDFProcessor:
    """Extract text and basic metadata from PDF files using pdfplumber.

    Each synchronous method opens the file once, reads what it needs and
    closes it again. Every ``*_async`` variant runs its synchronous
    counterpart via ``run_in_executor`` on the running event loop's default
    executor, so the parsing work is not performed inside the coroutine.
    """

    @staticmethod
    def _collect_pages(pdf) -> List[Dict]:
        """Build the per-page records for an already-open pdfplumber PDF.

        Args:
            pdf: An open document as returned by ``pdfplumber.open``.

        Returns:
            One dict per page that yielded text, with the keys
            ``page_number`` (1-based), ``text`` (stripped) and
            ``word_count`` (number of whitespace-separated tokens).
        """
        records = []
        for page_num, page in enumerate(pdf.pages, 1):
            raw_text = page.extract_text()
            # Pages for which extraction yields nothing are skipped, so
            # page numbers in the result may have gaps.
            if raw_text:
                records.append({
                    "page_number": page_num,
                    "text": raw_text.strip(),
                    "word_count": len(raw_text.split()),
                })
        return records

    def extract_text(self, file_path: str) -> str:
        """Return the text of every page, joined into a single string.

        Each page that yields text is prefixed with a ``--- Page N ---``
        header line (N is 1-based); pages are separated by a blank line.
        Pages with no extractable text are omitted.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The concatenated text, or an empty string when no page yields
            any text.
        """
        with pdfplumber.open(file_path) as pdf:
            sections = []
            for page_num, page in enumerate(pdf.pages, 1):
                page_text = page.extract_text()
                if page_text:
                    sections.append(f"--- Page {page_num} ---\n{page_text}")

        return "\n\n".join(sections)

    async def extract_text_async(self, file_path: str) -> str:
        """Run :meth:`extract_text` in the default executor and await it."""
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_text, file_path)

    def extract_pages(self, file_path: str) -> List[Dict]:
        """Return one record per page that contains extractable text.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A list of dicts with the keys ``page_number`` (1-based),
            ``text`` (stripped) and ``word_count``.
        """
        with pdfplumber.open(file_path) as pdf:
            return self._collect_pages(pdf)

    async def extract_pages_async(self, file_path: str) -> List[Dict]:
        """Run :meth:`extract_pages` in the default executor and await it."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_pages, file_path)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Return the text, per-page records and document metadata.

        The file is opened a single time; both the metadata and the page
        records are read from that one open document.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with three keys:

            * ``"text"`` - the stripped page texts joined by blank lines
              (no page headers).
            * ``"pages"`` - the same records :meth:`extract_pages` returns.
            * ``"metadata"`` - a dict holding ``"page_count"`` (total number
              of pages, including those without text) and ``"metadata"``
              (the value of the document's ``metadata`` attribute).
        """
        with pdfplumber.open(file_path) as pdf:
            page_count = len(pdf.pages)
            document_info = pdf.metadata
            pages = self._collect_pages(pdf)

        return {
            "text": "\n\n".join(record["text"] for record in pages),
            "pages": pages,
            "metadata": {
                "page_count": page_count,
                "metadata": document_info,
            },
        }

    async def extract_with_metadata_async(self, file_path: str) -> Dict:
        """Run :meth:`extract_with_metadata` in the default executor."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.extract_with_metadata, file_path
        )


async def process_pdf(file_path: str) -> Dict:
    """Process a PDF file and return its text, pages and metadata.

    Creates a ``PDFProcessor`` and awaits its
    ``extract_with_metadata_async`` method.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The dict produced by ``PDFProcessor.extract_with_metadata_async``
        (keys ``"text"``, ``"pages"`` and ``"metadata"``) - not a bare
        string.
    """
    processor = PDFProcessor()
    return await processor.extract_with_metadata_async(file_path)
first-update 2026-03-17 14:36:31 +08:00			`"""`
			`PDF Text Extractor`
			`"""`
feat(backend): 更新核心模块和文件处理 - 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-17 17:30:11 +08:00			`import asyncio`
			`from typing import Dict, List`
first-update 2026-03-17 14:36:31 +08:00			`import pdfplumber`


			`class PDFProcessor:`
			`"""Extract text from PDF files"""`

			`def extract_text(self, file_path: str) -> str:`
			`"""Extract all text from PDF"""`
			`text_parts = []`

			`with pdfplumber.open(file_path) as pdf:`
			`for page_num, page in enumerate(pdf.pages, 1):`
			`text = page.extract_text()`
			`if text:`
			`text_parts.append(f"--- Page {page_num} ---\n{text}")`

			`return "\n\n".join(text_parts)`

feat(backend): 更新核心模块和文件处理 - 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-17 17:30:11 +08:00			`async def extract_text_async(self, file_path: str) -> str:`
			`"""Extract all text from PDF asynchronously"""`
			`return await asyncio.get_event_loop().run_in_executor(`
			`None, self.extract_text, file_path`
			`)`

first-update 2026-03-17 14:36:31 +08:00			`def extract_pages(self, file_path: str) -> List[Dict]:`
			`"""Extract text page by page with metadata"""`
			`pages = []`

			`with pdfplumber.open(file_path) as pdf:`
			`for page_num, page in enumerate(pdf.pages, 1):`
			`text = page.extract_text()`
			`if text:`
			`pages.append({`
			`"page_number": page_num,`
			`"text": text.strip(),`
			`"word_count": len(text.split())`
			`})`

			`return pages`

feat(backend): 更新核心模块和文件处理 - 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-17 17:30:11 +08:00			`async def extract_pages_async(self, file_path: str) -> List[Dict]:`
			`"""Extract pages asynchronously"""`
			`return await asyncio.get_event_loop().run_in_executor(`
			`None, self.extract_pages, file_path`
			`)`

first-update 2026-03-17 14:36:31 +08:00			`def extract_with_metadata(self, file_path: str) -> Dict:`
			`"""Extract text with PDF metadata"""`
			`result = {`
			`"text": "",`
			`"pages": [],`
			`"metadata": {}`
			`}`

			`with pdfplumber.open(file_path) as pdf:`
			`# Get metadata`
			`result["metadata"] = {`
			`"page_count": len(pdf.pages),`
			`"metadata": pdf.metadata`
			`}`

			`# Extract pages`
			`pages = self.extract_pages(file_path)`
			`result["pages"] = pages`
			`result["text"] = "\n\n".join([p["text"] for p in pages])`

			`return result`

feat(backend): 更新核心模块和文件处理 - 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-17 17:30:11 +08:00			`async def extract_with_metadata_async(self, file_path: str) -> Dict:`
			`"""Extract with metadata asynchronously"""`
			`return await asyncio.get_event_loop().run_in_executor(`
			`None, self.extract_with_metadata, file_path`
			`)`

first-update 2026-03-17 14:36:31 +08:00
feat(backend): 更新核心模块和文件处理 - 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-17 17:30:11 +08:00			`async def process_pdf(file_path: str) -> str:`
first-update 2026-03-17 14:36:31 +08:00			`"""Process PDF file and return text"""`
			`processor = PDFProcessor()`
feat(backend): 更新核心模块和文件处理 - 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-17 17:30:11 +08:00			`return await processor.extract_with_metadata_async(file_path)`