""" PDF Text Extractor """ import pdfplumber from typing import Dict, List, Optional class PDFProcessor: """Extract text from PDF files""" def extract_text(self, file_path: str) -> str: """Extract all text from PDF""" text_parts = [] with pdfplumber.open(file_path) as pdf: for page_num, page in enumerate(pdf.pages, 1): text = page.extract_text() if text: text_parts.append(f"--- Page {page_num} ---\n{text}") return "\n\n".join(text_parts) def extract_pages(self, file_path: str) -> List[Dict]: """Extract text page by page with metadata""" pages = [] with pdfplumber.open(file_path) as pdf: for page_num, page in enumerate(pdf.pages, 1): text = page.extract_text() if text: pages.append({ "page_number": page_num, "text": text.strip(), "word_count": len(text.split()) }) return pages def extract_with_metadata(self, file_path: str) -> Dict: """Extract text with PDF metadata""" result = { "text": "", "pages": [], "metadata": {} } with pdfplumber.open(file_path) as pdf: # Get metadata result["metadata"] = { "page_count": len(pdf.pages), "metadata": pdf.metadata } # Extract pages pages = self.extract_pages(file_path) result["pages"] = pages result["text"] = "\n\n".join([p["text"] for p in pages]) return result def process_pdf(file_path: str) -> str: """Process PDF file and return text""" processor = PDFProcessor() return processor.extract_with_metadata(file_path)["text"]