""" DOCX Text Extractor """ import asyncio from typing import Dict from docx import Document class DOCXProcessor: """Extract text from DOCX files""" def extract_text(self, file_path: str) -> str: """Extract all text from DOCX""" doc = Document(file_path) text_parts = [] for para in doc.paragraphs: if para.text.strip(): text_parts.append(para.text) # Also extract text from tables for table in doc.tables: for row in table.rows: for cell in row.cells: if cell.text.strip(): text_parts.append(cell.text) return "\n\n".join(text_parts) async def extract_text_async(self, file_path: str) -> str: """Extract all text from DOCX asynchronously""" return await asyncio.get_event_loop().run_in_executor( None, self.extract_text, file_path ) def extract_with_metadata(self, file_path: str) -> Dict: """Extract text with DOCX metadata""" doc = Document(file_path) result = { "text": self.extract_text(file_path), "paragraphs": len(doc.paragraphs), "tables": len(doc.tables), "sections": len(doc.sections), "metadata": { "author": doc.core_properties.author, "title": doc.core_properties.title, "subject": doc.core_properties.subject, "created": doc.core_properties.created, "modified": doc.core_properties.modified } } return result async def extract_with_metadata_async(self, file_path: str) -> Dict: """Extract with metadata asynchronously""" return await asyncio.get_event_loop().run_in_executor( None, self.extract_with_metadata, file_path ) async def process_docx(file_path: str) -> str: """Process DOCX file and return text""" processor = DOCXProcessor() return await processor.extract_text_async(file_path)