# YG-Datasets/backend/app/services/file_processor/docx_processor.py

"""
DOCX Text Extractor
"""
import asyncio
from typing import Dict
from docx import Document


class DOCXProcessor:
    """Extract text and basic metadata from DOCX files.

    Parsing is delegated to python-docx's ``Document``. The ``*_async``
    methods run their blocking synchronous counterparts in the running
    event loop's default executor.
    """

    @staticmethod
    def _collect_text(doc) -> str:
        """Join the non-blank paragraph and table-cell text of a parsed document.

        Args:
            doc: An already-opened python-docx document.

        Returns:
            All ``doc.paragraphs`` text followed by all table-cell text,
            separated by blank lines. Entries that are empty or
            whitespace-only are skipped.
        """
        parts = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table text is appended after all paragraph text rather than
        # interleaved with it.
        # NOTE(review): python-docx appears to report a merged cell once per
        # grid position it spans, which would repeat its text here - confirm
        # whether de-duplication is wanted.
        for table in doc.tables:
            for row in table.rows:
                parts.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        return "\n\n".join(parts)

    def extract_text(self, file_path: str) -> str:
        """Extract all text from the DOCX file at ``file_path``.

        Args:
            file_path: Path of the DOCX file to open.

        Returns:
            Paragraph text followed by table-cell text, joined with blank
            lines.
        """
        return self._collect_text(Document(file_path))

    async def extract_text_async(self, file_path: str) -> str:
        """Run :meth:`extract_text` in the default executor and await its result."""
        # A coroutine always has a running loop; get_running_loop() is the
        # recommended way to reach it.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_text, file_path)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with structural counts and core properties.

        The file is parsed once; the text, the counts and the metadata all
        come from that single parsed document.

        Args:
            file_path: Path of the DOCX file to open.

        Returns:
            A dict with keys ``"text"``, ``"paragraphs"``, ``"tables"``,
            ``"sections"`` (the three counts are ``len()`` of the matching
            document collections) and ``"metadata"``, a dict holding the
            ``author``, ``title``, ``subject``, ``created`` and ``modified``
            core properties exactly as python-docx returns them.
        """
        doc = Document(file_path)
        props = doc.core_properties

        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }

    async def extract_with_metadata_async(self, file_path: str) -> Dict:
        """Run :meth:`extract_with_metadata` in the default executor and await it."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.extract_with_metadata, file_path
        )


async def process_docx(file_path: str) -> str:
    """Return the text of the DOCX file at ``file_path``.

    Convenience coroutine: builds a fresh ``DOCXProcessor`` and awaits its
    ``extract_text_async`` method.
    """
    return await DOCXProcessor().extract_text_async(file_path)