# backend/app/services/file_processor/docx_processor.py

"""
DOCX Text Extractor
"""
from docx import Document
from typing import Dict, List


class DOCXProcessor:
    """Extract plain text and basic document metadata from DOCX files."""

    def extract_text(self, file_path: str) -> str:
        """Extract all text from a DOCX file.

        Args:
            file_path: Location of the DOCX file; handed to ``Document``.

        Returns:
            The non-blank body paragraphs followed by the non-blank table
            cells, separated by blank lines. An empty string when the
            document contains no text.
        """
        return self._collect_text(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with structural counts and core properties.

        The file is parsed once and the same document object is used for
        both the text and the metadata.

        Args:
            file_path: Location of the DOCX file; handed to ``Document``.

        Returns:
            A dict with the keys ``"text"``, ``"paragraphs"``, ``"tables"``,
            ``"sections"`` and ``"metadata"``. ``"metadata"`` holds the
            ``author``, ``title``, ``subject``, ``created`` and ``modified``
            core properties exactly as the document object reports them.
        """
        doc = Document(file_path)
        props = doc.core_properties

        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                # NOTE(review): presumably datetime objects or None, which are
                # not JSON-serializable as-is -- confirm before serializing.
                "created": props.created,
                "modified": props.modified,
            },
        }

    @staticmethod
    def _collect_text(doc) -> str:
        """Join the non-blank paragraph and table-cell text of an open document.

        Args:
            doc: An already-opened document exposing ``paragraphs`` and
                ``tables`` (each table exposing ``rows`` of ``cells``).

        Returns:
            Paragraph text first, then table-cell text, joined by blank lines.
        """
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]

        for table in doc.tables:
            # python-docx reports a merged cell once for every grid position
            # it spans, so remember the underlying cell element and emit each
            # physical cell only once per table.
            seen = set()
            for row in table.rows:
                for cell in row.cells:
                    # Fall back to the cell object itself if the wrapper does
                    # not expose its underlying element.
                    key = getattr(cell, "_tc", cell)
                    if key in seen:
                        continue
                    seen.add(key)
                    if cell.text.strip():
                        text_parts.append(cell.text)

        return "\n\n".join(text_parts)


def process_docx(file_path: str) -> str:
    """Return the text of the DOCX file at ``file_path``.

    Convenience wrapper that builds a throwaway ``DOCXProcessor`` and
    delegates to its ``extract_text`` method.
    """
    return DOCXProcessor().extract_text(file_path)
first-update 2026-03-17 14:36:31 +08:00			`"""`
			`DOCX Text Extractor`
			`"""`
			`from docx import Document`
			`from typing import Dict, List`


			`class DOCXProcessor:`
			`"""Extract text from DOCX files"""`

			`def extract_text(self, file_path: str) -> str:`
			`"""Extract all text from DOCX"""`
			`doc = Document(file_path)`
			`text_parts = []`

			`for para in doc.paragraphs:`
			`if para.text.strip():`
			`text_parts.append(para.text)`

			`# Also extract text from tables`
			`for table in doc.tables:`
			`for row in table.rows:`
			`for cell in row.cells:`
			`if cell.text.strip():`
			`text_parts.append(cell.text)`

			`return "\n\n".join(text_parts)`

			`def extract_with_metadata(self, file_path: str) -> Dict:`
			`"""Extract text with DOCX metadata"""`
			`doc = Document(file_path)`

			`result = {`
			`"text": self.extract_text(file_path),`
			`"paragraphs": len(doc.paragraphs),`
			`"tables": len(doc.tables),`
			`"sections": len(doc.sections),`
			`"metadata": {`
			`"author": doc.core_properties.author,`
			`"title": doc.core_properties.title,`
			`"subject": doc.core_properties.subject,`
			`"created": doc.core_properties.created,`
			`"modified": doc.core_properties.modified`
			`}`
			`}`

			`return result`


			`def process_docx(file_path: str) -> str:`
			`"""Process DOCX file and return text"""`
			`processor = DOCXProcessor()`
			`return processor.extract_text(file_path)`