Files
YG-Datasets/backend/app/services/file_processor/docx_processor.py

54 lines
1.5 KiB
Python
Raw Normal View History

2026-03-17 14:36:31 +08:00
"""
DOCX Text Extractor
"""
from docx import Document
from typing import Dict, List
class DOCXProcessor:
    """Extract plain text and basic document properties from DOCX files.

    Parsing is delegated to python-docx's ``Document``; each public method
    takes a filesystem path and loads the file exactly once.
    """

    @staticmethod
    def _collect_text(doc) -> str:
        """Return the non-blank text of an already-loaded document.

        Body paragraphs are emitted first, followed by table cells in
        row-major order, so the result does not reflect how paragraphs and
        tables interleave in the original document. Pieces are joined with
        a blank line (``"\\n\\n"``).

        Args:
            doc: A loaded document exposing ``paragraphs`` and ``tables``.

        Returns:
            The joined text, or ``""`` when nothing non-blank was found.
        """
        parts = [para.text for para in doc.paragraphs if para.text.strip()]

        # python-docx addresses table cells by layout-grid position, so a
        # merged cell shows up once per grid slot it spans.  Track the
        # underlying cell element (falling back to the cell object itself)
        # so each merged cell contributes its text only once.  Keys are
        # ``id()`` values; the dict keeps the objects alive so an id cannot
        # be recycled while we are still iterating.
        seen = {}
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    marker = getattr(cell, "_tc", cell)
                    if id(marker) in seen:
                        continue
                    seen[id(marker)] = marker
                    if cell.text.strip():
                        parts.append(cell.text)

        return "\n\n".join(parts)

    def extract_text(self, file_path: str) -> str:
        """Extract all text from the DOCX file at ``file_path``.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            Non-blank paragraph text followed by non-blank table-cell text,
            separated by blank lines.
        """
        return self._collect_text(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with structural counts and core properties.

        The file is parsed a single time and the same document object is
        used for the text, the counts and the properties.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            A dict with keys ``"text"``, ``"paragraphs"``, ``"tables"``,
            ``"sections"`` and ``"metadata"``.  ``"metadata"`` holds the
            ``author``, ``title``, ``subject``, ``created`` and ``modified``
            core properties exactly as python-docx reports them (no
            conversion is applied here).
        """
        doc = Document(file_path)
        props = doc.core_properties
        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }
def process_docx(file_path: str) -> str:
    """Return the extracted text of the DOCX file at ``file_path``.

    Convenience wrapper: builds a throwaway ``DOCXProcessor`` and delegates
    to its ``extract_text`` method.
    """
    return DOCXProcessor().extract_text(file_path)