first-update
This commit is contained in:
53
backend/app/services/file_processor/docx_processor.py
Normal file
53
backend/app/services/file_processor/docx_processor.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
DOCX Text Extractor
|
||||
"""
|
||||
from docx import Document
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class DOCXProcessor:
    """Extract text (and optionally document metadata) from DOCX files."""

    @staticmethod
    def _collect_text(doc) -> str:
        """Join the non-blank paragraph and table-cell text of an open document.

        Args:
            doc: An already-parsed document object exposing ``paragraphs``
                and ``tables`` (as returned by ``Document(...)``).

        Returns:
            All non-blank paragraph texts followed by all non-blank table
            cell texts, separated by blank lines (``"\\n\\n"``). An empty
            string when nothing qualifies.
        """
        text_parts = []

        for para in doc.paragraphs:
            para_text = para.text
            # Whitespace-only paragraphs are skipped; kept text is not stripped.
            if para_text.strip():
                text_parts.append(para_text)

        # Table text is appended after all paragraph text, not interleaved
        # at the table's position in the document.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = cell.text
                    if cell_text.strip():
                        text_parts.append(cell_text)

        return "\n\n".join(text_parts)

    def extract_text(self, file_path: str) -> str:
        """Extract all text from the DOCX file at ``file_path``.

        Args:
            file_path: Location of the DOCX file, passed to ``Document``.

        Returns:
            Paragraph text followed by table-cell text, joined by blank lines.
        """
        return self._collect_text(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with structural counts and core properties.

        The file is parsed once and the same document object is used for the
        text, the counts, and the metadata.

        Args:
            file_path: Location of the DOCX file, passed to ``Document``.

        Returns:
            A dict with keys ``"text"``, ``"paragraphs"``, ``"tables"``,
            ``"sections"`` and ``"metadata"``. ``"metadata"`` maps
            ``"author"``, ``"title"``, ``"subject"``, ``"created"`` and
            ``"modified"`` to the values read from ``doc.core_properties``,
            passed through unchanged.
        """
        doc = Document(file_path)
        props = doc.core_properties

        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }
|
||||
|
||||
|
||||
def process_docx(file_path: str) -> str:
    """Return the extracted text of the DOCX file at ``file_path``.

    Convenience wrapper: builds a ``DOCXProcessor`` and delegates to its
    ``extract_text`` method.
    """
    return DOCXProcessor().extract_text(file_path)
|
||||
Reference in New Issue
Block a user