first-update

2026-03-17 14:36:31 +08:00
parent 72f08aee7c
commit 4eddf05e79
516 changed files with 115270 additions and 1 deletions
--- a/backend/app/services/file_processor/docx_processor.py
+++ b/backend/app/services/file_processor/docx_processor.py
@@ -0,0 +1,53 @@
+"""
+DOCX Text Extractor
+"""
+from docx import Document
+from typing import Dict, List
+
+
+class DOCXProcessor:
+    """Extract text from DOCX files"""
+
+    def extract_text(self, file_path: str) -> str:
+        """Extract all text from DOCX"""
+        doc = Document(file_path)
+        text_parts = []
+
+        for para in doc.paragraphs:
+            if para.text.strip():
+                text_parts.append(para.text)
+
+        # Also extract text from tables
+        for table in doc.tables:
+            for row in table.rows:
+                for cell in row.cells:
+                    if cell.text.strip():
+                        text_parts.append(cell.text)
+
+        return "\n\n".join(text_parts)
+
+    def extract_with_metadata(self, file_path: str) -> Dict:
+        """Extract text with DOCX metadata"""
+        doc = Document(file_path)
+
+        result = {
+            "text": self.extract_text(file_path),
+            "paragraphs": len(doc.paragraphs),
+            "tables": len(doc.tables),
+            "sections": len(doc.sections),
+            "metadata": {
+                "author": doc.core_properties.author,
+                "title": doc.core_properties.title,
+                "subject": doc.core_properties.subject,
+                "created": doc.core_properties.created,
+                "modified": doc.core_properties.modified
+            }
+        }
+
+        return result
+
+
+def process_docx(file_path: str) -> str:
+    """Process DOCX file and return text"""
+    processor = DOCXProcessor()
+    return processor.extract_text(file_path)