- 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
67 lines
2.0 KiB
Python
67 lines
2.0 KiB
Python
"""
DOCX Text Extractor

Extracts paragraph and table text, plus basic document metadata, from DOCX files.
"""
|
|
import asyncio
|
|
from typing import Dict
|
|
from docx import Document
|
|
|
|
|
|
class DOCXProcessor:
    """Extract plain text and basic metadata from DOCX files.

    Files are parsed with ``Document``. The synchronous methods do the
    parsing in the calling thread; the ``*_async`` variants hand the same
    work to the running event loop's default executor and await the result.
    """

    @staticmethod
    def _collect_text(doc) -> str:
        """Join the non-blank paragraph and table-cell text of a parsed document.

        Args:
            doc: An already-parsed document exposing ``paragraphs`` and
                ``tables`` (as returned by ``Document(path)``).

        Returns:
            All non-blank paragraph texts followed by all non-blank table
            cell texts, separated by blank lines (``"\\n\\n"``).
        """
        # Paragraph text first, skipping whitespace-only paragraphs.
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table text is gathered separately from ``doc.tables`` and appended
        # after the paragraph text, table by table and row by row.
        # NOTE(review): python-docx may report a merged cell once per grid
        # position it spans, which would repeat its text here -- confirm
        # whether de-duplication is wanted.
        for table in doc.tables:
            for row in table.rows:
                text_parts.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        return "\n\n".join(text_parts)

    def extract_text(self, file_path: str) -> str:
        """Extract all text from a DOCX file.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            Non-blank paragraph text followed by non-blank table cell text,
            joined with blank lines.
        """
        return self._collect_text(Document(file_path))

    async def extract_text_async(self, file_path: str) -> str:
        """Extract all text from a DOCX file without blocking the event loop.

        Runs :meth:`extract_text` in the running loop's default executor.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            The same string :meth:`extract_text` returns.
        """
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; it never creates a new loop implicitly.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_text, file_path)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with structural counts and core properties.

        The file is parsed once and the same parsed document is used for
        both the text and the metadata.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            A dict with the keys ``"text"``, ``"paragraphs"``, ``"tables"``,
            ``"sections"`` (the last three are counts) and ``"metadata"``,
            itself a dict with ``"author"``, ``"title"``, ``"subject"``,
            ``"created"`` and ``"modified"`` taken from the document's core
            properties.
        """
        doc = Document(file_path)
        props = doc.core_properties

        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }

    async def extract_with_metadata_async(self, file_path: str) -> Dict:
        """Extract text and metadata without blocking the event loop.

        Runs :meth:`extract_with_metadata` in the running loop's default
        executor.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            The same dict :meth:`extract_with_metadata` returns.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.extract_with_metadata, file_path
        )
|
|
|
|
|
|
async def process_docx(file_path: str) -> str:
    """Return the text of the DOCX file at ``file_path``.

    Convenience coroutine: builds a fresh ``DOCXProcessor`` and awaits its
    ``extract_text_async`` method.
    """
    return await DOCXProcessor().extract_text_async(file_path)
|