feat(backend): 更新核心模块和文件处理

- 更新配置模块 (config.py)
- 更新数据库连接 (database.py)
- 更新主应用入口 (main.py)
- 更新数据模型 (models.py)
- 更新基础 Schema (base.py)
- 更新文件处理器 (docx, excel, pdf)
- 更新 Dockerfile

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 17:30:11 +08:00
parent db11429290
commit 47d1da7cea
10 changed files with 393 additions and 189 deletions
--- a/backend/app/services/file_processor/pdf_processor.py
+++ b/backend/app/services/file_processor/pdf_processor.py
@@ -1,8 +1,9 @@
 """
 PDF Text Extractor
 """
+import asyncio
+from typing import Dict, List
 import pdfplumber
-from typing import Dict, List, Optional


 class PDFProcessor:
@@ -20,6 +21,12 @@ class PDFProcessor:

        return "\n\n".join(text_parts)

+    async def extract_text_async(self, file_path: str) -> str:
+        """Awaitable wrapper: run the synchronous extract_text(file_path) in the loop's default executor and return its result"""
+        return await asyncio.get_running_loop().run_in_executor(  # inside a coroutine the running loop is the one to use
+            None, self.extract_text, file_path
+        )
+
    def extract_pages(self, file_path: str) -> List[Dict]:
        """Extract text page by page with metadata"""
        pages = []
@@ -36,6 +43,12 @@ class PDFProcessor:

        return pages

+    async def extract_pages_async(self, file_path: str) -> List[Dict]:
+        """Awaitable wrapper: run the synchronous extract_pages(file_path) in the loop's default executor and return its result"""
+        return await asyncio.get_running_loop().run_in_executor(  # inside a coroutine the running loop is the one to use
+            None, self.extract_pages, file_path
+        )
+
    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text with PDF metadata"""
        result = {
@@ -58,8 +71,14 @@ class PDFProcessor:

        return result

+    async def extract_with_metadata_async(self, file_path: str) -> Dict:
+        """Awaitable wrapper: run the synchronous extract_with_metadata(file_path) in the loop's default executor and return its result"""
+        return await asyncio.get_running_loop().run_in_executor(  # inside a coroutine the running loop is the one to use
+            None, self.extract_with_metadata, file_path
+        )

-def process_pdf(file_path: str) -> str:
+
+async def process_pdf(file_path: str) -> str:
     """Process PDF file and return text"""
     processor = PDFProcessor()
-    return processor.extract_with_metadata(file_path)["text"]
+    return (await processor.extract_with_metadata_async(file_path))["text"]  # return only the "text" entry, as the sync version did