feat(backend): 更新核心模块和文件处理
- 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
"""
|
||||
PDF Text Extractor
|
||||
"""
|
||||
import asyncio
|
||||
from typing import Dict, List
|
||||
import pdfplumber
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
class PDFProcessor:
|
||||
@@ -20,6 +21,12 @@ class PDFProcessor:
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
async def extract_text_async(self, file_path: str) -> str:
    """Extract all text from a PDF without blocking the event loop.

    Delegates to the synchronous ``extract_text`` method, running it in
    the running event loop's default executor and awaiting its result.

    Args:
        file_path: Path of the PDF file, passed through to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for ``file_path``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is
    # the preferred way to obtain it (get_event_loop() has legacy
    # behavior that is being phased out).
    loop = asyncio.get_running_loop()
    # executor=None selects the loop's default executor.
    return await loop.run_in_executor(None, self.extract_text, file_path)
|
||||
|
||||
def extract_pages(self, file_path: str) -> List[Dict]:
|
||||
"""Extract text page by page with metadata"""
|
||||
pages = []
|
||||
@@ -36,6 +43,12 @@ class PDFProcessor:
|
||||
|
||||
return pages
|
||||
|
||||
async def extract_pages_async(self, file_path: str) -> List[Dict]:
    """Extract per-page data from a PDF without blocking the event loop.

    Delegates to the synchronous ``extract_pages`` method, running it in
    the running event loop's default executor and awaiting its result.

    Args:
        file_path: Path of the PDF file, passed through to ``extract_pages``.

    Returns:
        Whatever ``extract_pages`` returns for ``file_path``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is
    # the preferred way to obtain it (get_event_loop() has legacy
    # behavior that is being phased out).
    loop = asyncio.get_running_loop()
    # executor=None selects the loop's default executor.
    return await loop.run_in_executor(None, self.extract_pages, file_path)
|
||||
|
||||
def extract_with_metadata(self, file_path: str) -> Dict:
|
||||
"""Extract text with PDF metadata"""
|
||||
result = {
|
||||
@@ -58,8 +71,14 @@ class PDFProcessor:
|
||||
|
||||
return result
|
||||
|
||||
async def extract_with_metadata_async(self, file_path: str) -> Dict:
    """Extract text plus PDF metadata without blocking the event loop.

    Delegates to the synchronous ``extract_with_metadata`` method, running
    it in the running event loop's default executor and awaiting its
    result.

    Args:
        file_path: Path of the PDF file, passed through to
            ``extract_with_metadata``.

    Returns:
        Whatever ``extract_with_metadata`` returns for ``file_path``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is
    # the preferred way to obtain it (get_event_loop() has legacy
    # behavior that is being phased out).
    loop = asyncio.get_running_loop()
    # executor=None selects the loop's default executor.
    return await loop.run_in_executor(
        None, self.extract_with_metadata, file_path
    )
|
||||
|
||||
async def process_pdf(file_path: str) -> str:
    """Process a PDF file and return its extracted text.

    Creates a ``PDFProcessor``, awaits its ``extract_with_metadata_async``
    method and returns the ``"text"`` entry of the resulting dict.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The value stored under the ``"text"`` key of the extraction result.
    """
    processor = PDFProcessor()
    result = await processor.extract_with_metadata_async(file_path)
    # extract_with_metadata_async resolves to a dict; returning it whole
    # would contradict the declared ``str`` return type, so select the
    # "text" entry just as the synchronous version of this function did.
    return result["text"]
|
||||
|
||||
Reference in New Issue
Block a user