- 更新配置模块 (config.py) - 更新数据库连接 (database.py) - 更新主应用入口 (main.py) - 更新数据模型 (models.py) - 更新基础 Schema (base.py) - 更新文件处理器 (docx, excel, pdf) - 更新 Dockerfile Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
86 lines
3.0 KiB
Python
86 lines
3.0 KiB
Python
"""
|
|
Excel/CSV Text Extractor
|
|
"""
|
|
import asyncio
|
|
from typing import Dict, List
|
|
import pandas as pd
|
|
|
|
|
|
class ExcelProcessor:
    """Extract plain text from Excel workbooks and CSV files.

    Every table is rendered the same way: a header line with the column
    names joined by ``" | "``, a dashed underline of the same length, then
    one line per row with the cell values joined by ``" | "``.

    The ``*_async`` methods run their synchronous counterparts in the event
    loop's default executor so the pandas parsing does not block the loop.
    """

    def extract_csv(self, file_path: str) -> str:
        """Read a CSV file and return its contents as text.

        Args:
            file_path: Path of the CSV file, passed to ``pandas.read_csv``.

        Returns:
            The table rendered by ``_dataframe_to_text``.
        """
        df = pd.read_csv(file_path)
        return self._dataframe_to_text(df)

    async def extract_csv_async(self, file_path: str) -> str:
        """Run ``extract_csv`` in the default executor and await the result."""
        # get_running_loop() is the supported way to reach the loop from
        # inside a coroutine; a loop is always running at this point.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_csv, file_path)

    def extract_excel(self, file_path: str, sheet_name: str = None) -> str:
        """Read an Excel file and return its contents as text.

        Args:
            file_path: Path of the workbook, passed to ``pandas.read_excel``.
            sheet_name: Name of the single sheet to extract. When omitted
                (or any falsy value), every sheet is extracted.

        Returns:
            For a single sheet, the table rendered by ``_dataframe_to_text``.
            For all sheets, each sheet's table preceded by a
            ``=== Sheet: <name> ===`` banner, with all parts joined by a
            blank line.
        """
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            return self._dataframe_to_text(df)

        # sheet_name=None makes pandas return a {sheet name: DataFrame} dict.
        sheets = pd.read_excel(file_path, sheet_name=None)
        text_parts = []
        # Use a distinct loop name so the ``sheet_name`` parameter is not
        # shadowed.
        for name, df in sheets.items():
            text_parts.append(f"=== Sheet: {name} ===\n")
            text_parts.append(self._dataframe_to_text(df))
        return "\n\n".join(text_parts)

    async def extract_excel_async(self, file_path: str, sheet_name: str = None) -> str:
        """Run ``extract_excel`` in the default executor and await the result."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.extract_excel, file_path, sheet_name
        )

    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
        """Convert a DataFrame to readable pipe-delimited text.

        Args:
            df: The table to render.

        Returns:
            Header line, dashed underline and one line per row, joined with
            newlines. The header and underline are emitted only when
            ``df.empty`` is false, so an empty frame yields ``""``.
        """
        text_parts = []

        # Column headers plus an underline of matching width.
        if not df.empty:
            header = " | ".join(str(col) for col in df.columns)
            text_parts.append(header)
            text_parts.append("-" * len(header))

        # itertuples() yields each cell with its own column's type, whereas
        # iterrows() coerces a whole row to one dtype (an int next to a float
        # column would be printed as "1.0" instead of "1").
        for row in df.itertuples(index=False, name=None):
            text_parts.append(" | ".join(str(val) for val in row))

        return "\n".join(text_parts)

    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
        """Extract every sheet of an Excel file.

        Args:
            file_path: Path of the workbook, passed to ``pandas.read_excel``.

        Returns:
            A dict mapping each sheet name to that sheet's rendered text.
        """
        sheets = pd.read_excel(file_path, sheet_name=None)
        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}

    async def extract_all_sheets_async(self, file_path: str) -> Dict[str, str]:
        """Run ``extract_all_sheets`` in the default executor and await the result."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_all_sheets, file_path)

    def get_sheet_names(self, file_path: str) -> List[str]:
        """Return the names of all sheets in an Excel file.

        Args:
            file_path: Path of the workbook, passed to ``pandas.ExcelFile``.

        Returns:
            The workbook's sheet names as reported by pandas.
        """
        # The context manager closes the underlying file handle, which the
        # bare ExcelFile(...) call left open.
        with pd.ExcelFile(file_path) as xl:
            return xl.sheet_names
|
|
|
|
|
|
async def process_csv(file_path: str) -> str:
    """Return the text extracted from the CSV file at ``file_path``.

    Convenience coroutine: creates a fresh ``ExcelProcessor`` and awaits its
    ``extract_csv_async`` method.
    """
    return await ExcelProcessor().extract_csv_async(file_path)
|
|
|
|
|
|
async def process_excel(file_path: str) -> str:
    """Return the text extracted from the Excel file at ``file_path``.

    Convenience coroutine: creates a fresh ``ExcelProcessor`` and awaits its
    ``extract_excel_async`` method without naming a sheet.
    """
    return await ExcelProcessor().extract_excel_async(file_path)
|