# YG-Datasets/backend/app/services/file_processor/excel_processor.py

"""
Excel/CSV Text Extractor
"""
import asyncio
from typing import Dict, List, Optional

import pandas as pd


class ExcelProcessor:
    """Extract plain text from Excel workbooks and CSV files.

    Files are parsed with pandas and each table is rendered as
    pipe-separated text. Each blocking ``extract_*`` method has an
    ``*_async`` counterpart that runs it in the event loop's default
    executor so file parsing does not block other coroutines.
    """

    def extract_csv(self, file_path: str) -> str:
        """Return the contents of a CSV file as text.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The table rendered by ``_dataframe_to_text``.
        """
        return self._dataframe_to_text(pd.read_csv(file_path))

    async def extract_csv_async(self, file_path: str) -> str:
        """Run ``extract_csv`` in the default executor and await its result.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The same text ``extract_csv`` returns.
        """
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine; get_event_loop() is a legacy entry point.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.extract_csv, file_path)

    def extract_excel(
        self, file_path: str, sheet_name: Optional[str] = None
    ) -> str:
        """Return the contents of an Excel file as text.

        Args:
            file_path: Path of the Excel file to read.
            sheet_name: Name of the single sheet to extract. A falsy value
                (``None`` or ``""``) extracts every sheet.

        Returns:
            The rendered table of the requested sheet, or, when all sheets
            are read, each sheet's table preceded by a
            ``=== Sheet: <name> ===`` heading.
        """
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            return self._dataframe_to_text(df)

        # sheet_name=None makes pandas return a mapping of every sheet.
        sheets = pd.read_excel(file_path, sheet_name=None)
        text_parts = []
        # A distinct loop name avoids shadowing the ``sheet_name`` parameter.
        for name, df in sheets.items():
            text_parts.append(f"=== Sheet: {name} ===\n")
            text_parts.append(self._dataframe_to_text(df))
        return "\n\n".join(text_parts)

    async def extract_excel_async(
        self, file_path: str, sheet_name: Optional[str] = None
    ) -> str:
        """Run ``extract_excel`` in the default executor and await its result.

        Args:
            file_path: Path of the Excel file to read.
            sheet_name: Forwarded unchanged to ``extract_excel``.

        Returns:
            The same text ``extract_excel`` returns.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.extract_excel, file_path, sheet_name
        )

    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
        """Render a DataFrame as pipe-separated text.

        The output is a header line of column labels, a dashed rule of the
        same length, then one line per row. Every value is converted with
        ``str``.

        Args:
            df: The table to render.

        Returns:
            The rendered text, or an empty string when ``df.empty`` is true
            (which includes a frame that has columns but no rows).
        """
        if df.empty:
            return ""

        header = " | ".join(str(col) for col in df.columns)
        lines = [header, "-" * len(header)]

        # itertuples() keeps each column's own type. iterrows() builds one
        # Series per row and upcasts mixed numeric columns, so an integer
        # next to a float column would be rendered as "1.0" instead of "1".
        lines.extend(
            " | ".join(str(val) for val in row)
            for row in df.itertuples(index=False, name=None)
        )
        return "\n".join(lines)

    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
        """Return the text of every sheet of an Excel file, keyed by sheet.

        Args:
            file_path: Path of the Excel file to read.

        Returns:
            A mapping from sheet name to that sheet's rendered text.
        """
        sheets = pd.read_excel(file_path, sheet_name=None)
        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}

    async def extract_all_sheets_async(self, file_path: str) -> Dict[str, str]:
        """Run ``extract_all_sheets`` in the default executor and await it.

        Args:
            file_path: Path of the Excel file to read.

        Returns:
            The same mapping ``extract_all_sheets`` returns.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.extract_all_sheets, file_path
        )

    def get_sheet_names(self, file_path: str) -> List[str]:
        """Return the names of all sheets in an Excel file.

        Args:
            file_path: Path of the Excel file to inspect.

        Returns:
            The workbook's sheet names as reported by ``pd.ExcelFile``.
        """
        # The context manager closes the workbook's file handle, which the
        # bare ``pd.ExcelFile(...)`` call would otherwise leave open.
        with pd.ExcelFile(file_path) as workbook:
            return workbook.sheet_names


async def process_csv(file_path: str) -> str:
    """Extract the text of the CSV file at *file_path*.

    Convenience wrapper that builds a throwaway ``ExcelProcessor`` and
    awaits its ``extract_csv_async`` method.
    """
    text = await ExcelProcessor().extract_csv_async(file_path)
    return text


async def process_excel(file_path: str) -> str:
    """Extract the text of the Excel file at *file_path*.

    Convenience wrapper that builds a throwaway ``ExcelProcessor`` and
    awaits its ``extract_excel_async`` method without naming a sheet.
    """
    text = await ExcelProcessor().extract_excel_async(file_path)
    return text