first-update
This commit is contained in:
3
backend/app/services/file_processor/__init__.py
Normal file
3
backend/app/services/file_processor/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
File Processing Services
|
||||
"""
|
||||
53
backend/app/services/file_processor/docx_processor.py
Normal file
53
backend/app/services/file_processor/docx_processor.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
DOCX Text Extractor
|
||||
"""
|
||||
from docx import Document
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class DOCXProcessor:
    """Extract text and basic metadata from DOCX files."""

    def extract_text(self, file_path: str) -> str:
        """Extract all text from a DOCX file.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            The text of every non-blank paragraph, followed by the text of
            every non-blank table cell, joined by blank lines.
        """
        return self._collect_text(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with structural counts and core properties.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            A dict with the keys ``"text"``, ``"paragraphs"``, ``"tables"``,
            ``"sections"`` and ``"metadata"``.  ``"metadata"`` holds the
            ``author``, ``title``, ``subject``, ``created`` and ``modified``
            values exactly as the document's core properties report them.
        """
        # Load the document once and reuse it for the text, the counts and
        # the core properties instead of parsing the file a second time.
        doc = Document(file_path)
        props = doc.core_properties

        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }

    @staticmethod
    def _collect_text(doc) -> str:
        """Join the non-blank paragraph and table-cell text of a loaded document."""
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table content is appended after all body paragraphs, cell by cell.
        # NOTE(review): merged cells may be reported more than once per row
        # by the library -- confirm whether duplicated text matters here.
        for table in doc.tables:
            for row in table.rows:
                text_parts.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        return "\n\n".join(text_parts)
|
||||
|
||||
|
||||
def process_docx(file_path: str) -> str:
    """Return the text of the DOCX file at ``file_path``.

    Convenience wrapper: builds a ``DOCXProcessor`` and delegates to its
    ``extract_text`` method.
    """
    return DOCXProcessor().extract_text(file_path)
|
||||
66
backend/app/services/file_processor/excel_processor.py
Normal file
66
backend/app/services/file_processor/excel_processor.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
Excel/CSV Text Extractor
|
||||
"""
|
||||
import pandas as pd
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class ExcelProcessor:
    """Extract text from Excel and CSV files."""

    def extract_csv(self, file_path: str) -> str:
        """Read a CSV file and render it as pipe-separated text.

        Args:
            file_path: Location of the CSV file, passed to ``pd.read_csv``.

        Returns:
            The table rendered by ``_dataframe_to_text``.
        """
        return self._dataframe_to_text(pd.read_csv(file_path))

    def extract_excel(self, file_path: str, sheet_name: str = None) -> str:
        """Read one sheet, or every sheet, of an Excel file as text.

        Args:
            file_path: Location of the workbook.
            sheet_name: Name of the single sheet to read.  Any falsy value
                (the default ``None``, or an empty string) selects all
                sheets.

        Returns:
            For a single sheet, the table text.  For all sheets, each
            sheet's text preceded by a ``=== Sheet: <name> ===`` banner,
            with the pieces separated by blank lines.
        """
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            return self._dataframe_to_text(df)

        # No sheet requested: emit every sheet behind its own banner.
        text_parts = []
        for name, sheet_text in self.extract_all_sheets(file_path).items():
            text_parts.append(f"=== Sheet: {name} ===\n")
            text_parts.append(sheet_text)
        return "\n\n".join(text_parts)

    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
        """Convert a DataFrame to readable text.

        The output is a header line of column names joined by ``" | "``, a
        dashed underline of the same length, then one line per row.  A frame
        that pandas reports as empty yields an empty string (no header).
        """
        text_parts = []

        if not df.empty:
            header = " | ".join(str(col) for col in df.columns)
            text_parts.append(header)
            text_parts.append("-" * len(header))

            for _, row in df.iterrows():
                text_parts.append(" | ".join(str(val) for val in row.values))

        return "\n".join(text_parts)

    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
        """Return a mapping of sheet name to rendered text for every sheet."""
        # sheet_name=None makes pandas return a dict of all sheets.
        sheets = pd.read_excel(file_path, sheet_name=None)
        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}

    def get_sheet_names(self, file_path: str) -> List[str]:
        """Return the names of all sheets in an Excel file."""
        # The context manager closes the workbook handle once the names are
        # read; the names are evaluated before the handle is released.
        with pd.ExcelFile(file_path) as xl:
            return xl.sheet_names
|
||||
|
||||
|
||||
def process_csv(file_path: str) -> str:
    """Return the text rendering of the CSV file at ``file_path``.

    Convenience wrapper: builds an ``ExcelProcessor`` and delegates to its
    ``extract_csv`` method.
    """
    return ExcelProcessor().extract_csv(file_path)
|
||||
|
||||
|
||||
def process_excel(file_path: str) -> str:
    """Return the text rendering of the Excel file at ``file_path``.

    Convenience wrapper: builds an ``ExcelProcessor`` and calls its
    ``extract_excel`` method without a sheet name.
    """
    return ExcelProcessor().extract_excel(file_path)
|
||||
65
backend/app/services/file_processor/pdf_processor.py
Normal file
65
backend/app/services/file_processor/pdf_processor.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
PDF Text Extractor
|
||||
"""
|
||||
import pdfplumber
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
class PDFProcessor:
    """Extract text from PDF files."""

    def extract_text(self, file_path: str) -> str:
        """Extract all text from a PDF, with a marker before each page.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The text of every page that yields text, each prefixed with a
            ``--- Page N ---`` line (N counted from 1), separated by blank
            lines.  Pages with no extractable text are skipped.
        """
        text_parts = []

        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text()
                if text:
                    text_parts.append(f"--- Page {page_num} ---\n{text}")

        return "\n\n".join(text_parts)

    def extract_pages(self, file_path: str) -> List[Dict]:
        """Extract text page by page.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            One dict per page that yields text, with the keys
            ``"page_number"`` (counted from 1), ``"text"`` (stripped) and
            ``"word_count"`` (whitespace-separated tokens).
        """
        with pdfplumber.open(file_path) as pdf:
            return self._pages_from_pdf(pdf)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text, per-page records and document metadata.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys ``"text"`` (page texts joined by blank
            lines, without page markers), ``"pages"`` (same records as
            ``extract_pages``) and ``"metadata"`` (a dict holding
            ``"page_count"`` and the raw ``"metadata"`` reported for the PDF).
        """
        # Read metadata and pages from a single open handle rather than
        # reopening and reparsing the file for the page extraction.
        with pdfplumber.open(file_path) as pdf:
            metadata = {
                "page_count": len(pdf.pages),
                "metadata": pdf.metadata,
            }
            pages = self._pages_from_pdf(pdf)

        return {
            "text": "\n\n".join(p["text"] for p in pages),
            "pages": pages,
            "metadata": metadata,
        }

    @staticmethod
    def _pages_from_pdf(pdf) -> List[Dict]:
        """Build the per-page records for an already-open PDF."""
        pages = []
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:
                pages.append({
                    "page_number": page_num,
                    "text": text.strip(),
                    "word_count": len(text.split()),
                })
        return pages
|
||||
|
||||
|
||||
def process_pdf(file_path: str) -> str:
    """Return the text of the PDF file at ``file_path``.

    Convenience wrapper: builds a ``PDFProcessor``, runs its
    ``extract_with_metadata`` method and keeps only the ``"text"`` entry
    of the result.
    """
    extracted = PDFProcessor().extract_with_metadata(file_path)
    return extracted["text"]
|
||||
Reference in New Issue
Block a user