first-update

This commit is contained in:
2026-03-17 14:36:31 +08:00
parent 72f08aee7c
commit 4eddf05e79
516 changed files with 115270 additions and 1 deletions

View File

@@ -0,0 +1,3 @@
"""
File Processing Services
"""

View File

@@ -0,0 +1,53 @@
"""
DOCX Text Extractor
"""
from docx import Document
from typing import Dict, List
class DOCXProcessor:
    """Extract text and basic metadata from DOCX files via python-docx."""

    def extract_text(self, file_path: str) -> str:
        """Extract all text from a DOCX file.

        Args:
            file_path: Location of the DOCX file; handed to ``Document``.

        Returns:
            The non-blank paragraph texts followed by the non-blank table
            cell texts, separated by blank lines.
        """
        return self._collect_text(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text together with document counts and core properties.

        Args:
            file_path: Location of the DOCX file; handed to ``Document``.

        Returns:
            A dict with the keys ``text``, ``paragraphs``, ``tables``,
            ``sections`` and ``metadata``; ``metadata`` holds the author,
            title, subject, created and modified core properties.
        """
        # Parse the file once and reuse the document for the text, the
        # counts and the properties instead of loading it a second time.
        doc = Document(file_path)
        props = doc.core_properties
        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }

    @staticmethod
    def _collect_text(doc) -> str:
        """Join the non-blank paragraph and table cell texts of ``doc``."""
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]
        # Table content is appended after all paragraphs, cell by cell.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        text_parts.append(cell.text)
        return "\n\n".join(text_parts)
def process_docx(file_path: str) -> str:
    """Return the text that a fresh DOCXProcessor extracts from file_path."""
    return DOCXProcessor().extract_text(file_path)

View File

@@ -0,0 +1,66 @@
"""
Excel/CSV Text Extractor
"""
import pandas as pd
from typing import Dict, List
class ExcelProcessor:
    """Extract text from Excel and CSV files via pandas."""

    def extract_csv(self, file_path: str) -> str:
        """Read a CSV file with ``pd.read_csv`` and render it as text.

        Args:
            file_path: Location of the CSV file.

        Returns:
            The table rendered by ``_dataframe_to_text``.
        """
        df = pd.read_csv(file_path)
        return self._dataframe_to_text(df)

    def extract_excel(self, file_path: str, sheet_name: str = None) -> str:
        """Render one sheet, or every sheet, of an Excel file as text.

        Args:
            file_path: Location of the Excel file.
            sheet_name: Sheet to read. When omitted (or otherwise falsy),
                all sheets are read.

        Returns:
            The rendered sheet, or for all sheets a ``=== Sheet: name ===``
            header followed by each rendered sheet, separated by blank lines.
        """
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            return self._dataframe_to_text(df)

        # sheet_name=None makes pandas return a mapping of every sheet.
        sheets = pd.read_excel(file_path, sheet_name=None)
        text_parts = []
        # A distinct loop name keeps the sheet_name parameter intact.
        for name, df in sheets.items():
            text_parts.append(f"=== Sheet: {name} ===\n")
            text_parts.append(self._dataframe_to_text(df))
        return "\n\n".join(text_parts)

    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
        """Convert a DataFrame to pipe-separated, readable text.

        Args:
            df: The table to render.

        Returns:
            A header line, a dashed rule of the same length, and one line
            per row; an empty string when ``df.empty`` is true.
        """
        if df.empty:
            return ""

        header = " | ".join(str(col) for col in df.columns)
        lines = [header, "-" * len(header)]
        # itertuples reads each column with its own dtype. iterrows builds
        # one Series per row, which upcasts mixed int/float rows so that
        # integer cells would be rendered as "1.0".
        lines.extend(
            " | ".join(str(val) for val in row)
            for row in df.itertuples(index=False, name=None)
        )
        return "\n".join(lines)

    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
        """Render every sheet of an Excel file.

        Args:
            file_path: Location of the Excel file.

        Returns:
            A dict mapping each sheet name to its rendered text.
        """
        sheets = pd.read_excel(file_path, sheet_name=None)
        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}

    def get_sheet_names(self, file_path: str) -> List[str]:
        """Return the sheet names of an Excel file.

        Args:
            file_path: Location of the Excel file.
        """
        # The context manager closes the workbook handle once the names
        # have been read.
        with pd.ExcelFile(file_path) as xl:
            return xl.sheet_names
def process_csv(file_path: str) -> str:
    """Return the text that a fresh ExcelProcessor extracts from a CSV file."""
    return ExcelProcessor().extract_csv(file_path)
def process_excel(file_path: str) -> str:
    """Return the text that a fresh ExcelProcessor extracts from an Excel file."""
    return ExcelProcessor().extract_excel(file_path)

View File

@@ -0,0 +1,65 @@
"""
PDF Text Extractor
"""
import pdfplumber
from typing import Dict, List, Optional
class PDFProcessor:
    """Extract text from PDF files via pdfplumber."""

    def extract_text(self, file_path: str) -> str:
        """Extract all text from a PDF, with a marker before each page.

        Args:
            file_path: Location of the PDF file; handed to ``pdfplumber.open``.

        Returns:
            Each page's text prefixed by ``--- Page N ---`` (N counted from
            1), separated by blank lines. Pages for which ``extract_text``
            returns nothing are left out.
        """
        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text()
                if text:
                    text_parts.append(f"--- Page {page_num} ---\n{text}")
        return "\n\n".join(text_parts)

    def extract_pages(self, file_path: str) -> List[Dict]:
        """Extract text page by page.

        Args:
            file_path: Location of the PDF file; handed to ``pdfplumber.open``.

        Returns:
            One dict per page that yielded text, with the keys
            ``page_number``, ``text`` and ``word_count``.
        """
        with pdfplumber.open(file_path) as pdf:
            return self._pages_from_pdf(pdf)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Extract text, per-page records and PDF metadata.

        Args:
            file_path: Location of the PDF file; handed to ``pdfplumber.open``.

        Returns:
            A dict with ``text`` (the page texts separated by blank lines),
            ``pages`` (the same records ``extract_pages`` returns) and
            ``metadata`` (``page_count`` plus the document's ``metadata``).
        """
        # Open the file once and read both the metadata and the pages from
        # the same handle instead of opening it a second time.
        with pdfplumber.open(file_path) as pdf:
            metadata = {
                "page_count": len(pdf.pages),
                "metadata": pdf.metadata,
            }
            pages = self._pages_from_pdf(pdf)

        return {
            "text": "\n\n".join(p["text"] for p in pages),
            "pages": pages,
            "metadata": metadata,
        }

    @staticmethod
    def _pages_from_pdf(pdf) -> List[Dict]:
        """Build the per-page records for an already opened PDF."""
        pages = []
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:
                pages.append({
                    "page_number": page_num,
                    "text": text.strip(),
                    "word_count": len(text.split()),
                })
        return pages
def process_pdf(file_path: str) -> str:
    """Return the "text" entry a fresh PDFProcessor builds for file_path."""
    extracted = PDFProcessor().extract_with_metadata(file_path)
    return extracted["text"]