first-update
This commit is contained in:
3
backend/app/services/__init__.py
Normal file
3
backend/app/services/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Services module
|
||||
"""
|
||||
3
backend/app/services/file_processor/__init__.py
Normal file
3
backend/app/services/file_processor/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
File Processing Services
|
||||
"""
|
||||
53
backend/app/services/file_processor/docx_processor.py
Normal file
53
backend/app/services/file_processor/docx_processor.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
DOCX Text Extractor
|
||||
"""
|
||||
from docx import Document
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class DOCXProcessor:
    """Extract plain text and basic metadata from DOCX files via python-docx."""

    def extract_text(self, file_path: str) -> str:
        """Return the non-blank paragraph and table-cell text of a DOCX file.

        Args:
            file_path: Path of the DOCX file to open.

        Returns:
            All body paragraphs followed by all table cells, separated by
            blank lines. Table text is appended after the paragraphs, so the
            original interleaving of tables and prose is not preserved.
        """
        return self._collect_text(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Return the document text together with structural counts and core properties.

        The file is parsed once and the same document object feeds both the
        text extraction and the metadata lookups.

        Args:
            file_path: Path of the DOCX file to open.

        Returns:
            A dict with the keys ``text``, ``paragraphs``, ``tables``,
            ``sections`` and ``metadata`` (author, title, subject, created,
            modified as reported by the document's core properties).
        """
        doc = Document(file_path)
        props = doc.core_properties

        return {
            "text": self._collect_text(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }

    def _collect_text(self, doc) -> str:
        """Join the non-blank paragraph and table-cell text of an opened document."""
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]

        # NOTE(review): python-docx may report a merged cell once per grid
        # column it spans, which would repeat its text here - confirm.
        for table in doc.tables:
            for row in table.rows:
                text_parts.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        return "\n\n".join(text_parts)
|
||||
|
||||
|
||||
def process_docx(file_path: str) -> str:
    """Return the text that DOCXProcessor.extract_text yields for file_path."""
    return DOCXProcessor().extract_text(file_path)
|
||||
66
backend/app/services/file_processor/excel_processor.py
Normal file
66
backend/app/services/file_processor/excel_processor.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
Excel/CSV Text Extractor
|
||||
"""
|
||||
import pandas as pd
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class ExcelProcessor:
    """Render CSV files and Excel workbooks as pipe-delimited plain text."""

    def extract_csv(self, file_path: str) -> str:
        """Read a CSV file with pandas and return it as a text table."""
        return self._dataframe_to_text(pd.read_csv(file_path))

    def extract_excel(self, file_path: str, sheet_name: str = None) -> str:
        """Return one sheet, or every sheet, of a workbook as text.

        Args:
            file_path: Path of the workbook.
            sheet_name: Sheet to read. When omitted (or empty) every sheet is
                rendered, each preceded by a ``=== Sheet: <name> ===`` banner.

        Returns:
            The rendered table(s); multiple sheets are separated by blank lines.
        """
        if sheet_name:
            return self._dataframe_to_text(
                pd.read_excel(file_path, sheet_name=sheet_name)
            )

        text_parts = []
        # A distinct loop name keeps the ``sheet_name`` argument intact.
        for name, sheet_text in self.extract_all_sheets(file_path).items():
            text_parts.append(f"=== Sheet: {name} ===\n")
            text_parts.append(sheet_text)
        return "\n\n".join(text_parts)

    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
        """Convert a DataFrame into a header line, a dashed rule and one line per row.

        An empty frame produces no header. Cells are rendered with ``str`` and
        joined with ``" | "``.
        """
        text_parts = []

        if not df.empty:
            text_parts.append(" | ".join(str(col) for col in df.columns))
            text_parts.append("-" * len(text_parts[-1]))

        # itertuples keeps each column's dtype; iterrows would coerce a whole
        # row to one dtype, so an integer next to a float column prints as 1.0.
        for row in df.itertuples(index=False, name=None):
            text_parts.append(" | ".join(str(val) for val in row))

        return "\n".join(text_parts)

    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
        """Return a mapping of sheet name to rendered text for every sheet."""
        sheets = pd.read_excel(file_path, sheet_name=None)
        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}

    def get_sheet_names(self, file_path: str) -> List[str]:
        """Return the sheet names of a workbook, closing the file afterwards."""
        # The context manager releases the underlying file handle.
        with pd.ExcelFile(file_path) as workbook:
            return workbook.sheet_names
|
||||
|
||||
|
||||
def process_csv(file_path: str) -> str:
    """Return the text that ExcelProcessor.extract_csv yields for file_path."""
    return ExcelProcessor().extract_csv(file_path)
|
||||
|
||||
|
||||
def process_excel(file_path: str) -> str:
    """Return the text that ExcelProcessor.extract_excel yields for file_path.

    No sheet name is passed, so extract_excel is called with its default.
    """
    return ExcelProcessor().extract_excel(file_path)
|
||||
65
backend/app/services/file_processor/pdf_processor.py
Normal file
65
backend/app/services/file_processor/pdf_processor.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
PDF Text Extractor
|
||||
"""
|
||||
import pdfplumber
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
class PDFProcessor:
    """Extract text and metadata from PDF files via pdfplumber."""

    def extract_text(self, file_path: str) -> str:
        """Return the text of every non-empty page, each under a page banner.

        Args:
            file_path: Path of the PDF to open.

        Returns:
            Page texts prefixed with ``--- Page N ---`` and separated by
            blank lines. Pages without extractable text are skipped.
        """
        text_parts = []

        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text()
                if text:
                    text_parts.append(f"--- Page {page_num} ---\n{text}")

        return "\n\n".join(text_parts)

    def extract_pages(self, file_path: str) -> List[Dict]:
        """Return one record per non-empty page.

        Each record holds ``page_number`` (1-based), the stripped ``text``
        and its whitespace-delimited ``word_count``.
        """
        with pdfplumber.open(file_path) as pdf:
            return self._collect_pages(pdf)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Return the joined text, per-page records and PDF metadata.

        The file is opened once; page records are collected from the same
        handle that supplies the metadata.

        Returns:
            A dict with ``text`` (page texts joined by blank lines),
            ``pages`` (see extract_pages) and ``metadata`` (``page_count``
            plus the raw ``metadata`` mapping reported by pdfplumber).
        """
        with pdfplumber.open(file_path) as pdf:
            metadata = {
                "page_count": len(pdf.pages),
                "metadata": pdf.metadata,
            }
            pages = self._collect_pages(pdf)

        return {
            "text": "\n\n".join(p["text"] for p in pages),
            "pages": pages,
            "metadata": metadata,
        }

    def _collect_pages(self, pdf) -> List[Dict]:
        """Build the per-page records for an already opened PDF."""
        pages = []
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            # Pages with no extractable text are skipped.
            if text:
                pages.append({
                    "page_number": page_num,
                    "text": text.strip(),
                    "word_count": len(text.split()),
                })
        return pages
|
||||
|
||||
|
||||
def process_pdf(file_path: str) -> str:
    """Return the ``text`` entry of PDFProcessor.extract_with_metadata for file_path."""
    extracted = PDFProcessor().extract_with_metadata(file_path)
    return extracted["text"]
|
||||
3
backend/app/services/text_splitter/__init__.py
Normal file
3
backend/app/services/text_splitter/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Text Splitter Services
|
||||
"""
|
||||
248
backend/app/services/text_splitter/splitter.py
Normal file
248
backend/app/services/text_splitter/splitter.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""
|
||||
Text Splitter
|
||||
"""
|
||||
import re
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
|
||||
class TextSplitter:
    """Common base for the chunking strategies in this module.

    Stores the two sizing options; how they are interpreted is left to each
    subclass's ``split`` implementation.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """Remember the requested chunk size and overlap."""
        self.chunk_size, self.overlap = chunk_size, overlap

    def split(self, text: str) -> List[Dict]:
        """Return the chunks for *text*; subclasses must override this."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class RecursiveTextSplitter(TextSplitter):
    """Split text on a hierarchy of separators, coarsest first.

    The text is cut on the first separator it contains. Any part that is
    still longer than ``chunk_size`` is cut again with the remaining, finer
    separators, and finally into fixed-width slices when no separator is
    left. The resulting pieces are then packed greedily into chunks.

    ``chunk_size`` and ``overlap`` are both measured in characters. Without
    overlap no chunk exceeds ``chunk_size``; with overlap a chunk may exceed
    it by at most the overlap text plus one separator.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50, separators: List[str] = None):
        """Store the sizing options and the coarse-to-fine separator list.

        Args:
            chunk_size: Target maximum chunk length in characters.
            overlap: Maximum number of characters repeated from the end of
                one chunk at the start of the next.
            separators: Separators to try in order. An empty string entry is
                never used for ``str.split``; fixed-width slicing is the
                built-in last resort.
        """
        super().__init__(chunk_size, overlap)
        self.separators = separators or ["\n\n", "\n", ". ", " ", ""]

    def split(self, text: str) -> List[Dict]:
        """Return chunk dicts with ``index``, ``content`` and ``word_count``.

        Empty or whitespace-only text yields an empty list.
        """
        chunks: List[Dict] = []
        current = ""

        for joiner, piece in self._pieces(text, list(self.separators), ""):
            # The separator that glues the piece on is part of the chunk, so
            # it counts towards the size limit.
            needed = len(piece) + (len(joiner) if current else 0)
            if current and len(current) + needed > self.chunk_size:
                self._append_chunk(chunks, current)
                tail = self._overlap_tail(chunks[-1]["content"]) if chunks else ""
                # Fixed-width slices have no joiner; use a space so the
                # overlap text is not fused onto the next piece.
                current = tail + (joiner or " ") + piece if tail else piece
            else:
                current = current + joiner + piece if current else piece

        self._append_chunk(chunks, current)
        return chunks

    def _pieces(self, text: str, separators: List[str], lead: str):
        """Yield ``(joiner, piece)`` pairs whose pieces fit in ``chunk_size``.

        ``joiner`` is the separator that preceded the piece in the source
        text (``lead`` for the first piece). Whitespace-only parts are
        dropped, so runs of separators collapse to a single one.
        """
        width = max(1, self.chunk_size)

        if len(text) <= width:
            yield lead, text
            return

        for position, separator in enumerate(separators):
            # "" is always "in" the text but str.split rejects it.
            if not separator or separator not in text:
                continue
            finer = separators[position + 1:]
            joiner = lead
            for part in text.split(separator):
                if part.strip():
                    if len(part) <= width:
                        yield joiner, part
                    else:
                        yield from self._pieces(part, finer, joiner)
                joiner = separator
            return

        # No natural boundary left: fall back to fixed-width slices.
        for start in range(0, len(text), width):
            yield (lead if start == 0 else ""), text[start:start + width]

    def _overlap_tail(self, content: str) -> str:
        """Return the trailing words of *content* that fit the overlap budget.

        The budget is capped at half the chunk size so consecutive chunks
        cannot become near-duplicates. When even the last word is too long
        (for example text without spaces) the trailing characters are used.
        """
        budget = min(self.overlap, self.chunk_size // 2)
        if budget <= 0:
            return ""

        picked: List[str] = []
        used = 0
        for word in reversed(content.split()):
            cost = len(word) + (1 if picked else 0)
            if used + cost > budget:
                break
            picked.append(word)
            used += cost

        if not picked:
            return content[-budget:]
        return " ".join(reversed(picked))

    @staticmethod
    def _append_chunk(chunks: List[Dict], raw: str) -> None:
        """Append *raw* as the next chunk unless it is blank."""
        content = raw.strip()
        if content:
            chunks.append({
                "index": len(chunks),
                "content": content,
                "word_count": len(content.split()),
            })
|
||||
|
||||
|
||||
class MarkdownStructureSplitter(TextSplitter):
    """Split text into chunks that start at Markdown headings.

    Every ATX heading (``#`` to ``######``) starts a new chunk named after
    the heading. A section that grows beyond ``chunk_size`` characters is
    additionally cut at a line boundary, and the following chunk starts with
    up to ``overlap`` characters repeated from the end of the previous one.
    """

    def __init__(self, chunk_size: int = 2000, overlap: int = 100):
        """Store the size limit and the overlap budget, both in characters."""
        super().__init__(chunk_size, overlap)

    def split(self, text: str) -> List[Dict]:
        """Return chunk dicts with ``index``, ``name``, ``content`` and ``word_count``.

        The size check happens before a line is added, so a chunk may exceed
        ``chunk_size`` by the length of its last line.
        """
        heading_re = re.compile(r'^(#{1,6})\s+(.+)$')

        chunks: List[Dict] = []
        # Label for text before the first heading; the literal is Chinese
        # for "start of document".
        current_heading = "文档开头"
        current_chunk = ""

        for line in text.split('\n'):
            heading_match = heading_re.match(line.strip())

            if heading_match:
                # A heading always closes the running chunk; no overlap is
                # carried across section boundaries.
                self._append_chunk(chunks, current_heading, current_chunk)
                current_heading = heading_match.group(2).strip()
                current_chunk = line + "\n"
                continue

            if len(current_chunk) > self.chunk_size:
                self._append_chunk(chunks, current_heading, current_chunk)
                current_chunk = self._overlap_tail(current_chunk)

            current_chunk += line + "\n"

        self._append_chunk(chunks, current_heading, current_chunk)
        return chunks

    def _overlap_tail(self, chunk: str) -> str:
        """Return the trailing lines of *chunk* that fit the overlap budget.

        The budget is ``overlap`` characters, capped at half the chunk size
        so the carried text can never keep the next chunk over the limit on
        its own. If the last line alone is too long, its trailing characters
        are used instead.
        """
        budget = min(self.overlap, self.chunk_size // 2)
        if budget <= 0:
            return ""

        body = chunk.rstrip("\n")
        kept: List[str] = []
        used = 0
        for candidate in reversed(body.split("\n")):
            cost = len(candidate) + 1  # +1 for the newline
            if used + cost > budget:
                break
            kept.append(candidate)
            used += cost

        if not kept:
            return body[-budget:] + "\n"
        return "\n".join(reversed(kept)) + "\n"

    @staticmethod
    def _append_chunk(chunks: List[Dict], heading: str, raw: str) -> None:
        """Append *raw* as the next chunk under *heading* unless it is blank."""
        content = raw.strip()
        if content:
            chunks.append({
                "index": len(chunks),
                "name": heading,
                "content": content,
                "word_count": len(content.split()),
            })
|
||||
|
||||
|
||||
class TokenSplitter(TextSplitter):
    """Split text into windows of whitespace-delimited words.

    Here ``chunk_size`` and ``overlap`` are counted in words; the token
    count of each chunk is only estimated from its word count.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """Store the window size and the overlap, both in words."""
        super().__init__(chunk_size, overlap)

    def split(self, text: str) -> List[Dict]:
        """Return overlapping word windows covering *text*.

        Each chunk dict holds ``index``, ``content``, ``word_count`` and
        ``token_estimate`` (word count times 1.3).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative or not smaller than ``chunk_size``; the window
                could not advance through the text otherwise.
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must be >= 0 and smaller than chunk_size")

        words = text.split()
        stride = self.chunk_size - self.overlap
        chunks: List[Dict] = []

        for start in range(0, len(words), stride):
            window = words[start:start + self.chunk_size]
            chunks.append({
                "index": len(chunks),
                "content": " ".join(window),
                "word_count": len(window),
                "token_estimate": len(window) * 1.3  # rough token estimate
            })
            # This window already reaches the end; a further one would only
            # repeat words that are part of this chunk.
            if start + self.chunk_size >= len(words):
                break

        return chunks
|
||||
|
||||
|
||||
class CodeSplitter(TextSplitter):
    """Split text into chunks while keeping fenced code blocks intact.

    The text is cut into alternating prose runs and triple-backtick code
    blocks, which are packed greedily into chunks of about ``chunk_size``
    characters. Segments are never cut, so a single prose run or code block
    longer than ``chunk_size`` becomes one oversized chunk. ``overlap`` is
    stored by the base class but not applied by this splitter.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """Store the sizing options."""
        super().__init__(chunk_size, overlap)

    def split(self, text: str) -> List[Dict]:
        """Return chunk dicts with ``index``, ``content`` and ``word_count``."""
        # The capturing group makes re.split return the code blocks as
        # segments of their own instead of discarding them as delimiters.
        segments = re.split(r'(```[\s\S]*?```)', text)

        chunks: List[Dict] = []
        current_chunk = ""

        for segment in segments:
            if not segment:
                continue
            if len(current_chunk) + len(segment) > self.chunk_size:
                self._append_chunk(chunks, current_chunk)
                current_chunk = segment
            else:
                current_chunk += segment

        self._append_chunk(chunks, current_chunk)
        return chunks

    @staticmethod
    def _append_chunk(chunks: List[Dict], raw: str) -> None:
        """Append *raw* as the next chunk unless it is blank."""
        content = raw.strip()
        if content:
            chunks.append({
                "index": len(chunks),
                "content": content,
                "word_count": len(content.split()),
            })
|
||||
|
||||
|
||||
class CustomSplitter(TextSplitter):
    """Chunk text along a caller-supplied separator.

    Parts produced by the separator are packed greedily: a part joins the
    running chunk while the combined length stays within ``chunk_size``,
    otherwise the running chunk is closed and the part starts a new one.
    The base class is always initialised with an overlap of 0.
    """

    def __init__(self, separator: str = "\n\n", chunk_size: int = 500):
        """Store the separator and the size limit."""
        super().__init__(chunk_size, 0)
        self.separator = separator

    def split(self, text: str) -> List[Dict]:
        """Return chunk dicts with ``index``, ``content`` and ``word_count``."""
        chunks: List[Dict] = []
        pending = ""

        for segment in text.split(self.separator):
            fits = len(pending) + len(segment) <= self.chunk_size
            if fits:
                # Re-insert the separator between parts of the same chunk.
                pending = f"{pending}{self.separator}{segment}" if pending else segment
                continue
            self._store(chunks, pending)
            pending = segment

        self._store(chunks, pending)
        return chunks

    @staticmethod
    def _store(chunks: List[Dict], raw: str) -> None:
        """Record *raw* as the next chunk when it holds non-blank text."""
        body = raw.strip()
        if body:
            chunks.append({
                "index": len(chunks),
                "content": body,
                "word_count": len(raw.split()),
            })
|
||||
|
||||
|
||||
def get_splitter(method: str, **kwargs) -> TextSplitter:
    """Build the text splitter registered under *method*.

    Args:
        method: One of ``recursive``, ``markdown_structure``, ``token``,
            ``code`` or ``custom``. Any other value selects the recursive
            splitter.
        **kwargs: Constructor options. The splitters accept different
            options (for example only the custom splitter takes
            ``separator`` and it takes no ``overlap``), so options the
            chosen constructor does not declare are ignored rather than
            raising ``TypeError``.

    Returns:
        A new instance of the selected splitter class.
    """
    import inspect  # local import: only needed by this factory

    splitters = {
        "recursive": RecursiveTextSplitter,
        "markdown_structure": MarkdownStructureSplitter,
        "token": TokenSplitter,
        "code": CodeSplitter,
        "custom": CustomSplitter,
    }

    splitter_class = splitters.get(method, RecursiveTextSplitter)

    accepted = inspect.signature(splitter_class).parameters
    takes_any_keyword = any(
        param.kind is inspect.Parameter.VAR_KEYWORD for param in accepted.values()
    )
    if not takes_any_keyword:
        kwargs = {name: value for name, value in kwargs.items() if name in accepted}

    return splitter_class(**kwargs)
|
||||
Reference in New Issue
Block a user