first-update

This commit is contained in:
2026-03-17 14:36:31 +08:00
parent 72f08aee7c
commit 4eddf05e79
516 changed files with 115270 additions and 1 deletions

View File

@@ -0,0 +1,3 @@
"""
Services module
"""

View File

@@ -0,0 +1,3 @@
"""
File Processing Services
"""

View File

@@ -0,0 +1,53 @@
"""
DOCX Text Extractor
"""
from docx import Document
from typing import Dict, List
class DOCXProcessor:
    """Extract text and basic metadata from DOCX files via python-docx."""

    def extract_text(self, file_path: str) -> str:
        """Return the non-blank paragraph and table-cell text of a DOCX file.

        Args:
            file_path: Location of the DOCX file; handed to ``Document``.

        Returns:
            Body paragraphs first, then table cells (row by row), joined
            with blank lines.
        """
        return self._text_from_document(Document(file_path))

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Return the document text together with counts and core properties.

        The file is parsed once and the same document object feeds both the
        text extraction and the metadata lookups.

        Args:
            file_path: Location of the DOCX file; handed to ``Document``.

        Returns:
            A dict with keys ``text``, ``paragraphs``, ``tables``,
            ``sections`` and ``metadata`` (author, title, subject, created,
            modified as exposed by ``doc.core_properties``).
        """
        doc = Document(file_path)
        props = doc.core_properties
        return {
            "text": self._text_from_document(doc),
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "metadata": {
                "author": props.author,
                "title": props.title,
                "subject": props.subject,
                "created": props.created,
                "modified": props.modified,
            },
        }

    @staticmethod
    def _text_from_document(doc) -> str:
        """Collect non-blank text from an already loaded document object."""
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]
        # Table content is appended after all body paragraphs.
        # NOTE(review): merged cells may be reported once per spanned column
        # by python-docx, which would repeat their text here - confirm.
        for table in doc.tables:
            for row in table.rows:
                text_parts.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )
        return "\n\n".join(text_parts)
def process_docx(file_path: str) -> str:
    """Convenience wrapper: extract the plain text of one DOCX file."""
    return DOCXProcessor().extract_text(file_path)

View File

@@ -0,0 +1,66 @@
"""
Excel/CSV Text Extractor
"""
import pandas as pd
from typing import Dict, List
class ExcelProcessor:
    """Render CSV files and Excel workbooks as pipe-delimited plain text."""

    def extract_csv(self, file_path: str) -> str:
        """Read a CSV file with pandas and return it as text."""
        return self._dataframe_to_text(pd.read_csv(file_path))

    def extract_excel(self, file_path: str, sheet_name: str = None) -> str:
        """Return the text of one sheet, or of every sheet in the workbook.

        Args:
            file_path: Workbook location, passed to ``pd.read_excel``.
            sheet_name: Sheet to read. ``None`` or ``""`` reads every sheet;
                any other value (including the integer index ``0``) is
                forwarded to ``pd.read_excel``.

        Returns:
            The rendered sheet, or all sheets each preceded by a
            ``=== Sheet: <name> ===`` banner.
        """
        # Compare explicitly: a plain truthiness test would treat the valid
        # sheet index 0 as "no sheet requested".
        if sheet_name is not None and sheet_name != "":
            return self._dataframe_to_text(
                pd.read_excel(file_path, sheet_name=sheet_name)
            )

        sheets = pd.read_excel(file_path, sheet_name=None)
        text_parts = []
        for name, df in sheets.items():
            text_parts.append(f"=== Sheet: {name} ===\n")
            text_parts.append(self._dataframe_to_text(df))
        return "\n\n".join(text_parts)

    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
        """Convert a DataFrame to a header line, a dashed rule and one line per row.

        Returns an empty string when ``df.empty`` is true.
        """
        if df.empty:
            return ""
        header = " | ".join(str(col) for col in df.columns)
        lines = [header, "-" * len(header)]
        # itertuples keeps each column's own dtype; iterrows would coerce a
        # row to one common dtype and print integers as "1.0" next to floats.
        lines.extend(
            " | ".join(str(val) for val in row)
            for row in df.itertuples(index=False, name=None)
        )
        return "\n".join(lines)

    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
        """Return a mapping of sheet name to rendered text for every sheet."""
        sheets = pd.read_excel(file_path, sheet_name=None)
        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}

    def get_sheet_names(self, file_path: str) -> List[str]:
        """Return the sheet names of a workbook, closing the file afterwards."""
        with pd.ExcelFile(file_path) as workbook:
            return list(workbook.sheet_names)
def process_csv(file_path: str) -> str:
    """Convenience wrapper: render one CSV file as text."""
    return ExcelProcessor().extract_csv(file_path)
def process_excel(file_path: str) -> str:
    """Convenience wrapper: render every sheet of one Excel file as text."""
    return ExcelProcessor().extract_excel(file_path)

View File

@@ -0,0 +1,65 @@
"""
PDF Text Extractor
"""
import pdfplumber
from typing import Dict, List, Optional
class PDFProcessor:
    """Extract text from PDF files via pdfplumber."""

    def extract_text(self, file_path: str) -> str:
        """Return the text of every page, each prefixed with a page banner.

        Pages whose extracted text is empty are left out.
        """
        with pdfplumber.open(file_path) as pdf:
            text_parts = [
                f"--- Page {page_num} ---\n{text}"
                for page_num, text in self._iter_page_text(pdf)
            ]
        return "\n\n".join(text_parts)

    def extract_pages(self, file_path: str) -> List[Dict]:
        """Return one record per page that has text.

        Each record holds ``page_number`` (1-based), ``text`` (stripped) and
        ``word_count`` (whitespace-separated words).
        """
        with pdfplumber.open(file_path) as pdf:
            return self._page_records(pdf)

    def extract_with_metadata(self, file_path: str) -> Dict:
        """Return the joined text, per-page records and document metadata.

        The PDF is opened once; page extraction reuses the open document.

        Returns:
            A dict with ``text`` (page texts joined by blank lines),
            ``pages`` (see ``extract_pages``) and ``metadata`` (a dict with
            ``page_count`` and pdfplumber's ``metadata``).
        """
        with pdfplumber.open(file_path) as pdf:
            metadata = {
                "page_count": len(pdf.pages),
                "metadata": pdf.metadata,
            }
            pages = self._page_records(pdf)
        return {
            "text": "\n\n".join(page["text"] for page in pages),
            "pages": pages,
            "metadata": metadata,
        }

    @staticmethod
    def _iter_page_text(pdf):
        """Yield ``(page_number, text)`` for each page with non-empty text."""
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:
                yield page_num, text

    @classmethod
    def _page_records(cls, pdf) -> List[Dict]:
        """Build the per-page record list from an already opened PDF."""
        return [
            {
                "page_number": page_num,
                "text": text.strip(),
                "word_count": len(text.split()),
            }
            for page_num, text in cls._iter_page_text(pdf)
        ]
def process_pdf(file_path: str) -> str:
    """Convenience wrapper: return the joined page text of one PDF file."""
    extracted = PDFProcessor().extract_with_metadata(file_path)
    return extracted["text"]

View File

@@ -0,0 +1,3 @@
"""
Text Splitter Services
"""

View File

@@ -0,0 +1,248 @@
"""
Text Splitter
"""
import re
from typing import List, Dict, Optional
class TextSplitter:
    """Common base class holding the settings shared by all splitters."""

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        # Stored as given; each subclass decides how to apply them.
        self.chunk_size, self.overlap = chunk_size, overlap

    def split(self, text: str) -> List[Dict]:
        """Turn ``text`` into a list of chunk dicts; subclasses must override."""
        raise NotImplementedError
class RecursiveTextSplitter(TextSplitter):
    """Split text on progressively finer separators until the pieces fit.

    ``chunk_size`` and ``overlap`` are both measured in characters. Text is
    cut on the first separator it contains; any piece that is still too long
    is cut again with the remaining separators. An empty-string separator
    means "cut at fixed character offsets" and acts as the last resort.
    The pieces are then packed greedily into chunks of at most ``chunk_size``
    characters, each new chunk starting with up to ``overlap`` trailing
    characters of the previous one.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50,
                 separators: Optional[List[str]] = None):
        super().__init__(chunk_size, overlap)
        self.separators = separators or ["\n\n", "\n", ". ", " ", ""]

    def split(self, text: str) -> List[Dict]:
        """Split ``text`` into chunk dicts.

        Args:
            text: The text to split. Empty text yields an empty list.

        Returns:
            Dicts with ``index`` (0-based position), ``content`` (stripped
            chunk text) and ``word_count`` (whitespace-separated words).
            Whitespace-only chunks are omitted.
        """
        if not text:
            return []

        limit = max(self.chunk_size, 1)
        # Keep the overlap strictly below the limit so every chunk adds text.
        overlap = min(max(self.overlap, 0), limit - 1)
        # Pieces leave room for the overlap, so "overlap + piece" always fits.
        pieces = self._split_pieces(text, list(self.separators), limit - overlap)

        chunks: List[Dict] = []
        current = ""
        for piece in pieces:
            if current and len(current) + len(piece) > limit:
                self._append_chunk(chunks, current)
                room = limit - len(piece)
                current = self._overlap_tail(current, min(overlap, room)) + piece
            else:
                current += piece
        self._append_chunk(chunks, current)
        return chunks

    def _split_pieces(self, text: str, separators: List[str], limit: int) -> List[str]:
        """Cut ``text`` into pieces of at most ``limit`` characters.

        Separators stay attached to the piece they follow, so concatenating
        the pieces reproduces ``text``. A piece that no separator can shorten
        is returned oversized.
        """
        if len(text) <= limit:
            return [text] if text else []

        for depth, separator in enumerate(separators):
            if separator == "":
                # str.split rejects an empty separator; slice by offset instead.
                return [text[i:i + limit] for i in range(0, len(text), limit)]
            if separator not in text:
                continue
            parts = text.split(separator)
            last = len(parts) - 1
            pieces: List[str] = []
            for position, part in enumerate(parts):
                segment = part if position == last else part + separator
                pieces.extend(
                    self._split_pieces(segment, separators[depth + 1:], limit)
                )
            return pieces
        return [text]

    @staticmethod
    def _overlap_tail(previous: str, width: int) -> str:
        """Return up to ``width`` trailing characters of ``previous``.

        When the cut lands inside a word and the tail contains whitespace,
        the tail starts at that whitespace so no partial word is carried.
        """
        width = min(width, len(previous))
        if width <= 0:
            return ""
        tail = previous[-width:]
        if width < len(previous) and not previous[-width - 1].isspace():
            boundary = next(
                (i for i, char in enumerate(tail) if char.isspace()), None
            )
            if boundary is not None:
                tail = tail[boundary:]
        return tail

    @staticmethod
    def _append_chunk(chunks: List[Dict], raw: str) -> None:
        """Append ``raw`` as the next chunk unless it is blank."""
        content = raw.strip()
        if content:
            chunks.append({
                "index": len(chunks),
                "content": content,
                "word_count": len(content.split()),
            })
class MarkdownStructureSplitter(TextSplitter):
    """Split Markdown text into chunks that follow its heading structure.

    Every ATX heading (``#`` to ``######``) starts a new chunk named after
    the heading. A section that grows beyond ``chunk_size`` characters is cut
    at a line boundary, and the next chunk starts with the trailing whole
    lines of the previous one, up to ``overlap`` characters.
    """

    _HEADING = re.compile(r'^(#{1,6})\s+(.+)$')

    def __init__(self, chunk_size: int = 2000, overlap: int = 100):
        super().__init__(chunk_size, overlap)

    def split(self, text: str) -> List[Dict]:
        """Split ``text`` by Markdown headings.

        Returns:
            Dicts with ``index``, ``name`` (heading text of the section the
            chunk belongs to), ``content`` (stripped) and ``word_count``.
            Blank chunks are omitted.
        """
        chunks: List[Dict] = []
        # Name used for text that appears before the first heading
        # (the literal means "document start").
        heading = "文档开头"
        buffer: List[str] = []
        size = 0  # characters buffered, counting one newline per line

        for line in text.split('\n'):
            match = self._HEADING.match(line.strip())
            if match:
                self._append_chunk(chunks, heading, buffer)
                heading = match.group(2).strip()
                buffer, size = [], 0
            elif size > self.chunk_size:
                self._append_chunk(chunks, heading, buffer)
                # Carry a bounded tail only; carrying the whole buffer would
                # make every following line emit the same section again.
                buffer = self._overlap_lines(buffer)
                size = sum(len(kept) + 1 for kept in buffer)
            buffer.append(line)
            size += len(line) + 1

        self._append_chunk(chunks, heading, buffer)
        return chunks

    def _overlap_lines(self, lines: List[str]) -> List[str]:
        """Return the trailing whole lines that fit in the overlap budget."""
        budget = min(self.overlap, self.chunk_size)
        tail: List[str] = []
        for line in reversed(lines):
            cost = len(line) + 1
            if cost > budget:
                break
            tail.append(line)
            budget -= cost
        tail.reverse()
        return tail

    @staticmethod
    def _append_chunk(chunks: List[Dict], name: str, lines: List[str]) -> None:
        """Append the buffered lines as the next chunk unless they are blank."""
        content = "\n".join(lines).strip()
        if content:
            chunks.append({
                "index": len(chunks),
                "name": name,
                "content": content,
                "word_count": len(content.split()),
            })
class TokenSplitter(TextSplitter):
    """Split text into windows of whitespace-separated words.

    Here ``chunk_size`` and ``overlap`` are word counts; the token count of
    each chunk is only estimated from its word count.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        super().__init__(chunk_size, overlap)

    def split(self, text: str) -> List[Dict]:
        """Split ``text`` into overlapping word windows.

        Returns:
            Dicts with ``index``, ``content`` (words joined by single
            spaces), ``word_count`` and ``token_estimate``
            (``word_count * 1.3``). Empty text yields an empty list.
        """
        size = max(self.chunk_size, 1)
        # Clamp the overlap so the window always advances by at least one
        # word and never skips words (which a negative overlap would do).
        overlap = min(max(self.overlap, 0), size - 1)
        step = size - overlap

        words = text.split()
        chunks: List[Dict] = []
        for start in range(0, len(words), step):
            window = words[start:start + size]
            chunks.append({
                "index": len(chunks),
                "content": " ".join(window),
                "word_count": len(window),
                "token_estimate": len(window) * 1.3  # rough token estimate
            })
            # This window reached the end; a further one would only repeat
            # words that are already covered.
            if start + size >= len(words):
                break
        return chunks
class CodeSplitter(TextSplitter):
    """Split text into chunks without cutting through fenced code blocks.

    ``overlap`` is accepted for parity with the other splitters but is not
    applied by this class.
    """

    # The capturing group makes re.split return the fenced blocks as well;
    # without it they would be removed from the output.
    _FENCE = re.compile(r'(```[\s\S]*?```)')

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        super().__init__(chunk_size, overlap)

    def split(self, text: str) -> List[Dict]:
        """Split ``text`` while keeping each fenced code block in one piece.

        The text is divided into alternating prose and triple-backtick
        blocks, which are packed greedily into chunks of about
        ``chunk_size`` characters. A single piece longer than ``chunk_size``
        becomes a chunk of its own rather than being cut.

        Returns:
            Dicts with ``index``, ``content`` (stripped) and ``word_count``.
            Blank chunks are omitted.
        """
        chunks: List[Dict] = []
        current = ""
        for part in self._FENCE.split(text):
            if not part:
                continue
            if current and len(current) + len(part) > self.chunk_size:
                self._append_chunk(chunks, current)
                current = part
            else:
                current += part
        self._append_chunk(chunks, current)
        return chunks

    @staticmethod
    def _append_chunk(chunks: List[Dict], raw: str) -> None:
        """Append ``raw`` as the next chunk unless it is blank."""
        content = raw.strip()
        if content:
            chunks.append({
                "index": len(chunks),
                "content": content,
                "word_count": len(content.split()),
            })
class CustomSplitter(TextSplitter):
    """Splitter that cuts on one caller-supplied separator, with no overlap."""

    def __init__(self, separator: str = "\n\n", chunk_size: int = 500):
        super().__init__(chunk_size, 0)
        self.separator = separator

    def split(self, text: str) -> List[Dict]:
        """Cut ``text`` on the separator and pack the parts into chunks.

        Parts are appended to the pending chunk while the combined length
        (not counting the separator) stays within ``chunk_size``; otherwise
        the pending chunk is emitted and the part starts a new one.
        """
        results: List[Dict] = []

        def flush(pending_text: str) -> None:
            # Whitespace-only buffers never become chunks.
            if not pending_text.strip():
                return
            results.append({
                "index": len(results),
                "content": pending_text.strip(),
                "word_count": len(pending_text.split()),
            })

        pending = ""
        for segment in text.split(self.separator):
            fits = len(pending) + len(segment) <= self.chunk_size
            if not fits:
                flush(pending)
                pending = segment
            elif pending:
                pending = pending + self.separator + segment
            else:
                pending = segment

        flush(pending)
        return results
def get_splitter(method: str, **kwargs) -> TextSplitter:
    """Build the splitter registered under ``method``.

    Unknown method names fall back to ``RecursiveTextSplitter``. All keyword
    arguments are forwarded unchanged to the chosen class's constructor.
    """
    registry = {
        "recursive": RecursiveTextSplitter,
        "markdown_structure": MarkdownStructureSplitter,
        "token": TokenSplitter,
        "code": CodeSplitter,
        "custom": CustomSplitter,
    }
    chosen = registry[method] if method in registry else RecursiveTextSplitter
    return chosen(**kwargs)