first-update

2026-03-17 14:36:31 +08:00
parent 72f08aee7c
commit 4eddf05e79
516 changed files with 115270 additions and 1 deletions
--- a/backend/app/services/__init__.py
+++ b/backend/app/services/__init__.py
@@ -0,0 +1,3 @@
+"""
+Services module
+"""
--- a/backend/app/services/file_processor/__init__.py
+++ b/backend/app/services/file_processor/__init__.py
@@ -0,0 +1,3 @@
+"""
+File Processing Services
+"""
--- a/backend/app/services/file_processor/docx_processor.py
+++ b/backend/app/services/file_processor/docx_processor.py
@@ -0,0 +1,53 @@
+"""
+DOCX Text Extractor
+"""
+from docx import Document
+from typing import Dict, List
+
+
+class DOCXProcessor:
+    """Extract text and basic metadata from DOCX files via python-docx."""
+
+    def extract_text(self, file_path: str) -> str:
+        """Return the text of a DOCX file as a single string.
+
+        Args:
+            file_path: Path of the DOCX file to read.
+
+        Returns:
+            All non-blank body paragraphs followed by all non-blank table
+            cells, joined with blank lines. Table text therefore comes after
+            the paragraph text rather than at its position in the document.
+        """
+        return self._collect_text(Document(file_path))
+
+    def extract_with_metadata(self, file_path: str) -> Dict:
+        """Return the document text together with counts and core properties.
+
+        Args:
+            file_path: Path of the DOCX file to read.
+
+        Returns:
+            A dict with the keys ``text``, ``paragraphs``, ``tables``,
+            ``sections`` and ``metadata``; ``metadata`` holds the author,
+            title, subject, created and modified core properties.
+        """
+        # Parse the file once and reuse the document for the text, the counts
+        # and the properties instead of re-opening it for the text pass.
+        doc = Document(file_path)
+        props = doc.core_properties
+
+        return {
+            "text": self._collect_text(doc),
+            "paragraphs": len(doc.paragraphs),
+            "tables": len(doc.tables),
+            "sections": len(doc.sections),
+            "metadata": {
+                "author": props.author,
+                "title": props.title,
+                "subject": props.subject,
+                "created": props.created,
+                "modified": props.modified
+            }
+        }
+
+    @staticmethod
+    def _collect_text(doc) -> str:
+        """Join the non-blank paragraph and table-cell text of ``doc``."""
+        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]
+
+        # Tables are not part of doc.paragraphs, so walk them separately.
+        for table in doc.tables:
+            for row in table.rows:
+                text_parts.extend(
+                    cell.text for cell in row.cells if cell.text.strip()
+                )
+
+        return "\n\n".join(text_parts)
+
+
+def process_docx(file_path: str) -> str:
+    """Return the plain text of the DOCX file at ``file_path``.
+
+    Convenience wrapper that builds a throwaway ``DOCXProcessor`` and
+    delegates to its ``extract_text`` method.
+    """
+    return DOCXProcessor().extract_text(file_path)
--- a/backend/app/services/file_processor/excel_processor.py
+++ b/backend/app/services/file_processor/excel_processor.py
@@ -0,0 +1,66 @@
+"""
+Excel/CSV Text Extractor
+"""
+import pandas as pd
+from typing import Dict, List
+
+
+class ExcelProcessor:
+    """Extract text from Excel and CSV files using pandas."""
+
+    def extract_csv(self, file_path: str) -> str:
+        """Read a CSV file and render it as pipe-separated text.
+
+        Args:
+            file_path: Path of the CSV file.
+
+        Returns:
+            The table text produced by ``_dataframe_to_text``.
+        """
+        return self._dataframe_to_text(pd.read_csv(file_path))
+
+    def extract_excel(self, file_path: str, sheet_name: str = None) -> str:
+        """Read one sheet, or every sheet, of a workbook as text.
+
+        Args:
+            file_path: Path of the Excel file.
+            sheet_name: Sheet to read. ``None`` or an empty string reads all
+                sheets. Any other value, including the index ``0``, is passed
+                to ``pandas.read_excel`` as the sheet selector.
+
+        Returns:
+            The text of the selected sheet, or all sheets joined with blank
+            lines, each preceded by a ``=== Sheet: <name> ===`` banner.
+        """
+        # Compare against None/"" explicitly: a plain truthiness test would
+        # send the valid sheet index 0 down the "all sheets" path.
+        if sheet_name is not None and sheet_name != "":
+            df = pd.read_excel(file_path, sheet_name=sheet_name)
+            return self._dataframe_to_text(df)
+
+        # sheet_name=None makes pandas return a {name: DataFrame} mapping.
+        sheets = pd.read_excel(file_path, sheet_name=None)
+        text_parts = []
+        for name, df in sheets.items():
+            text_parts.append(f"=== Sheet: {name} ===\n")
+            text_parts.append(self._dataframe_to_text(df))
+        return "\n\n".join(text_parts)
+
+    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
+        """Render a DataFrame as a header line, a dashed rule and data rows.
+
+        Args:
+            df: The table to render.
+
+        Returns:
+            Newline-joined lines with cells separated by `` | ``, or an empty
+            string when ``df.empty`` is true.
+        """
+        if df.empty:
+            return ""
+
+        header = " | ".join(str(col) for col in df.columns)
+        lines = [header, "-" * len(header)]
+
+        # itertuples keeps each column's own dtype; iterrows would upcast a
+        # row mixing ints and floats so that 1 is rendered as "1.0".
+        lines.extend(
+            " | ".join(str(val) for val in row)
+            for row in df.itertuples(index=False, name=None)
+        )
+        return "\n".join(lines)
+
+    def extract_all_sheets(self, file_path: str) -> Dict[str, str]:
+        """Return a mapping of sheet name to that sheet's rendered text.
+
+        Args:
+            file_path: Path of the Excel file.
+        """
+        sheets = pd.read_excel(file_path, sheet_name=None)
+        return {name: self._dataframe_to_text(df) for name, df in sheets.items()}
+
+    def get_sheet_names(self, file_path: str) -> List[str]:
+        """Return the names of all sheets in an Excel file.
+
+        Args:
+            file_path: Path of the Excel file.
+        """
+        # Use the context manager so the workbook handle is closed again.
+        with pd.ExcelFile(file_path) as xl:
+            return xl.sheet_names
+
+
+def process_csv(file_path: str) -> str:
+    """Return the text rendering of the CSV file at ``file_path``.
+
+    Convenience wrapper that builds a throwaway ``ExcelProcessor`` and
+    delegates to its ``extract_csv`` method.
+    """
+    return ExcelProcessor().extract_csv(file_path)
+
+
+def process_excel(file_path: str) -> str:
+    """Return the text rendering of the Excel file at ``file_path``.
+
+    Convenience wrapper that builds a throwaway ``ExcelProcessor`` and calls
+    ``extract_excel`` without a sheet name.
+    """
+    return ExcelProcessor().extract_excel(file_path)
--- a/backend/app/services/file_processor/pdf_processor.py
+++ b/backend/app/services/file_processor/pdf_processor.py
@@ -0,0 +1,65 @@
+"""
+PDF Text Extractor
+"""
+import pdfplumber
+from typing import Dict, List, Optional
+
+
+class PDFProcessor:
+    """Extract text from PDF files with pdfplumber."""
+
+    def extract_text(self, file_path: str) -> str:
+        """Return the text of every page, each under a page banner.
+
+        Args:
+            file_path: Path of the PDF file.
+
+        Returns:
+            The pages that yielded text, each prefixed with
+            ``--- Page <n> ---`` and joined with blank lines. Pages without
+            extractable text are skipped.
+        """
+        text_parts = []
+
+        with pdfplumber.open(file_path) as pdf:
+            for page_num, page in enumerate(pdf.pages, 1):
+                text = page.extract_text()
+                if text:
+                    text_parts.append(f"--- Page {page_num} ---\n{text}")
+
+        return "\n\n".join(text_parts)
+
+    def extract_pages(self, file_path: str) -> List[Dict]:
+        """Return one record per page that yielded text.
+
+        Args:
+            file_path: Path of the PDF file.
+
+        Returns:
+            A list of dicts with the keys ``page_number`` (1-based),
+            ``text`` (stripped) and ``word_count``.
+        """
+        with pdfplumber.open(file_path) as pdf:
+            return self._collect_pages(pdf)
+
+    def extract_with_metadata(self, file_path: str) -> Dict:
+        """Return the page records, their joined text and the PDF metadata.
+
+        Args:
+            file_path: Path of the PDF file.
+
+        Returns:
+            A dict with the keys ``text`` (page texts joined with blank
+            lines, without page banners), ``pages`` (as returned by
+            ``extract_pages``) and ``metadata`` (``page_count`` plus the
+            ``metadata`` value reported by pdfplumber).
+        """
+        # Collect pages from the handle that is already open rather than
+        # opening and parsing the same file a second time.
+        with pdfplumber.open(file_path) as pdf:
+            pages = self._collect_pages(pdf)
+            return {
+                "text": "\n\n".join(p["text"] for p in pages),
+                "pages": pages,
+                "metadata": {
+                    "page_count": len(pdf.pages),
+                    "metadata": pdf.metadata
+                }
+            }
+
+    @staticmethod
+    def _collect_pages(pdf) -> List[Dict]:
+        """Build the per-page records for an already opened PDF."""
+        pages = []
+        for page_num, page in enumerate(pdf.pages, 1):
+            text = page.extract_text()
+            if text:
+                pages.append({
+                    "page_number": page_num,
+                    "text": text.strip(),
+                    "word_count": len(text.split())
+                })
+        return pages
+
+
+def process_pdf(file_path: str) -> str:
+    """Return the text of the PDF file at ``file_path``.
+
+    Convenience wrapper that runs ``PDFProcessor.extract_with_metadata`` and
+    keeps only the ``"text"`` entry of its result.
+    """
+    extracted = PDFProcessor().extract_with_metadata(file_path)
+    return extracted["text"]
--- a/backend/app/services/text_splitter/__init__.py
+++ b/backend/app/services/text_splitter/__init__.py
@@ -0,0 +1,3 @@
+"""
+Text Splitter Services
+"""
--- a/backend/app/services/text_splitter/splitter.py
+++ b/backend/app/services/text_splitter/splitter.py
@@ -0,0 +1,248 @@
+"""
+Text Splitter
+"""
+import re
+from typing import List, Dict, Optional
+
+
+class TextSplitter:
+    """Common base for the text splitters in this module.
+
+    Stores the two sizing settings; subclasses provide ``split``.
+    """
+
+    def __init__(self, chunk_size: int = 500, overlap: int = 50):
+        """Remember the chunk size and overlap for use by ``split``."""
+        self.chunk_size, self.overlap = chunk_size, overlap
+
+    def split(self, text: str) -> List[Dict]:
+        """Split ``text`` into chunk dicts; must be overridden.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+
+class RecursiveTextSplitter(TextSplitter):
+    """Split text on progressively finer separators.
+
+    ``chunk_size`` and ``overlap`` are both measured in characters. The text
+    is cut on the first separator that occurs in it; a part that is still
+    longer than ``chunk_size`` is cut again with the remaining separators,
+    and as a last resort sliced into fixed-size pieces.
+    """
+
+    def __init__(self, chunk_size: int = 500, overlap: int = 50, separators: List[str] = None):
+        """Store the sizing settings and the ordered separator list.
+
+        Args:
+            chunk_size: Maximum length of a chunk in characters.
+            overlap: Maximum number of trailing characters of a chunk that
+                are repeated at the start of the next one.
+            separators: Separators to try, coarsest first. Defaults to
+                paragraph break, newline, sentence end, space and ``""``.
+        """
+        super().__init__(chunk_size, overlap)
+        self.separators = separators or ["\n\n", "\n", ". ", " ", ""]
+
+    def split(self, text: str) -> List[Dict]:
+        """Split ``text`` into chunks of at most ``chunk_size`` characters.
+
+        Args:
+            text: The text to split.
+
+        Returns:
+            A list of dicts with the keys ``index``, ``content`` (stripped)
+            and ``word_count``. Empty or whitespace-only input yields ``[]``.
+        """
+        if not text:
+            return []
+
+        chunks = []
+        for piece in self._split_text(text, list(self.separators)):
+            content = piece.strip()
+            if content:
+                chunks.append({
+                    "index": len(chunks),
+                    "content": content,
+                    "word_count": len(content.split())
+                })
+        return chunks
+
+    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+        """Return pieces of ``text`` no longer than the chunk size."""
+        limit = max(1, self.chunk_size)
+        if len(text) <= limit:
+            return [text]
+
+        # Pick the coarsest separator that occurs in the text. The empty
+        # separator is skipped because str.split("") raises ValueError.
+        separator = None
+        remaining = []
+        for pos, candidate in enumerate(separators):
+            if candidate and candidate in text:
+                separator = candidate
+                remaining = separators[pos + 1:]
+                break
+
+        if separator is None:
+            # Nothing left to cut on: fall back to fixed-size slices.
+            return [text[start:start + limit] for start in range(0, len(text), limit)]
+
+        pieces = []
+        current = ""
+        for part in text.split(separator):
+            if len(part) > limit:
+                # Too long on its own: flush the buffer, then cut the part
+                # with the finer separators instead of dropping it.
+                if current:
+                    pieces.append(current)
+                    current = ""
+                pieces.extend(self._split_text(part, remaining))
+                continue
+
+            if not current:
+                current = part
+            elif len(current) + len(separator) + len(part) <= limit:
+                current += separator + part
+            else:
+                pieces.append(current)
+                tail = self._overlap_tail(current)
+                candidate = tail + separator + part
+                # Only carry the overlap when the chunk stays within the limit.
+                current = candidate if tail and len(candidate) <= limit else part
+
+        if current:
+            pieces.append(current)
+        return pieces
+
+    def _overlap_tail(self, chunk: str) -> str:
+        """Return trailing whole words of ``chunk`` within ``overlap`` chars."""
+        if self.overlap <= 0:
+            return ""
+
+        kept = []
+        used = 0
+        for word in reversed(chunk.split()):
+            # Every word after the first also costs one joining space.
+            cost = len(word) + (1 if kept else 0)
+            if used + cost > self.overlap:
+                break
+            kept.append(word)
+            used += cost
+        return " ".join(reversed(kept))
+
+
+class MarkdownStructureSplitter(TextSplitter):
+    """Split text at Markdown headings, with a size cap inside each section.
+
+    ``chunk_size`` and ``overlap`` are both measured in characters; the
+    overlap is taken as whole trailing lines that fit within that budget.
+    """
+
+    def __init__(self, chunk_size: int = 2000, overlap: int = 100):
+        """Store the sizing settings.
+
+        Args:
+            chunk_size: Length in characters after which a section's running
+                chunk is closed before the next line is added.
+            overlap: Maximum number of trailing characters (as whole lines)
+                repeated at the start of the follow-on chunk.
+        """
+        super().__init__(chunk_size, overlap)
+
+    def split(self, text: str) -> List[Dict]:
+        """Split ``text`` into chunks that each belong to one heading.
+
+        Args:
+            text: Markdown text to split.
+
+        Returns:
+            A list of dicts with the keys ``index``, ``name`` (text of the
+            governing heading, or a placeholder for content before the first
+            heading), ``content`` (stripped) and ``word_count``.
+        """
+        heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
+
+        chunks = []
+        current_chunk = ""
+        current_heading = "文档开头"
+
+        for line in text.split('\n'):
+            heading_match = heading_re.match(line.strip())
+
+            if heading_match:
+                # A heading always closes the running chunk and starts a new
+                # one that begins with the heading line itself.
+                self._append_chunk(chunks, current_heading, current_chunk)
+                current_heading = heading_match.group(2).strip()
+                current_chunk = line + "\n"
+                continue
+
+            if len(current_chunk) > self.chunk_size:
+                self._append_chunk(chunks, current_heading, current_chunk)
+                # Carry over a bounded tail; carrying the whole chunk would
+                # emit a new, ever-growing chunk for every following line.
+                current_chunk = self._overlap_tail(current_chunk)
+
+            current_chunk += line + "\n"
+
+        self._append_chunk(chunks, current_heading, current_chunk)
+        return chunks
+
+    @staticmethod
+    def _append_chunk(chunks: List[Dict], heading: str, text: str) -> None:
+        """Append ``text`` as the next chunk unless it is blank."""
+        content = text.strip()
+        if content:
+            chunks.append({
+                "index": len(chunks),
+                "name": heading,
+                "content": content,
+                "word_count": len(content.split())
+            })
+
+    def _overlap_tail(self, chunk: str) -> str:
+        """Return trailing whole lines of ``chunk`` within the overlap budget."""
+        # Stay below chunk_size so the tail alone never trips the size check.
+        budget = min(self.overlap, self.chunk_size - 1)
+        if budget <= 0:
+            return ""
+
+        kept = []
+        used = 0
+        for line in reversed(chunk.rstrip('\n').split('\n')):
+            cost = len(line) + 1  # the line plus its newline
+            if used + cost > budget:
+                break
+            kept.append(line)
+            used += cost
+        return "".join(line + "\n" for line in reversed(kept))
+
+
+class TokenSplitter(TextSplitter):
+    """Split text into windows of whitespace-separated words.
+
+    ``chunk_size`` and ``overlap`` are both counted in words here.
+    """
+
+    def __init__(self, chunk_size: int = 500, overlap: int = 50):
+        """Store the window size and overlap, both in words."""
+        super().__init__(chunk_size, overlap)
+
+    def split(self, text: str) -> List[Dict]:
+        """Split ``text`` into overlapping word windows.
+
+        Args:
+            text: The text to split.
+
+        Returns:
+            A list of dicts with the keys ``index``, ``content``,
+            ``word_count`` and ``token_estimate`` (word count times 1.3).
+            Text without any words yields ``[]``.
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
+                negative or not smaller than ``chunk_size``.
+        """
+        if self.chunk_size <= 0:
+            raise ValueError("chunk_size must be a positive number of words")
+        # A zero step makes range() fail and a negative step silently yields
+        # no chunks at all, so reject both with a clear message.
+        if not 0 <= self.overlap < self.chunk_size:
+            raise ValueError("overlap must be non-negative and smaller than chunk_size")
+
+        words = text.split()
+        step = self.chunk_size - self.overlap
+        chunks = []
+
+        for start in range(0, len(words), step):
+            chunk_words = words[start:start + self.chunk_size]
+            chunks.append({
+                "index": len(chunks),
+                "content": " ".join(chunk_words),
+                "word_count": len(chunk_words),
+                "token_estimate": len(chunk_words) * 1.3  # rough token estimate
+            })
+            # Once a window reaches the last word, any later window would
+            # only repeat words that are already covered.
+            if start + self.chunk_size >= len(words):
+                break
+
+        return chunks
+
+
+class CodeSplitter(TextSplitter):
+    """Split text into chunks without cutting through fenced code blocks."""
+
+    def __init__(self, chunk_size: int = 500, overlap: int = 50):
+        """Store the sizing settings.
+
+        Args:
+            chunk_size: Target chunk length in characters.
+            overlap: Stored on the instance; ``split`` does not use it.
+        """
+        super().__init__(chunk_size, overlap)
+
+    def split(self, text: str) -> List[Dict]:
+        """Split ``text`` while keeping each fenced code block in one piece.
+
+        Args:
+            text: Text that may contain triple-backtick fenced code blocks.
+
+        Returns:
+            A list of dicts with the keys ``index``, ``content`` (stripped)
+            and ``word_count``. A code block or prose run longer than
+            ``chunk_size`` becomes a chunk of its own and is not cut further.
+            A fence that is never closed is treated as ordinary text.
+        """
+        # The capturing group makes re.split return the code blocks as parts
+        # of their own; without it every fenced block is dropped from the
+        # output.
+        code_pattern = r'(```[\s\S]*?```)'
+        parts = re.split(code_pattern, text)
+
+        chunks = []
+        current_chunk = ""
+
+        for part in parts:
+            if len(current_chunk) + len(part) > self.chunk_size:
+                self._append_chunk(chunks, current_chunk)
+                current_chunk = part
+            else:
+                current_chunk += part
+
+        self._append_chunk(chunks, current_chunk)
+        return chunks
+
+    @staticmethod
+    def _append_chunk(chunks: List[Dict], text: str) -> None:
+        """Append ``text`` as the next chunk unless it is blank."""
+        content = text.strip()
+        if content:
+            chunks.append({
+                "index": len(chunks),
+                "content": content,
+                "word_count": len(content.split())
+            })
+
+
+class CustomSplitter(TextSplitter):
+    """Splitter that cuts on one caller-chosen separator string."""
+
+    def __init__(self, separator: str = "\n\n", chunk_size: int = 500):
+        """Store the separator and chunk size; overlap is fixed at 0."""
+        super().__init__(chunk_size, 0)
+        self.separator = separator
+
+    def split(self, text: str) -> List[Dict]:
+        """Cut ``text`` on the separator and pack the parts into chunks.
+
+        Parts are appended to a running buffer, joined by the separator,
+        for as long as buffer length plus part length stays within
+        ``chunk_size``; otherwise the buffer is emitted and the part starts
+        a new buffer. Blank buffers are not emitted.
+
+        Returns:
+            A list of dicts with the keys ``index``, ``content`` (stripped)
+            and ``word_count``.
+        """
+        results = []
+        buffer = ""
+
+        for segment in text.split(self.separator):
+            if len(buffer) + len(segment) <= self.chunk_size:
+                # Still fits: extend the buffer, or start it when empty.
+                buffer = buffer + self.separator + segment if buffer else segment
+                continue
+
+            if buffer.strip():
+                results.append(self._as_chunk(len(results), buffer))
+            buffer = segment
+
+        if buffer.strip():
+            results.append(self._as_chunk(len(results), buffer))
+
+        return results
+
+    @staticmethod
+    def _as_chunk(position: int, raw: str) -> Dict:
+        """Build the chunk dict for ``raw`` at list position ``position``."""
+        return {
+            "index": position,
+            "content": raw.strip(),
+            "word_count": len(raw.split())
+        }
+
+
+def get_splitter(method: str, **kwargs) -> TextSplitter:
+    """Build the splitter registered under ``method``.
+
+    Args:
+        method: One of ``recursive``, ``markdown_structure``, ``token``,
+            ``code`` or ``custom``. Any other value selects
+            ``RecursiveTextSplitter``.
+        **kwargs: Passed unchanged to the chosen splitter's constructor.
+
+    Returns:
+        A new instance of the selected splitter class.
+    """
+    registry = {
+        "recursive": RecursiveTextSplitter,
+        "markdown_structure": MarkdownStructureSplitter,
+        "token": TokenSplitter,
+        "code": CodeSplitter,
+        "custom": CustomSplitter
+    }
+
+    # Unknown method names fall back to the recursive splitter.
+    chosen = registry[method] if method in registry else RecursiveTextSplitter
+    return chosen(**kwargs)