refactor: 前端架构重构 - 提取 CSS 和逻辑到独立模块

前端重构：
- 删除旧的大体积 Vue 组件（HomeView, FileManage, TextSplit 等）
- 删除旧的 composables（useFormatters, useModels, useProjects）
- 新增 core/, page-logic/, pages/, shared/ 模块化目录结构
- 提取 CSS 到 styles/pages/ 目录
- 添加全局样式 variables.css 和 common.css

后端 API 更新：
- chunks: 语义分割 API 增强
- files: 文件处理 API 更新
- models: 模型管理 API 更新
- questions: 问答管理 API 更新
- database: 数据库连接优化
- semantic_embedding: 语义嵌入服务优化

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 14:23:34 +08:00
parent a280b4f014
commit 6aa271c4f7
75 changed files with 22636 additions and 6519 deletions
--- a/backend/app/services/text_splitter/semantic_embedding.py
+++ b/backend/app/services/text_splitter/semantic_embedding.py
@@ -8,6 +8,7 @@ import httpx
 import numpy as np
 from typing import List, Dict, Optional
 from abc import ABC, abstractmethod
+from langchain_text_splitters import RecursiveCharacterTextSplitter


 class EmbeddingProvider(ABC):
@@ -109,32 +110,28 @@ class EmbeddingSplitter:

    def _tokenize_sentences(self, text: str) -> List[str]:
        """将文本切分为句子"""
-        # 中英文句末符号
-        # 先按换行分割，保持段落结构
-        paragraphs = re.split(r'\n+', text)
-
+        paragraphs = re.split(r'\n\s*\n+', text)
        sentences = []
        for para in paragraphs:
-            if not para.strip():
+            para = para.strip()
+            if not para:
                continue

-            # 按句子符号分割
-            # 中文：。！？；
-            # 英文：. ! ? ;
-            parts = re.split(r'([。！？；\n]|(?<=[.!?])\s+)', para)
+            parts = re.split(r'(?<=[。！？；.!?])\s+|(?<=[。！？；])', para)
+            buffer = []

-            # 重新组合句子
-            current_sentence = ""
            for part in parts:
-                if part in '。！？；.\n':
-                    if current_sentence.strip():
-                        sentences.append(current_sentence.strip())
-                    current_sentence = ""
-                elif part and part.strip():
-                    current_sentence += part
-            # 处理最后一个句子
-            if current_sentence.strip():
-                sentences.append(current_sentence.strip())
+                part = part.strip()
+                if not part:
+                    continue
+
+                # Append very short fragments (< 8 chars) to the previous buffered sentence so embedding units are not overly fragmented
+                if len(part) < 8 and buffer:
+                    buffer[-1] = f"{buffer[-1]} {part}".strip()
+                else:
+                    buffer.append(part)
+
+            sentences.extend(buffer)

        return sentences

@@ -162,51 +159,48 @@ class EmbeddingSplitter:
        if not similarities:
            return []

-        window = self.window_size
+        window = max(1, self.window_size)
        smoothed = []

        for i in range(len(similarities)):
-            start = max(0, i - window + 1)
-            end = i + 1
+            start = max(0, i - window)
+            end = min(len(similarities), i + window + 1)
            window_vals = similarities[start:end]
            smoothed.append(sum(window_vals) / len(window_vals))

        return smoothed

-    def _detect_boundaries(self, similarities: List[float]) -> List[int]:
+    def _detect_boundaries(self, similarities: List[float], sentence_lengths: List[int]) -> List[int]:
        """检测分割点（相似度显著下降的位置）"""
        if not similarities:
            return [0]

-        # 平滑
        smoothed = self._smooth_similarities(similarities)
-
-        # 计算深度分数（类似 TextTiling）
-        depth_scores = []
-        for i in range(1, len(smoothed) - 1):
-            # 当前位置的深度 = 当前位置的值 - 平均值
-            # 但更准确的是：左侧平均 - 右侧平均
-            left_avg = sum(smoothed[max(0, i - self.window_size):i]) / self.window_size
-            right_avg = sum(smoothed[i:min(len(smoothed), i + self.window_size)]) / self.window_size
-            depth = left_avg - right_avg
-            depth_scores.append(depth)
-
-        # 如果没有足够的点，直接返回
-        if not depth_scores:
+        if len(smoothed) <= 1:
            return [0]

-        # 阈值判断
-        mean_depth = np.mean(depth_scores)
-        std_depth = np.std(depth_scores)
-
-        # 找分割点：depth 显著高于均值的位置
-        threshold = mean_depth + 0.5 * std_depth
+        mean_sim = float(np.mean(smoothed))
+        std_sim = float(np.std(smoothed))
+        dynamic_threshold = max(0.0, min(0.95, mean_sim - 0.5 * std_sim))
+        effective_threshold = max(self.similarity_threshold, dynamic_threshold)

        boundaries = [0]  # 起始点
-        for i, depth in enumerate(depth_scores):
-            if depth > threshold and depth > self.similarity_threshold:
-                boundaries.append(i + 1)  # 对应相似度的下一个位置
-        boundaries.append(len(self._tokenize_sentences.__name__))  # 结束点
+        accumulated_chars = 0
+
+        for i, sim in enumerate(smoothed):
+            accumulated_chars += sentence_lengths[i]
+
+            left_sim = smoothed[i - 1] if i > 0 else 1.0
+            right_sim = smoothed[i + 1] if i < len(smoothed) - 1 else 1.0
+            is_local_min = sim <= left_sim and sim <= right_sim
+            has_enough_context = accumulated_chars >= self.min_chunk_size
+            oversize_guard = accumulated_chars >= self.chunk_size
+
+            if (is_local_min and has_enough_context and sim <= effective_threshold) or oversize_guard:
+                boundaries.append(i + 1)
+                accumulated_chars = 0
+
+        boundaries.append(len(sentence_lengths))

        return sorted(list(set(boundaries)))

@@ -225,7 +219,12 @@ class EmbeddingSplitter:
        for i in range(len(boundaries) - 1):
            start = boundaries[i]
            end = boundaries[i + 1]
-            chunk_text = ' '.join(sentences[start:end])
+            if start >= end:
+                continue
+
+            chunk_text = ' '.join(sentences[start:end]).strip()
+            if not chunk_text:
+                continue

            # 如果 chunk 过大，递归分割
            if len(chunk_text) > self.chunk_size * 1.5:
@@ -278,14 +277,22 @@ class EmbeddingSplitter:
        merged = [chunks[0]]

        for chunk in chunks[1:]:
-            # 如果前一个 chunk 太小，合并
-            if merged[-1]["char_count"] < self.min_chunk_size:
-                merged[-1]["content"] += " " + chunk["content"]
-                merged[-1]["word_count"] += chunk["word_count"]
-                merged[-1]["char_count"] += chunk["char_count"]
+            previous = merged[-1]
+            should_merge = (
+                previous["char_count"] < self.min_chunk_size or
+                chunk["char_count"] < self.min_chunk_size
+            )
+
+            if should_merge and previous["char_count"] + chunk["char_count"] <= self.chunk_size * 1.5:
+                previous["content"] += " " + chunk["content"]
+                previous["word_count"] += chunk["word_count"]
+                previous["char_count"] += chunk["char_count"]
            else:
                merged.append(chunk)

+        for index, chunk in enumerate(merged):
+            chunk["index"] = index
+
        return merged

    async def split_with_embedding(self, text: str) -> List[Dict]:
@@ -295,8 +302,8 @@ class EmbeddingSplitter:
        if not sentences:
            return []

-        # 过滤过短的句子
-        sentences = [s for s in sentences if len(s) >= 10]
+        # 过滤纯噪音片段，但保留正常短句
+        sentences = [s for s in sentences if len(s.strip()) >= 4]

        if not sentences:
            return []
@@ -312,17 +319,22 @@ class EmbeddingSplitter:

        # 3. 调用 Embedding API
        try:
+            if self.embedding_provider is None:
+                raise ValueError("embedding provider is not configured")
            embeddings = await self.embedding_provider.get_embeddings(sentences)
        except Exception as e:
            # 如果 embedding 失败，降级到规则分割
            print(f"Embedding failed, falling back to rule-based: {e}")
            return self._fallback_split(text)

+        if len(embeddings) != len(sentences):
+            return self._fallback_split(text)
+
        # 4. 计算相似度
        similarities = self._compute_similarities(embeddings)

        # 5. 检测分割点
-        boundaries = self._detect_boundaries(similarities)
+        boundaries = self._detect_boundaries(similarities, [len(sentence) for sentence in sentences])

        # 6. 组装 chunks
        chunks = self._assemble_chunks(sentences, boundaries)
@@ -387,7 +399,7 @@ class SemanticEmbeddingSplitter(EmbeddingSplitter):

 def create_embedding_provider(provider: str, api_key: str, base_url: str, model: str = None) -> EmbeddingProvider:
    """创建 Embedding 提供商"""
-    if provider in ["openai", "compatible"]:
+    if provider in ["openai", "compatible", "ali", "glm"]:
        return OpenAIEmbedding(api_key, base_url, model or "text-embedding-3-small")
    elif provider == "minimax":
        return MiniMaxEmbedding(api_key, base_url)