refactor: 前端架构重构 - 提取 CSS 和逻辑到独立模块
前端重构:
- 删除旧的大体积 Vue 组件(HomeView, FileManage, TextSplit 等)
- 删除旧的 composables(useFormatters, useModels, useProjects)
- 新增 core/, page-logic/, pages/, shared/ 模块化目录结构
- 提取 CSS 到 styles/pages/ 目录
- 添加全局样式 variables.css 和 common.css

后端 API 更新:
- chunks: 语义分割 API 增强
- files: 文件处理 API 更新
- models: 模型管理 API 更新
- questions: 问答管理 API 更新
- database: 数据库连接优化
- semantic_embedding: 语义嵌入服务优化

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ import httpx
|
||||
import numpy as np
|
||||
from typing import List, Dict, Optional
|
||||
from abc import ABC, abstractmethod
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
class EmbeddingProvider(ABC):
|
||||
@@ -109,32 +110,28 @@ class EmbeddingSplitter:
|
||||
|
||||
def _tokenize_sentences(self, text: str) -> List[str]:
|
||||
"""将文本切分为句子"""
|
||||
# 中英文句末符号
|
||||
# 先按换行分割,保持段落结构
|
||||
paragraphs = re.split(r'\n+', text)
|
||||
|
||||
paragraphs = re.split(r'\n\s*\n+', text)
|
||||
sentences = []
|
||||
for para in paragraphs:
|
||||
if not para.strip():
|
||||
para = para.strip()
|
||||
if not para:
|
||||
continue
|
||||
|
||||
# 按句子符号分割
|
||||
# 中文:。!?;
|
||||
# 英文:. ! ? ;
|
||||
parts = re.split(r'([。!?;\n]|(?<=[.!?])\s+)', para)
|
||||
parts = re.split(r'(?<=[。!?;.!?])\s+|(?<=[。!?;])', para)
|
||||
buffer = []
|
||||
|
||||
# 重新组合句子
|
||||
current_sentence = ""
|
||||
for part in parts:
|
||||
if part in '。!?;.\n':
|
||||
if current_sentence.strip():
|
||||
sentences.append(current_sentence.strip())
|
||||
current_sentence = ""
|
||||
elif part and part.strip():
|
||||
current_sentence += part
|
||||
# 处理最后一个句子
|
||||
if current_sentence.strip():
|
||||
sentences.append(current_sentence.strip())
|
||||
part = part.strip()
|
||||
if not part:
|
||||
continue
|
||||
|
||||
# 过短的片段先暂存,尽量与后一句合并,避免 embedding 粒度过碎
|
||||
if len(part) < 8 and buffer:
|
||||
buffer[-1] = f"{buffer[-1]} {part}".strip()
|
||||
else:
|
||||
buffer.append(part)
|
||||
|
||||
sentences.extend(buffer)
|
||||
|
||||
return sentences
|
||||
|
||||
@@ -162,51 +159,48 @@ class EmbeddingSplitter:
|
||||
if not similarities:
|
||||
return []
|
||||
|
||||
window = self.window_size
|
||||
window = max(1, self.window_size)
|
||||
smoothed = []
|
||||
|
||||
for i in range(len(similarities)):
|
||||
start = max(0, i - window + 1)
|
||||
end = i + 1
|
||||
start = max(0, i - window)
|
||||
end = min(len(similarities), i + window + 1)
|
||||
window_vals = similarities[start:end]
|
||||
smoothed.append(sum(window_vals) / len(window_vals))
|
||||
|
||||
return smoothed
|
||||
|
||||
def _detect_boundaries(self, similarities: List[float]) -> List[int]:
|
||||
def _detect_boundaries(self, similarities: List[float], sentence_lengths: List[int]) -> List[int]:
|
||||
"""检测分割点(相似度显著下降的位置)"""
|
||||
if not similarities:
|
||||
return [0]
|
||||
|
||||
# 平滑
|
||||
smoothed = self._smooth_similarities(similarities)
|
||||
|
||||
# 计算深度分数(类似 TextTiling)
|
||||
depth_scores = []
|
||||
for i in range(1, len(smoothed) - 1):
|
||||
# 当前位置的深度 = 当前位置的值 - 平均值
|
||||
# 但更准确的是:左侧平均 - 右侧平均
|
||||
left_avg = sum(smoothed[max(0, i - self.window_size):i]) / self.window_size
|
||||
right_avg = sum(smoothed[i:min(len(smoothed), i + self.window_size)]) / self.window_size
|
||||
depth = left_avg - right_avg
|
||||
depth_scores.append(depth)
|
||||
|
||||
# 如果没有足够的点,直接返回
|
||||
if not depth_scores:
|
||||
if len(smoothed) <= 1:
|
||||
return [0]
|
||||
|
||||
# 阈值判断
|
||||
mean_depth = np.mean(depth_scores)
|
||||
std_depth = np.std(depth_scores)
|
||||
|
||||
# 找分割点:depth 显著高于均值的位置
|
||||
threshold = mean_depth + 0.5 * std_depth
|
||||
mean_sim = float(np.mean(smoothed))
|
||||
std_sim = float(np.std(smoothed))
|
||||
dynamic_threshold = max(0.0, min(0.95, mean_sim - 0.5 * std_sim))
|
||||
effective_threshold = max(self.similarity_threshold, dynamic_threshold)
|
||||
|
||||
boundaries = [0] # 起始点
|
||||
for i, depth in enumerate(depth_scores):
|
||||
if depth > threshold and depth > self.similarity_threshold:
|
||||
boundaries.append(i + 1) # 对应相似度的下一个位置
|
||||
boundaries.append(len(self._tokenize_sentences.__name__)) # 结束点
|
||||
accumulated_chars = 0
|
||||
|
||||
for i, sim in enumerate(smoothed):
|
||||
accumulated_chars += sentence_lengths[i]
|
||||
|
||||
left_sim = smoothed[i - 1] if i > 0 else 1.0
|
||||
right_sim = smoothed[i + 1] if i < len(smoothed) - 1 else 1.0
|
||||
is_local_min = sim <= left_sim and sim <= right_sim
|
||||
has_enough_context = accumulated_chars >= self.min_chunk_size
|
||||
oversize_guard = accumulated_chars >= self.chunk_size
|
||||
|
||||
if (is_local_min and has_enough_context and sim <= effective_threshold) or oversize_guard:
|
||||
boundaries.append(i + 1)
|
||||
accumulated_chars = 0
|
||||
|
||||
boundaries.append(len(sentence_lengths))
|
||||
|
||||
return sorted(list(set(boundaries)))
|
||||
|
||||
@@ -225,7 +219,12 @@ class EmbeddingSplitter:
|
||||
for i in range(len(boundaries) - 1):
|
||||
start = boundaries[i]
|
||||
end = boundaries[i + 1]
|
||||
chunk_text = ' '.join(sentences[start:end])
|
||||
if start >= end:
|
||||
continue
|
||||
|
||||
chunk_text = ' '.join(sentences[start:end]).strip()
|
||||
if not chunk_text:
|
||||
continue
|
||||
|
||||
# 如果 chunk 过大,递归分割
|
||||
if len(chunk_text) > self.chunk_size * 1.5:
|
||||
@@ -278,14 +277,22 @@ class EmbeddingSplitter:
|
||||
merged = [chunks[0]]
|
||||
|
||||
for chunk in chunks[1:]:
|
||||
# 如果前一个 chunk 太小,合并
|
||||
if merged[-1]["char_count"] < self.min_chunk_size:
|
||||
merged[-1]["content"] += " " + chunk["content"]
|
||||
merged[-1]["word_count"] += chunk["word_count"]
|
||||
merged[-1]["char_count"] += chunk["char_count"]
|
||||
previous = merged[-1]
|
||||
should_merge = (
|
||||
previous["char_count"] < self.min_chunk_size or
|
||||
chunk["char_count"] < self.min_chunk_size
|
||||
)
|
||||
|
||||
if should_merge and previous["char_count"] + chunk["char_count"] <= self.chunk_size * 1.5:
|
||||
previous["content"] += " " + chunk["content"]
|
||||
previous["word_count"] += chunk["word_count"]
|
||||
previous["char_count"] += chunk["char_count"]
|
||||
else:
|
||||
merged.append(chunk)
|
||||
|
||||
for index, chunk in enumerate(merged):
|
||||
chunk["index"] = index
|
||||
|
||||
return merged
|
||||
|
||||
async def split_with_embedding(self, text: str) -> List[Dict]:
|
||||
@@ -295,8 +302,8 @@ class EmbeddingSplitter:
|
||||
if not sentences:
|
||||
return []
|
||||
|
||||
# 过滤过短的句子
|
||||
sentences = [s for s in sentences if len(s) >= 10]
|
||||
# 过滤纯噪音片段,但保留正常短句
|
||||
sentences = [s for s in sentences if len(s.strip()) >= 4]
|
||||
|
||||
if not sentences:
|
||||
return []
|
||||
@@ -312,17 +319,22 @@ class EmbeddingSplitter:
|
||||
|
||||
# 3. 调用 Embedding API
|
||||
try:
|
||||
if self.embedding_provider is None:
|
||||
raise ValueError("embedding provider is not configured")
|
||||
embeddings = await self.embedding_provider.get_embeddings(sentences)
|
||||
except Exception as e:
|
||||
# 如果 embedding 失败,降级到规则分割
|
||||
print(f"Embedding failed, falling back to rule-based: {e}")
|
||||
return self._fallback_split(text)
|
||||
|
||||
if len(embeddings) != len(sentences):
|
||||
return self._fallback_split(text)
|
||||
|
||||
# 4. 计算相似度
|
||||
similarities = self._compute_similarities(embeddings)
|
||||
|
||||
# 5. 检测分割点
|
||||
boundaries = self._detect_boundaries(similarities)
|
||||
boundaries = self._detect_boundaries(similarities, [len(sentence) for sentence in sentences])
|
||||
|
||||
# 6. 组装 chunks
|
||||
chunks = self._assemble_chunks(sentences, boundaries)
|
||||
@@ -387,7 +399,7 @@ class SemanticEmbeddingSplitter(EmbeddingSplitter):
|
||||
|
||||
def create_embedding_provider(provider: str, api_key: str, base_url: str, model: str = None) -> EmbeddingProvider:
|
||||
"""创建 Embedding 提供商"""
|
||||
if provider in ["openai", "compatible"]:
|
||||
if provider in ["openai", "compatible", "ali", "glm"]:
|
||||
return OpenAIEmbedding(api_key, base_url, model or "text-embedding-3-small")
|
||||
elif provider == "minimax":
|
||||
return MiniMaxEmbedding(api_key, base_url)
|
||||
|
||||
Reference in New Issue
Block a user