feat(backend): 更新 API 支持语义分割和 embedding 配置

- chunks API 添加 embedding 配置字段
- projects API 更新路由和方法

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Developer
2026-03-18 16:08:08 +08:00
parent da2887d913
commit cc2e73c595
2 changed files with 26 additions and 0 deletions

View File

@@ -44,6 +44,14 @@ class SplitRequest(BaseModel):
chunk_size: int = Field(500, ge=50, le=5000)
overlap: int = Field(50, ge=0, le=500)
separator: Optional[str] = None
# Embedding 相关参数(用于 semantic_embedding 方法)
embedding_provider: Optional[str] = Field(None, description="embedding provider: openai, minimax")
embedding_api_key: Optional[str] = Field(None, description="API key for embedding")
embedding_base_url: Optional[str] = Field(None, description="API base URL")
embedding_model: Optional[str] = Field(None, description="Embedding model name")
# 语义分割参数
similarity_threshold: float = Field(0.3, ge=0.0, le=1.0, description="Similarity threshold for semantic split")
min_chunk_size: int = Field(100, ge=10, le=1000, description="Minimum chunk size")
async def process_file_by_type(file: File) -> str:
@@ -111,6 +119,15 @@ async def split_text(
if request.method == "custom" and request.separator:
kwargs["separator"] = request.separator
# 如果使用 semantic_embedding 方法,传递 embedding 参数
if request.method == "semantic_embedding":
kwargs["embedding_provider_type"] = request.embedding_provider or "openai"
kwargs["embedding_api_key"] = request.embedding_api_key
kwargs["embedding_base_url"] = request.embedding_base_url or "https://api.minimax.chat/v1"
kwargs["embedding_model"] = request.embedding_model or "text-embedding-3-small"
kwargs["similarity_threshold"] = request.similarity_threshold
kwargs["min_chunk_size"] = request.min_chunk_size
splitter = get_splitter(request.method, **kwargs)
split_results = splitter.split(text)

View File

@@ -2,6 +2,8 @@
Projects API Router
"""
import logging
import shutil
from pathlib import Path
from typing import List, Optional
from uuid import UUID
from fastapi import APIRouter, Depends, Query
@@ -107,5 +109,12 @@ async def delete_project(
logger.info(f"Deleting project: id={project_id}")
await project_crud.get_or_raise(db, project_id, "Project")
await project_crud.delete(db, project_id)
# 删除项目对应的本地数据目录
project_data_dir = Path("/data/code/YG-Datasets/data") / str(project_id)
if project_data_dir.exists():
shutil.rmtree(project_data_dir)
logger.info(f"Project data directory deleted: {project_data_dir}")
logger.info(f"Project deleted: id={project_id}")
return ApiResponse.ok(message="Project deleted successfully")