feat(backend): 更新 API 支持语义分割和 embedding 配置

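- chunks API 的 SplitRequest 添加 embedding 配置字段(provider、api_key、base_url、model)和语义分割参数(similarity_threshold、min_chunk_size),并在 semantic_embedding 方法下传递给分割器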
- projects API 删除项目时同步删除该项目对应的本地数据目录(新增 shutil、pathlib 导入)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Developer
2026-03-18 16:08:08 +08:00
parent da2887d913
commit cc2e73c595
2 changed files with 26 additions and 0 deletions

View File

@@ -44,6 +44,14 @@ class SplitRequest(BaseModel):
chunk_size: int = Field(500, ge=50, le=5000) chunk_size: int = Field(500, ge=50, le=5000)
overlap: int = Field(50, ge=0, le=500) overlap: int = Field(50, ge=0, le=500)
separator: Optional[str] = None separator: Optional[str] = None
# Embedding 相关参数(用于 semantic_embedding 方法)
embedding_provider: Optional[str] = Field(None, description="embedding provider: openai, minimax")
embedding_api_key: Optional[str] = Field(None, description="API key for embedding")
embedding_base_url: Optional[str] = Field(None, description="API base URL")
embedding_model: Optional[str] = Field(None, description="Embedding model name")
# 语义分割参数
similarity_threshold: float = Field(0.3, ge=0.0, le=1.0, description="Similarity threshold for semantic split")
min_chunk_size: int = Field(100, ge=10, le=1000, description="Minimum chunk size")
async def process_file_by_type(file: File) -> str: async def process_file_by_type(file: File) -> str:
@@ -111,6 +119,15 @@ async def split_text(
if request.method == "custom" and request.separator: if request.method == "custom" and request.separator:
kwargs["separator"] = request.separator kwargs["separator"] = request.separator
# 如果使用 semantic_embedding 方法,传递 embedding 参数
if request.method == "semantic_embedding":
kwargs["embedding_provider_type"] = request.embedding_provider or "openai"
kwargs["embedding_api_key"] = request.embedding_api_key
kwargs["embedding_base_url"] = request.embedding_base_url or "https://api.minimax.chat/v1"
kwargs["embedding_model"] = request.embedding_model or "text-embedding-3-small"
kwargs["similarity_threshold"] = request.similarity_threshold
kwargs["min_chunk_size"] = request.min_chunk_size
splitter = get_splitter(request.method, **kwargs) splitter = get_splitter(request.method, **kwargs)
split_results = splitter.split(text) split_results = splitter.split(text)

View File

@@ -2,6 +2,8 @@
Projects API Router Projects API Router
""" """
import logging import logging
import shutil
from pathlib import Path
from typing import List, Optional from typing import List, Optional
from uuid import UUID from uuid import UUID
from fastapi import APIRouter, Depends, Query from fastapi import APIRouter, Depends, Query
@@ -107,5 +109,12 @@ async def delete_project(
logger.info(f"Deleting project: id={project_id}") logger.info(f"Deleting project: id={project_id}")
await project_crud.get_or_raise(db, project_id, "Project") await project_crud.get_or_raise(db, project_id, "Project")
await project_crud.delete(db, project_id) await project_crud.delete(db, project_id)
# 删除项目对应的本地数据目录
project_data_dir = Path("/data/code/YG-Datasets/data") / str(project_id)
if project_data_dir.exists():
shutil.rmtree(project_data_dir)
logger.info(f"Project data directory deleted: {project_data_dir}")
logger.info(f"Project deleted: id={project_id}") logger.info(f"Project deleted: id={project_id}")
return ApiResponse.ok(message="Project deleted successfully") return ApiResponse.ok(message="Project deleted successfully")