feat(backend): 更新 API 支持语义分割和 embedding 配置
- chunks API 添加 embedding 配置字段
- projects API 更新路由和方法

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -44,6 +44,14 @@ class SplitRequest(BaseModel):
|
|||||||
chunk_size: int = Field(500, ge=50, le=5000)
|
chunk_size: int = Field(500, ge=50, le=5000)
|
||||||
overlap: int = Field(50, ge=0, le=500)
|
overlap: int = Field(50, ge=0, le=500)
|
||||||
separator: Optional[str] = None
|
separator: Optional[str] = None
|
||||||
|
# Embedding 相关参数(用于 semantic_embedding 方法)
|
||||||
|
embedding_provider: Optional[str] = Field(None, description="embedding provider: openai, minimax")
|
||||||
|
embedding_api_key: Optional[str] = Field(None, description="API key for embedding")
|
||||||
|
embedding_base_url: Optional[str] = Field(None, description="API base URL")
|
||||||
|
embedding_model: Optional[str] = Field(None, description="Embedding model name")
|
||||||
|
# 语义分割参数
|
||||||
|
similarity_threshold: float = Field(0.3, ge=0.0, le=1.0, description="Similarity threshold for semantic split")
|
||||||
|
min_chunk_size: int = Field(100, ge=10, le=1000, description="Minimum chunk size")
|
||||||
|
|
||||||
|
|
||||||
async def process_file_by_type(file: File) -> str:
|
async def process_file_by_type(file: File) -> str:
|
||||||
@@ -111,6 +119,15 @@ async def split_text(
|
|||||||
if request.method == "custom" and request.separator:
|
if request.method == "custom" and request.separator:
|
||||||
kwargs["separator"] = request.separator
|
kwargs["separator"] = request.separator
|
||||||
|
|
||||||
|
# 如果使用 semantic_embedding 方法,传递 embedding 参数
|
||||||
|
if request.method == "semantic_embedding":
|
||||||
|
kwargs["embedding_provider_type"] = request.embedding_provider or "openai"
|
||||||
|
kwargs["embedding_api_key"] = request.embedding_api_key
|
||||||
|
kwargs["embedding_base_url"] = request.embedding_base_url or "https://api.minimax.chat/v1"
|
||||||
|
kwargs["embedding_model"] = request.embedding_model or "text-embedding-3-small"
|
||||||
|
kwargs["similarity_threshold"] = request.similarity_threshold
|
||||||
|
kwargs["min_chunk_size"] = request.min_chunk_size
|
||||||
|
|
||||||
splitter = get_splitter(request.method, **kwargs)
|
splitter = get_splitter(request.method, **kwargs)
|
||||||
split_results = splitter.split(text)
|
split_results = splitter.split(text)
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
Projects API Router
|
Projects API Router
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
from fastapi import APIRouter, Depends, Query
|
from fastapi import APIRouter, Depends, Query
|
||||||
@@ -107,5 +109,12 @@ async def delete_project(
|
|||||||
logger.info(f"Deleting project: id={project_id}")
|
logger.info(f"Deleting project: id={project_id}")
|
||||||
await project_crud.get_or_raise(db, project_id, "Project")
|
await project_crud.get_or_raise(db, project_id, "Project")
|
||||||
await project_crud.delete(db, project_id)
|
await project_crud.delete(db, project_id)
|
||||||
|
|
||||||
|
# 删除项目对应的本地数据目录
|
||||||
|
project_data_dir = Path("/data/code/YG-Datasets/data") / str(project_id)
|
||||||
|
if project_data_dir.exists():
|
||||||
|
shutil.rmtree(project_data_dir)
|
||||||
|
logger.info(f"Project data directory deleted: {project_data_dir}")
|
||||||
|
|
||||||
logger.info(f"Project deleted: id={project_id}")
|
logger.info(f"Project deleted: id={project_id}")
|
||||||
return ApiResponse.ok(message="Project deleted successfully")
|
return ApiResponse.ok(message="Project deleted successfully")
|
||||||
|
|||||||
Reference in New Issue
Block a user