feat(backend): 更新 API 支持语义分割和 embedding 配置

- chunks API 添加 embedding 配置字段
- projects API 更新路由和方法

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 16:08:08 +08:00
parent da2887d913
commit cc2e73c595
2 changed files with 26 additions and 0 deletions
--- a/backend/app/api/v1/chunks/init.py
+++ b/backend/app/api/v1/chunks/init.py
@@ -44,6 +44,14 @@ class SplitRequest(BaseModel):
    chunk_size: int = Field(500, ge=50, le=5000)
    overlap: int = Field(50, ge=0, le=500)
    separator: Optional[str] = None
    # Embedding 相关参数（用于 semantic_embedding 方法）
    embedding_provider: Optional[str] = Field(None, description="embedding provider: openai, minimax")
    embedding_api_key: Optional[str] = Field(None, description="API key for embedding")
    embedding_base_url: Optional[str] = Field(None, description="API base URL")
    embedding_model: Optional[str] = Field(None, description="Embedding model name")
    # 语义分割参数
    similarity_threshold: float = Field(0.3, ge=0.0, le=1.0, description="Similarity threshold for semantic split")
    min_chunk_size: int = Field(100, ge=10, le=1000, description="Minimum chunk size")
 async def process_file_by_type(file: File) -> str:
@@ -111,6 +119,15 @@ async def split_text(
        if request.method == "custom" and request.separator:
            kwargs["separator"] = request.separator
        # 如果使用 semantic_embedding 方法，传递 embedding 参数
        if request.method == "semantic_embedding":
            kwargs["embedding_provider_type"] = request.embedding_provider or "openai"
            kwargs["embedding_api_key"] = request.embedding_api_key
            kwargs["embedding_base_url"] = request.embedding_base_url or "https://api.minimax.chat/v1"
            kwargs["embedding_model"] = request.embedding_model or "text-embedding-3-small"
            kwargs["similarity_threshold"] = request.similarity_threshold
            kwargs["min_chunk_size"] = request.min_chunk_size
        splitter = get_splitter(request.method, **kwargs)
        split_results = splitter.split(text)
--- a/backend/app/api/v1/projects/init.py
+++ b/backend/app/api/v1/projects/init.py
@@ -2,6 +2,8 @@
 Projects API Router
 """
 import logging
 import shutil
 from pathlib import Path
 from typing import List, Optional
 from uuid import UUID
 from fastapi import APIRouter, Depends, Query
@@ -107,5 +109,12 @@ async def delete_project(
    logger.info(f"Deleting project: id={project_id}")
    await project_crud.get_or_raise(db, project_id, "Project")
    await project_crud.delete(db, project_id)
    # 删除项目对应的本地数据目录
    project_data_dir = Path("/data/code/YG-Datasets/data") / str(project_id)
    if project_data_dir.exists():
        shutil.rmtree(project_data_dir)
        logger.info(f"Project data directory deleted: {project_data_dir}")
    logger.info(f"Project deleted: id={project_id}")
    return ApiResponse.ok(message="Project deleted successfully")