first-update

2026-03-17 14:36:31 +08:00
parent 72f08aee7c
commit 4eddf05e79
516 changed files with 115270 additions and 1 deletions
--- a/backend/app/api/v1/init.py
+++ b/backend/app/api/v1/init.py
@@ -0,0 +1,17 @@
+"""
+API v1 Router
+"""
+
+from fastapi import APIRouter
+
+from app.api.v1 import files, projects, chunks, questions, datasets, eval
+
+api_router = APIRouter()
+
+# Mount every feature router under its own URL prefix and OpenAPI tag.
+api_router.include_router(
+    projects.router,
+    prefix="/projects",
+    tags=["projects"],
+)
+api_router.include_router(
+    files.router,
+    prefix="/files",
+    tags=["files"],
+)
+api_router.include_router(
+    chunks.router,
+    prefix="/chunks",
+    tags=["chunks"],
+)
+api_router.include_router(
+    questions.router,
+    prefix="/questions",
+    tags=["questions"],
+)
+api_router.include_router(
+    datasets.router,
+    prefix="/datasets",
+    tags=["datasets"],
+)
+api_router.include_router(
+    eval.router,
+    prefix="/eval",
+    tags=["eval"],
+)
--- a/backend/app/api/v1/chunks/init.py
+++ b/backend/app/api/v1/chunks/init.py
@@ -0,0 +1,182 @@
+"""
+Chunks API Router
+"""
+from typing import List, Optional
+from uuid import UUID
+from pydantic import BaseModel
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from app.core.database import get_db
+from app.models.models import Chunk, File
+from app.schemas.base import ChunkCreate, ChunkResponse
+from app.services.text_splitter.splitter import get_splitter
+from app.services.file_processor.pdf_processor import process_pdf
+from app.services.file_processor.docx_processor import process_docx
+from app.services.file_processor.excel_processor import process_csv, process_excel
+
+# Router that collects the chunk endpoints defined in this module.
+router = APIRouter()
+
+
+class SplitRequest(BaseModel):
+    """Request body for the ``/split`` endpoint: which file to split and how."""
+    # ID of the file to split. Declared optional, but ``split_text`` answers
+    # HTTP 400 when it is missing.
+    file_id: Optional[UUID] = None
+    # Splitter name handed to ``get_splitter``; the value "custom" additionally
+    # enables ``separator``.
+    method: str = "recursive"
+    # Forwarded to the splitter as ``chunk_size`` (units are defined by the splitter).
+    chunk_size: int = 500
+    # Forwarded to the splitter as ``overlap``.
+    overlap: int = 50
+    # Only forwarded when ``method == "custom"`` and the value is non-empty.
+    separator: Optional[str] = None
+
+
+class ChunkListResponse(BaseModel):
+    """Schema describing a list of chunks together with its length.
+
+    NOTE(review): no endpoint visible in this module references this schema;
+    ``list_chunks`` builds an equivalent plain dict instead.
+    """
+    # The chunk records.
+    chunks: List[ChunkResponse]
+    # Number of chunk records.
+    total: int
+
+
+def process_file_by_type(file: File) -> str:
+    """Extract the text content of an uploaded file.
+
+    Files whose ``file_type`` is "pdf", "docx", "xlsx" or "csv" are handed to
+    the matching processor. Every other type is read from disk as UTF-8 text.
+
+    Args:
+        file: File record; ``file_path`` and ``file_type`` are read.
+
+    Returns:
+        The text produced by the processor, or the decoded file content.
+
+    Raises:
+        HTTPException: 400 when the record has no ``file_path``, or when a file
+            without a dedicated processor is not valid UTF-8 text (for example
+            a binary format that is recognised at upload time but has no
+            processor here).
+    """
+    if not file.file_path:
+        raise HTTPException(status_code=400, detail="File path not found")
+
+    processors = {
+        "pdf": process_pdf,
+        "docx": process_docx,
+        "xlsx": process_excel,
+        "csv": process_csv,
+    }
+
+    processor = processors.get(file.file_type)
+    if processor is not None:
+        return processor(file.file_path)
+
+    # No dedicated processor: treat the file as plain text (txt, md, ...).
+    # Binary content would otherwise surface as an unhandled
+    # UnicodeDecodeError, i.e. an HTTP 500, so report it as a client error.
+    try:
+        with open(file.file_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    except UnicodeDecodeError as exc:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported file type or non-UTF-8 content: {file.file_type}",
+        ) from exc
+
+
+@router.post("/split", response_model=dict)
+async def split_text(
+    project_id: UUID,
+    request: SplitRequest,
+    db: AsyncSession = Depends(get_db)
+):
+    """Split a project's file into chunks and store them.
+
+    The text is extracted and split before anything is written, so a failure
+    in either step leaves the file's status untouched. The chunks and the
+    final "completed" status are then committed together.
+
+    Args:
+        project_id: Project that must own the file.
+        request: File ID and splitter settings.
+        db: Database session.
+
+    Returns:
+        A dict with the number of chunks created and a summary message.
+
+    Raises:
+        HTTPException: 400 when ``file_id`` is missing, 404 when the file does
+            not exist in the project; errors raised by
+            ``process_file_by_type`` propagate unchanged.
+    """
+    if not request.file_id:
+        raise HTTPException(status_code=400, detail="file_id is required")
+
+    result = await db.execute(
+        select(File).where(File.id == request.file_id, File.project_id == project_id)
+    )
+    file = result.scalar_one_or_none()
+    if not file:
+        raise HTTPException(status_code=404, detail="File not found")
+
+    # Extract and split first: both steps can raise, and neither needs the
+    # file to be flagged as "processing" yet.
+    text = process_file_by_type(file)
+
+    kwargs = {"chunk_size": request.chunk_size, "overlap": request.overlap}
+    if request.method == "custom" and request.separator:
+        kwargs["separator"] = request.separator
+
+    splitter = get_splitter(request.method, **kwargs)
+    split_results = splitter.split(text)
+
+    # Update file status
+    file.status = "processing"
+    await db.commit()
+
+    # Save chunks
+    chunks = []
+    for position, chunk_data in enumerate(split_results):
+        content = chunk_data["content"]
+
+        # Build the fallback name only when it is needed, so a result that has
+        # a "name" but no "index" does not raise KeyError.
+        name = chunk_data.get("name")
+        if name is None:
+            name = f"Chunk {chunk_data.get('index', position) + 1}"
+
+        word_count = chunk_data.get("word_count")
+        if word_count is None:
+            word_count = len(content.split())
+
+        chunks.append(
+            Chunk(
+                project_id=project_id,
+                # Use the request value rather than ``file.id``: after a commit
+                # the ORM object may be expired, and an attribute read would
+                # then need implicit I/O (depends on the session's
+                # expire_on_commit setting -- not visible here).
+                file_id=request.file_id,
+                name=name,
+                content=content,
+                word_count=word_count,
+            )
+        )
+    db.add_all(chunks)
+
+    # Persist the chunks and the final status atomically.
+    file.status = "completed"
+    await db.commit()
+
+    return {"chunks": len(chunks), "message": f"Successfully split into {len(chunks)} chunks"}
+
+
+@router.get("/", response_model=dict)
+async def list_chunks(
+    project_id: UUID,
+    file_id: Optional[UUID] = Query(None),
+    db: AsyncSession = Depends(get_db)
+):
+    """Return a project's chunks, newest first, optionally limited to one file.
+
+    The response dict carries the serialised chunks under "chunks" and their
+    count under "total".
+    """
+    conditions = [Chunk.project_id == project_id]
+    if file_id:
+        conditions.append(Chunk.file_id == file_id)
+
+    stmt = select(Chunk).where(*conditions).order_by(Chunk.created_at.desc())
+    rows = (await db.execute(stmt)).scalars().all()
+
+    return {
+        "chunks": [ChunkResponse.model_validate(row) for row in rows],
+        "total": len(rows),
+    }
+
+
+@router.get("/{chunk_id}", response_model=ChunkResponse)
+async def get_chunk(project_id: UUID, chunk_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Fetch a single chunk of a project.
+
+    The response model is the chunk schema itself: the handler returns a
+    ``ChunkResponse`` instance, which is not a ``dict`` and therefore must not
+    be declared as one for response validation.
+
+    Raises:
+        HTTPException: 404 when no chunk with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(Chunk).where(Chunk.id == chunk_id, Chunk.project_id == project_id)
+    )
+    chunk = result.scalar_one_or_none()
+    if not chunk:
+        raise HTTPException(status_code=404, detail="Chunk not found")
+    return ChunkResponse.model_validate(chunk)
+
+
+@router.put("/{chunk_id}", response_model=ChunkResponse)
+async def update_chunk(
+    project_id: UUID,
+    chunk_id: UUID,
+    chunk: ChunkCreate,
+    db: AsyncSession = Depends(get_db)
+):
+    """Update a chunk with the fields supplied in the request body.
+
+    Only fields explicitly present in the body are written. The response model
+    is the chunk schema because the handler returns a ``ChunkResponse``
+    instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no chunk with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(Chunk).where(Chunk.id == chunk_id, Chunk.project_id == project_id)
+    )
+    db_chunk = result.scalar_one_or_none()
+    if not db_chunk:
+        raise HTTPException(status_code=404, detail="Chunk not found")
+
+    for key, value in chunk.model_dump(exclude_unset=True).items():
+        setattr(db_chunk, key, value)
+
+    await db.commit()
+    # Reload so the response reflects the committed row.
+    await db.refresh(db_chunk)
+    return ChunkResponse.model_validate(db_chunk)
+
+
+@router.delete("/{chunk_id}", response_model=dict)
+async def delete_chunk(project_id: UUID, chunk_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Remove one chunk from a project; answers 404 when it does not exist."""
+    lookup = select(Chunk).where(Chunk.id == chunk_id, Chunk.project_id == project_id)
+    target = (await db.execute(lookup)).scalar_one_or_none()
+    if target is None:
+        raise HTTPException(status_code=404, detail="Chunk not found")
+
+    await db.delete(target)
+    await db.commit()
+    return {"message": "Chunk deleted successfully"}
--- a/backend/app/api/v1/datasets/init.py
+++ b/backend/app/api/v1/datasets/init.py
@@ -0,0 +1,126 @@
+"""
+Datasets API Router
+"""
+from typing import List, Optional
+from uuid import UUID
+from pydantic import BaseModel
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, func
+from app.core.database import get_db
+from app.models.models import Dataset, Question
+from app.schemas.base import DatasetCreate, DatasetResponse
+
+# Router that collects the dataset endpoints defined in this module.
+router = APIRouter()
+
+
+class ExportRequest(BaseModel):
+    """Request body for the dataset export endpoint."""
+    # Target export format. The value is a free-form string and is not
+    # validated here; the listed names are the intended choices.
+    format: str = "alpaca"  # alpaca, sharegpt, llama_factory, json
+
+
+@router.get("/", response_model=dict)
+async def list_datasets(project_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Return a project's datasets, newest first.
+
+    Each entry's ``question_count`` is currently hard-coded to 0.
+    """
+    stmt = (
+        select(Dataset)
+        .where(Dataset.project_id == project_id)
+        .order_by(Dataset.created_at.desc())
+    )
+    rows = (await db.execute(stmt)).scalars().all()
+
+    serialized = []
+    for row in rows:
+        item = DatasetResponse.model_validate(row)
+        # TODO: Count questions in dataset
+        item.question_count = 0
+        serialized.append(item)
+
+    return {"datasets": serialized}
+
+
+@router.post("/", response_model=dict)
+async def create_dataset(
+    project_id: UUID,
+    dataset: DatasetCreate,
+    db: AsyncSession = Depends(get_db)
+):
+    """Store a new dataset for the project and return its ID as a string."""
+    fields = dataset.model_dump()
+    record = Dataset(project_id=project_id, **fields)
+
+    db.add(record)
+    await db.commit()
+    # Reload the row after the commit before reading its ID.
+    await db.refresh(record)
+
+    return {"id": str(record.id)}
+
+
+@router.get("/{dataset_id}", response_model=DatasetResponse)
+async def get_dataset(
+    project_id: UUID,
+    dataset_id: UUID,
+    db: AsyncSession = Depends(get_db)
+):
+    """Fetch a single dataset of a project.
+
+    The response model is the dataset schema because the handler returns a
+    ``DatasetResponse`` instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no dataset with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(Dataset).where(Dataset.id == dataset_id, Dataset.project_id == project_id)
+    )
+    dataset = result.scalar_one_or_none()
+    if not dataset:
+        raise HTTPException(status_code=404, detail="Dataset not found")
+
+    return DatasetResponse.model_validate(dataset)
+
+
+@router.delete("/{dataset_id}", response_model=dict)
+async def delete_dataset(
+    project_id: UUID,
+    dataset_id: UUID,
+    db: AsyncSession = Depends(get_db)
+):
+    """Remove one dataset from a project; answers 404 when it does not exist."""
+    lookup = select(Dataset).where(
+        Dataset.id == dataset_id,
+        Dataset.project_id == project_id,
+    )
+    target = (await db.execute(lookup)).scalar_one_or_none()
+    if target is None:
+        raise HTTPException(status_code=404, detail="Dataset not found")
+
+    await db.delete(target)
+    await db.commit()
+
+    return {"message": "Dataset deleted successfully"}
+
+
+@router.post("/{dataset_id}/export")
+async def export_dataset(
+    project_id: UUID,
+    dataset_id: UUID,
+    request: ExportRequest,
+    db: AsyncSession = Depends(get_db)
+):
+    """Export a dataset in the requested format (placeholder implementation).
+
+    Only the dataset's existence is checked; the payload is a fixed sample
+    record. For format "json" the bare list is returned, otherwise the list is
+    wrapped in a dict together with the format name.
+    """
+    # TODO: Implement actual export logic
+
+    lookup = select(Dataset).where(
+        Dataset.id == dataset_id,
+        Dataset.project_id == project_id,
+    )
+    found = (await db.execute(lookup)).scalar_one_or_none()
+    if found is None:
+        raise HTTPException(status_code=404, detail="Dataset not found")
+
+    # Placeholder: questions are not linked to datasets yet, so a single
+    # sample record stands in for the real content.
+    sample_record = {
+        "instruction": "这是一个示例指令",
+        "input": "",
+        "output": "这是一个示例输出"
+    }
+    sample_data = [sample_record]
+
+    if request.format != "json":
+        return {"data": sample_data, "format": request.format}
+
+    return sample_data
--- a/backend/app/api/v1/eval/init.py
+++ b/backend/app/api/v1/eval/init.py
@@ -0,0 +1,100 @@
+"""
+Evaluation API Router
+"""
+from typing import List, Optional
+from uuid import UUID
+from pydantic import BaseModel
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from app.core.database import get_db
+from app.models.models import EvalDataset, Task
+from app.schemas.base import EvalDatasetCreate, EvalDatasetResponse, TaskResponse
+
+# Router that collects the evaluation endpoints defined in this module.
+router = APIRouter()
+
+
+class GenerateEvalRequest(BaseModel):
+    """Request body for creating an evaluation dataset."""
+    # Name stored on the new evaluation dataset.
+    name: str
+    # Question type stored on the new evaluation dataset.
+    question_type: str = "mixed"
+    # NOTE(review): accepted but not read by ``create_eval_dataset`` in this
+    # module -- presumably the number of questions to generate; confirm.
+    count: int = 50
+
+
+class RunEvalRequest(BaseModel):
+    """Request body for starting an evaluation run."""
+    # NOTE(review): not read by ``run_evaluation`` in this module -- presumably
+    # selects the model configuration to evaluate with; confirm. The "model_"
+    # prefix may also trigger Pydantic's protected-namespace warning, depending
+    # on the installed version.
+    model_config_id: Optional[UUID] = None
+
+
+@router.get("/", response_model=dict)
+async def list_eval_datasets(project_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Return a project's evaluation datasets, newest first."""
+    stmt = (
+        select(EvalDataset)
+        .where(EvalDataset.project_id == project_id)
+        .order_by(EvalDataset.created_at.desc())
+    )
+    rows = (await db.execute(stmt)).scalars().all()
+    serialized = [EvalDatasetResponse.model_validate(row) for row in rows]
+
+    return {"datasets": serialized}
+
+
+@router.post("/", response_model=dict)
+async def create_eval_dataset(
+    project_id: UUID,
+    request: GenerateEvalRequest,
+    db: AsyncSession = Depends(get_db)
+):
+    """Store a new evaluation dataset and return its ID as a string.
+
+    Only ``name`` and ``question_type`` of the request are persisted.
+    """
+    record = EvalDataset(
+        project_id=project_id,
+        name=request.name,
+        question_type=request.question_type,
+    )
+
+    db.add(record)
+    await db.commit()
+    # Reload the row after the commit before reading its ID.
+    await db.refresh(record)
+
+    return {"id": str(record.id)}
+
+
+@router.post("/{eval_id}/evaluate", response_model=dict)
+async def run_evaluation(
+    project_id: UUID,
+    eval_id: UUID,
+    request: RunEvalRequest,
+    db: AsyncSession = Depends(get_db)
+):
+    """Register an evaluation task for an evaluation dataset.
+
+    A task row with type "eval" and status "pending" is created; no work is
+    started yet. Answers 404 when the dataset does not exist in the project.
+    """
+    lookup = select(EvalDataset).where(
+        EvalDataset.id == eval_id,
+        EvalDataset.project_id == project_id,
+    )
+    found = (await db.execute(lookup)).scalar_one_or_none()
+    if found is None:
+        raise HTTPException(status_code=404, detail="Evaluation dataset not found")
+
+    pending_task = Task(project_id=project_id, task_type="eval", status="pending")
+    db.add(pending_task)
+    await db.commit()
+    await db.refresh(pending_task)
+
+    # TODO: Start evaluation in background
+
+    return {"task_id": str(pending_task.id), "message": "Evaluation task started"}
+
+
+@router.get("/results", response_model=TaskResponse)
+async def get_eval_results(project_id: UUID, task_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Return the task record for an evaluation run.
+
+    The response model is the task schema because the handler returns a
+    ``TaskResponse`` instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no task with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(Task).where(Task.id == task_id, Task.project_id == project_id)
+    )
+    task = result.scalar_one_or_none()
+    if not task:
+        raise HTTPException(status_code=404, detail="Task not found")
+
+    return TaskResponse.model_validate(task)
--- a/backend/app/api/v1/files/init.py
+++ b/backend/app/api/v1/files/init.py
@@ -0,0 +1,110 @@
+"""
+Files API Router
+"""
+import os
+import aiofiles
+from pathlib import Path
+from typing import List
+from uuid import UUID
+from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from app.core.database import get_db
+from app.core.config import get_settings
+from app.models.models import File
+from app.schemas.base import FileResponse
+
+# Application settings, read once when this module is imported.
+settings = get_settings()
+# Router that collects the file endpoints defined in this module.
+router = APIRouter()
+
+# Ensure upload directory exists
+# (this runs at import time; uploads are written below this directory).
+UPLOAD_DIR = Path(settings.UPLOAD_DIR)
+UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def get_file_type(filename: str) -> str:
+    """Translate a filename's extension into a canonical file-type label.
+
+    The extension is the text after the last dot, compared case-insensitively.
+    Aliases collapse onto one label ("doc" -> "docx", "xls" -> "xlsx",
+    "markdown" -> "md"); names without a dot or with an unknown extension are
+    reported as "txt".
+    """
+    _, dot, suffix = filename.rpartition('.')
+    extension = suffix.lower() if dot else ''
+
+    aliases = {'doc': 'docx', 'xls': 'xlsx', 'markdown': 'md'}
+    label = aliases.get(extension, extension)
+
+    known_labels = ('pdf', 'docx', 'xlsx', 'csv', 'epub', 'md', 'txt')
+    if label in known_labels:
+        return label
+    return 'txt'
+
+
+@router.post("/upload", response_model=dict)
+async def upload_file(
+    project_id: UUID,
+    file: UploadFile,
+    db: AsyncSession = Depends(get_db)
+):
+    """Store an uploaded file on disk and register it for the project.
+
+    The ``file`` parameter deliberately has no ``File(...)`` default: in this
+    module the name ``File`` is rebound to the ORM model by the later
+    ``app.models.models`` import, so ``File(...)`` would call the model
+    constructor at import time. An ``UploadFile`` annotation alone is enough
+    for FastAPI to read the parameter from the multipart form.
+
+    Args:
+        project_id: Project the file belongs to.
+        file: The uploaded file.
+        db: Database session.
+
+    Returns:
+        A dict with the new record's ID, stored filename and status.
+
+    Raises:
+        HTTPException: 400 when the upload carries no usable filename.
+    """
+    # The filename is client-controlled: keep only its last path component so
+    # directory separators can neither escape nor break the upload directory.
+    raw_name = file.filename or ""
+    safe_name = Path(raw_name.replace("\\", "/")).name
+    if not safe_name:
+        raise HTTPException(status_code=400, detail="Filename is required")
+
+    # Save file to disk
+    file_path = UPLOAD_DIR / f"{project_id}_{safe_name}"
+    content = await file.read()
+    async with aiofiles.open(file_path, 'wb') as f:
+        await f.write(content)
+
+    # Create file record (``File`` is the ORM model here, see docstring).
+    db_file = File(
+        project_id=project_id,
+        filename=safe_name,
+        file_type=get_file_type(safe_name),
+        file_path=str(file_path),
+        size=len(content),
+        status="pending"
+    )
+    db.add(db_file)
+    await db.commit()
+    await db.refresh(db_file)
+
+    return {"id": str(db_file.id), "filename": db_file.filename, "status": db_file.status}
+
+
+@router.get("/", response_model=dict)
+async def list_files(project_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Return a project's file records, newest first."""
+    stmt = (
+        select(File)
+        .where(File.project_id == project_id)
+        .order_by(File.created_at.desc())
+    )
+    rows = (await db.execute(stmt)).scalars().all()
+    serialized = [FileResponse.model_validate(row) for row in rows]
+    return {"files": serialized}
+
+
+@router.get("/{file_id}", response_model=FileResponse)
+async def get_file(project_id: UUID, file_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Fetch a single file record of a project.
+
+    The response model is the file schema because the handler returns a
+    ``FileResponse`` schema instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no file with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(File).where(File.id == file_id, File.project_id == project_id)
+    )
+    file = result.scalar_one_or_none()
+    if not file:
+        raise HTTPException(status_code=404, detail="File not found")
+    return FileResponse.model_validate(file)
+
+
+@router.delete("/{file_id}", response_model=dict)
+async def delete_file(project_id: UUID, file_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Delete a file record and its stored file.
+
+    The database row is removed first; the file on disk is deleted only after
+    that commit succeeded, so a failed database delete never leaves a record
+    pointing at a file that is already gone.
+
+    Raises:
+        HTTPException: 404 when no file with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(File).where(File.id == file_id, File.project_id == project_id)
+    )
+    file = result.scalar_one_or_none()
+    if not file:
+        raise HTTPException(status_code=404, detail="File not found")
+
+    # Read the path before the row is deleted and the object is detached.
+    stored_path = file.file_path
+
+    await db.delete(file)
+    await db.commit()
+
+    # Delete file from disk. A file that is already missing is not an error,
+    # and attempting the removal directly avoids an exists-then-remove race.
+    if stored_path:
+        try:
+            os.remove(stored_path)
+        except FileNotFoundError:
+            pass
+
+    return {"message": "File deleted successfully"}
--- a/backend/app/api/v1/projects/init.py
+++ b/backend/app/api/v1/projects/init.py
@@ -0,0 +1,74 @@
+"""
+Projects API Router
+"""
+from typing import List
+from uuid import UUID
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from app.core.database import get_db
+from app.models.models import Project
+from app.schemas.base import (
+    ProjectCreate,
+    ProjectUpdate,
+    ProjectResponse
+)
+
+# Router that collects the project endpoints defined in this module.
+router = APIRouter()
+
+
+@router.get("/", response_model=dict)
+async def list_projects(db: AsyncSession = Depends(get_db)):
+    """Return every project, newest first."""
+    stmt = select(Project).order_by(Project.created_at.desc())
+    rows = (await db.execute(stmt)).scalars().all()
+    serialized = [ProjectResponse.model_validate(row) for row in rows]
+    return {"projects": serialized}
+
+
+@router.post("/", response_model=dict)
+async def create_project(project: ProjectCreate, db: AsyncSession = Depends(get_db)):
+    """Store a new project and return its ID as a string."""
+    fields = project.model_dump()
+    record = Project(**fields)
+
+    db.add(record)
+    await db.commit()
+    # Reload the row after the commit before reading its ID.
+    await db.refresh(record)
+    return {"id": str(record.id)}
+
+
+@router.get("/{project_id}", response_model=ProjectResponse)
+async def get_project(project_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Fetch a single project.
+
+    The response model is the project schema because the handler returns a
+    ``ProjectResponse`` instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no project with this ID exists.
+    """
+    result = await db.execute(select(Project).where(Project.id == project_id))
+    project = result.scalar_one_or_none()
+    if not project:
+        raise HTTPException(status_code=404, detail="Project not found")
+    return ProjectResponse.model_validate(project)
+
+
+@router.put("/{project_id}", response_model=ProjectResponse)
+async def update_project(project_id: UUID, project: ProjectUpdate, db: AsyncSession = Depends(get_db)):
+    """Update a project with the fields supplied in the request body.
+
+    Only fields explicitly present in the body are written. The response model
+    is the project schema because the handler returns a ``ProjectResponse``
+    instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no project with this ID exists.
+    """
+    result = await db.execute(select(Project).where(Project.id == project_id))
+    db_project = result.scalar_one_or_none()
+    if not db_project:
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    for key, value in project.model_dump(exclude_unset=True).items():
+        setattr(db_project, key, value)
+
+    await db.commit()
+    # Reload so the response reflects the committed row.
+    await db.refresh(db_project)
+    return ProjectResponse.model_validate(db_project)
+
+
+@router.delete("/{project_id}", response_model=dict)
+async def delete_project(project_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Remove a project; answers 404 when it does not exist."""
+    lookup = select(Project).where(Project.id == project_id)
+    target = (await db.execute(lookup)).scalar_one_or_none()
+    if target is None:
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    await db.delete(target)
+    await db.commit()
+    return {"message": "Project deleted successfully"}
--- a/backend/app/api/v1/questions/init.py
+++ b/backend/app/api/v1/questions/init.py
@@ -0,0 +1,122 @@
+"""
+Questions API Router
+"""
+from typing import List, Optional
+from uuid import UUID
+from pydantic import BaseModel
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from app.core.database import get_db
+from app.models.models import Question, Chunk
+from app.schemas.base import QuestionCreate, QuestionResponse
+
+# Router that collects the question endpoints defined in this module.
+router = APIRouter()
+
+
+class GenerateRequest(BaseModel):
+    """Request body for the question generation endpoint."""
+    # Chunks to generate questions for; ``generate_questions`` answers HTTP 400
+    # when the list is empty.
+    chunk_ids: List[UUID] = []
+    # Number of questions created per chunk.
+    count: int = 5
+    # Requested question types; ``generate_questions`` currently uses only the
+    # first entry and falls back to "fact" when the list is empty.
+    question_types: List[str] = ["fact", "summary"]
+
+
+@router.post("/generate", response_model=dict)
+async def generate_questions(
+    project_id: UUID,
+    request: GenerateRequest,
+    db: AsyncSession = Depends(get_db)
+):
+    """Create placeholder questions for the requested chunks.
+
+    For every matching chunk, ``request.count`` sample questions are stored
+    with source "generated". Answers 400 when no chunk IDs are given and 404
+    when none of them belongs to the project.
+    """
+    # TODO: Implement LLM-based question generation
+    # This is a placeholder that creates sample questions
+
+    if not request.chunk_ids:
+        raise HTTPException(status_code=400, detail="chunk_ids is required")
+
+    lookup = select(Chunk).where(
+        Chunk.id.in_(request.chunk_ids),
+        Chunk.project_id == project_id,
+    )
+    source_chunks = (await db.execute(lookup)).scalars().all()
+    if not source_chunks:
+        raise HTTPException(status_code=404, detail="No chunks found")
+
+    # Only the first requested type is used by the placeholder.
+    question_type = request.question_types[0] if request.question_types else "fact"
+
+    created_questions = []
+    for source in source_chunks:
+        for number in range(1, request.count + 1):
+            created_questions.append(
+                Question(
+                    project_id=project_id,
+                    chunk_id=source.id,
+                    content=f"这是关于「{source.name}」的问题 {number}？",
+                    answer=f"这是问题 {number} 的答案。",
+                    question_type=question_type,
+                    source="generated"
+                )
+            )
+    db.add_all(created_questions)
+
+    await db.commit()
+
+    total = len(created_questions)
+    return {
+        "questions": total,
+        "message": f"Successfully generated {total} questions"
+    }
+
+
+@router.get("/", response_model=dict)
+async def list_questions(
+    project_id: UUID,
+    chunk_id: Optional[UUID] = Query(None),
+    db: AsyncSession = Depends(get_db)
+):
+    """Return a project's questions, optionally limited to one chunk."""
+    conditions = [Question.project_id == project_id]
+    if chunk_id:
+        conditions.append(Question.chunk_id == chunk_id)
+
+    stmt = select(Question).where(*conditions)
+    rows = (await db.execute(stmt)).scalars().all()
+
+    return {"questions": [QuestionResponse.model_validate(row) for row in rows]}
+
+
+@router.put("/{question_id}", response_model=QuestionResponse)
+async def update_question(
+    project_id: UUID,
+    question_id: UUID,
+    question: QuestionCreate,
+    db: AsyncSession = Depends(get_db)
+):
+    """Update a question with the fields supplied in the request body.
+
+    Only fields explicitly present in the body are written. The response model
+    is the question schema because the handler returns a ``QuestionResponse``
+    instance rather than a ``dict``.
+
+    Raises:
+        HTTPException: 404 when no question with this ID exists in the project.
+    """
+    result = await db.execute(
+        select(Question).where(Question.id == question_id, Question.project_id == project_id)
+    )
+    db_question = result.scalar_one_or_none()
+    if not db_question:
+        raise HTTPException(status_code=404, detail="Question not found")
+
+    for key, value in question.model_dump(exclude_unset=True).items():
+        setattr(db_question, key, value)
+
+    await db.commit()
+    # Reload so the response reflects the committed row.
+    await db.refresh(db_question)
+    return QuestionResponse.model_validate(db_question)
+
+
+@router.delete("/{question_id}", response_model=dict)
+async def delete_question(project_id: UUID, question_id: UUID, db: AsyncSession = Depends(get_db)):
+    """Remove one question from a project; answers 404 when it does not exist."""
+    lookup = select(Question).where(
+        Question.id == question_id,
+        Question.project_id == project_id,
+    )
+    target = (await db.execute(lookup)).scalar_one_or_none()
+    if target is None:
+        raise HTTPException(status_code=404, detail="Question not found")
+
+    await db.delete(target)
+    await db.commit()
+    return {"message": "Question deleted successfully"}