first-update

This commit is contained in:
2026-03-17 14:36:31 +08:00
parent 72f08aee7c
commit 4eddf05e79
516 changed files with 115270 additions and 1 deletions

View File

@@ -0,0 +1,17 @@
"""
API v1 Router
"""
from fastapi import APIRouter
from app.api.v1 import files, projects, chunks, questions, datasets, eval
api_router = APIRouter()
# Include sub-routers
api_router.include_router(projects.router, prefix="/projects", tags=["projects"])
api_router.include_router(files.router, prefix="/files", tags=["files"])
api_router.include_router(chunks.router, prefix="/chunks", tags=["chunks"])
api_router.include_router(questions.router, prefix="/questions", tags=["questions"])
api_router.include_router(datasets.router, prefix="/datasets", tags=["datasets"])
api_router.include_router(eval.router, prefix="/eval", tags=["eval"])

View File

@@ -0,0 +1,182 @@
"""
Chunks API Router
"""
from typing import List, Optional
from uuid import UUID
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from app.core.database import get_db
from app.models.models import Chunk, File
from app.schemas.base import ChunkCreate, ChunkResponse
from app.services.text_splitter.splitter import get_splitter
from app.services.file_processor.pdf_processor import process_pdf
from app.services.file_processor.docx_processor import process_docx
from app.services.file_processor.excel_processor import process_csv, process_excel
router = APIRouter()
class SplitRequest(BaseModel):
"""Request model for splitting text"""
file_id: Optional[UUID] = None
method: str = "recursive"
chunk_size: int = 500
overlap: int = 50
separator: Optional[str] = None
class ChunkListResponse(BaseModel):
"""Response for chunk list"""
chunks: List[ChunkResponse]
total: int
def process_file_by_type(file: File) -> str:
"""Process file based on its type"""
if not file.file_path:
raise HTTPException(status_code=400, detail="File path not found")
processors = {
"pdf": process_pdf,
"docx": process_docx,
"xlsx": process_excel,
"csv": process_csv,
}
processor = processors.get(file.file_type)
if not processor:
# Return raw text for txt, md files
with open(file.file_path, 'r', encoding='utf-8') as f:
return f.read()
return processor(file.file_path)
@router.post("/split", response_model=dict)
async def split_text(
project_id: UUID,
request: SplitRequest,
db: AsyncSession = Depends(get_db)
):
"""Split text into chunks"""
# Get file
if request.file_id:
result = await db.execute(
select(File).where(File.id == request.file_id, File.project_id == project_id)
)
file = result.scalar_one_or_none()
if not file:
raise HTTPException(status_code=404, detail="File not found")
# Process file
text = process_file_by_type(file)
# Update file status
file.status = "processing"
await db.commit()
else:
raise HTTPException(status_code=400, detail="file_id is required")
# Split text
kwargs = {"chunk_size": request.chunk_size, "overlap": request.overlap}
if request.method == "custom" and request.separator:
kwargs["separator"] = request.separator
splitter = get_splitter(request.method, **kwargs)
split_results = splitter.split(text)
# Save chunks
chunks = []
for chunk_data in split_results:
db_chunk = Chunk(
project_id=project_id,
file_id=file.id,
name=chunk_data.get("name", f"Chunk {chunk_data['index'] + 1}"),
content=chunk_data["content"],
word_count=chunk_data.get("word_count", len(chunk_data["content"].split()))
)
db.add(db_chunk)
chunks.append(db_chunk)
await db.commit()
# Update file status
file.status = "completed"
await db.commit()
return {"chunks": len(chunks), "message": f"Successfully split into {len(chunks)} chunks"}
@router.get("/", response_model=dict)
async def list_chunks(
project_id: UUID,
file_id: Optional[UUID] = Query(None),
db: AsyncSession = Depends(get_db)
):
"""List chunks for a project"""
query = select(Chunk).where(Chunk.project_id == project_id)
if file_id:
query = query.where(Chunk.file_id == file_id)
query = query.order_by(Chunk.created_at.desc())
result = await db.execute(query)
chunks = result.scalars().all()
return {
"chunks": [ChunkResponse.model_validate(c) for c in chunks],
"total": len(chunks)
}
@router.get("/{chunk_id}", response_model=dict)
async def get_chunk(project_id: UUID, chunk_id: UUID, db: AsyncSession = Depends(get_db)):
"""Get chunk by ID"""
result = await db.execute(
select(Chunk).where(Chunk.id == chunk_id, Chunk.project_id == project_id)
)
chunk = result.scalar_one_or_none()
if not chunk:
raise HTTPException(status_code=404, detail="Chunk not found")
return ChunkResponse.model_validate(chunk)
@router.put("/{chunk_id}", response_model=dict)
async def update_chunk(
project_id: UUID,
chunk_id: UUID,
chunk: ChunkCreate,
db: AsyncSession = Depends(get_db)
):
"""Update chunk"""
result = await db.execute(
select(Chunk).where(Chunk.id == chunk_id, Chunk.project_id == project_id)
)
db_chunk = result.scalar_one_or_none()
if not db_chunk:
raise HTTPException(status_code=404, detail="Chunk not found")
for key, value in chunk.model_dump(exclude_unset=True).items():
setattr(db_chunk, key, value)
await db.commit()
await db.refresh(db_chunk)
return ChunkResponse.model_validate(db_chunk)
@router.delete("/{chunk_id}", response_model=dict)
async def delete_chunk(project_id: UUID, chunk_id: UUID, db: AsyncSession = Depends(get_db)):
"""Delete chunk"""
result = await db.execute(
select(Chunk).where(Chunk.id == chunk_id, Chunk.project_id == project_id)
)
chunk = result.scalar_one_or_none()
if not chunk:
raise HTTPException(status_code=404, detail="Chunk not found")
await db.delete(chunk)
await db.commit()
return {"message": "Chunk deleted successfully"}

View File

@@ -0,0 +1,126 @@
"""
Datasets API Router
"""
from typing import List, Optional
from uuid import UUID
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, func
from app.core.database import get_db
from app.models.models import Dataset, Question
from app.schemas.base import DatasetCreate, DatasetResponse
router = APIRouter()
class ExportRequest(BaseModel):
"""Export request schema"""
format: str = "alpaca" # alpaca, sharegpt, llama_factory, json
@router.get("/", response_model=dict)
async def list_datasets(project_id: UUID, db: AsyncSession = Depends(get_db)):
"""List datasets for a project"""
result = await db.execute(
select(Dataset).where(Dataset.project_id == project_id).order_by(Dataset.created_at.desc())
)
datasets = result.scalars().all()
# Get question count for each dataset
dataset_list = []
for dataset in datasets:
dataset_data = DatasetResponse.model_validate(dataset)
# TODO: Count questions in dataset
dataset_data.question_count = 0
dataset_list.append(dataset_data)
return {"datasets": dataset_list}
@router.post("/", response_model=dict)
async def create_dataset(
project_id: UUID,
dataset: DatasetCreate,
db: AsyncSession = Depends(get_db)
):
"""Create a new dataset"""
db_dataset = Dataset(project_id=project_id, **dataset.model_dump())
db.add(db_dataset)
await db.commit()
await db.refresh(db_dataset)
return {"id": str(db_dataset.id)}
@router.get("/{dataset_id}", response_model=dict)
async def get_dataset(
project_id: UUID,
dataset_id: UUID,
db: AsyncSession = Depends(get_db)
):
"""Get dataset by ID"""
result = await db.execute(
select(Dataset).where(Dataset.id == dataset_id, Dataset.project_id == project_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
raise HTTPException(status_code=404, detail="Dataset not found")
return DatasetResponse.model_validate(dataset)
@router.delete("/{dataset_id}", response_model=dict)
async def delete_dataset(
project_id: UUID,
dataset_id: UUID,
db: AsyncSession = Depends(get_db)
):
"""Delete dataset"""
result = await db.execute(
select(Dataset).where(Dataset.id == dataset_id, Dataset.project_id == project_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
raise HTTPException(status_code=404, detail="Dataset not found")
await db.delete(dataset)
await db.commit()
return {"message": "Dataset deleted successfully"}
@router.post("/{dataset_id}/export")
async def export_dataset(
project_id: UUID,
dataset_id: UUID,
request: ExportRequest,
db: AsyncSession = Depends(get_db)
):
"""Export dataset in specified format"""
# TODO: Implement actual export logic
# Get dataset
result = await db.execute(
select(Dataset).where(Dataset.id == dataset_id, Dataset.project_id == project_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
raise HTTPException(status_code=404, detail="Dataset not found")
# Get questions for this dataset (placeholder)
# In real implementation, would link questions to datasets
# Return sample data based on format
sample_data = [
{
"instruction": "这是一个示例指令",
"input": "",
"output": "这是一个示例输出"
}
]
if request.format == "json":
return sample_data
return {"data": sample_data, "format": request.format}

View File

@@ -0,0 +1,100 @@
"""
Evaluation API Router
"""
from typing import List, Optional
from uuid import UUID
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from app.core.database import get_db
from app.models.models import EvalDataset, Task
from app.schemas.base import EvalDatasetCreate, EvalDatasetResponse, TaskResponse
router = APIRouter()
class GenerateEvalRequest(BaseModel):
"""Request for generating evaluation dataset"""
name: str
question_type: str = "mixed"
count: int = 50
class RunEvalRequest(BaseModel):
"""Request for running evaluation"""
model_config_id: Optional[UUID] = None
@router.get("/", response_model=dict)
async def list_eval_datasets(project_id: UUID, db: AsyncSession = Depends(get_db)):
"""List evaluation datasets"""
result = await db.execute(
select(EvalDataset).where(EvalDataset.project_id == project_id).order_by(EvalDataset.created_at.desc())
)
datasets = result.scalars().all()
return {"datasets": [EvalDatasetResponse.model_validate(d) for d in datasets]}
@router.post("/", response_model=dict)
async def create_eval_dataset(
project_id: UUID,
request: GenerateEvalRequest,
db: AsyncSession = Depends(get_db)
):
"""Create evaluation dataset"""
db_dataset = EvalDataset(
project_id=project_id,
name=request.name,
question_type=request.question_type
)
db.add(db_dataset)
await db.commit()
await db.refresh(db_dataset)
return {"id": str(db_dataset.id)}
@router.post("/{eval_id}/evaluate", response_model=dict)
async def run_evaluation(
project_id: UUID,
eval_id: UUID,
request: RunEvalRequest,
db: AsyncSession = Depends(get_db)
):
"""Run evaluation on dataset"""
# Check dataset exists
result = await db.execute(
select(EvalDataset).where(EvalDataset.id == eval_id, EvalDataset.project_id == project_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
raise HTTPException(status_code=404, detail="Evaluation dataset not found")
# Create evaluation task
task = Task(
project_id=project_id,
task_type="eval",
status="pending"
)
db.add(task)
await db.commit()
await db.refresh(task)
# TODO: Start evaluation in background
return {"task_id": str(task.id), "message": "Evaluation task started"}
@router.get("/results", response_model=dict)
async def get_eval_results(project_id: UUID, task_id: UUID, db: AsyncSession = Depends(get_db)):
"""Get evaluation results"""
result = await db.execute(
select(Task).where(Task.id == task_id, Task.project_id == project_id)
)
task = result.scalar_one_or_none()
if not task:
raise HTTPException(status_code=404, detail="Task not found")
return TaskResponse.model_validate(task)

View File

@@ -0,0 +1,110 @@
"""
Files API Router
"""
import os
import aiofiles
from pathlib import Path
from typing import List
from uuid import UUID
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from app.core.database import get_db
from app.core.config import get_settings
from app.models.models import File
from app.schemas.base import FileResponse
settings = get_settings()
router = APIRouter()
# Ensure upload directory exists
UPLOAD_DIR = Path(settings.UPLOAD_DIR)
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
def get_file_type(filename: str) -> str:
"""Get file type from extension"""
ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
type_map = {
'pdf': 'pdf',
'docx': 'docx',
'doc': 'docx',
'xlsx': 'xlsx',
'xls': 'xlsx',
'csv': 'csv',
'epub': 'epub',
'md': 'md',
'markdown': 'md',
'txt': 'txt'
}
return type_map.get(ext, 'txt')
@router.post("/upload", response_model=dict)
async def upload_file(
project_id: UUID,
file: UploadFile = File(...),
db: AsyncSession = Depends(get_db)
):
"""Upload a file"""
# Save file to disk
file_path = UPLOAD_DIR / f"{project_id}_{file.filename}"
async with aiofiles.open(file_path, 'wb') as f:
content = await file.read()
await f.write(content)
# Create file record
db_file = File(
project_id=project_id,
filename=file.filename,
file_type=get_file_type(file.filename),
file_path=str(file_path),
size=len(content),
status="pending"
)
db.add(db_file)
await db.commit()
await db.refresh(db_file)
return {"id": str(db_file.id), "filename": db_file.filename, "status": db_file.status}
@router.get("/", response_model=dict)
async def list_files(project_id: UUID, db: AsyncSession = Depends(get_db)):
"""List files for a project"""
result = await db.execute(
select(File).where(File.project_id == project_id).order_by(File.created_at.desc())
)
files = result.scalars().all()
return {"files": [FileResponse.model_validate(f) for f in files]}
@router.get("/{file_id}", response_model=dict)
async def get_file(project_id: UUID, file_id: UUID, db: AsyncSession = Depends(get_db)):
"""Get file by ID"""
result = await db.execute(
select(File).where(File.id == file_id, File.project_id == project_id)
)
file = result.scalar_one_or_none()
if not file:
raise HTTPException(status_code=404, detail="File not found")
return FileResponse.model_validate(file)
@router.delete("/{file_id}", response_model=dict)
async def delete_file(project_id: UUID, file_id: UUID, db: AsyncSession = Depends(get_db)):
"""Delete file"""
result = await db.execute(
select(File).where(File.id == file_id, File.project_id == project_id)
)
file = result.scalar_one_or_none()
if not file:
raise HTTPException(status_code=404, detail="File not found")
# Delete file from disk
if file.file_path and os.path.exists(file.file_path):
os.remove(file.file_path)
await db.delete(file)
await db.commit()
return {"message": "File deleted successfully"}

View File

@@ -0,0 +1,74 @@
"""
Projects API Router
"""
from typing import List
from uuid import UUID
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from app.core.database import get_db
from app.models.models import Project
from app.schemas.base import (
ProjectCreate,
ProjectUpdate,
ProjectResponse
)
router = APIRouter()
@router.get("/", response_model=dict)
async def list_projects(db: AsyncSession = Depends(get_db)):
"""List all projects"""
result = await db.execute(select(Project).order_by(Project.created_at.desc()))
projects = result.scalars().all()
return {"projects": [ProjectResponse.model_validate(p) for p in projects]}
@router.post("/", response_model=dict)
async def create_project(project: ProjectCreate, db: AsyncSession = Depends(get_db)):
"""Create a new project"""
db_project = Project(**project.model_dump())
db.add(db_project)
await db.commit()
await db.refresh(db_project)
return {"id": str(db_project.id)}
@router.get("/{project_id}", response_model=dict)
async def get_project(project_id: UUID, db: AsyncSession = Depends(get_db)):
"""Get project by ID"""
result = await db.execute(select(Project).where(Project.id == project_id))
project = result.scalar_one_or_none()
if not project:
raise HTTPException(status_code=404, detail="Project not found")
return ProjectResponse.model_validate(project)
@router.put("/{project_id}", response_model=dict)
async def update_project(project_id: UUID, project: ProjectUpdate, db: AsyncSession = Depends(get_db)):
"""Update project"""
result = await db.execute(select(Project).where(Project.id == project_id))
db_project = result.scalar_one_or_none()
if not db_project:
raise HTTPException(status_code=404, detail="Project not found")
for key, value in project.model_dump(exclude_unset=True).items():
setattr(db_project, key, value)
await db.commit()
await db.refresh(db_project)
return ProjectResponse.model_validate(db_project)
@router.delete("/{project_id}", response_model=dict)
async def delete_project(project_id: UUID, db: AsyncSession = Depends(get_db)):
"""Delete project"""
result = await db.execute(select(Project).where(Project.id == project_id))
project = result.scalar_one_or_none()
if not project:
raise HTTPException(status_code=404, detail="Project not found")
await db.delete(project)
await db.commit()
return {"message": "Project deleted successfully"}

View File

@@ -0,0 +1,122 @@
"""
Questions API Router
"""
from typing import List, Optional
from uuid import UUID
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from app.core.database import get_db
from app.models.models import Question, Chunk
from app.schemas.base import QuestionCreate, QuestionResponse
router = APIRouter()
class GenerateRequest(BaseModel):
"""Request model for generating questions"""
chunk_ids: List[UUID] = []
count: int = 5
question_types: List[str] = ["fact", "summary"]
@router.post("/generate", response_model=dict)
async def generate_questions(
project_id: UUID,
request: GenerateRequest,
db: AsyncSession = Depends(get_db)
):
"""Generate questions from chunks using LLM"""
# TODO: Implement LLM-based question generation
# This is a placeholder that creates sample questions
if not request.chunk_ids:
raise HTTPException(status_code=400, detail="chunk_ids is required")
# Get chunks
result = await db.execute(
select(Chunk).where(Chunk.id.in_(request.chunk_ids), Chunk.project_id == project_id)
)
chunks = result.scalars().all()
if not chunks:
raise HTTPException(status_code=404, detail="No chunks found")
# Create sample questions (placeholder)
created_questions = []
for chunk in chunks:
for i in range(request.count):
question = Question(
project_id=project_id,
chunk_id=chunk.id,
content=f"这是关于「{chunk.name}」的问题 {i+1}",
answer=f"这是问题 {i+1} 的答案。",
question_type=request.question_types[0] if request.question_types else "fact",
source="generated"
)
db.add(question)
created_questions.append(question)
await db.commit()
return {
"questions": len(created_questions),
"message": f"Successfully generated {len(created_questions)} questions"
}
@router.get("/", response_model=dict)
async def list_questions(
project_id: UUID,
chunk_id: Optional[UUID] = Query(None),
db: AsyncSession = Depends(get_db)
):
"""List questions for a project"""
query = select(Question).where(Question.project_id == project_id)
if chunk_id:
query = query.where(Question.chunk_id == chunk_id)
result = await db.execute(query)
questions = result.scalars().all()
return {"questions": [QuestionResponse.model_validate(q) for q in questions]}
@router.put("/{question_id}", response_model=dict)
async def update_question(
project_id: UUID,
question_id: UUID,
question: QuestionCreate,
db: AsyncSession = Depends(get_db)
):
"""Update question"""
result = await db.execute(
select(Question).where(Question.id == question_id, Question.project_id == project_id)
)
db_question = result.scalar_one_or_none()
if not db_question:
raise HTTPException(status_code=404, detail="Question not found")
for key, value in question.model_dump(exclude_unset=True).items():
setattr(db_question, key, value)
await db.commit()
await db.refresh(db_question)
return QuestionResponse.model_validate(db_question)
@router.delete("/{question_id}", response_model=dict)
async def delete_question(project_id: UUID, question_id: UUID, db: AsyncSession = Depends(get_db)):
"""Delete question"""
result = await db.execute(
select(Question).where(Question.id == question_id, Question.project_id == project_id)
)
question = result.scalar_one_or_none()
if not question:
raise HTTPException(status_code=404, detail="Question not found")
await db.delete(question)
await db.commit()
return {"message": "Question deleted successfully"}