# Source: YG-Datasets/backend/app/models/models.py (163 lines, 6.6 KiB, Python)
"""
Database Models for YG-Dataset
"""
from sqlalchemy import Column, String, Text, Integer, BigInteger, ForeignKey, JSON
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship
from app.core.database import Base
from app.models.base import UUIDMixin, TimestampMixin
class Project(Base, UUIDMixin, TimestampMixin):
    """Top-level project that owns files, chunks, tags, datasets and tasks.

    Every child collection declared here uses the ``all, delete-orphan``
    cascade, so the ORM deletes the children together with the project and
    also deletes a child that is removed from its collection.
    """

    __tablename__ = "projects"

    # --- Columns ---
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # --- Relationships (one project -> many children) ---
    files = relationship(
        "File",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    chunks = relationship(
        "Chunk",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    tags = relationship(
        "Tag",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    datasets = relationship(
        "Dataset",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    eval_datasets = relationship(
        "EvalDataset",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    model_configs = relationship(
        "ModelConfig",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    tasks = relationship(
        "Task",
        back_populates="project",
        cascade="all, delete-orphan",
    )
class File(Base, UUIDMixin, TimestampMixin):
    """Uploaded document that belongs to a project and is split into chunks."""

    __tablename__ = "files"

    # --- Columns ---
    # Owning project; the foreign key is declared with ON DELETE CASCADE.
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    filename = Column(String(255), nullable=False)
    # One of: pdf, docx, xlsx, csv, epub, md, txt
    file_type = Column(String(50), nullable=False)
    file_path = Column(String(500))
    # File size in bytes
    size = Column(BigInteger)
    # One of: pending, processing, completed, failed
    status = Column(String(20), default="pending")

    # --- Relationships ---
    project = relationship("Project", back_populates="files")
    # Chunks are deleted by the ORM along with the file (all, delete-orphan).
    chunks = relationship(
        "Chunk",
        back_populates="file",
        cascade="all, delete-orphan",
    )
class Chunk(Base, UUIDMixin, TimestampMixin):
    """Piece of text produced by splitting; may be linked to a source file."""

    __tablename__ = "chunks"

    # --- Columns ---
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Nullable: a chunk is not required to reference a file.
    file_id = Column(
        UUID(as_uuid=True),
        ForeignKey("files.id", ondelete="CASCADE"),
    )
    name = Column(String(255))
    content = Column(Text, nullable=False)
    summary = Column(Text)
    word_count = Column(Integer)
    # Additional info such as headings or page numbers
    extra_data = Column(JSON)

    # --- Relationships ---
    project = relationship("Project", back_populates="chunks")
    file = relationship("File", back_populates="chunks")
    questions = relationship(
        "Question",
        back_populates="chunk",
        cascade="all, delete-orphan",
    )
    chunk_tags = relationship(
        "ChunkTag",
        back_populates="chunk",
        cascade="all, delete-orphan",
    )
class Tag(Base, UUIDMixin, TimestampMixin):
    """Tag/label used to categorize content within a project.

    Tags can be nested: ``parent_id`` is a self-referential foreign key to
    ``tags.id``, exposed through the ``parent`` / ``children`` relationships.
    """

    __tablename__ = "tags"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    label = Column(String(255), nullable=False)
    # Nullable: a tag without a parent is a root tag.
    parent_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"))
    color = Column(String(20))  # hex color code

    # Relationships
    project = relationship("Project", back_populates="tags")
    parent = relationship("Tag", remote_side="Tag.id", back_populates="children")
    # NOTE(review): no delete cascade is configured here, so an ORM-level delete
    # of a parent nulls the children's parent_id instead of deleting them, even
    # though the foreign key declares ON DELETE CASCADE. Confirm which behavior
    # is intended before changing it.
    children = relationship("Tag", back_populates="parent")
    # ChunkTag.tag_id is NOT NULL. Without a delete cascade the ORM would try to
    # set tag_id to NULL on the association rows when a tag is deleted, which
    # fails with an integrity error. Cascade the delete instead, mirroring
    # Chunk.chunk_tags.
    chunk_tags = relationship("ChunkTag", back_populates="tag", cascade="all, delete-orphan")
class ChunkTag(Base, UUIDMixin):
    """Association row linking one chunk to one tag (many-to-many bridge)."""

    __tablename__ = "chunk_tags"

    # --- Columns ---
    # Both sides are required and declared with ON DELETE CASCADE.
    chunk_id = Column(
        UUID(as_uuid=True),
        ForeignKey("chunks.id", ondelete="CASCADE"),
        nullable=False,
    )
    tag_id = Column(
        UUID(as_uuid=True),
        ForeignKey("tags.id", ondelete="CASCADE"),
        nullable=False,
    )

    # --- Relationships ---
    chunk = relationship("Chunk", back_populates="chunk_tags")
    tag = relationship("Tag", back_populates="chunk_tags")
class Question(Base, UUIDMixin, TimestampMixin):
    """Question with an optional answer, optionally tied to a chunk."""

    __tablename__ = "questions"

    # --- Columns ---
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Nullable: a question is not required to reference a chunk.
    chunk_id = Column(
        UUID(as_uuid=True),
        ForeignKey("chunks.id", ondelete="CASCADE"),
    )
    # Question text
    content = Column(Text, nullable=False)
    # Answer text
    answer = Column(Text)
    # e.g. fact, summary, reasoning
    question_type = Column(String(50))
    # One of: manual, generated
    source = Column(String(50), default="manual")

    # --- Relationships ---
    # One-directional: no back_populates is declared towards Project.
    project = relationship("Project")
    chunk = relationship("Chunk", back_populates="questions")
class Dataset(Base, UUIDMixin, TimestampMixin):
    """Named dataset that belongs to a project."""

    __tablename__ = "datasets"

    # --- Columns ---
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    name = Column(String(255), nullable=False)
    description = Column(Text)
    # One of: qa, conversation, instruction
    dataset_type = Column(String(50))
    # Free-form JSON payload
    extra_data = Column(JSON)

    # --- Relationships ---
    project = relationship("Project", back_populates="datasets")
class EvalDataset(Base, UUIDMixin, TimestampMixin):
    """Named evaluation dataset that belongs to a project."""

    __tablename__ = "eval_datasets"

    # --- Columns ---
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    name = Column(String(255), nullable=False)
    # One of: mixed, fact, reasoning
    question_type = Column(String(50))
    # Free-form JSON payload
    extra_data = Column(JSON)

    # --- Relationships ---
    project = relationship("Project", back_populates="eval_datasets")
class ModelConfig(Base, UUIDMixin, TimestampMixin):
    """Connection settings for an LLM provider, optionally scoped to a project."""

    __tablename__ = "model_configs"

    # --- Columns ---
    # Nullable: a configuration may exist without an owning project.
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=True,
    )
    # e.g. minimax, glm, openai
    provider = Column(String(50), nullable=False)
    model_name = Column(String(100))
    # NOTE(review): the key is held in a plain string column; confirm whether
    # it is encrypted before being stored.
    api_key = Column(String(500))
    api_base = Column(String(500))
    # Stored as a string flag (default "false"), not as a boolean column.
    is_default = Column(String(10), default="false")
    # One of: untested, connected, disconnected
    connection_status = Column(String(20), default="untested")

    # --- Relationships ---
    project = relationship("Project", back_populates="model_configs")
class Task(Base, UUIDMixin, TimestampMixin):
    """Background job record with its type, status, progress and outcome."""

    __tablename__ = "tasks"

    # --- Columns ---
    # Nullable: a task is not required to reference a project.
    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
    )
    # One of: split, generate, eval, export
    task_type = Column(String(50))
    # One of: pending, running, completed, failed
    status = Column(String(20), default="pending")
    # Completion percentage, 0-100
    progress = Column(Integer, default=0)
    result = Column(JSON)
    error = Column(Text)

    # --- Relationships ---
    project = relationship("Project", back_populates="tasks")