"""
Database Models for YG-Dataset

SQLAlchemy declarative models for projects and the records that belong to
them: uploaded files, text chunks, tags, questions, datasets, evaluation
datasets, LLM model configurations and background tasks.
"""
from sqlalchemy import (
    BigInteger,
    Column,
    ForeignKey,
    Integer,
    JSON,
    String,
    Text,
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship

from app.core.database import Base
from app.models.base import UUIDMixin, TimestampMixin


class Project(Base, UUIDMixin, TimestampMixin):
    """Project model.

    Root of the ownership tree: every collection below is deleted together
    with the project (ORM ``delete-orphan`` cascade).
    """

    __tablename__ = "projects"

    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Relationships
    files = relationship("File", back_populates="project", cascade="all, delete-orphan")
    chunks = relationship("Chunk", back_populates="project", cascade="all, delete-orphan")
    tags = relationship("Tag", back_populates="project", cascade="all, delete-orphan")
    datasets = relationship("Dataset", back_populates="project", cascade="all, delete-orphan")
    eval_datasets = relationship("EvalDataset", back_populates="project", cascade="all, delete-orphan")
    model_configs = relationship("ModelConfig", back_populates="project", cascade="all, delete-orphan")
    tasks = relationship("Task", back_populates="project", cascade="all, delete-orphan")


class File(Base, UUIDMixin, TimestampMixin):
    """File model for uploaded documents."""

    __tablename__ = "files"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    filename = Column(String(255), nullable=False)
    file_type = Column(String(50), nullable=False)  # pdf, docx, xlsx, csv, epub, md, txt
    file_path = Column(String(500))
    size = Column(BigInteger)  # file size in bytes
    status = Column(String(20), default="pending")  # pending, processing, completed, failed

    # Relationships
    project = relationship("Project", back_populates="files")
    chunks = relationship("Chunk", back_populates="file", cascade="all, delete-orphan")


class Chunk(Base, UUIDMixin, TimestampMixin):
    """Text chunk model after splitting."""

    __tablename__ = "chunks"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    file_id = Column(UUID(as_uuid=True), ForeignKey("files.id", ondelete="CASCADE"))
    name = Column(String(255))
    content = Column(Text, nullable=False)
    summary = Column(Text)
    word_count = Column(Integer)
    # ``metadata`` is a reserved attribute name on declarative classes (it
    # holds the MetaData registry), so the Python attribute is ``metadata_``
    # while the database column keeps the name "metadata".
    metadata_ = Column("metadata", JSON)  # store additional info like headings, page numbers

    # Relationships
    project = relationship("Project", back_populates="chunks")
    file = relationship("File", back_populates="chunks")
    questions = relationship("Question", back_populates="chunk", cascade="all, delete-orphan")
    chunk_tags = relationship("ChunkTag", back_populates="chunk", cascade="all, delete-orphan")


class Tag(Base, UUIDMixin, TimestampMixin):
    """Tag/Label model for categorizing content.

    Tags form a tree through the self-referential ``parent_id`` foreign key.
    """

    __tablename__ = "tags"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    label = Column(String(255), nullable=False)
    parent_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"))
    color = Column(String(20))  # hex color code

    # Relationships
    project = relationship("Project", back_populates="tags")
    parent = relationship("Tag", remote_side="Tag.id", back_populates="children")
    # NOTE(review): no ORM delete cascade on ``children``; deleting a parent
    # through the ORM presumably nulls the children's parent_id instead of
    # removing them as the FK's ON DELETE CASCADE suggests - confirm intent.
    children = relationship("Tag", back_populates="parent")
    # Association rows have a NOT NULL tag_id, so they must be deleted with
    # the tag rather than have the foreign key nulled out (the ORM default
    # when no delete cascade is configured). Mirrors Chunk.chunk_tags.
    chunk_tags = relationship("ChunkTag", back_populates="tag", cascade="all, delete-orphan")


class ChunkTag(Base, UUIDMixin):
    """Many-to-many relationship between chunks and tags."""

    __tablename__ = "chunk_tags"

    chunk_id = Column(UUID(as_uuid=True), ForeignKey("chunks.id", ondelete="CASCADE"), nullable=False)
    tag_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False)

    # Relationships
    chunk = relationship("Chunk", back_populates="chunk_tags")
    tag = relationship("Tag", back_populates="chunk_tags")


class Question(Base, UUIDMixin, TimestampMixin):
    """Question/QA pair model."""

    __tablename__ = "questions"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    chunk_id = Column(UUID(as_uuid=True), ForeignKey("chunks.id", ondelete="CASCADE"))
    content = Column(Text, nullable=False)  # question content
    answer = Column(Text)  # answer content
    question_type = Column(String(50))  # fact, summary, reasoning, etc.
    source = Column(String(50), default="manual")  # manual, generated

    # Relationships
    # One-directional: Project declares no matching ``questions`` collection.
    project = relationship("Project")
    chunk = relationship("Chunk", back_populates="questions")


class Dataset(Base, UUIDMixin, TimestampMixin):
    """Dataset model."""

    __tablename__ = "datasets"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    name = Column(String(255), nullable=False)
    description = Column(Text)
    dataset_type = Column(String(50))  # qa, conversation, instruction
    # ``metadata`` is reserved on declarative classes; the attribute is
    # ``metadata_`` and the database column is still named "metadata".
    metadata_ = Column("metadata", JSON)

    # Relationships
    project = relationship("Project", back_populates="datasets")


class EvalDataset(Base, UUIDMixin, TimestampMixin):
    """Evaluation dataset model."""

    __tablename__ = "eval_datasets"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    name = Column(String(255), nullable=False)
    question_type = Column(String(50))  # mixed, fact, reasoning
    # ``metadata`` is reserved on declarative classes; the attribute is
    # ``metadata_`` and the database column is still named "metadata".
    metadata_ = Column("metadata", JSON)

    # Relationships
    project = relationship("Project", back_populates="eval_datasets")


class ModelConfig(Base, UUIDMixin, TimestampMixin):
    """Model configuration for LLM providers."""

    __tablename__ = "model_configs"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
    provider = Column(String(50), nullable=False)  # openai, anthropic, ollama, custom
    model_name = Column(String(100))
    # NOTE(review): plain String column - confirm the key is encrypted
    # before it is stored here.
    api_key = Column(String(500))
    api_base = Column(String(500))
    # NOTE(review): boolean flag stored as the strings "true"/"false";
    # kept as-is because switching to Boolean needs a schema migration.
    is_default = Column(String(10), default="false")

    # Relationships
    project = relationship("Project", back_populates="model_configs")


class Task(Base, UUIDMixin, TimestampMixin):
    """Task model for background jobs."""

    __tablename__ = "tasks"

    project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"))
    task_type = Column(String(50))  # split, generate, eval, export
    status = Column(String(20), default="pending")  # pending, running, completed, failed
    progress = Column(Integer, default=0)  # 0-100
    result = Column(JSON)
    error = Column(Text)

    # Relationships
    project = relationship("Project", back_populates="tasks")