2026-03-17 14:36:31 +08:00
|
|
|
"""
|
|
|
|
|
Database Models for YG-Dataset
|
|
|
|
|
"""
|
|
|
|
|
from sqlalchemy import Column, String, Text, Integer, BigInteger, ForeignKey, JSON
|
|
|
|
|
from sqlalchemy.dialects.postgresql import UUID
|
|
|
|
|
from sqlalchemy.orm import relationship
|
|
|
|
|
from app.core.database import Base
|
|
|
|
|
from app.models.base import UUIDMixin, TimestampMixin
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Project(Base, UUIDMixin, TimestampMixin):
    """Top-level project record stored in the ``projects`` table.

    Every relationship declared on this class uses
    ``cascade="all, delete-orphan"``, so the ORM deletes the related rows
    together with the project.
    """

    __tablename__ = "projects"

    name = Column(String(255), nullable=False)
    description = Column(Text)
    # Project kind: qa, table, database (defaults to "qa").
    # NOTE: this attribute shadows the builtin ``type`` inside the class body.
    type = Column(String(50), default="qa")

    # Relationships
    files = relationship(
        "File",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    chunks = relationship(
        "Chunk",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    tags = relationship(
        "Tag",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    datasets = relationship(
        "Dataset",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    eval_datasets = relationship(
        "EvalDataset",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    model_configs = relationship(
        "ModelConfig",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    tasks = relationship(
        "Task",
        back_populates="project",
        cascade="all, delete-orphan",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class File(Base, UUIDMixin, TimestampMixin):
    """Uploaded document belonging to a project (``files`` table)."""

    __tablename__ = "files"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    filename = Column(String(255), nullable=False)
    # Document format: pdf, docx, xlsx, csv, epub, md, txt
    file_type = Column(String(50), nullable=False)
    file_path = Column(String(500))
    # File size in bytes
    size = Column(BigInteger)
    # Processing state: pending, processing, completed, failed
    status = Column(String(20), default="pending")

    # Relationships
    project = relationship("Project", back_populates="files")
    # Chunks are deleted by the ORM along with the file they came from.
    chunks = relationship(
        "Chunk",
        back_populates="file",
        cascade="all, delete-orphan",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Chunk(Base, UUIDMixin, TimestampMixin):
    """Piece of text produced by splitting (``chunks`` table).

    A chunk always belongs to a project; the link to a source file is
    optional because ``file_id`` is nullable.
    """

    __tablename__ = "chunks"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    file_id = Column(
        UUID(as_uuid=True),
        ForeignKey("files.id", ondelete="CASCADE"),
    )
    name = Column(String(255))
    content = Column(Text, nullable=False)
    summary = Column(Text)
    word_count = Column(Integer)
    # Additional info such as headings and page numbers
    extra_data = Column(JSON)

    # Relationships
    project = relationship("Project", back_populates="chunks")
    file = relationship("File", back_populates="chunks")
    questions = relationship(
        "Question",
        back_populates="chunk",
        cascade="all, delete-orphan",
    )
    chunk_tags = relationship(
        "ChunkTag",
        back_populates="chunk",
        cascade="all, delete-orphan",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Tag(Base, UUIDMixin, TimestampMixin):
    """Tag/label used to categorize content within a project (``tags`` table).

    Tags can be nested: ``parent_id`` is a self-referential foreign key to
    ``tags.id``, exposed through the ``parent`` / ``children`` relationships.
    """

    __tablename__ = "tags"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    label = Column(String(255), nullable=False)
    # Nullable: a tag without a parent is a root of the tag tree.
    parent_id = Column(
        UUID(as_uuid=True),
        ForeignKey("tags.id", ondelete="CASCADE"),
    )
    color = Column(String(20))  # hex color code

    # Relationships
    project = relationship("Project", back_populates="tags")
    parent = relationship("Tag", remote_side="Tag.id", back_populates="children")
    # NOTE(review): no delete cascade is configured here, so deleting a parent
    # through the ORM nulls ``parent_id`` on loaded children, whereas the
    # foreign key itself declares ON DELETE CASCADE. Confirm which is intended.
    children = relationship("Tag", back_populates="parent")
    # ``ChunkTag.tag_id`` is NOT NULL. Without a delete cascade the ORM would
    # try to set it to NULL when a tag is deleted and the flush would fail, so
    # association rows are deleted with the tag (same as ``Chunk.chunk_tags``).
    chunk_tags = relationship(
        "ChunkTag",
        back_populates="tag",
        cascade="all, delete-orphan",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ChunkTag(Base, UUIDMixin):
    """Association row linking one chunk to one tag (``chunk_tags`` table).

    Implements the many-to-many relationship between chunks and tags; both
    foreign keys are required.
    """

    __tablename__ = "chunk_tags"

    chunk_id = Column(
        UUID(as_uuid=True),
        ForeignKey("chunks.id", ondelete="CASCADE"),
        nullable=False,
    )
    tag_id = Column(
        UUID(as_uuid=True),
        ForeignKey("tags.id", ondelete="CASCADE"),
        nullable=False,
    )

    # Relationships
    chunk = relationship("Chunk", back_populates="chunk_tags")
    tag = relationship("Tag", back_populates="chunk_tags")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Question(Base, UUIDMixin, TimestampMixin):
    """Question / QA pair (``questions`` table).

    A question always belongs to a project; the link to a chunk is optional
    because ``chunk_id`` is nullable.
    """

    __tablename__ = "questions"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    chunk_id = Column(
        UUID(as_uuid=True),
        ForeignKey("chunks.id", ondelete="CASCADE"),
    )
    # Question text
    content = Column(Text, nullable=False)
    # Answer text
    answer = Column(Text)
    # Category such as fact, summary, reasoning, etc.
    question_type = Column(String(50))
    # Origin of the question: manual, generated
    source = Column(String(50), default="manual")

    # Relationships
    # One-directional: no ``back_populates`` is declared for this link.
    project = relationship("Project")
    chunk = relationship("Chunk", back_populates="questions")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Dataset(Base, UUIDMixin, TimestampMixin):
    """Dataset belonging to a project (``datasets`` table)."""

    __tablename__ = "datasets"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    name = Column(String(255), nullable=False)
    description = Column(Text)
    # Dataset flavour: qa, conversation, instruction
    dataset_type = Column(String(50))
    # Free-form JSON payload
    extra_data = Column(JSON)

    # Relationships
    project = relationship("Project", back_populates="datasets")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EvalDataset(Base, UUIDMixin, TimestampMixin):
    """Evaluation dataset belonging to a project (``eval_datasets`` table)."""

    __tablename__ = "eval_datasets"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    name = Column(String(255), nullable=False)
    # Question category: mixed, fact, reasoning
    question_type = Column(String(50))
    # Free-form JSON payload
    extra_data = Column(JSON)

    # Relationships
    project = relationship("Project", back_populates="eval_datasets")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ModelConfig(Base, UUIDMixin, TimestampMixin):
    """LLM provider configuration (``model_configs`` table).

    ``project_id`` is explicitly nullable, so a configuration does not have
    to be attached to a project.
    """

    __tablename__ = "model_configs"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=True,
    )
    # Provider identifier: minimax, glm, openai
    provider = Column(String(50), nullable=False)
    model_name = Column(String(100))
    # NOTE(review): the key is kept in a plain string column.
    api_key = Column(String(500))
    api_base = Column(String(500))
    # Flag stored as text rather than a boolean; the default is "false".
    is_default = Column(String(10), default="false")
    # Connectivity state: untested, connected, disconnected
    connection_status = Column(String(20), default="untested")

    # Relationships
    project = relationship("Project", back_populates="model_configs")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Task(Base, UUIDMixin, TimestampMixin):
    """Background job record (``tasks`` table)."""

    __tablename__ = "tasks"

    project_id = Column(
        UUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
    )
    # Job kind: split, generate, eval, export
    task_type = Column(String(50))
    # Lifecycle state: pending, running, completed, failed
    status = Column(String(20), default="pending")
    # Completion percentage, 0-100
    progress = Column(Integer, default=0)
    # JSON payload holding the task result
    result = Column(JSON)
    # Error text
    error = Column(Text)

    # Relationships
    project = relationship("Project", back_populates="tasks")
|