Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
41 lines
1.8 KiB
Python
41 lines
1.8 KiB
Python
from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean, DateTime
|
|
from sqlalchemy.orm import relationship
|
|
from app.models.base import BaseModel
|
|
|
|
|
|
class Document(BaseModel):
    """ORM model for an uploaded document, mapped to the ``documents`` table.

    Stores file metadata, ingestion/indexing state, and the normalized
    content of the document. The chunks derived from the document are
    reachable through the ``chunks`` relationship.
    """

    __tablename__ = "documents"

    # --- Ownership and file metadata -------------------------------------
    # Owning user; indexed and required.
    user_id = Column(
        String(36),
        ForeignKey("users.id"),
        index=True,
        nullable=False,
    )
    title = Column(String(500), nullable=False)
    filename = Column(String(500), nullable=False)
    # File type label: pdf, md, txt, docx
    file_type = Column(String(50), nullable=False)
    # NOTE(review): unit is presumably bytes -- confirm against the uploader.
    file_size = Column(Integer, nullable=False)
    file_path = Column(String(1000), nullable=False)
    # Newly added: optional folder the document is filed under.
    folder_id = Column(
        String(36),
        ForeignKey("folders.id"),
        nullable=True,
    )

    # --- Derived content and indexing state ------------------------------
    summary = Column(Text, nullable=True)
    chunk_count = Column(Integer, default=0)
    is_indexed = Column(Boolean, default=False)

    # --- Ingestion tracking ----------------------------------------------
    # New rows start in the "uploaded" state.
    ingestion_status = Column(
        String(50),
        nullable=False,
        default="uploaded",
    )
    ingestion_error = Column(Text, nullable=True)
    # Declared without timezone support (plain ``DateTime``).
    indexed_at = Column(DateTime, nullable=True)
    parser_version = Column(String(50), nullable=True)
    index_version = Column(String(50), nullable=True)

    # --- Normalized representation ---------------------------------------
    normalized_content = Column(Text, nullable=True)
    # Format label for ``normalized_content``.
    normalized_format = Column(String(50), nullable=True)

    # Chunks belonging to this document; "all, delete-orphan" makes the ORM
    # cascade operations to the chunks and delete chunks that are removed
    # from this collection.
    chunks = relationship(
        "DocumentChunk",
        cascade="all, delete-orphan",
        back_populates="document",
    )
|
|
|
|
|
|
class DocumentChunk(BaseModel):
    """ORM model for one chunk of a document, mapped to ``document_chunks``.

    Each row references its parent through ``document_id`` and exposes the
    parent object via the ``document`` relationship.
    """

    __tablename__ = "document_chunks"

    # Parent document; indexed and required.
    document_id = Column(
        String(36),
        ForeignKey("documents.id"),
        index=True,
        nullable=False,
    )
    # NOTE(review): presumably the chunk's position within its document --
    # confirm whether it is 0- or 1-based at the call sites.
    chunk_index = Column(Integer, nullable=False)
    content = Column(Text, nullable=False)
    # Metadata stored as JSON; the column is declared as String(2000).
    metadata_ = Column(String(2000), nullable=True)
    # NOTE(review): presumably the Chroma collection name and record id for
    # this chunk's vector entry -- confirm against the indexing code.
    chroma_collection = Column(String(255), nullable=True)
    chroma_id = Column(String(255), nullable=True)

    # Back-reference to the owning Document (paired with Document.chunks).
    document = relationship("Document", back_populates="chunks")
|