Add MinerU document ingestion support
Normalize uploaded documents into structured Markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces the deprecated `datetime.utcnow` column defaults in the touched backend paths with a shared `utc_now` helper so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,12 +1,16 @@
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from datetime import UTC, datetime
|
||||
from sqlalchemy import Column, String, DateTime
|
||||
from app.database import Base
|
||||
|
||||
|
||||
def utc_now() -> datetime:
|
||||
return datetime.now(UTC)
|
||||
|
||||
|
||||
class BaseModel(Base):
    """Abstract declarative base that gives every model an id and timestamps.

    ``__abstract__ = True`` means no table is created for this class itself;
    subclasses inherit the three columns below.
    """

    __abstract__ = True

    # Primary key: a freshly generated UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Timestamps are filled by utc_now rather than the deprecated
    # datetime.utcnow.
    # NOTE(review): utc_now returns an aware datetime while these columns are
    # plain DateTime -- confirm the target database/driver handles that.
    created_at = Column(DateTime, default=utc_now, nullable=False)
    updated_at = Column(DateTime, default=utc_now, onupdate=utc_now, nullable=False)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean
|
||||
from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean, DateTime
|
||||
from sqlalchemy.orm import relationship
|
||||
from app.models.base import BaseModel
|
||||
|
||||
@@ -16,6 +16,13 @@ class Document(BaseModel):
|
||||
summary = Column(Text, nullable=True)
|
||||
chunk_count = Column(Integer, default=0)
|
||||
is_indexed = Column(Boolean, default=False)
|
||||
ingestion_status = Column(String(50), default="uploaded", nullable=False)
|
||||
ingestion_error = Column(Text, nullable=True)
|
||||
indexed_at = Column(DateTime, nullable=True)
|
||||
parser_version = Column(String(50), nullable=True)
|
||||
index_version = Column(String(50), nullable=True)
|
||||
normalized_content = Column(Text, nullable=True)
|
||||
normalized_format = Column(String(50), nullable=True)
|
||||
|
||||
chunks = relationship("DocumentChunk", back_populates="document", cascade="all, delete-orphan")
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from sqlalchemy import Column, String, Text, Integer, ForeignKey, Boolean, DateTime, Enum as SQLEnum
|
||||
from datetime import datetime
|
||||
from app.models.base import BaseModel
|
||||
from app.models.base import BaseModel, utc_now
|
||||
|
||||
|
||||
class MemorySummary(BaseModel):
|
||||
@@ -14,7 +13,7 @@ class MemorySummary(BaseModel):
|
||||
conversation_id = Column(String(36), ForeignKey("conversations.id"), nullable=False, index=True)
|
||||
summary_text = Column(Text, nullable=False) # 摘要内容
|
||||
turn_count = Column(Integer, default=0) # 摘要时累计轮数
|
||||
summary_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
summary_at = Column(DateTime, default=utc_now, nullable=False)
|
||||
|
||||
|
||||
class UserMemory(BaseModel):
|
||||
@@ -31,5 +30,5 @@ class UserMemory(BaseModel):
|
||||
is_recalled = Column(Boolean, default=False) # 是否在当前对话中被召回
|
||||
recall_count = Column(Integer, default=0) # 被召回次数
|
||||
source_conversation_id = Column(String(36), nullable=True) # 来源对话
|
||||
extracted_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
extracted_at = Column(DateTime, default=utc_now, nullable=False)
|
||||
last_recalled_at = Column(DateTime, nullable=True)
|
||||
|
||||
Reference in New Issue
Block a user