Add MinerU document ingestion support

Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions
--- a/backend/app/models/base.py
+++ b/backend/app/models/base.py
@@ -1,12 +1,16 @@
 import uuid
-from datetime import datetime
+from datetime import UTC, datetime
 from sqlalchemy import Column, String, DateTime
 from app.database import Base


+def utc_now() -> datetime:
+    return datetime.now(UTC)
+
+
 class BaseModel(Base):
    __abstract__ = True

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    created_at = Column(DateTime, default=utc_now, nullable=False)
+    updated_at = Column(DateTime, default=utc_now, onupdate=utc_now, nullable=False)
--- a/backend/app/models/document.py
+++ b/backend/app/models/document.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean
+from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean, DateTime
 from sqlalchemy.orm import relationship
 from app.models.base import BaseModel

@@ -16,6 +16,13 @@ class Document(BaseModel):
    summary = Column(Text, nullable=True)
    chunk_count = Column(Integer, default=0)
    is_indexed = Column(Boolean, default=False)
+    ingestion_status = Column(String(50), default="uploaded", nullable=False)
+    ingestion_error = Column(Text, nullable=True)
+    indexed_at = Column(DateTime, nullable=True)
+    parser_version = Column(String(50), nullable=True)
+    index_version = Column(String(50), nullable=True)
+    normalized_content = Column(Text, nullable=True)
+    normalized_format = Column(String(50), nullable=True)

    chunks = relationship("DocumentChunk", back_populates="document", cascade="all, delete-orphan")

--- a/backend/app/models/memory.py
+++ b/backend/app/models/memory.py
@@ -1,6 +1,5 @@
 from sqlalchemy import Column, String, Text, Integer, ForeignKey, Boolean, DateTime, Enum as SQLEnum
-from datetime import datetime
-from app.models.base import BaseModel
+from app.models.base import BaseModel, utc_now


 class MemorySummary(BaseModel):
@@ -14,7 +13,7 @@ class MemorySummary(BaseModel):
    conversation_id = Column(String(36), ForeignKey("conversations.id"), nullable=False, index=True)
    summary_text = Column(Text, nullable=False)           # 摘要内容
    turn_count = Column(Integer, default=0)                # 摘要时累计轮数
-    summary_at = Column(DateTime, default=datetime.utcnow, nullable=False)
+    summary_at = Column(DateTime, default=utc_now, nullable=False)


 class UserMemory(BaseModel):
@@ -31,5 +30,5 @@ class UserMemory(BaseModel):
    is_recalled = Column(Boolean, default=False)           # 是否在当前对话中被召回
    recall_count = Column(Integer, default=0)            # 被召回次数
    source_conversation_id = Column(String(36), nullable=True)  # 来源对话
-    extracted_at = Column(DateTime, default=datetime.utcnow, nullable=False)
+    extracted_at = Column(DateTime, default=utc_now, nullable=False)
    last_recalled_at = Column(DateTime, nullable=True)