Add MinerU document ingestion support

Normalize uploaded documents into structured Markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces the deprecated `datetime.utcnow` calls
in the touched backend paths with a shared `utc_now()` helper that returns
`datetime.now(UTC)`, so the knowledge pipeline stays free of deprecation
warnings.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions

View File

@@ -1,12 +1,16 @@
import uuid
from datetime import datetime
from datetime import UTC, datetime
from sqlalchemy import Column, String, DateTime
from app.database import Base
def utc_now() -> datetime:
return datetime.now(UTC)
class BaseModel(Base):
__abstract__ = True
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
created_at = Column(DateTime, default=utc_now, nullable=False)
updated_at = Column(DateTime, default=utc_now, onupdate=utc_now, nullable=False)

View File

@@ -1,4 +1,4 @@
from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean
from sqlalchemy import Column, String, Integer, Text, ForeignKey, Boolean, DateTime
from sqlalchemy.orm import relationship
from app.models.base import BaseModel
@@ -16,6 +16,13 @@ class Document(BaseModel):
summary = Column(Text, nullable=True)
chunk_count = Column(Integer, default=0)
is_indexed = Column(Boolean, default=False)
ingestion_status = Column(String(50), default="uploaded", nullable=False)
ingestion_error = Column(Text, nullable=True)
indexed_at = Column(DateTime, nullable=True)
parser_version = Column(String(50), nullable=True)
index_version = Column(String(50), nullable=True)
normalized_content = Column(Text, nullable=True)
normalized_format = Column(String(50), nullable=True)
chunks = relationship("DocumentChunk", back_populates="document", cascade="all, delete-orphan")

View File

@@ -1,6 +1,5 @@
from sqlalchemy import Column, String, Text, Integer, ForeignKey, Boolean, DateTime, Enum as SQLEnum
from datetime import datetime
from app.models.base import BaseModel
from app.models.base import BaseModel, utc_now
class MemorySummary(BaseModel):
@@ -14,7 +13,7 @@ class MemorySummary(BaseModel):
conversation_id = Column(String(36), ForeignKey("conversations.id"), nullable=False, index=True)
summary_text = Column(Text, nullable=False) # 摘要内容
turn_count = Column(Integer, default=0) # 摘要时累计轮数
summary_at = Column(DateTime, default=datetime.utcnow, nullable=False)
summary_at = Column(DateTime, default=utc_now, nullable=False)
class UserMemory(BaseModel):
@@ -31,5 +30,5 @@ class UserMemory(BaseModel):
is_recalled = Column(Boolean, default=False) # 是否在当前对话中被召回
recall_count = Column(Integer, default=0) # 被召回次数
source_conversation_id = Column(String(36), nullable=True) # 来源对话
extracted_at = Column(DateTime, default=datetime.utcnow, nullable=False)
extracted_at = Column(DateTime, default=utc_now, nullable=False)
last_recalled_at = Column(DateTime, nullable=True)