Add MinerU document ingestion support

Normalize uploaded documents into structured markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces deprecated UTC timestamp helpers in
the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions

View File

@@ -1,3 +1,4 @@
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
from sqlalchemy.orm import DeclarativeBase
from app.config import settings
@@ -33,3 +34,62 @@ async def get_db() -> AsyncSession:
async def init_db():
    """Create the declared tables, then run the column back-fill helpers.

    Everything happens on a single connection obtained from
    ``engine.begin()``: first ``Base.metadata.create_all`` is run through
    ``run_sync``, then each ``ensure_*_columns`` helper is awaited in turn
    (logs, messages, documents).
    """
    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)
        # Order matches the original call sequence: logs, messages, documents.
        for ensure_columns in (
            ensure_log_columns,
            ensure_message_columns,
            ensure_document_columns,
        ):
            await ensure_columns(connection)
async def ensure_log_columns(conn):
    """Add any expected columns that the ``logs`` table is missing.

    The existing columns are read with ``PRAGMA table_info(logs)``; the
    second field of each returned row is treated as the column name. When
    the pragma returns no rows the function exits without issuing any DDL.

    Args:
        conn: Connection object exposing an awaitable ``execute`` method.
    """
    table_info = await conn.execute(text("PRAGMA table_info(logs)"))
    present = {info[1] for info in table_info.fetchall()}
    if not present:
        # Nothing was reported for this table, so there is nothing to patch.
        return

    # (column name, DDL statement) pairs, applied in the order listed.
    additions = (
        ("request_id", "ALTER TABLE logs ADD COLUMN request_id VARCHAR(64)"),
        ("route", "ALTER TABLE logs ADD COLUMN route VARCHAR(255)"),
        ("method", "ALTER TABLE logs ADD COLUMN method VARCHAR(16)"),
        ("status_code", "ALTER TABLE logs ADD COLUMN status_code INTEGER"),
        ("error_type", "ALTER TABLE logs ADD COLUMN error_type VARCHAR(100)"),
        ("operation", "ALTER TABLE logs ADD COLUMN operation VARCHAR(100)"),
    )
    pending = [statement for name, statement in additions if name not in present]
    for statement in pending:
        await conn.execute(text(statement))
async def ensure_message_columns(conn):
    """Add the ``attachments`` column to ``messages`` when it is missing.

    The existing columns are read with ``PRAGMA table_info(messages)``; the
    second field of each returned row is treated as the column name. When
    the pragma returns no rows the function exits without issuing any DDL.

    Args:
        conn: Connection object exposing an awaitable ``execute`` method.
    """
    table_info = await conn.execute(text("PRAGMA table_info(messages)"))
    present = {info[1] for info in table_info.fetchall()}
    if not present:
        # Nothing was reported for this table, so there is nothing to patch.
        return

    # (column name, DDL statement) pairs, applied in the order listed.
    additions = (
        ("attachments", "ALTER TABLE messages ADD COLUMN attachments JSON"),
    )
    pending = [statement for name, statement in additions if name not in present]
    for statement in pending:
        await conn.execute(text(statement))
async def ensure_document_columns(conn):
    """Add any expected ingestion columns that ``documents`` is missing.

    The existing columns are read with ``PRAGMA table_info(documents)``; the
    second field of each returned row is treated as the column name. When
    the pragma returns no rows the function exits without issuing any DDL.

    Args:
        conn: Connection object exposing an awaitable ``execute`` method.
    """
    table_info = await conn.execute(text("PRAGMA table_info(documents)"))
    present = {info[1] for info in table_info.fetchall()}
    if not present:
        # Nothing was reported for this table, so there is nothing to patch.
        return

    # (column name, DDL statement) pairs, applied in the order listed.
    additions = (
        ("ingestion_status", "ALTER TABLE documents ADD COLUMN ingestion_status VARCHAR(50) DEFAULT 'uploaded' NOT NULL"),
        ("ingestion_error", "ALTER TABLE documents ADD COLUMN ingestion_error TEXT"),
        ("indexed_at", "ALTER TABLE documents ADD COLUMN indexed_at DATETIME"),
        ("parser_version", "ALTER TABLE documents ADD COLUMN parser_version VARCHAR(50)"),
        ("index_version", "ALTER TABLE documents ADD COLUMN index_version VARCHAR(50)"),
        ("normalized_content", "ALTER TABLE documents ADD COLUMN normalized_content TEXT"),
        ("normalized_format", "ALTER TABLE documents ADD COLUMN normalized_format VARCHAR(50)"),
    )
    pending = [statement for name, statement in additions if name not in present]
    for statement in pending:
        await conn.execute(text(statement))