Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
96 lines
3.1 KiB
Python
96 lines
3.1 KiB
Python
import os
from collections.abc import AsyncIterator

from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
from sqlalchemy.orm import DeclarativeBase

from app.config import settings
|
|
|
|
# Create the configured data directory up front; this runs at import time,
# before the engine below is constructed.
os.makedirs(settings.DATA_DIR, exist_ok=True)

# Shared async engine. SQL echoing follows the DEBUG setting, and
# pool_pre_ping makes the pool check a connection for liveness on checkout.
engine = create_async_engine(settings.DATABASE_URL, echo=settings.DEBUG, pool_pre_ping=True)

# Factory for AsyncSession objects bound to the engine above.
# expire_on_commit=False keeps loaded attributes on ORM objects after commit.
async_session = async_sessionmaker(bind=engine, class_=AsyncSession, expire_on_commit=False)
|
|
|
|
|
|
class Base(DeclarativeBase):
    """Declarative base class; ``Base.metadata`` is what ``init_db`` creates tables from."""
|
|
|
|
|
|
async def get_db() -> AsyncIterator[AsyncSession]:
    """Yield one ``AsyncSession`` and close it when the consumer is done.

    This is an async generator (it contains ``yield``), so its return type is
    an async iterator of sessions rather than a session itself.
    NOTE(review): presumably consumed as a framework dependency or via
    ``async for`` -- confirm against callers.

    Yields:
        A session produced by the module-level ``async_session`` factory.
    """
    # The ``async with`` block closes the session on exit, including when the
    # consumer raises, so no additional explicit ``close()`` is required.
    async with async_session() as session:
        yield session
|
|
|
|
|
|
async def init_db():
    """Create all tables known to ``Base.metadata``, then add late-introduced columns.

    Everything runs on a single connection inside one ``engine.begin()``
    block: first ``create_all``, then the per-table column checks for logs,
    messages and documents, in that order.
    """
    async with engine.begin() as connection:
        # create_all is a synchronous API, so it is dispatched via run_sync.
        await connection.run_sync(Base.metadata.create_all)

        for ensure_columns in (
            ensure_log_columns,
            ensure_message_columns,
            ensure_document_columns,
        ):
            await ensure_columns(connection)
|
|
|
|
|
|
async def ensure_log_columns(conn):
    """Add any columns that are missing from the existing ``logs`` table.

    The current columns are read with ``PRAGMA table_info(logs)``; for every
    required column that is absent, the matching ``ALTER TABLE ... ADD COLUMN``
    statement is executed on ``conn``. Returns without doing anything when the
    pragma yields no rows.

    Args:
        conn: Async connection exposing an awaitable ``execute``.
    """
    table_info = (await conn.execute(text("PRAGMA table_info(logs)"))).fetchall()
    if not table_info:
        return

    # Index 1 of each PRAGMA table_info row holds the column name.
    existing = {info[1] for info in table_info}
    migrations = {
        "request_id": "ALTER TABLE logs ADD COLUMN request_id VARCHAR(64)",
        "route": "ALTER TABLE logs ADD COLUMN route VARCHAR(255)",
        "method": "ALTER TABLE logs ADD COLUMN method VARCHAR(16)",
        "status_code": "ALTER TABLE logs ADD COLUMN status_code INTEGER",
        "error_type": "ALTER TABLE logs ADD COLUMN error_type VARCHAR(100)",
        "operation": "ALTER TABLE logs ADD COLUMN operation VARCHAR(100)",
    }

    # Keep declaration order so columns are added in a stable sequence.
    pending = [ddl for name, ddl in migrations.items() if name not in existing]
    for statement in pending:
        await conn.execute(text(statement))
|
|
|
|
|
|
async def ensure_message_columns(conn):
    """Add any columns that are missing from the existing ``messages`` table.

    The current columns are read with ``PRAGMA table_info(messages)``; for
    every required column that is absent, the matching
    ``ALTER TABLE ... ADD COLUMN`` statement is executed on ``conn``. Returns
    without doing anything when the pragma yields no rows.

    Args:
        conn: Async connection exposing an awaitable ``execute``.
    """
    table_info = (await conn.execute(text("PRAGMA table_info(messages)"))).fetchall()
    if not table_info:
        return

    # Index 1 of each PRAGMA table_info row holds the column name.
    existing = {info[1] for info in table_info}
    migrations = {
        "attachments": "ALTER TABLE messages ADD COLUMN attachments JSON",
    }

    pending = [ddl for name, ddl in migrations.items() if name not in existing]
    for statement in pending:
        await conn.execute(text(statement))
|
|
|
|
|
|
async def ensure_document_columns(conn):
    """Add any columns that are missing from the existing ``documents`` table.

    The current columns are read with ``PRAGMA table_info(documents)``; for
    every required column that is absent, the matching
    ``ALTER TABLE ... ADD COLUMN`` statement is executed on ``conn``. Returns
    without doing anything when the pragma yields no rows.

    Args:
        conn: Async connection exposing an awaitable ``execute``.
    """
    table_info = (await conn.execute(text("PRAGMA table_info(documents)"))).fetchall()
    if not table_info:
        return

    # Index 1 of each PRAGMA table_info row holds the column name.
    existing = {info[1] for info in table_info}
    migrations = {
        "ingestion_status": "ALTER TABLE documents ADD COLUMN ingestion_status VARCHAR(50) DEFAULT 'uploaded' NOT NULL",
        "ingestion_error": "ALTER TABLE documents ADD COLUMN ingestion_error TEXT",
        "indexed_at": "ALTER TABLE documents ADD COLUMN indexed_at DATETIME",
        "parser_version": "ALTER TABLE documents ADD COLUMN parser_version VARCHAR(50)",
        "index_version": "ALTER TABLE documents ADD COLUMN index_version VARCHAR(50)",
        "normalized_content": "ALTER TABLE documents ADD COLUMN normalized_content TEXT",
        "normalized_format": "ALTER TABLE documents ADD COLUMN normalized_format VARCHAR(50)",
    }

    # Keep declaration order so columns are added in a stable sequence.
    pending = [ddl for name, ddl in migrations.items() if name not in existing]
    for statement in pending:
        await conn.execute(text(statement))
|