Add MinerU document ingestion support

Normalize uploaded documents into structured markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces deprecated UTC timestamp helpers in
the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions

View File

@@ -0,0 +1,234 @@
import json
from io import BytesIO
import pytest
from httpx import ASGITransport, AsyncClient
from sqlalchemy import select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
import app.models # noqa: F401
from app.database import Base, get_db
from app.main import app
from app.models.document import Document, DocumentChunk
from app.models.user import User
from app.routers.auth import get_current_user
from app.services.auth_service import get_password_hash
from app.services.document_service import DocumentService
from starlette.datastructures import UploadFile
@pytest.fixture
async def document_router_env(tmp_path):
    """Provide a seeded SQLite database and dependency overrides for router tests.

    Creates a throwaway SQLite file under ``tmp_path``, builds the schema from
    ``Base.metadata`` and seeds one user, one document (``doc-1``) and two
    chunks (``chunk-1`` and ``chunk-2``). While the fixture is active, the
    ``get_db`` dependency yields sessions bound to that database and
    ``get_current_user`` returns the seeded user.

    On teardown the dependency overrides that existed before this fixture ran
    are restored and the engine is disposed, even when setup fails part-way.
    """
    db_path = tmp_path / 'test_documents_router.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)
    # Snapshot first so teardown can put back overrides owned by other fixtures
    # instead of wiping the whole mapping.
    previous_overrides = dict(app.dependency_overrides)
    try:
        session_factory = async_sessionmaker(engine, expire_on_commit=False)
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        async with session_factory() as session:
            user = User(
                email='docs@example.com',
                hashed_password=get_password_hash('secret123'),
                full_name='Docs Tester',
            )
            session.add(user)
            # Flush so user.id is available for the document row below.
            await session.flush()
            document = Document(
                id='doc-1',
                user_id=user.id,
                title='Uploaded spec',
                filename='uploaded-spec.docx',
                file_type='docx',
                file_size=2048,
                file_path=str(tmp_path / 'uploaded-spec.docx'),
                summary='spec summary',
                chunk_count=3,
                is_indexed=True,
                ingestion_status='ready',
                normalized_content='# Uploaded spec\n\nnormalized body',
                normalized_format='structured_markdown',
            )
            session.add(document)
            await session.flush()
            session.add_all([
                DocumentChunk(
                    id='chunk-1',
                    document_id=document.id,
                    chunk_index=0,
                    content='original chunk content',
                    metadata_=json.dumps({'content_type': 'paragraph', 'section_title': 'Intro'}),
                ),
                DocumentChunk(
                    id='chunk-2',
                    document_id=document.id,
                    chunk_index=1,
                    content='second chunk content',
                    metadata_=json.dumps({'content_type': 'paragraph', 'section_title': 'Details'}),
                ),
            ])
            await session.commit()
            await session.refresh(user)

        async def override_get_db():
            # Each request gets its own session on the temporary database.
            async with session_factory() as session:
                yield session

        async def override_get_current_user():
            return user

        app.dependency_overrides[get_db] = override_get_db
        app.dependency_overrides[get_current_user] = override_get_current_user
        yield
    finally:
        app.dependency_overrides.clear()
        app.dependency_overrides.update(previous_overrides)
        await engine.dispose()
@pytest.mark.asyncio
async def test_list_documents_returns_serializable_document_payload(document_router_env):
    """GET /api/documents responds with a one-item JSON list for the seeded document."""
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        listing = await http.get('/api/documents')

    assert listing.status_code == 200
    documents = listing.json()
    assert len(documents) == 1
    first = documents[0]
    assert first['title'] == 'Uploaded spec'
    assert first['ingestion_status'] == 'ready'
@pytest.mark.asyncio
async def test_get_document_chunks_returns_serializable_chunk_payload(document_router_env):
    """GET /api/documents/doc-1/chunks returns chunk-1 then chunk-2 with their content."""
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.get('/api/documents/doc-1/chunks')

    assert reply.status_code == 200
    chunk_items = reply.json()
    returned_ids = [entry['id'] for entry in chunk_items]
    assert returned_ids == ['chunk-1', 'chunk-2']
    assert chunk_items[0]['content'] == 'original chunk content'
@pytest.mark.asyncio
async def test_update_document_chunk_persists_content_and_reindexes_existing_chunks(document_router_env):
    """PUT on a chunk echoes the edited content and stores it in the database.

    After the request, the chunk row holds the new content and the parent
    document is still ``ready`` with ``indexed_at`` populated.
    """
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.put(
            '/api/documents/doc-1/chunks/chunk-1',
            json={'content': 'edited chunk content'},
        )

    assert reply.status_code == 200
    body = reply.json()
    assert body['id'] == 'chunk-1'
    assert body['content'] == 'edited chunk content'

    # Re-read through the overridden get_db dependency to inspect what was stored.
    db_dependency = app.dependency_overrides[get_db]
    async for db in db_dependency():
        chunk_query = select(DocumentChunk).where(DocumentChunk.id == 'chunk-1')
        stored_chunk = (await db.execute(chunk_query)).scalar_one()
        document_query = select(Document).where(Document.id == 'doc-1')
        stored_document = (await db.execute(document_query)).scalar_one()

        assert stored_chunk.content == 'edited chunk content'
        assert stored_document.ingestion_status == 'ready'
        assert stored_document.indexed_at is not None
@pytest.mark.asyncio
async def test_get_document_content_prefers_normalized_content(document_router_env):
    """The content endpoint returns the document's stored normalized markdown."""
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.get('/api/documents/doc-1/content')

    assert reply.status_code == 200
    expected_body = {'content': '# Uploaded spec\n\nnormalized body'}
    assert reply.json() == expected_body
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_unsupported_file_type(document_router_env):
    """Uploading a ``.exe`` file is rejected with HTTP 400 and an unsupported-type detail."""
    multipart = {'file': ('payload.exe', BytesIO(b'bad'), 'application/octet-stream')}
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.post('/api/documents/upload', files=multipart)

    assert reply.status_code == 400
    detail = reply.json()['detail']
    assert detail == '不支持的文件类型: .exe'
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_parser_dependency(document_router_env, monkeypatch):
    """A ValueError from ``DocumentService._parse_document`` becomes an HTTP 400 with its message."""

    async def raise_missing_dependency(self, file_path: str, ext: str):
        # Stand-in parser that always reports the python-docx dependency as missing.
        raise ValueError('DOCX 解析依赖缺失: python-docx')

    monkeypatch.setattr(DocumentService, '_parse_document', raise_missing_dependency)

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    multipart = {'file': ('payload.docx', BytesIO(b'bad'), docx_mime)}
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.post('/api/documents/upload', files=multipart)

    assert reply.status_code == 400
    detail = reply.json()['detail']
    assert detail == 'DOCX 解析依赖缺失: python-docx'
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_mineru_dependency(document_router_env, monkeypatch):
    """A missing-MinerU ValueError from the parser is surfaced as an HTTP 400 detail."""

    async def raise_missing_mineru(self, file_path: str, ext: str):
        # Stand-in parser that always reports the mineru dependency as missing.
        raise ValueError('PDF 解析依赖缺失: mineru')

    monkeypatch.setattr(DocumentService, '_parse_document', raise_missing_mineru)

    multipart = {'file': ('payload.pdf', BytesIO(b'%PDF-1.4 bad'), 'application/pdf')}
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.post('/api/documents/upload', files=multipart)

    assert reply.status_code == 400
    detail = reply.json()['detail']
    assert detail == 'PDF 解析依赖缺失: mineru'
@pytest.mark.asyncio
async def test_upload_document_returns_success_payload_for_pdf(document_router_env, monkeypatch):
    """A successful PDF upload answers 201 with id, title, chunk count and a status message.

    ``DocumentService.upload_document`` is replaced with a stub, so only the
    router's response shaping is exercised.
    """

    async def fake_upload_document(self, user_id: str, file, folder_id=None):
        # Return an unsaved Document carrying the values the router reports back.
        stub_document = Document(
            id='pdf-doc-1',
            user_id=user_id,
            title='PDF Spec',
            filename='payload.pdf',
            file_type='pdf',
            file_size=2048,
            file_path='fake/path/payload.pdf',
            chunk_count=4,
            ingestion_status='uploaded',
            normalized_content='# PDF Spec\n\nBody',
            normalized_format='structured_markdown',
        )
        return stub_document

    monkeypatch.setattr(DocumentService, 'upload_document', fake_upload_document)

    multipart = {'file': ('payload.pdf', BytesIO(b'%PDF-1.4 fake'), 'application/pdf')}
    asgi = ASGITransport(app=app)
    async with AsyncClient(transport=asgi, base_url='http://testserver') as http:
        reply = await http.post('/api/documents/upload', files=multipart)

    assert reply.status_code == 201
    expected_body = {
        'id': 'pdf-doc-1',
        'title': 'PDF Spec',
        'chunk_count': 4,
        'status': '上传成功,正在索引...',
    }
    assert reply.json() == expected_body

View File

@@ -0,0 +1,371 @@
import json
from io import BytesIO
import builtins
import sys
import types
import pytest
from docx import Document as DocxDocument
from openpyxl import Workbook
from sqlalchemy import select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from starlette.datastructures import UploadFile
import app.models # noqa: F401
from app.database import Base
from app.models.document import Document, DocumentChunk
from app.models.user import User
from app.services.auth_service import get_password_hash
from app.services.document_service import DocumentService
@pytest.fixture
async def document_test_env(tmp_path, monkeypatch):
    """Yield ``(session, user)`` backed by a temporary SQLite database.

    Builds the schema from ``Base.metadata`` in a database file under
    ``tmp_path``, stores one user, and patches the document service settings so
    uploads go to ``tmp_path / 'uploads'`` with ``CHUNK_SIZE`` 120 and
    ``CHUNK_OVERLAP`` 20. The engine is disposed in a ``finally`` clause so it
    is released even if setup or session close raises.
    """
    db_path = tmp_path / 'test_documents.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)
    try:
        session_factory = async_sessionmaker(engine, expire_on_commit=False)
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        async with session_factory() as session:
            user = User(
                email='doc-tester@example.com',
                hashed_password=get_password_hash('secret123'),
                full_name='Doc Tester',
            )
            session.add(user)
            await session.commit()
            await session.refresh(user)

        monkeypatch.setattr('app.services.document_service.settings.UPLOAD_DIR', str(tmp_path / 'uploads'))
        monkeypatch.setattr('app.services.document_service.settings.CHUNK_SIZE', 120)
        monkeypatch.setattr('app.services.document_service.settings.CHUNK_OVERLAP', 20)

        # A fresh session for the test itself, separate from the seeding session.
        async with session_factory() as session:
            yield session, user
    finally:
        await engine.dispose()
@pytest.mark.asyncio
async def test_upload_document_creates_schema_and_row_chunks_for_csv(document_test_env):
    """A CSV upload produces a leading ``table_schema`` chunk plus ``table_rows`` chunks."""
    db, owner = document_test_env
    service = DocumentService(db)
    csv_lines = [
        'region,month,revenue',
        'East,2025-01,100',
        'West,2025-01,200',
        'East,2025-02,150',
        'West,2025-02,250',
    ]
    csv_bytes = '\n'.join(csv_lines).encode('utf-8')
    csv_upload = UploadFile(filename='sales.csv', file=BytesIO(csv_bytes))

    document = await service.upload_document(owner.id, csv_upload)

    assert document.file_type == 'csv'
    assert document.ingestion_status == 'uploaded'
    assert document.parser_version == 'v2'
    assert document.index_version == 'v2'
    assert document.chunk_count >= 2

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    stored_chunks = list(rows.scalars().all())
    chunk_meta = [json.loads(stored.metadata_) for stored in stored_chunks]

    schema_meta = chunk_meta[0]
    assert schema_meta['content_type'] == 'table_schema'
    assert schema_meta['headers'] == ['region', 'month', 'revenue']
    assert any(meta['content_type'] == 'table_rows' for meta in chunk_meta)
    assert any('region=East' in stored.content for stored in stored_chunks)
@pytest.mark.asyncio
async def test_upload_document_creates_sheet_metadata_chunks_for_xlsx(document_test_env):
    """An XLSX upload yields chunks tagged with each sheet name and both table content types."""
    db, owner = document_test_env
    service = DocumentService(db)

    book = Workbook()
    revenue_sheet = book.active
    revenue_sheet.title = 'Revenue'
    for revenue_row in (
        ['region', 'quarter', 'amount'],
        ['East', 'Q1', 300],
        ['West', 'Q1', 280],
    ):
        revenue_sheet.append(revenue_row)
    detail_sheet = book.create_sheet('Detail')
    for detail_row in (['project', 'owner'], ['Jarvis', 'Ops']):
        detail_sheet.append(detail_row)

    # Serialize the workbook in memory and rewind so the upload reads it from the start.
    buffer = BytesIO()
    book.save(buffer)
    buffer.seek(0)
    xlsx_upload = UploadFile(filename='report.xlsx', file=buffer)

    document = await service.upload_document(owner.id, xlsx_upload)

    assert document.file_type == 'xlsx'
    assert document.chunk_count >= 3

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    stored_chunks = list(rows.scalars().all())
    chunk_meta = [json.loads(stored.metadata_) for stored in stored_chunks]

    assert any(meta['sheet_name'] == 'Revenue' for meta in chunk_meta)
    assert any(meta['sheet_name'] == 'Detail' for meta in chunk_meta)
    assert any(meta['content_type'] == 'table_schema' for meta in chunk_meta)
    assert any(meta['content_type'] == 'table_rows' for meta in chunk_meta)
@pytest.mark.asyncio
async def test_upload_document_preserves_section_metadata_for_markdown(document_test_env):
    """A Markdown upload records heading chunks and section path, level and key metadata."""
    db, owner = document_test_env
    service = DocumentService(db)
    markdown_lines = [
        '# Overview',
        'Jarvis overview paragraph.',
        '',
        '## Retrieval',
        'Hybrid retrieval favors relevant chunks.',
    ]
    markdown_bytes = '\n'.join(markdown_lines).encode('utf-8')
    md_upload = UploadFile(filename='guide.md', file=BytesIO(markdown_bytes))

    document = await service.upload_document(owner.id, md_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    stored_chunks = list(rows.scalars().all())
    chunk_meta = [json.loads(stored.metadata_) for stored in stored_chunks]

    assert any(meta['content_type'] == 'heading' for meta in chunk_meta)
    assert any(
        meta['section_path'] == ['Overview', 'Retrieval']
        for meta in chunk_meta
        if meta['content_type'] != 'heading'
    )
    assert any(meta.get('section_title') == 'Retrieval' for meta in chunk_meta)

    overview_heading = next(
        meta for meta in chunk_meta
        if meta['content_type'] == 'heading' and meta['section_title'] == 'Overview'
    )
    retrieval_paragraph = next(
        meta for meta in chunk_meta
        if meta['content_type'] == 'paragraph' and meta['section_title'] == 'Retrieval'
    )

    # Top-level heading: level 1, no parent, keyed by its own title.
    assert overview_heading['chunk_level'] == 1
    assert overview_heading['parent_key'] is None
    assert overview_heading['block_key'] == 'Overview'
    # Paragraph under the nested heading: level 2, parented to "Overview".
    assert retrieval_paragraph['chunk_level'] == 2
    assert retrieval_paragraph['parent_key'] == 'Overview'
    assert retrieval_paragraph['block_key'] == 'Overview/Retrieval'
@pytest.mark.asyncio
async def test_upload_document_rejects_unsupported_extension(document_test_env):
    """Uploading a ``.exe`` file raises ``ValueError`` with the unsupported-type message."""
    db, owner = document_test_env
    exe_upload = UploadFile(filename='malware.exe', file=BytesIO(b'bad'))
    service = DocumentService(db)

    with pytest.raises(ValueError, match='不支持的文件类型'):
        await service.upload_document(owner.id, exe_upload)
@pytest.mark.asyncio
async def test_upload_document_persists_structured_metadata_json(document_test_env):
    """A plain-text upload stores JSON chunk metadata and the normalized document fields."""
    db, owner = document_test_env
    service = DocumentService(db)
    text_body = 'title\n\nplain text body for metadata storage'
    txt_upload = UploadFile(filename='notes.txt', file=BytesIO(text_body.encode('utf-8')))

    document = await service.upload_document(owner.id, txt_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    first_chunk = rows.scalars().first()
    first_meta = json.loads(first_chunk.metadata_)

    assert first_meta['content_type'] == 'text'
    assert first_meta['parser_version'] == 'v2'
    assert first_meta['index_version'] == 'v2'
    assert first_meta['source_order'] == 0

    # Reload the document row to check what was actually persisted.
    document_query = select(Document).where(Document.id == document.id)
    persisted = (await db.execute(document_query)).scalar_one()

    assert persisted.ingestion_status == 'uploaded'
    assert persisted.normalized_format == 'structured_markdown'
    assert persisted.normalized_content == 'title\n\nplain text body for metadata storage'
@pytest.mark.asyncio
async def test_upload_document_extracts_docx_heading_and_table_structure(document_test_env):
    """A DOCX upload keeps heading hierarchy and table structure in metadata and markdown."""
    db, owner = document_test_env
    service = DocumentService(db)

    word_doc = DocxDocument()
    word_doc.add_heading('Architecture', level=1)
    word_doc.add_paragraph('System overview paragraph.')
    word_doc.add_heading('Retrieval', level=2)
    word_doc.add_paragraph('Section-aware retrieval paragraph.')
    grid = word_doc.add_table(rows=2, cols=2)
    cell_texts = (('metric', 'value'), ('latency', '120ms'))
    for row_index, row_texts in enumerate(cell_texts):
        for col_index, cell_text in enumerate(row_texts):
            grid.rows[row_index].cells[col_index].text = cell_text

    # Serialize the document in memory and rewind so the upload reads it from the start.
    buffer = BytesIO()
    word_doc.save(buffer)
    buffer.seek(0)
    docx_upload = UploadFile(filename='architecture.docx', file=buffer)

    document = await service.upload_document(owner.id, docx_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    stored_chunks = list(rows.scalars().all())
    chunk_meta = [json.loads(stored.metadata_) for stored in stored_chunks]

    retrieval_meta = next(
        meta for meta in chunk_meta
        if meta['section_title'] == 'Retrieval' and meta['content_type'] == 'paragraph'
    )
    schema_meta = next(meta for meta in chunk_meta if meta['content_type'] == 'table_schema')

    assert retrieval_meta['section_path'] == ['Architecture', 'Retrieval']
    assert schema_meta['headers'] == ['metric', 'value']
    assert any(meta['content_type'] == 'table_rows' for meta in chunk_meta)
    assert document.normalized_format == 'structured_markdown'
    normalized = document.normalized_content
    assert '# Architecture' in normalized
    assert '## Retrieval' in normalized
    assert '| metric | value |' in normalized
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_docx_dependency_is_missing(document_test_env, monkeypatch):
    """Uploading a ``.docx`` while ``docx`` cannot be imported raises a descriptive ValueError.

    The missing dependency is simulated by patching ``builtins.__import__`` so
    that importing ``docx`` or any of its submodules raises
    ``ModuleNotFoundError``; every other import is delegated to the real hook.
    """
    session, user = document_test_env
    service = DocumentService(session)
    original_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Compare the top-level package so dotted imports such as "docx.shared"
        # are blocked too; this test module has already loaded the real package.
        if name.partition('.')[0] == 'docx':
            raise ModuleNotFoundError("No module named 'docx'")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, '__import__', fake_import)
    upload = UploadFile(filename='missing.docx', file=BytesIO(b'fake-docx'))

    with pytest.raises(ValueError, match='DOCX 解析依赖缺失: python-docx'):
        await service.upload_document(user.id, upload)
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_xlsx_dependency_is_missing(document_test_env, monkeypatch):
    """Uploading a ``.xlsx`` while ``openpyxl`` cannot be imported raises a descriptive ValueError.

    The missing dependency is simulated by patching ``builtins.__import__`` so
    that importing ``openpyxl`` or any of its submodules raises
    ``ModuleNotFoundError``; every other import is delegated to the real hook.
    """
    session, user = document_test_env
    service = DocumentService(session)
    original_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Compare the top-level package so dotted imports such as
        # "openpyxl.utils" are blocked too; this test module has already loaded
        # the real package.
        if name.partition('.')[0] == 'openpyxl':
            raise ModuleNotFoundError("No module named 'openpyxl'")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, '__import__', fake_import)
    upload = UploadFile(filename='missing.xlsx', file=BytesIO(b'fake-xlsx'))

    with pytest.raises(ValueError, match='XLSX 解析依赖缺失: openpyxl'):
        await service.upload_document(user.id, upload)
@pytest.mark.asyncio
async def test_upload_document_uses_mineru_markdown_for_pdf(document_test_env, monkeypatch):
    """A PDF upload is normalized from the markdown returned by ``mineru.to_markdown``.

    A stand-in ``mineru`` module is placed in ``sys.modules`` for the duration
    of the test.
    """
    db, owner = document_test_env
    service = DocumentService(db)

    def to_markdown(file_path):
        return '# PDF Title\n\n## Section\n\nMinerU extracted paragraph.'

    stub_mineru = types.SimpleNamespace(to_markdown=to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', stub_mineru)
    pdf_upload = UploadFile(filename='spec.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    document = await service.upload_document(owner.id, pdf_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    stored_chunks = list(rows.scalars().all())
    chunk_meta = [json.loads(stored.metadata_) for stored in stored_chunks]

    assert document.normalized_format == 'structured_markdown'
    assert '# PDF Title' in document.normalized_content
    assert '## Section' in document.normalized_content
    assert any(meta['content_type'] == 'heading' for meta in chunk_meta)
    assert any(
        meta['content_type'] == 'paragraph' and meta['section_title'] == 'Section'
        for meta in chunk_meta
    )
@pytest.mark.asyncio
async def test_upload_document_preserves_mineru_image_markdown_in_pdf(document_test_env, monkeypatch):
    """Image markdown emitted by the stand-in ``mineru`` module survives normalization."""
    db, owner = document_test_env
    service = DocumentService(db)

    def to_markdown(file_path):
        return '# PDF Title\n\n![System diagram](images/system.png)\n\nSystem diagram shows retrieval flow.'

    stub_mineru = types.SimpleNamespace(to_markdown=to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', stub_mineru)
    pdf_upload = UploadFile(filename='diagram.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    document = await service.upload_document(owner.id, pdf_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await db.execute(ordered_chunks_query)
    stored_chunks = list(rows.scalars().all())

    assert '![System diagram](images/system.png)' in document.normalized_content
    assert any('System diagram' in stored.content for stored in stored_chunks)
@pytest.mark.asyncio
async def test_get_document_content_returns_normalized_pdf_content(document_test_env, monkeypatch):
    """``get_document_content`` returns the markdown produced for an uploaded PDF."""
    db, owner = document_test_env
    service = DocumentService(db)

    def to_markdown(file_path):
        return '# PDF Title\n\nNormalized pdf body.'

    stub_mineru = types.SimpleNamespace(to_markdown=to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', stub_mineru)
    pdf_upload = UploadFile(filename='preview.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    document = await service.upload_document(owner.id, pdf_upload)
    fetched = await service.get_document_content(owner.id, document.id)

    assert fetched == '# PDF Title\n\nNormalized pdf body.'
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_pdf_dependency_is_missing(document_test_env, monkeypatch):
    """Uploading a ``.pdf`` while ``mineru`` cannot be imported raises a descriptive ValueError.

    The missing dependency is simulated by patching ``builtins.__import__`` so
    that importing ``mineru`` or any of its submodules raises
    ``ModuleNotFoundError``; every other import is delegated to the real hook.
    """
    session, user = document_test_env
    service = DocumentService(session)
    original_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Compare the top-level package so dotted imports of mineru submodules
        # are blocked as well as the bare package import.
        if name.partition('.')[0] == 'mineru':
            raise ModuleNotFoundError("No module named 'mineru'")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, '__import__', fake_import)
    upload = UploadFile(filename='missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match='PDF 解析依赖缺失: mineru'):
        await service.upload_document(user.id, upload)

View File

@@ -0,0 +1,130 @@
import importlib
from pathlib import Path
from unittest.mock import AsyncMock, Mock
import pytest
from langchain_core.messages import AIMessage, HumanMessage
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine
import app.models # noqa: F401
from app.database import Base, ensure_document_columns, ensure_message_columns
from app.agents.graph import _ainvoke, _compile_graph
@pytest.mark.anyio
async def test_ensure_message_columns_adds_attachments_for_existing_messages_table(tmp_path):
    """``ensure_message_columns`` adds ``attachments`` to a messages table that lacks it.

    The table is created by hand without the column, then inspected with
    ``PRAGMA table_info`` before and after the call. The engine is disposed in
    a ``finally`` clause so a failing assertion does not leave it open.
    """
    db_path = tmp_path / 'test_messages.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)
    try:
        async with engine.begin() as conn:
            await conn.execute(text(
                '''
CREATE TABLE messages (
id VARCHAR(36) PRIMARY KEY,
conversation_id VARCHAR(36) NOT NULL,
role VARCHAR(20) NOT NULL,
content TEXT NOT NULL,
model VARCHAR(100),
tokens_used INTEGER,
created_at DATETIME,
updated_at DATETIME
)
'''
            ))
            # Column names are read from index 1 of each PRAGMA table_info row.
            result = await conn.execute(text("PRAGMA table_info(messages)"))
            columns_before = {row[1] for row in result.fetchall()}
            assert 'attachments' not in columns_before

            await ensure_message_columns(conn)

            result = await conn.execute(text("PRAGMA table_info(messages)"))
            columns_after = {row[1] for row in result.fetchall()}
            assert 'attachments' in columns_after
    finally:
        await engine.dispose()
@pytest.mark.anyio
async def test_ainvoke_falls_back_to_invoke_for_wrapped_llm_services():
    """``_ainvoke`` awaits ``invoke`` when the LLM object's ``ainvoke`` is ``None``."""
    wrapped_llm = Mock()
    wrapped_llm.ainvoke = None
    wrapped_llm.invoke = AsyncMock(return_value=AIMessage(content='ok'))
    prompt = [HumanMessage(content='ping')]

    reply = await _ainvoke(wrapped_llm, prompt)

    assert reply.content == 'ok'
    wrapped_llm.invoke.assert_awaited_once()
def test_compile_graph_falls_back_when_callbacks_are_unsupported():
    """``_compile_graph`` retries ``compile()`` without kwargs after a ``TypeError`` on ``callbacks``."""
    fallback_result = object()
    callbacks_rejected = TypeError("unexpected keyword argument 'callbacks'")
    graph = Mock()
    # First compile() call raises, second returns the sentinel.
    graph.compile.side_effect = [callbacks_rejected, fallback_result]

    outcome = _compile_graph(graph, callbacks=['cb'])

    assert outcome is fallback_result
    assert graph.compile.call_count == 2
    first_call, second_call = graph.compile.call_args_list[0], graph.compile.call_args_list[1]
    assert first_call.kwargs == {'callbacks': ['cb']}
    assert second_call.kwargs == {}
def test_settings_resolve_data_paths_from_backend_directory():
    """Settings paths resolve under the ``data`` directory two levels above ``app/config``."""
    config_module = importlib.import_module('app.config')
    settings = config_module.settings
    config_file = Path(config_module.__file__).resolve()
    # parents[1] is the directory that contains the package holding config.
    data_root = (config_file.parents[1] / 'data').resolve()

    assert Path(settings.DATA_DIR) == data_root
    normalized_url = settings.DATABASE_URL.replace('\\', '/')
    assert normalized_url.endswith('/backend/data/jarvis.db')
    assert Path(settings.CHROMA_PERSIST_DIR) == data_root / 'chroma'
    assert Path(settings.UPLOAD_DIR) == data_root / 'uploads'
@pytest.mark.anyio
async def test_ensure_document_columns_adds_ingestion_fields_for_existing_documents_table(tmp_path):
    """``ensure_document_columns`` adds the ingestion columns to an existing documents table.

    The table is created by hand without those columns, then inspected with
    ``PRAGMA table_info`` before and after the call. The engine is disposed in
    a ``finally`` clause so a failing assertion does not leave it open.
    """
    db_path = tmp_path / 'test_documents.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)
    try:
        async with engine.begin() as conn:
            await conn.execute(text(
                '''
CREATE TABLE documents (
id VARCHAR(36) PRIMARY KEY,
user_id VARCHAR(36) NOT NULL,
title VARCHAR(500) NOT NULL,
filename VARCHAR(500) NOT NULL,
file_type VARCHAR(50) NOT NULL,
file_size INTEGER NOT NULL,
file_path VARCHAR(1000) NOT NULL,
folder_id VARCHAR(36),
summary TEXT,
chunk_count INTEGER,
is_indexed BOOLEAN,
created_at DATETIME,
updated_at DATETIME
)
'''
            ))
            # Column names are read from index 1 of each PRAGMA table_info row.
            result = await conn.execute(text("PRAGMA table_info(documents)"))
            columns_before = {row[1] for row in result.fetchall()}
            assert 'ingestion_status' not in columns_before
            assert 'ingestion_error' not in columns_before
            assert 'indexed_at' not in columns_before
            assert 'parser_version' not in columns_before
            assert 'index_version' not in columns_before
            assert 'normalized_content' not in columns_before
            assert 'normalized_format' not in columns_before

            await ensure_document_columns(conn)

            result = await conn.execute(text("PRAGMA table_info(documents)"))
            columns_after = {row[1] for row in result.fetchall()}
            assert 'ingestion_status' in columns_after
            assert 'ingestion_error' in columns_after
            assert 'indexed_at' in columns_after
            assert 'parser_version' in columns_after
            assert 'index_version' in columns_after
            assert 'normalized_content' in columns_after
            assert 'normalized_format' in columns_after
    finally:
        await engine.dispose()