Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
235 lines
8.6 KiB
Python
235 lines
8.6 KiB
Python
import json
|
|
from io import BytesIO
|
|
|
|
import pytest
|
|
from httpx import ASGITransport, AsyncClient
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
|
|
|
import app.models # noqa: F401
|
|
from app.database import Base, get_db
|
|
from app.main import app
|
|
from app.models.document import Document, DocumentChunk
|
|
from app.models.user import User
|
|
from app.routers.auth import get_current_user
|
|
from app.services.auth_service import get_password_hash
|
|
from app.services.document_service import DocumentService
|
|
from starlette.datastructures import UploadFile
|
|
|
|
|
|
@pytest.fixture
async def document_router_env(tmp_path):
    """Provide a seeded, throwaway database wired into the app under test.

    Creates a SQLite database file under ``tmp_path``, creates all tables
    from ``Base.metadata``, and seeds one user, one document (``doc-1``) and
    two chunks (``chunk-1``, ``chunk-2``). While the fixture is active the
    app's ``get_db`` and ``get_current_user`` dependencies are overridden so
    requests use this database and the seeded user.

    On teardown all dependency overrides are cleared and the engine is
    disposed.
    """
    sqlite_file = tmp_path / 'test_documents_router.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{sqlite_file}", future=True)
    make_session = async_sessionmaker(engine, expire_on_commit=False)

    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)

    async with make_session() as db:
        user = User(
            email='docs@example.com',
            hashed_password=get_password_hash('secret123'),
            full_name='Docs Tester',
        )
        db.add(user)
        # Flush so user.id is available for the document row below.
        await db.flush()

        seeded_document = Document(
            id='doc-1',
            user_id=user.id,
            title='Uploaded spec',
            filename='uploaded-spec.docx',
            file_type='docx',
            file_size=2048,
            file_path=str(tmp_path / 'uploaded-spec.docx'),
            summary='spec summary',
            chunk_count=3,
            is_indexed=True,
            ingestion_status='ready',
            normalized_content='# Uploaded spec\n\nnormalized body',
            normalized_format='structured_markdown',
        )
        db.add(seeded_document)
        await db.flush()

        # (chunk id, chunk index, content, section title) for each seeded chunk.
        chunk_specs = (
            ('chunk-1', 0, 'original chunk content', 'Intro'),
            ('chunk-2', 1, 'second chunk content', 'Details'),
        )
        db.add_all([
            DocumentChunk(
                id=chunk_id,
                document_id=seeded_document.id,
                chunk_index=position,
                content=text,
                metadata_=json.dumps({'content_type': 'paragraph', 'section_title': section}),
            )
            for chunk_id, position, text, section in chunk_specs
        ])
        await db.commit()
        await db.refresh(user)

    async def override_get_db():
        # Each request gets its own session bound to the temporary database.
        async with make_session() as request_session:
            yield request_session

    async def override_get_current_user():
        # Always return the seeded user.
        return user

    app.dependency_overrides[get_db] = override_get_db
    app.dependency_overrides[get_current_user] = override_get_current_user

    try:
        yield
    finally:
        app.dependency_overrides.clear()
        await engine.dispose()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_list_documents_returns_serializable_document_payload(document_router_env):
    """GET /api/documents returns exactly the one seeded document as JSON."""
    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.get('/api/documents')

    assert resp.status_code == 200
    documents = resp.json()
    assert len(documents) == 1
    only_document = documents[0]
    assert only_document['title'] == 'Uploaded spec'
    assert only_document['ingestion_status'] == 'ready'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_document_chunks_returns_serializable_chunk_payload(document_router_env):
    """GET /api/documents/doc-1/chunks returns both seeded chunks in order."""
    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.get('/api/documents/doc-1/chunks')

    assert resp.status_code == 200
    chunks = resp.json()
    returned_ids = [item['id'] for item in chunks]
    assert returned_ids == ['chunk-1', 'chunk-2']
    assert chunks[0]['content'] == 'original chunk content'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_update_document_chunk_persists_content_and_reindexes_existing_chunks(document_router_env):
    """PUT on a chunk returns the edited chunk and persists the change.

    After the request, the chunk and its document are re-read through the
    overridden ``get_db`` dependency to check the stored content, that the
    document's ``ingestion_status`` is ``'ready'``, and that ``indexed_at``
    has been set.
    """
    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.put(
            '/api/documents/doc-1/chunks/chunk-1',
            json={'content': 'edited chunk content'},
        )

    assert resp.status_code == 200
    body = resp.json()
    assert body['id'] == 'chunk-1'
    assert body['content'] == 'edited chunk content'

    # Reuse the dependency override to obtain a session on the test database.
    async for db in app.dependency_overrides[get_db]():
        chunk_query = select(DocumentChunk).where(DocumentChunk.id == 'chunk-1')
        stored_chunk = (await db.execute(chunk_query)).scalar_one()
        document_query = select(Document).where(Document.id == 'doc-1')
        stored_document = (await db.execute(document_query)).scalar_one()

        assert stored_chunk.content == 'edited chunk content'
        assert stored_document.ingestion_status == 'ready'
        assert stored_document.indexed_at is not None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_document_content_prefers_normalized_content(document_router_env):
    """GET /api/documents/doc-1/content returns the seeded normalized markdown."""
    expected_body = {'content': '# Uploaded spec\n\nnormalized body'}

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.get('/api/documents/doc-1/content')

    assert resp.status_code == 200
    assert resp.json() == expected_body
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_unsupported_file_type(document_router_env):
    """Uploading a ``.exe`` file is rejected with HTTP 400 and a detail message."""
    rejected_upload = ('payload.exe', BytesIO(b'bad'), 'application/octet-stream')

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files={'file': rejected_upload})

    assert resp.status_code == 400
    error_detail = resp.json()['detail']
    assert error_detail == '不支持的文件类型: .exe'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_parser_dependency(document_router_env, monkeypatch):
    """A ValueError raised by the document parser surfaces as HTTP 400.

    ``DocumentService._parse_document`` is replaced with a stub that raises
    the missing-dependency error, and the response ``detail`` must carry the
    same message.
    """
    async def raise_missing_dependency(self, file_path: str, ext: str):
        raise ValueError('DOCX 解析依赖缺失: python-docx')

    monkeypatch.setattr(DocumentService, '_parse_document', raise_missing_dependency)

    docx_upload = (
        'payload.docx',
        BytesIO(b'bad'),
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    )

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files={'file': docx_upload})

    assert resp.status_code == 400
    error_detail = resp.json()['detail']
    assert error_detail == 'DOCX 解析依赖缺失: python-docx'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_mineru_dependency(document_router_env, monkeypatch):
    """A missing-mineru ValueError from the parser surfaces as HTTP 400.

    ``DocumentService._parse_document`` is replaced with a stub that raises
    the error, and the response ``detail`` must carry the same message.
    """
    async def raise_missing_mineru(self, file_path: str, ext: str):
        raise ValueError('PDF 解析依赖缺失: mineru')

    monkeypatch.setattr(DocumentService, '_parse_document', raise_missing_mineru)

    pdf_upload = ('payload.pdf', BytesIO(b'%PDF-1.4 bad'), 'application/pdf')

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files={'file': pdf_upload})

    assert resp.status_code == 400
    error_detail = resp.json()['detail']
    assert error_detail == 'PDF 解析依赖缺失: mineru'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_returns_success_payload_for_pdf(document_router_env, monkeypatch):
    """A successful PDF upload responds with 201 and the summary payload.

    ``DocumentService.upload_document`` is replaced with a stub that returns
    a prebuilt ``Document``, so only the router's response shape is checked.
    """
    async def fake_upload_document(self, user_id: str, file, folder_id=None):
        stub_fields = {
            'id': 'pdf-doc-1',
            'user_id': user_id,
            'title': 'PDF Spec',
            'filename': 'payload.pdf',
            'file_type': 'pdf',
            'file_size': 2048,
            'file_path': 'fake/path/payload.pdf',
            'chunk_count': 4,
            'ingestion_status': 'uploaded',
            'normalized_content': '# PDF Spec\n\nBody',
            'normalized_format': 'structured_markdown',
        }
        return Document(**stub_fields)

    monkeypatch.setattr(DocumentService, 'upload_document', fake_upload_document)

    pdf_upload = ('payload.pdf', BytesIO(b'%PDF-1.4 fake'), 'application/pdf')
    expected_body = {
        'id': 'pdf-doc-1',
        'title': 'PDF Spec',
        'chunk_count': 4,
        'status': '上传成功,正在索引...',
    }

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files={'file': pdf_upload})

    assert resp.status_code == 201
    assert resp.json() == expected_body
|