Files
JARVIS/backend/tests/backend/app/services/test_document_router.py
DESKTOP-72TV0V4\caoxiaozhu 3ee825aa90 Add MinerU document ingestion support
Normalize uploaded documents into structured markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces deprecated UTC timestamp helpers in
the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 13:42:16 +08:00

235 lines
8.6 KiB
Python

import json
from io import BytesIO
import pytest
from httpx import ASGITransport, AsyncClient
from sqlalchemy import select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
import app.models # noqa: F401
from app.database import Base, get_db
from app.main import app
from app.models.document import Document, DocumentChunk
from app.models.user import User
from app.routers.auth import get_current_user
from app.services.auth_service import get_password_hash
from app.services.document_service import DocumentService
from starlette.datastructures import UploadFile
@pytest.fixture
async def document_router_env(tmp_path):
    """Provide a seeded SQLite database and dependency overrides for router tests.

    Creates a file-backed SQLite database under ``tmp_path``, builds every
    table registered on ``Base.metadata``, and seeds one user, one document
    (``doc-1``) and two chunks (``chunk-1``, ``chunk-2``). While the fixture
    is active, ``get_db`` yields sessions bound to that database and
    ``get_current_user`` returns the seeded user.

    On teardown the dependency overrides that existed before the fixture ran
    are restored and the engine is disposed, even if setup itself failed.
    """
    db_path = tmp_path / 'test_documents_router.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)
    session_factory = async_sessionmaker(engine, expire_on_commit=False)

    # Snapshot the current overrides so teardown can put them back instead of
    # wiping overrides this fixture did not install.
    previous_overrides = dict(app.dependency_overrides)

    # Setup lives inside the try block so the engine is always disposed, even
    # when schema creation or seeding raises.
    try:
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        async with session_factory() as session:
            user = User(
                email='docs@example.com',
                hashed_password=get_password_hash('secret123'),
                full_name='Docs Tester',
            )
            session.add(user)
            # Flush so user.id is populated before the document references it.
            await session.flush()

            # NOTE(review): chunk_count is 3 although only two chunk rows are
            # seeded below -- confirm whether the mismatch is intentional.
            document = Document(
                id='doc-1',
                user_id=user.id,
                title='Uploaded spec',
                filename='uploaded-spec.docx',
                file_type='docx',
                file_size=2048,
                file_path=str(tmp_path / 'uploaded-spec.docx'),
                summary='spec summary',
                chunk_count=3,
                is_indexed=True,
                ingestion_status='ready',
                normalized_content='# Uploaded spec\n\nnormalized body',
                normalized_format='structured_markdown',
            )
            session.add(document)
            await session.flush()

            session.add_all([
                DocumentChunk(
                    id='chunk-1',
                    document_id=document.id,
                    chunk_index=0,
                    content='original chunk content',
                    metadata_=json.dumps({'content_type': 'paragraph', 'section_title': 'Intro'}),
                ),
                DocumentChunk(
                    id='chunk-2',
                    document_id=document.id,
                    chunk_index=1,
                    content='second chunk content',
                    metadata_=json.dumps({'content_type': 'paragraph', 'section_title': 'Details'}),
                ),
            ])
            await session.commit()
            await session.refresh(user)

        async def override_get_db():
            # Each request gets its own session on the temporary database.
            async with session_factory() as session:
                yield session

        async def override_get_current_user():
            # Skip real authentication; every request acts as the seeded user.
            return user

        app.dependency_overrides[get_db] = override_get_db
        app.dependency_overrides[get_current_user] = override_get_current_user

        yield
    finally:
        # Restore exactly the overrides that were present before this fixture.
        app.dependency_overrides.clear()
        app.dependency_overrides.update(previous_overrides)
        await engine.dispose()
@pytest.mark.asyncio
async def test_list_documents_returns_serializable_document_payload(document_router_env):
    """GET /api/documents returns one JSON document with the seeded title and status."""
    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        listing = await http.get('/api/documents')

    assert listing.status_code == 200

    documents = listing.json()
    assert len(documents) == 1

    first = documents[0]
    assert first['title'] == 'Uploaded spec'
    assert first['ingestion_status'] == 'ready'
@pytest.mark.asyncio
async def test_get_document_chunks_returns_serializable_chunk_payload(document_router_env):
    """GET /api/documents/doc-1/chunks returns both chunks in order as JSON."""
    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.get('/api/documents/doc-1/chunks')

    assert reply.status_code == 200

    chunks = reply.json()
    returned_ids = [entry['id'] for entry in chunks]
    assert returned_ids == ['chunk-1', 'chunk-2']
    assert chunks[0]['content'] == 'original chunk content'
@pytest.mark.asyncio
async def test_update_document_chunk_persists_content_and_reindexes_existing_chunks(document_router_env):
    """PUT on a chunk returns the edited content and the change is stored in the database."""
    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.put(
            '/api/documents/doc-1/chunks/chunk-1',
            json={'content': 'edited chunk content'},
        )

    assert reply.status_code == 200

    body = reply.json()
    assert body['id'] == 'chunk-1'
    assert body['content'] == 'edited chunk content'

    # Read the rows back through the same get_db override the app used, so the
    # assertions see what was actually written to the test database.
    open_session = app.dependency_overrides[get_db]
    async for db in open_session():
        chunk_query = select(DocumentChunk).where(DocumentChunk.id == 'chunk-1')
        stored_chunk = (await db.execute(chunk_query)).scalar_one()

        document_query = select(Document).where(Document.id == 'doc-1')
        stored_document = (await db.execute(document_query)).scalar_one()

        assert stored_chunk.content == 'edited chunk content'
        assert stored_document.ingestion_status == 'ready'
        assert stored_document.indexed_at is not None
@pytest.mark.asyncio
async def test_get_document_content_prefers_normalized_content(document_router_env):
    """GET /api/documents/doc-1/content returns the document's normalized markdown."""
    expected = {'content': '# Uploaded spec\n\nnormalized body'}

    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.get('/api/documents/doc-1/content')

    assert reply.status_code == 200
    assert reply.json() == expected
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_unsupported_file_type(document_router_env):
    """Uploading a .exe file is rejected with HTTP 400 and an unsupported-type detail."""
    upload = {'file': ('payload.exe', BytesIO(b'bad'), 'application/octet-stream')}

    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.post('/api/documents/upload', files=upload)

    assert reply.status_code == 400
    detail = reply.json()['detail']
    assert detail == '不支持的文件类型: .exe'
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_parser_dependency(document_router_env, monkeypatch):
    """A ValueError raised while parsing a DOCX upload surfaces as HTTP 400 with its message."""

    async def failing_parser(self, file_path: str, ext: str):
        # Stand-in for DocumentService._parse_document that always fails.
        raise ValueError('DOCX 解析依赖缺失: python-docx')

    monkeypatch.setattr(DocumentService, '_parse_document', failing_parser)

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    upload = {'file': ('payload.docx', BytesIO(b'bad'), docx_mime)}

    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.post('/api/documents/upload', files=upload)

    assert reply.status_code == 400
    detail = reply.json()['detail']
    assert detail == 'DOCX 解析依赖缺失: python-docx'
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_mineru_dependency(document_router_env, monkeypatch):
    """A ValueError raised while parsing a PDF upload surfaces as HTTP 400 with its message."""

    async def failing_parser(self, file_path: str, ext: str):
        # Stand-in for DocumentService._parse_document that always fails.
        raise ValueError('PDF 解析依赖缺失: mineru')

    monkeypatch.setattr(DocumentService, '_parse_document', failing_parser)

    upload = {'file': ('payload.pdf', BytesIO(b'%PDF-1.4 bad'), 'application/pdf')}

    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.post('/api/documents/upload', files=upload)

    assert reply.status_code == 400
    detail = reply.json()['detail']
    assert detail == 'PDF 解析依赖缺失: mineru'
@pytest.mark.asyncio
async def test_upload_document_returns_success_payload_for_pdf(document_router_env, monkeypatch):
    """A PDF upload answers HTTP 201 with the id, title, chunk count and status message."""

    async def stub_upload(self, user_id: str, file, folder_id=None):
        # Replace the real upload pipeline with a ready-made Document so the
        # test exercises only the router's response shaping.
        fields = {
            'id': 'pdf-doc-1',
            'user_id': user_id,
            'title': 'PDF Spec',
            'filename': 'payload.pdf',
            'file_type': 'pdf',
            'file_size': 2048,
            'file_path': 'fake/path/payload.pdf',
            'chunk_count': 4,
            'ingestion_status': 'uploaded',
            'normalized_content': '# PDF Spec\n\nBody',
            'normalized_format': 'structured_markdown',
        }
        return Document(**fields)

    monkeypatch.setattr(DocumentService, 'upload_document', stub_upload)

    upload = {'file': ('payload.pdf', BytesIO(b'%PDF-1.4 fake'), 'application/pdf')}
    expected = {
        'id': 'pdf-doc-1',
        'title': 'PDF Spec',
        'chunk_count': 4,
        'status': '上传成功,正在索引...',
    }

    async with AsyncClient(
        transport=ASGITransport(app=app),
        base_url='http://testserver',
    ) as http:
        reply = await http.post('/api/documents/upload', files=upload)

    assert reply.status_code == 201
    assert reply.json() == expected