Add MinerU document ingestion support
Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
234
backend/tests/backend/app/services/test_document_router.py
Normal file
234
backend/tests/backend/app/services/test_document_router.py
Normal file
@@ -0,0 +1,234 @@
|
||||
import json
|
||||
from io import BytesIO
|
||||
|
||||
import pytest
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
||||
|
||||
import app.models # noqa: F401
|
||||
from app.database import Base, get_db
|
||||
from app.main import app
|
||||
from app.models.document import Document, DocumentChunk
|
||||
from app.models.user import User
|
||||
from app.routers.auth import get_current_user
|
||||
from app.services.auth_service import get_password_hash
|
||||
from app.services.document_service import DocumentService
|
||||
from starlette.datastructures import UploadFile
|
||||
|
||||
|
||||
@pytest.fixture
async def document_router_env(tmp_path):
    """Seed a temporary SQLite database and wire it into the FastAPI app.

    Creates the schema in a SQLite file under ``tmp_path``, inserts one user,
    one ``docx`` document (``doc-1``) and two chunks (``chunk-1`` and
    ``chunk-2``), then overrides ``get_db`` and ``get_current_user`` on the
    app. On teardown the overrides are cleared and the engine is disposed.
    """
    database_file = tmp_path / 'test_documents_router.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{database_file}", future=True)
    make_session = async_sessionmaker(engine, expire_on_commit=False)

    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)

    # (chunk id, chunk index, content, section title) for the seeded chunks.
    chunk_specs = (
        ('chunk-1', 0, 'original chunk content', 'Intro'),
        ('chunk-2', 1, 'second chunk content', 'Details'),
    )

    async with make_session() as db:
        user = User(
            email='docs@example.com',
            hashed_password=get_password_hash('secret123'),
            full_name='Docs Tester',
        )
        db.add(user)
        # Flush before reading user.id for the document row below.
        await db.flush()

        document = Document(
            id='doc-1',
            user_id=user.id,
            title='Uploaded spec',
            filename='uploaded-spec.docx',
            file_type='docx',
            file_size=2048,
            file_path=str(tmp_path / 'uploaded-spec.docx'),
            summary='spec summary',
            chunk_count=3,
            is_indexed=True,
            ingestion_status='ready',
            normalized_content='# Uploaded spec\n\nnormalized body',
            normalized_format='structured_markdown',
        )
        db.add(document)
        await db.flush()

        db.add_all([
            DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                chunk_index=position,
                content=body,
                metadata_=json.dumps({'content_type': 'paragraph', 'section_title': section}),
            )
            for chunk_id, position, body, section in chunk_specs
        ])
        await db.commit()
        await db.refresh(user)

    async def override_get_db():
        # Each dependency resolution gets its own session on the test engine.
        async with make_session() as request_session:
            yield request_session

    async def override_get_current_user():
        # Every request is treated as coming from the seeded user.
        return user

    app.dependency_overrides[get_db] = override_get_db
    app.dependency_overrides[get_current_user] = override_get_current_user

    try:
        yield
    finally:
        app.dependency_overrides.clear()
        await engine.dispose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_list_documents_returns_serializable_document_payload(document_router_env):
    """GET /api/documents returns a one-element JSON list for the seeded document."""
    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.get('/api/documents')

    assert resp.status_code == 200
    documents = resp.json()
    assert len(documents) == 1
    assert documents[0]['title'] == 'Uploaded spec'
    assert documents[0]['ingestion_status'] == 'ready'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_document_chunks_returns_serializable_chunk_payload(document_router_env):
    """GET /api/documents/doc-1/chunks returns both seeded chunks in order."""
    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.get('/api/documents/doc-1/chunks')

    assert resp.status_code == 200
    chunk_payloads = resp.json()
    returned_ids = [entry['id'] for entry in chunk_payloads]
    assert returned_ids == ['chunk-1', 'chunk-2']
    assert chunk_payloads[0]['content'] == 'original chunk content'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_update_document_chunk_persists_content_and_reindexes_existing_chunks(document_router_env):
    """PUT on a chunk returns the edited content and persists it to the database.

    After the request, the chunk row holds the new content and the parent
    document is ``ready`` with ``indexed_at`` set.
    """
    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.put(
            '/api/documents/doc-1/chunks/chunk-1',
            json={'content': 'edited chunk content'},
        )

    assert resp.status_code == 200
    body = resp.json()
    assert body['id'] == 'chunk-1'
    assert body['content'] == 'edited chunk content'

    # Re-read through the same overridden dependency the app used.
    chunk_query = select(DocumentChunk).where(DocumentChunk.id == 'chunk-1')
    document_query = select(Document).where(Document.id == 'doc-1')
    async for db in app.dependency_overrides[get_db]():
        updated_chunk = (await db.execute(chunk_query)).scalar_one()
        updated_document = (await db.execute(document_query)).scalar_one()

    assert updated_chunk.content == 'edited chunk content'
    assert updated_document.ingestion_status == 'ready'
    assert updated_document.indexed_at is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_document_content_prefers_normalized_content(document_router_env):
    """GET /api/documents/doc-1/content returns the stored normalized markdown."""
    expected_body = {'content': '# Uploaded spec\n\nnormalized body'}

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.get('/api/documents/doc-1/content')

    assert resp.status_code == 200
    assert resp.json() == expected_body
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_unsupported_file_type(document_router_env):
    """Uploading a ``.exe`` file is rejected with HTTP 400 and a descriptive detail."""
    exe_upload = {'file': ('payload.exe', BytesIO(b'bad'), 'application/octet-stream')}

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files=exe_upload)

    assert resp.status_code == 400
    assert resp.json()['detail'] == '不支持的文件类型: .exe'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_parser_dependency(document_router_env, monkeypatch):
    """A ValueError raised by the DOCX parser surfaces as HTTP 400 with its message."""
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    async def raise_missing_dependency(self, file_path: str, ext: str):
        # Stand-in for DocumentService._parse_document that always fails.
        raise ValueError('DOCX 解析依赖缺失: python-docx')

    monkeypatch.setattr(DocumentService, '_parse_document', raise_missing_dependency)

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post(
            '/api/documents/upload',
            files={'file': ('payload.docx', BytesIO(b'bad'), docx_mime)},
        )

    assert resp.status_code == 400
    assert resp.json()['detail'] == 'DOCX 解析依赖缺失: python-docx'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_returns_400_for_missing_mineru_dependency(document_router_env, monkeypatch):
    """A ValueError raised by the PDF parser surfaces as HTTP 400 with its message."""

    async def raise_missing_mineru(self, file_path: str, ext: str):
        # Stand-in for DocumentService._parse_document that always fails.
        raise ValueError('PDF 解析依赖缺失: mineru')

    monkeypatch.setattr(DocumentService, '_parse_document', raise_missing_mineru)
    pdf_upload = {'file': ('payload.pdf', BytesIO(b'%PDF-1.4 bad'), 'application/pdf')}

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files=pdf_upload)

    assert resp.status_code == 400
    assert resp.json()['detail'] == 'PDF 解析依赖缺失: mineru'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_returns_success_payload_for_pdf(document_router_env, monkeypatch):
    """A successful PDF upload responds with 201 and the summary payload.

    ``DocumentService.upload_document`` is replaced with a stub that returns
    a prebuilt ``Document``, so only the router's response shaping is tested.
    """
    expected_response = {
        'id': 'pdf-doc-1',
        'title': 'PDF Spec',
        'chunk_count': 4,
        'status': '上传成功,正在索引...',
    }

    async def fake_upload_document(self, user_id: str, file, folder_id=None):
        stub_fields = dict(
            id='pdf-doc-1',
            user_id=user_id,
            title='PDF Spec',
            filename='payload.pdf',
            file_type='pdf',
            file_size=2048,
            file_path='fake/path/payload.pdf',
            chunk_count=4,
            ingestion_status='uploaded',
            normalized_content='# PDF Spec\n\nBody',
            normalized_format='structured_markdown',
        )
        return Document(**stub_fields)

    monkeypatch.setattr(DocumentService, 'upload_document', fake_upload_document)
    pdf_upload = {'file': ('payload.pdf', BytesIO(b'%PDF-1.4 fake'), 'application/pdf')}

    async with AsyncClient(transport=ASGITransport(app=app), base_url='http://testserver') as http:
        resp = await http.post('/api/documents/upload', files=pdf_upload)

    assert resp.status_code == 201
    assert resp.json() == expected_response
|
||||
371
backend/tests/backend/app/services/test_document_service.py
Normal file
371
backend/tests/backend/app/services/test_document_service.py
Normal file
@@ -0,0 +1,371 @@
|
||||
import json
|
||||
from io import BytesIO
|
||||
import builtins
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
from docx import Document as DocxDocument
|
||||
from openpyxl import Workbook
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
||||
from starlette.datastructures import UploadFile
|
||||
|
||||
import app.models # noqa: F401
|
||||
from app.database import Base
|
||||
from app.models.document import Document, DocumentChunk
|
||||
from app.models.user import User
|
||||
from app.services.auth_service import get_password_hash
|
||||
from app.services.document_service import DocumentService
|
||||
|
||||
|
||||
@pytest.fixture
async def document_test_env(tmp_path, monkeypatch):
    """Yield ``(session, user)`` backed by a temporary SQLite database.

    Creates the schema, stores one user, and patches the document service
    settings (``UPLOAD_DIR``, ``CHUNK_SIZE``, ``CHUNK_OVERLAP``) for the test.
    The engine is disposed once the test finishes.
    """
    database_file = tmp_path / 'test_documents.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{database_file}", future=True)
    make_session = async_sessionmaker(engine, expire_on_commit=False)

    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)

    async with make_session() as seed_session:
        user = User(
            email='doc-tester@example.com',
            hashed_password=get_password_hash('secret123'),
            full_name='Doc Tester',
        )
        seed_session.add(user)
        await seed_session.commit()
        await seed_session.refresh(user)

    # Keep uploads inside tmp_path and patch the chunking settings.
    setting_overrides = (
        ('UPLOAD_DIR', str(tmp_path / 'uploads')),
        ('CHUNK_SIZE', 120),
        ('CHUNK_OVERLAP', 20),
    )
    for setting_name, setting_value in setting_overrides:
        monkeypatch.setattr(f'app.services.document_service.settings.{setting_name}', setting_value)

    async with make_session() as session:
        yield session, user

    await engine.dispose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_creates_schema_and_row_chunks_for_csv(document_test_env):
    """A CSV upload yields a leading table_schema chunk plus table_rows chunks."""
    session, user = document_test_env
    service = DocumentService(session)
    csv_lines = (
        'region,month,revenue',
        'East,2025-01,100',
        'West,2025-01,200',
        'East,2025-02,150',
        'West,2025-02,250',
    )
    csv_bytes = '\n'.join(csv_lines).encode('utf-8')
    upload = UploadFile(filename='sales.csv', file=BytesIO(csv_bytes))

    document = await service.upload_document(user.id, upload)

    assert document.file_type == 'csv'
    assert document.ingestion_status == 'uploaded'
    assert document.parser_version == 'v2'
    assert document.index_version == 'v2'
    assert document.chunk_count >= 2

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks)
    chunks = list(rows.scalars().all())

    metadata = [json.loads(chunk.metadata_) for chunk in chunks]
    schema_meta = metadata[0]
    assert schema_meta['content_type'] == 'table_schema'
    assert schema_meta['headers'] == ['region', 'month', 'revenue']
    assert any(entry['content_type'] == 'table_rows' for entry in metadata)
    assert any('region=East' in chunk.content for chunk in chunks)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_creates_sheet_metadata_chunks_for_xlsx(document_test_env):
    """An XLSX upload tags chunks with their sheet name and table content types."""
    session, user = document_test_env
    service = DocumentService(session)

    # Two-sheet workbook: 'Revenue' (header + 2 rows) and 'Detail' (header + 1 row).
    workbook = Workbook()
    revenue_sheet = workbook.active
    revenue_sheet.title = 'Revenue'
    for row in (['region', 'quarter', 'amount'], ['East', 'Q1', 300], ['West', 'Q1', 280]):
        revenue_sheet.append(row)
    detail_sheet = workbook.create_sheet('Detail')
    for row in (['project', 'owner'], ['Jarvis', 'Ops']):
        detail_sheet.append(row)

    buffer = BytesIO()
    workbook.save(buffer)
    buffer.seek(0)
    upload = UploadFile(filename='report.xlsx', file=buffer)

    document = await service.upload_document(user.id, upload)

    assert document.file_type == 'xlsx'
    assert document.chunk_count >= 3

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks)
    chunks = list(rows.scalars().all())
    metadata = [json.loads(chunk.metadata_) for chunk in chunks]

    assert any(entry['sheet_name'] == 'Revenue' for entry in metadata)
    assert any(entry['sheet_name'] == 'Detail' for entry in metadata)
    assert any(entry['content_type'] == 'table_schema' for entry in metadata)
    assert any(entry['content_type'] == 'table_rows' for entry in metadata)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_preserves_section_metadata_for_markdown(document_test_env):
    """A Markdown upload records heading hierarchy in each chunk's metadata.

    Checks ``section_path``, ``section_title``, ``chunk_level``,
    ``parent_key`` and ``block_key`` for a top-level heading and for a
    paragraph under a nested heading.
    """
    session, user = document_test_env
    service = DocumentService(session)
    markdown_lines = (
        '# Overview',
        'Jarvis overview paragraph.',
        '',
        '## Retrieval',
        'Hybrid retrieval favors relevant chunks.',
    )
    markdown_bytes = '\n'.join(markdown_lines).encode('utf-8')
    upload = UploadFile(filename='guide.md', file=BytesIO(markdown_bytes))

    document = await service.upload_document(user.id, upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks)
    chunks = list(rows.scalars().all())

    metadata = [json.loads(chunk.metadata_) for chunk in chunks]
    assert any(entry['content_type'] == 'heading' for entry in metadata)
    assert any(
        entry['section_path'] == ['Overview', 'Retrieval']
        for entry in metadata
        if entry['content_type'] != 'heading'
    )
    assert any(entry.get('section_title') == 'Retrieval' for entry in metadata)

    overview_heading = next(
        entry for entry in metadata
        if entry['content_type'] == 'heading' and entry['section_title'] == 'Overview'
    )
    retrieval_paragraph = next(
        entry for entry in metadata
        if entry['content_type'] == 'paragraph' and entry['section_title'] == 'Retrieval'
    )
    assert overview_heading['chunk_level'] == 1
    assert overview_heading['parent_key'] is None
    assert overview_heading['block_key'] == 'Overview'
    assert retrieval_paragraph['chunk_level'] == 2
    assert retrieval_paragraph['parent_key'] == 'Overview'
    assert retrieval_paragraph['block_key'] == 'Overview/Retrieval'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_rejects_unsupported_extension(document_test_env):
    """Uploading a ``.exe`` file raises ValueError with the unsupported-type message."""
    session, user = document_test_env
    exe_upload = UploadFile(filename='malware.exe', file=BytesIO(b'bad'))

    with pytest.raises(ValueError, match='不支持的文件类型'):
        await DocumentService(session).upload_document(user.id, exe_upload)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_persists_structured_metadata_json(document_test_env):
    """A plain-text upload stores JSON chunk metadata and normalized document fields."""
    session, user = document_test_env
    service = DocumentService(session)
    text_body = 'title\n\nplain text body for metadata storage'
    upload = UploadFile(filename='notes.txt', file=BytesIO(text_body.encode('utf-8')))

    document = await service.upload_document(user.id, upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    first_chunk = (await session.execute(ordered_chunks)).scalars().first()

    first_meta = json.loads(first_chunk.metadata_)
    assert first_meta['content_type'] == 'text'
    assert first_meta['parser_version'] == 'v2'
    assert first_meta['index_version'] == 'v2'
    assert first_meta['source_order'] == 0

    # Re-select the document row instead of trusting the returned instance.
    document_rows = await session.execute(select(Document).where(Document.id == document.id))
    stored_document = document_rows.scalar_one()
    assert stored_document.ingestion_status == 'uploaded'
    assert stored_document.normalized_format == 'structured_markdown'
    assert stored_document.normalized_content == 'title\n\nplain text body for metadata storage'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_extracts_docx_heading_and_table_structure(document_test_env):
    """A DOCX upload keeps heading paths and table headers, and is normalized to markdown."""
    session, user = document_test_env
    service = DocumentService(session)

    # Build a DOCX with two heading levels, paragraphs, and a 2x2 table.
    docx_file = DocxDocument()
    docx_file.add_heading('Architecture', level=1)
    docx_file.add_paragraph('System overview paragraph.')
    docx_file.add_heading('Retrieval', level=2)
    docx_file.add_paragraph('Section-aware retrieval paragraph.')
    table = docx_file.add_table(rows=2, cols=2)
    header_row, value_row = table.rows[0], table.rows[1]
    header_row.cells[0].text = 'metric'
    header_row.cells[1].text = 'value'
    value_row.cells[0].text = 'latency'
    value_row.cells[1].text = '120ms'

    buffer = BytesIO()
    docx_file.save(buffer)
    buffer.seek(0)
    upload = UploadFile(filename='architecture.docx', file=buffer)

    document = await service.upload_document(user.id, upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks)
    chunks = list(rows.scalars().all())
    metadata = [json.loads(chunk.metadata_) for chunk in chunks]

    retrieval_paragraph = next(
        entry for entry in metadata
        if entry['section_title'] == 'Retrieval' and entry['content_type'] == 'paragraph'
    )
    table_schema = next(entry for entry in metadata if entry['content_type'] == 'table_schema')

    assert retrieval_paragraph['section_path'] == ['Architecture', 'Retrieval']
    assert table_schema['headers'] == ['metric', 'value']
    assert any(entry['content_type'] == 'table_rows' for entry in metadata)
    assert document.normalized_format == 'structured_markdown'
    normalized = document.normalized_content
    assert '# Architecture' in normalized
    assert '## Retrieval' in normalized
    assert '| metric | value |' in normalized
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_docx_dependency_is_missing(document_test_env, monkeypatch):
    """When importing ``docx`` fails, the upload raises a ValueError naming python-docx."""
    session, user = document_test_env
    service = DocumentService(session)

    real_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Only the 'docx' import is made to fail; everything else is delegated.
        if name != 'docx':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'docx'")

    monkeypatch.setattr(builtins, '__import__', fake_import)

    upload = UploadFile(filename='missing.docx', file=BytesIO(b'fake-docx'))

    with pytest.raises(ValueError, match='DOCX 解析依赖缺失: python-docx'):
        await service.upload_document(user.id, upload)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_xlsx_dependency_is_missing(document_test_env, monkeypatch):
    """When importing ``openpyxl`` fails, the upload raises a ValueError naming openpyxl."""
    session, user = document_test_env
    service = DocumentService(session)

    real_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Only the 'openpyxl' import is made to fail; everything else is delegated.
        if name != 'openpyxl':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'openpyxl'")

    monkeypatch.setattr(builtins, '__import__', fake_import)

    upload = UploadFile(filename='missing.xlsx', file=BytesIO(b'fake-xlsx'))

    with pytest.raises(ValueError, match='XLSX 解析依赖缺失: openpyxl'):
        await service.upload_document(user.id, upload)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_uses_mineru_markdown_for_pdf(document_test_env, monkeypatch):
    """A PDF upload is normalized and chunked from the markdown the mineru module returns."""
    session, user = document_test_env
    service = DocumentService(session)

    def to_markdown(file_path):
        # Canned output standing in for the real MinerU conversion.
        return '# PDF Title\n\n## Section\n\nMinerU extracted paragraph.'

    # Register a stub module so an import of 'mineru' resolves to it.
    monkeypatch.setitem(sys.modules, 'mineru', types.SimpleNamespace(to_markdown=to_markdown))

    upload = UploadFile(filename='spec.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks)
    chunks = list(rows.scalars().all())
    metadata = [json.loads(chunk.metadata_) for chunk in chunks]

    assert document.normalized_format == 'structured_markdown'
    assert '# PDF Title' in document.normalized_content
    assert '## Section' in document.normalized_content
    assert any(entry['content_type'] == 'heading' for entry in metadata)
    assert any(
        entry['content_type'] == 'paragraph' and entry['section_title'] == 'Section'
        for entry in metadata
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_preserves_mineru_image_markdown_in_pdf(document_test_env, monkeypatch):
    """A PDF upload keeps image markdown from the mineru output in the normalized content.

    The stub MinerU output embeds an explicit image tag, and the test asserts
    that exact tag appears in ``normalized_content``. Asserting on an empty
    string would pass for any content and prove nothing about images.
    """
    session, user = document_test_env
    service = DocumentService(session)

    # NOTE(review): representative image tag; swap in the project's canonical
    # MinerU image path format if it differs.
    image_markdown = '![System diagram](images/system-diagram.png)'
    mineru_output = f'# PDF Title\n\n{image_markdown}\n\nSystem diagram shows retrieval flow.'

    fake_mineru = types.SimpleNamespace(to_markdown=lambda file_path: mineru_output)
    # Register a stub module so an import of 'mineru' resolves to it.
    monkeypatch.setitem(sys.modules, 'mineru', fake_mineru)

    upload = UploadFile(filename='diagram.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, upload)

    chunk_result = await session.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunks = list(chunk_result.scalars().all())

    assert image_markdown in document.normalized_content
    assert any('System diagram' in chunk.content for chunk in chunks)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_document_content_returns_normalized_pdf_content(document_test_env, monkeypatch):
    """``get_document_content`` returns the markdown produced for an uploaded PDF."""
    session, user = document_test_env
    service = DocumentService(session)

    def to_markdown(file_path):
        # Canned output standing in for the real MinerU conversion.
        return '# PDF Title\n\nNormalized pdf body.'

    # Register a stub module so an import of 'mineru' resolves to it.
    monkeypatch.setitem(sys.modules, 'mineru', types.SimpleNamespace(to_markdown=to_markdown))

    upload = UploadFile(filename='preview.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, upload)

    returned_content = await service.get_document_content(user.id, document.id)

    assert returned_content == '# PDF Title\n\nNormalized pdf body.'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_pdf_dependency_is_missing(document_test_env, monkeypatch):
    """When importing ``mineru`` fails, the upload raises a ValueError naming mineru."""
    session, user = document_test_env
    service = DocumentService(session)

    real_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Only the 'mineru' import is made to fail; everything else is delegated.
        if name != 'mineru':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'mineru'")

    monkeypatch.setattr(builtins, '__import__', fake_import)

    upload = UploadFile(filename='missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match='PDF 解析依赖缺失: mineru'):
        await service.upload_document(user.id, upload)
|
||||
130
backend/tests/backend/app/test_database.py
Normal file
130
backend/tests/backend/app/test_database.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, Mock
|
||||
|
||||
import pytest
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
|
||||
import app.models # noqa: F401
|
||||
from app.database import Base, ensure_document_columns, ensure_message_columns
|
||||
from app.agents.graph import _ainvoke, _compile_graph
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_ensure_message_columns_adds_attachments_for_existing_messages_table(tmp_path):
    """``ensure_message_columns`` adds ``attachments`` to a legacy ``messages`` table.

    A table without the column is created by hand, the helper is run on the
    same connection, and the column list is re-read via ``PRAGMA table_info``.
    The engine is disposed in a ``finally`` block so a failing assertion does
    not leave it open.
    """
    db_path = tmp_path / 'test_messages.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)

    try:
        async with engine.begin() as conn:
            await conn.execute(text(
                '''
                CREATE TABLE messages (
                    id VARCHAR(36) PRIMARY KEY,
                    conversation_id VARCHAR(36) NOT NULL,
                    role VARCHAR(20) NOT NULL,
                    content TEXT NOT NULL,
                    model VARCHAR(100),
                    tokens_used INTEGER,
                    created_at DATETIME,
                    updated_at DATETIME
                )
                '''
            ))
            result = await conn.execute(text("PRAGMA table_info(messages)"))
            # Index 1 of each PRAGMA table_info row is the column name.
            columns_before = {row[1] for row in result.fetchall()}
            assert 'attachments' not in columns_before

            await ensure_message_columns(conn)

            result = await conn.execute(text("PRAGMA table_info(messages)"))
            columns_after = {row[1] for row in result.fetchall()}
            assert 'attachments' in columns_after
    finally:
        await engine.dispose()
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_ainvoke_falls_back_to_invoke_for_wrapped_llm_services():
    """``_ainvoke`` awaits ``invoke`` when the LLM object's ``ainvoke`` is ``None``."""
    wrapped_llm = Mock(
        ainvoke=None,
        invoke=AsyncMock(return_value=AIMessage(content='ok')),
    )

    reply = await _ainvoke(wrapped_llm, [HumanMessage(content='ping')])

    assert reply.content == 'ok'
    wrapped_llm.invoke.assert_awaited_once()
|
||||
|
||||
|
||||
def test_compile_graph_falls_back_when_callbacks_are_unsupported():
    """``_compile_graph`` retries without ``callbacks`` after a TypeError.

    The mocked ``compile`` raises on the first call and returns a sentinel on
    the second; the sentinel must be returned and the second call must carry
    no keyword arguments.
    """
    sentinel_graph = object()
    graph = Mock()
    graph.compile.side_effect = [
        TypeError("unexpected keyword argument 'callbacks'"),
        sentinel_graph,
    ]

    compiled = _compile_graph(graph, callbacks=['cb'])

    assert compiled is sentinel_graph
    assert graph.compile.call_count == 2
    first_call, second_call = graph.compile.call_args_list[0], graph.compile.call_args_list[1]
    assert first_call.kwargs == {'callbacks': ['cb']}
    assert second_call.kwargs == {}
|
||||
|
||||
|
||||
def test_settings_resolve_data_paths_from_backend_directory():
    """Settings paths resolve to the ``data`` directory two levels above ``app/config``."""
    config_module = importlib.import_module('app.config')
    settings = config_module.settings
    config_path = Path(config_module.__file__).resolve()
    expected_data_dir = (config_path.parent.parent / 'data').resolve()

    assert Path(settings.DATA_DIR) == expected_data_dir
    # Normalize Windows separators before checking the URL suffix.
    database_url = settings.DATABASE_URL.replace('\\', '/')
    assert database_url.endswith('/backend/data/jarvis.db')
    assert Path(settings.CHROMA_PERSIST_DIR) == expected_data_dir / 'chroma'
    assert Path(settings.UPLOAD_DIR) == expected_data_dir / 'uploads'
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_ensure_document_columns_adds_ingestion_fields_for_existing_documents_table(tmp_path):
    """``ensure_document_columns`` adds the ingestion columns to a legacy ``documents`` table.

    A table without the ingestion columns is created by hand, the helper is
    run on the same connection, and the column list is re-read via
    ``PRAGMA table_info``. The engine is disposed in a ``finally`` block so a
    failing assertion does not leave it open.
    """
    db_path = tmp_path / 'test_documents.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)

    # Columns that must be absent before and present after the helper runs.
    ingestion_columns = {
        'ingestion_status',
        'ingestion_error',
        'indexed_at',
        'parser_version',
        'index_version',
        'normalized_content',
        'normalized_format',
    }

    try:
        async with engine.begin() as conn:
            await conn.execute(text(
                '''
                CREATE TABLE documents (
                    id VARCHAR(36) PRIMARY KEY,
                    user_id VARCHAR(36) NOT NULL,
                    title VARCHAR(500) NOT NULL,
                    filename VARCHAR(500) NOT NULL,
                    file_type VARCHAR(50) NOT NULL,
                    file_size INTEGER NOT NULL,
                    file_path VARCHAR(1000) NOT NULL,
                    folder_id VARCHAR(36),
                    summary TEXT,
                    chunk_count INTEGER,
                    is_indexed BOOLEAN,
                    created_at DATETIME,
                    updated_at DATETIME
                )
                '''
            ))
            result = await conn.execute(text("PRAGMA table_info(documents)"))
            # Index 1 of each PRAGMA table_info row is the column name.
            columns_before = {row[1] for row in result.fetchall()}
            already_present = ingestion_columns & columns_before
            assert not already_present, f'unexpected pre-existing columns: {sorted(already_present)}'

            await ensure_document_columns(conn)

            result = await conn.execute(text("PRAGMA table_info(documents)"))
            columns_after = {row[1] for row in result.fetchall()}
            still_missing = ingestion_columns - columns_after
            assert not still_missing, f'columns not added: {sorted(still_missing)}'
    finally:
        await engine.dispose()
|
||||
Reference in New Issue
Block a user