Files
JARVIS/backend/tests/backend/app/services/test_document_service.py
WIN-JHFT4D3SIVT\caoxiaozhu 8c7cf0732b Align knowledge storage with real folders and add WebDAV import surface
Knowledge files were only partitioned in the database, which made nested uploads, local folder visibility, and delete behavior diverge from the UI. This change makes folder selection drive physical storage paths, keeps original filenames, adds a minimal WebDAV mount/sync path, and reshapes the knowledge panel so local and remote sources can share the same surface.

Constraint: Existing knowledge flow already depends on local-folder-backed uploads and document indexing
Rejected: Real-time bidirectional WebDAV sync | too much conflict and lifecycle complexity for the first pass
Confidence: medium
Scope-risk: moderate
Reversibility: messy
Directive: Keep remote mounts single-direction into local knowledge folders until etag-based incremental sync and conflict rules are verified
Tested: Python py_compile on new/modified backend files; LSP diagnostics on new frontend/backend files; manual targeted code-path inspection
Not-tested: Full pytest/vitest end-to-end runs blocked by environment temp/cache permission errors; live WebDAV server interoperability
2026-04-09 17:26:37 +08:00

464 lines
18 KiB
Python

import json
from io import BytesIO
import builtins
from pathlib import Path
import sys
import types
import pytest
from docx import Document as DocxDocument
from openpyxl import Workbook
from sqlalchemy import select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from starlette.datastructures import UploadFile
import app.models # noqa: F401
from app.database import Base
from app.models.document import Document, DocumentChunk
from app.models.folder import Folder
from app.models.user import User
from app.services.auth_service import get_password_hash
from app.services.document_service import DocumentService
@pytest.fixture
async def document_test_env(tmp_path, monkeypatch):
    """Yield a ``(session, user)`` pair backed by a throwaway SQLite file.

    The fixture creates every table registered on ``Base.metadata``, persists
    a single user, and patches the document-service settings so uploads land
    under ``tmp_path`` and chunking uses small sizes. The engine is disposed
    once the consuming test has finished.
    """
    database_file = tmp_path / 'test_documents.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{database_file}", future=True)
    make_session = async_sessionmaker(engine, expire_on_commit=False)

    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)

    # Seed the user in its own session so the test session starts clean.
    async with make_session() as seed_session:
        user = User(
            email='doc-tester@example.com',
            hashed_password=get_password_hash('secret123'),
            full_name='Doc Tester',
        )
        seed_session.add(user)
        await seed_session.commit()
        await seed_session.refresh(user)

    setting_overrides = (
        ('UPLOAD_DIR', str(tmp_path / 'uploads')),
        ('CHUNK_SIZE', 120),
        ('CHUNK_OVERLAP', 20),
    )
    for setting_name, setting_value in setting_overrides:
        monkeypatch.setattr(
            f'app.services.document_service.settings.{setting_name}',
            setting_value,
        )

    async with make_session() as session:
        yield session, user

    await engine.dispose()
@pytest.mark.asyncio
async def test_upload_document_creates_schema_and_row_chunks_for_csv(document_test_env):
    """Upload a small CSV and check the document fields and chunk metadata.

    Asserts that the first chunk is a ``table_schema`` chunk carrying the CSV
    headers, that at least one ``table_rows`` chunk exists, and that row
    content is rendered with ``column=value`` pairs.
    """
    session, user = document_test_env
    csv_lines = [
        'region,month,revenue',
        'East,2025-01,100',
        'West,2025-01,200',
        'East,2025-02,150',
        'West,2025-02,250',
    ]
    csv_bytes = '\n'.join(csv_lines).encode('utf-8')
    upload = UploadFile(filename='sales.csv', file=BytesIO(csv_bytes))

    document = await DocumentService(session).upload_document(user.id, upload)

    assert document.file_type == 'csv'
    assert document.ingestion_status == 'uploaded'
    assert document.parser_version == 'v2'
    assert document.index_version == 'v2'
    assert document.chunk_count >= 2

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks_query)
    chunks = list(rows.scalars().all())
    metadata = [json.loads(stored.metadata_) for stored in chunks]

    first_meta = metadata[0]
    assert first_meta['content_type'] == 'table_schema'
    assert first_meta['headers'] == ['region', 'month', 'revenue']
    assert any(entry['content_type'] == 'table_rows' for entry in metadata)
    assert any('region=East' in stored.content for stored in chunks)
@pytest.mark.asyncio
async def test_upload_document_creates_sheet_metadata_chunks_for_xlsx(document_test_env):
    """Upload a two-sheet workbook and check per-sheet chunk metadata.

    The workbook has a ``Revenue`` sheet and a ``Detail`` sheet; the test
    asserts that both sheet names appear in chunk metadata and that schema
    and row chunks are both present.
    """
    session, user = document_test_env
    service = DocumentService(session)

    workbook = Workbook()
    revenue_sheet = workbook.active
    revenue_sheet.title = 'Revenue'
    for row in (
        ['region', 'quarter', 'amount'],
        ['East', 'Q1', 300],
        ['West', 'Q1', 280],
    ):
        revenue_sheet.append(row)

    detail_sheet = workbook.create_sheet('Detail')
    for row in (['project', 'owner'], ['Jarvis', 'Ops']):
        detail_sheet.append(row)

    # Serialize in memory and rewind so the upload reads from the start.
    buffer = BytesIO()
    workbook.save(buffer)
    buffer.seek(0)

    upload = UploadFile(filename='report.xlsx', file=buffer)
    document = await service.upload_document(user.id, upload)

    assert document.file_type == 'xlsx'
    assert document.chunk_count >= 3

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks_query)
    chunks = list(rows.scalars().all())
    metadata = [json.loads(stored.metadata_) for stored in chunks]

    assert any(entry['sheet_name'] == 'Revenue' for entry in metadata)
    assert any(entry['sheet_name'] == 'Detail' for entry in metadata)
    assert any(entry['content_type'] == 'table_schema' for entry in metadata)
    assert any(entry['content_type'] == 'table_rows' for entry in metadata)
@pytest.mark.asyncio
async def test_upload_document_preserves_section_metadata_for_markdown(document_test_env):
    """Upload Markdown with nested headings and check section metadata.

    Asserts that heading chunks exist, that non-heading chunks under
    ``## Retrieval`` carry the ``['Overview', 'Retrieval']`` section path, and
    that ``chunk_level`` / ``parent_key`` / ``block_key`` reflect the heading
    hierarchy.
    """
    session, user = document_test_env
    service = DocumentService(session)
    payload = '\n'.join([
        '# Overview',
        'Jarvis overview paragraph.',
        '',
        '## Retrieval',
        'Hybrid retrieval favors relevant chunks.',
    ])
    upload = UploadFile(filename='guide.md', file=BytesIO(payload.encode('utf-8')))

    document = await service.upload_document(user.id, upload)

    chunk_result = await session.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunks = list(chunk_result.scalars().all())
    metadata = [json.loads(chunk.metadata_) for chunk in chunks]

    assert any(item['content_type'] == 'heading' for item in metadata)
    assert any(
        item['section_path'] == ['Overview', 'Retrieval']
        for item in metadata
        if item['content_type'] != 'heading'
    )
    assert any(item.get('section_title') == 'Retrieval' for item in metadata)

    # A bare next() that finds nothing raises StopIteration, which a coroutine
    # re-raises as an opaque RuntimeError. Use a default and assert explicitly
    # so a missing chunk fails with a readable message.
    heading_item = next(
        (
            item for item in metadata
            if item['content_type'] == 'heading' and item['section_title'] == 'Overview'
        ),
        None,
    )
    assert heading_item is not None, "no heading chunk with section_title 'Overview'"

    child_item = next(
        (
            item for item in metadata
            if item['content_type'] == 'paragraph' and item['section_title'] == 'Retrieval'
        ),
        None,
    )
    assert child_item is not None, "no paragraph chunk with section_title 'Retrieval'"

    assert heading_item['chunk_level'] == 1
    assert heading_item['parent_key'] is None
    assert heading_item['block_key'] == 'Overview'
    assert child_item['chunk_level'] == 2
    assert child_item['parent_key'] == 'Overview'
    assert child_item['block_key'] == 'Overview/Retrieval'
@pytest.mark.asyncio
async def test_upload_document_rejects_unsupported_extension(document_test_env):
    """Uploading a ``.exe`` file must raise ``ValueError`` with the unsupported-type message."""
    session, user = document_test_env
    rejected_upload = UploadFile(filename='malware.exe', file=BytesIO(b'bad'))
    service = DocumentService(session)

    with pytest.raises(ValueError, match='不支持的文件类型'):
        await service.upload_document(user.id, rejected_upload)
@pytest.mark.asyncio
async def test_upload_document_persists_structured_metadata_json(document_test_env):
    """Upload plain text and check the stored chunk metadata and document row.

    Asserts that the first chunk's ``metadata_`` is JSON containing the
    content type, parser/index versions and source order, and that the
    re-queried document row stores the normalized content unchanged.
    """
    session, user = document_test_env
    service = DocumentService(session)
    payload = 'title\n\nplain text body for metadata storage'
    upload = UploadFile(filename='notes.txt', file=BytesIO(payload.encode('utf-8')))

    document = await service.upload_document(user.id, upload)

    chunk_result = await session.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunk = chunk_result.scalars().first()
    # first() returns None when no row matches; fail with a clear message
    # rather than an AttributeError on ``chunk.metadata_`` below.
    assert chunk is not None, 'upload produced no chunks for notes.txt'

    parsed = json.loads(chunk.metadata_)
    assert parsed['content_type'] == 'text'
    assert parsed['parser_version'] == 'v2'
    assert parsed['index_version'] == 'v2'
    assert parsed['source_order'] == 0

    # Re-read the document from the database instead of trusting the returned object.
    document_result = await session.execute(select(Document).where(Document.id == document.id))
    stored_document = document_result.scalar_one()
    assert stored_document.ingestion_status == 'uploaded'
    assert stored_document.normalized_format == 'structured_markdown'
    assert stored_document.normalized_content == 'title\n\nplain text body for metadata storage'
@pytest.mark.asyncio
async def test_upload_document_stores_file_in_nested_folder_with_original_name(document_test_env):
    """Upload into a nested folder and check the on-disk path mirrors it.

    Creates ``Projects`` with a child ``Specs`` folder, uploads into the
    child, and asserts the stored file keeps its original name and sits at
    ``.../Projects/Specs/``.
    """
    session, user = document_test_env
    service = DocumentService(session)

    parent_folder = Folder(user_id=user.id, name='Projects')
    session.add(parent_folder)
    # Flush so the parent has a primary key before the child references it.
    await session.flush()

    leaf_folder = Folder(user_id=user.id, name='Specs', parent_id=parent_folder.id)
    session.add(leaf_folder)
    await session.commit()
    await session.refresh(leaf_folder)

    upload = UploadFile(filename='system-design.md', file=BytesIO(b'# Design'))
    document = await service.upload_document(user.id, upload, folder_id=leaf_folder.id)

    stored_path = Path(document.file_path)
    specs_dir = stored_path.parent
    projects_dir = specs_dir.parent
    assert stored_path.name == 'system-design.md'
    assert specs_dir.name == 'Specs'
    assert projects_dir.name == 'Projects'
    assert stored_path.exists()
@pytest.mark.asyncio
async def test_upload_document_extracts_docx_heading_and_table_structure(document_test_env):
    """Upload a DOCX with headings and a table and check extracted structure.

    Asserts that the paragraph under the level-2 heading carries the full
    section path, that the table yields schema and row chunks with the
    expected headers, and that the normalized content is Markdown containing
    both headings and the table header row.
    """
    session, user = document_test_env
    service = DocumentService(session)

    doc = DocxDocument()
    doc.add_heading('Architecture', level=1)
    doc.add_paragraph('System overview paragraph.')
    doc.add_heading('Retrieval', level=2)
    doc.add_paragraph('Section-aware retrieval paragraph.')
    table = doc.add_table(rows=2, cols=2)
    table.rows[0].cells[0].text = 'metric'
    table.rows[0].cells[1].text = 'value'
    table.rows[1].cells[0].text = 'latency'
    table.rows[1].cells[1].text = '120ms'

    # Serialize in memory and rewind so the upload reads from the start.
    file_obj = BytesIO()
    doc.save(file_obj)
    file_obj.seek(0)

    upload = UploadFile(filename='architecture.docx', file=file_obj)
    document = await service.upload_document(user.id, upload)

    chunk_result = await session.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunks = list(chunk_result.scalars().all())
    metadata = [json.loads(chunk.metadata_) for chunk in chunks]

    # A bare next() that finds nothing raises StopIteration, which a coroutine
    # re-raises as an opaque RuntimeError. Use a default and assert explicitly
    # so a missing chunk fails with a readable message.
    retrieval_paragraph = next(
        (
            item for item in metadata
            if item['section_title'] == 'Retrieval' and item['content_type'] == 'paragraph'
        ),
        None,
    )
    assert retrieval_paragraph is not None, "no paragraph chunk with section_title 'Retrieval'"

    table_schema = next(
        (item for item in metadata if item['content_type'] == 'table_schema'),
        None,
    )
    assert table_schema is not None, 'no table_schema chunk extracted from the DOCX table'

    assert retrieval_paragraph['section_path'] == ['Architecture', 'Retrieval']
    assert table_schema['headers'] == ['metric', 'value']
    assert any(item['content_type'] == 'table_rows' for item in metadata)
    assert document.normalized_format == 'structured_markdown'
    assert '# Architecture' in document.normalized_content
    assert '## Retrieval' in document.normalized_content
    assert '| metric | value |' in document.normalized_content
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_docx_dependency_is_missing(document_test_env, monkeypatch):
    """Simulate a missing ``docx`` module and expect a descriptive ``ValueError``.

    ``builtins.__import__`` is patched so that importing ``docx`` raises
    ``ModuleNotFoundError``; every other import is delegated to the real
    importer.
    """
    session, user = document_test_env
    service = DocumentService(session)
    real_import = builtins.__import__

    def import_without_docx(name, *args, **kwargs):
        if name != 'docx':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'docx'")

    monkeypatch.setattr(builtins, '__import__', import_without_docx)

    docx_upload = UploadFile(filename='missing.docx', file=BytesIO(b'fake-docx'))
    with pytest.raises(ValueError, match='DOCX 解析依赖缺失: python-docx'):
        await service.upload_document(user.id, docx_upload)
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_xlsx_dependency_is_missing(document_test_env, monkeypatch):
    """Simulate a missing ``openpyxl`` module and expect a descriptive ``ValueError``.

    ``builtins.__import__`` is patched so that importing ``openpyxl`` raises
    ``ModuleNotFoundError``; every other import is delegated to the real
    importer.
    """
    session, user = document_test_env
    service = DocumentService(session)
    real_import = builtins.__import__

    def import_without_openpyxl(name, *args, **kwargs):
        if name != 'openpyxl':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'openpyxl'")

    monkeypatch.setattr(builtins, '__import__', import_without_openpyxl)

    xlsx_upload = UploadFile(filename='missing.xlsx', file=BytesIO(b'fake-xlsx'))
    with pytest.raises(ValueError, match='XLSX 解析依赖缺失: openpyxl'):
        await service.upload_document(user.id, xlsx_upload)
@pytest.mark.asyncio
async def test_upload_document_uses_mineru_markdown_for_pdf(document_test_env, monkeypatch):
    """Upload a PDF with a stubbed ``mineru.to_markdown`` and check the result.

    A fake ``mineru`` module is placed in ``sys.modules``; the test asserts
    that its Markdown ends up in the normalized content and that heading and
    paragraph chunks are produced from it.
    """
    session, user = document_test_env
    service = DocumentService(session)

    def to_markdown(file_path):
        return '# PDF Title\n\n## Section\n\nMinerU extracted paragraph.'

    monkeypatch.setitem(sys.modules, 'mineru', types.SimpleNamespace(to_markdown=to_markdown))

    pdf_upload = UploadFile(filename='spec.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, pdf_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks_query)
    chunks = list(rows.scalars().all())
    metadata = [json.loads(stored.metadata_) for stored in chunks]

    assert document.normalized_format == 'structured_markdown'
    normalized = document.normalized_content
    assert '# PDF Title' in normalized
    assert '## Section' in normalized
    assert any(entry['content_type'] == 'heading' for entry in metadata)
    assert any(
        entry['content_type'] == 'paragraph' and entry['section_title'] == 'Section'
        for entry in metadata
    )
@pytest.mark.asyncio
async def test_upload_document_preserves_mineru_image_markdown_in_pdf(document_test_env, monkeypatch):
    """Check that image Markdown returned by the stubbed MinerU survives ingestion.

    Asserts the image link is present verbatim in the normalized content and
    that at least one chunk mentions the diagram text.
    """
    session, user = document_test_env
    service = DocumentService(session)

    def to_markdown(file_path):
        return '# PDF Title\n\n![System diagram](images/system.png)\n\nSystem diagram shows retrieval flow.'

    monkeypatch.setitem(sys.modules, 'mineru', types.SimpleNamespace(to_markdown=to_markdown))

    pdf_upload = UploadFile(filename='diagram.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, pdf_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = await session.execute(ordered_chunks_query)
    chunks = list(rows.scalars().all())

    assert '![System diagram](images/system.png)' in document.normalized_content
    assert any('System diagram' in stored.content for stored in chunks)
@pytest.mark.asyncio
async def test_get_document_content_returns_normalized_pdf_content(document_test_env, monkeypatch):
    """``get_document_content`` must return the Markdown the stubbed MinerU produced."""
    session, user = document_test_env
    service = DocumentService(session)

    def to_markdown(file_path):
        return '# PDF Title\n\nNormalized pdf body.'

    monkeypatch.setitem(sys.modules, 'mineru', types.SimpleNamespace(to_markdown=to_markdown))

    pdf_upload = UploadFile(filename='preview.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, pdf_upload)

    returned_content = await service.get_document_content(user.id, document.id)
    assert returned_content == '# PDF Title\n\nNormalized pdf body.'
@pytest.mark.asyncio
async def test_upload_document_uses_mineru_cli_do_parse_fallback_for_pdf(document_test_env, monkeypatch, tmp_path):
    """Upload a PDF when ``mineru`` has no ``to_markdown`` but exposes ``cli.common.do_parse``.

    The stubbed ``do_parse`` writes ``<name>/pipeline/<name>.md`` under the
    requested output directory; the test asserts that file's Markdown becomes
    the document's normalized content.
    """
    session, user = document_test_env
    service = DocumentService(session)

    def fake_do_parse(output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, **kwargs):
        stem = pdf_file_names[0]
        pipeline_dir = Path(output_dir) / stem / 'pipeline'
        pipeline_dir.mkdir(parents=True, exist_ok=True)
        markdown_file = pipeline_dir / f'{stem}.md'
        markdown_file.write_text('# PDF Title\n\nCLI fallback content.', encoding='utf-8')

    fake_common = types.SimpleNamespace(do_parse=fake_do_parse)
    stub_modules = (
        ('mineru', types.SimpleNamespace()),
        ('mineru.cli', types.SimpleNamespace(common=fake_common)),
        ('mineru.cli.common', fake_common),
    )
    for module_name, module_stub in stub_modules:
        monkeypatch.setitem(sys.modules, module_name, module_stub)
    monkeypatch.setattr('app.services.document_service.settings.UPLOAD_DIR', str(tmp_path / 'uploads'))

    pdf_upload = UploadFile(filename='fallback.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, pdf_upload)

    assert document.normalized_format == 'structured_markdown'
    normalized = document.normalized_content
    assert '# PDF Title' in normalized
    assert 'CLI fallback content.' in normalized
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_mineru_cli_runtime_dependency_is_missing(document_test_env, monkeypatch):
    """Expect a descriptive ``ValueError`` when the MinerU CLI path lacks ``torch``.

    The stubbed ``do_parse`` raises ``ModuleNotFoundError`` for ``torch``;
    stub ``mineru.cli`` and ``mineru.utils`` modules are registered in
    ``sys.modules`` so the service can reach that call.
    """
    session, user = document_test_env
    service = DocumentService(session)

    def fake_do_parse(*args, **kwargs):
        raise ModuleNotFoundError("No module named 'torch'")

    def fake_read_fn(path):
        return b'%PDF-1.4 fake'

    fake_common = types.SimpleNamespace(do_parse=fake_do_parse, read_fn=fake_read_fn)
    fake_enum_class = types.SimpleNamespace(
        MakeMode=types.SimpleNamespace(MM_MD='mm_markdown'),
    )
    stub_modules = (
        ('mineru', types.SimpleNamespace()),
        ('mineru.cli', types.SimpleNamespace(common=fake_common)),
        ('mineru.cli.common', fake_common),
        ('mineru.utils', types.SimpleNamespace(enum_class=fake_enum_class)),
        ('mineru.utils.enum_class', fake_enum_class),
    )
    for module_name, module_stub in stub_modules:
        monkeypatch.setitem(sys.modules, module_name, module_stub)

    pdf_upload = UploadFile(filename='runtime-missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    with pytest.raises(ValueError, match="PDF 解析依赖缺失: MinerU 运行时依赖 torch"):
        await service.upload_document(user.id, pdf_upload)
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_mineru_interface_is_unsupported(document_test_env, monkeypatch):
    """Expect a descriptive ``ValueError`` when the ``mineru`` stub exposes no attributes at all."""
    session, user = document_test_env
    service = DocumentService(session)

    # An empty namespace stands in for a mineru module with no attributes.
    empty_mineru = types.SimpleNamespace()
    monkeypatch.setitem(sys.modules, 'mineru', empty_mineru)

    pdf_upload = UploadFile(filename='unsupported.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    with pytest.raises(ValueError, match='PDF 解析失败: 当前安装的 MinerU 版本接口不兼容'):
        await service.upload_document(user.id, pdf_upload)
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_pdf_dependency_is_missing(document_test_env, monkeypatch):
    """Simulate a missing ``mineru`` module and expect a descriptive ``ValueError``.

    ``builtins.__import__`` is patched so that importing ``mineru`` raises
    ``ModuleNotFoundError``; every other import is delegated to the real
    importer.
    """
    session, user = document_test_env
    service = DocumentService(session)
    real_import = builtins.__import__

    def import_without_mineru(name, *args, **kwargs):
        if name != 'mineru':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'mineru'")

    monkeypatch.setattr(builtins, '__import__', import_without_mineru)

    pdf_upload = UploadFile(filename='missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    with pytest.raises(ValueError, match='PDF 解析依赖缺失: mineru'):
        await service.upload_document(user.id, pdf_upload)