Knowledge files were only partitioned in the database, which made nested uploads, local folder visibility, and delete behavior diverge from the UI. This change makes folder selection drive physical storage paths, keeps original filenames, adds a minimal WebDAV mount/sync path, and reshapes the knowledge panel so local and remote sources can share the same surface. Constraint: Existing knowledge flow already depends on local-folder-backed uploads and document indexing Rejected: Real-time bidirectional WebDAV sync | too much conflict and lifecycle complexity for the first pass Confidence: medium Scope-risk: moderate Reversibility: messy Directive: Keep remote mounts single-direction into local knowledge folders until etag-based incremental sync and conflict rules are verified Tested: Python py_compile on new/modified backend files; LSP diagnostics on new frontend/backend files; manual targeted code-path inspection Not-tested: Full pytest/vitest end-to-end runs blocked by environment temp/cache permission errors; live WebDAV server interoperability
464 lines
18 KiB
Python
464 lines
18 KiB
Python
import json
|
|
from io import BytesIO
|
|
import builtins
|
|
from pathlib import Path
|
|
import sys
|
|
import types
|
|
|
|
import pytest
|
|
from docx import Document as DocxDocument
|
|
from openpyxl import Workbook
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
|
from starlette.datastructures import UploadFile
|
|
|
|
import app.models # noqa: F401
|
|
from app.database import Base
|
|
from app.models.document import Document, DocumentChunk
|
|
from app.models.folder import Folder
|
|
from app.models.user import User
|
|
from app.services.auth_service import get_password_hash
|
|
from app.services.document_service import DocumentService
|
|
|
|
|
|
@pytest.fixture
async def document_test_env(tmp_path, monkeypatch):
    """Provide a database session and a persisted user for document tests.

    Builds a SQLite database file under ``tmp_path``, creates every table
    registered on ``Base.metadata``, stores one user, and overrides the
    document-service settings ``UPLOAD_DIR``, ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` with test values.

    Yields:
        A ``(session, user)`` tuple. The engine is disposed once the
        consuming test has finished.
    """
    sqlite_file = tmp_path / 'test_documents.db'
    db_engine = create_async_engine(f"sqlite+aiosqlite:///{sqlite_file}", future=True)
    make_session = async_sessionmaker(db_engine, expire_on_commit=False)

    async with db_engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)

    # Persist the user in a short-lived session so the yielded session
    # starts out clean.
    tester = User(
        email='doc-tester@example.com',
        hashed_password=get_password_hash('secret123'),
        full_name='Doc Tester',
    )
    async with make_session() as setup_session:
        setup_session.add(tester)
        await setup_session.commit()
        await setup_session.refresh(tester)

    setting_overrides = (
        ('app.services.document_service.settings.UPLOAD_DIR', str(tmp_path / 'uploads')),
        ('app.services.document_service.settings.CHUNK_SIZE', 120),
        ('app.services.document_service.settings.CHUNK_OVERLAP', 20),
    )
    for target, value in setting_overrides:
        monkeypatch.setattr(target, value)

    async with make_session() as test_session:
        yield test_session, tester

    await db_engine.dispose()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_creates_schema_and_row_chunks_for_csv(document_test_env):
    """A CSV upload produces a leading schema chunk plus row chunks."""
    db, owner = document_test_env
    csv_lines = [
        'region,month,revenue',
        'East,2025-01,100',
        'West,2025-01,200',
        'East,2025-02,150',
        'West,2025-02,250',
    ]
    csv_bytes = '\n'.join(csv_lines).encode('utf-8')
    csv_upload = UploadFile(filename='sales.csv', file=BytesIO(csv_bytes))

    stored = await DocumentService(db).upload_document(owner.id, csv_upload)

    assert stored.file_type == 'csv'
    assert stored.ingestion_status == 'uploaded'
    assert stored.parser_version == 'v2'
    assert stored.index_version == 'v2'
    assert stored.chunk_count >= 2

    # Read the chunks back in index order so position 0 is the first chunk.
    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = list((await db.execute(ordered_chunks)).scalars().all())
    meta = [json.loads(row.metadata_) for row in rows]

    first = meta[0]
    assert first['content_type'] == 'table_schema'
    assert first['headers'] == ['region', 'month', 'revenue']
    assert 'table_rows' in (entry['content_type'] for entry in meta)
    assert any('region=East' in row.content for row in rows)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_creates_sheet_metadata_chunks_for_xlsx(document_test_env):
    """An XLSX upload records the sheet name and table content types per chunk."""
    db, owner = document_test_env
    uploader = DocumentService(db)

    # Two sheets: the default one renamed to "Revenue" and an extra "Detail".
    book = Workbook()
    revenue_sheet = book.active
    revenue_sheet.title = 'Revenue'
    for row in (['region', 'quarter', 'amount'], ['East', 'Q1', 300], ['West', 'Q1', 280]):
        revenue_sheet.append(row)
    detail_sheet = book.create_sheet('Detail')
    for row in (['project', 'owner'], ['Jarvis', 'Ops']):
        detail_sheet.append(row)

    buffer = BytesIO()
    book.save(buffer)
    buffer.seek(0)  # rewind so the upload reads from the beginning
    xlsx_upload = UploadFile(filename='report.xlsx', file=buffer)

    stored = await uploader.upload_document(owner.id, xlsx_upload)

    assert stored.file_type == 'xlsx'
    assert stored.chunk_count >= 3

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = list((await db.execute(ordered_chunks)).scalars().all())
    meta = [json.loads(row.metadata_) for row in rows]

    assert 'Revenue' in (entry['sheet_name'] for entry in meta)
    assert 'Detail' in (entry['sheet_name'] for entry in meta)
    assert 'table_schema' in (entry['content_type'] for entry in meta)
    assert 'table_rows' in (entry['content_type'] for entry in meta)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_preserves_section_metadata_for_markdown(document_test_env):
    """A Markdown upload keeps heading hierarchy in the chunk metadata.

    Checks the section path and title, plus the ``chunk_level``,
    ``parent_key`` and ``block_key`` of a level-1 heading and of a
    paragraph under a level-2 heading.
    """
    db, owner = document_test_env
    markdown_lines = [
        '# Overview',
        'Jarvis overview paragraph.',
        '',
        '## Retrieval',
        'Hybrid retrieval favors relevant chunks.',
    ]
    markdown_bytes = '\n'.join(markdown_lines).encode('utf-8')
    md_upload = UploadFile(filename='guide.md', file=BytesIO(markdown_bytes))

    stored = await DocumentService(db).upload_document(owner.id, md_upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = list((await db.execute(ordered_chunks)).scalars().all())
    meta = [json.loads(row.metadata_) for row in rows]

    assert 'heading' in (entry['content_type'] for entry in meta)
    body_entries = (entry for entry in meta if entry['content_type'] != 'heading')
    assert any(entry['section_path'] == ['Overview', 'Retrieval'] for entry in body_entries)
    assert 'Retrieval' in (entry.get('section_title') for entry in meta)

    overview_heading = next(
        entry
        for entry in meta
        if entry['content_type'] == 'heading' and entry['section_title'] == 'Overview'
    )
    retrieval_paragraph = next(
        entry
        for entry in meta
        if entry['content_type'] == 'paragraph' and entry['section_title'] == 'Retrieval'
    )

    # The top-level heading has no parent and is keyed by its own title.
    assert overview_heading['chunk_level'] == 1
    assert overview_heading['parent_key'] is None
    assert overview_heading['block_key'] == 'Overview'

    # The nested paragraph hangs off "Overview" and is keyed by its full path.
    assert retrieval_paragraph['chunk_level'] == 2
    assert retrieval_paragraph['parent_key'] == 'Overview'
    assert retrieval_paragraph['block_key'] == 'Overview/Retrieval'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_rejects_unsupported_extension(document_test_env):
    """Uploading a ``.exe`` file raises ``ValueError`` with the unsupported-type message."""
    db, owner = document_test_env
    rejected_upload = UploadFile(filename='malware.exe', file=BytesIO(b'bad'))
    uploader = DocumentService(db)

    with pytest.raises(ValueError, match='不支持的文件类型'):
        await uploader.upload_document(owner.id, rejected_upload)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_persists_structured_metadata_json(document_test_env):
    """A plain-text upload stores JSON chunk metadata and normalized content.

    Inspects the first chunk's metadata, then re-reads the document row to
    check its ingestion status, normalized format and normalized content.
    """
    db, owner = document_test_env
    raw_text = 'title\n\nplain text body for metadata storage'
    txt_upload = UploadFile(filename='notes.txt', file=BytesIO(raw_text.encode('utf-8')))

    stored = await DocumentService(db).upload_document(owner.id, txt_upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    first_chunk = (await db.execute(ordered_chunks)).scalars().first()
    first_meta = json.loads(first_chunk.metadata_)

    assert first_meta['content_type'] == 'text'
    assert first_meta['parser_version'] == 'v2'
    assert first_meta['index_version'] == 'v2'
    assert first_meta['source_order'] == 0

    # Query the row again instead of trusting the object returned by the upload.
    by_id = select(Document).where(Document.id == stored.id)
    reloaded = (await db.execute(by_id)).scalar_one()

    assert reloaded.ingestion_status == 'uploaded'
    assert reloaded.normalized_format == 'structured_markdown'
    assert reloaded.normalized_content == raw_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_stores_file_in_nested_folder_with_original_name(document_test_env):
    """Uploading into a nested folder writes the file under matching directories.

    The stored path must end in ``Projects/Specs/system-design.md`` and the
    file must exist on disk.
    """
    db, owner = document_test_env
    uploader = DocumentService(db)

    parent_folder = Folder(user_id=owner.id, name='Projects')
    db.add(parent_folder)
    # Flush so the parent has an id before the child references it.
    await db.flush()

    nested_folder = Folder(user_id=owner.id, name='Specs', parent_id=parent_folder.id)
    db.add(nested_folder)
    await db.commit()
    await db.refresh(nested_folder)

    md_upload = UploadFile(filename='system-design.md', file=BytesIO(b'# Design'))
    stored = await uploader.upload_document(owner.id, md_upload, folder_id=nested_folder.id)

    stored_path = Path(stored.file_path)
    specs_dir = stored_path.parent
    projects_dir = specs_dir.parent

    assert stored_path.name == 'system-design.md'
    assert specs_dir.name == 'Specs'
    assert projects_dir.name == 'Projects'
    assert stored_path.exists()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_extracts_docx_heading_and_table_structure(document_test_env):
    """A DOCX upload keeps heading paths and table structure.

    The generated document has two heading levels, two paragraphs and a
    2x2 table; the test checks both the chunk metadata and the normalized
    Markdown stored on the document.
    """
    db, owner = document_test_env
    uploader = DocumentService(db)

    word_doc = DocxDocument()
    word_doc.add_heading('Architecture', level=1)
    word_doc.add_paragraph('System overview paragraph.')
    word_doc.add_heading('Retrieval', level=2)
    word_doc.add_paragraph('Section-aware retrieval paragraph.')

    grid = word_doc.add_table(rows=2, cols=2)
    cell_texts = (('metric', 'value'), ('latency', '120ms'))
    for row_index, row_texts in enumerate(cell_texts):
        for col_index, text in enumerate(row_texts):
            grid.rows[row_index].cells[col_index].text = text

    buffer = BytesIO()
    word_doc.save(buffer)
    buffer.seek(0)  # rewind so the upload reads from the beginning
    docx_upload = UploadFile(filename='architecture.docx', file=buffer)

    stored = await uploader.upload_document(owner.id, docx_upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = list((await db.execute(ordered_chunks)).scalars().all())
    meta = [json.loads(row.metadata_) for row in rows]

    retrieval_entry = next(
        entry
        for entry in meta
        if entry['section_title'] == 'Retrieval' and entry['content_type'] == 'paragraph'
    )
    schema_entry = next(entry for entry in meta if entry['content_type'] == 'table_schema')

    assert retrieval_entry['section_path'] == ['Architecture', 'Retrieval']
    assert schema_entry['headers'] == ['metric', 'value']
    assert 'table_rows' in (entry['content_type'] for entry in meta)

    normalized = stored.normalized_content
    assert stored.normalized_format == 'structured_markdown'
    assert '# Architecture' in normalized
    assert '## Retrieval' in normalized
    assert '| metric | value |' in normalized
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_docx_dependency_is_missing(document_test_env, monkeypatch):
    """A DOCX upload raises a descriptive ``ValueError`` when ``docx`` cannot be imported."""
    db, owner = document_test_env
    uploader = DocumentService(db)

    real_import = builtins.__import__

    def import_without_docx(name, *args, **kwargs):
        # Fail only for "docx"; every other import goes to the real importer.
        if name != 'docx':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'docx'")

    monkeypatch.setattr(builtins, '__import__', import_without_docx)

    docx_upload = UploadFile(filename='missing.docx', file=BytesIO(b'fake-docx'))

    with pytest.raises(ValueError, match='DOCX 解析依赖缺失: python-docx'):
        await uploader.upload_document(owner.id, docx_upload)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_xlsx_dependency_is_missing(document_test_env, monkeypatch):
    """An XLSX upload raises a descriptive ``ValueError`` when ``openpyxl`` cannot be imported."""
    db, owner = document_test_env
    uploader = DocumentService(db)

    real_import = builtins.__import__

    def import_without_openpyxl(name, *args, **kwargs):
        # Fail only for "openpyxl"; every other import goes to the real importer.
        if name != 'openpyxl':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'openpyxl'")

    monkeypatch.setattr(builtins, '__import__', import_without_openpyxl)

    xlsx_upload = UploadFile(filename='missing.xlsx', file=BytesIO(b'fake-xlsx'))

    with pytest.raises(ValueError, match='XLSX 解析依赖缺失: openpyxl'):
        await uploader.upload_document(owner.id, xlsx_upload)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_uses_mineru_markdown_for_pdf(document_test_env, monkeypatch):
    """A PDF upload is normalized from the Markdown returned by ``mineru.to_markdown``.

    A stub ``mineru`` module is placed in ``sys.modules`` so no real PDF
    parsing happens.
    """
    db, owner = document_test_env
    uploader = DocumentService(db)

    def stub_to_markdown(file_path):
        return '# PDF Title\n\n## Section\n\nMinerU extracted paragraph.'

    mineru_stub = types.SimpleNamespace(to_markdown=stub_to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)

    pdf_upload = UploadFile(filename='spec.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await uploader.upload_document(owner.id, pdf_upload)

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    rows = list((await db.execute(ordered_chunks)).scalars().all())
    meta = [json.loads(row.metadata_) for row in rows]

    normalized = stored.normalized_content
    assert stored.normalized_format == 'structured_markdown'
    assert '# PDF Title' in normalized
    assert '## Section' in normalized
    assert 'heading' in (entry['content_type'] for entry in meta)
    assert any(
        entry['content_type'] == 'paragraph' and entry['section_title'] == 'Section'
        for entry in meta
    )
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_preserves_mineru_image_markdown_in_pdf(document_test_env, monkeypatch):
    """Image markdown emitted by MinerU survives into the normalized content.

    A stub ``mineru`` module returns Markdown containing an image reference
    between a heading and a paragraph. The test checks that the image
    markup is still present in ``normalized_content`` and that the
    paragraph text reached at least one chunk.
    """
    session, user = document_test_env
    service = DocumentService(session)

    image_markdown = '![diagram](images/diagram.png)'
    pdf_markdown = f'# PDF Title\n\n{image_markdown}\n\nSystem diagram shows retrieval flow.'

    fake_mineru = types.SimpleNamespace(to_markdown=lambda file_path: pdf_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', fake_mineru)

    upload = UploadFile(filename='diagram.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    document = await service.upload_document(user.id, upload)

    chunk_result = await session.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunks = list(chunk_result.scalars().all())

    # Assert on the concrete image markup: an empty-string membership check
    # is true for every string and would never catch a dropped image.
    assert image_markdown in document.normalized_content
    assert any('System diagram' in chunk.content for chunk in chunks)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_document_content_returns_normalized_pdf_content(document_test_env, monkeypatch):
    """``get_document_content`` returns the Markdown produced for an uploaded PDF."""
    db, owner = document_test_env
    uploader = DocumentService(db)

    def stub_to_markdown(file_path):
        return '# PDF Title\n\nNormalized pdf body.'

    monkeypatch.setitem(sys.modules, 'mineru', types.SimpleNamespace(to_markdown=stub_to_markdown))

    pdf_upload = UploadFile(filename='preview.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await uploader.upload_document(owner.id, pdf_upload)

    fetched = await uploader.get_document_content(owner.id, stored.id)

    assert fetched == '# PDF Title\n\nNormalized pdf body.'
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_uses_mineru_cli_do_parse_fallback_for_pdf(document_test_env, monkeypatch, tmp_path):
    """A PDF upload is normalized from the file written by ``mineru.cli.common.do_parse``.

    The stub ``mineru`` module has no ``to_markdown`` attribute. The stub
    ``do_parse`` writes ``<output_dir>/<name>/pipeline/<name>.md``, and the
    test expects that file's content in the normalized document.
    """
    db, owner = document_test_env
    uploader = DocumentService(db)

    def stub_do_parse(output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, **kwargs):
        stem = pdf_file_names[0]
        pipeline_dir = Path(output_dir) / stem / 'pipeline'
        pipeline_dir.mkdir(parents=True, exist_ok=True)
        markdown_file = pipeline_dir / f'{stem}.md'
        markdown_file.write_text('# PDF Title\n\nCLI fallback content.', encoding='utf-8')

    mineru_stub = types.SimpleNamespace()
    common_stub = types.SimpleNamespace()
    common_stub.do_parse = stub_do_parse

    # Register the package and both submodule paths so any import form resolves.
    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)
    monkeypatch.setitem(sys.modules, 'mineru.cli', types.SimpleNamespace(common=common_stub))
    monkeypatch.setitem(sys.modules, 'mineru.cli.common', common_stub)
    monkeypatch.setattr('app.services.document_service.settings.UPLOAD_DIR', str(tmp_path / 'uploads'))

    pdf_upload = UploadFile(filename='fallback.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await uploader.upload_document(owner.id, pdf_upload)

    normalized = stored.normalized_content
    assert stored.normalized_format == 'structured_markdown'
    assert '# PDF Title' in normalized
    assert 'CLI fallback content.' in normalized
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_mineru_cli_runtime_dependency_is_missing(document_test_env, monkeypatch):
    """A PDF upload raises a descriptive ``ValueError`` when ``do_parse`` lacks ``torch``.

    The stub ``do_parse`` raises ``ModuleNotFoundError`` for ``torch``; the
    test expects the service to surface it as a ``ValueError`` naming the
    missing MinerU runtime dependency.
    """
    db, owner = document_test_env
    uploader = DocumentService(db)

    def stub_do_parse(*args, **kwargs):
        raise ModuleNotFoundError("No module named 'torch'")

    mineru_stub = types.SimpleNamespace()
    common_stub = types.SimpleNamespace()
    make_mode_stub = types.SimpleNamespace(MM_MD='mm_markdown')
    enum_class_stub = types.SimpleNamespace(MakeMode=make_mode_stub)

    common_stub.do_parse = stub_do_parse
    common_stub.read_fn = lambda path: b'%PDF-1.4 fake'

    # Register the package and each submodule path so any import form resolves.
    stub_modules = (
        ('mineru', mineru_stub),
        ('mineru.cli', types.SimpleNamespace(common=common_stub)),
        ('mineru.cli.common', common_stub),
        ('mineru.utils', types.SimpleNamespace(enum_class=enum_class_stub)),
        ('mineru.utils.enum_class', enum_class_stub),
    )
    for module_name, module_stub in stub_modules:
        monkeypatch.setitem(sys.modules, module_name, module_stub)

    pdf_upload = UploadFile(filename='runtime-missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match="PDF 解析依赖缺失: MinerU 运行时依赖 torch"):
        await uploader.upload_document(owner.id, pdf_upload)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_mineru_interface_is_unsupported(document_test_env, monkeypatch):
    """A PDF upload raises a descriptive ``ValueError`` when the ``mineru`` stub is empty.

    The stub registered under ``mineru`` exposes no attributes at all.
    """
    db, owner = document_test_env
    uploader = DocumentService(db)

    empty_mineru = types.SimpleNamespace()
    monkeypatch.setitem(sys.modules, 'mineru', empty_mineru)

    pdf_upload = UploadFile(filename='unsupported.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match='PDF 解析失败: 当前安装的 MinerU 版本接口不兼容'):
        await uploader.upload_document(owner.id, pdf_upload)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_pdf_dependency_is_missing(document_test_env, monkeypatch):
    """A PDF upload raises a descriptive ``ValueError`` when ``mineru`` cannot be imported."""
    db, owner = document_test_env
    uploader = DocumentService(db)

    real_import = builtins.__import__

    def import_without_mineru(name, *args, **kwargs):
        # Fail only for "mineru"; every other import goes to the real importer.
        if name != 'mineru':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'mineru'")

    monkeypatch.setattr(builtins, '__import__', import_without_mineru)

    pdf_upload = UploadFile(filename='missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match='PDF 解析依赖缺失: mineru'):
        await uploader.upload_document(owner.id, pdf_upload)
|