# JARVIS/backend/tests/backend/app/services/test_document_service.py

import json
from io import BytesIO
import builtins
from pathlib import Path
import sys
import types

import pytest
from docx import Document as DocxDocument
from openpyxl import Workbook
from sqlalchemy import select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from starlette.datastructures import UploadFile

import app.models  # noqa: F401
from app.database import Base
from app.models.document import Document, DocumentChunk
from app.models.folder import Folder
from app.models.user import User
from app.services.auth_service import get_password_hash
from app.services.document_service import DocumentService


@pytest.fixture
async def document_test_env(tmp_path, monkeypatch):
    """Yield a ``(session, user)`` pair backed by a throwaway SQLite database.

    Setup creates every table registered on ``Base.metadata`` in a database
    file under ``tmp_path``, inserts one user, and patches the document
    service settings ``UPLOAD_DIR`` (to ``tmp_path / 'uploads'``),
    ``CHUNK_SIZE`` (120) and ``CHUNK_OVERLAP`` (20).

    The engine is disposed in a ``finally`` clause: pytest does not run the
    code after ``yield`` when a fixture raises before yielding, so without
    the guard a failed setup would leave the engine undisposed.
    """
    db_path = tmp_path / 'test_documents.db'
    engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True)

    try:
        session_factory = async_sessionmaker(engine, expire_on_commit=False)

        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        # Persist the user in its own session so the test session starts clean.
        async with session_factory() as session:
            user = User(
                email='doc-tester@example.com',
                hashed_password=get_password_hash('secret123'),
                full_name='Doc Tester',
            )
            session.add(user)
            await session.commit()
            await session.refresh(user)

        monkeypatch.setattr('app.services.document_service.settings.UPLOAD_DIR', str(tmp_path / 'uploads'))
        monkeypatch.setattr('app.services.document_service.settings.CHUNK_SIZE', 120)
        monkeypatch.setattr('app.services.document_service.settings.CHUNK_OVERLAP', 20)

        async with session_factory() as session:
            yield session, user
    finally:
        await engine.dispose()


@pytest.mark.asyncio
async def test_upload_document_creates_schema_and_row_chunks_for_csv(document_test_env):
    """Check the document fields and chunks produced by uploading a small CSV.

    The first chunk (ordered by ``chunk_index``) must carry ``table_schema``
    metadata listing the CSV headers, and at least one chunk must be of type
    ``table_rows``.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)
    csv_lines = (
        'region,month,revenue',
        'East,2025-01,100',
        'West,2025-01,200',
        'East,2025-02,150',
        'West,2025-02,250',
    )
    csv_bytes = '\n'.join(csv_lines).encode('utf-8')
    csv_upload = UploadFile(filename='sales.csv', file=BytesIO(csv_bytes))

    stored = await doc_service.upload_document(owner.id, csv_upload)

    # Document-level fields recorded by the upload.
    assert stored.file_type == 'csv'
    assert stored.ingestion_status == 'uploaded'
    assert stored.parser_version == 'v2'
    assert stored.index_version == 'v2'
    assert stored.chunk_count >= 2

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    query_result = await db.execute(ordered_chunks_query)
    stored_chunks = list(query_result.scalars().all())
    chunk_meta = [json.loads(row.metadata_) for row in stored_chunks]

    schema_meta = chunk_meta[0]
    assert schema_meta['content_type'] == 'table_schema'
    assert schema_meta['headers'] == ['region', 'month', 'revenue']
    assert any(meta['content_type'] == 'table_rows' for meta in chunk_meta)
    assert any('region=East' in row.content for row in stored_chunks)


@pytest.mark.asyncio
async def test_upload_document_creates_sheet_metadata_chunks_for_xlsx(document_test_env):
    """Check that a two-sheet workbook produces chunks tagged with each sheet name.

    The chunk metadata must mention both sheets and include both
    ``table_schema`` and ``table_rows`` content types.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    book = Workbook()
    revenue_sheet = book.active
    revenue_sheet.title = 'Revenue'
    for revenue_row in (
        ['region', 'quarter', 'amount'],
        ['East', 'Q1', 300],
        ['West', 'Q1', 280],
    ):
        revenue_sheet.append(revenue_row)
    detail_sheet = book.create_sheet('Detail')
    for detail_row in (['project', 'owner'], ['Jarvis', 'Ops']):
        detail_sheet.append(detail_row)

    # Serialise the workbook in memory and rewind so the upload reads from the start.
    xlsx_buffer = BytesIO()
    book.save(xlsx_buffer)
    xlsx_buffer.seek(0)
    xlsx_upload = UploadFile(filename='report.xlsx', file=xlsx_buffer)

    stored = await doc_service.upload_document(owner.id, xlsx_upload)

    assert stored.file_type == 'xlsx'
    assert stored.chunk_count >= 3

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    query_result = await db.execute(ordered_chunks_query)
    stored_chunks = list(query_result.scalars().all())
    chunk_meta = [json.loads(row.metadata_) for row in stored_chunks]

    for expected_sheet in ('Revenue', 'Detail'):
        assert any(meta['sheet_name'] == expected_sheet for meta in chunk_meta)
    for expected_type in ('table_schema', 'table_rows'):
        assert any(meta['content_type'] == expected_type for meta in chunk_meta)


@pytest.mark.asyncio
async def test_upload_document_preserves_section_metadata_for_markdown(document_test_env):
    """Check heading/section metadata recorded for a two-level Markdown file.

    The ``Overview`` heading chunk must be level 1 with no parent, and the
    paragraph under ``Retrieval`` must be level 2, parented to ``Overview``.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)
    markdown_lines = (
        '# Overview',
        'Jarvis overview paragraph.',
        '',
        '## Retrieval',
        'Hybrid retrieval favors relevant chunks.',
    )
    markdown_bytes = '\n'.join(markdown_lines).encode('utf-8')
    markdown_upload = UploadFile(filename='guide.md', file=BytesIO(markdown_bytes))

    stored = await doc_service.upload_document(owner.id, markdown_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    query_result = await db.execute(ordered_chunks_query)
    stored_chunks = list(query_result.scalars().all())
    chunk_meta = [json.loads(row.metadata_) for row in stored_chunks]

    assert any(meta['content_type'] == 'heading' for meta in chunk_meta)
    assert any(
        meta['section_path'] == ['Overview', 'Retrieval']
        for meta in chunk_meta
        if meta['content_type'] != 'heading'
    )
    assert any(meta.get('section_title') == 'Retrieval' for meta in chunk_meta)

    overview_heading = next(
        meta
        for meta in chunk_meta
        if meta['content_type'] == 'heading' and meta['section_title'] == 'Overview'
    )
    retrieval_paragraph = next(
        meta
        for meta in chunk_meta
        if meta['content_type'] == 'paragraph' and meta['section_title'] == 'Retrieval'
    )

    # Top-level heading: no parent, keyed by its own title.
    assert overview_heading['chunk_level'] == 1
    assert overview_heading['parent_key'] is None
    assert overview_heading['block_key'] == 'Overview'
    # Nested paragraph: keyed by the full section path, parented to the level-1 heading.
    assert retrieval_paragraph['chunk_level'] == 2
    assert retrieval_paragraph['parent_key'] == 'Overview'
    assert retrieval_paragraph['block_key'] == 'Overview/Retrieval'


@pytest.mark.asyncio
async def test_upload_document_rejects_unsupported_extension(document_test_env):
    """Check that uploading a ``.exe`` file raises ``ValueError``."""
    db, owner = document_test_env
    doc_service = DocumentService(db)
    exe_stream = BytesIO(b'bad')
    exe_upload = UploadFile(file=exe_stream, filename='malware.exe')

    with pytest.raises(ValueError, match='不支持的文件类型'):
        await doc_service.upload_document(owner.id, exe_upload)


@pytest.mark.asyncio
async def test_upload_document_persists_structured_metadata_json(document_test_env):
    """Check the JSON metadata of the first chunk and the stored document row.

    Uploads a plain-text file, parses the first chunk's ``metadata_`` column
    as JSON, then re-reads the ``Document`` row and checks its normalized
    content matches the uploaded text.
    """
    session, user = document_test_env
    service = DocumentService(session)
    payload = 'title\n\nplain text body for metadata storage'
    upload = UploadFile(filename='notes.txt', file=BytesIO(payload.encode('utf-8')))

    document = await service.upload_document(user.id, upload)

    chunk_result = await session.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document.id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunk = chunk_result.scalars().first()
    # ``first()`` returns None when no row matches; fail with a readable message
    # instead of an AttributeError on ``chunk.metadata_`` below.
    assert chunk is not None, 'upload_document did not persist any chunks'

    parsed = json.loads(chunk.metadata_)
    assert parsed['content_type'] == 'text'
    assert parsed['parser_version'] == 'v2'
    assert parsed['index_version'] == 'v2'
    assert parsed['source_order'] == 0

    document_result = await session.execute(select(Document).where(Document.id == document.id))
    stored_document = document_result.scalar_one()
    assert stored_document.ingestion_status == 'uploaded'
    assert stored_document.normalized_format == 'structured_markdown'
    # Compare against the uploaded text itself rather than a duplicated literal.
    assert stored_document.normalized_content == payload


@pytest.mark.asyncio
async def test_upload_document_stores_file_in_nested_folder_with_original_name(document_test_env):
    """Check the on-disk location of a file uploaded into a nested folder.

    With a ``Projects/Specs`` folder hierarchy, the stored path must end in
    ``Projects/Specs/system-design.md`` and the file must exist.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    projects_folder = Folder(user_id=owner.id, name='Projects')
    db.add(projects_folder)
    # Flush so the parent row has an id before the child references it.
    await db.flush()
    specs_folder = Folder(user_id=owner.id, name='Specs', parent_id=projects_folder.id)
    db.add(specs_folder)
    await db.commit()
    await db.refresh(specs_folder)

    markdown_upload = UploadFile(filename='system-design.md', file=BytesIO(b'# Design'))
    stored = await doc_service.upload_document(owner.id, markdown_upload, folder_id=specs_folder.id)

    stored_path = Path(stored.file_path)
    specs_dir = stored_path.parent
    projects_dir = specs_dir.parent
    assert stored_path.name == 'system-design.md'
    assert specs_dir.name == 'Specs'
    assert projects_dir.name == 'Projects'
    assert stored_path.exists()


@pytest.mark.asyncio
async def test_upload_document_extracts_docx_heading_and_table_structure(document_test_env):
    """Check section and table metadata extracted from a generated DOCX file.

    The document has two heading levels, a paragraph under each, and a 2x2
    table; the normalized content must contain the Markdown headings and the
    table header row.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    source_doc = DocxDocument()
    source_doc.add_heading('Architecture', level=1)
    source_doc.add_paragraph('System overview paragraph.')
    source_doc.add_heading('Retrieval', level=2)
    source_doc.add_paragraph('Section-aware retrieval paragraph.')
    metrics_table = source_doc.add_table(rows=2, cols=2)
    cell_texts = (
        ('metric', 'value'),
        ('latency', '120ms'),
    )
    for row_idx, row_texts in enumerate(cell_texts):
        for col_idx, text in enumerate(row_texts):
            metrics_table.rows[row_idx].cells[col_idx].text = text

    # Serialise the document in memory and rewind so the upload reads from the start.
    docx_buffer = BytesIO()
    source_doc.save(docx_buffer)
    docx_buffer.seek(0)
    docx_upload = UploadFile(filename='architecture.docx', file=docx_buffer)

    stored = await doc_service.upload_document(owner.id, docx_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    query_result = await db.execute(ordered_chunks_query)
    stored_chunks = list(query_result.scalars().all())
    chunk_meta = [json.loads(row.metadata_) for row in stored_chunks]

    retrieval_meta = next(
        meta
        for meta in chunk_meta
        if meta['section_title'] == 'Retrieval' and meta['content_type'] == 'paragraph'
    )
    schema_meta = next(meta for meta in chunk_meta if meta['content_type'] == 'table_schema')

    assert retrieval_meta['section_path'] == ['Architecture', 'Retrieval']
    assert schema_meta['headers'] == ['metric', 'value']
    assert any(meta['content_type'] == 'table_rows' for meta in chunk_meta)
    assert stored.normalized_format == 'structured_markdown'
    normalized = stored.normalized_content
    assert '# Architecture' in normalized
    assert '## Retrieval' in normalized
    assert '| metric | value |' in normalized


@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_docx_dependency_is_missing(document_test_env, monkeypatch):
    """Check the error raised for a ``.docx`` upload when ``import docx`` fails.

    ``builtins.__import__`` is replaced so that importing ``docx`` raises
    ``ModuleNotFoundError``; every other import is delegated to the real hook.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    real_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        if name != 'docx':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'docx'")

    monkeypatch.setattr(builtins, '__import__', blocking_import)

    docx_upload = UploadFile(filename='missing.docx', file=BytesIO(b'fake-docx'))

    with pytest.raises(ValueError, match='DOCX 解析依赖缺失: python-docx'):
        await doc_service.upload_document(owner.id, docx_upload)


@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_xlsx_dependency_is_missing(document_test_env, monkeypatch):
    """Check the error raised for a ``.xlsx`` upload when ``import openpyxl`` fails.

    ``builtins.__import__`` is replaced so that importing ``openpyxl`` raises
    ``ModuleNotFoundError``; every other import is delegated to the real hook.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    real_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        if name != 'openpyxl':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'openpyxl'")

    monkeypatch.setattr(builtins, '__import__', blocking_import)

    xlsx_upload = UploadFile(filename='missing.xlsx', file=BytesIO(b'fake-xlsx'))

    with pytest.raises(ValueError, match='XLSX 解析依赖缺失: openpyxl'):
        await doc_service.upload_document(owner.id, xlsx_upload)


@pytest.mark.asyncio
async def test_upload_document_uses_mineru_markdown_for_pdf(document_test_env, monkeypatch):
    """Check that Markdown returned by a stubbed ``mineru.to_markdown`` is ingested.

    A stand-in ``mineru`` module is registered in ``sys.modules``; its
    headings and paragraph must show up in the normalized content and in the
    chunk metadata.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    def to_markdown(file_path):
        return '# PDF Title\n\n## Section\n\nMinerU extracted paragraph.'

    mineru_stub = types.SimpleNamespace(to_markdown=to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)

    pdf_upload = UploadFile(filename='spec.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await doc_service.upload_document(owner.id, pdf_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    query_result = await db.execute(ordered_chunks_query)
    stored_chunks = list(query_result.scalars().all())
    chunk_meta = [json.loads(row.metadata_) for row in stored_chunks]

    assert stored.normalized_format == 'structured_markdown'
    assert '# PDF Title' in stored.normalized_content
    assert '## Section' in stored.normalized_content
    assert any(meta['content_type'] == 'heading' for meta in chunk_meta)
    assert any(
        meta['content_type'] == 'paragraph' and meta['section_title'] == 'Section'
        for meta in chunk_meta
    )


@pytest.mark.asyncio
async def test_upload_document_preserves_mineru_image_markdown_in_pdf(document_test_env, monkeypatch):
    """Check that an image link in the stubbed MinerU Markdown survives ingestion.

    The image Markdown must appear verbatim in the normalized content, and at
    least one chunk must mention ``System diagram``.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    def to_markdown(file_path):
        return '# PDF Title\n\n![System diagram](images/system.png)\n\nSystem diagram shows retrieval flow.'

    mineru_stub = types.SimpleNamespace(to_markdown=to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)

    pdf_upload = UploadFile(filename='diagram.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await doc_service.upload_document(owner.id, pdf_upload)

    ordered_chunks_query = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == stored.id)
        .order_by(DocumentChunk.chunk_index)
    )
    query_result = await db.execute(ordered_chunks_query)
    stored_chunks = list(query_result.scalars().all())

    assert '![System diagram](images/system.png)' in stored.normalized_content
    assert any('System diagram' in row.content for row in stored_chunks)


@pytest.mark.asyncio
async def test_get_document_content_returns_normalized_pdf_content(document_test_env, monkeypatch):
    """Check that ``get_document_content`` returns the stubbed MinerU Markdown for a PDF."""
    db, owner = document_test_env
    doc_service = DocumentService(db)

    def to_markdown(file_path):
        return '# PDF Title\n\nNormalized pdf body.'

    mineru_stub = types.SimpleNamespace(to_markdown=to_markdown)
    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)

    pdf_upload = UploadFile(filename='preview.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await doc_service.upload_document(owner.id, pdf_upload)

    returned_content = await doc_service.get_document_content(owner.id, stored.id)

    assert returned_content == '# PDF Title\n\nNormalized pdf body.'


@pytest.mark.asyncio
async def test_upload_document_uses_mineru_cli_do_parse_fallback_for_pdf(document_test_env, monkeypatch, tmp_path):
    """Check that Markdown written by a stubbed ``mineru.cli.common.do_parse`` is ingested.

    The stub ``mineru`` module has no attributes; the stub ``do_parse``
    writes ``<output_dir>/<name>/pipeline/<name>.md``, and that file's content
    must appear in the document's normalized content.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    def fake_do_parse(output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, **kwargs):
        pdf_name = pdf_file_names[0]
        pipeline_dir = Path(output_dir) / pdf_name / 'pipeline'
        pipeline_dir.mkdir(parents=True, exist_ok=True)
        markdown_file = pipeline_dir / f'{pdf_name}.md'
        markdown_file.write_text('# PDF Title\n\nCLI fallback content.', encoding='utf-8')

    mineru_stub = types.SimpleNamespace()
    common_stub = types.SimpleNamespace(do_parse=fake_do_parse)
    cli_stub = types.SimpleNamespace(common=common_stub)

    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)
    monkeypatch.setitem(sys.modules, 'mineru.cli', cli_stub)
    monkeypatch.setitem(sys.modules, 'mineru.cli.common', common_stub)
    upload_root = str(tmp_path / 'uploads')
    monkeypatch.setattr('app.services.document_service.settings.UPLOAD_DIR', upload_root)

    pdf_upload = UploadFile(filename='fallback.pdf', file=BytesIO(b'%PDF-1.4 fake'))
    stored = await doc_service.upload_document(owner.id, pdf_upload)

    assert stored.normalized_format == 'structured_markdown'
    assert '# PDF Title' in stored.normalized_content
    assert 'CLI fallback content.' in stored.normalized_content


@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_mineru_cli_runtime_dependency_is_missing(document_test_env, monkeypatch):
    """Check the error raised when the stubbed ``do_parse`` fails on a missing module.

    The stub ``do_parse`` raises ``ModuleNotFoundError`` for ``torch``; the
    upload must surface a ``ValueError`` whose message names that dependency.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    def fake_do_parse(*args, **kwargs):
        raise ModuleNotFoundError("No module named 'torch'")

    def fake_read_fn(path):
        return b'%PDF-1.4 fake'

    mineru_stub = types.SimpleNamespace()
    common_stub = types.SimpleNamespace(do_parse=fake_do_parse, read_fn=fake_read_fn)
    make_mode_stub = types.SimpleNamespace(MM_MD='mm_markdown')
    enum_class_stub = types.SimpleNamespace(MakeMode=make_mode_stub)

    # Register the stub package and the submodules set up for this scenario.
    monkeypatch.setitem(sys.modules, 'mineru', mineru_stub)
    monkeypatch.setitem(sys.modules, 'mineru.cli', types.SimpleNamespace(common=common_stub))
    monkeypatch.setitem(sys.modules, 'mineru.cli.common', common_stub)
    monkeypatch.setitem(sys.modules, 'mineru.utils', types.SimpleNamespace(enum_class=enum_class_stub))
    monkeypatch.setitem(sys.modules, 'mineru.utils.enum_class', enum_class_stub)

    pdf_upload = UploadFile(filename='runtime-missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match="PDF 解析依赖缺失: MinerU 运行时依赖 torch"):
        await doc_service.upload_document(owner.id, pdf_upload)


@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_mineru_interface_is_unsupported(document_test_env, monkeypatch):
    """Check the error raised when the registered ``mineru`` stub has no attributes."""
    db, owner = document_test_env
    doc_service = DocumentService(db)

    # An empty namespace stands in for the ``mineru`` module.
    empty_mineru = types.SimpleNamespace()
    monkeypatch.setitem(sys.modules, 'mineru', empty_mineru)

    pdf_stream = BytesIO(b'%PDF-1.4 fake')
    pdf_upload = UploadFile(filename='unsupported.pdf', file=pdf_stream)

    with pytest.raises(ValueError, match='PDF 解析失败: 当前安装的 MinerU 版本接口不兼容'):
        await doc_service.upload_document(owner.id, pdf_upload)


@pytest.mark.asyncio
async def test_upload_document_raises_clear_error_when_pdf_dependency_is_missing(document_test_env, monkeypatch):
    """Check the error raised for a ``.pdf`` upload when ``import mineru`` fails.

    ``builtins.__import__`` is replaced so that importing ``mineru`` raises
    ``ModuleNotFoundError``; every other import is delegated to the real hook.
    """
    db, owner = document_test_env
    doc_service = DocumentService(db)

    real_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        if name != 'mineru':
            return real_import(name, *args, **kwargs)
        raise ModuleNotFoundError("No module named 'mineru'")

    monkeypatch.setattr(builtins, '__import__', blocking_import)

    pdf_upload = UploadFile(filename='missing.pdf', file=BytesIO(b'%PDF-1.4 fake'))

    with pytest.raises(ValueError, match='PDF 解析依赖缺失: mineru'):
        await doc_service.upload_document(owner.id, pdf_upload)