Add MinerU document ingestion support

Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces the deprecated `datetime.utcnow()` timestamp helper with timezone-aware `datetime.now(UTC)` in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions
--- a/backend/app/routers/document.py
+++ b/backend/app/routers/document.py
@@ -8,12 +8,13 @@ from app.models.user import User
 from app.routers.auth import get_current_user
 from app.services.document_service import DocumentService
 from app.services.knowledge_service import KnowledgeService
+from app.schemas.document import DocumentChunkOut, DocumentChunkUpdate, DocumentOut
 from dataclasses import asdict

 router = APIRouter(prefix="/api/documents", tags=["知识库"])


-@router.get("", response_model=list)
+@router.get("", response_model=list[DocumentOut])
 async def list_documents(
    folder_id: Optional[str] = None,
    current_user: User = Depends(get_current_user),
@@ -36,7 +37,10 @@ async def upload_document(
 ):
    """上传文档，自动分块并向量化"""
    doc_svc = DocumentService(db)
-    doc = await doc_svc.upload_document(current_user.id, file, folder_id=folder_id)
+    try:
+        doc = await doc_svc.upload_document(current_user.id, file, folder_id=folder_id)
+    except ValueError as error:
+        raise HTTPException(status_code=400, detail=str(error)) from error

    # 后台索引到 ChromaDB
    def index_task():
@@ -73,7 +77,7 @@ async def get_document(
    return doc


-@router.get("/{document_id}/chunks")
+@router.get("/{document_id}/chunks", response_model=list[DocumentChunkOut])
 async def get_document_chunks(
    document_id: str,
    current_user: User = Depends(get_current_user),
@@ -98,6 +102,33 @@ async def get_document_chunks(
    return chunks_result.scalars().all()


+@router.put("/{document_id}/chunks/{chunk_id}", response_model=DocumentChunkOut)
+async def update_document_chunk(
+    document_id: str,
+    chunk_id: str,
+    payload: DocumentChunkUpdate,
+    current_user: User = Depends(get_current_user),
+    db: AsyncSession = Depends(get_db),
+):
+    doc_svc = DocumentService(db)
+    kb_svc = KnowledgeService(db, user_id=current_user.id)
+
+    try:
+        chunk = await doc_svc.update_document_chunk(current_user.id, document_id, chunk_id, payload.content)
+    except ValueError as error:
+        raise HTTPException(status_code=404, detail=str(error)) from error
+
+    reindexed = await kb_svc.reindex_document_chunks(document_id, current_user.id)
+    if not reindexed:
+        raise HTTPException(status_code=500, detail="切片更新后重新索引失败")
+
+    refreshed_chunk_result = await db.execute(
+        select(DocumentChunk).where(DocumentChunk.id == chunk.id)
+    )
+    refreshed_chunk = refreshed_chunk_result.scalar_one()
+    return refreshed_chunk
+
+
 @router.delete("/{document_id}", status_code=204)
 async def delete_document(
    document_id: str,
@@ -129,7 +160,7 @@ async def search_documents(
    if mode == "keyword":
        results = await kb_svc._keyword_search(query, current_user.id, top_k)
    elif mode == "semantic":
-        results = await kb_svc.retrieve(query, current_user.id, top_k, use_rerank=True)
+        results = await kb_svc.retrieve(query, current_user.id, top_k=top_k, use_rerank=True)
    else:
        results = await kb_svc.hybrid_search(query, current_user.id, top_k)

--- a/backend/app/routers/task.py
+++ b/backend/app/routers/task.py
@@ -64,8 +64,8 @@ async def update_task(
        if field == "tags":
            setattr(task, field, json.dumps(value))
        elif field == "status" and value == TaskStatus.DONE:
-            from datetime import datetime
-            task.completed_at = datetime.utcnow()
+            from datetime import UTC, datetime
+            task.completed_at = datetime.now(UTC)
            setattr(task, field, value)
        else:
            setattr(task, field, value)
--- a/backend/app/routers/todo.py
+++ b/backend/app/routers/todo.py
@@ -81,9 +81,9 @@ async def update_todo(
    if data.title is not None:
        todo.title = data.title
    if data.is_completed is not None:
-        from datetime import datetime
+        from datetime import UTC, datetime
        todo.is_completed = data.is_completed
-        todo.completed_at = datetime.utcnow() if data.is_completed else None
+        todo.completed_at = datetime.now(UTC) if data.is_completed else None

    await db.commit()
    await db.refresh(todo)