Add MinerU document ingestion support

Normalize uploaded documents into structured markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces deprecated UTC timestamp helpers in
the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions

View File

@@ -8,12 +8,13 @@ from app.models.user import User
from app.routers.auth import get_current_user
from app.services.document_service import DocumentService
from app.services.knowledge_service import KnowledgeService
from app.schemas.document import DocumentChunkOut, DocumentChunkUpdate, DocumentOut
from dataclasses import asdict
router = APIRouter(prefix="/api/documents", tags=["知识库"])
@router.get("", response_model=list)
@router.get("", response_model=list[DocumentOut])
async def list_documents(
folder_id: Optional[str] = None,
current_user: User = Depends(get_current_user),
@@ -36,7 +37,10 @@ async def upload_document(
):
"""上传文档,自动分块并向量化"""
doc_svc = DocumentService(db)
doc = await doc_svc.upload_document(current_user.id, file, folder_id=folder_id)
try:
doc = await doc_svc.upload_document(current_user.id, file, folder_id=folder_id)
except ValueError as error:
raise HTTPException(status_code=400, detail=str(error)) from error
# 后台索引到 ChromaDB
def index_task():
@@ -73,7 +77,7 @@ async def get_document(
return doc
@router.get("/{document_id}/chunks")
@router.get("/{document_id}/chunks", response_model=list[DocumentChunkOut])
async def get_document_chunks(
document_id: str,
current_user: User = Depends(get_current_user),
@@ -98,6 +102,33 @@ async def get_document_chunks(
return chunks_result.scalars().all()
@router.put("/{document_id}/chunks/{chunk_id}", response_model=DocumentChunkOut)
async def update_document_chunk(
    document_id: str,
    chunk_id: str,
    payload: DocumentChunkUpdate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    # Replace the content of one chunk of a document, then reindex the
    # document's chunks and return the chunk as re-read from the database.
    #
    # Responses:
    #   200 - the refreshed chunk, serialized as DocumentChunkOut.
    #   404 - DocumentService.update_document_chunk raised ValueError
    #         (presumably "document/chunk not found" - confirm in the service).
    #   500 - KnowledgeService.reindex_document_chunks returned a falsy result.
    document_service = DocumentService(db)
    knowledge_service = KnowledgeService(db, user_id=current_user.id)

    try:
        chunk = await document_service.update_document_chunk(
            current_user.id,
            document_id,
            chunk_id,
            payload.content,
        )
    except ValueError as exc:
        # The service's message is passed through verbatim as the detail.
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    # Reindexing is awaited before responding; a falsy result is reported to
    # the client as a server error.
    if not await knowledge_service.reindex_document_chunks(document_id, current_user.id):
        raise HTTPException(status_code=500, detail="切片更新后重新索引失败")

    # Re-select the row by primary key instead of returning `chunk` directly,
    # so the response reflects the state stored after reindexing.
    lookup = select(DocumentChunk).where(DocumentChunk.id == chunk.id)
    return (await db.execute(lookup)).scalar_one()
@router.delete("/{document_id}", status_code=204)
async def delete_document(
document_id: str,
@@ -129,7 +160,7 @@ async def search_documents(
if mode == "keyword":
results = await kb_svc._keyword_search(query, current_user.id, top_k)
elif mode == "semantic":
results = await kb_svc.retrieve(query, current_user.id, top_k, use_rerank=True)
results = await kb_svc.retrieve(query, current_user.id, top_k=top_k, use_rerank=True)
else:
results = await kb_svc.hybrid_search(query, current_user.id, top_k)

View File

@@ -64,8 +64,8 @@ async def update_task(
if field == "tags":
setattr(task, field, json.dumps(value))
elif field == "status" and value == TaskStatus.DONE:
from datetime import datetime
task.completed_at = datetime.utcnow()
from datetime import UTC, datetime
task.completed_at = datetime.now(UTC)
setattr(task, field, value)
else:
setattr(task, field, value)

View File

@@ -81,9 +81,9 @@ async def update_todo(
if data.title is not None:
todo.title = data.title
if data.is_completed is not None:
from datetime import datetime
from datetime import UTC, datetime
todo.is_completed = data.is_completed
todo.completed_at = datetime.utcnow() if data.is_completed else None
todo.completed_at = datetime.now(UTC) if data.is_completed else None
await db.commit()
await db.refresh(todo)