Add MinerU document ingestion support
Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces the deprecated `datetime.utcnow()` calls with `datetime.now(UTC)` in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,12 +8,13 @@ from app.models.user import User
|
||||
from app.routers.auth import get_current_user
|
||||
from app.services.document_service import DocumentService
|
||||
from app.services.knowledge_service import KnowledgeService
|
||||
from app.schemas.document import DocumentChunkOut, DocumentChunkUpdate, DocumentOut
|
||||
from dataclasses import asdict
|
||||
|
||||
router = APIRouter(prefix="/api/documents", tags=["知识库"])
|
||||
|
||||
|
||||
@router.get("", response_model=list)
|
||||
@router.get("", response_model=list[DocumentOut])
|
||||
async def list_documents(
|
||||
folder_id: Optional[str] = None,
|
||||
current_user: User = Depends(get_current_user),
|
||||
@@ -36,7 +37,10 @@ async def upload_document(
|
||||
):
|
||||
"""上传文档,自动分块并向量化"""
|
||||
doc_svc = DocumentService(db)
|
||||
doc = await doc_svc.upload_document(current_user.id, file, folder_id=folder_id)
|
||||
try:
|
||||
doc = await doc_svc.upload_document(current_user.id, file, folder_id=folder_id)
|
||||
except ValueError as error:
|
||||
raise HTTPException(status_code=400, detail=str(error)) from error
|
||||
|
||||
# 后台索引到 ChromaDB
|
||||
def index_task():
|
||||
@@ -73,7 +77,7 @@ async def get_document(
|
||||
return doc
|
||||
|
||||
|
||||
@router.get("/{document_id}/chunks")
|
||||
@router.get("/{document_id}/chunks", response_model=list[DocumentChunkOut])
|
||||
async def get_document_chunks(
|
||||
document_id: str,
|
||||
current_user: User = Depends(get_current_user),
|
||||
@@ -98,6 +102,33 @@ async def get_document_chunks(
|
||||
return chunks_result.scalars().all()
|
||||
|
||||
|
||||
@router.put("/{document_id}/chunks/{chunk_id}", response_model=DocumentChunkOut)
async def update_document_chunk(
    document_id: str,
    chunk_id: str,
    payload: DocumentChunkUpdate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Update the content of one document chunk and reindex its document.

    The new content is taken from ``payload.content`` and handed to
    ``DocumentService.update_document_chunk``; afterwards the document's
    chunks are reindexed through ``KnowledgeService.reindex_document_chunks``
    and the chunk row is selected again before being returned.

    Raises:
        HTTPException: 404 when the document service raises ``ValueError``;
            500 when the reindex call returns a falsy result.
    """
    documents = DocumentService(db)
    knowledge = KnowledgeService(db, user_id=current_user.id)

    try:
        updated = await documents.update_document_chunk(
            current_user.id,
            document_id,
            chunk_id,
            payload.content,
        )
    except ValueError as error:
        raise HTTPException(status_code=404, detail=str(error)) from error

    # A falsy reindex result is surfaced to the client as a server error.
    if not await knowledge.reindex_document_chunks(document_id, current_user.id):
        raise HTTPException(status_code=500, detail="切片更新后重新索引失败")

    # Select the chunk again by id so the response is built from a fresh query.
    lookup = select(DocumentChunk).where(DocumentChunk.id == updated.id)
    return (await db.execute(lookup)).scalar_one()
|
||||
|
||||
|
||||
@router.delete("/{document_id}", status_code=204)
|
||||
async def delete_document(
|
||||
document_id: str,
|
||||
@@ -129,7 +160,7 @@ async def search_documents(
|
||||
if mode == "keyword":
|
||||
results = await kb_svc._keyword_search(query, current_user.id, top_k)
|
||||
elif mode == "semantic":
|
||||
results = await kb_svc.retrieve(query, current_user.id, top_k, use_rerank=True)
|
||||
results = await kb_svc.retrieve(query, current_user.id, top_k=top_k, use_rerank=True)
|
||||
else:
|
||||
results = await kb_svc.hybrid_search(query, current_user.id, top_k)
|
||||
|
||||
|
||||
@@ -64,8 +64,8 @@ async def update_task(
|
||||
if field == "tags":
|
||||
setattr(task, field, json.dumps(value))
|
||||
elif field == "status" and value == TaskStatus.DONE:
|
||||
from datetime import datetime
|
||||
task.completed_at = datetime.utcnow()
|
||||
from datetime import UTC, datetime
|
||||
task.completed_at = datetime.now(UTC)
|
||||
setattr(task, field, value)
|
||||
else:
|
||||
setattr(task, field, value)
|
||||
|
||||
@@ -81,9 +81,9 @@ async def update_todo(
|
||||
if data.title is not None:
|
||||
todo.title = data.title
|
||||
if data.is_completed is not None:
|
||||
from datetime import datetime
|
||||
from datetime import UTC, datetime
|
||||
todo.is_completed = data.is_completed
|
||||
todo.completed_at = datetime.utcnow() if data.is_completed else None
|
||||
todo.completed_at = datetime.now(UTC) if data.is_completed else None
|
||||
|
||||
await db.commit()
|
||||
await db.refresh(todo)
|
||||
|
||||
Reference in New Issue
Block a user