Add MinerU document ingestion support
Normalize uploaded documents into structured Markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces the deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,9 +14,12 @@ from sqlalchemy import select, or_
|
||||
from app.models.document import Document, DocumentChunk
|
||||
from app.models.folder import Folder
|
||||
from app.config import settings
|
||||
from app.services.document_service import DocumentService
|
||||
import chromadb
|
||||
from chromadb.config import Settings as ChromaSettings
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
import json
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -72,24 +75,50 @@ class KnowledgeService:
|
||||
if not chunks:
|
||||
return
|
||||
|
||||
await self._index_chunks(doc, chunks, user_id, folder_path=folder_path)
|
||||
|
||||
async def _index_chunks(
|
||||
self,
|
||||
document: Document,
|
||||
chunks: list[DocumentChunk],
|
||||
user_id: str,
|
||||
folder_path: str | None = None,
|
||||
):
|
||||
folder_path = folder_path or (await self._get_folder_path(document.folder_id) if document.folder_id else "")
|
||||
collection = self.get_collection(user_id)
|
||||
|
||||
ids = [chunk.id for chunk in chunks]
|
||||
documents = [chunk.content for chunk in chunks]
|
||||
metadatas = [
|
||||
{
|
||||
"document_id": doc.id,
|
||||
"document_title": doc.title,
|
||||
metadatas = []
|
||||
for chunk in chunks:
|
||||
chunk_metadata = self._parse_metadata(chunk.metadata_)
|
||||
meta = {
|
||||
"document_id": document.id,
|
||||
"document_title": document.title,
|
||||
"document_filename": document.filename,
|
||||
"chunk_index": chunk.chunk_index,
|
||||
"file_type": doc.file_type,
|
||||
"file_type": document.file_type,
|
||||
"folder_path": folder_path or "",
|
||||
"content_type": chunk_metadata.get("content_type", "text"),
|
||||
"section_title": chunk_metadata.get("section_title") or "",
|
||||
"section_path": " / ".join(chunk_metadata.get("section_path", [])),
|
||||
"page_number": chunk_metadata.get("page_number") or 0,
|
||||
"sheet_name": chunk_metadata.get("sheet_name") or "",
|
||||
"row_start": chunk_metadata.get("row_start") or 0,
|
||||
"row_end": chunk_metadata.get("row_end") or 0,
|
||||
"parser_version": chunk_metadata.get("parser_version") or document.parser_version or "",
|
||||
"index_version": chunk_metadata.get("index_version") or document.index_version or "",
|
||||
}
|
||||
for chunk in chunks
|
||||
]
|
||||
chunk.chroma_collection = f"user_{user_id}"
|
||||
chunk.chroma_id = chunk.id
|
||||
metadatas.append(meta)
|
||||
|
||||
collection.add(ids=ids, documents=documents, metadatas=metadatas)
|
||||
|
||||
doc.is_indexed = True
|
||||
document.is_indexed = True
|
||||
document.ingestion_status = "ready"
|
||||
document.ingestion_error = None
|
||||
document.indexed_at = datetime.now(UTC)
|
||||
await self.db.commit()
|
||||
|
||||
async def retrieve(
|
||||
@@ -141,7 +170,7 @@ class KnowledgeService:
|
||||
meta = metadatas[i] if i < len(metadatas) else {}
|
||||
score = 1.0 - (distances[i] if i < len(distances) else 0.0)
|
||||
|
||||
prev_chunk, next_chunk = await self._get_sibling_chunks(
|
||||
prev_chunk, next_chunk = await self._get_related_chunks(
|
||||
chunk_id=chunk_id,
|
||||
chunk_index=meta.get("chunk_index", 0),
|
||||
document_id=meta.get("document_id", ""),
|
||||
@@ -153,7 +182,7 @@ class KnowledgeService:
|
||||
document_title=meta.get("document_title", ""),
|
||||
content=documents[i] if i < len(documents) else "",
|
||||
score=score,
|
||||
metadata_=str(meta),
|
||||
metadata_=json.dumps(meta, ensure_ascii=False),
|
||||
prev_chunk=prev_chunk,
|
||||
next_chunk=next_chunk,
|
||||
))
|
||||
@@ -171,10 +200,11 @@ class KnowledgeService:
|
||||
results: list[SearchResult],
|
||||
top_k: int,
|
||||
) -> list[SearchResult]:
|
||||
"""Rerank: 语义分 * 0.7 + 关键词匹配 * 0.2 + 标题匹配 * 0.1"""
|
||||
"""Rerank: 语义分 * 0.7 + 关键词匹配 * 0.2 + 标题匹配 * 0.1 + 结构加权"""
|
||||
import re
|
||||
|
||||
query_words = set(re.findall(r"\w+", query.lower()))
|
||||
table_query = any(token in query.lower() for token in ["sheet", "excel", "csv", "表", "列", "金额", "统计", "日期"])
|
||||
|
||||
scored = []
|
||||
for r in results:
|
||||
@@ -189,36 +219,56 @@ class KnowledgeService:
|
||||
title_overlap = len(query_words & title_words) / max(len(query_words), 1)
|
||||
score += title_overlap * 0.1
|
||||
|
||||
metadata = self._parse_metadata(r.metadata_)
|
||||
if table_query and metadata.get("content_type") == "table_schema":
|
||||
score += 0.25
|
||||
elif table_query and metadata.get("content_type") == "table_rows":
|
||||
score += 0.15
|
||||
|
||||
scored.append((score, r))
|
||||
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [r for _, r in scored[:top_k]]
|
||||
|
||||
async def _get_related_chunks(
    self,
    chunk_id: str,
    chunk_index: int,
    document_id: str,
) -> tuple[str | None, str | None]:
    """Fetch the nearest structurally related chunks around a search hit.

    A chunk counts as related when it belongs to the same document and shares
    either the hit's ``sheet_name`` or its exact ``section_path``.

    Args:
        chunk_id: Id of the chunk that matched the query.
        chunk_index: Position of that chunk within its document.
        document_id: Id of the document to look for neighbours in.

    Returns:
        ``(previous_content, next_content)``: the content of the closest
        related chunk before and after ``chunk_index``. Either side is
        ``None`` when there is no such chunk, and both are ``None`` when the
        hit cannot be found or carries no sheet/section information.
    """
    current_result = await self.db.execute(
        select(DocumentChunk).where(DocumentChunk.id == chunk_id)
    )
    current_chunk = current_result.scalar_one_or_none()
    if not current_chunk:
        return None, None

    current_metadata = self._parse_metadata(current_chunk.metadata_)
    section_path = current_metadata.get("section_path") or []
    sheet_name = current_metadata.get("sheet_name")
    # Without a sheet or a section nothing can match below, so skip loading
    # every chunk of the document.
    if not sheet_name and not section_path:
        return None, None

    chunk_result = await self.db.execute(
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document_id)
        .order_by(DocumentChunk.chunk_index)
    )

    prev_chunk = None
    next_chunk = None
    # Chunks arrive ordered by index: the last match before the hit is the
    # nearest predecessor, the first match after it is the nearest successor.
    for chunk in chunk_result.scalars().all():
        if chunk.id == chunk_id:
            continue
        metadata = self._parse_metadata(chunk.metadata_)
        same_sheet = bool(sheet_name) and metadata.get("sheet_name") == sheet_name
        same_section = bool(section_path) and metadata.get("section_path") == section_path
        if not (same_sheet or same_section):
            continue
        if chunk.chunk_index < chunk_index:
            prev_chunk = chunk.content
        elif chunk.chunk_index > chunk_index:
            next_chunk = chunk.content
            break
    return prev_chunk, next_chunk
|
||||
|
||||
async def _get_folder_path(self, folder_id: str) -> str | None:
|
||||
"""获取文件夹的完整路径"""
|
||||
@@ -244,6 +294,16 @@ class KnowledgeService:
|
||||
|
||||
return "/" + "/".join(path_parts)
|
||||
|
||||
def _parse_metadata(self, raw_metadata: str | dict | None) -> dict:
|
||||
if isinstance(raw_metadata, dict):
|
||||
return raw_metadata
|
||||
if not raw_metadata:
|
||||
return {}
|
||||
try:
|
||||
return json.loads(raw_metadata)
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
return {}
|
||||
|
||||
async def hybrid_search(
|
||||
self,
|
||||
query: str,
|
||||
@@ -306,3 +366,43 @@ class KnowledgeService:
|
||||
collection.delete(where={"document_id": document_id})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async def reindex_document(self, document_id: str, user_id: str) -> bool:
    """Rebuild one of the user's documents and index the rebuilt result.

    The document is looked up by id and owner. When no row matches, nothing
    is touched and ``False`` is returned. Otherwise the vector-store cleanup
    for the document runs, ``DocumentService.rebuild_document`` produces the
    refreshed document, that document is indexed, and ``True`` is returned.
    """
    owned_document = select(Document).where(
        Document.id == document_id,
        Document.user_id == user_id,
    )
    lookup = await self.db.execute(owned_document)
    document = lookup.scalar_one_or_none()
    if not document:
        return False

    await self.delete_from_vectorstore(user_id, document_id)
    rebuilder = DocumentService(self.db, user_id=user_id)
    document = await rebuilder.rebuild_document(document)
    await self.index_document(document.id, user_id)
    return True
|
||||
|
||||
async def reindex_document_chunks(self, document_id: str, user_id: str) -> bool:
    """Re-add a document's stored chunks to the vector index.

    Returns ``False`` when the document does not exist for this user or has
    no stored chunks. Otherwise the vector-store cleanup for the document
    runs, the existing chunks (ordered by ``chunk_index``) are indexed again,
    and ``True`` is returned.
    """
    owned_document = select(Document).where(
        Document.id == document_id,
        Document.user_id == user_id,
    )
    lookup = await self.db.execute(owned_document)
    document = lookup.scalar_one_or_none()
    if not document:
        return False

    ordered_chunks = (
        select(DocumentChunk)
        .where(DocumentChunk.document_id == document_id)
        .order_by(DocumentChunk.chunk_index)
    )
    chunk_rows = await self.db.execute(ordered_chunks)
    chunks = list(chunk_rows.scalars().all())
    if not chunks:
        return False

    await self.delete_from_vectorstore(user_id, document_id)
    await self._index_chunks(document, chunks, user_id)
    return True
|
||||
|
||||
Reference in New Issue
Block a user