Add MinerU document ingestion support

Normalize uploaded documents into structured Markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions
--- a/backend/app/services/knowledge_service.py
+++ b/backend/app/services/knowledge_service.py
@@ -14,9 +14,12 @@ from sqlalchemy import select, or_
 from app.models.document import Document, DocumentChunk
 from app.models.folder import Folder
 from app.config import settings
+from app.services.document_service import DocumentService
 import chromadb
 from chromadb.config import Settings as ChromaSettings
 from dataclasses import dataclass
+from datetime import UTC, datetime
+import json


@dataclass
@@ -72,24 +75,50 @@ class KnowledgeService:
        if not chunks:
            return

+        await self._index_chunks(doc, chunks, user_id, folder_path=folder_path)
+
+    async def _index_chunks(
+        self,
+        document: Document,
+        chunks: list[DocumentChunk],
+        user_id: str,
+        folder_path: str | None = None,
+    ):
+        folder_path = folder_path or (await self._get_folder_path(document.folder_id) if document.folder_id else "")
        collection = self.get_collection(user_id)

        ids = [chunk.id for chunk in chunks]
        documents = [chunk.content for chunk in chunks]
-        metadatas = [
-            {
-                "document_id": doc.id,
-                "document_title": doc.title,
+        metadatas = []
+        for chunk in chunks:
+            chunk_metadata = self._parse_metadata(chunk.metadata_)
+            meta = {
+                "document_id": document.id,
+                "document_title": document.title,
+                "document_filename": document.filename,
                "chunk_index": chunk.chunk_index,
-                "file_type": doc.file_type,
+                "file_type": document.file_type,
                "folder_path": folder_path or "",
+                "content_type": chunk_metadata.get("content_type", "text"),
+                "section_title": chunk_metadata.get("section_title") or "",
+                "section_path": " / ".join(chunk_metadata.get("section_path", [])),
+                "page_number": chunk_metadata.get("page_number") or 0,
+                "sheet_name": chunk_metadata.get("sheet_name") or "",
+                "row_start": chunk_metadata.get("row_start") or 0,
+                "row_end": chunk_metadata.get("row_end") or 0,
+                "parser_version": chunk_metadata.get("parser_version") or document.parser_version or "",
+                "index_version": chunk_metadata.get("index_version") or document.index_version or "",
            }
-            for chunk in chunks
-        ]
+            chunk.chroma_collection = f"user_{user_id}"
+            chunk.chroma_id = chunk.id
+            metadatas.append(meta)

        collection.add(ids=ids, documents=documents, metadatas=metadatas)

-        doc.is_indexed = True
+        document.is_indexed = True
+        document.ingestion_status = "ready"
+        document.ingestion_error = None
+        document.indexed_at = datetime.now(UTC)
        await self.db.commit()

    async def retrieve(
@@ -141,7 +170,7 @@ class KnowledgeService:
            meta = metadatas[i] if i < len(metadatas) else {}
            score = 1.0 - (distances[i] if i < len(distances) else 0.0)

-            prev_chunk, next_chunk = await self._get_sibling_chunks(
+            prev_chunk, next_chunk = await self._get_related_chunks(
                chunk_id=chunk_id,
                chunk_index=meta.get("chunk_index", 0),
                document_id=meta.get("document_id", ""),
@@ -153,7 +182,7 @@ class KnowledgeService:
                document_title=meta.get("document_title", ""),
                content=documents[i] if i < len(documents) else "",
                score=score,
-                metadata_=str(meta),
+                metadata_=json.dumps(meta, ensure_ascii=False),
                prev_chunk=prev_chunk,
                next_chunk=next_chunk,
            ))
@@ -171,10 +200,11 @@ class KnowledgeService:
        results: list[SearchResult],
        top_k: int,
    ) -> list[SearchResult]:
-        """Rerank: 语义分 * 0.7 + 关键词匹配 * 0.2 + 标题匹配 * 0.1"""
+        """Rerank: 语义分 * 0.7 + 关键词匹配 * 0.2 + 标题匹配 * 0.1 + 结构加权"""
        import re

        query_words = set(re.findall(r"\w+", query.lower()))
+        table_query = any(token in query.lower() for token in ["sheet", "excel", "csv", "表", "列", "金额", "统计", "日期"])

        scored = []
        for r in results:
@@ -189,36 +219,56 @@ class KnowledgeService:
                title_overlap = len(query_words & title_words) / max(len(query_words), 1)
                score += title_overlap * 0.1

+            metadata = self._parse_metadata(r.metadata_)
+            if table_query and metadata.get("content_type") == "table_schema":
+                score += 0.25
+            elif table_query and metadata.get("content_type") == "table_rows":
+                score += 0.15
+
            scored.append((score, r))

        scored.sort(key=lambda x: x[0], reverse=True)
        return [r for _, r in scored[:top_k]]

-    async def _get_sibling_chunks(
+    async def _get_related_chunks(
        self,
        chunk_id: str,
        chunk_index: int,
        document_id: str,
    ) -> tuple[str | None, str | None]:
-        """获取前一个和后一个 chunk（完整上下文）"""
-        prev_result = await self.db.execute(
-            select(DocumentChunk).where(
-                DocumentChunk.document_id == document_id,
-                DocumentChunk.chunk_index == chunk_index - 1,
-            )
+        """获取结构相关的上下文 chunk"""
+        current_result = await self.db.execute(
+            select(DocumentChunk).where(DocumentChunk.id == chunk_id)
        )
-        next_result = await self.db.execute(
-            select(DocumentChunk).where(
-                DocumentChunk.document_id == document_id,
-                DocumentChunk.chunk_index == chunk_index + 1,
-            )
-        )
-        prev_chunk = prev_result.scalar_one_or_none()
-        next_chunk = next_result.scalar_one_or_none()
-        return (
-            prev_chunk.content if prev_chunk else None,
-            next_chunk.content if next_chunk else None,
+        current_chunk = current_result.scalar_one_or_none()
+        if not current_chunk:
+            return None, None
+
+        current_metadata = self._parse_metadata(current_chunk.metadata_)
+        section_path = current_metadata.get("section_path") or []
+        sheet_name = current_metadata.get("sheet_name")
+
+        chunk_result = await self.db.execute(
+            select(DocumentChunk)
+            .where(DocumentChunk.document_id == document_id)
+            .order_by(DocumentChunk.chunk_index)
        )
+        chunks = list(chunk_result.scalars().all())
+
+        prev_chunk = None
+        next_chunk = None
+        for chunk in chunks:
+            if chunk.id == chunk_id:
+                continue
+            metadata = self._parse_metadata(chunk.metadata_)
+            same_sheet = bool(sheet_name) and metadata.get("sheet_name") == sheet_name
+            same_section = bool(section_path) and metadata.get("section_path") == section_path
+            if chunk.chunk_index < chunk_index and (same_sheet or same_section):
+                prev_chunk = chunk.content
+            if chunk.chunk_index > chunk_index and (same_sheet or same_section):
+                next_chunk = chunk.content
+                break
+        return prev_chunk, next_chunk

    async def _get_folder_path(self, folder_id: str) -> str | None:
        """获取文件夹的完整路径"""
@@ -244,6 +294,16 @@ class KnowledgeService:

        return "/" + "/".join(path_parts)

+    def _parse_metadata(self, raw_metadata: str | dict | None) -> dict:
+        if isinstance(raw_metadata, dict):
+            return raw_metadata
+        if not raw_metadata:
+            return {}
+        try:
+            return json.loads(raw_metadata)
+        except (TypeError, json.JSONDecodeError):
+            return {}
+
    async def hybrid_search(
        self,
        query: str,
@@ -306,3 +366,43 @@ class KnowledgeService:
            collection.delete(where={"document_id": document_id})
        except Exception:
            pass
+
+    async def reindex_document(self, document_id: str, user_id: str) -> bool:
+        result = await self.db.execute(
+            select(Document).where(
+                Document.id == document_id,
+                Document.user_id == user_id,
+            )
+        )
+        document = result.scalar_one_or_none()
+        if not document:
+            return False
+
+        await self.delete_from_vectorstore(user_id, document_id)
+        document = await DocumentService(self.db, user_id=user_id).rebuild_document(document)
+        await self.index_document(document.id, user_id)
+        return True
+
+    async def reindex_document_chunks(self, document_id: str, user_id: str) -> bool:
+        result = await self.db.execute(
+            select(Document).where(
+                Document.id == document_id,
+                Document.user_id == user_id,
+            )
+        )
+        document = result.scalar_one_or_none()
+        if not document:
+            return False
+
+        chunks_result = await self.db.execute(
+            select(DocumentChunk)
+            .where(DocumentChunk.document_id == document_id)
+            .order_by(DocumentChunk.chunk_index)
+        )
+        chunks = list(chunks_result.scalars().all())
+        if not chunks:
+            return False
+
+        await self.delete_from_vectorstore(user_id, document_id)
+        await self._index_chunks(document, chunks, user_id)
+        return True