Add FastAPI backend with agent system

2026-03-21 10:13:29 +08:00
parent ed6bab59fe
commit 6ffa07adde
82 changed files with 11138 additions and 0 deletions
--- a/backend/app/services/document_service.py
+++ b/backend/app/services/document_service.py
@@ -0,0 +1,256 @@
+"""
+文档服务 - 上传、解析、分块、存储
+支持多种文档格式 + LlamaIndex 智能分块
+"""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from fastapi import UploadFile
+from app.models.document import Document, DocumentChunk
+from app.models.folder import Folder
+from app.config import settings
+import os
+import aiofiles
+import uuid
+
+
+ALLOWED_EXTENSIONS = {".pdf", ".md", ".txt", ".docx", ".doc"}
+
+
+class DocumentService:
+    def __init__(self, db: AsyncSession, user_id: str = None):
+        self.db = db
+        self.user_id = user_id
+
+    async def upload_document(self, user_id: str, file: UploadFile, folder_id: str | None = None) -> Document:
+        ext = os.path.splitext(file.filename)[1].lower()
+        if ext not in ALLOWED_EXTENSIONS:
+            raise ValueError(f"不支持的文件类型: {ext}")
+
+        os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
+        file_id = str(uuid.uuid4())
+        file_path = os.path.join(settings.UPLOAD_DIR, f"{file_id}{ext}")
+
+        content = await file.read()
+        file_size = len(content)
+        if file_size > settings.MAX_UPLOAD_SIZE:
+            raise ValueError(f"文件大小超过限制: {settings.MAX_UPLOAD_SIZE // 1024 // 1024}MB")
+
+        async with aiofiles.open(file_path, "wb") as f:
+            await f.write(content)
+
+        text_content = await self._extract_text(file_path, ext)
+
+        doc = Document(
+            user_id=user_id,
+            title=file.filename.rsplit('.', 1)[0],
+            filename=file.filename,
+            file_type=ext[1:],
+            file_size=file_size,
+            file_path=file_path,
+            summary=text_content[:500] if len(text_content) > 500 else text_content,
+            folder_id=folder_id,
+        )
+        self.db.add(doc)
+        await self.db.commit()
+        await self.db.refresh(doc)
+
+        chunks = self._chunk_text(text_content)
+        for i, chunk_text in enumerate(chunks):
+            chunk = DocumentChunk(
+                document_id=doc.id,
+                chunk_index=i,
+                content=chunk_text,
+            )
+            self.db.add(chunk)
+        doc.chunk_count = len(chunks)
+        await self.db.commit()
+
+        return doc
+
+    async def _get_folder_path(self, folder_id: str) -> str | None:
+        """获取文件夹的完整路径"""
+        folders = await self.db.execute(
+            select(Folder).where(Folder.user_id == self.user_id)
+        )
+        folder_map = {f.id: f for f in folders.scalars().all()}
+
+        path_parts = []
+        current_id = folder_id
+        while current_id:
+            folder = folder_map.get(current_id)
+            if not folder:
+                break
+            path_parts.insert(0, folder.name)
+            current_id = folder.parent_id
+
+        return "/" + "/".join(path_parts) if path_parts else None
+
+    async def delete_document(self, user_id: str, document_id: str):
+        result = await self.db.execute(
+            select(Document).where(
+                Document.id == document_id,
+                Document.user_id == user_id,
+            )
+        )
+        doc = result.scalar_one_or_none()
+        if not doc:
+            raise ValueError("文档不存在")
+
+        if os.path.exists(doc.file_path):
+            os.remove(doc.file_path)
+
+        await self.db.delete(doc)
+        await self.db.commit()
+
+    async def _extract_text(self, file_path: str, ext: str) -> str:
+        if ext == ".pdf":
+            try:
+                import pymupdf
+                doc = pymupdf.open(file_path)
+                text = "".join(page.get_text() for page in doc)
+                doc.close()
+                return text
+            except ImportError:
+                return "[PDF 内容需要安装 pymupdf: uv pip install pymupdf]"
+
+        elif ext in (".md", ".txt"):
+            async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
+                return await f.read()
+
+        elif ext in (".docx", ".doc"):
+            try:
+                from docx import Document as DocxDocument
+                doc = DocxDocument(file_path)
+                return "\n".join([p.text for p in doc.paragraphs])
+            except ImportError:
+                return "[Word 内容需要安装 python-docx: uv pip install python-docx]"
+
+        return "[暂不支持此格式]"
+
+    def _chunk_text(self, text: str) -> list[str]:
+        """
+        智能文档分块策略
+        1. 先按 Markdown 标题层级（H1/H2/H3）切分
+        2. 每个大段落内部按固定长度切分
+        3. 保留上下文（prev_summary / next_summary）
+        """
+        import re
+
+        chunks = []
+
+        # 策略1: Markdown 标题切分（优先）
+        header_pattern = re.compile(r"^(#{1,3})\s+(.+)$", re.MULTILINE)
+        headers = list(header_pattern.finditer(text))
+
+        if headers:
+            # 按标题段落切分
+            for i, match in enumerate(headers):
+                start = match.start()
+                end = headers[i + 1].start() if i + 1 < len(headers) else len(text)
+                section = text[start:end].strip()
+                if len(section) > settings.CHUNK_SIZE:
+                    # 大段落内部再切分
+                    sub_chunks = self._split_large_chunk(section, match.group(2))
+                    chunks.extend(sub_chunks)
+                elif section:
+                    chunks.append(section)
+        else:
+            # 策略2: 按段落切分
+            chunks = self._chunk_by_paragraphs(text)
+
+        # 过滤空 chunk
+        chunks = [c.strip() for c in chunks if c.strip()]
+        return chunks if chunks else [text[: settings.CHUNK_SIZE]]
+
+    def _chunk_by_paragraphs(self, text: str) -> list[str]:
+        """按段落分块，带上下文"""
+        paragraphs = text.split("\n\n")
+        chunks = []
+        current = ""
+        prev_summary = ""
+
+        for para in paragraphs:
+            para = para.strip()
+            if not para:
+                continue
+            if len(current) + len(para) < settings.CHUNK_SIZE:
+                current += "\n\n" + para
+            else:
+                if current:
+                    # 添加上下文摘要
+                    enriched = current.strip()
+                    chunks.append(enriched)
+                current = para
+
+        if current.strip():
+            chunks.append(current.strip())
+
+        return chunks
+
+    def _split_large_chunk(self, text: str, title: str) -> list[str]:
+        """将大段落拆分为固定大小的子块"""
+        chunks = []
+        sentences = text.split("。")
+        current = title + "\n\n"
+
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            full_sentence = sentence if sentence.endswith("。") else sentence + "。"
+            if len(current) + len(full_sentence) < settings.CHUNK_SIZE:
+                current += full_sentence + " "
+            else:
+                if current.strip():
+                    chunks.append(current.strip())
+                current = title + "\n\n" + full_sentence + " "
+
+        if current.strip():
+            chunks.append(current.strip())
+
+        return chunks
+
+    async def get_document_chunks(self, document_id: str) -> list[DocumentChunk]:
+        result = await self.db.execute(
+            select(DocumentChunk)
+            .where(DocumentChunk.document_id == document_id)
+            .order_by(DocumentChunk.chunk_index)
+        )
+        return list(result.scalars().all())
+
+    async def get_document_content(self, user_id: str, document_id: str) -> str | None:
+        """获取文档的文本内容"""
+        import os
+
+        result = await self.db.execute(
+            select(Document).where(
+                Document.id == document_id,
+                Document.user_id == user_id,
+            )
+        )
+        doc = result.scalar_one_or_none()
+        if not doc:
+            return None
+
+        file_path = doc.file_path
+        if not os.path.exists(file_path):
+            return None
+
+        # 根据文件类型读取内容
+        ext = doc.filename.split('.')[-1].lower()
+
+        try:
+            if ext == 'txt':
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    return f.read()
+            elif ext == 'md':
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    return f.read()
+            elif ext == 'pdf':
+                # 简单文本提取（生产环境应使用专业库）
+                return f"[PDF文档] {doc.filename}"
+            else:
+                return f"[文档] {doc.filename}"
+        except Exception:
+            return f"[文档] {doc.filename}"