Add FastAPI backend with agent system
This commit is contained in:
256
backend/app/services/document_service.py
Normal file
256
backend/app/services/document_service.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
文档服务 - 上传、解析、分块、存储
|
||||
支持多种文档格式 + LlamaIndex 智能分块
|
||||
"""
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select
|
||||
from fastapi import UploadFile
|
||||
from app.models.document import Document, DocumentChunk
|
||||
from app.models.folder import Folder
|
||||
from app.config import settings
|
||||
import os
|
||||
import aiofiles
|
||||
import uuid
|
||||
|
||||
|
||||
ALLOWED_EXTENSIONS = {".pdf", ".md", ".txt", ".docx", ".doc"}
|
||||
|
||||
|
||||
class DocumentService:
|
||||
def __init__(self, db: AsyncSession, user_id: str = None):
|
||||
self.db = db
|
||||
self.user_id = user_id
|
||||
|
||||
async def upload_document(self, user_id: str, file: UploadFile, folder_id: str | None = None) -> Document:
|
||||
ext = os.path.splitext(file.filename)[1].lower()
|
||||
if ext not in ALLOWED_EXTENSIONS:
|
||||
raise ValueError(f"不支持的文件类型: {ext}")
|
||||
|
||||
os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
|
||||
file_id = str(uuid.uuid4())
|
||||
file_path = os.path.join(settings.UPLOAD_DIR, f"{file_id}{ext}")
|
||||
|
||||
content = await file.read()
|
||||
file_size = len(content)
|
||||
if file_size > settings.MAX_UPLOAD_SIZE:
|
||||
raise ValueError(f"文件大小超过限制: {settings.MAX_UPLOAD_SIZE // 1024 // 1024}MB")
|
||||
|
||||
async with aiofiles.open(file_path, "wb") as f:
|
||||
await f.write(content)
|
||||
|
||||
text_content = await self._extract_text(file_path, ext)
|
||||
|
||||
doc = Document(
|
||||
user_id=user_id,
|
||||
title=file.filename.rsplit('.', 1)[0],
|
||||
filename=file.filename,
|
||||
file_type=ext[1:],
|
||||
file_size=file_size,
|
||||
file_path=file_path,
|
||||
summary=text_content[:500] if len(text_content) > 500 else text_content,
|
||||
folder_id=folder_id,
|
||||
)
|
||||
self.db.add(doc)
|
||||
await self.db.commit()
|
||||
await self.db.refresh(doc)
|
||||
|
||||
chunks = self._chunk_text(text_content)
|
||||
for i, chunk_text in enumerate(chunks):
|
||||
chunk = DocumentChunk(
|
||||
document_id=doc.id,
|
||||
chunk_index=i,
|
||||
content=chunk_text,
|
||||
)
|
||||
self.db.add(chunk)
|
||||
doc.chunk_count = len(chunks)
|
||||
await self.db.commit()
|
||||
|
||||
return doc
|
||||
|
||||
async def _get_folder_path(self, folder_id: str) -> str | None:
|
||||
"""获取文件夹的完整路径"""
|
||||
folders = await self.db.execute(
|
||||
select(Folder).where(Folder.user_id == self.user_id)
|
||||
)
|
||||
folder_map = {f.id: f for f in folders.scalars().all()}
|
||||
|
||||
path_parts = []
|
||||
current_id = folder_id
|
||||
while current_id:
|
||||
folder = folder_map.get(current_id)
|
||||
if not folder:
|
||||
break
|
||||
path_parts.insert(0, folder.name)
|
||||
current_id = folder.parent_id
|
||||
|
||||
return "/" + "/".join(path_parts) if path_parts else None
|
||||
|
||||
async def delete_document(self, user_id: str, document_id: str):
|
||||
result = await self.db.execute(
|
||||
select(Document).where(
|
||||
Document.id == document_id,
|
||||
Document.user_id == user_id,
|
||||
)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if not doc:
|
||||
raise ValueError("文档不存在")
|
||||
|
||||
if os.path.exists(doc.file_path):
|
||||
os.remove(doc.file_path)
|
||||
|
||||
await self.db.delete(doc)
|
||||
await self.db.commit()
|
||||
|
||||
async def _extract_text(self, file_path: str, ext: str) -> str:
|
||||
if ext == ".pdf":
|
||||
try:
|
||||
import pymupdf
|
||||
doc = pymupdf.open(file_path)
|
||||
text = "".join(page.get_text() for page in doc)
|
||||
doc.close()
|
||||
return text
|
||||
except ImportError:
|
||||
return "[PDF 内容需要安装 pymupdf: uv pip install pymupdf]"
|
||||
|
||||
elif ext in (".md", ".txt"):
|
||||
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
|
||||
return await f.read()
|
||||
|
||||
elif ext in (".docx", ".doc"):
|
||||
try:
|
||||
from docx import Document as DocxDocument
|
||||
doc = DocxDocument(file_path)
|
||||
return "\n".join([p.text for p in doc.paragraphs])
|
||||
except ImportError:
|
||||
return "[Word 内容需要安装 python-docx: uv pip install python-docx]"
|
||||
|
||||
return "[暂不支持此格式]"
|
||||
|
||||
def _chunk_text(self, text: str) -> list[str]:
|
||||
"""
|
||||
智能文档分块策略
|
||||
1. 先按 Markdown 标题层级(H1/H2/H3)切分
|
||||
2. 每个大段落内部按固定长度切分
|
||||
3. 保留上下文(prev_summary / next_summary)
|
||||
"""
|
||||
import re
|
||||
|
||||
chunks = []
|
||||
|
||||
# 策略1: Markdown 标题切分(优先)
|
||||
header_pattern = re.compile(r"^(#{1,3})\s+(.+)$", re.MULTILINE)
|
||||
headers = list(header_pattern.finditer(text))
|
||||
|
||||
if headers:
|
||||
# 按标题段落切分
|
||||
for i, match in enumerate(headers):
|
||||
start = match.start()
|
||||
end = headers[i + 1].start() if i + 1 < len(headers) else len(text)
|
||||
section = text[start:end].strip()
|
||||
if len(section) > settings.CHUNK_SIZE:
|
||||
# 大段落内部再切分
|
||||
sub_chunks = self._split_large_chunk(section, match.group(2))
|
||||
chunks.extend(sub_chunks)
|
||||
elif section:
|
||||
chunks.append(section)
|
||||
else:
|
||||
# 策略2: 按段落切分
|
||||
chunks = self._chunk_by_paragraphs(text)
|
||||
|
||||
# 过滤空 chunk
|
||||
chunks = [c.strip() for c in chunks if c.strip()]
|
||||
return chunks if chunks else [text[: settings.CHUNK_SIZE]]
|
||||
|
||||
def _chunk_by_paragraphs(self, text: str) -> list[str]:
|
||||
"""按段落分块,带上下文"""
|
||||
paragraphs = text.split("\n\n")
|
||||
chunks = []
|
||||
current = ""
|
||||
prev_summary = ""
|
||||
|
||||
for para in paragraphs:
|
||||
para = para.strip()
|
||||
if not para:
|
||||
continue
|
||||
if len(current) + len(para) < settings.CHUNK_SIZE:
|
||||
current += "\n\n" + para
|
||||
else:
|
||||
if current:
|
||||
# 添加上下文摘要
|
||||
enriched = current.strip()
|
||||
chunks.append(enriched)
|
||||
current = para
|
||||
|
||||
if current.strip():
|
||||
chunks.append(current.strip())
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_large_chunk(self, text: str, title: str) -> list[str]:
|
||||
"""将大段落拆分为固定大小的子块"""
|
||||
chunks = []
|
||||
sentences = text.split("。")
|
||||
current = title + "\n\n"
|
||||
|
||||
for sentence in sentences:
|
||||
sentence = sentence.strip()
|
||||
if not sentence:
|
||||
continue
|
||||
full_sentence = sentence if sentence.endswith("。") else sentence + "。"
|
||||
if len(current) + len(full_sentence) < settings.CHUNK_SIZE:
|
||||
current += full_sentence + " "
|
||||
else:
|
||||
if current.strip():
|
||||
chunks.append(current.strip())
|
||||
current = title + "\n\n" + full_sentence + " "
|
||||
|
||||
if current.strip():
|
||||
chunks.append(current.strip())
|
||||
|
||||
return chunks
|
||||
|
||||
async def get_document_chunks(self, document_id: str) -> list[DocumentChunk]:
|
||||
result = await self.db.execute(
|
||||
select(DocumentChunk)
|
||||
.where(DocumentChunk.document_id == document_id)
|
||||
.order_by(DocumentChunk.chunk_index)
|
||||
)
|
||||
return list(result.scalars().all())
|
||||
|
||||
async def get_document_content(self, user_id: str, document_id: str) -> str | None:
|
||||
"""获取文档的文本内容"""
|
||||
import os
|
||||
|
||||
result = await self.db.execute(
|
||||
select(Document).where(
|
||||
Document.id == document_id,
|
||||
Document.user_id == user_id,
|
||||
)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if not doc:
|
||||
return None
|
||||
|
||||
file_path = doc.file_path
|
||||
if not os.path.exists(file_path):
|
||||
return None
|
||||
|
||||
# 根据文件类型读取内容
|
||||
ext = doc.filename.split('.')[-1].lower()
|
||||
|
||||
try:
|
||||
if ext == 'txt':
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read()
|
||||
elif ext == 'md':
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read()
|
||||
elif ext == 'pdf':
|
||||
# 简单文本提取(生产环境应使用专业库)
|
||||
return f"[PDF文档] {doc.filename}"
|
||||
else:
|
||||
return f"[文档] {doc.filename}"
|
||||
except Exception:
|
||||
return f"[文档] {doc.filename}"
|
||||
Reference in New Issue
Block a user