feat(backend): 完善日志系统,支持按日期分目录存储
- 实现 logs/YYYY-MM-DD/ 日期文件夹结构
- 添加 success.log 和 failure.log 专用日志
- 使用 TimedRotatingFileHandler 实现按天切割
- 添加 log_success 和 log_failure 便捷函数
- 集成 markitdown 进行文件转换
- 优化文件存储路径,按项目ID分类存储

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
Chunks API Router
|
||||
"""
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from uuid import UUID
|
||||
from pydantic import BaseModel, Field
|
||||
@@ -13,19 +14,28 @@ from app.api.response import ApiResponse, PaginatedResponse
|
||||
from app.core.database import get_db
|
||||
from app.core.exceptions import NotFoundException
|
||||
from app.core.crud import CRUDBase
|
||||
from app.core.logging import log_success, log_failure
|
||||
from app.models.models import Chunk, File
|
||||
from app.schemas.chunk import ChunkResponse
|
||||
from app.schemas.chunk import ChunkCreateSchema
|
||||
from app.services.text_splitter.splitter import get_splitter
|
||||
from app.services.file_processor.pdf_processor import process_pdf
|
||||
from app.services.file_processor.docx_processor import process_docx
|
||||
from app.services.file_processor.excel_processor import process_csv, process_excel
|
||||
from markitdown import MarkItDown
|
||||
|
||||
# Router on which this module's endpoints are registered via @router decorators
router = APIRouter()
|
||||
|
||||
# CRUD helper instantiated for the Chunk model
|
||||
chunk_crud = CRUDBase(Chunk)
|
||||
|
||||
# Module-level MarkItDown converter, created once at import time and used by
# process_file_by_type to convert supported documents
|
||||
markitdown = MarkItDown()
|
||||
|
||||
|
||||
def get_project_ready_dir(project_id: str, data_root: Optional[Path] = None) -> Path:
    """Return the ``ready`` directory of a project, creating it if necessary.

    Args:
        project_id: Project identifier. It is used as a single directory name
            under the data root, so it must not be empty and must not contain
            path separators or be ``.`` / ``..``.
        data_root: Directory that holds the per-project folders. When omitted,
            the original deployment path ``/data/code/YG-Datasets/data`` is
            used, so existing callers are unaffected.

    Returns:
        ``<data_root>/<project_id>/ready``; the directory exists on return.

    Raises:
        ValueError: If ``project_id`` is not a plain directory name.
    """
    # Refuse ids that would resolve outside (or directly onto) the data root,
    # e.g. "", "..", "../other" or "a/b".
    if (
        not project_id
        or project_id in {".", ".."}
        or Path(project_id).name != project_id
    ):
        raise ValueError(f"Invalid project_id: {project_id!r}")

    root = Path("/data/code/YG-Datasets/data") if data_root is None else Path(data_root)
    ready_dir = root / project_id / "ready"
    ready_dir.mkdir(parents=True, exist_ok=True)
    return ready_dir
|
||||
|
||||
|
||||
class SplitRequest(BaseModel):
|
||||
"""Request model for splitting text"""
|
||||
@@ -37,28 +47,29 @@ class SplitRequest(BaseModel):
|
||||
|
||||
|
||||
async def process_file_by_type(file: File) -> str:
    """Return the text content of ``file``, converting documents to Markdown.

    Files whose ``file_type`` is one of the MarkItDown-supported types listed
    below are converted with the module-level ``markitdown`` converter. Any
    other type (e.g. txt, md) is read verbatim as UTF-8 text. The blocking
    conversion / read runs in the event loop's default executor.

    Args:
        file: File record; its ``file_path``, ``file_type`` and ``id``
            attributes are read.

    Returns:
        The Markdown text produced by MarkItDown, or the raw file text.

    Raises:
        NotFoundException: If the record has no ``file_path``.
    """
    if not file.file_path:
        raise NotFoundException("File", file.id)

    # Supported types for markitdown
    markitdown_types = {
        "pdf", "docx", "doc", "pptx", "ppt", "xlsx", "xls", "htm", "html",
    }

    # Read the record's attributes on the event-loop thread, so the worker
    # thread only ever sees plain values and never touches the record itself.
    source_path = file.file_path
    # Match the type case-insensitively; a missing type falls through to the
    # plain-text branch.
    file_type = (file.file_type or "").lower()

    loop = asyncio.get_running_loop()

    if file_type in markitdown_types:
        # Use markitdown to convert to markdown
        result = await loop.run_in_executor(None, markitdown.convert, source_path)
        return result.text_content

    # Return raw text for txt, md files. Path.read_text closes the file
    # handle, unlike a bare open(...).read().
    return await loop.run_in_executor(
        None,
        lambda: Path(source_path).read_text(encoding="utf-8"),
    )
|
||||
|
||||
|
||||
@router.post("/split", response_model=ApiResponse)
|
||||
@@ -68,52 +79,97 @@ async def split_text(
|
||||
db: AsyncSession = Depends(get_db)
|
||||
):
|
||||
"""Split text into chunks"""
|
||||
# Get file
|
||||
result = await db.execute(
|
||||
select(File).where(File.id == request.file_id, File.project_id == project_id)
|
||||
)
|
||||
file = result.scalar_one_or_none()
|
||||
if not file:
|
||||
raise NotFoundException("File", request.file_id)
|
||||
|
||||
# Process file
|
||||
text = await process_file_by_type(file)
|
||||
|
||||
# Update file status
|
||||
file.status = "processing"
|
||||
await db.commit()
|
||||
|
||||
# Split text
|
||||
kwargs = {"chunk_size": request.chunk_size, "overlap": request.overlap}
|
||||
if request.method == "custom" and request.separator:
|
||||
kwargs["separator"] = request.separator
|
||||
|
||||
splitter = get_splitter(request.method, **kwargs)
|
||||
split_results = splitter.split(text)
|
||||
|
||||
# Save chunks
|
||||
chunks = []
|
||||
for chunk_data in split_results:
|
||||
db_chunk = Chunk(
|
||||
project_id=project_id,
|
||||
file_id=file.id,
|
||||
name=chunk_data.get("name", f"Chunk {chunk_data['index'] + 1}"),
|
||||
content=chunk_data["content"],
|
||||
word_count=chunk_data.get("word_count", len(chunk_data["content"].split()))
|
||||
try:
|
||||
# Get file
|
||||
result = await db.execute(
|
||||
select(File).where(File.id == request.file_id, File.project_id == project_id)
|
||||
)
|
||||
db.add(db_chunk)
|
||||
chunks.append(db_chunk)
|
||||
file = result.scalar_one_or_none()
|
||||
if not file:
|
||||
raise NotFoundException("File", request.file_id)
|
||||
|
||||
await db.commit()
|
||||
# 记录开始处理
|
||||
log_success(
|
||||
"开始处理文件",
|
||||
project_id=str(project_id),
|
||||
file_id=str(file.id),
|
||||
filename=file.filename,
|
||||
method=request.method,
|
||||
chunk_size=request.chunk_size,
|
||||
overlap=request.overlap
|
||||
)
|
||||
|
||||
# Update file status
|
||||
file.status = "completed"
|
||||
await db.commit()
|
||||
# Process file
|
||||
text = await process_file_by_type(file)
|
||||
|
||||
return ApiResponse.ok(
|
||||
data={"chunks": len(chunks)},
|
||||
message=f"Successfully split into {len(chunks)} chunks"
|
||||
)
|
||||
# Update file status
|
||||
file.status = "processing"
|
||||
await db.commit()
|
||||
|
||||
# Split text
|
||||
kwargs = {"chunk_size": request.chunk_size, "overlap": request.overlap}
|
||||
if request.method == "custom" and request.separator:
|
||||
kwargs["separator"] = request.separator
|
||||
|
||||
splitter = get_splitter(request.method, **kwargs)
|
||||
split_results = splitter.split(text)
|
||||
|
||||
# Save chunks
|
||||
chunks = []
|
||||
for chunk_data in split_results:
|
||||
db_chunk = Chunk(
|
||||
project_id=project_id,
|
||||
file_id=file.id,
|
||||
name=chunk_data.get("name", f"Chunk {chunk_data['index'] + 1}"),
|
||||
content=chunk_data["content"],
|
||||
word_count=chunk_data.get("word_count", len(chunk_data["content"].split()))
|
||||
)
|
||||
db.add(db_chunk)
|
||||
chunks.append(db_chunk)
|
||||
|
||||
await db.commit()
|
||||
|
||||
# Save processed markdown to ready directory
|
||||
ready_dir = get_project_ready_dir(str(project_id))
|
||||
md_filename = f"{file.id}_{file.filename}.md"
|
||||
md_path = ready_dir / md_filename
|
||||
|
||||
# Write markdown content to file
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(
|
||||
None,
|
||||
lambda: md_path.write_text(text, encoding='utf-8')
|
||||
)
|
||||
|
||||
# Update file path to ready location
|
||||
file.file_path = str(md_path)
|
||||
file.status = "completed"
|
||||
await db.commit()
|
||||
|
||||
# 记录成功日志
|
||||
log_success(
|
||||
"文件处理完成",
|
||||
project_id=str(project_id),
|
||||
file_id=str(file.id),
|
||||
filename=file.filename,
|
||||
chunk_count=len(chunks),
|
||||
text_length=len(text),
|
||||
ready_path=str(md_path)
|
||||
)
|
||||
|
||||
return ApiResponse.ok(
|
||||
data={"chunks": len(chunks)},
|
||||
message=f"Successfully split into {len(chunks)} chunks"
|
||||
)
|
||||
except Exception as e:
|
||||
# 记录失败日志
|
||||
log_failure(
|
||||
"文件处理失败",
|
||||
project_id=str(project_id),
|
||||
file_id=str(request.file_id),
|
||||
error=str(e)
|
||||
)
|
||||
raise
|
||||
|
||||
|
||||
@router.get("", response_model=ApiResponse)
|
||||
|
||||
Reference in New Issue
Block a user