feat(backend): 完善日志系统，支持按日期分目录存储

- 实现 logs/YYYY-MM-DD/ 日期文件夹结构
- 添加 success.log 和 failure.log 专用日志
- 使用 TimedRotatingFileHandler 实现按天切割
- 添加 log_success 和 log_failure 便捷函数
- 集成 markitdown 进行文件转换
- 优化文件存储路径，按项目ID分类存储

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 10:44:09 +08:00
parent 7514e7e763
commit 68453cead8
5 changed files with 276 additions and 112 deletions
--- a/backend/app/api/v1/chunks/__init__.py
+++ b/backend/app/api/v1/chunks/__init__.py
@@ -2,6 +2,7 @@
 Chunks API Router
 """
 import asyncio
+from pathlib import Path
 from typing import List, Optional
 from uuid import UUID
 from pydantic import BaseModel, Field
@@ -13,19 +14,28 @@ from app.api.response import ApiResponse, PaginatedResponse
 from app.core.database import get_db
 from app.core.exceptions import NotFoundException
 from app.core.crud import CRUDBase
+from app.core.logging import log_success, log_failure
 from app.models.models import Chunk, File
 from app.schemas.chunk import ChunkResponse
 from app.schemas.chunk import ChunkCreateSchema
 from app.services.text_splitter.splitter import get_splitter
-from app.services.file_processor.pdf_processor import process_pdf
-from app.services.file_processor.docx_processor import process_docx
-from app.services.file_processor.excel_processor import process_csv, process_excel
+from markitdown import MarkItDown

 router = APIRouter()

 # Initialize CRUD
 chunk_crud = CRUDBase(Chunk)

+# Module-level MarkItDown converter, created once at import time and reused by process_file_by_type
+markitdown = MarkItDown()
+
+
+def get_project_ready_dir(project_id: str) -> Path:
+    """Return the "ready" directory for *project_id*, creating it (with parents) if missing."""
+    ready_dir = Path("/data/code/YG-Datasets/data", project_id, "ready")
+    ready_dir.mkdir(parents=True, exist_ok=True)
+    return ready_dir
+

 class SplitRequest(BaseModel):
    """Request model for splitting text"""
@@ -37,28 +47,29 @@ class SplitRequest(BaseModel):


 async def process_file_by_type(file: File) -> str:
-    """Process file based on its type"""
+    """Return the file's text, converting markitdown-supported types to Markdown first."""
    if not file.file_path:
        raise NotFoundException("File", file.id)

-    processors = {
-        "pdf": process_pdf,
-        "docx": process_docx,
-        "xlsx": process_excel,
-        "csv": process_csv,
-    }
+    # Supported types for markitdown
+    markitdown_types = ["pdf", "docx", "doc", "pptx", "ppt", "xlsx", "xls", "htm", "html"]

-    processor = processors.get(file.file_type)
-    if not processor:
-        # Return raw text for txt, md files
+    if file.file_type in markitdown_types:
+        # Use markitdown to convert to markdown
         loop = asyncio.get_event_loop()
-        content = await loop.run_in_executor(
+        result = await loop.run_in_executor(
             None,
-            lambda: open(file.file_path, 'r', encoding='utf-8').read()
+            lambda: markitdown.convert(file.file_path)
         )
-        return content
+        return result.text_content

-    return await processor(file.file_path)
+    # Any other type (e.g. txt, md): read as UTF-8 text; read_text() closes the file handle
+    loop = asyncio.get_event_loop()
+    content = await loop.run_in_executor(
+        None,
+        lambda: Path(file.file_path).read_text(encoding='utf-8')
+    )
+    return content


@router.post("/split", response_model=ApiResponse)
@@ -68,52 +79,97 @@ async def split_text(
    db: AsyncSession = Depends(get_db)
 ):
    """Split text into chunks"""
-    # Get file
-    result = await db.execute(
-        select(File).where(File.id == request.file_id, File.project_id == project_id)
-    )
-    file = result.scalar_one_or_none()
-    if not file:
-        raise NotFoundException("File", request.file_id)
-
-    # Process file
-    text = await process_file_by_type(file)
-
-    # Update file status
-    file.status = "processing"
-    await db.commit()
-
-    # Split text
-    kwargs = {"chunk_size": request.chunk_size, "overlap": request.overlap}
-    if request.method == "custom" and request.separator:
-        kwargs["separator"] = request.separator
-
-    splitter = get_splitter(request.method, **kwargs)
-    split_results = splitter.split(text)
-
-    # Save chunks
-    chunks = []
-    for chunk_data in split_results:
-        db_chunk = Chunk(
-            project_id=project_id,
-            file_id=file.id,
-            name=chunk_data.get("name", f"Chunk {chunk_data['index'] + 1}"),
-            content=chunk_data["content"],
-            word_count=chunk_data.get("word_count", len(chunk_data["content"].split()))
+    try:
+        # Get file
+        result = await db.execute(
+            select(File).where(File.id == request.file_id, File.project_id == project_id)
        )
-        db.add(db_chunk)
-        chunks.append(db_chunk)
+        file = result.scalar_one_or_none()
+        if not file:
+            raise NotFoundException("File", request.file_id)

-    await db.commit()
+        # 记录开始处理
+        log_success(
+            "开始处理文件",
+            project_id=str(project_id),
+            file_id=str(file.id),
+            filename=file.filename,
+            method=request.method,
+            chunk_size=request.chunk_size,
+            overlap=request.overlap
+        )

-    # Update file status
-    file.status = "completed"
-    await db.commit()
+        # Process file
+        text = await process_file_by_type(file)

-    return ApiResponse.ok(
-        data={"chunks": len(chunks)},
-        message=f"Successfully split into {len(chunks)} chunks"
-    )
+        # Update file status
+        file.status = "processing"
+        await db.commit()
+
+        # Split text
+        kwargs = {"chunk_size": request.chunk_size, "overlap": request.overlap}
+        if request.method == "custom" and request.separator:
+            kwargs["separator"] = request.separator
+
+        splitter = get_splitter(request.method, **kwargs)
+        split_results = splitter.split(text)
+
+        # Save chunks
+        chunks = []
+        for chunk_data in split_results:
+            db_chunk = Chunk(
+                project_id=project_id,
+                file_id=file.id,
+                name=chunk_data.get("name", f"Chunk {chunk_data['index'] + 1}"),
+                content=chunk_data["content"],
+                word_count=chunk_data.get("word_count", len(chunk_data["content"].split()))
+            )
+            db.add(db_chunk)
+            chunks.append(db_chunk)
+
+        await db.commit()
+
+        # Save processed markdown to ready directory
+        ready_dir = get_project_ready_dir(str(project_id))
+        md_filename = f"{file.id}_{file.filename}.md"
+        md_path = ready_dir / md_filename
+
+        # Write markdown content to file
+        loop = asyncio.get_event_loop()
+        await loop.run_in_executor(
+            None,
+            lambda: md_path.write_text(text, encoding='utf-8')
+        )
+
+        # Update file path to ready location
+        file.file_path = str(md_path)
+        file.status = "completed"
+        await db.commit()
+
+        # 记录成功日志
+        log_success(
+            "文件处理完成",
+            project_id=str(project_id),
+            file_id=str(file.id),
+            filename=file.filename,
+            chunk_count=len(chunks),
+            text_length=len(text),
+            ready_path=str(md_path)
+        )
+
+        return ApiResponse.ok(
+            data={"chunks": len(chunks)},
+            message=f"Successfully split into {len(chunks)} chunks"
+        )
+    except Exception as e:
+        # 记录失败日志
+        log_failure(
+            "文件处理失败",
+            project_id=str(project_id),
+            file_id=str(request.file_id),
+            error=str(e)
+        )
+        raise


@router.get("", response_model=ApiResponse)