文件上传页面功能基本集成完成

2026-01-19 17:28:58 +08:00
parent 88eaa33db0
commit bfaeb24d9e
10 changed files with 16758 additions and 248 deletions
--- a/src/api/init.py
+++ b/src/api/init.py
@@ -0,0 +1,9 @@
+"""
+API 路由包
+"""
+from .datasets import datasets_bp
+
+# Blueprint registration entry point
+def register_blueprints(app):
+    """Attach every API blueprint exported by this package to ``app``."""
+    for blueprint in (datasets_bp,):
+        app.register_blueprint(blueprint)
--- a/src/api/datasets.py
+++ b/src/api/datasets.py
@@ -0,0 +1,430 @@
+"""
+数据集管理 API 路由
+"""
+import io
+import os
+import time
+import zipfile
+from flask import Blueprint, request, jsonify, send_from_directory, Response
+from werkzeug.utils import secure_filename
+
+# Project root: three directory levels above this file's absolute path
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Folder under the project root where uploaded dataset files are written
+DATASET_FOLDER = os.path.join(PROJECT_ROOT, 'datasets')
+# Lower-case file extensions accepted by allowed_file()
+ALLOWED_EXTENSIONS = {'jsonl', 'json', 'xls', 'xlsx'}
+
+# Blueprint that carries the dataset-management routes (URL prefix /api/dataset-manage)
+datasets_bp = Blueprint('datasets', __name__, url_prefix='/api/dataset-manage')
+
+
+def get_db_connection():
+    """Open a new MySQL connection configured from ``config.yaml``.
+
+    The YAML file at the project root is re-read on every call and its
+    ``database`` section supplies the connection parameters. Cursors created
+    from the connection use ``DictCursor``. The caller is responsible for
+    closing the returned connection.
+    """
+    import pymysql
+    import yaml
+
+    config_path = os.path.join(PROJECT_ROOT, 'config.yaml')
+    with open(config_path, 'r', encoding='utf-8') as config_file:
+        settings = yaml.safe_load(config_file)['database']
+
+    connect_kwargs = {
+        'host': settings['host'],
+        'port': settings['port'],
+        'user': settings['username'],
+        'password': settings['password'],
+        'database': settings['name'],
+        # charset is optional in the config file
+        'charset': settings.get('charset', 'utf8mb4'),
+        'cursorclass': pymysql.cursors.DictCursor,
+    }
+    return pymysql.connect(**connect_kwargs)
+
+
+def format_file_size(size_bytes):
+    """Render a byte count as a human-readable string.
+
+    Values below 1024 are shown verbatim with a ``B`` suffix; larger values
+    are scaled to KB, MB or GB (binary multiples) with one decimal place.
+    GB is the largest unit used.
+    """
+    if size_bytes < 1024:
+        return f"{size_bytes} B"
+
+    # (exclusive upper bound, divisor, suffix) for each scaled unit below GB
+    scaled_units = (
+        (1024 * 1024, 1024, 'KB'),
+        (1024 * 1024 * 1024, 1024 * 1024, 'MB'),
+    )
+    for upper_bound, divisor, suffix in scaled_units:
+        if size_bytes < upper_bound:
+            return f"{size_bytes / divisor:.1f} {suffix}"
+
+    return f"{size_bytes / (1024 * 1024 * 1024):.1f} GB"
+
+
+def allowed_file(filename):
+    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS."""
+    # rpartition yields an empty separator when the name contains no dot
+    _, dot, extension = filename.rpartition('.')
+    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
+
+
+def generic_get_by_id(table_name, id_val):
+    """Fetch a single row from ``table_name`` by its ``id`` column.
+
+    Args:
+        table_name: Name of the table to query. It is interpolated into the
+            SQL text (identifiers cannot be bound as parameters), so it must
+            be a plain identifier made of ASCII letters, digits and underscores.
+        id_val: Value matched against the ``id`` column (bound parameter).
+
+    Returns:
+        The row returned by ``cursor.fetchone()``, or ``None`` when no row
+        matches.
+
+    Raises:
+        ValueError: If ``table_name`` is not a plain identifier.
+    """
+    import re
+
+    # Reject anything that could smuggle SQL through the interpolated name.
+    if not isinstance(table_name, str) or not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', table_name):
+        raise ValueError(f"invalid table name: {table_name!r}")
+
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute(f"SELECT * FROM {table_name} WHERE id = %s", (id_val,))
+            return cursor.fetchone()
+    finally:
+        # Always release the connection, even when the query raises.
+        conn.close()
+
+
+# ============ 数据集管理 CRUD ============
+
+@datasets_bp.route('/<int:id>', methods=['GET'])
+def get_dataset(id):
+    """Return one dataset together with its file list.
+
+    Args:
+        id: Dataset primary key taken from the URL. The name must match the
+            ``<int:id>`` route variable, so it is kept despite shadowing the
+            builtin.
+
+    Returns:
+        JSON ``{'code': 0, 'data': dataset}`` where ``dataset['files']`` holds
+        the rows of ``dataset_files`` (newest first), each extended with
+        ``file_size_formatted``; or ``{'code': 1, ...}`` when the dataset does
+        not exist.
+    """
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute("SELECT * FROM dataset_manage WHERE id = %s", (id,))
+            dataset = cursor.fetchone()
+
+            if not dataset:
+                return jsonify({'code': 1, 'message': '数据集不存在'})
+
+            # Files attached to this dataset, newest first
+            cursor.execute(
+                "SELECT id, file_name, file_path, file_size, file_type, create_time FROM dataset_files WHERE dataset_id = %s ORDER BY create_time DESC",
+                (id,)
+            )
+            files = cursor.fetchall()
+    finally:
+        # Release the connection on every path, including query failures.
+        conn.close()
+
+    for f in files:
+        # A NULL file_size is treated as 0 instead of crashing the formatter.
+        f['file_size_formatted'] = format_file_size(f['file_size'] or 0)
+
+    dataset['files'] = files
+    return jsonify({'code': 0, 'data': dataset})
+
+
+@datasets_bp.route('', methods=['GET'])
+def get_datasets():
+    """Return every row of ``dataset_manage``, newest first.
+
+    Returns:
+        JSON ``{'code': 0, 'data': [...]}``.
+    """
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute("SELECT * FROM dataset_manage ORDER BY create_time DESC")
+            result = cursor.fetchall()
+    finally:
+        # Release the connection even when the query raises.
+        conn.close()
+    return jsonify({'code': 0, 'data': result})
+
+
+@datasets_bp.route('', methods=['POST'])
+def create_dataset():
+    """Create a ``dataset_manage`` row from the JSON request body.
+
+    Each key of the JSON object becomes a column name and each value a bound
+    parameter. Because column names are interpolated into the SQL text, every
+    key must be a plain identifier (ASCII letters, digits, underscores);
+    anything else is rejected before the database is touched.
+
+    Returns:
+        JSON ``{'code': 0, 'message': ..., 'id': new_id}`` on success, or
+        ``{'code': 1, 'message': ...}`` when the body is missing/invalid or
+        the insert fails.
+    """
+    import re
+
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict) or not data:
+        return jsonify({'code': 1, 'message': '创建失败: 请求体必须是非空的JSON对象'})
+
+    # Column names cannot be parameterized, so whitelist their shape instead.
+    bad_keys = [k for k in data if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', str(k))]
+    if bad_keys:
+        return jsonify({'code': 1, 'message': f'创建失败: 非法字段名 {bad_keys[0]}'})
+
+    columns = ', '.join(f"`{k}`" for k in data)
+    placeholders = ', '.join(['%s'] * len(data))
+    sql = f"INSERT INTO dataset_manage ({columns}) VALUES ({placeholders})"
+
+    conn = None
+    try:
+        conn = get_db_connection()
+        with conn.cursor() as cursor:
+            cursor.execute(sql, list(data.values()))
+            new_id = cursor.lastrowid
+        conn.commit()
+        return jsonify({'code': 0, 'message': '创建成功', 'id': new_id})
+    except Exception as e:
+        return jsonify({'code': 1, 'message': f'创建失败: {str(e)}'})
+    finally:
+        # The original leaked the connection whenever the insert failed.
+        if conn is not None:
+            conn.close()
+
+
+@datasets_bp.route('/<int:id>', methods=['PUT'])
+def update_dataset(id):
+    """Update columns of one ``dataset_manage`` row from the JSON request body.
+
+    Args:
+        id: Dataset primary key taken from the URL (name fixed by the
+            ``<int:id>`` route variable).
+
+    Each key of the JSON object becomes a column in the ``SET`` clause and
+    each value a bound parameter. Keys must be plain identifiers (ASCII
+    letters, digits, underscores) because they are interpolated into the SQL.
+
+    Returns:
+        JSON ``{'code': 0, ...}`` on success, or ``{'code': 1, ...}`` when the
+        body is missing, empty, or contains an invalid column name. Database
+        errors propagate to the caller.
+    """
+    import re
+
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict) or not data:
+        # An empty body would otherwise build "SET  WHERE id = ..." (invalid SQL).
+        return jsonify({'code': 1, 'message': '更新失败: 请求体必须是非空的JSON对象'})
+
+    bad_keys = [k for k in data if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', str(k))]
+    if bad_keys:
+        return jsonify({'code': 1, 'message': f'更新失败: 非法字段名 {bad_keys[0]}'})
+
+    set_clause = ', '.join(f"`{k}` = %s" for k in data)
+    sql = f"UPDATE dataset_manage SET {set_clause} WHERE id = %s"
+    values = list(data.values()) + [id]
+
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute(sql, values)
+        conn.commit()
+    finally:
+        conn.close()
+    return jsonify({'code': 0, 'message': '更新成功'})
+
+
+@datasets_bp.route('/<int:id>', methods=['DELETE'])
+def delete_dataset(id):
+    """Delete a dataset, its file records, and the files on disk.
+
+    Args:
+        id: Dataset primary key taken from the URL (name fixed by the
+            ``<int:id>`` route variable).
+
+    The database rows are deleted and committed first; the physical files are
+    removed only afterwards, so a failed transaction never leaves records
+    pointing at files that are already gone. File removal is best-effort:
+    failures are printed and do not change the response.
+
+    Returns:
+        JSON ``{'code': 0, 'message': ...}``.
+    """
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            # Remember the paths before the rows disappear
+            cursor.execute("SELECT file_path FROM dataset_files WHERE dataset_id = %s", (id,))
+            files = cursor.fetchall()
+            cursor.execute("DELETE FROM dataset_files WHERE dataset_id = %s", (id,))
+            cursor.execute("DELETE FROM dataset_manage WHERE id = %s", (id,))
+        conn.commit()
+    finally:
+        conn.close()
+
+    for f in files:
+        file_path = f.get('file_path')
+        if not file_path:
+            continue
+        try:
+            os.remove(file_path)
+        except FileNotFoundError:
+            # Already gone: nothing to clean up
+            pass
+        except OSError as e:
+            print(f"删除文件失败: {file_path}, {e}")
+
+    return jsonify({'code': 0, 'message': '删除成功'})
+
+
+# ============ 数据集文件上传接口 ============
+
+@datasets_bp.route('/upload/<int:dataset_id>', methods=['POST'])
+def upload_dataset_file(dataset_id):
+    """Store uploaded files for a dataset and record them in ``dataset_files``.
+
+    Files arrive in the multipart field ``files``. Each accepted file is saved
+    in DATASET_FOLDER as ``<ms timestamp>_<dataset_id>_<safe name>`` and a row
+    is inserted; afterwards the dataset's ``file_count`` and ``size`` columns
+    are recomputed from ``dataset_files``.
+
+    Args:
+        dataset_id: Primary key of the target dataset (from the URL).
+
+    Returns:
+        JSON with ``code`` 0 when at least one file was stored (``data`` holds
+        ``uploaded``, ``file_count`` and, on partial success, ``errors``), or
+        ``code`` 1 when the dataset is missing, no file was sent, or every
+        file was rejected.
+    """
+    # The dataset must exist before anything is written
+    dataset = generic_get_by_id('dataset_manage', dataset_id)
+    if not dataset:
+        return jsonify({'code': 1, 'message': '数据集不存在'})
+
+    # Make sure the flat upload directory exists
+    os.makedirs(DATASET_FOLDER, exist_ok=True)
+
+    if 'files' not in request.files:
+        return jsonify({'code': 1, 'message': '没有文件被上传'})
+
+    uploaded_files = []
+    errors = []
+    # Initialised up front so the final response never hits an unbound name.
+    file_count = 0
+
+    # One connection serves the whole request instead of one per file.
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            for file in request.files.getlist('files'):
+                if not file.filename:
+                    continue
+
+                if not allowed_file(file.filename):
+                    errors.append(f"{file.filename}: 文件类型不支持")
+                    continue
+
+                # Take the extension from the original name: secure_filename()
+                # drops non-ASCII characters, so a name whose stem is entirely
+                # non-ASCII collapses to just "json" and no longer has a dot.
+                ext = file.filename.rsplit('.', 1)[1].lower()
+                filename = secure_filename(file.filename)
+                if '.' not in filename:
+                    filename = f"file.{ext}"
+
+                # Timestamp + dataset id keep stored names from colliding
+                timestamp = int(time.time() * 1000)
+                new_filename = f"{timestamp}_{dataset_id}_{filename}"
+                file_path = os.path.join(DATASET_FOLDER, new_filename)
+
+                file.save(file_path)
+                file_size = os.path.getsize(file_path)
+
+                try:
+                    cursor.execute(
+                        "INSERT INTO dataset_files (dataset_id, file_name, file_path, file_size, file_type) VALUES (%s, %s, %s, %s, %s)",
+                        (dataset_id, filename, file_path, file_size, ext)
+                    )
+                    conn.commit()
+                except Exception:
+                    # Do not leave an untracked file behind when the insert fails.
+                    if os.path.exists(file_path):
+                        os.remove(file_path)
+                    raise
+
+                uploaded_files.append({
+                    'name': filename,
+                    'size': file_size,
+                    'size_formatted': format_file_size(file_size)
+                })
+
+            # Refresh the dataset totals only when something was stored
+            if uploaded_files:
+                cursor.execute("SELECT COUNT(*) as count, SUM(file_size) as total_size FROM dataset_files WHERE dataset_id = %s", (dataset_id,))
+                result = cursor.fetchone()
+                file_count = result['count'] or 0
+                total_size = result['total_size'] or 0
+
+                cursor.execute(
+                    "UPDATE dataset_manage SET file_count = %s, size = %s WHERE id = %s",
+                    (file_count, format_file_size(total_size), dataset_id)
+                )
+                conn.commit()
+    finally:
+        conn.close()
+
+    if not uploaded_files:
+        if errors:
+            # Every file was rejected: report failure, not partial success.
+            return jsonify({
+                'code': 1,
+                'message': f'文件上传失败，{len(errors)}个文件失败',
+                'data': {
+                    'uploaded': uploaded_files,
+                    'errors': errors
+                }
+            })
+        # Only empty file parts were submitted
+        return jsonify({'code': 1, 'message': '没有文件被上传'})
+
+    if errors:
+        return jsonify({
+            'code': 0,
+            'message': f'部分文件上传成功，{len(errors)}个文件失败',
+            'data': {
+                'uploaded': uploaded_files,
+                'errors': errors,
+                'file_count': file_count
+            }
+        })
+
+    return jsonify({
+        'code': 0,
+        'message': f'成功上传 {len(uploaded_files)} 个文件',
+        'data': {
+            'uploaded': uploaded_files,
+            'file_count': file_count
+        }
+    })
+
+
+@datasets_bp.route('/<int:dataset_id>/files', methods=['GET'])
+def get_dataset_files(dataset_id):
+    """List the files recorded for a dataset, newest first.
+
+    Args:
+        dataset_id: Primary key of the dataset (from the URL).
+
+    Returns:
+        JSON ``{'code': 0, 'data': files}`` where each row additionally
+        carries ``file_size_formatted``.
+    """
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute(
+                "SELECT id, file_name, file_path, file_size, file_type, create_time FROM dataset_files WHERE dataset_id = %s ORDER BY create_time DESC",
+                (dataset_id,)
+            )
+            files = cursor.fetchall()
+    finally:
+        # Release the connection even when the query raises.
+        conn.close()
+
+    for f in files:
+        # A NULL file_size is treated as 0 instead of crashing the formatter.
+        f['file_size_formatted'] = format_file_size(f['file_size'] or 0)
+
+    return jsonify({'code': 0, 'data': files})
+
+
+@datasets_bp.route('/files/<int:file_id>', methods=['DELETE'])
+def delete_dataset_file(file_id):
+    """Delete one dataset file: its record, then the file on disk.
+
+    Args:
+        file_id: Primary key of the ``dataset_files`` row (from the URL).
+
+    The record is deleted and the owning dataset's ``file_count`` / ``size``
+    are recomputed in one transaction. The physical file is removed only
+    after the commit succeeds, so a failed transaction cannot leave a record
+    pointing at a file that no longer exists. File removal is best-effort.
+
+    Returns:
+        JSON ``{'code': 0, ...}`` on success, ``{'code': 1, ...}`` when the
+        record does not exist.
+    """
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute("SELECT dataset_id, file_path FROM dataset_files WHERE id = %s", (file_id,))
+            file_info = cursor.fetchone()
+
+            if not file_info:
+                return jsonify({'code': 1, 'message': '文件不存在'})
+
+            cursor.execute("DELETE FROM dataset_files WHERE id = %s", (file_id,))
+
+            # Recompute the dataset totals from the remaining rows
+            dataset_id = file_info['dataset_id']
+            cursor.execute("SELECT COUNT(*) as count, SUM(file_size) as total_size FROM dataset_files WHERE dataset_id = %s", (dataset_id,))
+            result = cursor.fetchone()
+            file_count = result['count'] or 0
+            total_size = result['total_size'] or 0
+
+            cursor.execute(
+                "UPDATE dataset_manage SET file_count = %s, size = %s WHERE id = %s",
+                (file_count, format_file_size(total_size), dataset_id)
+            )
+        conn.commit()
+    finally:
+        conn.close()
+
+    # Remove the physical file only once the database change is durable
+    file_path = file_info['file_path']
+    if file_path:
+        try:
+            os.remove(file_path)
+        except FileNotFoundError:
+            # Already gone: nothing to clean up
+            pass
+        except OSError as e:
+            print(f"删除文件失败: {file_path}, {e}")
+
+    return jsonify({'code': 0, 'message': '删除成功'})
+
+
+# ============ 文件下载接口 ============
+
+@datasets_bp.route('/download/<int:dataset_id>/<filename>', methods=['GET'])
+def download_dataset_file(dataset_id, filename):
+    """Send one stored file as an attachment.
+
+    Files live directly in DATASET_FOLDER, so ``filename`` is resolved
+    against that folder.
+    NOTE(review): ``dataset_id`` is accepted from the URL but not consulted
+    here; confirm whether the file should be checked against the dataset.
+    """
+    storage_root = DATASET_FOLDER
+    return send_from_directory(storage_root, filename, as_attachment=True)
+
+
+@datasets_bp.route('/download/<int:dataset_id>', methods=['GET'])
+def download_dataset_all(dataset_id):
+    """Send every file of a dataset as a single in-memory ZIP archive.
+
+    Args:
+        dataset_id: Primary key of the dataset (from the URL).
+
+    Files whose recorded path no longer exists on disk are skipped. Archive
+    entries use the recorded ``file_name``; when two records share a name the
+    later one is prefixed with its record id so no entry is duplicated.
+
+    Returns:
+        A ``application/zip`` response named ``<dataset name>_<id>.zip``, or
+        JSON ``{'code': 1, ...}`` when the dataset does not exist or has no
+        file records.
+    """
+    from urllib.parse import quote
+
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute("SELECT name FROM dataset_manage WHERE id = %s", (dataset_id,))
+            dataset = cursor.fetchone()
+
+            if not dataset:
+                return jsonify({'code': 1, 'message': '数据集不存在'})
+
+            cursor.execute(
+                "SELECT id, file_name, file_path FROM dataset_files WHERE dataset_id = %s ORDER BY create_time DESC",
+                (dataset_id,)
+            )
+            files = cursor.fetchall()
+    finally:
+        # Release the connection before the (potentially slow) zipping starts.
+        conn.close()
+
+    if not files:
+        return jsonify({'code': 1, 'message': '数据集没有文件'})
+
+    memory_file = io.BytesIO()
+    used_names = set()
+    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zf:
+        for f in files:
+            file_path = f.get('file_path')
+            if not (file_path and os.path.exists(file_path)):
+                continue
+            arcname = f['file_name']
+            if arcname in used_names:
+                # Same original name uploaded more than once: keep both entries
+                arcname = f"{f['id']}_{arcname}"
+            used_names.add(arcname)
+            zf.write(file_path, arcname)
+
+    zip_name = f"{dataset['name'] or 'dataset'}_{dataset_id}.zip"
+    # Header values must be ASCII-safe: give a plain fallback name plus the
+    # real (possibly non-ASCII) name percent-encoded via filename*.
+    ascii_fallback = f"dataset_{dataset_id}.zip"
+    disposition = (
+        f'attachment; filename="{ascii_fallback}"; '
+        f"filename*=UTF-8''{quote(zip_name, safe='')}"
+    )
+    return Response(
+        memory_file.getvalue(),
+        mimetype='application/zip',
+        headers={'Content-Disposition': disposition}
+    )
+
+
+# ============ 文件预览接口 ============
+
+@datasets_bp.route('/preview/<int:file_id>', methods=['GET'])
+def preview_dataset_file(file_id):
+    """Return the beginning of a dataset file as text for previewing.
+
+    Args:
+        file_id: Primary key of the ``dataset_files`` row (from the URL).
+
+    The file is opened as UTF-8 text and at most 102400 characters are read.
+    Spreadsheet files (``xls`` / ``xlsx``) are binary and are rejected with an
+    explicit message instead of a decoding error.
+
+    Returns:
+        JSON ``{'code': 0, 'data': {'file_name', 'content'}}`` on success, or
+        ``{'code': 1, 'message': ...}`` when the record or file is missing,
+        the type cannot be previewed, or reading fails.
+    """
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute("SELECT id, file_name, file_path, file_type FROM dataset_files WHERE id = %s", (file_id,))
+            file_info = cursor.fetchone()
+    finally:
+        # The database is not needed past this point; release it before file I/O.
+        conn.close()
+
+    if not file_info:
+        return jsonify({'code': 1, 'message': '文件不存在'})
+
+    file_path = file_info['file_path']
+    if not file_path or not os.path.exists(file_path):
+        return jsonify({'code': 1, 'message': '文件不存在'})
+
+    if (file_info.get('file_type') or '').lower() in ('xls', 'xlsx'):
+        return jsonify({'code': 1, 'message': '读取文件失败: 该文件类型不支持预览'})
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read(102400)  # preview cap
+    except (OSError, UnicodeDecodeError) as e:
+        return jsonify({'code': 1, 'message': f'读取文件失败: {str(e)}'})
+
+    return jsonify({
+        'code': 0,
+        'data': {
+            'file_name': file_info['file_name'],
+            'content': content
+        }
+    })