模型开始训练界面以及查看日志功能完善
This commit is contained in:
@@ -16,3 +16,6 @@ app:
|
|||||||
|
|
||||||
# 密钥配置
|
# 密钥配置
|
||||||
secret_key: "yg-ft-platform-secret-key-2024"
|
secret_key: "yg-ft-platform-secret-key-2024"
|
||||||
|
|
||||||
|
# 训练日志路径
|
||||||
|
training_logs_path: "/app/base/training_logs"
|
||||||
|
|||||||
@@ -7,3 +7,4 @@ requests==2.31.0
|
|||||||
psutil==5.9.8
|
psutil==5.9.8
|
||||||
werkzeug==3.0.1
|
werkzeug==3.0.1
|
||||||
pynvml==11.5.0
|
pynvml==11.5.0
|
||||||
|
tensorboard>=2.13.0
|
||||||
|
|||||||
@@ -3,16 +3,31 @@
|
|||||||
调用 llamafactory-cli 执行训练任务
|
调用 llamafactory-cli 执行训练任务
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import subprocess
|
import subprocess
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import signal
|
||||||
|
import yaml
|
||||||
from flask import Blueprint, request, jsonify
|
from flask import Blueprint, request, jsonify
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
# 添加项目根目录到路径
|
||||||
|
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
sys.path.insert(0, PROJECT_ROOT)
|
||||||
|
|
||||||
|
# 加载配置
|
||||||
|
CONFIG_PATH = os.path.join(PROJECT_ROOT, 'config.yaml')
|
||||||
|
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
|
||||||
|
CONFIG = yaml.safe_load(f)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
train_logger = logging.getLogger('train') # 专门的训练日志 logger,输出到 train.log
|
train_logger = logging.getLogger('train') # 专门的训练日志 logger,输出到 train.log
|
||||||
|
|
||||||
|
# 从配置获取训练日志路径
|
||||||
|
TRAINING_LOGS_DIR = CONFIG.get('training_logs_path', '/app/base/training_logs')
|
||||||
|
|
||||||
# 创建蓝图
|
# 创建蓝图
|
||||||
fine_tune_bp = Blueprint('fine_tune', __name__, url_prefix='/api/fine-tune')
|
fine_tune_bp = Blueprint('fine_tune', __name__, url_prefix='/api/fine-tune')
|
||||||
|
|
||||||
@@ -72,21 +87,21 @@ def start_training():
|
|||||||
|
|
||||||
train_logger.info(f"[TRAIN] 模型路径: {model_path}")
|
train_logger.info(f"[TRAIN] 模型路径: {model_path}")
|
||||||
|
|
||||||
# 设置工作目录为 llamafactory 目录
|
# 设置工作目录和 llamafactory 目录
|
||||||
llamafactory_dir = '/app/src/llamafactory'
|
work_dir = '/app/base'
|
||||||
|
llamafactory_dir = '/app/base'
|
||||||
|
|
||||||
# 处理数据集文件:将数据集复制到 llamafactory 的 datasets 目录
|
# 数据集目录直接使用 /app/base/datasets(不再复制)
|
||||||
|
datasets_dir = '/app/base/datasets'
|
||||||
|
|
||||||
|
# 获取数据集名称(用于 --dataset 参数)
|
||||||
|
dataset_key = None
|
||||||
dataset_id = data.get('train_dataset_id')
|
dataset_id = data.get('train_dataset_id')
|
||||||
try:
|
try:
|
||||||
dataset_id_int = int(dataset_id) if str(dataset_id).isdigit() else None
|
dataset_id_int = int(dataset_id) if str(dataset_id).isdigit() else None
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
dataset_id_int = None
|
dataset_id_int = None
|
||||||
|
|
||||||
llamafactory_datasets_dir = os.path.join(llamafactory_dir, 'datasets')
|
|
||||||
os.makedirs(llamafactory_datasets_dir, exist_ok=True)
|
|
||||||
|
|
||||||
# 获取数据集名称(用于 --dataset 参数)
|
|
||||||
dataset_key = None
|
|
||||||
if dataset_id_int:
|
if dataset_id_int:
|
||||||
from .datasets import get_db_connection as get_dataset_conn
|
from .datasets import get_db_connection as get_dataset_conn
|
||||||
conn = get_dataset_conn()
|
conn = get_dataset_conn()
|
||||||
@@ -94,43 +109,8 @@ def start_training():
|
|||||||
cursor.execute("SELECT dm.name FROM dataset_manage dm WHERE dm.id = %s", (dataset_id_int,))
|
cursor.execute("SELECT dm.name FROM dataset_manage dm WHERE dm.id = %s", (dataset_id_int,))
|
||||||
dataset_result = cursor.fetchone()
|
dataset_result = cursor.fetchone()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
dataset_key = dataset_result['name'] if dataset_result else None
|
dataset_key = dataset_result['name'] if dataset_result else None
|
||||||
|
train_logger.info(f"[TRAIN] 数据集名称: {dataset_key}")
|
||||||
if dataset_key:
|
|
||||||
# 从 dataset_info.json 读取实际文件名
|
|
||||||
src_info_json = os.path.join('/app/base', 'datasets', 'dataset_info.json')
|
|
||||||
actual_file_name = None
|
|
||||||
if os.path.exists(src_info_json):
|
|
||||||
import json as json_lib
|
|
||||||
with open(src_info_json, 'r', encoding='utf-8') as f:
|
|
||||||
dataset_info = json_lib.load(f)
|
|
||||||
if dataset_key in dataset_info:
|
|
||||||
actual_file_name = dataset_info[dataset_key].get('file_name')
|
|
||||||
train_logger.info(f"[TRAIN] 从 dataset_info.json 获取文件名: {dataset_key} -> {actual_file_name}")
|
|
||||||
|
|
||||||
# 复制数据集文件到 llamafactory 目录
|
|
||||||
if actual_file_name:
|
|
||||||
src_file = os.path.join('/app/base', 'datasets', actual_file_name)
|
|
||||||
dst_file = os.path.join(llamafactory_datasets_dir, actual_file_name)
|
|
||||||
if os.path.exists(src_file):
|
|
||||||
import shutil
|
|
||||||
shutil.copy2(src_file, dst_file)
|
|
||||||
train_logger.info(f"[TRAIN] 复制数据集文件: {src_file} -> {dst_file}")
|
|
||||||
else:
|
|
||||||
train_logger.warning(f"[TRAIN] 数据集文件不存在: {src_file}")
|
|
||||||
|
|
||||||
# 复制 dataset_info.json 到 llamafactory datasets 目录
|
|
||||||
src_info_json = os.path.join('/app/base', 'datasets', 'dataset_info.json')
|
|
||||||
dst_info_json = os.path.join(llamafactory_datasets_dir, 'dataset_info.json')
|
|
||||||
try:
|
|
||||||
if os.path.exists(src_info_json):
|
|
||||||
shutil.copy2(src_info_json, dst_info_json)
|
|
||||||
train_logger.info(f"[TRAIN] 已复制 dataset_info.json 到 llamafactory 目录")
|
|
||||||
else:
|
|
||||||
train_logger.warning(f"[TRAIN] dataset_info.json 不存在: {src_info_json}")
|
|
||||||
except Exception as e:
|
|
||||||
train_logger.warning(f"[TRAIN] 复制 dataset_info.json 失败: {e}")
|
|
||||||
|
|
||||||
# 获取选中的 GPU 索引
|
# 获取选中的 GPU 索引
|
||||||
gpus = data.get('gpus', [])
|
gpus = data.get('gpus', [])
|
||||||
@@ -145,6 +125,9 @@ def start_training():
|
|||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env['CUDA_VISIBLE_DEVICES'] = cuda_devices
|
env['CUDA_VISIBLE_DEVICES'] = cuda_devices
|
||||||
env['TF_CPP_MIN_LOG_LEVEL'] = '2' # 减少 TensorFlow 日志
|
env['TF_CPP_MIN_LOG_LEVEL'] = '2' # 减少 TensorFlow 日志
|
||||||
|
env['LLAMAFACTORY_DIR'] = '/app/base' # 指定 llamafactory 根目录
|
||||||
|
env['PYTHONUNBUFFERED'] = '1' # 强制 Python 不缓冲输出,实时写入日志
|
||||||
|
env['TRANSFORMERS_VERBOSITY'] = 'INFO' # 设置 transformers 日志级别
|
||||||
|
|
||||||
# 构建 llamafactory-cli 命令(传入数据集名称用于 --dataset 参数)
|
# 构建 llamafactory-cli 命令(传入数据集名称用于 --dataset 参数)
|
||||||
cmd = build_train_command(data, model_path, dataset_key)
|
cmd = build_train_command(data, model_path, dataset_key)
|
||||||
@@ -154,57 +137,93 @@ def start_training():
|
|||||||
# 在返回的命令中显示 GPU 配置
|
# 在返回的命令中显示 GPU 配置
|
||||||
cmd_str_with_gpu = f"CUDA_VISIBLE_DEVICES={cuda_devices} {cmd_str}"
|
cmd_str_with_gpu = f"CUDA_VISIBLE_DEVICES={cuda_devices} {cmd_str}"
|
||||||
|
|
||||||
# 生成训练日志文件路径(按日期分目录)
|
# 生成训练日志文件路径(存储在 logs 目录下的日期子目录中)
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
today = datetime.now().strftime('%Y-%m-%d')
|
today = datetime.now().strftime('%Y-%m-%d')
|
||||||
task_id_str = str(data.get('task_id', 'unknown'))
|
now_str = datetime.now().strftime('%Y%m%d_%H%M%S') # 时间戳用于排序
|
||||||
log_dir = os.path.join(llamafactory_dir, 'logs', today)
|
task_id = data.get('task_id', 'unknown')
|
||||||
train_output_log = os.path.join(log_dir, f'train_{task_id_str}.log')
|
task_name = data.get('name', 'unknown')
|
||||||
|
# 工作目录设为 /app/base(而非 llamafactory 目录)
|
||||||
|
work_dir = '/app/base'
|
||||||
|
# 使用 logs 目录下的日期子目录
|
||||||
|
training_logs_dir = os.path.join('/app/base/logs', today)
|
||||||
|
os.makedirs(training_logs_dir, exist_ok=True)
|
||||||
|
|
||||||
# 确保日志目录存在
|
# 日志文件路径: logs/{日期}/{task_id}_{task_name}.log
|
||||||
os.makedirs(log_dir, exist_ok=True)
|
log_file = os.path.join(training_logs_dir, f'{task_id}_{task_name}.log')
|
||||||
|
|
||||||
train_logger.info(f"[TRAIN] 启动训练进程...")
|
train_logger.info(f"[TRAIN] 启动训练进程...")
|
||||||
|
|
||||||
|
# 用于存储实际进程 PID
|
||||||
|
actual_pid = None
|
||||||
|
final_log_path = log_file
|
||||||
|
|
||||||
# 使用线程在后台运行训练进程
|
# 使用线程在后台运行训练进程
|
||||||
def run_training():
|
def run_training():
|
||||||
with open(train_output_log, 'w', encoding='utf-8') as log_file:
|
nonlocal actual_pid, final_log_path
|
||||||
|
|
||||||
|
# 从 data 中获取 template 和 train_method(与 build_train_command 保持一致)
|
||||||
|
template = data.get('template', 'default')
|
||||||
|
train_method = data.get('train_method', 'lora')
|
||||||
|
|
||||||
|
# 创建输出目录(如果不存在)
|
||||||
|
output_model_name = data.get('output_model_name', f"{template}/{train_method}")
|
||||||
|
if not output_model_name.startswith('/'):
|
||||||
|
output_model_name = f"/app/base/saves/{output_model_name}"
|
||||||
|
output_dir = output_model_name
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
train_logger.info(f"[TRAIN] 输出目录: {output_dir}")
|
||||||
|
train_logger.info(f"[TRAIN] 完整训练命令: {' '.join(cmd)}")
|
||||||
|
|
||||||
|
with open(log_file, 'w', encoding='utf-8') as f:
|
||||||
|
# 设置 cwd 为 /app,但通过 LLAMAFACTORY_DIR 环境变量指定 llamafactory 位置
|
||||||
process = subprocess.Popen(
|
process = subprocess.Popen(
|
||||||
cmd,
|
cmd,
|
||||||
cwd=llamafactory_dir,
|
cwd=work_dir,
|
||||||
stdout=log_file,
|
stdout=f,
|
||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
env=env
|
env=env
|
||||||
)
|
)
|
||||||
train_logger.info(f"[TRAIN] 训练进程 PID: {process.pid}")
|
actual_pid = process.pid
|
||||||
|
train_logger.info(f"[TRAIN] 训练进程 PID: {actual_pid}")
|
||||||
|
train_logger.info(f"[TRAIN] 日志文件: {log_file}")
|
||||||
|
|
||||||
|
# 更新数据库中的 PID(立即更新,方便停止任务)
|
||||||
|
update_fine_tune_status(task_id, 'running', actual_pid)
|
||||||
|
|
||||||
# 等待进程完成
|
# 等待进程完成
|
||||||
process.wait()
|
process.wait()
|
||||||
train_logger.info(f"[TRAIN] 训练进程已结束,退出码: {process.returncode}")
|
train_logger.info(f"[TRAIN] 训练进程已结束,退出码: {process.returncode}")
|
||||||
|
|
||||||
# 更新任务状态
|
# 更新任务状态
|
||||||
final_status = 'completed' if process.returncode == 0 else 'failed'
|
final_status = 'completed' if process.returncode == 0 else 'failed'
|
||||||
update_fine_tune_status(data.get('task_id'), final_status, process.pid)
|
update_fine_tune_status(task_id, final_status, actual_pid)
|
||||||
|
|
||||||
# 启动后台线程
|
# 启动后台线程
|
||||||
training_thread = threading.Thread(target=run_training, daemon=True)
|
training_thread = threading.Thread(target=run_training, daemon=True)
|
||||||
training_thread.start()
|
training_thread.start()
|
||||||
|
|
||||||
# 立即返回,不等待进程完成
|
# 等待 PID 并更新到数据库
|
||||||
pid = None # 此时还不知道实际 PID,稍后可从日志获取
|
for i in range(10): # 最多等待1秒
|
||||||
train_logger.info(f"[TRAIN] 训练任务已在后台启动")
|
time.sleep(0.1)
|
||||||
train_logger.info(f"[TRAIN] 训练日志输出到: {train_output_log}")
|
if actual_pid:
|
||||||
|
break
|
||||||
|
|
||||||
# 更新任务状态为运行中
|
# 立即返回,不等待进程完成
|
||||||
update_fine_tune_status(data.get('task_id'), 'running', 0)
|
train_logger.info(f"[TRAIN] 训练任务已在后台启动,PID: {actual_pid}")
|
||||||
|
|
||||||
|
train_logger.info(f"[TRAIN] 训练日志输出到: {log_file}")
|
||||||
|
|
||||||
return jsonify({
|
return jsonify({
|
||||||
'code': 0,
|
'code': 0,
|
||||||
'message': f'训练任务已启动 (GPU: {cuda_devices})',
|
'message': f'训练任务已启动 (GPU: {cuda_devices})',
|
||||||
'data': {
|
'data': {
|
||||||
'task_id': data.get('task_id'),
|
'task_id': task_id,
|
||||||
|
'pid': actual_pid,
|
||||||
'gpu_ids': cuda_devices,
|
'gpu_ids': cuda_devices,
|
||||||
'command': cmd_str_with_gpu,
|
'command': cmd_str_with_gpu,
|
||||||
'log_file': train_output_log
|
'log_file': log_file,
|
||||||
|
'training_logs_dir': training_logs_dir
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -258,10 +277,11 @@ def build_train_command(data, model_path, dataset_name=None):
|
|||||||
train_method = data.get('train_method', 'lora')
|
train_method = data.get('train_method', 'lora')
|
||||||
cmd.extend(['--finetuning_type', FINETUNING_TYPE_MAP.get(train_method, 'lora')])
|
cmd.extend(['--finetuning_type', FINETUNING_TYPE_MAP.get(train_method, 'lora')])
|
||||||
|
|
||||||
# 输出目录
|
# 输出目录(确保是绝对路径)
|
||||||
output_dir = data.get('output_model_name', f"./saves/{template}/{train_method}")
|
output_model_name = data.get('output_model_name', f"{template}/{train_method}")
|
||||||
if not output_dir.startswith('./'):
|
if not output_model_name.startswith('/'):
|
||||||
output_dir = f"./saves/{output_dir}"
|
output_model_name = f"/app/base/saves/{output_model_name}"
|
||||||
|
output_dir = output_model_name
|
||||||
cmd.extend(['--output_dir', output_dir])
|
cmd.extend(['--output_dir', output_dir])
|
||||||
|
|
||||||
# 常用参数
|
# 常用参数
|
||||||
@@ -274,10 +294,11 @@ def build_train_command(data, model_path, dataset_name=None):
|
|||||||
'--per_device_eval_batch_size', '1',
|
'--per_device_eval_batch_size', '1',
|
||||||
'--gradient_accumulation_steps', str(data.get('gradient_accumulation_steps', 8)),
|
'--gradient_accumulation_steps', str(data.get('gradient_accumulation_steps', 8)),
|
||||||
'--lr_scheduler_type', data.get('lr_scheduler_type', 'cosine'),
|
'--lr_scheduler_type', data.get('lr_scheduler_type', 'cosine'),
|
||||||
'--logging_steps', '50',
|
'--logging_steps', '5',
|
||||||
'--warmup_steps', str(data.get('warmup_steps', 20)),
|
'--warmup_steps', str(data.get('warmup_steps', 20)),
|
||||||
'--save_steps', '100',
|
'--save_steps', str(data.get('save_steps', 100)),
|
||||||
'--eval_steps', str(data.get('eval_steps', 100)),
|
'--log_level', 'info', # 设置日志级别为 info
|
||||||
|
'--log_level_replica', 'info', # 设置副本日志级别
|
||||||
])
|
])
|
||||||
|
|
||||||
# 学习率
|
# 学习率
|
||||||
@@ -295,9 +316,10 @@ def build_train_command(data, model_path, dataset_name=None):
|
|||||||
if data.get('max_samples'):
|
if data.get('max_samples'):
|
||||||
cmd.extend(['--max_samples', str(data.get('max_samples'))])
|
cmd.extend(['--max_samples', str(data.get('max_samples'))])
|
||||||
|
|
||||||
|
# 启用 TensorBoard 日志(用于可视化训练曲线)
|
||||||
|
cmd.append('--plot_loss')
|
||||||
|
|
||||||
# 其他选项
|
# 其他选项
|
||||||
if data.get('plot_loss'):
|
|
||||||
cmd.append('--plot_loss')
|
|
||||||
|
|
||||||
if data.get('fp16'):
|
if data.get('fp16'):
|
||||||
cmd.append('--fp16')
|
cmd.append('--fp16')
|
||||||
@@ -386,6 +408,57 @@ def stop_training(task_id):
|
|||||||
return jsonify({'code': 1, 'message': str(e)})
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/<int:task_id>', methods=['DELETE'])
|
||||||
|
def delete_training_task(task_id):
|
||||||
|
"""删除训练任务及对应的日志文件"""
|
||||||
|
try:
|
||||||
|
from .model_manage import get_db_connection
|
||||||
|
|
||||||
|
# 获取任务信息(用于删除日志文件)
|
||||||
|
conn = get_db_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("SELECT name, process_id FROM fine_tune WHERE id = %s", (task_id,))
|
||||||
|
task_result = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if not task_result:
|
||||||
|
return jsonify({'code': 1, 'message': '任务不存在'})
|
||||||
|
|
||||||
|
task_name = task_result.get('name', 'unknown')
|
||||||
|
|
||||||
|
# 删除日志文件 (logs/{日期}/{task_id}_{task_name}.log)
|
||||||
|
try:
|
||||||
|
from datetime import datetime
|
||||||
|
today = datetime.now().strftime('%Y-%m-%d')
|
||||||
|
|
||||||
|
# 可能的日志文件路径
|
||||||
|
log_paths = [
|
||||||
|
f'/app/base/logs/{today}/{task_id}_{task_name}.log',
|
||||||
|
f'/app/base/logs/{task_id}_{task_name}.log',
|
||||||
|
]
|
||||||
|
|
||||||
|
for log_path in log_paths:
|
||||||
|
if os.path.exists(log_path):
|
||||||
|
os.remove(log_path)
|
||||||
|
logger.info(f"已删除日志文件: {log_path}")
|
||||||
|
except Exception as log_err:
|
||||||
|
logger.warning(f"删除日志文件失败: {log_err}")
|
||||||
|
|
||||||
|
# 删除数据库中的任务记录
|
||||||
|
conn = get_db_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("DELETE FROM fine_tune WHERE id = %s", (task_id,))
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
logger.info(f"已删除训练任务 {task_id}: {task_name}")
|
||||||
|
return jsonify({'code': 0, 'message': '删除成功'})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"删除训练任务失败: {e}")
|
||||||
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
@fine_tune_bp.route('/status/<int:task_id>', methods=['GET'])
|
@fine_tune_bp.route('/status/<int:task_id>', methods=['GET'])
|
||||||
def get_training_status(task_id):
|
def get_training_status(task_id):
|
||||||
"""获取训练任务状态"""
|
"""获取训练任务状态"""
|
||||||
@@ -402,12 +475,25 @@ def get_training_status(task_id):
|
|||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
|
# 检查 PID 是否仍在运行
|
||||||
|
actual_status = result['status']
|
||||||
|
pid = result.get('process_id')
|
||||||
|
if pid and actual_status == 'running':
|
||||||
|
try:
|
||||||
|
# 检查进程是否存在
|
||||||
|
os.kill(pid, 0)
|
||||||
|
# 进程仍在运行
|
||||||
|
actual_status = 'running'
|
||||||
|
except (OSError, ProcessLookupError):
|
||||||
|
# 进程已结束,尝试更新状态
|
||||||
|
actual_status = 'completed' # 假设完成(实际可能失败)
|
||||||
|
|
||||||
return jsonify({
|
return jsonify({
|
||||||
'code': 0,
|
'code': 0,
|
||||||
'data': {
|
'data': {
|
||||||
'task_id': result['id'],
|
'task_id': result['id'],
|
||||||
'name': result['name'],
|
'name': result['name'],
|
||||||
'status': result['status'],
|
'status': actual_status,
|
||||||
'progress': result['progress'],
|
'progress': result['progress'],
|
||||||
'pid': result.get('process_id')
|
'pid': result.get('process_id')
|
||||||
}
|
}
|
||||||
@@ -420,6 +506,254 @@ def get_training_status(task_id):
|
|||||||
return jsonify({'code': 1, 'message': str(e)})
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/check-pid/<int:pid>', methods=['GET'])
|
||||||
|
def check_pid_status(pid):
|
||||||
|
"""检查 PID 是否仍在运行"""
|
||||||
|
try:
|
||||||
|
if pid <= 0:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'exists': False,
|
||||||
|
'message': '无效的 PID'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 发送信号 0 来检查进程是否存在(不会实际终止进程)
|
||||||
|
os.kill(pid, 0)
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'exists': True,
|
||||||
|
'message': '进程仍在运行'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
except (OSError, ProcessLookupError):
|
||||||
|
# 进程不存在
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'exists': False,
|
||||||
|
'message': '进程已结束'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"检查 PID 状态失败: {e}")
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'exists': False,
|
||||||
|
'message': f'检查失败: {str(e)}'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/log/<int:task_id>', methods=['GET'])
|
||||||
|
def get_training_log(task_id):
|
||||||
|
"""获取训练任务日志内容(支持实时读取)"""
|
||||||
|
try:
|
||||||
|
from .model_manage import get_db_connection
|
||||||
|
|
||||||
|
# 获取任务信息和进程ID
|
||||||
|
conn = get_db_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute(
|
||||||
|
"SELECT name, process_id, status FROM fine_tune WHERE id = %s",
|
||||||
|
(task_id,)
|
||||||
|
)
|
||||||
|
result = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if not result:
|
||||||
|
return jsonify({'code': 1, 'message': '任务不存在'})
|
||||||
|
|
||||||
|
process_id = result.get('process_id')
|
||||||
|
task_name = result['name']
|
||||||
|
status = result['status']
|
||||||
|
|
||||||
|
if not process_id:
|
||||||
|
return jsonify({'code': 1, 'message': '任务尚未启动'})
|
||||||
|
|
||||||
|
# 构建日志文件路径 - 新格式: logs/{日期}/{task_id}_{task_name}.log
|
||||||
|
from datetime import datetime
|
||||||
|
today = datetime.now().strftime('%Y-%m-%d')
|
||||||
|
training_logs_dir = os.path.join('/app/base/logs', today)
|
||||||
|
|
||||||
|
# 查找日志文件 (新格式: {task_id}_{task_name}.log)
|
||||||
|
log_file = os.path.join(training_logs_dir, f'{task_id}_{task_name}.log')
|
||||||
|
|
||||||
|
if not os.path.exists(log_file):
|
||||||
|
# 如果没找到,返回空日志
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'content': '',
|
||||||
|
'status': status,
|
||||||
|
'message': '日志文件尚未创建'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# 读取日志文件内容
|
||||||
|
try:
|
||||||
|
with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
|
content = f.read()
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'content': content,
|
||||||
|
'status': status,
|
||||||
|
'log_file': log_file
|
||||||
|
}
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'content': '',
|
||||||
|
'status': status,
|
||||||
|
'message': f'读取日志失败: {str(e)}'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"获取训练日志失败: {e}")
|
||||||
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/progress/<int:task_id>', methods=['GET'])
|
||||||
|
def get_training_progress(task_id):
|
||||||
|
"""获取训练任务进度(从日志中解析 llamafactory 的进度信息)"""
|
||||||
|
try:
|
||||||
|
from .model_manage import get_db_connection
|
||||||
|
|
||||||
|
# 获取任务信息和进程ID
|
||||||
|
conn = get_db_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute(
|
||||||
|
"SELECT name, process_id, status FROM fine_tune WHERE id = %s",
|
||||||
|
(task_id,)
|
||||||
|
)
|
||||||
|
result = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if not result:
|
||||||
|
return jsonify({'code': 1, 'message': '任务不存在'})
|
||||||
|
|
||||||
|
process_id = result.get('process_id')
|
||||||
|
task_name = result['name']
|
||||||
|
status = result['status']
|
||||||
|
|
||||||
|
if not process_id:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'progress': 0,
|
||||||
|
'step': '',
|
||||||
|
'eta': '',
|
||||||
|
'speed': '',
|
||||||
|
'status': status,
|
||||||
|
'message': '任务尚未启动'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# 构建日志文件路径 - 新格式: logs/{日期}/{task_id}_{task_name}.log
|
||||||
|
from datetime import datetime
|
||||||
|
today = datetime.now().strftime('%Y-%m-%d')
|
||||||
|
training_logs_dir = os.path.join('/app/base/logs', today)
|
||||||
|
|
||||||
|
# 查找日志文件 (新格式: {task_id}_{task_name}.log)
|
||||||
|
log_file = os.path.join(training_logs_dir, f'{task_id}_{task_name}.log')
|
||||||
|
|
||||||
|
# TensorBoard 日志目录(使用默认值)
|
||||||
|
tensorboard_log_dir = '/app/base/saves'
|
||||||
|
|
||||||
|
if not os.path.exists(log_file):
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'step': '',
|
||||||
|
'elapsed': '',
|
||||||
|
'eta': '',
|
||||||
|
'speed': '',
|
||||||
|
'status': status,
|
||||||
|
'message': '日志文件尚未创建',
|
||||||
|
'tensorboard_url': ''
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# 读取日志文件最后部分,解析进度信息
|
||||||
|
try:
|
||||||
|
with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
|
# 读取最后 10KB 内容
|
||||||
|
f.seek(0, 2) # 跳到文件末尾
|
||||||
|
file_size = f.tell()
|
||||||
|
read_size = min(10240, file_size)
|
||||||
|
f.seek(max(0, file_size - read_size))
|
||||||
|
content = f.read()
|
||||||
|
|
||||||
|
# 匹配 llamafactory 进度格式: 52%|█████▏ | 17/33 [02:16<02:08, 8.04s/it]
|
||||||
|
progress_pattern = r'\s*(\d+)%\|[█░▌▋▒█▏▎▏▐▀■□▪▫‣▶➜➡→]+\s*\|\s*(\d+)/(\d+)\s+\[(\d+):(\d+)<(\d+):(\d+),\s*([\d.]+)s/it\]'
|
||||||
|
match = re.search(progress_pattern, content)
|
||||||
|
|
||||||
|
step_info = ''
|
||||||
|
elapsed = ''
|
||||||
|
eta = ''
|
||||||
|
speed = ''
|
||||||
|
message = '等待训练开始'
|
||||||
|
|
||||||
|
if match:
|
||||||
|
current_step = int(match.group(2))
|
||||||
|
total_steps = int(match.group(3))
|
||||||
|
elapsed_min = int(match.group(4))
|
||||||
|
elapsed_sec = int(match.group(5))
|
||||||
|
eta_min = int(match.group(6))
|
||||||
|
eta_sec = int(match.group(7))
|
||||||
|
speed_val = float(match.group(8))
|
||||||
|
|
||||||
|
step_info = f'{current_step}/{total_steps}'
|
||||||
|
elapsed = f'{elapsed_min:02d}:{elapsed_sec:02d}'
|
||||||
|
eta = f'{eta_min:02d}:{eta_sec:02d}'
|
||||||
|
speed = f'{speed_val}s/it'
|
||||||
|
message = '训练进行中'
|
||||||
|
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'step': step_info,
|
||||||
|
'elapsed': elapsed,
|
||||||
|
'eta': eta,
|
||||||
|
'speed': speed,
|
||||||
|
'status': status,
|
||||||
|
'message': message,
|
||||||
|
'tensorboard_log_dir': tensorboard_log_dir,
|
||||||
|
'tensorboard_url': ''
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'step': '',
|
||||||
|
'elapsed': '',
|
||||||
|
'eta': '',
|
||||||
|
'speed': '',
|
||||||
|
'status': status,
|
||||||
|
'message': f'读取进度失败: {str(e)}',
|
||||||
|
'tensorboard_log_dir': tensorboard_log_dir,
|
||||||
|
'tensorboard_url': ''
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"获取训练进度失败: {e}")
|
||||||
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
def get_db_connection():
|
def get_db_connection():
|
||||||
"""获取数据库连接"""
|
"""获取数据库连接"""
|
||||||
import pymysql
|
import pymysql
|
||||||
@@ -441,3 +775,129 @@ def get_db_connection():
|
|||||||
charset=db_config.get('charset', 'utf8mb4'),
|
charset=db_config.get('charset', 'utf8mb4'),
|
||||||
cursorclass=pymysql.cursors.DictCursor
|
cursorclass=pymysql.cursors.DictCursor
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/check-name', methods=['GET'])
|
||||||
|
def check_task_name():
|
||||||
|
"""检查任务名称是否重复"""
|
||||||
|
try:
|
||||||
|
name = request.args.get('name', '').strip()
|
||||||
|
if not name:
|
||||||
|
return jsonify({'code': 1, 'message': '任务名称不能为空'})
|
||||||
|
|
||||||
|
# 验证任务名称格式:只能包含英文、数字、下划线
|
||||||
|
import re
|
||||||
|
if not re.match(r'^[a-zA-Z0-9_]+$', name):
|
||||||
|
return jsonify({'code': 1, 'message': '任务名称只能包含英文、数字和下划线'})
|
||||||
|
|
||||||
|
conn = get_db_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("SELECT id FROM fine_tune WHERE name = %s", (name,))
|
||||||
|
result = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if result:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'exists': True,
|
||||||
|
'message': '任务名称已存在'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'exists': False,
|
||||||
|
'message': '任务名称可用'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"检查任务名称失败: {e}")
|
||||||
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
# TensorBoard 服务进程
|
||||||
|
tensorboard_process = None
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/tensorboard/start', methods=['POST'])
|
||||||
|
def start_tensorboard():
|
||||||
|
"""启动 TensorBoard 服务"""
|
||||||
|
global tensorboard_process
|
||||||
|
try:
|
||||||
|
import subprocess
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 检查是否已有进程在运行
|
||||||
|
if tensorboard_process and tensorboard_process.poll() is None:
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'url': 'http://10.10.10.177:6006',
|
||||||
|
'status': 'already_running',
|
||||||
|
'message': 'TensorBoard 服务已运行'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# 获取日志目录
|
||||||
|
log_dir = '/app/base/saves'
|
||||||
|
|
||||||
|
# 检查目录是否存在
|
||||||
|
if not os.path.exists(log_dir):
|
||||||
|
return jsonify({'code': 1, 'message': f'日志目录不存在: {log_dir}'})
|
||||||
|
|
||||||
|
# 启动 TensorBoard(后台运行)
|
||||||
|
cmd = ['tensorboard', '--logdir', log_dir, '--port', '6006', '--bind_all']
|
||||||
|
tensorboard_process = subprocess.Popen(
|
||||||
|
cmd,
|
||||||
|
stdout=subprocess.DEVNULL,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
preexec_fn=os.setsid
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"TensorBoard 服务已启动: {cmd}")
|
||||||
|
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'url': 'http://10.10.10.177:6006',
|
||||||
|
'status': 'started',
|
||||||
|
'message': 'TensorBoard 服务已启动'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"启动 TensorBoard 失败: {e}")
|
||||||
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
@fine_tune_bp.route('/tensorboard/stop', methods=['POST'])
|
||||||
|
def stop_tensorboard():
|
||||||
|
"""停止 TensorBoard 服务"""
|
||||||
|
global tensorboard_process
|
||||||
|
try:
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
|
||||||
|
if tensorboard_process and tensorboard_process.poll() is None:
|
||||||
|
# 使用 os.killpg 终止进程组
|
||||||
|
try:
|
||||||
|
os.killpg(os.getpgid(tensorboard_process.pid), signal.SIGTERM)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
tensorboard_process = None
|
||||||
|
logger.info("TensorBoard 服务已停止")
|
||||||
|
|
||||||
|
return jsonify({
|
||||||
|
'code': 0,
|
||||||
|
'data': {
|
||||||
|
'status': 'stopped',
|
||||||
|
'message': 'TensorBoard 服务已停止'
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"停止 TensorBoard 失败: {e}")
|
||||||
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|||||||
228
src/api/logs.py
228
src/api/logs.py
@@ -169,3 +169,231 @@ def get_log_content():
|
|||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return jsonify({'code': 1, 'message': f'读取日志文件失败: {str(e)}'})
|
return jsonify({'code': 1, 'message': f'读取日志文件失败: {str(e)}'})
|
||||||
|
|
||||||
|
|
||||||
|
# ============ 训练日志相关 API ============
|
||||||
|
|
||||||
|
# 训练日志保存在 logs/{日期} 目录下
|
||||||
|
TRAINING_LOGS_BASE_DIR = '/app/base/logs'
|
||||||
|
# 本地开发时的备用路径(Windows)
|
||||||
|
LOCAL_TRAINING_LOGS_BASE_DIR = os.path.join(PROJECT_ROOT, 'logs')
|
||||||
|
|
||||||
|
# 添加调试日志
|
||||||
|
logs_logger.info(f"[DEBUG] TRAINING_LOGS_BASE_DIR: {TRAINING_LOGS_BASE_DIR}")
|
||||||
|
logs_logger.info(f"[DEBUG] LOCAL_TRAINING_LOGS_BASE_DIR: {LOCAL_TRAINING_LOGS_BASE_DIR}")
|
||||||
|
|
||||||
|
|
||||||
|
def _split_training_log_name(file_name):
    """Split ``{task_id}_{task_name}.log`` into ``(task_id, task_name)``.

    Only the trailing ``.log`` suffix is removed, so names that contain
    ``.log`` elsewhere are kept intact. ``task_id`` is ``None`` when the
    name has no underscore separator.
    """
    stem = file_name[:-len('.log')] if file_name.endswith('.log') else file_name
    task_id, separator, task_name = stem.partition('_')
    if not separator:
        return None, stem
    return task_id, task_name


@logs_bp.route('/training-log-files', methods=['GET'])
def get_training_log_files():
    """List training log files found as ``.log`` files under ``logs/{date}``.

    The container directory is preferred; the project-local directory is
    used when the container one does not exist.

    Returns:
        A JSON response with ``code`` 0 and a list of entries (``name``,
        ``file``, ``task_id``, ``date``, ``size``, ``sort_key``) ordered
        newest date first, or ``code`` 1 with a message on unexpected errors.
    """
    try:
        # Pick the base directory.
        logs_base_dir = TRAINING_LOGS_BASE_DIR
        if not os.path.exists(logs_base_dir):
            logs_base_dir = LOCAL_TRAINING_LOGS_BASE_DIR

        logs_logger.info(f"[DEBUG] logs_base_dir: {logs_base_dir}, exists: {os.path.exists(logs_base_dir)}")

        if not os.path.exists(logs_base_dir):
            return jsonify({'code': 0, 'data': []})

        # Collect sub-directories named YYYY-MM-DD, parsing each date once.
        date_sort_keys = {}
        try:
            for item in os.listdir(logs_base_dir):
                if not os.path.isdir(os.path.join(logs_base_dir, item)):
                    continue
                try:
                    parsed = datetime.strptime(item, '%Y-%m-%d')
                except ValueError:
                    # Not a date directory.
                    continue
                try:
                    date_sort_keys[item] = parsed.timestamp()
                except (OverflowError, OSError, ValueError):
                    # Date outside the platform's timestamp range.
                    date_sort_keys[item] = 0
        except OSError as list_err:
            logs_logger.error(f"[DEBUG] Failed to list base directory: {list_err}")
            return jsonify({'code': 0, 'data': []})

        # Newest directory name first.
        date_dirs = sorted(date_sort_keys, reverse=True)

        logs_logger.info(f"[DEBUG] Date directories: {date_dirs}")

        log_files = []
        for date_dir in date_dirs:
            date_full_path = os.path.join(logs_base_dir, date_dir)
            try:
                files = os.listdir(date_full_path)
            except OSError as list_err:
                logs_logger.warning(f"[DEBUG] Failed to list {date_full_path}: {list_err}")
                continue

            for file_name in files:
                if not file_name.endswith('.log'):
                    continue

                file_path = os.path.join(date_full_path, file_name)
                try:
                    size = os.path.getsize(file_path)
                except OSError as size_err:
                    logs_logger.warning(f"[DEBUG] Failed to get size of {file_path}: {size_err}")
                    continue

                # File name format: {task_id}_{task_name}.log, e.g. 889_testing.log
                task_id, task_name = _split_training_log_name(file_name)
                if task_id is None:
                    # Names without a task id keep the original ordering rule:
                    # they sort after every recognised entry.
                    task_id = 'unknown'
                    sort_key = 0
                else:
                    sort_key = date_sort_keys[date_dir]

                log_files.append({
                    'name': task_name,
                    # Relative path (date/file name) expected by the content endpoint.
                    'file': f"{date_dir}/{file_name}",
                    'task_id': task_id,
                    'date': date_dir,
                    'size': format_file_size(size),
                    'sort_key': sort_key
                })

        # Newest first; the sort is stable so ties keep discovery order.
        log_files.sort(key=lambda entry: entry['sort_key'], reverse=True)

        logs_logger.info(f"[DEBUG] Found {len(log_files)} training log files")

        return jsonify({'code': 0, 'data': log_files})
    except Exception as e:
        logs_logger.error(f"[DEBUG] 获取训练日志列表失败: {e}")
        return jsonify({'code': 1, 'message': f'获取训练日志列表失败: {str(e)}'})
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_log_path(base_dir, date_dir, name_parts):
    """Return the real path of a log below ``base_dir``, or ``None`` if it escapes it."""
    base_real = os.path.realpath(base_dir)
    candidate = os.path.realpath(os.path.join(base_real, date_dir, *name_parts))
    try:
        inside = os.path.commonpath([base_real, candidate]) == base_real
    except ValueError:
        # Raised e.g. for paths on different drives on Windows.
        inside = False
    return candidate if inside else None


@logs_bp.route('/training-log-content', methods=['GET'])
def get_training_log_content():
    """Return the content of one training log stored under ``logs/{date}/``.

    Query parameters:
        file: relative path ``{date}/{file name}``, for example
            ``2026-01-28/889_testing.log``.

    The request is rejected unless the path has a valid ``YYYY-MM-DD`` first
    segment, contains no ``.``/``..`` segments, and resolves to a location
    inside one of the log base directories. Only the last 10 MB of a larger
    file is returned.

    Returns:
        A JSON response: ``code`` 0 with ``file``, ``size`` and ``content``;
        ``code`` 1 with a message for invalid input, missing files or read
        errors; ``code`` 2 when the file cannot be opened even through the
        memory-mapped fallback.
    """
    file_name = request.args.get('file')
    if not file_name:
        return jsonify({'code': 1, 'message': '缺少文件参数'})

    logs_logger.info("[DEBUG] ============ get_training_log_content ============")
    logs_logger.info(f"[DEBUG] file: {file_name}")

    # Guard against directory traversal. Empty segments are dropped so that
    # repeated slashes can never produce an absolute path, and relative
    # segments are refused outright instead of being string-stripped.
    parts = [segment for segment in file_name.split('/') if segment]
    if len(parts) < 2 or any(segment in ('.', '..') for segment in parts):
        return jsonify({'code': 1, 'message': '无效的文件路径格式'})

    date_dir, name_parts = parts[0], parts[1:]
    log_file_name = '/'.join(name_parts)

    # Validate the date segment.
    try:
        datetime.strptime(date_dir, '%Y-%m-%d')
    except ValueError:
        return jsonify({'code': 1, 'message': '无效的日期格式'})

    # Prefer the container directory, then the project-local one.
    full_path = None
    for label, base_dir in (('container', TRAINING_LOGS_BASE_DIR),
                            ('local', LOCAL_TRAINING_LOGS_BASE_DIR)):
        candidate = _resolve_log_path(base_dir, date_dir, name_parts)
        found = candidate is not None and os.path.isfile(candidate)
        logs_logger.info(f"[DEBUG] {label}_base_dir: {base_dir}, exists: {os.path.exists(base_dir)}")
        logs_logger.info(f"[DEBUG] {label}_full_path: {candidate}, exists: {found}")
        if found:
            full_path = candidate
            logs_logger.info(f"[DEBUG] Using {label} path")
            break

    if full_path is None:
        logs_logger.error(f"[DEBUG] File not found: {file_name}")
        return jsonify({'code': 1, 'message': f'日志文件不存在: {file_name}'})

    logs_logger.info(f"[DEBUG] Final full_path: {full_path}")

    max_size = 10 * 1024 * 1024
    try:
        try:
            # Plain read: measure the file, then read at most the last 10 MB.
            with open(full_path, 'rb') as f:
                size = f.seek(0, os.SEEK_END)
                f.seek(max(size - max_size, 0))
                raw = f.read()
        except OSError as pe:
            # PermissionError is an OSError; retry through a read-only mapping.
            logs_logger.warning(f"[DEBUG] 直接读取失败: {pe},尝试共享模式读取")
            import mmap
            try:
                with open(full_path, 'rb') as f:
                    mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
                    try:
                        size = len(mm)
                        raw = mm[-max_size:] if size > max_size else mm[:]
                    finally:
                        mm.close()
            except Exception as e2:
                logs_logger.error(f"[DEBUG] 共享模式读取失败: {e2}")
                return jsonify({
                    'code': 2,
                    'message': '日志文件正在被训练进程占用,训练结束后可查看完整内容',
                    'data': {
                        'file': log_file_name,
                        'size': format_file_size(0),
                        'content': ''
                    }
                })

        # errors='ignore' also drops a multi-byte character cut by the tail window.
        content = raw.decode('utf-8', errors='ignore')
        if size > max_size:
            content = '... (日志文件较大,已显示最后 10MB 内容) ...\n\n' + content

        return jsonify({
            'code': 0,
            'data': {
                'file': log_file_name,
                'size': format_file_size(size),
                'content': content
            }
        })
    except Exception as e:
        logs_logger.error(f"[DEBUG] 读取日志文件失败: {e}")
        return jsonify({'code': 1, 'message': f'读取日志文件失败: {str(e)}'})
|
||||||
|
|||||||
@@ -193,3 +193,65 @@ def get_local_models():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"获取本地模型列表失败: {e}")
|
logger.error(f"获取本地模型列表失败: {e}")
|
||||||
return jsonify({'code': 1, 'message': str(e)})
|
return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
# ============ 已训练模型列表接口 ============
|
||||||
|
|
||||||
|
@model_manage_bp.route('/trained-models', methods=['GET'])
def get_trained_models():
    """List trained models found under the saves directory.

    Every directory directly below the saves root is reported as a model.
    Its ``train_methods`` are the sub-directories that contain at least one
    ``.bin`` or ``.safetensors`` file.

    Returns:
        A JSON response with ``code`` 0 and ``models`` plus the ``base_path``
        that was scanned, or ``code`` 1 with the error text on failure.
    """
    import logging
    logger = logging.getLogger(__name__)

    def _holds_weights(directory):
        # True when the directory has a weight file (adapter_model.bin,
        # pytorch_model.bin, *.safetensors, ...).
        return any(
            entry.endswith('.bin') or entry.endswith('.safetensors')
            for entry in os.listdir(directory)
        )

    try:
        # Container path first, project-local path as the development fallback.
        container_root = '/app/base/saves'
        fallback_root = os.path.join(PROJECT_ROOT, 'saves')
        base_path = container_root if os.path.exists(container_root) else fallback_root

        logger.info(f"[DEBUG] 已训练模型目录: {base_path}, exists: {os.path.exists(base_path)}")

        models = []
        top_level = os.listdir(base_path) if os.path.exists(base_path) else []
        for item in top_level:
            item_path = os.path.join(base_path, item)
            if not os.path.isdir(item_path):
                continue

            # Sub-directories are treated as training-method outputs.
            children = os.listdir(item_path) if os.path.exists(item_path) else []
            train_methods = []
            for child in children:
                child_path = os.path.join(item_path, child)
                if os.path.isdir(child_path) and _holds_weights(child_path):
                    train_methods.append({'name': child, 'path': child_path})

            models.append({
                'name': item,
                'path': item_path,
                'train_methods': train_methods
            })

        logger.info(f"[DEBUG] 找到 {len(models)} 个已训练模型")

        return jsonify({
            'code': 0,
            'data': {
                'models': models,
                'base_path': base_path
            }
        })
    except Exception as e:
        logger.error(f"获取已训练模型列表失败: {e}")
        return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|||||||
215
src/main.py
215
src/main.py
@@ -33,6 +33,9 @@ def load_config():
|
|||||||
|
|
||||||
CONFIG = load_config()
|
CONFIG = load_config()
|
||||||
|
|
||||||
|
# 训练日志路径
|
||||||
|
TRAINING_LOGS_DIR = CONFIG.get('training_logs_path', '/app/base/training_logs')
|
||||||
|
|
||||||
# ============ 日志系统配置 ============
|
# ============ 日志系统配置 ============
|
||||||
LOG_BASE_DIR = os.path.join(PROJECT_ROOT, 'logs')
|
LOG_BASE_DIR = os.path.join(PROJECT_ROOT, 'logs')
|
||||||
|
|
||||||
@@ -339,9 +342,10 @@ def init_database():
|
|||||||
|
|
||||||
# 为 fine_tune 表添加训练相关列
|
# 为 fine_tune 表添加训练相关列
|
||||||
columns_to_add = [
|
columns_to_add = [
|
||||||
|
("description", "TEXT COMMENT '任务描述'"),
|
||||||
("train_dataset_id", "INT COMMENT '训练数据集ID'"),
|
("train_dataset_id", "INT COMMENT '训练数据集ID'"),
|
||||||
("valid_dataset_id", "INT COMMENT '验证数据集ID'"),
|
("valid_dataset_id", "INT COMMENT '验证数据集ID'"),
|
||||||
("eval_steps", "INT DEFAULT 100 COMMENT '评估步数'"),
|
("save_steps", "INT DEFAULT 100 COMMENT '保存步数'"),
|
||||||
("lr_scheduler_type", "VARCHAR(50) DEFAULT 'cosine' COMMENT '学习率调度器'"),
|
("lr_scheduler_type", "VARCHAR(50) DEFAULT 'cosine' COMMENT '学习率调度器'"),
|
||||||
("warmup_ratio", "FLOAT DEFAULT 0.05 COMMENT '预热比例'"),
|
("warmup_ratio", "FLOAT DEFAULT 0.05 COMMENT '预热比例'"),
|
||||||
("weight_decay", "FLOAT DEFAULT 0.01 COMMENT '权重衰减'"),
|
("weight_decay", "FLOAT DEFAULT 0.01 COMMENT '权重衰减'"),
|
||||||
@@ -379,8 +383,18 @@ def init_database():
|
|||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
app.config['SECRET_KEY'] = CONFIG['secret_key']
|
app.config['SECRET_KEY'] = CONFIG['secret_key']
|
||||||
app.config['CORS_HEADERS'] = 'Content-Type'
|
app.config['CORS_HEADERS'] = 'Content-Type'
|
||||||
# 允许所有来源
|
|
||||||
CORS(app, resources={r"/api/*": {"origins": "*"}}, methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], allow_headers=["Content-Type", "Authorization"])
|
# 允许所有来源 - 支持跨域请求
|
||||||
|
CORS(app, resources={
|
||||||
|
r"/api/*": {
|
||||||
|
"origins": "*",
|
||||||
|
"methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"],
|
||||||
|
"allow_headers": ["Content-Type", "Authorization", "X-Requested-With"],
|
||||||
|
"expose_headers": ["Content-Length", "Content-Range"],
|
||||||
|
"supports_credentials": False,
|
||||||
|
"max_age": 86400 # 缓存预检请求结果 24 小时
|
||||||
|
}
|
||||||
|
}, vary_header=True)
|
||||||
|
|
||||||
# 注册蓝图
|
# 注册蓝图
|
||||||
register_blueprints(app)
|
register_blueprints(app)
|
||||||
@@ -674,6 +688,168 @@ def get_fine_tune():
|
|||||||
return jsonify({'code': 0, 'data': generic_get_all('fine_tune')})
|
return jsonify({'code': 0, 'data': generic_get_all('fine_tune')})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/fine-tune/<int:id>', methods=['GET'])
def get_fine_tune_by_id(id):
    """Return the details of a single fine-tune task.

    Args:
        id: primary key of the ``fine_tune`` row, taken from the URL.

    Returns:
        A JSON response with ``code`` 0 and the row as a dict (datetime
        values formatted as ``YYYY-MM-DD HH:MM:SS``), or ``code`` 1 with a
        message when the task does not exist or the query fails.
    """
    try:
        conn = get_db_connection()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("SELECT * FROM fine_tune WHERE id = %s", (id,))
                task = cursor.fetchone()
                # Column names are only needed when the driver returns tuples;
                # read them while the cursor is still open.
                columns = None
                if task and not isinstance(task, dict):
                    columns = [desc[0] for desc in cursor.description]
            finally:
                cursor.close()
        finally:
            # Always release the connection, including when the query raises.
            conn.close()

        if not task:
            return jsonify({'code': 1, 'message': '任务不存在'})

        row = task if isinstance(task, dict) else dict(zip(columns, task))

        # datetime objects are not JSON serialisable; render them as text.
        task_dict = {
            key: value.strftime('%Y-%m-%d %H:%M:%S') if isinstance(value, datetime) else value
            for key, value in row.items()
        }

        return jsonify({'code': 0, 'data': task_dict})
    except Exception as e:
        return jsonify({'code': 1, 'message': str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_training_progress(content, status):
    """Extract progress, step, speed, ETA and status from raw training log text.

    Args:
        content: full text of the training log.
        status: status to report when the log shows neither completion nor
            a failure marker.

    Returns:
        A dict with ``progress`` (int percent), ``status``, ``step``,
        ``speed`` and ``eta``.
    """
    import re

    # Progress bars overwrite themselves with \r; treat each rewrite as a line.
    lines = content.replace('\r', '\n').split('\n')

    progress = 0
    step_info = ''
    speed_info = ''
    eta_info = ''

    # Log format: "  3%|▎         | 1/33 [00:09<05:10,  9.69s/it]"
    #         or: " 30%|███       | 10/33 [01:22<03:00,  7.86s/it]"
    # The bar itself may hold any fill character, timings may include hours,
    # and the ETA/speed may still be "?" right after start.
    progress_pattern = re.compile(
        r'(\d+)%\s*\|[^|]*\|\s*(\d+)/(\d+)\s*'
        r'\[([\d:]+)<([\d:?]+),\s*([\d.?]+\s*(?:it/s|s/it))'
    )

    # Search from the end so the most recent progress line wins.
    for line in reversed(lines):
        match = progress_pattern.search(line.strip())
        if match:
            progress = int(match.group(1))
            step_info = f'{match.group(2)}/{match.group(3)}'
            eta_info = match.group(5)
            speed_info = match.group(6).strip()
            break

    # Nothing found in the full format: try the simplified one.
    if progress == 0:
        for line in reversed(lines):
            if 'Running training' in line or 'running training' in line:
                # Training has only just started.
                break
            simple_match = re.search(r'(\d+)%\s*\|\s*(\d+)/(\d+)', line)
            if simple_match:
                progress = int(simple_match.group(1))
                step_info = f'{simple_match.group(2)}/{simple_match.group(3)}'
                break

    # The latest completion or failure marker decides the reported status.
    for line in reversed(lines):
        if 'Training completed' in line or '训练完成' in line:
            status = 'completed'
            progress = 100
            break
        lowered = line.lower()
        if ('error' in lowered or 'failed' in lowered) and 'KeyboardInterrupt' not in line:
            status = 'failed'
            break

    return {
        'progress': progress,
        'status': status,
        'step': step_info,
        'speed': speed_info,
        'eta': eta_info
    }


@app.route('/api/fine-tune/progress/<int:id>', methods=['GET'])
def get_fine_tune_progress(id):
    """Report the progress of a training task by parsing its log file.

    Args:
        id: primary key of the ``fine_tune`` row, taken from the URL.

    Returns:
        A JSON response with ``code`` 0 and ``progress``, ``status``,
        ``step``, ``speed`` and ``eta``. A zero-progress payload is returned
        when no process id, log directory or readable log file is available.
        ``code`` 1 is returned when the task does not exist or an unexpected
        error occurs.
    """
    try:
        # Load the task. Rows are accepted as dicts or tuples so the handler
        # works whether or not the connection yields dict rows.
        # NOTE(review): sibling handlers call conn.cursor() without
        # dictionary=True; confirm the driver used by get_db_connection.
        conn = get_db_connection()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("SELECT id, process_id, name, status FROM fine_tune WHERE id = %s", (id,))
                task = cursor.fetchone()
                if task and not isinstance(task, dict):
                    task = dict(zip([desc[0] for desc in cursor.description], task))
            finally:
                cursor.close()
        finally:
            conn.close()

        if not task:
            return jsonify({'code': 1, 'message': '任务不存在'})

        process_id = task.get('process_id')
        task_name = task.get('name', '')
        status = task.get('status', 'unknown')

        def no_progress():
            return jsonify({'code': 0, 'data': {'progress': 0, 'status': status, 'step': '', 'speed': '', 'eta': ''}})

        if not process_id:
            return no_progress()

        # Locate the log directory: container path first, then the local one.
        training_logs_dir = TRAINING_LOGS_DIR
        if not os.path.exists(training_logs_dir):
            training_logs_dir = os.path.join(PROJECT_ROOT, 'training_logs')

        if not os.path.exists(training_logs_dir):
            return no_progress()

        candidates = [name for name in os.listdir(training_logs_dir) if name.endswith('.log')]

        # Prefer a file named after the process id, then one containing the task name.
        prefix = f'{process_id}_'
        chosen = next((name for name in candidates if name.startswith(prefix)), None)
        if chosen is None and task_name:
            chosen = next((name for name in candidates if task_name in name), None)

        log_file = os.path.join(training_logs_dir, chosen) if chosen else None
        if not log_file or not os.path.exists(log_file):
            return no_progress()

        # The trainer may still be writing, so a multi-byte character can be
        # cut off at the end; skip undecodable bytes instead of failing.
        try:
            with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError:
            return no_progress()

        return jsonify({'code': 0, 'data': _parse_training_progress(content, status)})
    except Exception as e:
        return jsonify({'code': 1, 'message': f'获取进度失败: {str(e)}'})
|
||||||
|
|
||||||
|
|
||||||
@app.route('/api/fine-tune', methods=['POST'])
|
@app.route('/api/fine-tune', methods=['POST'])
|
||||||
def create_fine_tune():
|
def create_fine_tune():
|
||||||
data = request.json
|
data = request.json
|
||||||
@@ -690,6 +866,39 @@ def update_fine_tune(id):
|
|||||||
|
|
||||||
@app.route('/api/fine-tune/<int:id>', methods=['DELETE'])
def delete_fine_tune(id):
    """Delete a fine-tune task together with its training log files.

    The task's ``process_id`` is looked up first; every ``.log`` file in the
    training log directory whose name starts with ``{process_id}_`` is
    removed on a best-effort basis, then the database record is deleted.

    Args:
        id: primary key of the ``fine_tune`` row, taken from the URL.

    Returns:
        A JSON response with ``code`` 0 once the record has been deleted.
    """
    # Fetch the task before deleting it; the process id names its log files.
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT process_id, name FROM fine_tune WHERE id = %s", (id,))
            task_info = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the lookup fails.
        conn.close()

    process_id = None
    if task_info:
        # Accept dict rows as well as plain tuples (process_id is column 0).
        if isinstance(task_info, dict):
            process_id = task_info.get('process_id')
        else:
            process_id = task_info[0]

    # Remove the related log files.
    if process_id:
        # Container path first, project-local path as the fallback.
        training_logs_dir = TRAINING_LOGS_DIR
        if not os.path.exists(training_logs_dir):
            training_logs_dir = os.path.join(PROJECT_ROOT, 'training_logs')

        try:
            if os.path.exists(training_logs_dir):
                prefix = f'{process_id}_'
                for file_name in os.listdir(training_logs_dir):
                    # Only log files that start with the process id belong to this task.
                    if not (file_name.endswith('.log') and file_name.startswith(prefix)):
                        continue
                    log_file = os.path.join(training_logs_dir, file_name)
                    try:
                        os.remove(log_file)
                        print(f"[INFO] 已删除日志文件: {log_file}")
                    except Exception as e:
                        print(f"[WARN] 删除日志文件失败: {log_file}, 错误: {e}")
        except Exception as e:
            # Log cleanup must never block deletion of the record itself.
            print(f"[WARN] 查找或删除日志文件时出错: {e}")

    # Delete the database record.
    generic_delete('fine_tune', id)
    return jsonify({'code': 0, 'message': '删除成功'})
|
||||||
|
|
||||||
|
|||||||
@@ -62,7 +62,9 @@ start_api() {
|
|||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
python src/main.py &
|
LOG_DIR="$SCRIPT_DIR/logs/$(date +%Y-%m-%d)"
|
||||||
|
mkdir -p "$LOG_DIR"
|
||||||
|
python src/main.py > "$LOG_DIR/api.log" 2>&1 &
|
||||||
API_PID=$!
|
API_PID=$!
|
||||||
echo "✅ 后端服务已启动 (PID: $API_PID, 端口: $API_PORT)"
|
echo "✅ 后端服务已启动 (PID: $API_PID, 端口: $API_PORT)"
|
||||||
echo "$API_PID" > /tmp/ygft_api.pid
|
echo "$API_PID" > /tmp/ygft_api.pid
|
||||||
|
|||||||
@@ -219,10 +219,21 @@
|
|||||||
<div class="mb-6">
|
<div class="mb-6">
|
||||||
<h3 class="text-sm font-semibold text-gray-700 mb-4 pb-2 border-b border-gray-100">基本信息</h3>
|
<h3 class="text-sm font-semibold text-gray-700 mb-4 pb-2 border-b border-gray-100">基本信息</h3>
|
||||||
<div class="mb-4">
|
<div class="mb-4">
|
||||||
<label class="block text-sm text-gray-600 mb-3">任务名称</label>
|
<label class="block text-sm text-gray-600 mb-3">
|
||||||
|
任务名称
|
||||||
|
<span class="text-gray-400 text-xs ml-1">(英文、数字、下划线)</span>
|
||||||
|
</label>
|
||||||
<div>
|
<div>
|
||||||
<input type="text" name="name" class="w-full px-3 py-2 border border-gray-300 rounded-lg text-sm focus:border-primary focus:outline-none" placeholder="请输入任务名称" maxlength="50">
|
<input type="text" name="name" class="w-full px-3 py-2 border border-gray-300 rounded-lg text-sm focus:border-primary focus:outline-none" placeholder="请输入任务名称" maxlength="50">
|
||||||
<p class="text-xs text-gray-400 mt-1"><span id="nameCount">0</span> / 50</p>
|
<p class="text-xs text-gray-400 mt-1"><span id="nameCount">0</span> / 50</p>
|
||||||
|
<p id="nameFormatError" class="text-xs text-red-500 mt-1 hidden">任务名称只能包含英文、数字和下划线</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<label class="block text-sm text-gray-600 mb-3">任务描述</label>
|
||||||
|
<div>
|
||||||
|
<textarea name="description" class="w-full px-3 py-2 border border-gray-300 rounded-lg text-sm focus:border-primary focus:outline-none resize-none" placeholder="请输入任务描述(选填)" maxlength="200" rows="3"></textarea>
|
||||||
|
<p class="text-xs text-gray-400 mt-1"><span id="descriptionCount">0</span> / 200</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -466,16 +477,16 @@
|
|||||||
</tr>
|
</tr>
|
||||||
<tr class="hover:bg-blue-50/30 transition-colors">
|
<tr class="hover:bg-blue-50/30 transition-colors">
|
||||||
<td class="py-3 px-4">
|
<td class="py-3 px-4">
|
||||||
<span class="text-gray-700 font-mono text-sm">eval_steps</span>
|
<span class="text-gray-700 font-mono text-sm">save_steps</span>
|
||||||
<span class="text-red-500 ml-1">*</span>
|
<span class="text-red-500 ml-1">*</span>
|
||||||
</td>
|
</td>
|
||||||
<td class="py-3 px-4">
|
<td class="py-3 px-4">
|
||||||
<input type="number" name="eval_steps" value="100" min="10" max="10000" class="w-24 px-3 py-1.5 border border-gray-300 rounded-lg text-sm text-center focus:border-primary focus:outline-none focus:ring-2 focus:ring-primary/20 transition-all">
|
<input type="number" name="save_steps" value="100" min="10" max="10000" class="w-24 px-3 py-1.5 border border-gray-300 rounded-lg text-sm text-center focus:border-primary focus:outline-none focus:ring-2 focus:ring-primary/20 transition-all">
|
||||||
</td>
|
</td>
|
||||||
<td class="py-3 px-4 text-xs text-gray-500">
|
<td class="py-3 px-4 text-xs text-gray-500">
|
||||||
<span class="inline-flex items-center px-2 py-0.5 rounded bg-gray-100 text-gray-600 font-mono">[10, 10000]</span>
|
<span class="inline-flex items-center px-2 py-0.5 rounded bg-gray-100 text-gray-600 font-mono">[10, 10000]</span>
|
||||||
</td>
|
</td>
|
||||||
<td class="py-3 px-4 text-xs text-gray-500 leading-relaxed">每训练多少步进行一次模型评估,建议设置为100的倍数</td>
|
<td class="py-3 px-4 text-xs text-gray-500 leading-relaxed">每训练多少步进行一次模型保存,建议设置为100的倍数</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr class="hover:bg-blue-50/30 transition-colors">
|
<tr class="hover:bg-blue-50/30 transition-colors">
|
||||||
<td class="py-3 px-4">
|
<td class="py-3 px-4">
|
||||||
@@ -616,14 +627,7 @@
|
|||||||
<div class="mb-6">
|
<div class="mb-6">
|
||||||
<h3 class="text-sm font-semibold text-gray-700 mb-4 pb-2 border-b border-gray-100">训练产出</h3>
|
<h3 class="text-sm font-semibold text-gray-700 mb-4 pb-2 border-b border-gray-100">训练产出</h3>
|
||||||
|
|
||||||
<!-- 模型名称 -->
|
<p class="text-sm text-gray-500 mb-4">训练完成后,模型将保存为: <code class="bg-gray-100 px-2 py-0.5 rounded text-primary" id="modelNamePreview">任务名称</code></p>
|
||||||
<div class="mb-4">
|
|
||||||
<label class="block text-sm text-gray-600 mb-3">模型名称</label>
|
|
||||||
<div>
|
|
||||||
<input type="text" name="output_model_name" class="w-64 px-3 py-2 border border-gray-300 rounded-lg text-sm focus:border-primary focus:outline-none" placeholder="请输入模型名称" maxlength="50">
|
|
||||||
<p class="text-xs text-gray-400 mt-1"><span id="modelNameCount">0</span> / 50</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- 训练命令预览 -->
|
<!-- 训练命令预览 -->
|
||||||
<div class="mt-4">
|
<div class="mt-4">
|
||||||
@@ -678,16 +682,38 @@
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// 任务名称字数统计
|
// 任务名称字数统计和实时预览(只能输入英文、数字、下划线)
|
||||||
const nameInput = document.querySelector('input[name="name"]');
|
const nameInput = document.querySelector('input[name="name"]');
|
||||||
|
const nameFormatError = document.getElementById('nameFormatError');
|
||||||
|
const nameRegex = /^[a-zA-Z0-9_]*$/;
|
||||||
|
|
||||||
nameInput.addEventListener('input', () => {
|
nameInput.addEventListener('input', () => {
|
||||||
|
const value = nameInput.value;
|
||||||
|
// 验证格式
|
||||||
|
if (value.length > 0 && !nameRegex.test(value)) {
|
||||||
|
nameInput.classList.add('border-red-500');
|
||||||
|
nameInput.classList.remove('border-gray-300');
|
||||||
|
nameFormatError.classList.remove('hidden');
|
||||||
|
} else {
|
||||||
|
nameInput.classList.remove('border-red-500');
|
||||||
|
nameInput.classList.add('border-gray-300');
|
||||||
|
nameFormatError.classList.add('hidden');
|
||||||
|
}
|
||||||
|
// 过滤非法字符:只允许英文、数字、下划线
|
||||||
|
const filteredValue = value.replace(/[^a-zA-Z0-9_]/g, '');
|
||||||
|
if (value !== filteredValue) {
|
||||||
|
nameInput.value = filteredValue;
|
||||||
|
}
|
||||||
document.getElementById('nameCount').textContent = nameInput.value.length;
|
document.getElementById('nameCount').textContent = nameInput.value.length;
|
||||||
|
// 更新模型名称预览
|
||||||
|
document.getElementById('modelNamePreview').textContent = nameInput.value || '任务名称';
|
||||||
|
updateCommandPreview();
|
||||||
});
|
});
|
||||||
|
|
||||||
// 模型名称字数统计
|
// 任务描述字数统计
|
||||||
const modelNameInput = document.querySelector('input[name="output_model_name"]');
|
const descInput = document.querySelector('textarea[name="description"]');
|
||||||
modelNameInput.addEventListener('input', () => {
|
descInput.addEventListener('input', () => {
|
||||||
document.getElementById('modelNameCount').textContent = modelNameInput.value.length;
|
document.getElementById('descriptionCount').textContent = descInput.value.length;
|
||||||
});
|
});
|
||||||
|
|
||||||
// 加载数据集列表
|
// 加载数据集列表
|
||||||
@@ -774,7 +800,7 @@
|
|||||||
'batch_size': 1,
|
'batch_size': 1,
|
||||||
'learning_rate': 0.0001,
|
'learning_rate': 0.0001,
|
||||||
'n_epochs': 1,
|
'n_epochs': 1,
|
||||||
'eval_steps': 100,
|
'save_steps': 100,
|
||||||
'lr_scheduler_type': 'cosine',
|
'lr_scheduler_type': 'cosine',
|
||||||
'max_length': 512,
|
'max_length': 512,
|
||||||
'warmup_ratio': 0.05,
|
'warmup_ratio': 0.05,
|
||||||
@@ -1014,7 +1040,7 @@
|
|||||||
batch_size: parseInt(formData.get('batch_size')) || 1,
|
batch_size: parseInt(formData.get('batch_size')) || 1,
|
||||||
learning_rate: parseFloat(formData.get('learning_rate')) || 0.0001,
|
learning_rate: parseFloat(formData.get('learning_rate')) || 0.0001,
|
||||||
n_epochs: parseFloat(formData.get('n_epochs')) || 1.0,
|
n_epochs: parseFloat(formData.get('n_epochs')) || 1.0,
|
||||||
eval_steps: parseInt(formData.get('eval_steps')) || 100,
|
save_steps: parseInt(formData.get('save_steps')) || 100,
|
||||||
lr_scheduler_type: formData.get('lr_scheduler_type') || 'cosine',
|
lr_scheduler_type: formData.get('lr_scheduler_type') || 'cosine',
|
||||||
max_length: parseInt(formData.get('max_length')) || 512,
|
max_length: parseInt(formData.get('max_length')) || 512,
|
||||||
warmup_ratio: parseFloat(formData.get('warmup_ratio')) || 0.05,
|
warmup_ratio: parseFloat(formData.get('warmup_ratio')) || 0.05,
|
||||||
@@ -1024,15 +1050,18 @@
|
|||||||
lora_rank: formData.get('lora_rank') || '8'
|
lora_rank: formData.get('lora_rank') || '8'
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const taskName = formData.get('name');
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
name: formData.get('name'),
|
name: taskName,
|
||||||
|
description: formData.get('description'),
|
||||||
base_model: formData.get('base_model'),
|
base_model: formData.get('base_model'),
|
||||||
template: formData.get('template'),
|
template: formData.get('template'),
|
||||||
train_type: formData.get('train_type'),
|
train_type: formData.get('train_type'),
|
||||||
train_method: formData.get('train_method'),
|
train_method: formData.get('train_method'),
|
||||||
gpus: selectedGPUs,
|
gpus: selectedGPUs,
|
||||||
train_dataset_id: formData.get('train_dataset_id'),
|
train_dataset_id: formData.get('train_dataset_id'),
|
||||||
output_model_name: formData.get('output_model_name'),
|
output_model_name: taskName, // 使用任务名称作为模型名称
|
||||||
...trainParams,
|
...trainParams,
|
||||||
status: 'pending',
|
status: 'pending',
|
||||||
progress: 0
|
progress: 0
|
||||||
@@ -1042,6 +1071,26 @@
|
|||||||
showMessage('提示', '请输入任务名称', 'warning');
|
showMessage('提示', '请输入任务名称', 'warning');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 验证任务名称格式
|
||||||
|
const nameRegex = /^[a-zA-Z0-9_]+$/;
|
||||||
|
if (!nameRegex.test(data.name)) {
|
||||||
|
showMessage('提示', '任务名称只能包含英文、数字和下划线', 'warning');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 检查任务名称是否重复
|
||||||
|
try {
|
||||||
|
const checkResponse = await fetch(`${API_BASE}/fine-tune/check-name?name=${encodeURIComponent(data.name)}`);
|
||||||
|
const checkResult = await checkResponse.json();
|
||||||
|
if (checkResult.code === 0 && checkResult.data.exists) {
|
||||||
|
showMessage('提示', '任务名称已存在,请使用其他名称', 'warning');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('检查任务名称失败:', error);
|
||||||
|
}
|
||||||
|
|
||||||
if (selectedGPUs.length === 0) {
|
if (selectedGPUs.length === 0) {
|
||||||
showMessage('提示', '请选择至少一个GPU硬件', 'warning');
|
showMessage('提示', '请选择至少一个GPU硬件', 'warning');
|
||||||
return;
|
return;
|
||||||
@@ -1060,6 +1109,12 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
// 显示加载中状态
|
||||||
|
const submitBtn = document.querySelector('button[onclick="submitForm()"]');
|
||||||
|
const originalText = submitBtn.innerHTML;
|
||||||
|
submitBtn.disabled = true;
|
||||||
|
submitBtn.innerHTML = '<i class="fa fa-spinner fa-spin mr-2"></i>训练任务创建中...';
|
||||||
|
|
||||||
// 第一步:创建训练任务记录
|
// 第一步:创建训练任务记录
|
||||||
const createResponse = await fetch(`${API_BASE}/fine-tune`, {
|
const createResponse = await fetch(`${API_BASE}/fine-tune`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@@ -1068,6 +1123,8 @@
|
|||||||
});
|
});
|
||||||
const createResult = await createResponse.json();
|
const createResult = await createResponse.json();
|
||||||
if (createResult.code !== 0) {
|
if (createResult.code !== 0) {
|
||||||
|
submitBtn.disabled = false;
|
||||||
|
submitBtn.innerHTML = originalText;
|
||||||
showMessage('错误', createResult.message || '创建任务失败', 'error');
|
showMessage('错误', createResult.message || '创建任务失败', 'error');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -1077,12 +1134,13 @@
|
|||||||
// 第二步:启动训练
|
// 第二步:启动训练
|
||||||
const startData = {
|
const startData = {
|
||||||
task_id: taskId,
|
task_id: taskId,
|
||||||
|
name: data.name, // 任务名称,用于日志文件名和模型名称
|
||||||
base_model: data.base_model,
|
base_model: data.base_model,
|
||||||
template: data.template,
|
template: data.template,
|
||||||
train_type: data.train_type,
|
train_type: data.train_type,
|
||||||
train_method: data.train_method,
|
train_method: data.train_method,
|
||||||
train_dataset_id: data.train_dataset_id,
|
train_dataset_id: data.train_dataset_id,
|
||||||
output_model_name: data.output_model_name,
|
output_model_name: data.name, // 使用任务名称作为模型名称
|
||||||
...trainParams
|
...trainParams
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1093,9 +1151,12 @@
|
|||||||
});
|
});
|
||||||
const startResult = await startResponse.json();
|
const startResult = await startResponse.json();
|
||||||
|
|
||||||
|
// 恢复按钮状态
|
||||||
|
submitBtn.disabled = false;
|
||||||
|
submitBtn.innerHTML = originalText;
|
||||||
|
|
||||||
if (startResult.code === 0) {
|
if (startResult.code === 0) {
|
||||||
const cmd = startResult.data?.command || '';
|
showMessage('成功', '训练任务已启动!', 'success', () => {
|
||||||
showMessage('成功', `训练任务已启动!<br><br><code class="text-xs bg-gray-100 p-1 rounded">${cmd}</code>`, 'success', () => {
|
|
||||||
window.location.href = 'main.html';
|
window.location.href = 'main.html';
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
@@ -1108,6 +1169,12 @@
|
|||||||
showMessage('错误', startResult.message || '启动训练失败', 'error');
|
showMessage('错误', startResult.message || '启动训练失败', 'error');
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
// 恢复按钮状态
|
||||||
|
const submitBtn = document.querySelector('button[onclick="submitForm()"]');
|
||||||
|
if (submitBtn) {
|
||||||
|
submitBtn.disabled = false;
|
||||||
|
submitBtn.innerHTML = '开始训练';
|
||||||
|
}
|
||||||
showMessage('错误', '操作失败: ' + error.message, 'error');
|
showMessage('错误', '操作失败: ' + error.message, 'error');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1146,9 +1213,10 @@
|
|||||||
const trainMethod = formData.get('train_method') || 'lora';
|
const trainMethod = formData.get('train_method') || 'lora';
|
||||||
const methodMap = { 'lora': 'lora', 'full': 'full' };
|
const methodMap = { 'lora': 'lora', 'full': 'full' };
|
||||||
|
|
||||||
// 获取输出模型名称
|
// 获取输出模型名称(使用任务名称)
|
||||||
const outputModelName = formData.get('output_model_name') || `${template}/${trainMethod}`;
|
const taskName = formData.get('name') || 'task_name';
|
||||||
const outputDir = outputModelName.startsWith('./') ? outputModelName : `./saves/${outputModelName}`;
|
const outputModelName = taskName;
|
||||||
|
const outputDir = outputModelName.startsWith('/') ? outputModelName : `/app/base/saves/${outputModelName}`;
|
||||||
|
|
||||||
// 获取数据集名称
|
// 获取数据集名称
|
||||||
const trainDatasetSelect = form.querySelector('select[name="train_dataset_id"]');
|
const trainDatasetSelect = form.querySelector('select[name="train_dataset_id"]');
|
||||||
@@ -1167,7 +1235,7 @@
|
|||||||
const nEpochs = parseFloat(formData.get('n_epochs')) || 1.0;
|
const nEpochs = parseFloat(formData.get('n_epochs')) || 1.0;
|
||||||
const maxLength = parseInt(formData.get('max_length')) || 512;
|
const maxLength = parseInt(formData.get('max_length')) || 512;
|
||||||
const warmupSteps = parseInt(formData.get('warmup_steps')) || 20;
|
const warmupSteps = parseInt(formData.get('warmup_steps')) || 20;
|
||||||
const evalSteps = parseInt(formData.get('eval_steps')) || 100;
|
const saveSteps = parseInt(formData.get('save_steps')) || 100;
|
||||||
const gradientAccumulationSteps = parseInt(formData.get('gradient_accumulation_steps')) || 8;
|
const gradientAccumulationSteps = parseInt(formData.get('gradient_accumulation_steps')) || 8;
|
||||||
const lrSchedulerType = formData.get('lr_scheduler_type') || 'cosine';
|
const lrSchedulerType = formData.get('lr_scheduler_type') || 'cosine';
|
||||||
|
|
||||||
@@ -1204,10 +1272,10 @@
|
|||||||
cmd += ` --lr_scheduler_type ${lrSchedulerType} \\\n`;
|
cmd += ` --lr_scheduler_type ${lrSchedulerType} \\\n`;
|
||||||
cmd += ` --logging_steps 50 \\\n`;
|
cmd += ` --logging_steps 50 \\\n`;
|
||||||
cmd += ` --warmup_steps ${warmupSteps} \\\n`;
|
cmd += ` --warmup_steps ${warmupSteps} \\\n`;
|
||||||
cmd += ` --save_steps 100 \\\n`;
|
cmd += ` --save_steps ${saveSteps} \\\n`;
|
||||||
cmd += ` --eval_steps ${evalSteps} \\\n`;
|
|
||||||
cmd += ` --learning_rate ${learningRate} \\\n`;
|
cmd += ` --learning_rate ${learningRate} \\\n`;
|
||||||
cmd += ` --num_train_epochs ${nEpochs}`;
|
cmd += ` --num_train_epochs ${nEpochs} \\\n`;
|
||||||
|
cmd += ` --plot_loss`;
|
||||||
|
|
||||||
return cmd;
|
return cmd;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -260,6 +260,11 @@
|
|||||||
<header class="bg-white border-b border-gray-200 shadow-sm">
|
<header class="bg-white border-b border-gray-200 shadow-sm">
|
||||||
<div class="flex items-center justify-between px-6 h-14">
|
<div class="flex items-center justify-between px-6 h-14">
|
||||||
<div class="flex items-center space-x-6">
|
<div class="flex items-center space-x-6">
|
||||||
|
<!-- 返回按钮(仅外部页面显示) -->
|
||||||
|
<button id="pageBackBtn" class="hidden text-gray-500 hover:text-gray-700 flex items-center transition-colors" onclick="goBackToList()">
|
||||||
|
<i class="fa fa-arrow-left mr-1"></i>
|
||||||
|
<span>返回</span>
|
||||||
|
</button>
|
||||||
<button class="md:hidden text-gray-500 hover:text-gray-700">
|
<button class="md:hidden text-gray-500 hover:text-gray-700">
|
||||||
<i class="fa fa-bars"></i>
|
<i class="fa fa-bars"></i>
|
||||||
</button>
|
</button>
|
||||||
@@ -304,28 +309,6 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
// 会话超时检查(5分钟)
|
|
||||||
const SESSION_TIMEOUT = 5 * 60 * 1000; // 5分钟
|
|
||||||
function checkSession() {
|
|
||||||
const loginTime = localStorage.getItem('loginTime');
|
|
||||||
if (!loginTime || (Date.now() - parseInt(loginTime)) > SESSION_TIMEOUT) {
|
|
||||||
// 会话过期,清除并跳转到登录页
|
|
||||||
localStorage.removeItem('loginTime');
|
|
||||||
localStorage.removeItem('username');
|
|
||||||
window.location.href = 'login.html';
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// 更新登录时间(用户有活动时续期)
|
|
||||||
localStorage.setItem('loginTime', Date.now());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 页面加载时检查会话
|
|
||||||
if (!checkSession()) {
|
|
||||||
// 阻止页面渲染
|
|
||||||
document.body.innerHTML = '';
|
|
||||||
}
|
|
||||||
|
|
||||||
// API 基础地址 - 使用 config.yaml 中的 app.port (7861)
|
// API 基础地址 - 使用 config.yaml 中的 app.port (7861)
|
||||||
const getApiBase = () => {
|
const getApiBase = () => {
|
||||||
const protocol = window.location.protocol;
|
const protocol = window.location.protocol;
|
||||||
@@ -430,9 +413,8 @@
|
|||||||
createText: '创建训练任务',
|
createText: '创建训练任务',
|
||||||
columns: [
|
columns: [
|
||||||
{ title: '任务名称', key: 'name' },
|
{ title: '任务名称', key: 'name' },
|
||||||
{ title: '基础模型', key: 'base_model' },
|
{ title: '基础模型', key: 'base_model', render: (val, row) => `<span class="model-name-cell" data-model-id="${val}">加载中...</span>` },
|
||||||
{ title: '状态', key: 'status', render: (val) => `<span class="px-2 py-1 rounded text-xs ${val === 'running' ? 'bg-green-100 text-green-700' : val === 'failed' ? 'bg-red-100 text-red-700' : 'bg-gray-100 text-gray-700'}">${val}</span>` },
|
{ title: '状态', key: 'status', render: (val) => `<span class="px-2 py-1 rounded text-xs ${val === 'running' ? 'bg-green-100 text-green-700' : val === 'failed' ? 'bg-red-100 text-red-700' : 'bg-gray-100 text-gray-700'}">${val}</span>` },
|
||||||
{ title: '进度', key: 'progress', render: (val) => `${val || 0}%` },
|
|
||||||
{ title: '创建时间', key: 'create_time', render: (val) => val ? new Date(val).toLocaleString('zh-CN') : '-' }
|
{ title: '创建时间', key: 'create_time', render: (val) => val ? new Date(val).toLocaleString('zh-CN') : '-' }
|
||||||
],
|
],
|
||||||
actions: ['stop', 'logs', 'delete']
|
actions: ['stop', 'logs', 'delete']
|
||||||
@@ -586,6 +568,12 @@
|
|||||||
skipFetch: true,
|
skipFetch: true,
|
||||||
hasCreate: false,
|
hasCreate: false,
|
||||||
isExternalPage: true
|
isExternalPage: true
|
||||||
|
},
|
||||||
|
'training-log': {
|
||||||
|
title: '训练日志',
|
||||||
|
skipFetch: true,
|
||||||
|
hasCreate: false,
|
||||||
|
isExternalPage: true
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -606,6 +594,124 @@
|
|||||||
'chat': '对话'
|
'chat': '对话'
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// 训练进度缓存
|
||||||
|
let trainingProgressCache = {};
|
||||||
|
let progressRefreshTimer = null;
|
||||||
|
|
||||||
|
// 渲染训练进度
|
||||||
|
function renderTrainingProgress(val, row) {
|
||||||
|
const progressData = trainingProgressCache[row.id];
|
||||||
|
if (progressData && progressData.status === 'running') {
|
||||||
|
if (progressData.progress > 0) {
|
||||||
|
return `
|
||||||
|
<div class="flex flex-col">
|
||||||
|
<span class="text-sm font-medium text-primary">${progressData.progress}%</span>
|
||||||
|
<span class="text-xs text-gray-500">${progressData.step || ''} ${progressData.speed || ''}</span>
|
||||||
|
<span class="text-xs text-gray-400">ETA: ${progressData.eta || '--:--'}</span>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return `${val || 0}%`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 刷新训练进度
|
||||||
|
async function refreshTrainingProgress() {
|
||||||
|
if (currentPage !== 'fine-tune') return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch(`${API_BASE}/fine-tune`);
|
||||||
|
const result = await response.json();
|
||||||
|
|
||||||
|
if (result.code === 0 && result.data) {
|
||||||
|
// 刷新运行中或已完成的任务(有进度信息)
|
||||||
|
const activeTasks = result.data.filter(task =>
|
||||||
|
task.status === 'running' || task.status === 'pending'
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const task of activeTasks) {
|
||||||
|
try {
|
||||||
|
// 并行获取进度和PID状态
|
||||||
|
const [progressResponse, statusResponse] = await Promise.all([
|
||||||
|
fetch(`${API_BASE}/fine-tune/progress/${task.id}`),
|
||||||
|
fetch(`${API_BASE}/fine-tune/${task.id}`)
|
||||||
|
]);
|
||||||
|
const progressResult = await progressResponse.json();
|
||||||
|
const statusResult = await statusResponse.json();
|
||||||
|
|
||||||
|
if (progressResult.code === 0 && progressResult.data) {
|
||||||
|
trainingProgressCache[task.id] = progressResult.data;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 如果状态已改变(PID已结束),更新表格中的状态显示
|
||||||
|
if (statusResult.code === 0 && statusResult.data) {
|
||||||
|
const actualStatus = statusResult.data.status;
|
||||||
|
if (task.status !== actualStatus) {
|
||||||
|
// 找到对应的行并更新状态
|
||||||
|
const row = document.querySelector(`tr[data-id="${task.id}"]`);
|
||||||
|
if (row) {
|
||||||
|
const statusCell = row.querySelector('td:nth-child(3)');
|
||||||
|
if (statusCell) {
|
||||||
|
statusCell.innerHTML = `<span class="px-2 py-1 rounded text-xs ${actualStatus === 'running' ? 'bg-green-100 text-green-700' : actualStatus === 'failed' ? 'bg-red-100 text-red-700' : 'bg-blue-100 text-blue-700'}">${actualStatus}</span>`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn(`获取任务 ${task.id} 信息失败:`, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.warn('刷新训练进度失败:', error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 检查并更新任务状态(用于 fine-tune 页面)
|
||||||
|
async function checkAndUpdateTaskStatus() {
|
||||||
|
if (currentPage !== 'fine-tune') return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch(`${API_BASE}/fine-tune`);
|
||||||
|
const result = await response.json();
|
||||||
|
|
||||||
|
if (result.code === 0 && result.data) {
|
||||||
|
// 获取所有 running 状态的任务
|
||||||
|
const runningTasks = result.data.filter(task => task.status === 'running');
|
||||||
|
|
||||||
|
for (const task of runningTasks) {
|
||||||
|
try {
|
||||||
|
// 调用 status API 获取实际状态(会检查 PID)
|
||||||
|
const statusResponse = await fetch(`${API_BASE}/fine-tune/${task.id}`);
|
||||||
|
const statusResult = await statusResponse.json();
|
||||||
|
|
||||||
|
if (statusResult.code === 0 && statusResult.data) {
|
||||||
|
const actualStatus = statusResult.data.status;
|
||||||
|
// 如果实际状态不是 running,更新表格显示
|
||||||
|
if (actualStatus !== 'running') {
|
||||||
|
const row = document.querySelector(`tr[data-id="${task.id}"]`);
|
||||||
|
if (row) {
|
||||||
|
const statusCell = row.querySelector('td:nth-child(3)');
|
||||||
|
if (statusCell) {
|
||||||
|
const statusClass = actualStatus === 'failed'
|
||||||
|
? 'bg-red-100 text-red-700'
|
||||||
|
: 'bg-blue-100 text-blue-700';
|
||||||
|
statusCell.innerHTML = `<span class="px-2 py-1 rounded text-xs ${statusClass}">${actualStatus}</span>`;
|
||||||
|
console.log(`[Status] 任务 ${task.id} 状态已更新: running -> ${actualStatus}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn(`检查任务 ${task.id} 状态失败:`, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.warn('检查任务状态失败:', error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 页面加载完成后初始化
|
// 页面加载完成后初始化
|
||||||
document.addEventListener('DOMContentLoaded', function() {
|
document.addEventListener('DOMContentLoaded', function() {
|
||||||
// 从 localStorage 加载自定义工具
|
// 从 localStorage 加载自定义工具
|
||||||
@@ -645,6 +751,12 @@
|
|||||||
|
|
||||||
loadPage(defaultPage);
|
loadPage(defaultPage);
|
||||||
|
|
||||||
|
// 启动训练进度自动刷新(每5秒)
|
||||||
|
progressRefreshTimer = setInterval(() => {
|
||||||
|
refreshTrainingProgress();
|
||||||
|
checkAndUpdateTaskStatus();
|
||||||
|
}, 5000);
|
||||||
|
|
||||||
// 更新侧边栏高亮状态
|
// 更新侧边栏高亮状态
|
||||||
document.querySelectorAll('.nav-link').forEach(link => {
|
document.querySelectorAll('.nav-link').forEach(link => {
|
||||||
if (link.dataset.page === defaultPage) {
|
if (link.dataset.page === defaultPage) {
|
||||||
@@ -727,6 +839,14 @@
|
|||||||
// 离开日志页面时停止自动刷新
|
// 离开日志页面时停止自动刷新
|
||||||
stopLogAutoRefresh();
|
stopLogAutoRefresh();
|
||||||
|
|
||||||
|
// 离开模型调优页面时停止进度刷新
|
||||||
|
if (currentPage === 'fine-tune' && pageName !== 'fine-tune') {
|
||||||
|
if (progressRefreshTimer) {
|
||||||
|
clearInterval(progressRefreshTimer);
|
||||||
|
progressRefreshTimer = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const container = document.getElementById('page-content');
|
const container = document.getElementById('page-content');
|
||||||
const config = tableConfigs[pageName];
|
const config = tableConfigs[pageName];
|
||||||
|
|
||||||
@@ -740,6 +860,14 @@
|
|||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
// 显示/隐藏返回按钮(外部页面显示,普通页面隐藏)
|
||||||
|
const backBtn = document.getElementById('pageBackBtn');
|
||||||
|
if (config.isExternalPage) {
|
||||||
|
backBtn.classList.remove('hidden');
|
||||||
|
} else {
|
||||||
|
backBtn.classList.add('hidden');
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// 渲染页面
|
// 渲染页面
|
||||||
if (config.isExternalPage) {
|
if (config.isExternalPage) {
|
||||||
@@ -747,9 +875,14 @@
|
|||||||
const response = await fetch(`${pageName}.html?t=${Date.now()}`);
|
const response = await fetch(`${pageName}.html?t=${Date.now()}`);
|
||||||
if (response.ok) {
|
if (response.ok) {
|
||||||
const html = await response.text();
|
const html = await response.text();
|
||||||
// 只提取内联脚本内容(没有src属性的script标签)
|
// 提取所有内联脚本内容(没有src属性的script标签)
|
||||||
const scriptMatch = html.match(/<script\b(?![^>]*\bsrc)[^>]*>([\s\S]*?)<\/script>/);
|
const scriptRegex = /<script\b(?![^>]*\bsrc)[^>]*>([\s\S]*?)<\/script>/g;
|
||||||
const scriptContent = scriptMatch ? scriptMatch[1] : '';
|
const scriptContents = [];
|
||||||
|
let match;
|
||||||
|
while ((match = scriptRegex.exec(html)) !== null) {
|
||||||
|
scriptContents.push(match[1]);
|
||||||
|
}
|
||||||
|
const scriptContent = scriptContents.join('\n');
|
||||||
// 移除所有script标签后插入HTML
|
// 移除所有script标签后插入HTML
|
||||||
const htmlWithoutScript = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/g, '');
|
const htmlWithoutScript = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/g, '');
|
||||||
|
|
||||||
@@ -785,10 +918,18 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
container.innerHTML = headerHtml + htmlWithoutScript;
|
container.innerHTML = headerHtml + htmlWithoutScript;
|
||||||
// 执行脚本
|
// 执行脚本 - 使用 script 元素注入,使函数在全局作用域可用
|
||||||
if (scriptContent && scriptContent.trim()) {
|
if (scriptContent && scriptContent.trim()) {
|
||||||
try {
|
try {
|
||||||
eval(scriptContent);
|
// 移除可能存在的旧脚本容器
|
||||||
|
const oldScript = document.getElementById('externalPageScript');
|
||||||
|
if (oldScript) oldScript.remove();
|
||||||
|
|
||||||
|
// 创建新的 script 元素
|
||||||
|
const scriptEl = document.createElement('script');
|
||||||
|
scriptEl.id = 'externalPageScript';
|
||||||
|
scriptEl.textContent = scriptContent;
|
||||||
|
document.body.appendChild(scriptEl);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error('执行脚本失败:', e);
|
console.error('执行脚本失败:', e);
|
||||||
}
|
}
|
||||||
@@ -812,6 +953,17 @@
|
|||||||
const data = await fetchData(`${API_BASE}/${config.api}`);
|
const data = await fetchData(`${API_BASE}/${config.api}`);
|
||||||
currentPageData = data; // 保存当前页面数据
|
currentPageData = data; // 保存当前页面数据
|
||||||
container.innerHTML = renderTablePage(config, data);
|
container.innerHTML = renderTablePage(config, data);
|
||||||
|
|
||||||
|
// 异步更新模型名称单元格
|
||||||
|
setTimeout(() => {
|
||||||
|
const modelCells = container.querySelectorAll('.model-name-cell');
|
||||||
|
modelCells.forEach(cell => {
|
||||||
|
const modelId = cell.getAttribute('data-model-id');
|
||||||
|
if (modelId) {
|
||||||
|
fetchAndUpdateModelName(modelId, cell);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}, 0);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('加载数据失败:', error);
|
console.error('加载数据失败:', error);
|
||||||
@@ -859,6 +1011,44 @@
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 停止训练任务
|
||||||
|
async function stopItem(taskId) {
|
||||||
|
showConfirm('确认停止', '确定要停止这个训练任务吗?进程将被终止。', async () => {
|
||||||
|
try {
|
||||||
|
const response = await fetch(`${API_BASE}/fine-tune/stop/${taskId}`, {
|
||||||
|
method: 'POST'
|
||||||
|
});
|
||||||
|
const result = await response.json();
|
||||||
|
if (result.code === 0) {
|
||||||
|
showMessage('成功', '训练任务已停止', 'success');
|
||||||
|
// 刷新当前页面
|
||||||
|
const activeLink = document.querySelector('.nav-link.sidebar-item-active');
|
||||||
|
if (activeLink) {
|
||||||
|
loadPage(activeLink.dataset.page);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
showMessage('错误', result.message || '停止失败', 'error');
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
showMessage('错误', '停止失败: ' + error.message, 'error');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// 跳转到训练日志二级页面
|
||||||
|
function navigateToTrainingLog(taskId) {
|
||||||
|
// 设置 sessionStorage 传递 taskId
|
||||||
|
sessionStorage.setItem('trainingLogTaskId', taskId.toString());
|
||||||
|
// 跳转到日志页面
|
||||||
|
navigateToPage('training-log');
|
||||||
|
}
|
||||||
|
|
||||||
|
// 查看训练日志 - 跳转到日志页面
|
||||||
|
async function viewTrainingLog(taskId, taskName) {
|
||||||
|
// 跳转到日志页面
|
||||||
|
loadPage('logs');
|
||||||
|
}
|
||||||
|
|
||||||
// 更新模型用途
|
// 更新模型用途
|
||||||
async function updateModelPurpose(id, purpose) {
|
async function updateModelPurpose(id, purpose) {
|
||||||
try {
|
try {
|
||||||
@@ -1133,7 +1323,7 @@
|
|||||||
` : ''}
|
` : ''}
|
||||||
${columns.map(col => `
|
${columns.map(col => `
|
||||||
<td class="px-4 py-4 text-sm text-center">
|
<td class="px-4 py-4 text-sm text-center">
|
||||||
${col.render ? col.render(item[col.key]) : (item[col.key] || '-')}
|
${col.render ? col.render(item[col.key], item) : (item[col.key] || '-')}
|
||||||
</td>
|
</td>
|
||||||
`).join('')}
|
`).join('')}
|
||||||
<td class="px-4 py-4 text-sm text-center">
|
<td class="px-4 py-4 text-sm text-center">
|
||||||
@@ -1141,7 +1331,16 @@
|
|||||||
${config.actions.map(action => {
|
${config.actions.map(action => {
|
||||||
let onclick = '';
|
let onclick = '';
|
||||||
let btnClass = 'text-primary hover:text-primary/80';
|
let btnClass = 'text-primary hover:text-primary/80';
|
||||||
if (action === 'delete') {
|
|
||||||
|
// 对于 fine-tune 的停止按钮,检查状态
|
||||||
|
if (action === 'stop' && config.api === 'fine-tune') {
|
||||||
|
// 状态为 completed 或 failed 时隐藏停止按钮
|
||||||
|
if (item.status === 'completed' || item.status === 'failed') {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
onclick = `stopItem(${item.id})`;
|
||||||
|
btnClass = 'text-orange-500 hover:text-orange-600';
|
||||||
|
} else if (action === 'delete') {
|
||||||
onclick = `deleteItem('${config.api}', ${item.id})`;
|
onclick = `deleteItem('${config.api}', ${item.id})`;
|
||||||
btnClass = 'text-danger hover:text-danger/80';
|
btnClass = 'text-danger hover:text-danger/80';
|
||||||
} else if (action === 'edit') {
|
} else if (action === 'edit') {
|
||||||
@@ -1152,6 +1351,8 @@
|
|||||||
onclick = `downloadDataset('${item.id}')`;
|
onclick = `downloadDataset('${item.id}')`;
|
||||||
} else if (action === 'compare' && config.api === 'model-compare') {
|
} else if (action === 'compare' && config.api === 'model-compare') {
|
||||||
onclick = `startCompare(${item.id})`;
|
onclick = `startCompare(${item.id})`;
|
||||||
|
} else if (action === 'logs' && config.api === 'fine-tune') {
|
||||||
|
onclick = `navigateToTrainingLog(${item.id})`;
|
||||||
} else {
|
} else {
|
||||||
onclick = `showMessage('提示', '${actionLabels[action] || action}功能开发中...', 'info')`;
|
onclick = `showMessage('提示', '${actionLabels[action] || action}功能开发中...', 'info')`;
|
||||||
}
|
}
|
||||||
@@ -1189,33 +1390,59 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="p-4">
|
<div class="p-4">
|
||||||
<!-- 日期和刷新间隔选择 -->
|
<!-- 日志类型切换 -->
|
||||||
<div class="flex items-center flex-wrap gap-4 mb-4">
|
<div class="flex items-center mb-4">
|
||||||
<div class="flex items-center">
|
<div class="flex bg-gray-100 rounded-lg p-1">
|
||||||
<label class="text-sm text-gray-600 mr-3">选择日期:</label>
|
<button id="logTabSystem" onclick="switchLogTab('system')" class="px-4 py-1.5 text-sm rounded-md transition-colors bg-white shadow-sm text-primary">
|
||||||
<input type="date" id="logDatePicker" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none" onchange="loadLogFiles()">
|
系统日志
|
||||||
|
</button>
|
||||||
|
<button id="logTabTraining" onclick="switchLogTab('training')" class="px-4 py-1.5 text-sm rounded-md transition-colors text-gray-600 hover:text-gray-800">
|
||||||
|
训练日志
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex items-center">
|
</div>
|
||||||
<label class="text-sm text-gray-600 mr-3">自动刷新:</label>
|
|
||||||
<select id="logRefreshInterval" onchange="setRefreshInterval()" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
<!-- 系统日志选项 -->
|
||||||
<option value="0">关闭</option>
|
<div id="systemLogOptions">
|
||||||
<option value="5">5秒</option>
|
<!-- 日期选择 -->
|
||||||
<option value="10" selected>10秒</option>
|
<div class="flex items-center flex-wrap gap-4 mb-4">
|
||||||
<option value="30">30秒</option>
|
<div class="flex items-center">
|
||||||
<option value="60">60秒</option>
|
<label class="text-sm text-gray-600 mr-3">选择日期:</label>
|
||||||
|
<input type="date" id="logDatePicker" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none" onchange="loadLogFiles()">
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center">
|
||||||
|
<label class="text-sm text-gray-600 mr-3">自动刷新:</label>
|
||||||
|
<select id="logRefreshInterval" onchange="setRefreshInterval()" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
||||||
|
<option value="0">关闭</option>
|
||||||
|
<option value="5">5秒</option>
|
||||||
|
<option value="10" selected>10秒</option>
|
||||||
|
<option value="30">30秒</option>
|
||||||
|
<option value="60">60秒</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div id="logRefreshCountdown" class="text-sm text-gray-500 hidden">
|
||||||
|
<i class="fa fa-clock-o mr-1"></i><span>下次刷新: <span id="countdownNumber">10</span>秒</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<!-- 日志类型选择 -->
|
||||||
|
<div class="flex items-center mb-4">
|
||||||
|
<label class="text-sm text-gray-600 mr-3">日志类型:</label>
|
||||||
|
<select id="logTypeSelect" onchange="loadSelectedLog()" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
||||||
|
<option value="">请选择日志文件</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<div id="logRefreshCountdown" class="text-sm text-gray-500 hidden">
|
</div>
|
||||||
<i class="fa fa-clock-o mr-1"></i><span>下次刷新: <span id="countdownNumber">10</span>秒</span>
|
|
||||||
|
<!-- 训练日志选项(初始隐藏) -->
|
||||||
|
<div id="trainingLogOptions" class="hidden">
|
||||||
|
<div class="flex items-center mb-4">
|
||||||
|
<label class="text-sm text-gray-600 mr-3">训练日志:</label>
|
||||||
|
<select id="trainingLogSelect" onchange="loadSelectedTrainingLog()" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none flex-1">
|
||||||
|
<option value="">请选择训练日志</option>
|
||||||
|
</select>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<!-- 日志类型选择 -->
|
|
||||||
<div class="flex items-center mb-4">
|
|
||||||
<label class="text-sm text-gray-600 mr-3">日志类型:</label>
|
|
||||||
<select id="logTypeSelect" onchange="loadSelectedLog()" class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
|
||||||
<option value="">请选择日志文件</option>
|
|
||||||
</select>
|
|
||||||
</div>
|
|
||||||
<!-- 日志内容显示 -->
|
<!-- 日志内容显示 -->
|
||||||
<div class="border border-gray-200 rounded-lg">
|
<div class="border border-gray-200 rounded-lg">
|
||||||
<div class="flex items-center justify-between px-4 py-2 bg-gray-50 border-b border-gray-200">
|
<div class="flex items-center justify-between px-4 py-2 bg-gray-50 border-b border-gray-200">
|
||||||
@@ -1233,24 +1460,131 @@
|
|||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 当前日志类型:system 或 training
|
||||||
|
let currentLogTab = 'system';
|
||||||
|
|
||||||
|
// 切换日志类型标签
|
||||||
|
function switchLogTab(tab) {
|
||||||
|
currentLogTab = tab;
|
||||||
|
const systemTab = document.getElementById('logTabSystem');
|
||||||
|
const trainingTab = document.getElementById('logTabTraining');
|
||||||
|
const systemOptions = document.getElementById('systemLogOptions');
|
||||||
|
const trainingOptions = document.getElementById('trainingLogOptions');
|
||||||
|
|
||||||
|
if (tab === 'system') {
|
||||||
|
systemTab.className = 'px-4 py-1.5 text-sm rounded-md transition-colors bg-white shadow-sm text-primary';
|
||||||
|
trainingTab.className = 'px-4 py-1.5 text-sm rounded-md transition-colors text-gray-600 hover:text-gray-800';
|
||||||
|
systemOptions.classList.remove('hidden');
|
||||||
|
trainingOptions.classList.add('hidden');
|
||||||
|
loadLogFiles();
|
||||||
|
} else {
|
||||||
|
trainingTab.className = 'px-4 py-1.5 text-sm rounded-md transition-colors bg-white shadow-sm text-primary';
|
||||||
|
systemTab.className = 'px-4 py-1.5 text-sm rounded-md transition-colors text-gray-600 hover:text-gray-800';
|
||||||
|
trainingOptions.classList.remove('hidden');
|
||||||
|
systemOptions.classList.add('hidden');
|
||||||
|
loadTrainingLogFiles();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 初始化日志查看器
|
// 初始化日志查看器
|
||||||
function initLogViewer() {
|
function initLogViewer() {
|
||||||
const datePicker = document.getElementById('logDatePicker');
|
const datePicker = document.getElementById('logDatePicker');
|
||||||
if (datePicker) {
|
if (datePicker) {
|
||||||
const today = new Date().toISOString().split('T')[0];
|
const today = new Date().toISOString().split('T')[0];
|
||||||
datePicker.value = today;
|
datePicker.value = today;
|
||||||
loadLogFiles();
|
|
||||||
}
|
}
|
||||||
|
// 加载默认日志类型
|
||||||
|
loadLogFiles();
|
||||||
// 启动自动刷新
|
// 启动自动刷新
|
||||||
setRefreshInterval();
|
setRefreshInterval();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 加载训练日志文件列表
|
||||||
|
async function loadTrainingLogFiles() {
|
||||||
|
const logSelect = document.getElementById('trainingLogSelect');
|
||||||
|
if (!logSelect) return;
|
||||||
|
|
||||||
|
logSelect.innerHTML = '<option value="">加载中...</option>';
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch(`${API_BASE}/training-log-files`);
|
||||||
|
const result = await response.json();
|
||||||
|
|
||||||
|
if (result.code === 0 && result.data) {
|
||||||
|
logSelect.innerHTML = '<option value="">请选择训练日志</option>';
|
||||||
|
result.data.forEach(log => {
|
||||||
|
const option = document.createElement('option');
|
||||||
|
option.value = log.file;
|
||||||
|
option.textContent = `${log.name} (PID: ${log.pid}, ${log.date}, ${log.size})`;
|
||||||
|
logSelect.appendChild(option);
|
||||||
|
});
|
||||||
|
// 如果有日志文件,自动加载第一个
|
||||||
|
if (result.data.length > 0) {
|
||||||
|
logSelect.value = result.data[0].file;
|
||||||
|
loadSelectedTrainingLog();
|
||||||
|
} else {
|
||||||
|
document.getElementById('logContent').textContent = '暂无训练日志';
|
||||||
|
document.getElementById('logFileInfo').textContent = '无训练日志';
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
logSelect.innerHTML = '<option value="">暂无训练日志</option>';
|
||||||
|
document.getElementById('logContent').textContent = '暂无训练日志';
|
||||||
|
document.getElementById('logFileInfo').textContent = '无训练日志';
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('加载训练日志列表失败:', error);
|
||||||
|
logSelect.innerHTML = '<option value="">加载失败</option>';
|
||||||
|
document.getElementById('logContent').textContent = '加载训练日志列表失败: ' + error.message;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 加载选中的训练日志
|
||||||
|
/**
 * Load the training log currently chosen in the `trainingLogSelect` dropdown
 * and display it in the log viewer.
 *
 * Reads the selected file name, requests its content from
 * `${API_BASE}/training-log-content`, then updates `logContent`,
 * `logFileInfo`, the search box and the match counter. The raw text is also
 * stored in the shared `logFullContent` variable.
 *
 * A response is ignored when the dropdown no longer points at the file that
 * was requested, so a slow reply for an older selection cannot overwrite the
 * view of a newer one.
 * NOTE(review): this assumes every selection change triggers a new call to
 * this function (e.g. via the select's change handler) — confirm.
 */
async function loadSelectedTrainingLog() {
    const logSelect = document.getElementById('trainingLogSelect');
    const logFile = logSelect.value;
    const logContent = document.getElementById('logContent');
    const logFileInfo = document.getElementById('logFileInfo');

    // Nothing selected: show the placeholder texts and stop.
    if (!logFile) {
        logContent.textContent = '请选择训练日志';
        logFileInfo.textContent = '无训练日志';
        return;
    }

    logContent.textContent = '加载中...';
    logFileInfo.textContent = '加载中...';

    // True once the dropdown has moved on to a different file than the one requested here.
    const isStale = () => logSelect.value !== logFile;

    try {
        const response = await fetch(`${API_BASE}/training-log-content?file=${encodeURIComponent(logFile)}`);
        const result = await response.json();

        // Drop replies that belong to a selection the user has already left.
        if (isStale()) {
            return;
        }

        if (result.code === 0 && result.data) {
            logFullContent = result.data.content || '';
            logContent.textContent = logFullContent || '(空日志)';
            logFileInfo.textContent = result.data.file + ' (' + result.data.size + ')';
            // Reset the search state for the newly loaded log
            document.getElementById('logSearchInput').value = '';
            document.getElementById('logMatchCount').textContent = '';
            // Jump to the newest lines
            scrollToLogBottom();
        } else {
            logContent.textContent = '加载失败: ' + (result.message || '未知错误');
            logFileInfo.textContent = '加载失败';
        }
    } catch (error) {
        console.error('加载训练日志内容失败:', error);
        // A failure of an outdated request must not replace the current view either.
        if (isStale()) {
            return;
        }
        logContent.textContent = '加载失败: ' + error.message;
        logFileInfo.textContent = '加载失败';
    }
}
|
||||||
|
|
||||||
// 加载日志文件列表
|
// 加载日志文件列表
|
||||||
async function loadLogFiles() {
|
async function loadLogFiles() {
|
||||||
const datePicker = document.getElementById('logDatePicker');
|
const datePicker = document.getElementById('logDatePicker');
|
||||||
const logTypeSelect = document.getElementById('logTypeSelect');
|
const logTypeSelect = document.getElementById('logTypeSelect');
|
||||||
const selectedDate = datePicker.value;
|
const selectedDate = datePicker ? datePicker.value : new Date().toISOString().split('T')[0];
|
||||||
|
|
||||||
|
if (!logTypeSelect) return;
|
||||||
logTypeSelect.innerHTML = '<option value="">加载中...</option>';
|
logTypeSelect.innerHTML = '<option value="">加载中...</option>';
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -1269,6 +1603,10 @@
|
|||||||
if (result.data.length > 0) {
|
if (result.data.length > 0) {
|
||||||
logTypeSelect.value = result.data[0].file;
|
logTypeSelect.value = result.data[0].file;
|
||||||
loadSelectedLog();
|
loadSelectedLog();
|
||||||
|
} else {
|
||||||
|
logTypeSelect.innerHTML = '<option value="">暂无日志文件</option>';
|
||||||
|
document.getElementById('logContent').textContent = '该日期暂无日志文件';
|
||||||
|
document.getElementById('logFileInfo').textContent = '无日志文件';
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
logTypeSelect.innerHTML = '<option value="">暂无日志文件</option>';
|
logTypeSelect.innerHTML = '<option value="">暂无日志文件</option>';
|
||||||
@@ -1324,9 +1662,13 @@
|
|||||||
|
|
||||||
// 刷新日志
|
// 刷新日志
|
||||||
function refreshLogs() {
|
function refreshLogs() {
|
||||||
loadLogFiles();
|
if (currentLogTab === 'system') {
|
||||||
if (document.getElementById('logTypeSelect').value) {
|
loadLogFiles();
|
||||||
loadSelectedLog();
|
if (document.getElementById('logTypeSelect').value) {
|
||||||
|
loadSelectedLog();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
loadTrainingLogFiles();
|
||||||
}
|
}
|
||||||
// 重置倒计时
|
// 重置倒计时
|
||||||
const select = document.getElementById('logRefreshInterval');
|
const select = document.getElementById('logRefreshInterval');
|
||||||
@@ -2057,10 +2399,63 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 根据模型ID获取模型名称
|
// 根据模型ID获取模型名称(同步版本,用于表格渲染)
|
||||||
function getModelName(modelId) {
|
function getModelName(modelId) {
|
||||||
const model = modelListCache.find(m => m.id === modelId);
|
if (!modelId) return '-';
|
||||||
return model ? model.name : `模型${modelId}`;
|
|
||||||
|
// 尝试多种方式匹配(处理类型不一致的情况)
|
||||||
|
const model = modelListCache.find(m =>
|
||||||
|
m.id == modelId ||
|
||||||
|
m.id === String(modelId) ||
|
||||||
|
m.id === Number(modelId)
|
||||||
|
);
|
||||||
|
|
||||||
|
if (model) {
|
||||||
|
return model.name;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 如果缓存中没有找到,尝试直接通过 API 获取单个模型
|
||||||
|
// 这是一个备用方案,不会阻塞渲染
|
||||||
|
return `模型${modelId}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 异步获取模型名称并更新 DOM(用于表格渲染后的更新)
|
||||||
|
async function fetchAndUpdateModelName(modelId, cellElement) {
|
||||||
|
if (!modelId) {
|
||||||
|
cellElement.textContent = '-';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 先尝试从缓存中找
|
||||||
|
let model = modelListCache.find(m =>
|
||||||
|
m.id == modelId ||
|
||||||
|
m.id === String(modelId) ||
|
||||||
|
m.id === Number(modelId)
|
||||||
|
);
|
||||||
|
|
||||||
|
// 如果缓存中没有,尝试直接获取
|
||||||
|
if (!model) {
|
||||||
|
try {
|
||||||
|
const response = await fetch(`${API_BASE}/model-manage`);
|
||||||
|
const result = await response.json();
|
||||||
|
if (result.code === 0) {
|
||||||
|
modelListCache = result.data || [];
|
||||||
|
model = modelListCache.find(m =>
|
||||||
|
m.id == modelId ||
|
||||||
|
m.id === String(modelId) ||
|
||||||
|
m.id === Number(modelId)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('获取模型列表失败:', e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (model) {
|
||||||
|
cellElement.textContent = model.name;
|
||||||
|
} else {
|
||||||
|
cellElement.textContent = `模型${modelId}`;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 根据模型ID列表获取模型名称列表
|
// 根据模型ID列表获取模型名称列表
|
||||||
@@ -2255,12 +2650,12 @@
|
|||||||
<td class="p-3 border border-gray-200 text-gray-500 text-sm">循环次数,代表模型训练过程中模型学习数据集的次数,可理解为看几遍数据,一般建议的范围是1-3遍即可</td>
|
<td class="p-3 border border-gray-200 text-gray-500 text-sm">循环次数,代表模型训练过程中模型学习数据集的次数,可理解为看几遍数据,一般建议的范围是1-3遍即可</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td class="p-3 border border-gray-200 text-gray-700">eval_steps</td>
|
<td class="p-3 border border-gray-200 text-gray-700">save_steps</td>
|
||||||
<td class="p-3 border border-gray-200">
|
<td class="p-3 border border-gray-200">
|
||||||
<input type="number" name="eval_steps_lora" value="50" class="w-24 px-2 py-1 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
<input type="number" name="save_steps_lora" value="50" class="w-24 px-2 py-1 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
||||||
<span class="text-xs text-gray-400 ml-2">[1,2147483647]</span>
|
<span class="text-xs text-gray-400 ml-2">[1,2147483647]</span>
|
||||||
</td>
|
</td>
|
||||||
<td class="p-3 border border-gray-200 text-gray-500 text-sm">验证步数,训练阶段针模型的验证间隔步长,用于阶段性评估模型训练准确率、训练损失</td>
|
<td class="p-3 border border-gray-200 text-gray-500 text-sm">保存步数,训练阶段模型的保存间隔步长,用于阶段性保存模型权重</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td class="p-3 border border-gray-200 text-gray-700">lora_alpha</td>
|
<td class="p-3 border border-gray-200 text-gray-700">lora_alpha</td>
|
||||||
@@ -2375,12 +2770,12 @@
|
|||||||
<td class="p-3 border border-gray-200 text-gray-500 text-sm">循环次数,代表模型训练过程中模型学习数据集的次数,可理解为看几遍数据,一般建议的范围是1-3遍即可,可依据需求进行调整</td>
|
<td class="p-3 border border-gray-200 text-gray-500 text-sm">循环次数,代表模型训练过程中模型学习数据集的次数,可理解为看几遍数据,一般建议的范围是1-3遍即可,可依据需求进行调整</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td class="p-3 border border-gray-200 text-gray-700">eval_steps</td>
|
<td class="p-3 border border-gray-200 text-gray-700">save_steps</td>
|
||||||
<td class="p-3 border border-gray-200">
|
<td class="p-3 border border-gray-200">
|
||||||
<input type="number" name="eval_steps_full" value="50" class="w-24 px-2 py-1 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
<input type="number" name="save_steps_full" value="50" class="w-24 px-2 py-1 border border-gray-300 rounded text-sm focus:border-primary focus:outline-none">
|
||||||
<span class="text-xs text-gray-400 ml-2">[1,2147483647]</span>
|
<span class="text-xs text-gray-400 ml-2">[1,2147483647]</span>
|
||||||
</td>
|
</td>
|
||||||
<td class="p-3 border border-gray-200 text-gray-500 text-sm">验证步数,训练阶段针模型的验证间隔步长,用于阶段性评估模型训练准确率、训练损失</td>
|
<td class="p-3 border border-gray-200 text-gray-500 text-sm">保存步数,训练阶段模型的保存间隔步长,用于阶段性保存模型权重</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td class="p-3 border border-gray-200 text-gray-700">lr_scheduler_type</td>
|
<td class="p-3 border border-gray-200 text-gray-700">lr_scheduler_type</td>
|
||||||
@@ -2629,7 +3024,7 @@
|
|||||||
'batch_size_lora': '16',
|
'batch_size_lora': '16',
|
||||||
'learning_rate_lora': '3e-4',
|
'learning_rate_lora': '3e-4',
|
||||||
'n_epochs_lora': '3',
|
'n_epochs_lora': '3',
|
||||||
'eval_steps_lora': '50',
|
'save_steps_lora': '50',
|
||||||
'lora_alpha': '32',
|
'lora_alpha': '32',
|
||||||
'lora_dropout': '0.1',
|
'lora_dropout': '0.1',
|
||||||
'lora_rank': '8',
|
'lora_rank': '8',
|
||||||
@@ -2649,7 +3044,7 @@
|
|||||||
'batch_size_full': '16',
|
'batch_size_full': '16',
|
||||||
'learning_rate_full': '1e-5',
|
'learning_rate_full': '1e-5',
|
||||||
'n_epochs_full': '3',
|
'n_epochs_full': '3',
|
||||||
'eval_steps_full': '50',
|
'save_steps_full': '50',
|
||||||
'lr_scheduler_type_full': 'linear',
|
'lr_scheduler_type_full': 'linear',
|
||||||
'max_length_full': '8192',
|
'max_length_full': '8192',
|
||||||
'warmup_ratio_full': '0.05',
|
'warmup_ratio_full': '0.05',
|
||||||
@@ -2755,6 +3150,7 @@
|
|||||||
const modalConfirmBtn = document.getElementById('modalConfirmBtn');
|
const modalConfirmBtn = document.getElementById('modalConfirmBtn');
|
||||||
const modalCancelBtn = document.getElementById('modalCancelBtn');
|
const modalCancelBtn = document.getElementById('modalCancelBtn');
|
||||||
const modalBtnGroup = document.getElementById('modalBtnGroup');
|
const modalBtnGroup = document.getElementById('modalBtnGroup');
|
||||||
|
const modalSingleBtnGroup = document.getElementById('modalSingleBtnGroup');
|
||||||
|
|
||||||
if (!modalConfirmBtn) {
|
if (!modalConfirmBtn) {
|
||||||
console.error('modalConfirmBtn not found');
|
console.error('modalConfirmBtn not found');
|
||||||
@@ -2771,7 +3167,9 @@
|
|||||||
modalIcon.innerHTML = '<div class="w-12 h-12 mx-auto mb-4 rounded-full bg-blue-100 flex items-center justify-center"><i class="fa fa-question text-xl text-blue-600"></i></div>';
|
modalIcon.innerHTML = '<div class="w-12 h-12 mx-auto mb-4 rounded-full bg-blue-100 flex items-center justify-center"><i class="fa fa-question text-xl text-blue-600"></i></div>';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 显示双按钮组,隐藏单按钮组
|
||||||
modalBtnGroup.classList.remove('hidden');
|
modalBtnGroup.classList.remove('hidden');
|
||||||
|
modalSingleBtnGroup.classList.add('hidden');
|
||||||
modalConfirmBtn.textContent = '确定';
|
modalConfirmBtn.textContent = '确定';
|
||||||
modalConfirmBtn.className = 'px-6 py-2 bg-primary text-white rounded-lg hover:bg-primary/90 transition-colors';
|
modalConfirmBtn.className = 'px-6 py-2 bg-primary text-white rounded-lg hover:bg-primary/90 transition-colors';
|
||||||
|
|
||||||
@@ -2863,6 +3261,11 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 返回到列表页(外部页面用)
|
||||||
|
/**
 * Go back to the list page (used by externally loaded pages).
 * Delegates to `navigateToPage` with the 'fine-tune' page key.
 */
function goBackToList() {
    const listPageKey = 'fine-tune';
    navigateToPage(listPageKey);
}
|
||||||
|
|
||||||
// 添加评测维度
|
// 添加评测维度
|
||||||
function addDimension() {
|
function addDimension() {
|
||||||
window.location.href = 'model-dimension-create.html';
|
window.location.href = 'model-dimension-create.html';
|
||||||
|
|||||||
@@ -219,6 +219,9 @@
|
|||||||
<button onclick="switchTab('evaluation')" id="tab-evaluation" class="tab-btn" style="display: inline-flex; align-items: center; justify-content: center;">
|
<button onclick="switchTab('evaluation')" id="tab-evaluation" class="tab-btn" style="display: inline-flex; align-items: center; justify-content: center;">
|
||||||
评测模型
|
评测模型
|
||||||
</button>
|
</button>
|
||||||
|
<button onclick="switchTab('trained')" id="tab-trained" class="tab-btn" style="display: inline-flex; align-items: center; justify-content: center;">
|
||||||
|
已训练模型
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -235,7 +238,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- 模型表格 -->
|
<!-- 模型表格 -->
|
||||||
<div class="bg-white rounded-lg shadow-sm">
|
<div id="modelsTableContainer" class="bg-white rounded-lg shadow-sm">
|
||||||
<div class="overflow-x-auto">
|
<div class="overflow-x-auto">
|
||||||
<table class="w-full">
|
<table class="w-full">
|
||||||
<thead class="bg-gray-50">
|
<thead class="bg-gray-50">
|
||||||
@@ -260,6 +263,33 @@
|
|||||||
<p class="text-gray-500">暂无模型数据</p>
|
<p class="text-gray-500">暂无模型数据</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- 已训练模型表格 -->
|
||||||
|
<div id="trainedModelsContainer" class="hidden bg-white rounded-lg shadow-sm">
|
||||||
|
<div class="p-4 border-b border-gray-200">
|
||||||
|
<p class="text-sm text-gray-500">已训练模型存储在 /app/base/saves 目录下</p>
|
||||||
|
</div>
|
||||||
|
<div class="overflow-x-auto">
|
||||||
|
<table class="w-full">
|
||||||
|
<thead class="bg-gray-50">
|
||||||
|
<tr>
|
||||||
|
<th class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">基座模型</th>
|
||||||
|
<th class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">训练方法</th>
|
||||||
|
<th class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">模型路径</th>
|
||||||
|
<th class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">操作</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="trainedModelsBody" class="bg-white divide-y divide-gray-200">
|
||||||
|
<!-- 动态加载 -->
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<!-- 空状态 -->
|
||||||
|
<div id="trainedEmptyState" class="hidden px-6 py-12 text-center">
|
||||||
|
<i class="fa fa-inbox text-4xl text-gray-300 mb-3"></i>
|
||||||
|
<p class="text-gray-500">暂无已训练模型</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
@@ -272,6 +302,7 @@
|
|||||||
const API_BASE = getApiBase();
|
const API_BASE = getApiBase();
|
||||||
|
|
||||||
let allModels = [];
|
let allModels = [];
|
||||||
|
let trainedModels = [];
|
||||||
let currentTab = 'all';
|
let currentTab = 'all';
|
||||||
|
|
||||||
// Tab 切换
|
// Tab 切换
|
||||||
@@ -284,7 +315,25 @@
|
|||||||
const activeTab = document.getElementById(`tab-${tab}`);
|
const activeTab = document.getElementById(`tab-${tab}`);
|
||||||
activeTab.classList.add('tab-active');
|
activeTab.classList.add('tab-active');
|
||||||
|
|
||||||
renderModels();
|
// 显示/隐藏搜索框和添加按钮
|
||||||
|
const toolbar = document.querySelector('div[style*="justify-content: space-between"]');
|
||||||
|
if (toolbar) {
|
||||||
|
toolbar.style.display = tab === 'trained' ? 'none' : 'flex';
|
||||||
|
}
|
||||||
|
|
||||||
|
// 显示/隐藏表格容器
|
||||||
|
const modelsTable = document.getElementById('modelsTableContainer');
|
||||||
|
const trainedModelsContainer = document.getElementById('trainedModelsContainer');
|
||||||
|
|
||||||
|
if (tab === 'trained') {
|
||||||
|
modelsTable.classList.add('hidden');
|
||||||
|
trainedModelsContainer.classList.remove('hidden');
|
||||||
|
loadTrainedModels();
|
||||||
|
} else {
|
||||||
|
modelsTable.classList.remove('hidden');
|
||||||
|
trainedModelsContainer.classList.add('hidden');
|
||||||
|
renderModels();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 加载模型数据
|
// 加载模型数据
|
||||||
@@ -302,6 +351,23 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 加载已训练模型数据
|
||||||
|
/**
 * Fetch the trained-model list from `${API_BASE}/model-manage/trained-models`,
 * store it in the shared `trainedModels` variable and re-render the table.
 *
 * Failures are reported on the console only; the table is left untouched so
 * an earlier successful result stays visible.
 */
async function loadTrainedModels() {
    try {
        const response = await fetch(`${API_BASE}/model-manage/trained-models`);
        const result = await response.json();

        console.log('[DEBUG] 已训练模型:', result);

        // Report an API-level failure instead of dropping it silently.
        if (result.code !== 0) {
            console.error('加载已训练模型失败:', result.message);
            return;
        }

        // Only accept an actual array; anything else is treated as "no models"
        // so the renderer can always iterate safely.
        const models = result.data?.models;
        trainedModels = Array.isArray(models) ? models : [];
        renderTrainedModels();
    } catch (error) {
        console.error('加载已训练模型失败:', error);
    }
}
|
||||||
|
|
||||||
// 筛选模型
|
// 筛选模型
|
||||||
function filterModels() {
|
function filterModels() {
|
||||||
renderModels();
|
renderModels();
|
||||||
@@ -396,6 +462,70 @@
|
|||||||
}).join('');
|
}).join('');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 渲染已训练模型列表
|
||||||
|
/**
 * Render the trained-model table from the shared `trainedModels` list.
 *
 * Each entry of a model's `train_methods` array becomes one table row showing
 * the base model name, the training method and the model path. When there is
 * nothing to show, the table body is cleared and the `trainedEmptyState`
 * element is revealed instead.
 *
 * Names and paths are HTML-escaped before being written via `innerHTML`, and
 * the path is handed to `viewTrainedModel` through a `data-path` attribute
 * rather than being spliced into an inline JavaScript string, so values
 * containing quotes or markup cannot break the row or inject HTML.
 */
function renderTrainedModels() {
    const tbody = document.getElementById('trainedModelsBody');
    const emptyState = document.getElementById('trainedEmptyState');

    // Characters that must be replaced before text is placed into HTML or a quoted attribute.
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value ?? '').replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

    // Display labels for the known training methods (built once, not per row).
    const methodLabels = new Map([
        ['lora', 'LoRA'],
        ['qlora', 'QLoRA'],
        ['full', '全量微调'],
        ['prefix', 'Prefix Tuning'],
        ['adapter', 'Adapter']
    ]);

    // Flatten "model -> training methods" into one record per table row.
    const rows = trainedModels.flatMap((model) => {
        if (!Array.isArray(model.train_methods)) {
            return [];
        }
        return model.train_methods.map((method) => ({
            baseModel: model.name,
            trainMethod: method.name,
            path: method.path
        }));
    });

    if (rows.length === 0) {
        tbody.innerHTML = '';
        emptyState.classList.remove('hidden');
        return;
    }

    emptyState.classList.add('hidden');

    tbody.innerHTML = rows.map((item) => {
        // Unknown methods fall back to their raw name.
        const methodDisplay = escapeHtml(methodLabels.get(item.trainMethod) || item.trainMethod);
        const baseModel = escapeHtml(item.baseModel);
        const path = escapeHtml(item.path);

        return `
            <tr class="hover:bg-gray-50">
                <td class="px-6 py-4 whitespace-nowrap">
                    <div class="text-sm font-medium text-gray-900">${baseModel}</div>
                </td>
                <td class="px-6 py-4 whitespace-nowrap">
                    <span class="px-2 py-1 text-xs font-medium rounded bg-green-100 text-green-700">${methodDisplay}</span>
                </td>
                <td class="px-6 py-4 whitespace-nowrap">
                    <div class="text-sm text-gray-500 max-w-xs truncate" title="${path}">${path}</div>
                </td>
                <td class="px-6 py-4 whitespace-nowrap text-sm text-gray-500">
                    <button data-path="${path}" onclick="viewTrainedModel(this.dataset.path)" class="text-primary hover:text-primary/80 mr-3">
                        <i class="fa fa-folder-open"></i> 查看
                    </button>
                </td>
            </tr>
        `;
    }).join('');
}
|
||||||
|
|
||||||
|
// 查看已训练模型
|
||||||
|
/**
 * Show the location of a trained model in a browser alert.
 *
 * @param {string} path - Model path to display.
 */
function viewTrainedModel(path) {
    const message = `模型路径: ${path}\n\n您可以从此路径加载模型进行推理或评测。`;
    alert(message);
}
|
||||||
|
|
||||||
// 编辑模型
|
// 编辑模型
|
||||||
function editModel(id) {
|
function editModel(id) {
|
||||||
window.location.href = `model-manage-create.html?id=${id}`;
|
window.location.href = `model-manage-create.html?id=${id}`;
|
||||||
|
|||||||
740
web/pages/training-log.html
Normal file
740
web/pages/training-log.html
Normal file
@@ -0,0 +1,740 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="zh-CN">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>训练日志 / 远光软件微调平台</title>
|
||||||
|
<script src="../lib/tailwindcss/tailwind.js"></script>
|
||||||
|
<link href="../lib/font-awesome/css/font-awesome.min.css" rel="stylesheet">
|
||||||
|
<script src="../lib/chart.js/chart.min.js"></script>
|
||||||
|
<script>
|
||||||
|
// 确保 Chart.js 已加载
|
||||||
|
if (typeof Chart === 'undefined') {
|
||||||
|
console.error('Chart.js 未加载,尝试动态加载...');
|
||||||
|
// 备用:尝试动态加载
|
||||||
|
const script = document.createElement('script');
|
||||||
|
script.src = '../lib/chart.js/chart.umd.min.js';
|
||||||
|
script.onload = function() {
|
||||||
|
console.log('Chart.js 动态加载成功');
|
||||||
|
window.chartJsLoaded = true;
|
||||||
|
};
|
||||||
|
script.onerror = function() {
|
||||||
|
console.error('Chart.js 加载失败');
|
||||||
|
};
|
||||||
|
document.head.appendChild(script);
|
||||||
|
} else {
|
||||||
|
console.log('Chart.js 已加载');
|
||||||
|
window.chartJsLoaded = true;
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
<style>
|
||||||
|
.bg-primary { background-color: #1890ff; }
|
||||||
|
.text-primary { color: #1890ff; }
|
||||||
|
.border-primary { border-color: #1890ff; }
|
||||||
|
:root { --primary: #1890ff; --danger: #f5222d; --success: #52c41a; }
|
||||||
|
|
||||||
|
/* 日志样式 */
|
||||||
|
.log-content {
|
||||||
|
font-family: 'Consolas', 'Monaco', monospace;
|
||||||
|
font-size: 12px;
|
||||||
|
line-height: 1.5;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-wrap: break-word;
|
||||||
|
}
|
||||||
|
.log-content .error { color: #dc3545; }
|
||||||
|
.log-content .warning { color: #d97706; }
|
||||||
|
.log-content .info { color: #0891b2; }
|
||||||
|
.log-content .success { color: #16a34a; }
|
||||||
|
.log-content .progress { color: #7c3aed; font-weight: bold; }
|
||||||
|
.log-line { padding: 1px 8px; }
|
||||||
|
.log-line:hover { background-color: rgba(24, 144, 255, 0.1); }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body class="bg-gray-50 p-6">
|
||||||
|
<!-- 页面标题 -->
|
||||||
|
<div class="bg-white rounded-lg shadow-sm w-full p-4 border-b border-gray-100 mb-4">
|
||||||
|
<div class="flex items-center justify-between">
|
||||||
|
<div class="flex items-center text-sm">
|
||||||
|
<span class="text-gray-800 font-medium">训练日志</span>
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center space-x-3">
|
||||||
|
<button onclick="toggleTB()" id="tbBtn" class="bg-purple-500 text-white px-4 py-2 rounded hover:bg-purple-600 transition-colors text-sm">
|
||||||
|
<i class="fa fa-bar-chart mr-1"></i>TensorBoard
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 任务信息 -->
|
||||||
|
<div class="bg-white rounded-lg shadow-sm p-6 mb-6">
|
||||||
|
<div class="flex items-center justify-between mb-4">
|
||||||
|
<h2 class="text-lg font-medium text-gray-800" id="taskName">加载中...</h2>
|
||||||
|
<span id="taskStatus" class="px-3 py-1 rounded-full text-sm bg-gray-100 text-gray-600">加载中</span>
|
||||||
|
</div>
|
||||||
|
<div class="grid grid-cols-2 md:grid-cols-5 gap-4 text-sm">
|
||||||
|
<div>
|
||||||
|
<div class="text-gray-500 text-xs">基础模型</div>
|
||||||
|
<div id="baseModel" class="font-medium text-gray-800">-</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="text-gray-500 text-xs">数据集</div>
|
||||||
|
<div id="dataset" class="font-medium text-gray-800">-</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="text-gray-500 text-xs">创建时间</div>
|
||||||
|
<div id="createTime" class="font-medium text-gray-800">-</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="text-gray-500 text-xs">进程ID</div>
|
||||||
|
<div id="processId" class="font-medium text-gray-800">-</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="text-gray-500 text-xs">最后更新</div>
|
||||||
|
<div id="lastUpdate" class="font-medium text-gray-800">-</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 训练曲线图表 -->
|
||||||
|
<div id="chartsContainer" class="bg-white rounded-lg shadow-sm p-6 mb-6">
|
||||||
|
<h3 class="text-base font-medium text-gray-800 mb-4">训练曲线</h3>
|
||||||
|
<div class="grid grid-cols-1 md:grid-cols-3 gap-4">
|
||||||
|
<div>
|
||||||
|
<canvas id="lossChart" class="w-full h-48"></canvas>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<canvas id="gradNormChart" class="w-full h-48"></canvas>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<canvas id="learningRateChart" class="w-full h-48"></canvas>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 日志内容 -->
|
||||||
|
<div class="bg-white rounded-lg shadow-sm">
|
||||||
|
<div class="flex items-center justify-between p-4 border-b border-gray-100">
|
||||||
|
<h3 class="text-base font-medium text-gray-800">实时日志</h3>
|
||||||
|
<div class="flex items-center space-x-4">
|
||||||
|
<input type="text" id="logSearchInput" placeholder="搜索日志..."
|
||||||
|
class="px-3 py-1.5 border border-gray-300 rounded text-sm focus:outline-none focus:border-primary w-48"
|
||||||
|
oninput="searchLog()">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="p-4">
|
||||||
|
<div id="logMatchCount" class="text-xs text-gray-500 mb-2"></div>
|
||||||
|
<div id="logContent" class="log-content bg-gray-50 rounded p-4 max-h-[400px] overflow-y-auto text-xs">
|
||||||
|
加载日志中...
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let taskId = null;
|
||||||
|
let taskInfo = null;
|
||||||
|
let trainingLogFullContent = '';
|
||||||
|
|
||||||
|
// 训练曲线数据
|
||||||
|
const lossData = { labels: [], values: [] };
|
||||||
|
const gradNormData = { labels: [], values: [] };
|
||||||
|
const learningRateData = { labels: [], values: [] };
|
||||||
|
|
||||||
|
// 图表实例
|
||||||
|
let lossChart, gradNormChart, learningRateChart;
|
||||||
|
|
||||||
|
// 初始化图表
|
||||||
|
/**
 * Create the three training-curve charts (loss, grad norm, learning rate).
 *
 * If the global `Chart` constructor is missing, the chart container is
 * replaced with an error notice and nothing else happens. Otherwise the
 * created instances are stored in `lossChart`, `gradNormChart` and
 * `learningRateChart`, each bound to its shared data series.
 */
function initCharts() {
    if (typeof Chart === 'undefined') {
        console.error('[Charts] Chart 未定义,无法初始化图表');
        document.getElementById('chartsContainer').innerHTML = '<div class="text-center p-4 text-red-500"><i class="fa fa-exclamation-triangle mr-2"></i>图表库加载失败,请刷新页面重试</div>';
        return;
    }

    console.log('[Charts] 开始初始化图表...');

    // Options shared by all three charts.
    const commonOptions = {
        responsive: true,
        maintainAspectRatio: false,
        animation: false,
        scales: {
            x: {
                title: { display: true, text: 'Step' },
                grid: { color: 'rgba(0,0,0,0.05)' }
            },
            y: {
                title: { display: true, text: 'Value' },
                grid: { color: 'rgba(0,0,0,0.05)' }
            }
        },
        plugins: {
            legend: { display: false }
        }
    };

    // Build one filled line chart; `label` doubles as the chart title and
    // `lineColor` as the title color, matching all three charts.
    const createLineChart = (canvasId, series, label, lineColor, fillColor, optionOverrides = {}) => {
        const context = document.getElementById(canvasId).getContext('2d');
        return new Chart(context, {
            type: 'line',
            data: {
                labels: series.labels,
                datasets: [{
                    label: label,
                    data: series.values,
                    borderColor: lineColor,
                    backgroundColor: fillColor,
                    fill: true,
                    tension: 0.3,
                    pointRadius: 3
                }]
            },
            options: {
                ...commonOptions,
                ...optionOverrides,
                plugins: {
                    ...commonOptions.plugins,
                    title: { display: true, text: label, color: lineColor, font: { size: 14 } }
                }
            }
        });
    };

    // Loss chart
    lossChart = createLineChart('lossChart', lossData, 'Loss', '#ef4444', 'rgba(239, 68, 68, 0.1)');

    // Grad Norm chart
    gradNormChart = createLineChart('gradNormChart', gradNormData, 'Grad Norm', '#3b82f6', 'rgba(59, 130, 246, 0.1)');

    // Learning Rate chart: same layout, but with a logarithmic y axis
    learningRateChart = createLineChart('learningRateChart', learningRateData, 'Learning Rate', '#22c55e', 'rgba(34, 197, 94, 0.1)', {
        scales: {
            ...commonOptions.scales,
            y: {
                ...commonOptions.scales.y,
                type: 'logarithmic',
                title: { display: true, text: 'Learning Rate (log)' }
            }
        }
    });
}
|
||||||
|
|
||||||
|
// 解析日志中的指标数据
|
||||||
|
/**
 * Extract training metrics from log text and append them to the chart series.
 *
 * Looks for records shaped like
 * `{'loss': x, 'grad_norm': x, 'learning_rate': x, 'epoch': x}` and pushes
 * loss, grad norm and learning rate onto `lossData`, `gradNormData` and
 * `learningRateData`, then refreshes whichever charts exist.
 *
 * The function can be called repeatedly with text that contains records seen
 * before: each distinct record text is added only once, and step labels
 * continue from the number of points already stored instead of restarting
 * at 1 on every call.
 *
 * @param {string} logContent - Log text to scan.
 */
function parseMetricsFromLog(logContent) {
    // Numbers may be plain decimals or scientific notation (e.g. 1.2e-05).
    const metricRegex = /\{'loss':\s*([\d.eE+-]+),\s*'grad_norm':\s*([\d.eE+-]+),\s*'learning_rate':\s*([\d.eE+-]+),\s*'epoch':\s*([\d.eE+-]+)\}/g;

    // Records already charted, kept on the function object so the set survives
    // between calls without introducing another global name.
    const seenRecords = parseMetricsFromLog.seenRecords || (parseMetricsFromLog.seenRecords = new Set());
    if (lossData.values.length === 0) {
        // The series were emptied (or never filled): start de-duplication afresh.
        seenRecords.clear();
    }

    for (const match of String(logContent || '').matchAll(metricRegex)) {
        // The whole record (including epoch) identifies a logging step; comparing
        // only the loss value would drop distinct steps with equal loss.
        const recordText = match[0];
        if (seenRecords.has(recordText)) {
            continue;
        }
        seenRecords.add(recordText);

        // Continue numbering after the points that are already stored.
        const step = lossData.labels.length + 1;

        lossData.labels.push(step);
        lossData.values.push(parseFloat(match[1]));
        gradNormData.labels.push(step);
        gradNormData.values.push(parseFloat(match[2]));
        learningRateData.labels.push(step);
        learningRateData.values.push(parseFloat(match[3]));
    }

    // Push the series into every chart that has been created.
    const chartBindings = [
        [lossChart, lossData],
        [gradNormChart, gradNormData],
        [learningRateChart, learningRateData]
    ];
    for (const [chart, series] of chartBindings) {
        if (chart) {
            chart.data.labels = series.labels;
            chart.data.datasets[0].data = series.values;
            chart.update('none');
        }
    }
}
|
||||||
|
|
||||||
|
// 带超时的 fetch
|
||||||
|
/**
 * `fetch` wrapper that aborts the request after `timeout` milliseconds.
 *
 * @param {string} url - Request URL.
 * @param {Object} [options={}] - Options forwarded to `fetch`; its `signal` is
 *     replaced by the internal abort signal.
 * @param {number} [timeout=10000] - Milliseconds to wait before aborting.
 * @returns {Promise<Response>} The response once `fetch` resolves.
 * @throws {Error} Wraps any failure (including the abort) with a prefixed message.
 */
async function fetchWithTimeout(url, options = {}, timeout = 10000) {
    const abortController = new AbortController();
    const timerId = setTimeout(() => abortController.abort(), timeout);
    const requestOptions = { ...options, signal: abortController.signal };

    try {
        return await fetch(url, requestOptions);
    } catch (error) {
        throw new Error(`请求超时或失败: ${error.message}`);
    } finally {
        // The timer is cancelled on both the success and the failure path.
        clearTimeout(timerId);
    }
}
|
||||||
|
|
||||||
|
// 获取URL参数
|
||||||
|
/**
 * Read a single query-string parameter from the current page URL.
 *
 * @param {string} name - Parameter name.
 * @returns {string|null} The first value for `name`, or null when it is absent.
 */
function getQueryParam(name) {
    return new URLSearchParams(window.location.search).get(name);
}
|
||||||
|
|
||||||
|
// 获取任务ID(优先从URL参数,其次从sessionStorage)
|
||||||
|
/**
 * Resolve the task ID: the `id` URL parameter wins, otherwise the
 * `trainingLogTaskId` entry in sessionStorage is used.
 *
 * @returns {string|null} The task ID, or a falsy value when none is available.
 */
function getTaskId() {
    const idFromUrl = getQueryParam('id');
    if (idFromUrl) {
        return idFromUrl;
    }

    try {
        return sessionStorage.getItem('trainingLogTaskId');
    } catch (e) {
        // Storage access failed: best effort, fall back to the (empty) URL value.
        return idFromUrl;
    }
}
|
||||||
|
|
||||||
|
// 返回模型调优列表
|
||||||
|
/**
 * Return to the fine-tune list.
 *
 * Uses the parent window's `navigateToPage` when it is available, otherwise
 * navigates this window to `main.html?page=fine-tune`.
 */
function goBack() {
    const host = window.parent;
    if (!(host && host.navigateToPage)) {
        window.location.href = 'main.html?page=fine-tune';
        return;
    }
    host.navigateToPage('fine-tune');
}
|
||||||
|
|
||||||
|
// 初始化
|
||||||
|
/**
 * Page entry point: resolve the task ID, load the task details and the log
 * once, then keep both refreshed every 5 seconds.
 *
 * Without a task ID the page only shows a hint and no polling is started.
 * A refresh tick is skipped while the previous one is still running, so slow
 * requests cannot pile up or apply their results out of order.
 */
async function init() {
    console.log('[Init] 开始初始化...');

    taskId = getTaskId();
    console.log('[Init] taskId:', taskId);

    if (!taskId) {
        document.getElementById('taskName').textContent = '未指定任务ID';
        document.getElementById('logContent').innerHTML = '<span class="text-gray-400">请先从模型调优列表点击查看日志</span>';
        return;
    }

    console.log('[Init] 加载任务信息...');
    await loadTaskInfo();

    console.log('[Init] 加载日志内容...');
    await loadLogContent();

    // Auto refresh (every 5 seconds). A single request may wait longer than
    // the interval, so ticks that arrive during a running refresh are dropped.
    let refreshInFlight = false;
    setInterval(async () => {
        if (refreshInFlight) {
            return;
        }
        refreshInFlight = true;
        try {
            await loadTaskInfo();
            await loadLogContent();
        } finally {
            refreshInFlight = false;
        }
    }, 5000);
}
|
||||||
|
|
||||||
|
// 加载任务信息
|
||||||
|
/**
 * Fetch the details of the current task from `${API_BASE}/fine-tune/<taskId>`
 * and refresh the task header through `updateTaskInfo`.
 *
 * On success the payload is stored in the shared `taskInfo` variable. An
 * API-level error is logged; a request failure additionally switches the
 * status badge to its error look.
 */
async function loadTaskInfo() {
    // The ID comes from the URL query or sessionStorage, so it is encoded
    // before being placed into the request path.
    const taskUrl = `${API_BASE}/fine-tune/${encodeURIComponent(taskId)}`;

    try {
        console.log('[Task] Fetching task info from:', taskUrl);
        const response = await fetchWithTimeout(taskUrl);
        console.log('[Task] Response status:', response.status);
        const result = await response.json();
        console.log('[Task] API result:', result);

        if (result.code === 0 && result.data) {
            taskInfo = result.data;
            console.log('[Task] taskInfo:', taskInfo);
            console.log('[Task] process_id:', taskInfo.process_id);
            await updateTaskInfo();
        } else {
            console.error('[Task] API返回错误:', result.message);
        }
    } catch (error) {
        console.error('[Task] 获取任务信息失败:', error);
        const statusBadge = document.getElementById('taskStatus');
        statusBadge.textContent = '获取失败';
        statusBadge.className = 'px-3 py-1 rounded-full text-sm bg-red-100 text-red-700';
    }
}
|
||||||
|
|
||||||
|
/**
 * Render the current `taskInfo` record into the task header.
 *
 * Updates, in order: task name, status badge, progress, GPU info (fetched
 * from the progress endpoint), dataset name (fetched by id), last-update
 * time, process id and creation time. Finally triggers `loadModelName()`
 * without awaiting it. Does nothing when `taskInfo` is not set.
 *
 * @returns {Promise<void>}
 */
async function updateTaskInfo() {
    if (!taskInfo) return;

    // `new Date(x)` never throws on unparsable input; it yields an Invalid
    // Date whose toLocaleString() is the literal "Invalid Date". Detect that
    // case explicitly and fall back to the raw value.
    const formatTimestamp = (raw) => {
        const parsed = new Date(raw);
        return Number.isNaN(parsed.getTime()) ? String(raw) : parsed.toLocaleString('zh-CN');
    };

    document.getElementById('taskName').textContent = taskInfo.name || '未知任务';

    // Status badge
    const statusElement = document.getElementById('taskStatus');
    const actualStatus = taskInfo.status ? taskInfo.status.toLowerCase() : 'unknown';
    const statusMap = {
        'pending': { text: '等待中', class: 'bg-gray-100 text-gray-600' },
        'running': { text: '训练中', class: 'bg-blue-100 text-blue-600' },
        'completed': { text: '已完成', class: 'bg-green-100 text-green-600' },
        'failed': { text: '失败', class: 'bg-red-100 text-red-700' },
        'stopped': { text: '已停止', class: 'bg-orange-100 text-orange-600' }
    };

    // Unknown statuses are shown verbatim with the neutral style.
    const statusConfig = statusMap[actualStatus] || { text: actualStatus, class: 'bg-gray-100 text-gray-600' };
    statusElement.textContent = statusConfig.text;
    statusElement.className = `px-3 py-1 rounded-full text-sm ${statusConfig.class}`;

    // Progress: skip both undefined and null so "null%" is never displayed.
    const progressElement = document.getElementById('taskProgress');
    if (progressElement && taskInfo.progress != null) {
        progressElement.textContent = `${taskInfo.progress}%`;
    }

    // GPU info is optional; a failure here must not block the other fields.
    try {
        const gpuResponse = await fetchWithTimeout(`${API_BASE}/fine-tune/progress/${taskId}`);
        const gpuResult = await gpuResponse.json();
        if (gpuResult.code === 0 && gpuResult.data) {
            const gpuElement = document.getElementById('taskGPU');
            if (gpuElement && gpuResult.data.gpu_info) {
                gpuElement.textContent = gpuResult.data.gpu_info;
            }
        }
    } catch (e) {
        console.log('[Task] 获取GPU信息失败:', e);
    }

    // Dataset name, falling back to a label built from the dataset id.
    const datasetElement = document.getElementById('dataset');
    if (datasetElement && taskInfo.train_dataset_id) {
        try {
            const datasetResponse = await fetchWithTimeout(`${API_BASE}/dataset-manage/${taskInfo.train_dataset_id}`);
            const datasetResult = await datasetResponse.json();
            if (datasetResult.code === 0 && datasetResult.data) {
                datasetElement.textContent = datasetResult.data.name;
            } else {
                datasetElement.textContent = `数据集${taskInfo.train_dataset_id}`;
            }
        } catch (e) {
            datasetElement.textContent = `数据集${taskInfo.train_dataset_id}`;
        }
    } else if (datasetElement) {
        datasetElement.textContent = '-';
    }

    // Last update time
    const lastUpdateElement = document.getElementById('lastUpdate');
    if (lastUpdateElement && taskInfo.update_time) {
        lastUpdateElement.textContent = formatTimestamp(taskInfo.update_time);
    }

    // Remaining fields
    document.getElementById('processId').textContent = taskInfo.process_id || '-';
    document.getElementById('createTime').textContent = taskInfo.create_time
        ? formatTimestamp(taskInfo.create_time)
        : '-';

    // Resolve the base model's display name (fire-and-forget).
    if (taskInfo.base_model) {
        loadModelName(taskInfo.base_model);
    }
}
|
||||||
|
|
||||||
|
/**
 * Look up the display name of a model and write it into `#baseModel`.
 *
 * The model list is fetched from the model-manage endpoint and searched for
 * an entry whose `id` loosely equals `modelId`. When no entry matches, or
 * the request throws, a label built from the id is shown instead. An
 * API-level error response leaves the element untouched.
 *
 * @param {string|number} modelId - Identifier of the base model.
 * @returns {Promise<void>}
 */
async function loadModelName(modelId) {
    try {
        const response = await fetchWithTimeout(`${API_BASE}/model-manage`);
        const payload = await response.json();

        if (payload.code !== 0 || !payload.data) {
            return;
        }

        // Loose equality on purpose: ids may arrive as numbers or strings.
        const match = payload.data.find((entry) => entry.id == modelId);
        const label = match ? match.name : `模型${modelId}`;
        document.getElementById('baseModel').textContent = label;
    } catch (e) {
        document.getElementById('baseModel').textContent = `模型${modelId}`;
    }
}
|
||||||
|
|
||||||
|
/**
 * Pick the training log file that belongs to the current task and load it.
 *
 * Selection order: a file whose name carries the task's process id, then a
 * file whose name contains the task name, then the first file in the list.
 * Status and error messages are written into `#logContent`; any text that
 * comes from the server or from an exception is HTML-escaped first.
 *
 * @returns {Promise<void>}
 */
async function loadLogContent() {
    console.log('[Log] loadLogContent called');
    console.log('[Log] taskInfo:', taskInfo);
    console.log('[Log] taskInfo.process_id:', taskInfo ? taskInfo.process_id : 'taskInfo is null');

    // Task info is required; retry loading it once before giving up.
    if (!taskInfo) {
        console.log('[Log] taskInfo 为空,等待任务信息加载...');
        await loadTaskInfo();
        if (!taskInfo) {
            document.getElementById('logContent').innerHTML = '<span class="text-gray-400">无法获取任务信息</span>';
            return;
        }
    }

    const processId = taskInfo.process_id;
    const taskName = taskInfo.name || '';

    if (!processId && !taskName) {
        const msg = '<span class="text-gray-400">暂无日志文件 (任务未开始或无进程ID)</span>';
        document.getElementById('logContent').innerHTML = msg;
        return;
    }

    // True when the file name carries the pid as a delimited token.
    const nameCarriesPid = (fileName, pidStr) =>
        fileName.startsWith(pidStr + '_') ||
        fileName.includes(`_${pidStr}_`) ||
        fileName.endsWith(`_${pidStr}.log`);

    try {
        console.log('[Log] Fetching training log files...');
        const response = await fetchWithTimeout(`${API_BASE}/training-log-files`);
        const result = await response.json();

        if (result.code === 0 && result.data) {
            console.log('[Log] Training log files:', result.data);

            let selectedFile = null;

            // 1) Prefer a file name that matches the process id.
            if (processId) {
                const pidStr = processId.toString();
                for (const file of result.data) {
                    const matched = nameCarriesPid(file.file, pidStr);
                    // Log the same predicate that drives the selection.
                    console.log(`[Log] Checking file: ${file.file}, PID: ${file.pid}, Match: ${matched}`);
                    if (matched) {
                        selectedFile = file.file;
                        break;
                    }
                }
            }

            // 2) Otherwise try a file name that contains the task name.
            if (!selectedFile && taskName) {
                for (const file of result.data) {
                    if (file.file.includes(taskName)) {
                        selectedFile = file.file;
                        break;
                    }
                }
            }

            // 3) Last resort: the first file in the list.
            // NOTE(review): this may display a log belonging to another task.
            if (!selectedFile && result.data.length > 0) {
                selectedFile = result.data[0].file;
                console.log('[Log] No matching file found, using first available file:', selectedFile);
            }

            if (selectedFile) {
                console.log('[Log] Selected log file:', selectedFile);
                await loadLogFileContent(selectedFile);
            } else {
                document.getElementById('logContent').innerHTML = '<span class="text-gray-400">未找到匹配的日志文件</span>';
            }
        } else {
            // Server-provided text: escape before inserting as HTML.
            document.getElementById('logContent').innerHTML = '<span class="text-gray-400">获取日志列表失败: ' + escapeHtml(result.message || '未知错误') + '</span>';
        }
    } catch (error) {
        console.error('[Log] 获取日志列表失败:', error);
        // Exception text may echo response content: escape it as well.
        document.getElementById('logContent').innerHTML = '<span class="text-red-500">加载日志失败: ' + escapeHtml(String(error.message)) + '</span>';
    }
}
|
||||||
|
|
||||||
|
/**
 * Fetch the content of one training log file and display it.
 *
 * Response handling:
 *   - `code === 0`: store the text in `trainingLogFullContent`, render it
 *     and feed it to `parseMetricsFromLog()` for the charts.
 *   - `code === 2`: the file is locked by the running training process; show
 *     a waiting notice.
 *   - anything else: show the error message.
 * Server-provided and exception text is HTML-escaped before being inserted.
 *
 * @param {string} fileName - Name of the log file to load.
 * @returns {Promise<void>}
 */
async function loadLogFileContent(fileName) {
    console.log('[Log] Loading log file:', fileName);

    try {
        const response = await fetchWithTimeout(`${API_BASE}/training-log-content?file=${encodeURIComponent(fileName)}`);
        const result = await response.json();
        console.log('[Log] Log content API response:', result);

        if (result.code === 0 && result.data) {
            trainingLogFullContent = result.data.content || '';
            console.log('[Log] Log content length:', trainingLogFullContent.length);
            renderLogContent();
            // Parse the metrics and refresh the charts.
            parseMetricsFromLog(trainingLogFullContent);
        } else if (result.code === 2) {
            // File is locked while training is in progress.
            const detail = escapeHtml(result.message || '训练结束后可查看完整内容');
            document.getElementById('logContent').innerHTML = `
                <div class="text-orange-500 p-4 text-center">
                    <i class="fa fa-spinner fa-spin fa-2x mb-2"></i>
                    <p class="text-lg">日志文件正在被训练进程占用</p>
                    <p class="text-sm text-gray-500 mt-1">${detail}</p>
                    <p class="text-xs text-gray-400 mt-2">页面将自动刷新...</p>
                </div>
            `;
        } else {
            document.getElementById('logContent').innerHTML = '<span class="text-red-500">加载日志失败: ' + escapeHtml(result.message || '未知错误') + '</span>';
        }
    } catch (error) {
        console.error('[Log] 获取日志内容失败:', error);
        document.getElementById('logContent').innerHTML = '<span class="text-red-500">加载日志失败: ' + escapeHtml(String(error.message)) + '</span>';
    }
}
|
||||||
|
|
||||||
|
/**
 * Render `trainingLogFullContent` into `#logContent`.
 *
 * Only the last 500 lines are considered. Blank lines are dropped, and when
 * the search box holds text only lines containing it (case-insensitive) are
 * kept. Each line gets a CSS class for its log level plus `progress` when it
 * contains a percentage. The panel is scrolled to the bottom and the match
 * counter is updated.
 */
function renderLogContent() {
    const panel = document.getElementById('logContent');
    const searchBox = document.getElementById('logSearchInput');
    const needle = searchBox ? searchBox.value.toLowerCase() : '';

    if (!trainingLogFullContent) {
        panel.innerHTML = '<span class="text-gray-400">暂无日志内容</span>';
        return;
    }

    // Level markers, checked in priority order; the first hit wins.
    const levelMarkers = [
        ['error', ['[ERROR', 'error:', 'Error:']],
        ['warning', ['[WARNING', 'warning:', 'Warning:']],
        ['info', ['[INFO', 'info:', 'Info:']]
    ];

    const classFor = (line) => {
        const hit = levelMarkers.find(([, markers]) => markers.some((marker) => line.includes(marker)));
        const level = hit ? hit[0] : '';
        // Progress-bar style lines (anything with "NN%") get an extra class.
        if (!/\d+%/.test(line)) {
            return level;
        }
        return level ? level + ' progress' : 'progress';
    };

    // Cap at the last 500 lines to keep rendering fast.
    const rows = trainingLogFullContent
        .split('\n')
        .slice(-500)
        .filter((line) => line.trim())
        .filter((line) => !needle || line.toLowerCase().includes(needle))
        .map((line) => `<div class="log-line ${classFor(line)}">${escapeHtml(line)}</div>`);

    const shown = rows.length;

    panel.innerHTML = shown === 0
        ? '<div class="text-gray-400 p-4">没有匹配的日志</div>'
        : rows.join('');
    panel.scrollTop = panel.scrollHeight;

    // The counter is only shown while a search is active.
    document.getElementById('logMatchCount').textContent = needle ? `找到 ${shown} 条` : '';
}
|
||||||
|
|
||||||
|
/**
 * Search handler: logs the current search state and re-renders the log
 * panel, which applies the search-box text as a filter.
 */
function searchLog() {
    const loadState = trainingLogFullContent ? '已加载' : '未加载';
    console.log('[Search] 搜索触发,trainingLogFullContent:', loadState);

    const searchBox = document.getElementById('logSearchInput');
    const typed = searchBox ? searchBox.value : '输入框未找到';
    console.log('[Search] 搜索文本:', typed);

    renderLogContent();
}
|
||||||
|
|
||||||
|
/**
 * Escape text for safe insertion as HTML.
 *
 * The text is assigned to a detached element's `textContent` and read back
 * through `innerHTML`, so the browser performs the escaping.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} The HTML-escaped text.
 */
function escapeHtml(text) {
    return Object.assign(document.createElement('div'), { textContent: text }).innerHTML;
}
|
||||||
|
|
||||||
|
/**
 * Application entry point, run once the DOM is ready.
 *
 * When Chart.js is already available the charts and the page are initialised
 * immediately. Otherwise the `Chart` global is polled every 100 ms for up to
 * 50 attempts (about 5 seconds); on timeout an error is shown in
 * `#chartsContainer` and the rest of the page is still initialised.
 */
function startApp() {
    console.log('[App] startApp called');
    console.log('[App] Chart available:', typeof Chart !== 'undefined');

    const POLL_MS = 100;
    const MAX_ATTEMPTS = 50; // 50 * 100ms = 5 seconds
    let attempts = 0;

    const pollForChart = () => {
        if (typeof Chart !== 'undefined') {
            console.log('[App] Chart.js 已加载,开始初始化');
            initCharts();
            init();
            return;
        }

        if (attempts < MAX_ATTEMPTS) {
            attempts += 1;
            console.log(`[App] 等待 Chart.js 加载... (${attempts})`);
            setTimeout(pollForChart, POLL_MS);
            return;
        }

        console.error('[App] Chart.js 加载超时');
        document.getElementById('chartsContainer').innerHTML = '<div class="text-center p-4 text-red-500"><i class="fa fa-exclamation-triangle mr-2"></i>图表库加载失败,请检查网络或刷新页面</div>';
        // Charts are unavailable, but the rest of the page still works.
        init();
    };

    // Not loaded yet: start polling after the first delay.
    if (typeof Chart === 'undefined') {
        console.log('[App] Chart.js 尚未加载,开始等待...');
        setTimeout(pollForChart, POLL_MS);
        return;
    }

    // Fast path: the library is already present.
    initCharts();
    init();
}
|
||||||
|
|
||||||
|
// TensorBoard control
// NOTE(review): the TensorBoard address is hard-coded to one host; confirm
// whether it should come from configuration or the start API response.
const TB_URL = 'http://10.10.10.177:6006';

/**
 * Ask the backend to start TensorBoard and open it in a new tab.
 *
 * While the start request is in flight the button shows a spinner and is
 * disabled, so repeated clicks cannot send duplicate start requests. On
 * success the TensorBoard page is opened; on an API error or a failed
 * request an alert is shown. The button is restored in every case.
 */
function toggleTB() {
    const btn = document.getElementById('tbBtn');

    // Ignore clicks while a previous start request is still pending.
    if (btn.disabled) {
        return;
    }

    // Restore the button to its idle look with the given label.
    const resetButton = (label) => {
        btn.innerHTML = '<i class="fa fa-bar-chart mr-1"></i>' + label;
        btn.className = 'bg-purple-500 text-white px-4 py-2 rounded hover:bg-purple-600 transition-colors text-sm';
        btn.disabled = false;
    };

    btn.disabled = true;
    btn.innerHTML = '<i class="fa fa-spinner fa-spin mr-1"></i>启动中...';
    btn.className = 'bg-gray-500 text-white px-4 py-2 rounded transition-colors text-sm cursor-wait';

    // Call the API that starts the TensorBoard service.
    fetch(`${API_BASE}/fine-tune/tensorboard/start`, { method: 'POST' })
        .then(res => res.json())
        .then(result => {
            console.log('TensorBoard启动结果:', result);
            if (result.code === 0) {
                // Open the TensorBoard page in a new tab.
                window.open(TB_URL, '_blank');
                resetButton('打开TensorBoard');
            } else {
                alert('提示: ' + (result.message || '启动失败'));
                resetButton('TensorBoard');
            }
        })
        .catch(err => {
            console.error('启动TensorBoard失败:', err);
            alert('提示: 启动失败 - ' + err.message);
            resetButton('TensorBoard');
        });
}
|
||||||
|
|
||||||
|
// Start immediately when the document has already been parsed (e.g. when
// the page is loaded inside an iframe); otherwise wait for DOMContentLoaded.
if (document.readyState !== 'loading') {
    startApp();
} else {
    document.addEventListener('DOMContentLoaded', startApp);
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
Reference in New Issue
Block a user