1. 修改了一些bug

2. 做了一些调整,比如启动脚本,支持了 TensorBoard
This commit is contained in:
2026-01-29 15:51:45 +08:00
parent e9e0e21e47
commit e494c4ce50
20 changed files with 995 additions and 287 deletions

View File

@@ -204,44 +204,67 @@ def get_trained_models():
logger = logging.getLogger(__name__)
try:
# 使用 /app/base/saves 目录(容器内路径)
saves_base_path = '/app/base/saves'
# 本地开发时的备用路径
local_saves_path = os.path.join(PROJECT_ROOT, 'saves')
# 多个可能的路径
potential_paths = [
'/app/base/saves', # 容器内路径
os.path.join(PROJECT_ROOT, 'saves'), # 本地开发路径
os.path.join(os.path.dirname(os.path.dirname(PROJECT_ROOT)), 'YG_FT_Base', 'saves'), # 上级目录
]
# 选择存在的路径
base_path = saves_base_path if os.path.exists(saves_base_path) else local_saves_path
base_path = None
for path in potential_paths:
logger.info(f"[DEBUG] 检查路径: {path}, exists: {os.path.exists(path)}")
if os.path.exists(path):
base_path = path
break
logger.info(f"[DEBUG] 已训练模型目录: {base_path}, exists: {os.path.exists(base_path)}")
logger.info(f"[DEBUG] 最终使用的路径: {base_path}")
models = []
if os.path.exists(base_path):
for item in os.listdir(base_path):
item_path = os.path.join(base_path, item)
if os.path.isdir(item_path):
# 检查是否是模板目录(包含训练方法的子目录)
sub_items = []
if os.path.exists(item_path):
for sub_item in os.listdir(item_path):
sub_path = os.path.join(item_path, sub_item)
if os.path.isdir(sub_path):
# 检查是否包含模型文件(adapter_model.bin 或 pytorch_model.bin 等)
has_model = False
for f in os.listdir(sub_path):
if f.endswith('.bin') or f.endswith('.safetensors'):
has_model = True
break
if has_model:
sub_items.append({
'name': sub_item,
'path': sub_path
})
if base_path and os.path.exists(base_path):
logger.info(f"[DEBUG] 遍历目录: {base_path}")
try:
# 路径结构: /app/base/saves/{train_method}/{model_name}/
# train_method: lora, full, qlora, dpo, cpt 等
models.append({
'name': item,
'path': item_path,
'train_methods': sub_items
})
for train_method in os.listdir(base_path):
train_method_path = os.path.join(base_path, train_method)
if not os.path.isdir(train_method_path):
continue
logger.info(f"[DEBUG] 检查训练方法目录: {train_method}")
model_count = 0
# 遍历模型文件夹
for model_name in os.listdir(train_method_path):
model_path = os.path.join(train_method_path, model_name)
if not os.path.isdir(model_path):
continue
# 检查是否有模型文件
try:
files = os.listdir(model_path)
logger.info(f"[DEBUG] {train_method}/{model_name} 文件: {files[:5]}...")
has_model = any(f.endswith('.bin') or f.endswith('.safetensors') for f in files)
if has_model:
logger.info(f"[DEBUG] 找到模型: {train_method}/{model_name}")
models.append({
'name': model_name,
'path': model_path,
'train_methods': [{
'name': train_method,
'path': model_path
}]
})
model_count += 1
except Exception as file_err:
logger.error(f"[DEBUG] 读取 {model_path} 失败: {file_err}")
logger.info(f"[DEBUG] {train_method} 找到 {model_count} 个模型")
except Exception as list_err:
logger.error(f"[DEBUG] 遍历目录失败: {list_err}")
logger.info(f"[DEBUG] 找到 {len(models)} 个已训练模型")
@@ -249,7 +272,7 @@ def get_trained_models():
'code': 0,
'data': {
'models': models,
'base_path': base_path
'base_path': base_path or ''
}
})
except Exception as e: