From b7cd8097ac8216f05ec0702b7d8ebf5d16b497ba Mon Sep 17 00:00:00 2001 From: "WIN-JHFT4D3SIVT\\caoxiaozhu" Date: Mon, 26 Jan 2026 16:18:23 +0800 Subject: [PATCH] =?UTF-8?q?GPU=E6=A3=80=E6=B5=8B=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + src/main.py | 150 ++++++++++++++++ web/pages/fine-tune-create.html | 107 +++++++++--- web/pages/main.html | 291 +++++++++++++++++++++++--------- 4 files changed, 445 insertions(+), 104 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0c1964c..c91f3e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ cryptography==41.0.7 requests==2.31.0 psutil==5.9.8 werkzeug==3.0.1 +pynvml==11.5.0 diff --git a/src/main.py b/src/main.py index 8483209..cbd0394 100644 --- a/src/main.py +++ b/src/main.py @@ -366,6 +366,156 @@ def health_check(): return jsonify({'status': 'error', 'code': 1, 'message': str(e)}) +# ============ 详细系统监控 ============ +@app.route('/api/system-info', methods=['GET']) +def system_info(): + """获取详细系统监控信息""" + import psutil + import os + try: + # CPU 信息 + cpu_percent = psutil.cpu_percent(interval=None) + cpu_counts = psutil.cpu_count() + cpu_freq = psutil.cpu_freq() + + # 内存信息 + memory = psutil.virtual_memory() + + # 磁盘信息 + disk = psutil.disk_usage('/') + disk_io = psutil.disk_io_counters() + + # 网络信息 + net_io = psutil.net_io_counters() + + # 系统启动时间 + boot_time = psutil.boot_time() + uptime_seconds = time.time() - boot_time + + # GPU 信息 + gpu_list = [] + try: + import pynvml + pynvml.nvmlInit() + gpu_count = pynvml.nvmlDeviceGetCount() + for i in range(gpu_count): + try: + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + name = pynvml.nvmlDeviceGetName(handle) + + # 获取显存信息 + try: + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + memory_used = mem_info.used + memory_total = mem_info.total + except: + memory_used = 0 + memory_total = 0 + + # 获取利用率 + try: + util = pynvml.nvmlDeviceGetUtilizationRates(handle) + gpu_util = util.gpu + mem_util = util.memory + except: + gpu_util = 0 + mem_util = 0 + + # 获取温度 - pynvml 11.x API: 只接受handle参数 + try: + temp = pynvml.nvmlDeviceGetTemperature(handle) + except: + temp = 0 + + # 获取功耗 + try: + power = pynvml.nvmlDeviceGetPowerUsage(handle) + except: + power = 0 + + # 获取风扇转速 (百分比) + try: + fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle) + except: + fan_speed = 0 + + # 获取显卡时钟频率 (MHz) + try: + clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM) + except: + clock = 0 + + # 获取显存时钟频率 (MHz) + try: + mem_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) + except: + mem_clock = 0 + + # 获取驱动版本信息 + try: + version = pynvml.nvmlSystemGetDriverVersion() + except: + version = '' + + gpu_list.append({ + 'name': name.decode() if isinstance(name, bytes) else name, + 'memory_used_gb': round(memory_used / (1024**3), 1), + 'memory_total_gb': round(memory_total / (1024**3), 1), + 'gpu_percent': gpu_util, + 'memory_percent': mem_util, + 'temperature': temp, + 'power_w': round(power / 1000, 1) if power > 0 else 0, + 'fan_speed': fan_speed, + 'clock_mhz': clock, + 'memory_clock_mhz': mem_clock, + 'driver_version': version.decode() if isinstance(version, bytes) else version + }) + except Exception as e: + logger.debug(f"获取GPU {i} 信息失败: {e}") + continue + pynvml.nvmlShutdown() + except Exception as e: + logger.warning(f"获取GPU信息失败: {e}") + gpu_list = [] + + return jsonify({ + 'code': 0, + 'data': { + 'cpu': { + 'percent': cpu_percent, + 'cores': cpu_counts, + 'frequency_mhz': cpu_freq.current if cpu_freq else 0 + }, + 'memory': { + 'percent': memory.percent, + 'used_gb': round(memory.used / (1024**3), 1), + 'total_gb': round(memory.total / (1024**3), 1), + 'available_gb': round(memory.available / (1024**3), 1), + 'cached_gb': round(memory.cached / (1024**3), 1) if hasattr(memory, 'cached') else 0 + }, + 'disk': { + 'percent': disk.percent, + 'used_gb': round(disk.used / (1024**3), 0), + 'total_gb': round(disk.total / (1024**3), 0), + 'read_mb': round(disk_io.read_bytes / (1024**2), 0), + 'write_mb': round(disk_io.write_bytes / (1024**2), 0) + }, + 'network': { + 'upload_mb': round(net_io.bytes_sent / (1024**2), 1), + 'download_mb': round(net_io.bytes_recv / (1024**2), 1) + }, + 'system': { + 'uptime_seconds': uptime_seconds, + 'process_count': len(psutil.pids()) + }, + 'gpu': gpu_list + } + }) + except Exception as e: + logger.error(f"获取系统信息失败: {e}") + return jsonify({'code': 1, 'message': str(e)}) + + # ============ 通用 CRUD 操作 ============ def generic_get_all(table_name, order_by='create_time DESC'): """通用查询所有""" diff --git a/web/pages/fine-tune-create.html b/web/pages/fine-tune-create.html index 879f137..d6f6361 100644 --- a/web/pages/fine-tune-create.html +++ b/web/pages/fine-tune-create.html @@ -826,23 +826,45 @@ } } - // 获取GPU数据(模拟数据,实际可从API获取) + // 获取GPU数据(从真实API获取) async function fetchGPUs() { - // 实际项目中可以调用后端API获取GPU信息 - // const response = await fetch(`${API_BASE}/gpus`); - // return await response.json(); + try { + const response = await fetch(`${API_BASE}/system-info`); + const result = await response.json(); - // 模拟GPU数据 - return [ - { id: 'gpu0', name: 'NVIDIA A100 80GB', memory: '80GB', cuda_cores: 6912, available: true }, - { id: 'gpu1', name: 'NVIDIA A100 80GB', memory: '80GB', cuda_cores: 6912, available: true }, - { id: 'gpu2', name: 'NVIDIA A100 40GB', memory: '40GB', cuda_cores: 6912, available: true }, - { id: 'gpu3', name: 'NVIDIA A100 40GB', memory: '40GB', cuda_cores: 6912, available: false }, - { id: 'gpu4', name: 'NVIDIA V100 32GB', memory: '32GB', cuda_cores: 5120, available: true }, - { id: 'gpu5', name: 'NVIDIA V100 16GB', memory: '16GB', cuda_cores: 5120, available: false }, - { id: 'gpu6', name: 'NVIDIA RTX 3090', memory: '24GB', cuda_cores: 10496, available: true }, - { id: 'gpu7', name: 'NVIDIA RTX 4090', memory: '24GB', cuda_cores: 16384, available: true } - ]; + if (result.code === 0 && result.data.gpu && result.data.gpu.length > 0) { + // 将真实GPU数据转换为前端所需格式 + return result.data.gpu.map((gpu, index) => ({ + id: `gpu${index}`, + name: gpu.name || `GPU ${index}`, + memory: `${gpu.memory_total_gb}GB`, + cuda_cores: 'N/A', + available: gpu.power_w > 0 || gpu.gpu_percent >= 0, // 有数据即为可用 + real_data: gpu // 保存真实数据供显示 + })); + } + + // 如果没有真实数据,尝试获取驱动版本 + const driverVersion = result.data.gpu?.[0]?.driver_version || ''; + if (driverVersion) { + return [{ + id: 'gpu0', + name: 'NVIDIA GPU (Detected)', + memory: 'Unknown', + cuda_cores: 'N/A', + available: true, + real_data: result.data.gpu?.[0] || null + }]; + } + + throw new Error('未检测到GPU设备'); + } catch (error) { + console.warn('获取GPU信息失败,使用模拟数据:', error); + // 失败时返回模拟数据作为后备 + return [ + { id: 'gpu0', name: 'NVIDIA GPU (未检测到)', memory: 'Unknown', cuda_cores: 'N/A', available: false } + ]; + } } // 渲染GPU列表(点击卡片选中,无需复选框) @@ -850,7 +872,18 @@ const container = document.getElementById('gpuSelectionArea'); if (!container) return; - container.innerHTML = gpus.map(gpu => ` + container.innerHTML = gpus.map(gpu => { + // 从真实数据中提取监控信息 + const realData = gpu.real_data || {}; + const memoryUsed = realData.memory_used_gb || 0; + const memoryTotal = realData.memory_total_gb || 0; + const temp = realData.temperature || 0; + const power = realData.power_w || 0; + const gpuPercent = realData.gpu_percent || 0; + const fanSpeed = realData.fan_speed || 0; + const clock = realData.clock_mhz || 0; + + return `
' : ''}
-
- ${gpu.memory} - ${gpu.cuda_cores} CUDA +
+ ${memoryUsed}/${memoryTotal} GB + ${temp}°C + ${power} W + ${clock} MHz +
+ +
+
+ GPU: ${gpuPercent}% + Fan: ${fanSpeed}% +
+
+
+
- `).join(''); + `}).join(''); } // 切换GPU选择状态 @@ -884,11 +930,16 @@ if (card.classList.contains('border-primary')) { // 取消选中 card.classList.remove('border-primary', 'bg-blue-50'); - card.querySelector('.fa-check-circle').classList.replace('text-primary', 'text-green-600'); + // 恢复图标为可选中状态(绿色勾选圈) + const icon = card.querySelector('.fa-check, .fa-check-circle'); + if (icon) { + icon.classList.remove('fa-check', 'text-primary'); + icon.classList.add('fa-check-circle', 'text-green-600'); + } } else { // 选中 card.classList.add('border-primary', 'bg-blue-50'); - // 移除检查图标,添加选中标记 + // 切换图标为已选中状态(蓝色勾选) const icon = card.querySelector('.fa-check-circle'); if (icon) { icon.classList.remove('fa-check-circle', 'text-green-600'); @@ -900,7 +951,17 @@ // 获取选中的GPU列表 function getSelectedGPUs() { const cards = document.querySelectorAll('.gpu-card.border-primary'); - return Array.from(cards).map(card => card.dataset.gpuId); + return Array.from(cards).map(card => { + const gpuId = card.dataset.gpuId; + // 获取GPU名称和显存信息用于显示 + const nameEl = card.querySelector('.text-gray-700'); + const name = nameEl ? nameEl.textContent : gpuId; + // 返回GPU信息对象 + return { + id: gpuId, + name: name + }; + }); } // 提交表单 diff --git a/web/pages/main.html b/web/pages/main.html index 3e73bb7..a889fb1 100644 --- a/web/pages/main.html +++ b/web/pages/main.html @@ -1721,8 +1721,69 @@ let refreshTimer = null; let currentRefreshInterval = 5000; - // 刷新硬件信息 - function refreshHardwareInfo() { + // 刷新硬件信息(使用真实API) + async function refreshHardwareInfo() { + try { + const response = await fetch(`${API_BASE}/system-info`); + const result = await response.json(); + + if (result.code === 0 && result.data) { + const data = result.data; + + // 更新CPU + const cpu = data.cpu || {}; + const cpuPercent = cpu.percent || 0; + document.getElementById('cpuPercent').textContent = cpuPercent + '%'; + document.getElementById('cpuBar').style.width = cpuPercent + '%'; + document.getElementById('cpuCores').textContent = (cpu.cores || 0) + ' 核心'; + + // 更新内存 + const mem = data.memory || {}; + const memUsed = mem.used_gb || 0; + const memTotal = mem.total_gb || 0; + const memPercent = mem.percent || 0; + document.getElementById('memoryPercent').textContent = memPercent + '%'; + document.getElementById('memoryBar').style.width = memPercent + '%'; + document.getElementById('memoryUsed').textContent = memUsed + ' GB'; + document.getElementById('memoryAvailable').textContent = (mem.available_gb || 0) + ' GB'; + document.getElementById('memoryCached').textContent = (mem.cached_gb || 0) + ' GB'; + + // 更新磁盘 + const disk = data.disk || {}; + const diskUsed = disk.used_gb || 0; + const diskTotal = disk.total_gb || 0; + const diskPercent = disk.percent || 0; + document.getElementById('diskPercent').textContent = diskPercent + '%'; + document.getElementById('diskBar').style.width = diskPercent + '%'; + document.getElementById('diskUsed').textContent = diskUsed + ' GB'; + document.getElementById('diskAvailable').textContent = (diskTotal - diskUsed) + ' GB'; + + // 更新网络 + const net = data.network || {}; + document.getElementById('totalDownload').textContent = (net.download_mb || 0) + ' GB'; + document.getElementById('totalUpload').textContent = (net.upload_mb || 0) + ' GB'; + + // 更新系统信息 + const sys = data.system || {}; + const uptime = sys.uptime_seconds || 0; + const days = Math.floor(uptime / 86400); + const hours = Math.floor((uptime % 86400) / 3600); + const mins = Math.floor((uptime % 3600) / 60); + document.getElementById('uptime').textContent = days + ' 天 ' + hours + ' 时 ' + mins + ' 分'; + document.getElementById('processCount').textContent = sys.process_count || 0; + + // 更新GPU信息(传入真实数据) + updateGPUInfo(data.gpu || []); + } + } catch (error) { + console.error('获取系统信息失败:', error); + // 如果API调用失败,使用模拟数据作为后备 + useMockData(); + } + } + + // 使用模拟数据(当API不可用时) + function useMockData() { // 更新CPU const cpuUsage = Math.floor(Math.random() * 30) + 20; document.getElementById('cpuPercent').textContent = cpuUsage + '%'; @@ -1768,10 +1829,6 @@ document.getElementById('uptime').textContent = days + ' 天 ' + hours + ' 时 ' + mins + ' 分'; document.getElementById('processCount').textContent = Math.floor(Math.random() * 200 + 100); document.getElementById('loadAvg').textContent = (Math.random() * 2).toFixed(2) + ', ' + (Math.random() * 1.5).toFixed(2) + ', ' + (Math.random() * 1).toFixed(2); - - // 更新时间 - // const now = new Date(); - // document.getElementById('updateTime').textContent = now.toLocaleTimeString('zh-CN'); } // GPU配置 - 支持模拟1-8块GPU @@ -1787,95 +1844,167 @@ { name: 'NVIDIA RTX 4080', memory: 16 } ]; - // 初始化GPU列表 - function initGPUList() { - const gpuList = document.getElementById('gpuList'); - const gpuCount = Math.min(GPU_COUNT, 8); - document.getElementById('gpuCount').textContent = `检测到 ${gpuCount} 块 GPU`; - - let gpuCardsHTML = ''; - for (let i = 0; i < gpuCount; i++) { - const config = gpuConfigs[i % gpuConfigs.length]; - gpuCardsHTML += ` -
-
-
-
- -
-
-
${config.name}
-
PCIe ${Math.floor(Math.random() * 4 + 1)}:00.0
-
-
-
- 0% -
-
-
-
-
-
-
-
显存
-
0/${config.memory}G
-
-
-
温度
-
0°C
-
-
-
功耗
-
0W
-
-
-
Fan
-
0%
-
-
-
- `; + // 初始化GPU列表(获取真实数据) + async function initGPUList() { + try { + const response = await fetch(`${API_BASE}/system-info`); + const result = await response.json(); + const gpuData = (result.data && result.data.gpu) || []; + updateGPUInfo(gpuData); + } catch (error) { + console.error('初始化GPU列表失败:', error); + useMockGPUData(); } - gpuList.innerHTML = gpuCardsHTML; } // 更新GPU信息 - function updateGPUInfo() { + function updateGPUInfo(gpuData) { + // 如果有真实数据,使用真实数据 + if (gpuData && gpuData.length > 0) { + const gpuCount = gpuData.length; + document.getElementById('gpuCount').textContent = `检测到 ${gpuCount} 块 GPU`; + + let totalUsedMemory = 0; + let totalMemory = 0; + + // 重新初始化GPU列表 + const gpuList = document.getElementById('gpuList'); + if (gpuList) { + let gpuCardsHTML = ''; + for (let i = 0; i < gpuCount; i++) { + const gpu = gpuData[i]; + totalUsedMemory += gpu.memory_used_gb; + totalMemory += gpu.memory_total_gb; + + gpuCardsHTML += ` +
+
+
+
+ +
+
+
${gpu.name}
+
PCIe
+
+
+
+ ${gpu.gpu_percent}% +
+
+
+
+
+
+
+
显存
+
${gpu.memory_used_gb}/${gpu.memory_total_gb} GB
+
+
+
温度
+
${gpu.temperature}°C
+
+
+
功耗
+
${gpu.power_w} W
+
+
+
Fan
+
${gpu.fan_speed || 0}%
+
+
+
+
Clock: ${gpu.clock_mhz || 0} MHz
+
Driver: ${gpu.driver_version || '-'}
+
+
+ `; + } + gpuList.innerHTML = gpuCardsHTML; + } + + // 更新总显存 + const gpuTotalMem = document.getElementById('gpuTotalMemory'); + if (gpuTotalMem) { + gpuTotalMem.textContent = `${totalUsedMemory}/${totalMemory} GB`; + } + return; + } + + // 没有真实数据,使用模拟数据 + useMockGPUData(); + } + + // 使用模拟GPU数据 + function useMockGPUData() { const gpuCount = Math.min(GPU_COUNT, 8); let totalUsedMemory = 0; let totalMemory = 0; - for (let i = 0; i < gpuCount; i++) { - const config = gpuConfigs[i % gpuConfigs.length]; - const gpuUsage = Math.floor(Math.random() * 60 + 20); - const memUsed = (Math.random() * config.memory * 0.7 + config.memory * 0.1).toFixed(1); - const temp = Math.floor(Math.random() * 30 + 40); - const power = Math.floor(Math.random() * 150 + 100); - const fan = Math.floor(gpuUsage + Math.random() * 10); + // 重新初始化GPU列表 + const gpuList = document.getElementById('gpuList'); + if (gpuList) { + let gpuCardsHTML = ''; + for (let i = 0; i < gpuCount; i++) { + const config = gpuConfigs[i % gpuConfigs.length]; + const gpuUsage = Math.floor(Math.random() * 60 + 20); + const memUsed = (Math.random() * config.memory * 0.7 + config.memory * 0.1).toFixed(1); + const temp = Math.floor(Math.random() * 30 + 40); + const power = Math.floor(Math.random() * 150 + 100); + const fan = Math.floor(gpuUsage + Math.random() * 10); - totalUsedMemory += parseFloat(memUsed); - totalMemory += config.memory; + totalUsedMemory += parseFloat(memUsed); + totalMemory += config.memory; - document.getElementById(`gpuPercent${i}`).textContent = gpuUsage + '%'; - document.getElementById(`gpuBar${i}`).style.width = gpuUsage + '%'; - document.getElementById(`gpuMem${i}`).textContent = `${parseFloat(memUsed).toFixed(1)}/${config.memory} GB`; - document.getElementById(`gpuTemp${i}`).textContent = temp + '°C'; - document.getElementById(`gpuPower${i}`).textContent = power + ' W'; - document.getElementById(`gpuFan${i}`).textContent = fan + '%'; - - // 根据温度改变颜色 - const tempEl = document.getElementById(`gpuTemp${i}`); - if (temp >= 80) { - tempEl.className = 'font-medium text-red-600'; - } else if (temp >= 70) { - tempEl.className = 'font-medium text-yellow-600'; - } else { - tempEl.className = 'font-medium text-gray-800'; + gpuCardsHTML += ` +
+
+
+
+ +
+
+
${config.name}
+
PCIe ${Math.floor(Math.random() * 4 + 1)}:00.0
+
+
+
+ ${gpuUsage}% +
+
+
+
+
+
+
+
显存
+
${parseFloat(memUsed).toFixed(1)}/${config.memory} GB
+
+
+
温度
+
${temp}°C
+
+
+
功耗
+
${power} W
+
+
+
Fan
+
${fan}%
+
+
+
+ `; } + gpuList.innerHTML = gpuCardsHTML; + document.getElementById('gpuCount').textContent = `检测到 ${gpuCount} 块 GPU`; } // 更新总显存 - document.getElementById('gpuTotalMemory').textContent = `${totalUsedMemory.toFixed(1)}/${totalMemory} GB`; + const gpuTotalMem = document.getElementById('gpuTotalMemory'); + if (gpuTotalMem) { + gpuTotalMem.textContent = `${totalUsedMemory.toFixed(1)}/${totalMemory} GB`; + } } // 启动硬件监控自动刷新