'use client'; import { useState, useCallback } from 'react'; import { Box, Typography, Button, Paper, List, ListItem, ListItemIcon, ListItemText, LinearProgress, Alert } from '@mui/material'; import { CloudUpload as UploadIcon, Description as FileIcon, CheckCircle as CheckIcon } from '@mui/icons-material'; import { useTranslation } from 'react-i18next'; // import { useDropzone } from 'react-dropzone'; /** * 文件上传步骤组件 */ export default function FileUploadStep({ onDataLoaded, onError }) { const { t } = useTranslation(); const [uploading, setUploading] = useState(false); const [uploadedFiles, setUploadedFiles] = useState([]); // 健壮的CSV解析函数,支持多行字段和引号转义 const parseCSV = text => { const result = []; const lines = []; let currentLine = ''; let inQuotes = false; // 逐字符解析,正确处理引号内的换行符 for (let i = 0; i < text.length; i++) { const char = text[i]; const nextChar = text[i + 1]; if (char === '"') { if (inQuotes && nextChar === '"') { // 转义的引号 currentLine += '"'; i++; // 跳过下一个引号 } else { // 切换引号状态 inQuotes = !inQuotes; } } else if (char === '\n' && !inQuotes) { // 行结束(不在引号内) if (currentLine.trim()) { lines.push(currentLine); } currentLine = ''; } else { currentLine += char; } } // 添加最后一行 if (currentLine.trim()) { lines.push(currentLine); } if (lines.length < 2) { throw new Error('CSV文件格式不正确,至少需要标题行和一行数据'); } // 解析标题行 const headers = parseCSVLine(lines[0]); // 解析数据行 for (let i = 1; i < lines.length; i++) { const values = parseCSVLine(lines[i]); if (values.length > 0) { const obj = {}; headers.forEach((header, index) => { obj[header] = values[index] || ''; }); result.push(obj); } } return result; }; // 解析单行CSV,处理逗号分隔和引号转义 const parseCSVLine = line => { const result = []; let current = ''; let inQuotes = false; for (let i = 0; i < line.length; i++) { const char = line[i]; const nextChar = line[i + 1]; if (char === '"') { if (inQuotes && nextChar === '"') { // 转义的引号 current += '"'; i++; // 跳过下一个引号 } else { // 切换引号状态 inQuotes = !inQuotes; } } else if (char === ',' && !inQuotes) { // 字段分隔符(不在引号内) result.push(current.trim()); current = ''; } else { current += char; } } // 添加最后一个字段 result.push(current.trim()); return result; }; // 检测并转换ShareGPT格式为Alpaca格式 const convertShareGPTToAlpaca = item => { // 检查是否包含conversations字段且格式正确 if (item.conversations && Array.isArray(item.conversations)) { const conversations = item.conversations; // 查找system、human、gpt消息 let systemMessage = ''; let instruction = ''; let output = ''; for (const conv of conversations) { if (conv.from === 'system' && conv.value) { systemMessage = conv.value; } else if (conv.from === 'human' && conv.value) { instruction = conv.value; } else if (conv.from === 'gpt' && conv.value) { output = conv.value; break; // 只取第一轮对话 } } // 如果有system消息,将其作为instruction的前缀 if (systemMessage && instruction) { instruction = `${systemMessage}\n\n${instruction}`; } else if (systemMessage && !instruction) { instruction = systemMessage; } // 转换为Alpaca格式 return { instruction: instruction || '', input: '', // ShareGPT格式通常没有单独的input字段 output: output || '', // 保留其他字段 ...Object.fromEntries(Object.entries(item).filter(([key]) => key !== 'conversations')) }; } return item; // 如果不是ShareGPT格式,返回原始数据 }; const parseFileContent = async file => { const text = await file.text(); const extension = file.name.split('.').pop().toLowerCase(); try { let data = []; if (extension === 'json') { const parsed = JSON.parse(text); data = Array.isArray(parsed) ? parsed : [parsed]; } else if (extension === 'jsonl') { data = text .split('\n') .filter(line => line.trim()) .map(line => JSON.parse(line)); } else if (extension === 'csv') { // 更健壮的CSV解析,支持多行字段和引号转义 data = parseCSV(text); if (data.length === 0) { throw new Error('CSV文件格式不正确或没有数据'); } } else { throw new Error('不支持的文件格式'); } if (data.length === 0) { throw new Error('文件中没有找到有效数据'); } // 检测并转换ShareGPT格式为Alpaca格式 data = data.map(convertShareGPTToAlpaca); // 生成预览数据(取前3条记录,每个字段值截取前100字符) const previewData = data.slice(0, 3).map(item => { const preview = {}; Object.keys(item).forEach(key => { const value = String(item[key] || ''); preview[key] = value.length > 100 ? value.substring(0, 100) + '...' : value; }); return preview; }); return { data, preview: previewData, source: { type: 'file', fileName: file.name, fileSize: file.size, totalRecords: data.length } }; } catch (error) { throw new Error(`解析文件失败: ${error.message}`); } }; const handleFileSelect = async event => { const files = event.target.files; if (!files || files.length === 0) return; const file = files[0]; setUploading(true); try { const result = await parseFileContent(file); setUploadedFiles([ { name: file.name, size: file.size, status: 'success' } ]); onDataLoaded(result.data, result.preview, result.source); } catch (error) { setUploadedFiles([ { name: file.name, size: file.size, status: 'error', error: error.message } ]); onError(error.message); } finally { setUploading(false); } }; const formatFileSize = bytes => { if (bytes === 0) return '0 Bytes'; const k = 1024; const sizes = ['Bytes', 'KB', 'MB', 'GB']; const i = Math.floor(Math.log(bytes) / Math.log(k)); return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; }; return ( {t('import.uploadFile', '上传文件')} {t('import.supportedFormats', '支持 JSON、JSONL、CSV 格式文件')} {/* 文件上传区域 */} document.getElementById('file-upload-input').click()} > {t('import.dragDropFile', '拖拽文件到此处或点击选择文件')} {t('import.maxFileSize', '最大文件大小: 50MB')} {/* 上传进度 */} {uploading && ( {t('import.processingFile', '正在处理文件...')} )} {/* 已上传文件列表 */} {uploadedFiles.length > 0 && ( {t('import.uploadedFiles', '已上传文件')} {uploadedFiles.map((file, index) => ( {file.status === 'success' ? : } ))} {uploadedFiles.some(f => f.status === 'error') && ( {t('import.uploadError', '文件上传失败,请检查文件格式是否正确')} )} )} ); }