Files
YG-Datasets/easy-dataset-main/components/datasets/import/FileUploadStep.js

345 lines
9.7 KiB
JavaScript
Raw Normal View History

2026-03-17 14:36:31 +08:00
'use client';
import { useState, useCallback } from 'react';
import {
Box,
Typography,
Button,
Paper,
List,
ListItem,
ListItemIcon,
ListItemText,
LinearProgress,
Alert
} from '@mui/material';
import { CloudUpload as UploadIcon, Description as FileIcon, CheckCircle as CheckIcon } from '@mui/icons-material';
import { useTranslation } from 'react-i18next';
// import { useDropzone } from 'react-dropzone';
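/**
 * File upload step component.
 *
 * Lets the user pick a JSON, JSONL or CSV file, parses it in the browser, and reports the result:
 * `onDataLoaded(data, preview, source)` on success, `onError(message)` on failure.
 */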
export default function FileUploadStep({ onDataLoaded, onError }) {
const { t } = useTranslation();
const [uploading, setUploading] = useState(false);
const [uploadedFiles, setUploadedFiles] = useState([]);
// Robust CSV parsing: supports multi-line quoted fields and "" quote escapes.
/**
 * Parse CSV text into an array of row objects keyed by the header row.
 *
 * Pass 1 splits the text into logical records, treating a newline as a record
 * separator only when it is outside double quotes. Pass 2 hands each record to
 * parseCSVLine, which splits on commas and removes the quoting.
 *
 * @param {string} text - Raw CSV content; the first non-blank record is the header.
 * @returns {Object[]} One object per data record; missing trailing cells become ''.
 * @throws {Error} If there is no header record plus at least one data record.
 */
const parseCSV = text => {
  const lines = [];
  let currentLine = '';
  let inQuotes = false;

  for (const char of text) {
    if (char === '\n' && !inQuotes) {
      // End of a logical record; whitespace-only records are skipped.
      if (currentLine.trim()) {
        lines.push(currentLine);
      }
      currentLine = '';
      continue;
    }
    if (char === '"') {
      // An escaped quote ("") toggles twice, so the state ends up unchanged.
      inQuotes = !inQuotes;
    }
    // Keep quote characters in the record: parseCSVLine needs them to tell a
    // quoted comma from a field separator and to decode "" escapes.
    currentLine += char;
  }
  // Last record when the text does not end with a newline.
  if (currentLine.trim()) {
    lines.push(currentLine);
  }

  if (lines.length < 2) {
    throw new Error('CSV文件格式不正确至少需要标题行和一行数据');
  }

  const headers = parseCSVLine(lines[0]);
  return lines.slice(1).map(line => {
    const values = parseCSVLine(line);
    const obj = {};
    headers.forEach((header, index) => {
      obj[header] = values[index] || '';
    });
    return obj;
  });
};

/**
 * Split one logical CSV record into its fields.
 *
 * Commas inside double quotes are kept as data, "" inside a quoted field
 * yields a literal ", and every field is whitespace-trimmed (which also drops
 * a trailing \r left over from CRLF line endings).
 *
 * @param {string} line - A single record; may contain newlines inside quotes.
 * @returns {string[]} The field values, always at least one element.
 */
const parseCSVLine = line => {
  const result = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const char = line[i];
    if (char === '"') {
      if (inQuotes && line[i + 1] === '"') {
        // Escaped quote: emit one " and skip its partner.
        current += '"';
        i++;
      } else {
        // Opening or closing quote: toggles state, is not part of the value.
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      // Field separator (only outside quotes).
      result.push(current.trim());
      current = '';
    } else {
      current += char;
    }
  }
  // The final field has no trailing comma.
  result.push(current.trim());
  return result;
};
/**
 * Detect a ShareGPT-style record and flatten it into Alpaca shape.
 *
 * A record counts as ShareGPT when `conversations` is an array. Only the first
 * exchange is used: messages are scanned in order, remembering the latest
 * non-empty system and human values, and scanning stops at the first non-empty
 * gpt message. Any other record is returned as-is (same object).
 *
 * @param {Object} item - One imported record.
 * @returns {Object} `{ instruction, input: '', output, ...otherFields }` for
 *   ShareGPT records (fields other than `conversations` are copied last, so
 *   they win on a name clash), otherwise `item` itself.
 */
const convertShareGPTToAlpaca = item => {
  // Not ShareGPT: pass the record through untouched.
  if (!item.conversations || !Array.isArray(item.conversations)) {
    return item;
  }

  // Latest non-empty value seen per speaker.
  const said = { system: '', human: '', gpt: '' };
  for (const message of item.conversations) {
    const role = message.from;
    const isKnownRole = role === 'system' || role === 'human' || role === 'gpt';
    if (!isKnownRole || !message.value) {
      continue;
    }
    said[role] = message.value;
    if (role === 'gpt') {
      // Only the first round of the dialogue is kept.
      break;
    }
  }

  // A system prompt becomes a prefix of the instruction, or the whole
  // instruction when there was no human turn.
  let instruction = said.human;
  if (said.system) {
    instruction = said.human ? `${said.system}\n\n${said.human}` : said.system;
  }

  // Everything except the conversations array is carried over.
  const otherFields = Object.fromEntries(Object.entries(item).filter(([field]) => field !== 'conversations'));

  return {
    instruction,
    input: '', // ShareGPT has no separate input field
    output: said.gpt,
    ...otherFields
  };
};
/**
 * Read an uploaded file and turn it into import-ready records.
 *
 * The format is chosen by file extension (case-insensitive): `.json` (a single
 * value or an array), `.jsonl` (one JSON value per non-blank line) or `.csv`
 * (delegated to parseCSV). Every record is then passed through
 * convertShareGPTToAlpaca, and the first three records are summarised into a
 * preview whose values are strings of at most 100 characters plus '...'.
 *
 * @param {File} file - Must expose `name`, `size` and an async `text()`.
 * @returns {Promise<{data: Object[], preview: Object[], source: Object}>}
 * @throws {Error} Message is prefixed with "解析文件失败: "; the underlying
 *   error is available on `cause`.
 */
const parseFileContent = async file => {
  // Render one field value for the preview. Only null/undefined become empty,
  // so 0 and false stay visible, and nested objects are shown as JSON instead
  // of "[object Object]".
  const toPreviewText = value => {
    if (value === null || value === undefined) {
      return '';
    }
    const text = typeof value === 'object' ? JSON.stringify(value) : String(value);
    return text.length > 100 ? text.substring(0, 100) + '...' : text;
  };

  try {
    // Reading happens inside the try so read failures get the same wrapping.
    const text = await file.text();
    const extension = file.name.split('.').pop().toLowerCase();
    let data = [];

    if (extension === 'json') {
      const parsed = JSON.parse(text);
      data = Array.isArray(parsed) ? parsed : [parsed];
    } else if (extension === 'jsonl') {
      const rawLines = text.split('\n');
      for (let index = 0; index < rawLines.length; index++) {
        const line = rawLines[index];
        if (!line.trim()) {
          continue;
        }
        try {
          data.push(JSON.parse(line));
        } catch (lineError) {
          // Report the 1-based line so the user can find the broken record.
          throw new Error(`第 ${index + 1} 行不是有效的 JSON: ${lineError.message}`, { cause: lineError });
        }
      }
    } else if (extension === 'csv') {
      data = parseCSV(text);
      if (data.length === 0) {
        throw new Error('CSV文件格式不正确或没有数据');
      }
    } else {
      throw new Error('不支持的文件格式');
    }

    if (data.length === 0) {
      throw new Error('文件中没有找到有效数据');
    }

    // Records must be plain objects; a null entry would otherwise crash the
    // conversion below with an unhelpful TypeError.
    const invalidIndex = data.findIndex(
      record => record === null || typeof record !== 'object' || Array.isArray(record)
    );
    if (invalidIndex !== -1) {
      throw new Error(`第 ${invalidIndex + 1} 条记录不是有效的对象`);
    }

    // Detect ShareGPT records and convert them to Alpaca shape.
    data = data.map(convertShareGPTToAlpaca);

    // Preview: first 3 records, each value as a truncated string.
    const previewData = data.slice(0, 3).map(item => {
      const preview = {};
      Object.keys(item).forEach(key => {
        preview[key] = toPreviewText(item[key]);
      });
      return preview;
    });

    return {
      data,
      preview: previewData,
      source: {
        type: 'file',
        fileName: file.name,
        fileSize: file.size,
        totalRecords: data.length
      }
    };
  } catch (error) {
    throw new Error(`解析文件失败: ${error.message}`, { cause: error });
  }
};
/**
 * onChange handler for the hidden file input.
 *
 * Parses the first selected file with parseFileContent, records the outcome in
 * `uploadedFiles`, and notifies the parent: `onDataLoaded(data, preview,
 * source)` on success or `onError(message)` on failure. `uploading` is true
 * for the duration of the parse. Does nothing when no file was chosen.
 *
 * @param {Object} event - Change event whose `target` is the file input.
 * @returns {Promise<void>}
 */
const handleFileSelect = async event => {
  const input = event.target;
  const files = input.files;
  if (!files || files.length === 0) return;
  const file = files[0];

  // Clear the input right away: otherwise picking the same file again (for
  // example after correcting it) leaves the value unchanged and the browser
  // fires no change event. The File object already captured stays usable.
  input.value = '';

  setUploading(true);
  try {
    const result = await parseFileContent(file);
    setUploadedFiles([
      {
        name: file.name,
        size: file.size,
        status: 'success'
      }
    ]);
    onDataLoaded(result.data, result.preview, result.source);
  } catch (error) {
    setUploadedFiles([
      {
        name: file.name,
        size: file.size,
        status: 'error',
        error: error.message
      }
    ]);
    onError(error.message);
  } finally {
    // Always drop the progress indicator, whether parsing succeeded or not.
    setUploading(false);
  }
};
/**
 * Format a byte count as a short human-readable string, e.g. 1536 -> "1.5 KB".
 *
 * Uses 1024-based units up to GB with at most two decimals. Anything of 1024 GB
 * or more is still expressed in GB rather than running off the end of the unit
 * table, and non-positive or non-finite input is reported as "0 Bytes".
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} Value and unit separated by a space.
 */
const formatFileSize = bytes => {
  if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes';
  const k = 1024;
  const sizes = ['Bytes', 'KB', 'MB', 'GB'];
  // Repeated division instead of a log ratio: exact at unit boundaries and
  // the unit index can never leave the table.
  let value = bytes;
  let unitIndex = 0;
  while (value >= k && unitIndex < sizes.length - 1) {
    value /= k;
    unitIndex++;
  }
  // parseFloat drops trailing zeros left by toFixed ("1.50" -> 1.5).
  return `${parseFloat(value.toFixed(2))} ${sizes[unitIndex]}`;
};
// Render: title and supported-format hint, a clickable upload area wrapping a hidden file input,
// a progress bar while `uploading` is true, and the list of processed files with an error alert.
// NOTE(review): the upload area's label mentions drag-and-drop and a 50MB limit, but no drop
// handlers or size check are visible in this block — confirm whether they exist elsewhere.
return (
<Box>
<Typography variant="h6" gutterBottom>
{t('import.uploadFile', '上传文件')}
</Typography>
<Typography variant="body2" color="text.secondary" sx={{ mb: 3 }}>
{t('import.supportedFormats', '支持 JSON、JSONL、CSV 格式文件')}
</Typography>
{/* File upload area: clicking anywhere on it opens the hidden file input's picker */}
<Paper
sx={{
p: 4,
textAlign: 'center',
cursor: 'pointer',
border: '2px dashed',
borderColor: 'divider',
backgroundColor: 'background.paper',
transition: 'all 0.2s ease',
mb: 3,
'&:hover': {
borderColor: 'primary.main',
backgroundColor: 'action.hover'
}
}}
onClick={() => document.getElementById('file-upload-input').click()}
>
<input
id="file-upload-input"
type="file"
accept=".json,.jsonl,.csv"
onChange={handleFileSelect}
style={{ display: 'none' }}
/>
<UploadIcon sx={{ fontSize: 48, color: 'text.secondary', mb: 2 }} />
<Typography variant="h6" gutterBottom>
{t('import.dragDropFile', '拖拽文件到此处或点击选择文件')}
</Typography>
<Typography variant="body2" color="text.secondary">
{t('import.maxFileSize', '最大文件大小: 50MB')}
</Typography>
</Paper>
{/* Processing indicator, shown while a file is being parsed */}
{uploading && (
<Box sx={{ mb: 3 }}>
<Typography variant="body2" gutterBottom>
{t('import.processingFile', '正在处理文件...')}
</Typography>
<LinearProgress />
</Box>
)}
{/* Processed file list: size on success, error message on failure */}
{uploadedFiles.length > 0 && (
<Box>
<Typography variant="subtitle2" gutterBottom>
{t('import.uploadedFiles', '已上传文件')}
</Typography>
<List>
{uploadedFiles.map((file, index) => (
<ListItem key={index} sx={{ px: 0 }}>
<ListItemIcon>
{file.status === 'success' ? <CheckIcon color="success" /> : <FileIcon color="error" />}
</ListItemIcon>
<ListItemText
primary={file.name}
secondary={file.status === 'success' ? `${formatFileSize(file.size)}` : file.error}
/>
</ListItem>
))}
</List>
{uploadedFiles.some(f => f.status === 'error') && (
<Alert severity="error" sx={{ mt: 2 }}>
{t('import.uploadError', '文件上传失败,请检查文件格式是否正确')}
</Alert>
)}
</Box>
)}
</Box>
);
}