Files
YG-Datasets/easy-dataset-main/app/api/projects/[projectId]/huggingface/upload/route.js

311 lines
9.1 KiB
JavaScript
Raw Normal View History

2026-03-17 14:36:31 +08:00
import { NextResponse } from 'next/server';
import { getProject } from '@/lib/db/projects';
import { getDatasets } from '@/lib/db/datasets';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { pathToFileURL } from 'url';
import { uploadFiles, createRepo, checkRepoAccess } from '@huggingface/hub';
/**
 * Upload a project's dataset to the Hugging Face Hub.
 *
 * Reads the export options from the JSON request body, formats the project's
 * datasets, writes the dataset file and a README.md into a temporary
 * directory, creates the dataset repository when `checkRepoAccess` reports a
 * 404, and uploads both files.
 *
 * @param {Request} request - Incoming request; its JSON body supplies `token`,
 *   `datasetName`, `isPrivate`, `formatType`, `systemPrompt`, `confirmedOnly`,
 *   `includeCOT`, `fileFormat`, `customFields` and `reasoningLanguage`.
 * @param {{ params: { projectId: string } }} context - Route context carrying the project id.
 * @returns {Promise<Response>} JSON response: 404 when the project is missing,
 *   400 when there is nothing to export or `fileFormat` is unsupported,
 *   500 on upload or unexpected errors, otherwise `{ success, message, url }`.
 */
export async function POST(request, { params }) {
  // Declared outside the try block so the finally clause can always clean it up.
  let tempDir = null;
  try {
    const projectId = params.projectId;
    const {
      token,
      datasetName,
      isPrivate,
      formatType,
      systemPrompt,
      confirmedOnly,
      includeCOT,
      fileFormat,
      customFields,
      reasoningLanguage
    } = await request.json();

    // Load the project
    const project = await getProject(projectId);
    if (!project) {
      return NextResponse.json({ error: '项目不存在' }, { status: 404 });
    }

    // Load the dataset entries to export
    const questions = await getDatasets(projectId, confirmedOnly);
    if (!questions || questions.length === 0) {
      return NextResponse.json({ error: '没有可用的数据集问题' }, { status: 400 });
    }

    // fileFormat comes from the request body and is interpolated into a file
    // path below, so only the formats this handler can actually write are
    // accepted. Anything else previously produced no file and failed later.
    const supportedFormats = ['json', 'jsonl', 'csv'];
    if (!supportedFormats.includes(fileFormat)) {
      return NextResponse.json({ error: `Unsupported file format: ${fileFormat}` }, { status: 400 });
    }

    // Format the dataset. reasoningLanguage is forwarded as well because the
    // 'multilingualthinking' branch of formatDataset reads a value by that name.
    const formattedData = formatDataset(
      questions,
      formatType,
      systemPrompt,
      includeCOT,
      customFields,
      reasoningLanguage
    );

    // Create the temporary working directory
    tempDir = path.join(os.tmpdir(), `hf-upload-${projectId}-${Date.now()}`);
    fs.mkdirSync(tempDir, { recursive: true });

    // Write the dataset file
    const datasetFilePath = path.join(tempDir, `dataset.${fileFormat}`);
    if (fileFormat === 'json') {
      fs.writeFileSync(datasetFilePath, JSON.stringify(formattedData, null, 2));
    } else if (fileFormat === 'jsonl') {
      const jsonlContent = formattedData.map(item => JSON.stringify(item)).join('\n');
      fs.writeFileSync(datasetFilePath, jsonlContent);
    } else {
      // Only 'csv' remains after the validation above.
      const csvContent = convertToCSV(formattedData);
      fs.writeFileSync(datasetFilePath, csvContent);
    }

    // Write the README.md file
    const readmePath = path.join(tempDir, 'README.md');
    const readmeContent = generateReadme(project.name, project.description, formatType);
    fs.writeFileSync(readmePath, readmeContent);

    try {
      // Repository descriptor shared by the access check and the creation call
      const repo = { type: 'dataset', name: datasetName };

      // Check whether the repository already exists
      let repoExists = true;
      try {
        await checkRepoAccess({ repo, accessToken: token });
        console.log(`Repository ${datasetName} exists, continuing to upload files`);
      } catch (error) {
        // A 404 status code is treated as "the repository does not exist"
        if (error.statusCode === 404) {
          repoExists = false;
          console.log(`Repository ${datasetName} does not exist, preparing to create`);
        } else {
          // Other errors (e.g., permission errors)
          throw new Error(`Failed to check repository access: ${error.message}`, { cause: error });
        }
      }

      // If the repository does not exist, create a new one
      if (!repoExists) {
        try {
          await createRepo({
            repo,
            accessToken: token,
            private: isPrivate,
            license: 'mit',
            description: project.description || 'Dataset created with Easy Dataset'
          });
          console.log(`Successfully created dataset repository: ${datasetName}`);
        } catch (error) {
          throw new Error(`Failed to create dataset repository: ${error.message}`, { cause: error });
        }
      }

      // Upload the dataset file, then the README.md
      await uploadFile(token, datasetName, datasetFilePath, `dataset.${fileFormat}`);
      await uploadFile(token, datasetName, readmePath, 'README.md');
    } catch (error) {
      console.error('Upload to HuggingFace Failed:', String(error));
      return NextResponse.json({ error: `Upload Error: ${error.message}` }, { status: 500 });
    }

    // Report success
    const datasetUrl = `https://huggingface.co/datasets/${datasetName}`;
    return NextResponse.json({
      success: true,
      message: 'Upload successfully HuggingFace',
      url: datasetUrl
    });
  } catch (error) {
    console.error('Upload Faile:', String(error));
    return NextResponse.json({ error: error.message }, { status: 500 });
  } finally {
    // Remove the temporary directory on every exit path (success, upload
    // failure, unexpected error). Cleanup is best-effort: a failure here is
    // logged and must not replace the response chosen above.
    if (tempDir) {
      try {
        fs.rmSync(tempDir, { recursive: true, force: true });
      } catch (cleanupError) {
        console.error('Failed to remove temporary directory:', String(cleanupError));
      }
    }
  }
}
/**
 * Convert dataset entries into the requested export format.
 *
 * Supported `formatType` values are 'alpaca', 'sharegpt',
 * 'multilingualthinking' and 'custom' (the latter only when `customFields` is
 * provided). Any other value falls back to a minimal instruction/output shape.
 *
 * @param {Array<object>} questions - Entries; `question`, `answer`, `cot`, `labels` and `chunkId` are read.
 * @param {string} formatType - Name of the target format.
 * @param {string} [systemPrompt] - Optional system prompt added to each item.
 * @param {boolean} [includeCOT] - When truthy, entries that have `cot` include it.
 * @param {object} [customFields] - For 'custom': `questionField`, `answerField`,
 *   `cotField`, `includeLabels`, `includeChunk`.
 * @param {string} [reasoningLanguage='English'] - Value for `reasoning_language`
 *   in the 'multilingualthinking' format. Previously this name was read without
 *   being declared, so that format always threw a ReferenceError.
 * @returns {Array<object>} One formatted item per entry.
 */
function formatDataset(questions, formatType, systemPrompt, includeCOT, customFields, reasoningLanguage = 'English') {
  // Answer text with the chain of thought prepended when requested and present.
  const answerWithCot = q => (includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer);
  // Chain of thought as a string, or null when not requested / not present.
  const cotOrNull = q => (includeCOT && q.cot ? `${q.cot}` : null);

  if (formatType === 'alpaca') {
    return questions.map(q => {
      const item = {
        instruction: q.question,
        input: '',
        output: answerWithCot(q)
      };
      if (systemPrompt) {
        item.system = systemPrompt;
      }
      return item;
    });
  }

  if (formatType === 'sharegpt') {
    return questions.map(q => {
      const messages = [];
      if (systemPrompt) {
        messages.push({ role: 'system', content: systemPrompt });
      }
      messages.push({ role: 'user', content: q.question });
      messages.push({ role: 'assistant', content: answerWithCot(q) });
      return { messages };
    });
  }

  if (formatType === 'multilingualthinking') {
    return questions.map(q => {
      const messages = [];

      // Main message block
      const mainMsg = {
        // `||` keeps the fallback for null / empty values sent by callers.
        reasoning_language: reasoningLanguage || 'English',
        user: q.question,
        analysis: cotOrNull(q),
        final: q.answer
      };
      if (systemPrompt) {
        mainMsg.developer = systemPrompt;
      }
      messages.push(mainMsg);

      // Optional system prompt
      if (systemPrompt) {
        messages.push({ role: 'system', content: systemPrompt, thinking: null });
      }

      // User message
      messages.push({ role: 'user', content: q.question, thinking: null });

      // Assistant message
      messages.push({ role: 'assistant', content: q.answer, thinking: cotOrNull(q) });

      return { messages };
    });
  }

  if (formatType === 'custom' && customFields) {
    return questions.map(q => {
      const item = {
        [customFields.questionField]: q.question,
        [customFields.answerField]: q.answer
      };
      if (includeCOT && q.cot) {
        item[customFields.cotField] = q.cot;
      }
      if (customFields.includeLabels && q.labels) {
        item.labels = q.labels;
      }
      if (customFields.includeChunk && q.chunkId) {
        item.chunkId = q.chunkId;
      }
      return item;
    });
  }

  // Fallback: alpaca-like items carrying only instruction and output.
  return questions.map(q => ({
    instruction: q.question,
    output: answerWithCot(q)
  }));
}
/**
 * Serialize an array of flat-ish objects to CSV text.
 *
 * The header row is the union of keys across ALL rows, in first-seen order, so
 * optional keys that are absent from the first row still get a column. String
 * values are always double-quoted with embedded quotes doubled; arrays and
 * objects are JSON-encoded and then quoted; null/undefined become an empty
 * cell; other primitives are written as-is.
 *
 * @param {Array<object>} data - Rows to serialize.
 * @returns {string} CSV text joined with '\n', or '' for empty / missing input.
 */
function convertToCSV(data) {
  if (!data || data.length === 0) return '';

  // Wrap text in double quotes, doubling any embedded quote characters.
  const quote = text => `"${text.replace(/"/g, '""')}"`;

  // Collect every key that appears in any row, preserving first-seen order.
  const headers = [];
  const seen = new Set();
  for (const item of data) {
    for (const key of Object.keys(item)) {
      if (!seen.has(key)) {
        seen.add(key);
        headers.push(key);
      }
    }
  }

  // Header names are only quoted when they contain a character that would
  // otherwise break the row, so ordinary headers stay unquoted as before.
  const headerRow = headers.map(header => (/[",\r\n]/.test(header) ? quote(header) : header)).join(',');

  const rows = data.map(item =>
    headers
      .map(header => {
        const value = item[header];
        if (value === undefined || value === null) {
          return '';
        }
        if (typeof value === 'string') {
          return quote(value);
        }
        if (typeof value === 'object') {
          // Covers both arrays and plain objects.
          return quote(JSON.stringify(value));
        }
        return String(value);
      })
      .join(',')
  );

  return [headerRow, ...rows].join('\n');
}
/**
 * Upload a single local file to a Hugging Face dataset repository using the
 * `uploadFiles` helper from @huggingface/hub.
 *
 * @param {string} token - Access token passed as `accessToken`.
 * @param {string} datasetName - Name of the dataset repository.
 * @param {string} filePath - Path of the local file to upload.
 * @param {string} destFileName - Path of the file inside the repository.
 * @returns {Promise<{ success: boolean }>} `{ success: true }` once the upload resolves.
 * @throws Rethrows whatever `uploadFiles` throws, after logging it.
 */
async function uploadFile(token, datasetName, filePath, destFileName) {
  try {
    // Repository descriptor
    const repo = { type: 'dataset', name: datasetName };

    // Build the file: URL with pathToFileURL rather than string concatenation:
    // it percent-encodes characters such as '#', '?' and '%' that the URL
    // parser would otherwise reinterpret, and it handles platform-specific
    // path syntax.
    const fileUrl = pathToFileURL(filePath);

    await uploadFiles({
      repo,
      accessToken: token,
      files: [
        {
          path: destFileName,
          content: fileUrl
        }
      ],
      commitTitle: `Upload ${destFileName}`,
      commitDescription: `Files uploaded using Easy Dataset`
    });

    return { success: true };
  } catch (error) {
    console.error(`File ${destFileName} Upload Error:`, String(error));
    throw error;
  }
}
/**
 * Build the README.md text that is uploaded next to the dataset file.
 *
 * @param {string} projectName - Used as the top-level heading.
 * @param {string} [projectDescription] - Description text; a stock sentence is used when falsy.
 * @param {string} formatType - Format name mentioned in the "Format" section.
 * @returns {string} Markdown text ending with a newline.
 */
function generateReadme(projectName, projectDescription, formatType) {
  const descriptionText = projectDescription || 'This dataset was created using the Easy Dataset tool.';

  const lines = [
    `# ${projectName}`,
    '## Description',
    `${descriptionText}`,
    '## Format',
    `This dataset is in ${formatType} format.`,
    '## Creation Method',
    'This dataset was created using the [Easy Dataset](https://github.com/ConardLi/easy-dataset) tool.',
    '> Easy Dataset is a specialized application designed to streamline the creation of fine-tuning datasets for Large Language Models (LLMs). It offers an intuitive interface for uploading domain-specific files, intelligently splitting content, generating questions, and producing high-quality training data for model fine-tuning.',
    // Trailing empty entry yields the final newline.
    ''
  ];

  return lines.join('\n');
}