311 lines
9.1 KiB
JavaScript
311 lines
9.1 KiB
JavaScript
import { NextResponse } from 'next/server';
import { getProject } from '@/lib/db/projects';
import { getDatasets } from '@/lib/db/datasets';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { pathToFileURL } from 'url';
import { uploadFiles, createRepo, checkRepoAccess } from '@huggingface/hub';
|
|
|
|
/**
 * Upload a project's dataset to a Hugging Face dataset repository.
 *
 * Steps: read the export options from the JSON body, load the project and its
 * dataset rows, format the rows, write the dataset file and a README.md into a
 * temporary directory, make sure the target repository exists (it is created
 * when the access check reports 404), then upload both files.
 *
 * @param {Request} request - Incoming request. The JSON body carries `token`,
 *   `datasetName`, `isPrivate`, `formatType`, `systemPrompt`, `confirmedOnly`,
 *   `includeCOT`, `fileFormat` ('json' | 'jsonl' | 'csv'), `customFields` and
 *   `reasoningLanguage`.
 * @param {{ params: { projectId: string } }} context - Route context holding the project id.
 * @returns {Promise<NextResponse>} JSON response: `{ success, message, url }` on
 *   success; `{ error }` with status 400 (bad input / no data), 404 (unknown
 *   project) or 500 (upload or unexpected failure).
 */
export async function POST(request, { params }) {
  // Tracked outside the try block so the finally clause can remove the
  // directory on every exit path, including failed uploads.
  let tempDir = null;

  try {
    const projectId = params.projectId;
    const {
      token,
      datasetName,
      isPrivate,
      formatType,
      systemPrompt,
      confirmedOnly,
      includeCOT,
      fileFormat,
      customFields,
      reasoningLanguage
    } = await request.json();

    if (!token || !datasetName) {
      return NextResponse.json({ error: 'Missing HuggingFace token or dataset name' }, { status: 400 });
    }

    // fileFormat ends up inside a file-system path and selects the serializer,
    // so anything outside the known set is rejected up front: an unknown value
    // would otherwise write no file at all, and a value containing path
    // segments could make the upload step read a different file.
    const supportedFormats = ['json', 'jsonl', 'csv'];
    if (!supportedFormats.includes(fileFormat)) {
      return NextResponse.json({ error: `Unsupported file format: ${fileFormat}` }, { status: 400 });
    }

    // Load the project
    const project = await getProject(projectId);
    if (!project) {
      return NextResponse.json({ error: '项目不存在' }, { status: 404 });
    }

    // Load the dataset rows
    const questions = await getDatasets(projectId, confirmedOnly);
    if (!questions || questions.length === 0) {
      return NextResponse.json({ error: '没有可用的数据集问题' }, { status: 400 });
    }

    // Format the rows; reasoningLanguage is forwarded for the
    // 'multilingualthinking' format, which emits a reasoning_language field.
    const formattedData = formatDataset(
      questions,
      formatType,
      systemPrompt,
      includeCOT,
      customFields,
      reasoningLanguage
    );

    // Create the temporary working directory
    tempDir = path.join(os.tmpdir(), `hf-upload-${projectId}-${Date.now()}`);
    fs.mkdirSync(tempDir, { recursive: true });

    // Write the dataset file
    const datasetFileName = `dataset.${fileFormat}`;
    const datasetFilePath = path.join(tempDir, datasetFileName);
    let datasetContent;
    if (fileFormat === 'json') {
      datasetContent = JSON.stringify(formattedData, null, 2);
    } else if (fileFormat === 'jsonl') {
      datasetContent = formattedData.map(item => JSON.stringify(item)).join('\n');
    } else {
      datasetContent = convertToCSV(formattedData);
    }
    fs.writeFileSync(datasetFilePath, datasetContent);

    // Write the README.md file
    const readmePath = path.join(tempDir, 'README.md');
    const readmeContent = generateReadme(project.name, project.description, formatType);
    fs.writeFileSync(readmePath, readmeContent);

    try {
      const repo = { type: 'dataset', name: datasetName };

      // Check whether the repository already exists
      let repoExists = true;
      try {
        await checkRepoAccess({ repo, accessToken: token });
        console.log(`Repository ${datasetName} exists, continuing to upload files`);
      } catch (error) {
        if (error.statusCode === 404) {
          // 404 means the repository does not exist yet
          repoExists = false;
          console.log(`Repository ${datasetName} does not exist, preparing to create`);
        } else {
          // Other errors (e.g., permission errors)
          throw new Error(`Failed to check repository access: ${error.message}`, { cause: error });
        }
      }

      // Create the repository when it is missing
      if (!repoExists) {
        try {
          await createRepo({
            repo,
            accessToken: token,
            private: isPrivate,
            license: 'mit',
            description: project.description || 'Dataset created with Easy Dataset'
          });
          console.log(`Successfully created dataset repository: ${datasetName}`);
        } catch (error) {
          throw new Error(`Failed to create dataset repository: ${error.message}`, { cause: error });
        }
      }

      // Upload the dataset file, then the README.md
      await uploadFile(token, datasetName, datasetFilePath, datasetFileName);
      await uploadFile(token, datasetName, readmePath, 'README.md');
    } catch (error) {
      console.error('Upload to HuggingFace Failed:', String(error));
      return NextResponse.json({ error: `Upload Error: ${error.message}` }, { status: 500 });
    }

    const datasetUrl = `https://huggingface.co/datasets/${datasetName}`;
    return NextResponse.json({
      success: true,
      message: 'Upload successfully HuggingFace',
      url: datasetUrl
    });
  } catch (error) {
    console.error('Upload Faile:', String(error));
    return NextResponse.json({ error: error.message }, { status: 500 });
  } finally {
    // Best-effort cleanup: a failure here must not replace the response
    // that is already on its way out.
    if (tempDir) {
      try {
        fs.rmSync(tempDir, { recursive: true, force: true });
      } catch (cleanupError) {
        console.warn('Failed to remove temporary directory:', String(cleanupError));
      }
    }
  }
}
|
|
|
|
/**
 * Convert dataset rows into the requested export format.
 *
 * Supported `formatType` values:
 * - 'alpaca': `{ instruction, input, output }` plus `system` when a system prompt is given.
 * - 'sharegpt': `{ messages }` with optional system, then user and assistant entries.
 * - 'multilingualthinking': `{ messages }` whose first entry is a summary object
 *   (`reasoning_language`, `user`, `analysis`, `final`, optional `developer`),
 *   followed by optional system, user and assistant entries with a `thinking` field.
 * - 'custom' (only when `customFields` is provided): field names taken from `customFields`.
 * Any other value falls back to `{ instruction, output }` rows.
 *
 * @param {Array<Object>} questions - Rows; `question`, `answer`, `cot`, `labels` and `chunkId` are read.
 * @param {string} formatType - Target format name.
 * @param {string} [systemPrompt] - Optional system prompt to embed.
 * @param {boolean} [includeCOT] - Whether a row's `cot` is included when present.
 * @param {Object} [customFields] - For 'custom': `questionField`, `answerField`,
 *   `cotField`, `includeLabels`, `includeChunk`.
 * @param {string} [reasoningLanguage] - For 'multilingualthinking'; falls back to 'English' when empty.
 * @returns {Array<Object>} One formatted object per row.
 */
function formatDataset(questions, formatType, systemPrompt, includeCOT, customFields, reasoningLanguage) {
  // Chain of thought alone (or null), used where reasoning has its own field.
  const cotOrNull = q => (includeCOT && q.cot ? `${q.cot}` : null);
  // Answer prefixed with the chain of thought, used where both share one field.
  const answerWithCot = q => (includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer);

  if (formatType === 'alpaca') {
    return questions.map(q => {
      const item = {
        instruction: q.question,
        input: '',
        output: answerWithCot(q)
      };

      if (systemPrompt) {
        item.system = systemPrompt;
      }

      return item;
    });
  }

  if (formatType === 'sharegpt') {
    return questions.map(q => {
      const messages = [];

      if (systemPrompt) {
        messages.push({ role: 'system', content: systemPrompt });
      }

      messages.push({ role: 'user', content: q.question });
      messages.push({ role: 'assistant', content: answerWithCot(q) });

      return { messages };
    });
  }

  if (formatType === 'multilingualthinking') {
    return questions.map(q => {
      const messages = [];

      // Main message block. reasoningLanguage used to be read here without
      // being declared anywhere, which threw a ReferenceError for this format.
      const mainMsg = {
        reasoning_language: reasoningLanguage || 'English',
        user: q.question,
        analysis: cotOrNull(q),
        final: q.answer
      };
      if (systemPrompt) {
        mainMsg.developer = systemPrompt;
      }
      messages.push(mainMsg);

      // Optional system prompt
      if (systemPrompt) {
        messages.push({ role: 'system', content: systemPrompt, thinking: null });
      }

      // User message
      messages.push({ role: 'user', content: q.question, thinking: null });

      // Assistant message
      messages.push({ role: 'assistant', content: q.answer, thinking: cotOrNull(q) });

      return { messages };
    });
  }

  if (formatType === 'custom' && customFields) {
    return questions.map(q => {
      const item = {
        [customFields.questionField]: q.question,
        [customFields.answerField]: q.answer
      };

      if (includeCOT && q.cot) {
        item[customFields.cotField] = q.cot;
      }

      if (customFields.includeLabels && q.labels) {
        item.labels = q.labels;
      }

      if (customFields.includeChunk && q.chunkId) {
        item.chunkId = q.chunkId;
      }

      return item;
    });
  }

  // Fallback: reduced alpaca-style rows (no input/system fields)
  return questions.map(q => ({
    instruction: q.question,
    output: answerWithCot(q)
  }));
}
|
|
|
|
/**
 * Serialize an array of flat objects as CSV text.
 *
 * The header row is the union of the keys of all rows, in first-seen order, so
 * a field that only appears in later rows (for example an optional
 * chain-of-thought column) still gets its own column; rows lacking a field get
 * an empty cell. String values are always double-quoted with embedded quotes
 * doubled; arrays and objects are written as quoted JSON; null/undefined become
 * empty cells; every other value is written via String().
 *
 * @param {Array<Object>} data - Rows to serialize.
 * @returns {string} CSV text with '\n' line breaks, or '' for empty/missing input.
 */
function convertToCSV(data) {
  if (!data || data.length === 0) return '';

  // Collect every key that occurs in any row, keeping first-seen order.
  const headers = [];
  const seenHeaders = new Set();
  for (const item of data) {
    for (const key of Object.keys(item)) {
      if (!seenHeaders.has(key)) {
        seenHeaders.add(key);
        headers.push(key);
      }
    }
  }

  const quote = text => `"${text.replace(/"/g, '""')}"`;

  // Header names are only quoted when they contain a CSV-significant
  // character, so ordinary names are emitted unchanged.
  const formatHeader = header => (/[",\r\n]/.test(header) ? quote(header) : header);

  const formatCell = value => {
    if (value === null || value === undefined) return '';
    if (typeof value === 'string') return quote(value);
    // Arrays and plain objects are embedded as JSON text.
    if (typeof value === 'object') return quote(JSON.stringify(value));
    return String(value);
  };

  const headerRow = headers.map(formatHeader).join(',');
  const rows = data.map(item => headers.map(header => formatCell(item[header])).join(','));

  return [headerRow, ...rows].join('\n');
}
|
|
|
|
/**
 * Upload a single local file to a Hugging Face dataset repository using the
 * @huggingface/hub `uploadFiles` helper.
 *
 * @param {string} token - Hugging Face access token.
 * @param {string} datasetName - Name of the target dataset repository.
 * @param {string} filePath - Path of the local file to upload.
 * @param {string} destFileName - Path the file gets inside the repository.
 * @returns {Promise<{ success: boolean }>} `{ success: true }` once the upload finished.
 * @throws Re-throws whatever `uploadFiles` throws, after logging it.
 */
async function uploadFile(token, datasetName, filePath, destFileName) {
  try {
    const repo = { type: 'dataset', name: datasetName };

    // pathToFileURL percent-encodes characters such as '#', '?' and '%' and
    // handles platform-specific path syntax; building the URL by string
    // concatenation would mis-parse paths containing them.
    const fileUrl = pathToFileURL(filePath);

    await uploadFiles({
      repo,
      accessToken: token,
      files: [
        {
          path: destFileName,
          content: fileUrl
        }
      ],
      commitTitle: `Upload ${destFileName}`,
      commitDescription: `Files uploaded using Easy Dataset`
    });

    return { success: true };
  } catch (error) {
    console.error(`File ${destFileName} Upload Error:`, String(error));
    throw error;
  }
}
|
|
|
|
/**
 * Build the README.md text that accompanies an uploaded dataset.
 *
 * @param {string} projectName - Used as the top-level heading.
 * @param {string} [projectDescription] - Description section body; a generic
 *   sentence is used when it is empty.
 * @param {string} formatType - Format name mentioned in the Format section.
 * @returns {string} Markdown text ending with a blank line.
 */
function generateReadme(projectName, projectDescription, formatType) {
  const descriptionText = projectDescription || 'This dataset was created using the Easy Dataset tool.';

  const lines = [
    `# ${projectName}`,
    '',
    '## Description',
    `${descriptionText}`,
    '',
    '## Format',
    `This dataset is in ${formatType} format.`,
    '',
    '## Creation Method',
    'This dataset was created using the [Easy Dataset](https://github.com/ConardLi/easy-dataset) tool.',
    '',
    '> Easy Dataset is a specialized application designed to streamline the creation of fine-tuning datasets for Large Language Models (LLMs). It offers an intuitive interface for uploading domain-specific files, intelligently splitting content, generating questions, and producing high-quality training data for model fine-tuning.',
    '',
    // The final empty entry yields the trailing newline after the blank line.
    ''
  ];

  return lines.join('\n');
}
|