easy-dataset-main/app/api/projects/%5BprojectId%5D/huggingface/upload/route.js

import { NextResponse } from 'next/server';
import { getProject } from '@/lib/db/projects';
import { getDatasets } from '@/lib/db/datasets';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { pathToFileURL } from 'url';
import { uploadFiles, createRepo, checkRepoAccess } from '@huggingface/hub';

// 上传数据集到 HuggingFace
export async function POST(request, { params }) {
  try {
    const projectId = params.projectId;
    const {
      token,
      datasetName,
      isPrivate,
      formatType,
      systemPrompt,
      confirmedOnly,
      includeCOT,
      fileFormat,
      customFields,
      reasoningLanguage
    } = await request.json();

    // 获取项目信息
    const project = await getProject(projectId);
    if (!project) {
      return NextResponse.json({ error: '项目不存在' }, { status: 404 });
    }

    // 获取数据集问题
    const questions = await getDatasets(projectId, confirmedOnly);
    if (!questions || questions.length === 0) {
      return NextResponse.json({ error: '没有可用的数据集问题' }, { status: 400 });
    }

    // 格式化数据集
    const formattedData = formatDataset(questions, formatType, systemPrompt, includeCOT, customFields);

    // 创建临时目录
    const tempDir = path.join(os.tmpdir(), `hf-upload-${projectId}-${Date.now()}`);
    fs.mkdirSync(tempDir, { recursive: true });

    // 创建数据集文件
    const datasetFilePath = path.join(tempDir, `dataset.${fileFormat}`);
    if (fileFormat === 'json') {
      fs.writeFileSync(datasetFilePath, JSON.stringify(formattedData, null, 2));
    } else if (fileFormat === 'jsonl') {
      const jsonlContent = formattedData.map(item => JSON.stringify(item)).join('\n');
      fs.writeFileSync(datasetFilePath, jsonlContent);
    } else if (fileFormat === 'csv') {
      const csvContent = convertToCSV(formattedData);
      fs.writeFileSync(datasetFilePath, csvContent);
    }

    // 创建 README.md 文件
    const readmePath = path.join(tempDir, 'README.md');
    const readmeContent = generateReadme(project.name, project.description, formatType);
    fs.writeFileSync(readmePath, readmeContent);

    // 使用 Hugging Face REST API 上传数据集
    const visibility = isPrivate ? 'private' : 'public';

    try {
      // 准备仓库配置
      const repo = { type: 'dataset', name: datasetName };

      // 检查仓库是否存在
      let repoExists = true;
      try {
        await checkRepoAccess({ repo, accessToken: token });
        console.log(`Repository ${datasetName} exists, continuing to upload files`);
      } catch (error) {
        // If error code is 404, the repository does not exist
        if (error.statusCode === 404) {
          repoExists = false;
          console.log(`Repository ${datasetName} does not exist, preparing to create`);
        } else {
          // Other errors (e.g., permission errors)
          throw new Error(`Failed to check repository access: ${error.message}`);
        }
      }

      // If the repository does not exist, create a new one
      if (!repoExists) {
        try {
          await createRepo({
            repo,
            accessToken: token,
            private: isPrivate,
            license: 'mit',
            description: project.description || 'Dataset created with Easy Dataset'
          });
          console.log(`Successfully created dataset repository: ${datasetName}`);
        } catch (error) {
          throw new Error(`Failed to create dataset repository: ${error.message}`);
        }
      }

      // 2. 上传数据集文件
      await uploadFile(token, datasetName, datasetFilePath, `dataset.${fileFormat}`);

      // 3. 上传 README.md
      await uploadFile(token, datasetName, readmePath, 'README.md');
    } catch (error) {
      console.error('Upload to HuggingFace Failed:', String(error));
      return NextResponse.json({ error: `Upload Error: ${error.message}` }, { status: 500 });
    }

    // 清理临时目录
    fs.rmSync(tempDir, { recursive: true, force: true });

    // 返回成功信息
    const datasetUrl = `https://huggingface.co/datasets/${datasetName}`;
    return NextResponse.json({
      success: true,
      message: 'Upload successfully HuggingFace',
      url: datasetUrl
    });
  } catch (error) {
    console.error('Upload Faile:', String(error));
    return NextResponse.json({ error: error.message }, { status: 500 });
  }
}

// 格式化数据集
function formatDataset(questions, formatType, systemPrompt, includeCOT, customFields) {
  if (formatType === 'alpaca') {
    return questions.map(q => {
      const item = {
        instruction: q.question,
        input: '',
        output: includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer
      };

      if (systemPrompt) {
        item.system = systemPrompt;
      }

      return item;
    });
  } else if (formatType === 'sharegpt') {
    return questions.map(q => {
      const messages = [];

      if (systemPrompt) {
        messages.push({
          role: 'system',
          content: systemPrompt
        });
      }

      messages.push({
        role: 'user',
        content: q.question
      });

      messages.push({
        role: 'assistant',
        content: includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer
      });

      return { messages };
    });
  } else if (formatType === 'multilingualthinking') {
    return questions.map(q => {
      const messages = [];

      // Main message block
      const mainMsg = {
        reasoning_language: reasoningLanguage ? reasoningLanguage : 'English',
        user: q.question,
        analysis: includeCOT && q.cot ? `${q.cot}` : null,
        final: q.answer
      };
      if (systemPrompt) {
        mainMsg.developer = systemPrompt;
      }
      messages.push(mainMsg);

      // Optional system prompt
      if (systemPrompt) {
        messages.push({
          role: 'system',
          content: systemPrompt,
          thinking: null
        });
      }

      // User message
      messages.push({
        role: 'user',
        content: q.question,
        thinking: null
      });

      // Assistant message
      messages.push({
        role: 'assistant',
        content: q.answer,
        thinking: includeCOT && q.cot ? `${q.cot}` : null
      });

      return { messages };
    });
  } else if (formatType === 'custom' && customFields) {
    return questions.map(q => {
      const item = {
        [customFields.questionField]: q.question,
        [customFields.answerField]: q.answer
      };

      if (includeCOT && q.cot) {
        item[customFields.cotField] = q.cot;
      }

      if (customFields.includeLabels && q.labels) {
        item.labels = q.labels;
      }

      if (customFields.includeChunk && q.chunkId) {
        item.chunkId = q.chunkId;
      }

      return item;
    });
  }

  // 默认返回 alpaca 格式
  return questions.map(q => ({
    instruction: q.question,
    output: includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer
  }));
}

// 将数据转换为 CSV 格式
function convertToCSV(data) {
  if (!data || data.length === 0) return '';

  const headers = Object.keys(data[0]);
  const headerRow = headers.join(',');

  const rows = data.map(item => {
    return headers
      .map(header => {
        const value = item[header];
        if (typeof value === 'string') {
          // 处理字符串中的逗号和引号
          return `"${value.replace(/"/g, '""')}"`;
        } else if (Array.isArray(value)) {
          return `"${JSON.stringify(value).replace(/"/g, '""')}"`;
        } else if (typeof value === 'object' && value !== null) {
          return `"${JSON.stringify(value).replace(/"/g, '""')}"`;
        }
        return value;
      })
      .join(',');
  });

  return [headerRow, ...rows].join('\n');
}

// 使用 @huggingface/hub 包上传文件到 HuggingFace
async function uploadFile(token, datasetName, filePath, destFileName) {
  try {
    // 准备仓库配置
    const repo = { type: 'dataset', name: datasetName };

    // 创建文件 URL
    const fileUrl = new URL(`file://${filePath}`);

    // 使用 @huggingface/hub 包上传文件
    await uploadFiles({
      repo,
      accessToken: token,
      files: [
        {
          path: destFileName,
          content: fileUrl
        }
      ],
      commitTitle: `Upload ${destFileName}`,
      commitDescription: `Files uploaded using Easy Dataset`
    });

    return { success: true };
  } catch (error) {
    console.error(`File ${destFileName} Upload Error:`, String(error));
    throw error;
  }
}

// Generate README.md file
function generateReadme(projectName, projectDescription, formatType) {
  return `# ${projectName}

## Description
${projectDescription || 'This dataset was created using the Easy Dataset tool.'}

## Format
This dataset is in ${formatType} format.

## Creation Method
This dataset was created using the [Easy Dataset](https://github.com/ConardLi/easy-dataset) tool.

> Easy Dataset is a specialized application designed to streamline the creation of fine-tuning datasets for Large Language Models (LLMs). It offers an intuitive interface for uploading domain-specific files, intelligently splitting content, generating questions, and producing high-quality training data for model fine-tuning.

`;
}
first-update 2026-03-17 14:36:31 +08:00			`import { NextResponse } from 'next/server';`
			`import { getProject } from '@/lib/db/projects';`
			`import { getDatasets } from '@/lib/db/datasets';`
			`import fs from 'fs';`
			`import path from 'path';`
			`import os from 'os';`
			`import { uploadFiles, createRepo, checkRepoAccess } from '@huggingface/hub';`

			`// 上传数据集到 HuggingFace`
			`export async function POST(request, { params }) {`
			`try {`
			`const projectId = params.projectId;`
			`const {`
			`token,`
			`datasetName,`
			`isPrivate,`
			`formatType,`
			`systemPrompt,`
			`confirmedOnly,`
			`includeCOT,`
			`fileFormat,`
			`customFields,`
			`reasoningLanguage`
			`} = await request.json();`

			`// 获取项目信息`
			`const project = await getProject(projectId);`
			`if (!project) {`
			`return NextResponse.json({ error: '项目不存在' }, { status: 404 });`
			`}`

			`// 获取数据集问题`
			`const questions = await getDatasets(projectId, confirmedOnly);`
			`if (!questions \|\| questions.length === 0) {`
			`return NextResponse.json({ error: '没有可用的数据集问题' }, { status: 400 });`
			`}`

			`// 格式化数据集`
			`const formattedData = formatDataset(questions, formatType, systemPrompt, includeCOT, customFields);`

			`// 创建临时目录`
			const tempDir = path.join(os.tmpdir(), `hf-upload-${projectId}-${Date.now()}`);
			`fs.mkdirSync(tempDir, { recursive: true });`

			`// 创建数据集文件`
			const datasetFilePath = path.join(tempDir, `dataset.${fileFormat}`);
			`if (fileFormat === 'json') {`
			`fs.writeFileSync(datasetFilePath, JSON.stringify(formattedData, null, 2));`
			`} else if (fileFormat === 'jsonl') {`
			`const jsonlContent = formattedData.map(item => JSON.stringify(item)).join('\n');`
			`fs.writeFileSync(datasetFilePath, jsonlContent);`
			`} else if (fileFormat === 'csv') {`
			`const csvContent = convertToCSV(formattedData);`
			`fs.writeFileSync(datasetFilePath, csvContent);`
			`}`

			`// 创建 README.md 文件`
			`const readmePath = path.join(tempDir, 'README.md');`
			`const readmeContent = generateReadme(project.name, project.description, formatType);`
			`fs.writeFileSync(readmePath, readmeContent);`

			`// 使用 Hugging Face REST API 上传数据集`
			`const visibility = isPrivate ? 'private' : 'public';`

			`try {`
			`// 准备仓库配置`
			`const repo = { type: 'dataset', name: datasetName };`

			`// 检查仓库是否存在`
			`let repoExists = true;`
			`try {`
			`await checkRepoAccess({ repo, accessToken: token });`
			console.log(`Repository ${datasetName} exists, continuing to upload files`);
			`} catch (error) {`
			`// If error code is 404, the repository does not exist`
			`if (error.statusCode === 404) {`
			`repoExists = false;`
			console.log(`Repository ${datasetName} does not exist, preparing to create`);
			`} else {`
			`// Other errors (e.g., permission errors)`
			throw new Error(`Failed to check repository access: ${error.message}`);
			`}`
			`}`

			`// If the repository does not exist, create a new one`
			`if (!repoExists) {`
			`try {`
			`await createRepo({`
			`repo,`
			`accessToken: token,`
			`private: isPrivate,`
			`license: 'mit',`
			`description: project.description \|\| 'Dataset created with Easy Dataset'`
			`});`
			console.log(`Successfully created dataset repository: ${datasetName}`);
			`} catch (error) {`
			throw new Error(`Failed to create dataset repository: ${error.message}`);
			`}`
			`}`

			`// 2. 上传数据集文件`
			await uploadFile(token, datasetName, datasetFilePath, `dataset.${fileFormat}`);

			`// 3. 上传 README.md`
			`await uploadFile(token, datasetName, readmePath, 'README.md');`
			`} catch (error) {`
			`console.error('Upload to HuggingFace Failed:', String(error));`
			return NextResponse.json({ error: `Upload Error: ${error.message}` }, { status: 500 });
			`}`

			`// 清理临时目录`
			`fs.rmSync(tempDir, { recursive: true, force: true });`

			`// 返回成功信息`
			const datasetUrl = `https://huggingface.co/datasets/${datasetName}`;
			`return NextResponse.json({`
			`success: true,`
			`message: 'Upload successfully HuggingFace',`
			`url: datasetUrl`
			`});`
			`} catch (error) {`
			`console.error('Upload Faile:', String(error));`
			`return NextResponse.json({ error: error.message }, { status: 500 });`
			`}`
			`}`

			`// 格式化数据集`
			`function formatDataset(questions, formatType, systemPrompt, includeCOT, customFields) {`
			`if (formatType === 'alpaca') {`
			`return questions.map(q => {`
			`const item = {`
			`instruction: q.question,`
			`input: '',`
			output: includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer
			`};`

			`if (systemPrompt) {`
			`item.system = systemPrompt;`
			`}`

			`return item;`
			`});`
			`} else if (formatType === 'sharegpt') {`
			`return questions.map(q => {`
			`const messages = [];`

			`if (systemPrompt) {`
			`messages.push({`
			`role: 'system',`
			`content: systemPrompt`
			`});`
			`}`

			`messages.push({`
			`role: 'user',`
			`content: q.question`
			`});`

			`messages.push({`
			`role: 'assistant',`
			content: includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer
			`});`

			`return { messages };`
			`});`
			`} else if (formatType === 'multilingualthinking') {`
			`return questions.map(q => {`
			`const messages = [];`

			`// Main message block`
			`const mainMsg = {`
			`reasoning_language: reasoningLanguage ? reasoningLanguage : 'English',`
			`user: q.question,`
			analysis: includeCOT && q.cot ? `${q.cot}` : null,
			`final: q.answer`
			`};`
			`if (systemPrompt) {`
			`mainMsg.developer = systemPrompt;`
			`}`
			`messages.push(mainMsg);`

			`// Optional system prompt`
			`if (systemPrompt) {`
			`messages.push({`
			`role: 'system',`
			`content: systemPrompt,`
			`thinking: null`
			`});`
			`}`

			`// User message`
			`messages.push({`
			`role: 'user',`
			`content: q.question,`
			`thinking: null`
			`});`

			`// Assistant message`
			`messages.push({`
			`role: 'assistant',`
			`content: q.answer,`
			thinking: includeCOT && q.cot ? `${q.cot}` : null
			`});`

			`return { messages };`
			`});`
			`} else if (formatType === 'custom' && customFields) {`
			`return questions.map(q => {`
			`const item = {`
			`[customFields.questionField]: q.question,`
			`[customFields.answerField]: q.answer`
			`};`

			`if (includeCOT && q.cot) {`
			`item[customFields.cotField] = q.cot;`
			`}`

			`if (customFields.includeLabels && q.labels) {`
			`item.labels = q.labels;`
			`}`

			`if (customFields.includeChunk && q.chunkId) {`
			`item.chunkId = q.chunkId;`
			`}`

			`return item;`
			`});`
			`}`

			`// 默认返回 alpaca 格式`
			`return questions.map(q => ({`
			`instruction: q.question,`
			output: includeCOT && q.cot ? `${q.cot}\n\n${q.answer}` : q.answer
			`}));`
			`}`

			`// 将数据转换为 CSV 格式`
			`function convertToCSV(data) {`
			`if (!data \|\| data.length === 0) return '';`

			`const headers = Object.keys(data[0]);`
			`const headerRow = headers.join(',');`

			`const rows = data.map(item => {`
			`return headers`
			`.map(header => {`
			`const value = item[header];`
			`if (typeof value === 'string') {`
			`// 处理字符串中的逗号和引号`
			return `"${value.replace(/"/g, '""')}"`;
			`} else if (Array.isArray(value)) {`
			return `"${JSON.stringify(value).replace(/"/g, '""')}"`;
			`} else if (typeof value === 'object' && value !== null) {`
			return `"${JSON.stringify(value).replace(/"/g, '""')}"`;
			`}`
			`return value;`
			`})`
			`.join(',');`
			`});`

			`return [headerRow, ...rows].join('\n');`
			`}`

			`// 使用 @huggingface/hub 包上传文件到 HuggingFace`
			`async function uploadFile(token, datasetName, filePath, destFileName) {`
			`try {`
			`// 准备仓库配置`
			`const repo = { type: 'dataset', name: datasetName };`

			`// 创建文件 URL`
			const fileUrl = new URL(`file://${filePath}`);

			`// 使用 @huggingface/hub 包上传文件`
			`await uploadFiles({`
			`repo,`
			`accessToken: token,`
			`files: [`
			`{`
			`path: destFileName,`
			`content: fileUrl`
			`}`
			`],`
			commitTitle: `Upload ${destFileName}`,
			commitDescription: `Files uploaded using Easy Dataset`
			`});`

			`return { success: true };`
			`} catch (error) {`
			console.error(`File ${destFileName} Upload Error:`, String(error));
			`throw error;`
			`}`
			`}`

			`// Generate README.md file`
			`function generateReadme(projectName, projectDescription, formatType) {`
			return `# ${projectName}

			`## Description`
			`${projectDescription \|\| 'This dataset was created using the Easy Dataset tool.'}`

			`## Format`
			`This dataset is in ${formatType} format.`

			`## Creation Method`
			`This dataset was created using the [Easy Dataset](https://github.com/ConardLi/easy-dataset) tool.`

			`> Easy Dataset is a specialized application designed to streamline the creation of fine-tuning datasets for Large Language Models (LLMs). It offers an intuitive interface for uploading domain-specific files, intelligently splitting content, generating questions, and producing high-quality training data for model fine-tuning.`

			`;
			`}`