Files
YG_TDgenerator/qa_generator.py
DESKTOP-72TV0V4\caoxiaozhu 8088b59d30 1. 修改了问题表达
2. 缩减了表的数量
3. 可以选择生成多少个
2025-12-31 18:15:50 +08:00

289 lines
11 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
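"""
QA generator - simplified version.
Generates question/answer pairs from the selected.json file.
Questions are keyed on each field's Chinese name (字段中文名) and English name
(字段英文名); the code in this file does not read the abstract Chinese name
(抽象中文名).
"""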
import json
import os
import random
from typing import List, Dict, Any
from config import QAConfig
class QAGenerator:
    """Generate instruct/input/output QA pairs from field-metadata records.

    Each record is a dict read from ``selected.json``. Questions are keyed on
    the record's Chinese field name (``字段中文名``) and English field name
    (``字段英文名``) and ask about the record's other attributes.
    """

    def __init__(self, config: "QAConfig | None" = None):
        """Set up the generator.

        Creates the configured output directory if needed and seeds the
        module-level ``random`` generator with ``config.RANDOM_SEED``.

        Args:
            config: Configuration object; a default ``QAConfig()`` is built
                when omitted. The annotation is quoted so that it is not
                evaluated at definition time.
        """
        self.config = config or QAConfig()
        os.makedirs(self.config.OUTPUT_DIR, exist_ok=True)
        random.seed(self.config.RANDOM_SEED)

        # Question template prefixes (not referenced by the methods below).
        self.QUESTION_PREFIXES = [
            "请告诉我",
            "查询",
            "请问",
            "请解释",
            "请输出",
            "请列举",
            "请说明",
            "请查找",
            "请确认",
        ]

        # Answer template prefixes; one is picked at random per answer.
        self.ANSWER_PREFIXES = [
            "该字段的",
            "查询结果显示,",
            "经查询,该字段的",
            "根据记录显示,",
            "该数据的",
            "查询结果:",
            "经系统查询,",
            "根据记录,",
            "该值的",
        ]

        # Answer template suffixes. Both entries are empty, but a random pick
        # is still made per answer, which keeps the RNG sequence (and thus the
        # seeded output) stable.
        self.ANSWER_SUFFIXES = [
            "",
            "",
        ]

    def get_random_element(self, elements: List[str]) -> str:
        """Return a random element of ``elements``, or ``""`` if it is empty."""
        return random.choice(elements) if elements else ""

    def load_json(self, file_path: str) -> List[Dict]:
        """Read ``file_path`` as UTF-8 and return the parsed JSON content."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _make_pair(self, question: str, answer: str) -> Dict:
        """Wrap a question/answer into one record with a random prefix/suffix."""
        # Prefix is drawn before suffix, matching the original evaluation order.
        prefix = self.get_random_element(self.ANSWER_PREFIXES)
        suffix = self.get_random_element(self.ANSWER_SUFFIXES)
        return {
            "instruct": question,
            "input": "",
            "output": f"{prefix}{answer}{suffix}",
        }

    def _pairs_for_subject(
        self,
        item: Dict,
        subject: str,
        numeric_keys: List[str],
        other_label: str,
        other_value: str,
    ) -> List[Dict]:
        """Build every QA pair that uses ``subject`` as the question's key phrase.

        Args:
            item: The metadata record being described.
            subject: Leading phrase of each question, e.g. ``字段中文名为'X'``.
            numeric_keys: Record keys asked as "how many" questions; a key is
                skipped only when its value is missing or ``None`` (so ``0``
                still produces a pair).
            other_label: Label of the counterpart name asked about last.
            other_value: Counterpart name; the pair is skipped when empty.
        """
        pairs = []

        # Text-valued attributes: skipped when missing or falsy. The value is
        # wrapped in a balanced 「…」 pair.
        if item.get('值类型'):
            pairs.append(self._make_pair(
                f"{subject}的值类型是什么?",
                f"值类型为「{item['值类型']}」",
            ))
        if item.get('是否枚举'):
            pairs.append(self._make_pair(
                f"{subject}是否枚举?",
                f"是否枚举为「{item['是否枚举']}」",
            ))

        # Numeric attributes are emitted without brackets.
        for key in numeric_keys:
            if item.get(key) is not None:
                pairs.append(self._make_pair(
                    f"{subject}的{key}是多少?",
                    f"{key}为{item[key]}",
                ))

        if other_value:
            pairs.append(self._make_pair(
                f"{subject}的{other_label}是什么?",
                f"{other_label}为「{other_value}」",
            ))
        return pairs

    def generate_qa_for_item(self, item: Dict) -> List[Dict]:
        """Generate the QA pairs for a single record.

        Questions keyed on the Chinese field name come first, followed by
        questions keyed on the English field name. A name that is missing or
        empty contributes no questions.

        Args:
            item: One metadata record.

        Returns:
            A list of dicts with the keys ``instruct``, ``input`` and
            ``output``.
        """
        qa_pairs = []
        field_chinese_name = item.get('字段中文名', '')
        field_english_name = item.get('字段英文名', '')

        if field_chinese_name:
            qa_pairs.extend(self._pairs_for_subject(
                item,
                f"字段中文名为'{field_chinese_name}'",
                ['枚举数量', '总长度', '小数位'],
                "字段英文名",
                field_english_name,
            ))

        if field_english_name:
            # NOTE(review): the English-name questions do not ask about
            # 枚举数量, unlike the Chinese-name ones. Kept as in the original;
            # confirm whether the omission is intentional.
            qa_pairs.extend(self._pairs_for_subject(
                item,
                f"字段英文名为'{field_english_name}'",
                ['总长度', '小数位'],
                "字段中文名",
                field_chinese_name,
            ))

        return qa_pairs

    def generate_qa_for_data(self, data: List[Dict]) -> List[Dict]:
        """Generate QA pairs for every record in ``data``, in input order."""
        all_qa = []
        for item in data:
            all_qa.extend(self.generate_qa_for_item(item))
        return all_qa

    def shuffle_qa_pairs(self, qa_pairs: List[Dict]) -> List[Dict]:
        """Shuffle ``qa_pairs`` in place when ``config.SHUFFLE_OUTPUT`` is set.

        Returns the same list object that was passed in.
        """
        if self.config.SHUFFLE_OUTPUT:
            random.shuffle(qa_pairs)
        return qa_pairs

    def save_qa(self, qa_pairs: List[Dict], filename: str):
        """Write ``qa_pairs`` as JSON to ``filename`` in the output directory.

        Prints the output path, the number of pairs and the file size.
        """
        output_path = os.path.join(self.config.OUTPUT_DIR, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(qa_pairs, f, ensure_ascii=False, indent=2)
        size_kb = os.path.getsize(output_path) / 1024
        print(f"[OK] 已生成: {output_path}")
        print(f"{len(qa_pairs)} 条问答对, {size_kb:.1f} KB")

    def generate_report(self, total_qa_count: int):
        """Write a small JSON report describing this generation run.

        Args:
            total_qa_count: Number of QA pairs that were generated.
        """
        from datetime import date

        report = {
            # Record the actual run date instead of a hard-coded one; the
            # ISO format matches the previous "YYYY-MM-DD" value.
            "生成时间": date.today().isoformat(),
            "版本": "简化版",
            "输入文件": "selected.json",
            "输出目录": self.config.OUTPUT_DIR,
            "随机种子": self.config.RANDOM_SEED,
            "总问答对数量": total_qa_count,
            "说明": "基于字段中文名、字段英文名、抽象中文名询问其他所有字段"
        }
        report_path = os.path.join(self.config.OUTPUT_DIR, "QA生成报告.json")
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"[OK] 已生成: {report_path}")

    def process_selected_json(self):
        """Run the full pipeline on ``selected.json`` in the input directory.

        Loads the records, generates and optionally shuffles the QA pairs,
        saves them to ``selected_QA.json`` and writes the report. Prints an
        error and returns early when the input file does not exist.
        """
        input_file = os.path.join(self.config.INPUT_DIR, "selected.json")
        if not os.path.exists(input_file):
            print(f"[ERROR] 文件不存在: {input_file}")
            return

        print("="*60)
        print("QA生成器 - 简化版")
        print("="*60)
        print(f"\n[INFO] 加载数据: {input_file}")
        try:
            data = self.load_json(input_file)
            print(f" 数据记录: {len(data)}")

            print("\n[INFO] 生成问答对...")
            qa_pairs = self.generate_qa_for_data(data)
            print(f" 生成数量: {len(qa_pairs)}")

            print("\n[INFO] 打乱顺序...")
            qa_pairs = self.shuffle_qa_pairs(qa_pairs)

            print("\n[INFO] 保存文件...")
            self.save_qa(qa_pairs, "selected_QA.json")

            print("\n[INFO] 生成报告...")
            self.generate_report(len(qa_pairs))

            print("\n[DONE] 处理完成!")
            print(f"[OUT] 输出目录: {self.config.OUTPUT_DIR}")
            print(f"[TOTAL] 总计生成: {len(qa_pairs)} 条问答对")
        except Exception as e:
            # Top-level boundary of the script: report the failure with a
            # full traceback rather than propagating it.
            print(f"[ERROR] 处理文件时出错: {str(e)}")
            import traceback
            traceback.print_exc()
def main():
    """Script entry point: run the selected.json pipeline with the default configuration."""
    QAGenerator(QAConfig()).process_selected_json()


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()