Files
YG_TDgenerator/config.py
DESKTOP-72TV0V4\caoxiaozhu 8088b59d30 1. 修改了问题表达
2. 缩减了表的数量
3. 可以选择生成多少个
2025-12-31 18:15:50 +08:00

270 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
配置文件
用于控制QA生成的各种参数
"""
from typing import List, Dict
class QAConfig:
    """Configuration container for QA generation.

    All settings are plain instance attributes assigned in ``__init__``.
    ``get_complexity_settings`` derives the template pools and counts that
    apply to the current ``COMPLEXITY_LEVEL``; ``update_config`` overrides
    existing settings by name; ``print_config`` prints a summary.
    """

    def __init__(self):
        """Populate every setting with its default value."""
        # ========== Basic settings ==========
        # Random seed (keeps results reproducible)
        self.RANDOM_SEED = 42
        # Input / output directories
        self.INPUT_DIR = "Data_Export_Json"
        self.OUTPUT_DIR = "Data_QA_Outputs"

        # ========== Random sampling ==========
        # Number of records randomly drawn from final.json to produce
        # the selected.json file
        self.SELECT_COUNT = 3000

        # ========== Question count control ==========
        # Basic number of questions generated per data item (simple mode)
        self.BASIC_QUESTIONS_PER_ITEM = 1
        # Maximum number of questions generated per data item (complex mode)
        self.MAX_QUESTIONS_PER_ITEM = 10
        # Share of multi-column questions (0.0-1.0):
        # 0.0 generates no multi-column questions, 1.0 makes every
        # question a multi-column question
        self.MULTI_COLUMN_RATIO = 0.3

        # ========== Complexity control ==========
        # Complexity level (1-5)
        # 1: simplest (basic QA only)
        # 3: medium (includes some multi-column QA)
        # 5: most complex (includes every QA type)
        self.COMPLEXITY_LEVEL = 3

        # ========== Question templates ==========
        # Question prefix pools; the pool in use depends on the complexity
        # level (see get_complexity_settings)
        self.QUESTION_PREFIXES_SIMPLE = [
            "请告诉我",
            "查询",
            "请问",
        ]
        self.QUESTION_PREFIXES_NORMAL = [
            "请告诉我",
            "查询",
            "请问",
            "",
            "请解释",
            "请输出",
            "请列举",
            "请说明",
            "请查找",
            "请确认",
        ]

        # ========== Answer templates ==========
        # Answer prefix pools
        self.ANSWER_PREFIXES_SIMPLE = [
            "根据表记录,该字段的",
            "查询结果显示,",
            "经查询,该字段的",
        ]
        self.ANSWER_PREFIXES_NORMAL = [
            "根据表记录,该字段的",
            "查询结果显示,",
            "经查询,该字段的",
            "根据数据库记录,",
            "在表中,此字段的",
            "查询结果:",
            "经系统查询,",
            "根据记录显示,",
            "在数据中,该字段的",
            "查询得知,该字段的",
        ]
        # Answer suffix pools
        # NOTE(review): the empty entries here and in the connector lists
        # below may be punctuation lost in an export step - confirm against
        # the repository copy of this file.
        self.ANSWER_SUFFIXES_SIMPLE = [
            "",
            "",
        ]
        self.ANSWER_SUFFIXES_NORMAL = [
            "",
            ",请参考。",
            ",详情如上。",
            ",以上信息为准。",
            ",望知悉。",
            ",如需更多信息请联系。",
            ",希望能帮到您。",
            ",祝您工作顺利。",
            ",谢谢。",
            "",
        ]

        # ========== Connectors ==========
        # Connector words used in multi-column queries
        self.CONNECTORS_SIMPLE = ["", ""]
        self.CONNECTORS_NORMAL = ["", "", "", "", ",还有", "以及"]

        # ========== Data files ==========
        # Data files to process; each entry carries a display name, the
        # input file, the output file and an enabled flag
        self.DATA_FILES: List[Dict] = [
            {
                "name": "元素治理模板",
                "file": "元素治理模板.json",
                "output": "元素治理模板_QA.json",
                "enabled": True,
            },
            {
                "name": "物理模型",
                "file": "物理模型.json",
                "output": "物理模型_QA.json",
                "enabled": True,
            },
            {
                "name": "逻辑模型",
                "file": "逻辑模型.json",
                "output": "逻辑模型_QA.json",
                "enabled": True,
            },
        ]

        # ========== Single-column question templates ==========
        # Complexity level -> number of single-column templates enabled
        self.SINGLE_COLUMN_TEMPLATES = {
            1: 3,   # simple: first 3 templates only
            2: 6,   # simple+: first 6 templates
            3: 9,   # medium: first 9 templates
            4: 12,  # complex: first 12 templates (all)
            5: 12,  # most complex: all templates
        }

        # ========== Multi-column question templates ==========
        # Complexity level -> number of multi-column templates enabled
        self.MULTI_COLUMN_TEMPLATES = {
            1: 0,  # simple: no multi-column questions
            2: 1,  # simple+: 1 multi-column template
            3: 3,  # medium: 3 multi-column templates
            4: 4,  # complex: 4 multi-column templates
            5: 5,  # most complex: all multi-column templates
        }

        # ========== Output control ==========
        # Whether to shuffle the order of the QA pairs
        self.SHUFFLE_OUTPUT = True
        # Whether to produce a QA generation report
        self.GENERATE_REPORT = True
        # Whether to show verbose logs
        self.VERBOSE_LOG = True

    def get_complexity_settings(self) -> Dict:
        """Return the settings that apply to the current complexity level.

        Levels up to 2 use the ``*_SIMPLE`` pools; higher levels use the
        ``*_NORMAL`` pools and ``MULTI_COLUMN_RATIO``.

        Returns:
            A dict with the keys ``question_prefixes``, ``answer_prefixes``,
            ``answer_suffixes``, ``connectors``, ``single_templates``,
            ``multi_templates`` and ``multi_ratio``.
        """
        level = self.COMPLEXITY_LEVEL
        if level <= 2:
            return {
                "question_prefixes": self.QUESTION_PREFIXES_SIMPLE,
                "answer_prefixes": self.ANSWER_PREFIXES_SIMPLE,
                "answer_suffixes": self.ANSWER_SUFFIXES_SIMPLE,
                "connectors": self.CONNECTORS_SIMPLE,
                "single_templates": self.SINGLE_COLUMN_TEMPLATES.get(level, 3),
                "multi_templates": self.MULTI_COLUMN_TEMPLATES.get(level, 0),
                # Levels at or below 1 enable no multi-column templates, so
                # the ratio must be 0.0 there as well (the lookup above
                # falls back to 0 templates for levels below 1).
                "multi_ratio": 0.0 if level <= 1 else 0.1,
            }
        return {
            "question_prefixes": self.QUESTION_PREFIXES_NORMAL,
            "answer_prefixes": self.ANSWER_PREFIXES_NORMAL,
            "answer_suffixes": self.ANSWER_SUFFIXES_NORMAL,
            "connectors": self.CONNECTORS_NORMAL,
            "single_templates": self.SINGLE_COLUMN_TEMPLATES.get(level, 9),
            "multi_templates": self.MULTI_COLUMN_TEMPLATES.get(level, 3),
            "multi_ratio": self.MULTI_COLUMN_RATIO,
        }

    def _is_config_key(self, key: str) -> bool:
        """Return True when *key* names an existing, overridable setting.

        Underscore-prefixed names and names that resolve to a callable on
        the class (i.e. methods) are rejected so that ``update_config``
        cannot replace behaviour or internals by accident.
        """
        if key.startswith("_") or not hasattr(self, key):
            return False
        return not callable(getattr(type(self), key, None))

    def update_config(self, **kwargs):
        """Override existing settings by keyword.

        Each accepted key is assigned and reported with a ``[CONFIG]``
        line; any other key is left untouched and reported with a
        ``[WARNING]`` line.

        Args:
            **kwargs: Setting names mapped to their new values.
        """
        for key, value in kwargs.items():
            if self._is_config_key(key):
                setattr(self, key, value)
                print(f"[CONFIG] 已更新 {key} = {value}")
            else:
                print(f"[WARNING] 未找到配置项: {key}")

    def print_config(self):
        """Print a summary of the current settings and the enabled files."""
        print("\n" + "="*60)
        print("当前QA生成配置")
        print("="*60)
        print(f"随机种子: {self.RANDOM_SEED}")
        print(f"输入目录: {self.INPUT_DIR}")
        print(f"输出目录: {self.OUTPUT_DIR}")
        print(f"复杂程度等级: {self.COMPLEXITY_LEVEL}")
        print(f"基本问题数量: {self.BASIC_QUESTIONS_PER_ITEM}")
        print(f"最大问题数量: {self.MAX_QUESTIONS_PER_ITEM}")
        print(f"多列查询占比: {self.MULTI_COLUMN_RATIO}")
        print(f"打乱输出: {self.SHUFFLE_OUTPUT}")
        print(f"生成报告: {self.GENERATE_REPORT}")
        print(f"详细日志: {self.VERBOSE_LOG}")
        print("\n启用处理的文件:")
        # Only entries whose "enabled" flag is truthy are listed.
        for file_info in self.DATA_FILES:
            if file_info["enabled"]:
                print(f"{file_info['name']} ({file_info['file']})")
        print("="*60 + "\n")
# Default configuration instance, built with every setting at its default
DEFAULT_CONFIG = QAConfig()
def create_custom_config(**kwargs) -> QAConfig:
    """Build a fresh ``QAConfig`` and apply keyword overrides to it.

    Args:
        **kwargs: Setting names mapped to new values; they are forwarded
            unchanged to ``QAConfig.update_config``.

    Returns:
        The newly created configuration instance.
    """
    custom = QAConfig()
    custom.update_config(**kwargs)
    return custom
# Configuration presets

# Simple preset: level 1, no multi-column questions, verbose logging off
SIMPLE_CONFIG = create_custom_config(
    COMPLEXITY_LEVEL=1,
    MULTI_COLUMN_RATIO=0.0,
    BASIC_QUESTIONS_PER_ITEM=1,
    MAX_QUESTIONS_PER_ITEM=3,
    VERBOSE_LOG=False,
)

# Normal preset: level 3 with a 0.3 multi-column ratio
NORMAL_CONFIG = create_custom_config(
    COMPLEXITY_LEVEL=3,
    MULTI_COLUMN_RATIO=0.3,
    BASIC_QUESTIONS_PER_ITEM=3,
    MAX_QUESTIONS_PER_ITEM=8,
)

# Complex preset: level 5 with a 0.5 multi-column ratio
COMPLEX_CONFIG = create_custom_config(
    COMPLEXITY_LEVEL=5,
    MULTI_COLUMN_RATIO=0.5,
    BASIC_QUESTIONS_PER_ITEM=5,
    MAX_QUESTIONS_PER_ITEM=10,
)
if __name__ == "__main__":
    # Demo: print the default configuration, then the settings derived
    # from its complexity level.
    config = QAConfig()
    config.print_config()

    settings = config.get_complexity_settings()
    print("\n根据当前复杂程度等级的设置:")

    # Pool sizes
    print(f"问句前缀数量: {len(settings['question_prefixes'])}")
    print(f"答句前缀数量: {len(settings['answer_prefixes'])}")
    print(f"答句后缀数量: {len(settings['answer_suffixes'])}")
    print(f"连接词数量: {len(settings['connectors'])}")

    # Template counts and the multi-column ratio
    print(f"单列模板数: {settings['single_templates']}")
    print(f"多列模板数: {settings['multi_templates']}")
    print(f"多列占比: {settings['multi_ratio']}")