Files
YG_TDgenerator/config.py

270 lines
8.8 KiB
Python
Raw Normal View History

2025-12-18 16:16:12 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
配置文件
用于控制QA生成的各种参数
"""
from typing import List, Dict
class QAConfig:
    """Container for every tunable parameter of the QA generator.

    All settings are plain instance attributes assigned in ``__init__``, so
    each instance owns independent copies of the template lists and dicts.
    """

    def __init__(self):
        """Populate the instance with the default settings."""
        # ========== Basic settings ==========
        # Random seed (for reproducible results).
        self.RANDOM_SEED = 42
        # Input / output directories.
        self.INPUT_DIR = "Data_Export_Json"
        self.OUTPUT_DIR = "Data_QA_Outputs"

        # ========== Random sampling ==========
        # Number of records randomly drawn from final.json to build
        # selected.json.
        self.SELECT_COUNT = 3000

        # ========== Question count control ==========
        # Basic number of questions generated per data item (simple mode).
        self.BASIC_QUESTIONS_PER_ITEM = 1
        # Maximum number of questions generated per data item (complex mode).
        self.MAX_QUESTIONS_PER_ITEM = 10
        # Share of multi-column questions, 0.0-1.0.
        # 0.0 means no multi-column questions; 1.0 means every question is a
        # multi-column question.
        self.MULTI_COLUMN_RATIO = 0.3

        # ========== Complexity control ==========
        # Complexity level, 1-5:
        #   1: simplest (basic Q&A only)
        #   3: medium (includes some multi-column Q&A)
        #   5: most complex (includes every kind of Q&A)
        self.COMPLEXITY_LEVEL = 3

        # ========== Question templates ==========
        # Question prefixes (how many are used depends on the complexity).
        self.QUESTION_PREFIXES_SIMPLE = [
            "请告诉我",
            "查询",
            "请问",
        ]
        self.QUESTION_PREFIXES_NORMAL = [
            "请告诉我",
            "查询",
            "请问",
            "",
            "请解释",
            "请输出",
            "请列举",
            "请说明",
            "请查找",
            "请确认",
        ]

        # ========== Answer templates ==========
        # Answer prefixes.
        self.ANSWER_PREFIXES_SIMPLE = [
            "根据表记录,该字段的",
            "查询结果显示,",
            "经查询,该字段的",
        ]
        self.ANSWER_PREFIXES_NORMAL = [
            "根据表记录,该字段的",
            "查询结果显示,",
            "经查询,该字段的",
            "根据数据库记录,",
            "在表中,此字段的",
            "查询结果:",
            "经系统查询,",
            "根据记录显示,",
            "在数据中,该字段的",
            "查询得知,该字段的",
        ]
        # Answer suffixes.
        # NOTE(review): several entries in the suffix and connector lists are
        # empty or duplicated empty strings; they may have held punctuation
        # that was lost along the way -- confirm against the upstream file.
        self.ANSWER_SUFFIXES_SIMPLE = [
            "",
            "",
        ]
        self.ANSWER_SUFFIXES_NORMAL = [
            "",
            ",请参考。",
            ",详情如上。",
            ",以上信息为准。",
            ",望知悉。",
            ",如需更多信息请联系。",
            ",希望能帮到您。",
            ",祝您工作顺利。",
            ",谢谢。",
            "",
        ]

        # ========== Connectors ==========
        # Connectors used in multi-column questions.
        self.CONNECTORS_SIMPLE = ["", ""]
        self.CONNECTORS_NORMAL = ["", "", "", "", ",还有", "以及"]

        # ========== Data files ==========
        # Data files to process.
        self.DATA_FILES = [
            {
                "name": "元素治理模板",
                "file": "元素治理模板.json",
                "output": "元素治理模板_QA.json",
                "enabled": True,
            },
            {
                "name": "物理模型",
                "file": "物理模型.json",
                "output": "物理模型_QA.json",
                "enabled": True,
            },
            {
                "name": "逻辑模型",
                "file": "逻辑模型.json",
                "output": "逻辑模型_QA.json",
                "enabled": True,
            },
        ]

        # ========== Single-column question templates ==========
        # Complexity level -> how many leading templates are enabled.
        self.SINGLE_COLUMN_TEMPLATES = {
            1: 3,   # simple: first 3 templates only
            2: 6,   # simple+: first 6 templates
            3: 9,   # medium: first 9 templates
            4: 12,  # complex: first 12 templates (all)
            5: 12,  # most complex: all templates
        }

        # ========== Multi-column question templates ==========
        # Complexity level -> how many multi-column templates are enabled.
        self.MULTI_COLUMN_TEMPLATES = {
            1: 0,  # simple: no multi-column questions
            2: 1,  # simple+: 1 multi-column template
            3: 3,  # medium: 3 multi-column templates
            4: 4,  # complex: 4 multi-column templates
            5: 5,  # most complex: all multi-column templates
        }

        # ========== Output control ==========
        # Whether to shuffle the order of the Q&A pairs.
        self.SHUFFLE_OUTPUT = False
        # Whether to produce a QA generation report.
        self.GENERATE_REPORT = True
        # Whether to show verbose logs.
        self.VERBOSE_LOG = True

    def get_complexity_settings(self) -> Dict:
        """Return the template settings matching ``COMPLEXITY_LEVEL``.

        The level is clamped to the documented 1-5 range first, so an
        out-of-range value behaves like the nearest valid level instead of
        falling through to inconsistent fallback values.

        Returns:
            A dict with the keys ``question_prefixes``, ``answer_prefixes``,
            ``answer_suffixes``, ``connectors``, ``single_templates``,
            ``multi_templates`` and ``multi_ratio``. The list values are the
            instance's own lists, not copies.
        """
        level = min(max(self.COMPLEXITY_LEVEL, 1), 5)

        if level <= 2:
            # Levels 1-2 use the reduced template sets and a fixed
            # multi-column ratio; MULTI_COLUMN_RATIO is not consulted here.
            return {
                "question_prefixes": self.QUESTION_PREFIXES_SIMPLE,
                "answer_prefixes": self.ANSWER_PREFIXES_SIMPLE,
                "answer_suffixes": self.ANSWER_SUFFIXES_SIMPLE,
                "connectors": self.CONNECTORS_SIMPLE,
                "single_templates": self.SINGLE_COLUMN_TEMPLATES.get(level, 3),
                "multi_templates": self.MULTI_COLUMN_TEMPLATES.get(level, 0),
                "multi_ratio": 0.0 if level == 1 else 0.1,
            }

        return {
            "question_prefixes": self.QUESTION_PREFIXES_NORMAL,
            "answer_prefixes": self.ANSWER_PREFIXES_NORMAL,
            "answer_suffixes": self.ANSWER_SUFFIXES_NORMAL,
            "connectors": self.CONNECTORS_NORMAL,
            "single_templates": self.SINGLE_COLUMN_TEMPLATES.get(level, 9),
            "multi_templates": self.MULTI_COLUMN_TEMPLATES.get(level, 3),
            "multi_ratio": self.MULTI_COLUMN_RATIO,
        }

    def update_config(self, **kwargs) -> None:
        """Override existing settings by keyword and log each outcome.

        Only existing, public, non-callable attributes are accepted. Method
        names and underscore-prefixed attributes are rejected with the same
        warning as unknown keys, so a stray keyword cannot replace a method
        or interpreter-managed attribute on the instance.
        """
        for key, value in kwargs.items():
            is_setting = (
                not key.startswith("_")
                and hasattr(self, key)
                and not callable(getattr(self, key))
            )
            if is_setting:
                setattr(self, key, value)
                print(f"[CONFIG] 已更新 {key} = {value}")
            else:
                print(f"[WARNING] 未找到配置项: {key}")

    def print_config(self) -> None:
        """Print the current scalar settings and the enabled data files."""
        print("\n" + "="*60)
        print("当前QA生成配置")
        print("="*60)
        print(f"随机种子: {self.RANDOM_SEED}")
        print(f"输入目录: {self.INPUT_DIR}")
        print(f"输出目录: {self.OUTPUT_DIR}")
        # SELECT_COUNT was the only scalar setting missing from this summary.
        print(f"随机抽取数量: {self.SELECT_COUNT}")
        print(f"复杂程度等级: {self.COMPLEXITY_LEVEL}")
        print(f"基本问题数量: {self.BASIC_QUESTIONS_PER_ITEM}")
        print(f"最大问题数量: {self.MAX_QUESTIONS_PER_ITEM}")
        print(f"多列查询占比: {self.MULTI_COLUMN_RATIO}")
        print(f"打乱输出: {self.SHUFFLE_OUTPUT}")
        print(f"生成报告: {self.GENERATE_REPORT}")
        print(f"详细日志: {self.VERBOSE_LOG}")
        print("\n启用处理的文件:")
        for file_info in self.DATA_FILES:
            if file_info["enabled"]:
                print(f"{file_info['name']} ({file_info['file']})")
        print("="*60 + "\n")
# Default configuration instance, created with the stock QAConfig settings.
DEFAULT_CONFIG = QAConfig()
def create_custom_config(**kwargs) -> QAConfig:
    """Build a fresh QAConfig and apply the given keyword overrides to it.

    The overrides are forwarded unchanged to ``QAConfig.update_config``.
    """
    custom = QAConfig()
    custom.update_config(**kwargs)
    return custom
# Configuration presets.

# Level 1: no multi-column questions, verbose logging switched off.
SIMPLE_CONFIG = create_custom_config(
    COMPLEXITY_LEVEL=1,
    MULTI_COLUMN_RATIO=0.0,
    BASIC_QUESTIONS_PER_ITEM=1,
    MAX_QUESTIONS_PER_ITEM=3,
    VERBOSE_LOG=False,
)

# Level 3: multi-column ratio 0.3.
NORMAL_CONFIG = create_custom_config(
    COMPLEXITY_LEVEL=3,
    MULTI_COLUMN_RATIO=0.3,
    BASIC_QUESTIONS_PER_ITEM=3,
    MAX_QUESTIONS_PER_ITEM=8,
)

# Level 5: multi-column ratio 0.5.
COMPLEX_CONFIG = create_custom_config(
    COMPLEXITY_LEVEL=5,
    MULTI_COLUMN_RATIO=0.5,
    BASIC_QUESTIONS_PER_ITEM=5,
    MAX_QUESTIONS_PER_ITEM=10,
)
if __name__ == "__main__":
    # Demo: print the default configuration, then summarise the settings
    # derived from its complexity level.
    demo_config = QAConfig()
    demo_config.print_config()

    level_settings = demo_config.get_complexity_settings()
    question_prefix_count = len(level_settings['question_prefixes'])
    answer_prefix_count = len(level_settings['answer_prefixes'])
    answer_suffix_count = len(level_settings['answer_suffixes'])
    connector_count = len(level_settings['connectors'])

    print("\n根据当前复杂程度等级的设置:")
    print(f"问句前缀数量: {question_prefix_count}")
    print(f"答句前缀数量: {answer_prefix_count}")
    print(f"答句后缀数量: {answer_suffix_count}")
    print(f"连接词数量: {connector_count}")
    print(f"单列模板数: {level_settings['single_templates']}")
    print(f"多列模板数: {level_settings['multi_templates']}")
    print(f"多列占比: {level_settings['multi_ratio']}")