1. 修改了问题表达
2. 缩减了表的数量
3. 可以选择生成多少个
This commit is contained in:
120
random_select.py
Normal file
120
random_select.py
Normal file
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Random selection script.
|
||||
Randomly draws the configured number of records from final.json and writes them to selected.json.
|
||||
"""
|
||||
|
||||
import json
|
||||
import random
|
||||
import os
|
||||
from typing import List, Dict, Any
|
||||
from config import QAConfig
|
||||
|
||||
def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON file whose top-level value is a list of records.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list, or an empty list when the file cannot be read,
        does not contain valid JSON, or its top-level value is not a list.
        Every failure is reported on stdout with an ``[ERROR]`` prefix.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
        return []

    # Callers sample from the result, so anything but a list is unusable.
    if not isinstance(data, list):
        print(f"[ERROR] 加载文件失败 {file_path}: 顶层结构不是列表 ({type(data).__name__})")
        return []

    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
    return data
|
||||
|
||||
def random_select(records: List[Dict[str, Any]], count: int, random_seed: int) -> List[Dict[str, Any]]:
    """Draw a reproducible random sample of records.

    Args:
        records: The records to sample from.
        count: How many records to draw.
        random_seed: Seed for the generator; the same seed and input give
            the same selection.

    Returns:
        A new list with ``count`` records chosen without replacement, or a
        copy of all records when ``count`` is at least ``len(records)``.

    Raises:
        ValueError: If ``count`` is negative.
    """
    if count < 0:
        raise ValueError(f"count must be non-negative, got {count}")

    total = len(records)

    # Nothing to choose from: hand back everything (as a copy, so callers
    # cannot accidentally mutate the input through the result).
    if count >= total:
        print(f"[WARN] 抽取数量 ({count}) 大于等于总记录数 ({total}),返回所有记录")
        return list(records)

    # A private generator yields the same sample as seeding the module-level
    # one, without disturbing the process-wide random state.
    rng = random.Random(random_seed)
    selected = rng.sample(records, count)
    print(f"[OK] 从 {total} 条记录中随机抽取 {count} 条")

    return selected
|
||||
|
||||
def main():
    """Run the random-selection tool end to end.

    Reads ``RANDOM_SEED``, ``SELECT_COUNT`` and ``INPUT_DIR`` from
    ``QAConfig``, loads ``final.json`` from ``INPUT_DIR``, draws the sample
    and writes it to ``selected.json`` in the same directory. Progress, a
    sample of field names and per-category counts are printed to stdout.
    Returns early (after printing an error) when the input file is missing,
    the data is empty, or the output cannot be written.
    """
    print("="*60)
    print("随机抽取工具")
    print("="*60)

    # Load configuration
    config = QAConfig()
    print("\n[INFO] 加载配置:")
    print(f" 随机种子: {config.RANDOM_SEED}")
    print(f" 抽取数量: {config.SELECT_COUNT}")

    # Input and output live in the same directory
    input_file = os.path.join(config.INPUT_DIR, "final.json")
    output_file = os.path.join(config.INPUT_DIR, "selected.json")

    if not os.path.exists(input_file):
        print(f"\n[ERROR] 输入文件不存在: {input_file}")
        return

    print("\n[INFO] 加载数据...")
    records = load_json_file(input_file)
    if not records:
        print("\n[ERROR] 无法加载数据或数据为空")
        return

    print("\n[INFO] 执行随机抽取...")
    selected_records = random_select(records, config.SELECT_COUNT, config.RANDOM_SEED)

    # Only the save step is guarded, so a failure in the reporting below is
    # never mislabelled as a save failure.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(selected_records, f, ensure_ascii=False, indent=2)
        file_size = os.path.getsize(output_file) / 1024  # KB
    except Exception as e:
        print(f"\n[ERROR] 保存文件失败: {e}")
        import traceback
        traceback.print_exc()
        return

    print("\n[OK] 抽取完成!")
    print(f" 输出文件: {output_file}")
    print(f" 记录数量: {len(selected_records)}")
    print(f" 文件大小: {file_size:.1f} KB")

    # Show the field names (at most 10) of the first selected record
    if selected_records and isinstance(selected_records[0], dict):
        sample = selected_records[0]
        print("\n[INFO] 抽取记录示例:")
        print(f" 字段数量: {len(sample)}")
        print(f" 字段名: {list(sample.keys())[:10]}...")

    # Per-category statistics
    three_table, element_logical, element_physical, element_only = _count_table_matches(selected_records)
    print("\n[INFO] 抽取记录统计:")
    print(f" 三表匹配: {three_table} 条")
    print(f" 元素治理+逻辑模型: {element_logical} 条")
    print(f" 元素治理+物理模型: {element_physical} 条")
    print(f" 仅元素治理: {element_only} 条")


def _count_table_matches(records):
    """Tally records by which table-name keys accompany '元素治理表_表名'.

    Records that are not dicts, or that lack '元素治理表_表名', are not counted.

    Returns:
        A 4-tuple of counts: (all three keys, element + logical only,
        element + physical only, element only).
    """
    # Keyed by (has logical-model key, has physical-model key).
    tally = {
        (True, True): 0,
        (True, False): 0,
        (False, True): 0,
        (False, False): 0,
    }
    for record in records:
        if not isinstance(record, dict) or '元素治理表_表名' not in record:
            continue
        tally['逻辑模型表_表名' in record, '物理模型表_表名' in record] += 1
    return (
        tally[True, True],
        tally[True, False],
        tally[False, True],
        tally[False, False],
    )
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user