Files
YG_TDgenerator/random_select.py
DESKTOP-72TV0V4\caoxiaozhu 8088b59d30 1. 修改了问题表达
2. 缩减了表的数量
3. 可以选择生成多少个
2025-12-31 18:15:50 +08:00

121 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Random sampling script.
Randomly draws the configured number of records from final.json and writes them to selected.json.
"""
import json
import random
import os
from typing import List, Dict, Any
from config import QAConfig
def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON file that is expected to contain a list of records.

    Args:
        file_path: Path of the JSON file to read; it is decoded as UTF-8.

    Returns:
        The parsed list of records. An empty list is returned (after an
        ``[ERROR]`` line is printed) when the file cannot be read, is not
        valid JSON, or its top-level value is not a list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
        return []

    # Callers sample from the result, which requires a sequence; reject
    # objects/scalars here instead of letting them fail later.
    if not isinstance(data, list):
        print(f"[ERROR] 加载文件失败 {file_path}: 顶层JSON应为列表, 实际为 {type(data).__name__}")
        return []

    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
    return data
def random_select(records: List[Dict[str, Any]], count: int, random_seed: int) -> List[Dict[str, Any]]:
    """Draw ``count`` records at random, reproducibly for a given seed.

    Args:
        records: The records to sample from.
        count: How many records to draw.
        random_seed: Seed for the random generator; the same seed and
            input always yield the same selection.

    Returns:
        A new list with the selected records. When ``count`` is greater
        than or equal to ``len(records)``, a copy of all records in their
        original order is returned.

    Raises:
        ValueError: If ``count`` is negative (raised by ``sample``).
    """
    # Use a private generator so the process-wide ``random`` state is not
    # reseeded as a side effect; the sample drawn for a given seed is the
    # same as with ``random.seed`` + ``random.sample``.
    rng = random.Random(random_seed)

    # Nothing to sample when everything is requested.
    if count >= len(records):
        print(f"[WARN] 抽取数量 ({count}) 大于等于总记录数 ({len(records)}),返回所有记录")
        # Return a copy so callers cannot mutate the input list through the result.
        return list(records)

    selected = rng.sample(records, count)
    print(f"[OK] 从 {len(records)} 条记录中随机抽取 {count}")
    return selected
def _print_selection_summary(selected_records: List[Dict[str, Any]]) -> None:
    """Print the first record's field names and per-category match counts."""
    # Show the field names of the first record only, as a quick sanity check.
    if selected_records and isinstance(selected_records[0], dict):
        sample = selected_records[0]
        print("\n[INFO] 抽取记录示例:")
        print(f" 字段数量: {len(sample)}")
        print(f" 字段名: {list(sample.keys())[:10]}...")

    element_key = '元素治理表_表名'
    logical_key = '逻辑模型表_表名'
    physical_key = '物理模型表_表名'

    # Classify every record that has the element key by which of the other
    # two table-name keys it also carries (single pass over the records).
    three_table_match = 0
    element_logical_match = 0
    element_physical_match = 0
    element_only_match = 0
    for record in selected_records:
        if not isinstance(record, dict) or element_key not in record:
            continue
        has_logical = logical_key in record
        has_physical = physical_key in record
        if has_logical and has_physical:
            three_table_match += 1
        elif has_logical:
            element_logical_match += 1
        elif has_physical:
            element_physical_match += 1
        else:
            element_only_match += 1

    print("\n[INFO] 抽取记录统计:")
    print(f" 三表匹配: {three_table_match}")
    print(f" 元素治理+逻辑模型: {element_logical_match}")
    print(f" 元素治理+物理模型: {element_physical_match}")
    print(f" 仅元素治理: {element_only_match}")


def main():
    """Sample records from ``final.json`` and write them to ``selected.json``.

    Reads ``RANDOM_SEED``, ``SELECT_COUNT`` and ``INPUT_DIR`` from
    ``QAConfig``, loads ``final.json`` from ``INPUT_DIR``, draws the sample
    with ``random_select`` and writes it to ``selected.json`` in the same
    directory, then prints a short summary. Problems are reported on
    stdout; the function returns ``None`` in every case.
    """
    print("="*60)
    print("随机抽取工具")
    print("="*60)

    # Load configuration.
    config = QAConfig()
    print("\n[INFO] 加载配置:")
    print(f" 随机种子: {config.RANDOM_SEED}")
    print(f" 抽取数量: {config.SELECT_COUNT}")

    # Input and output both live in the configured input directory.
    input_file = os.path.join(config.INPUT_DIR, "final.json")
    output_file = os.path.join(config.INPUT_DIR, "selected.json")

    if not os.path.exists(input_file):
        print(f"\n[ERROR] 输入文件不存在: {input_file}")
        return

    print("\n[INFO] 加载数据...")
    records = load_json_file(input_file)
    if not records:
        print("\n[ERROR] 无法加载数据或数据为空")
        return

    print("\n[INFO] 执行随机抽取...")
    selected_records = random_select(records, config.SELECT_COUNT, config.RANDOM_SEED)

    # Only the write and the size lookup are guarded, so that a problem in
    # the summary below is never misreported as a save failure.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(selected_records, f, ensure_ascii=False, indent=2)
        file_size = os.path.getsize(output_file) / 1024  # KB
    except (OSError, TypeError, ValueError) as e:
        print(f"\n[ERROR] 保存文件失败: {e}")
        import traceback
        traceback.print_exc()
        return

    print("\n[OK] 抽取完成!")
    print(f" 输出文件: {output_file}")
    print(f" 记录数量: {len(selected_records)}")
    print(f" 文件大小: {file_size:.1f} KB")

    _print_selection_summary(selected_records)
# Run the sampling tool only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()