#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Random sampling script.

Draws a configured number of records at random from ``final.json`` and
writes them to ``selected.json`` in the same directory. The directory, the
sample size and the random seed are read from ``QAConfig``.
"""

import json
import os
import random
import traceback
from collections import Counter
from typing import Any, Dict, List

from config import QAConfig

# Record keys whose presence determines which match category a record is in.
_ELEMENT_KEY = "元素治理表_表名"
_LOGICAL_KEY = "逻辑模型表_表名"
_PHYSICAL_KEY = "物理模型表_表名"


def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON file that is expected to hold a list of records.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list. An empty list is returned, after printing an
        ``[ERROR]`` line, when the file cannot be read or parsed or when its
        top-level JSON value is not a list.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
        return []

    if not isinstance(data, list):
        # Callers sample from the result, so anything but a list is unusable.
        print(
            f"[ERROR] 加载文件失败 {file_path}: "
            f"顶层 JSON 类型应为列表,实际为 {type(data).__name__}"
        )
        return []

    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
    return data


def random_select(
    records: List[Dict[str, Any]], count: int, random_seed: int
) -> List[Dict[str, Any]]:
    """Draw ``count`` records at random, reproducibly for a given seed.

    A private ``random.Random`` instance is seeded with ``random_seed``, so
    the module-level random generator is left untouched. For the same seed
    the drawn sample is the same as seeding the global generator and calling
    ``random.sample``.

    Args:
        records: Records to sample from.
        count: Number of records to draw; must not be negative.
        random_seed: Seed for the private random generator.

    Returns:
        A new list with the drawn records. When ``count`` is greater than or
        equal to ``len(records)``, a shallow copy of all records in their
        original order is returned.

    Raises:
        ValueError: If ``count`` is negative.
    """
    if count < 0:
        raise ValueError(f"抽取数量不能为负数: {count}")

    total = len(records)
    if count >= total:
        print(f"[WARN] 抽取数量 ({count}) 大于等于总记录数 ({total}),返回所有记录")
        # Copy so that callers never alias the input list.
        return list(records)

    selected = random.Random(random_seed).sample(records, count)
    print(f"[OK] 从 {total} 条记录中随机抽取 {count} 条")
    return selected


def _count_table_matches(records: List[Any]) -> Counter:
    """Count records by which of the logical/physical table keys they carry.

    Only dict records that contain the element-governance key are counted.
    The returned counter is keyed by ``(has_logical, has_physical)``.
    """
    counts: Counter = Counter()
    for record in records:
        if isinstance(record, dict) and _ELEMENT_KEY in record:
            counts[(_LOGICAL_KEY in record, _PHYSICAL_KEY in record)] += 1
    return counts


def _print_summary(selected_records: List[Any]) -> None:
    """Print the first record's field names and the match statistics."""
    # Show the field names of the first record only, as an example.
    if selected_records and isinstance(selected_records[0], dict):
        sample = selected_records[0]
        print("\n[INFO] 抽取记录示例:")
        print(f" 字段数量: {len(sample)}")
        print(f" 字段名: {list(sample.keys())[:10]}...")

    counts = _count_table_matches(selected_records)
    print("\n[INFO] 抽取记录统计:")
    print(f" 三表匹配: {counts[(True, True)]} 条")
    print(f" 元素治理+逻辑模型: {counts[(True, False)]} 条")
    print(f" 元素治理+物理模型: {counts[(False, True)]} 条")
    print(f" 仅元素治理: {counts[(False, False)]} 条")


def main() -> None:
    """Sample records from ``final.json`` and write them to ``selected.json``.

    Reads ``RANDOM_SEED``, ``SELECT_COUNT`` and ``INPUT_DIR`` from
    ``QAConfig``. Prints progress to stdout and returns early, after an
    ``[ERROR]`` line, when the input file is missing, cannot be loaded, is
    empty, or when the output file cannot be written.
    """
    print("=" * 60)
    print("随机抽取工具")
    print("=" * 60)

    # Load configuration.
    config = QAConfig()
    print("\n[INFO] 加载配置:")
    print(f" 随机种子: {config.RANDOM_SEED}")
    print(f" 抽取数量: {config.SELECT_COUNT}")

    # Input and output live in the same configured directory.
    input_file = os.path.join(config.INPUT_DIR, "final.json")
    output_file = os.path.join(config.INPUT_DIR, "selected.json")

    if not os.path.exists(input_file):
        print(f"\n[ERROR] 输入文件不存在: {input_file}")
        return

    print("\n[INFO] 加载数据...")
    records = load_json_file(input_file)
    if not records:
        print("\n[ERROR] 无法加载数据或数据为空")
        return

    print("\n[INFO] 执行随机抽取...")
    selected_records = random_select(
        records, config.SELECT_COUNT, config.RANDOM_SEED
    )

    # Only the write itself is guarded, so that the "save failed" message is
    # never printed for an error that happened after a successful save.
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(selected_records, f, ensure_ascii=False, indent=2)
        file_size = os.path.getsize(output_file) / 1024  # KB
    except Exception as e:
        print(f"\n[ERROR] 保存文件失败: {e}")
        traceback.print_exc()
        return

    print("\n[OK] 抽取完成!")
    print(f" 输出文件: {output_file}")
    print(f" 记录数量: {len(selected_records)}")
    print(f" 文件大小: {file_size:.1f} KB")

    _print_summary(selected_records)


if __name__ == "__main__":
    main()