1. 修改了问题表达

2. 缩减了表的数量
3. 可以选择生成多少个
2025-12-31 18:15:50 +08:00
parent 9f33e0b396
commit 8088b59d30
5 changed files with 803 additions and 501 deletions
--- a/random_select.py
+++ b/random_select.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Random selection script.
+Randomly samples the configured number of records from final.json and writes them to selected.json.
+"""
+
+import json
+import random
+import os
+from typing import List, Dict, Any
+from config import QAConfig
+
+def load_json_file(file_path: str) -> List[Dict[str, Any]]:
+    """Load a JSON file whose top-level value is an array of records.
+
+    Args:
+        file_path: Path of the UTF-8 encoded JSON file to read.
+
+    Returns:
+        The parsed list of records. An empty list is returned (after an
+        ``[ERROR]`` line is printed) when the file cannot be read or parsed,
+        or when its top-level value is not a JSON array.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except Exception as e:
+        # Best-effort loader: report the failure and let the caller treat
+        # the empty result as "nothing to process".
+        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
+        return []
+
+    # The result is sampled from later, which requires a sequence; a JSON
+    # object or scalar would otherwise fail downstream with a confusing error.
+    if not isinstance(data, list):
+        print(f"[ERROR] 加载文件失败 {file_path}: 顶层JSON应为数组，实际为 {type(data).__name__}")
+        return []
+
+    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
+    return data
+
+def random_select(records: List[Dict[str, Any]], count: int, random_seed: int) -> List[Dict[str, Any]]:
+    """Randomly pick ``count`` records without replacement.
+
+    Args:
+        records: Records to sample from; the list itself is never modified.
+        count: Number of records to pick.
+        random_seed: Seed applied to the module-level ``random`` generator so
+            that the same inputs always produce the same selection.
+
+    Returns:
+        A new list. When ``count`` is at least ``len(records)`` it contains
+        every record in the original order; otherwise it contains ``count``
+        sampled records.
+
+    Raises:
+        ValueError: If ``count`` is negative.
+    """
+    # Seed the shared generator first so the call is reproducible.
+    random.seed(random_seed)
+
+    if count < 0:
+        raise ValueError(f"抽取数量不能为负数: {count}")
+
+    total = len(records)
+
+    # Nothing to sample when everything is requested; hand back a copy so the
+    # caller never receives an alias of its own input list.
+    if count >= total:
+        print(f"[WARN] 抽取数量 ({count}) 大于等于总记录数 ({total})，返回所有记录")
+        return list(records)
+
+    selected = random.sample(records, count)
+    print(f"[OK] 从 {total} 条记录中随机抽取 {count} 条")
+
+    return selected
+
+def _count_table_matches(records):
+    """Classify records by which table-name keys accompany '元素治理表_表名'.
+
+    Only dict records that contain '元素治理表_表名' are counted.
+
+    Returns:
+        A tuple ``(three_table, element_logical, element_physical,
+        element_only)`` of record counts.
+    """
+    three_table = element_logical = element_physical = element_only = 0
+    for record in records:
+        if not isinstance(record, dict) or '元素治理表_表名' not in record:
+            continue
+        has_logical = '逻辑模型表_表名' in record
+        has_physical = '物理模型表_表名' in record
+        if has_logical and has_physical:
+            three_table += 1
+        elif has_logical:
+            element_logical += 1
+        elif has_physical:
+            element_physical += 1
+        else:
+            element_only += 1
+    return three_table, element_logical, element_physical, element_only
+
+
+def main():
+    """Sample records from final.json into selected.json and print a summary.
+
+    Reads ``RANDOM_SEED``, ``SELECT_COUNT`` and ``INPUT_DIR`` from
+    ``QAConfig``, loads ``final.json`` from ``INPUT_DIR``, samples the
+    configured number of records and writes them to ``selected.json`` in the
+    same directory. Progress, errors and a short summary are printed to
+    stdout; the function returns early when the input is missing, empty, or
+    the output cannot be written.
+    """
+    print("="*60)
+    print("随机抽取工具")
+    print("="*60)
+
+    # Load configuration
+    config = QAConfig()
+    print(f"\n[INFO] 加载配置:")
+    print(f"       随机种子: {config.RANDOM_SEED}")
+    print(f"       抽取数量: {config.SELECT_COUNT}")
+
+    # Input and output live in the same directory
+    input_file = os.path.join(config.INPUT_DIR, "final.json")
+    output_file = os.path.join(config.INPUT_DIR, "selected.json")
+
+    if not os.path.exists(input_file):
+        print(f"\n[ERROR] 输入文件不存在: {input_file}")
+        return
+
+    print(f"\n[INFO] 加载数据...")
+    records = load_json_file(input_file)
+
+    if not records:
+        print(f"\n[ERROR] 无法加载数据或数据为空")
+        return
+
+    print(f"\n[INFO] 执行随机抽取...")
+    selected_records = random_select(records, config.SELECT_COUNT, config.RANDOM_SEED)
+
+    # Only the write (and the size lookup on the written file) is guarded, so
+    # a "save failed" message always refers to an actual save problem.
+    try:
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(selected_records, f, ensure_ascii=False, indent=2)
+        file_size = os.path.getsize(output_file) / 1024  # KB
+    except Exception as e:
+        print(f"\n[ERROR] 保存文件失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return
+
+    print(f"\n[OK] 抽取完成!")
+    print(f"       输出文件: {output_file}")
+    print(f"       记录数量: {len(selected_records)}")
+    print(f"       文件大小: {file_size:.1f} KB")
+
+    if not selected_records:
+        return
+
+    # Show the field names (at most 10) of the first selected record
+    sample = selected_records[0]
+    if isinstance(sample, dict):
+        print(f"\n[INFO] 抽取记录示例:")
+        print(f"       字段数量: {len(sample)}")
+        print(f"       字段名: {list(sample.keys())[:10]}...")
+
+    # Break the selection down by which table-name fields each record has
+    three_table_match, element_logical_match, element_physical_match, element_only_match = (
+        _count_table_matches(selected_records)
+    )
+
+    print(f"\n[INFO] 抽取记录统计:")
+    print(f"       三表匹配: {three_table_match} 条")
+    print(f"       元素治理+逻辑模型: {element_logical_match} 条")
+    print(f"       元素治理+物理模型: {element_physical_match} 条")
+    print(f"       仅元素治理: {element_only_match} 条")
+
+
+# Run the tool only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()