#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Optimized JSON merge script.

Joins the rows of the logical-model table, the physical-model table and the
element-governance template table on the field English name ('字段英文名').
"""

import json
import os
from collections import Counter, defaultdict
from typing import Dict, List, Any

# Key that all three tables are joined on.
_FIELD_KEY = '字段英文名'
# Keys that are never copied from a logical/physical row into a merged row:
# the source-table name and the join key itself.
_SKIPPED_KEYS = frozenset(('表名', '字段英文名'))

# Output keys recording which source tables contributed to a merged row.
_LOGICAL_TABLE_KEY = '逻辑模型表_表名'
_PHYSICAL_TABLE_KEY = '物理模型表_表名'
_ELEMENT_TABLE_KEY = '元素治理表_表名'

# Values written under the keys above.
_LOGICAL_TABLE_NAME = '远光数据架构逻辑模型表'
_PHYSICAL_TABLE_NAME = '远光数据架构物理模型表'
_ELEMENT_TABLE_NAME = '远光数据架构元素治理模板表'

# Prefixes that keep logical/physical columns from colliding with each other.
_LOGICAL_PREFIX = '逻辑模型_'
_PHYSICAL_PREFIX = '物理模型_'


def _prefixed_fields(record: Dict[str, Any], prefix: str) -> Dict[str, Any]:
    """Return *record* with *prefix* on every key, minus the skipped keys."""
    return {
        f"{prefix}{key}": value
        for key, value in record.items()
        if key not in _SKIPPED_KEYS
    }


def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON file whose top-level value is an array of records.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list, or an empty list when the file cannot be read,
        is not valid JSON, or its top-level value is not an array. Failures
        are reported on stdout instead of being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
        return []

    if not isinstance(data, list):
        # The rest of the script iterates over records; any other top-level
        # value would only fail later with a confusing error.
        print(f"[ERROR] 加载文件失败 {file_path}: 顶层JSON不是数组")
        return []

    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
    return data


def build_index(records: List[Dict], field_name: str) -> Dict[str, List[Dict]]:
    """Group *records* by the value stored under *field_name*.

    Records whose value is missing or falsy are left out of the index.

    Args:
        records: Records to index.
        field_name: Key whose value becomes the index key.

    Returns:
        A mapping from each distinct value to the records carrying it, in
        their original order.
    """
    index = defaultdict(list)
    for record in records:
        field_value = record.get(field_name)
        if field_value:
            index[field_value].append(record)

    print(f"[INFO] 建立索引完成: {len(index)} 个唯一字段值")
    return index


def merge_records_optimized(logical_index: Dict, physical_index: Dict,
                            element_records: List[Dict]) -> List[Dict]:
    """Merge the three tables, driven by the element-governance records.

    One merged row is produced per distinct field English name found in
    *element_records*; only the first element record for a name is used.
    Matching logical/physical rows are copied in with a prefix. When several
    rows match the same name, later rows overwrite earlier ones key by key.

    Args:
        logical_index: Logical-model rows indexed by field English name.
        physical_index: Physical-model rows indexed by field English name.
        element_records: Rows of the element-governance template table.

    Returns:
        The list of merged rows, in element-record order.
    """
    merged_data = []
    processed_fields = set()
    total = len(element_records)

    print("\n[INFO] 开始合并数据...")

    for i, element_record in enumerate(element_records):
        if i % 5000 == 0:
            print(f" 处理进度: {i}/{total}")

        field_english_name = element_record.get(_FIELD_KEY)
        if not field_english_name or field_english_name in processed_fields:
            continue
        processed_fields.add(field_english_name)

        # Start from the element-governance row, dropping its table name.
        merged_record = {
            key: value
            for key, value in element_record.items()
            if key != '表名'
        }

        # The table-name marker is written only when a match exists, so its
        # presence later tells which tables contributed to the row.
        logical_matches = logical_index.get(field_english_name, [])
        if logical_matches:
            for logical_match in logical_matches:
                merged_record.update(
                    _prefixed_fields(logical_match, _LOGICAL_PREFIX))
            merged_record[_LOGICAL_TABLE_KEY] = _LOGICAL_TABLE_NAME

        physical_matches = physical_index.get(field_english_name, [])
        if physical_matches:
            for physical_match in physical_matches:
                merged_record.update(
                    _prefixed_fields(physical_match, _PHYSICAL_PREFIX))
            merged_record[_PHYSICAL_TABLE_KEY] = _PHYSICAL_TABLE_NAME

        # The element-governance marker is always present.
        merged_record[_ELEMENT_TABLE_KEY] = _ELEMENT_TABLE_NAME

        merged_data.append(merged_record)

    print(f" 完成合并: {len(merged_data)} 条记录")
    return merged_data


def add_unmatched_records_optimized(merged_data: List[Dict],
                                    logical_index: Dict,
                                    physical_index: Dict) -> List[Dict]:
    """Append rows for fields that exist only in the logical/physical tables.

    *merged_data* is extended in place and also returned.

    Args:
        merged_data: Rows already produced by the merge step.
        logical_index: Logical-model rows indexed by field English name.
        physical_index: Physical-model rows indexed by field English name.

    Returns:
        The same list object as *merged_data*, with the extra rows appended.
    """
    print("\n[INFO] 处理未匹配的记录...")

    # Field names already present in the merged output.
    processed_fields = {
        record.get(_FIELD_KEY)
        for record in merged_data
        if record.get(_FIELD_KEY)
    }

    logical_unmatched = sum(
        1 for name in logical_index if name not in processed_fields)
    print(f" 逻辑模型表未匹配: {logical_unmatched} 条")

    # Names appended during the logical pass. A set makes the membership
    # test below O(1), instead of rescanning merged_data for every field.
    added_from_logical = set()
    for field_name, logical_matches in logical_index.items():
        if field_name in processed_fields:
            continue
        for logical_match in logical_matches:
            merged_record = {_FIELD_KEY: field_name}
            merged_record.update(
                _prefixed_fields(logical_match, _LOGICAL_PREFIX))
            merged_record[_LOGICAL_TABLE_KEY] = _LOGICAL_TABLE_NAME
            merged_record[_PHYSICAL_TABLE_KEY] = None
            merged_record[_ELEMENT_TABLE_KEY] = None
            merged_data.append(merged_record)
            added_from_logical.add(field_name)

    # This count includes fields that are skipped below because the logical
    # pass already added them.
    physical_unmatched = sum(
        1 for name in physical_index if name not in processed_fields)
    print(f" 物理模型表未匹配: {physical_unmatched} 条")

    for field_name, physical_matches in physical_index.items():
        if field_name in processed_fields:
            continue
        # NOTE(review): a field already added from the logical table is
        # skipped, so its physical columns are not merged in. Confirm that
        # this is intended.
        if field_name in added_from_logical:
            continue
        for physical_match in physical_matches:
            merged_record = {_FIELD_KEY: field_name}
            merged_record.update(
                _prefixed_fields(physical_match, _PHYSICAL_PREFIX))
            merged_record[_LOGICAL_TABLE_KEY] = None
            merged_record[_PHYSICAL_TABLE_KEY] = _PHYSICAL_TABLE_NAME
            merged_record[_ELEMENT_TABLE_KEY] = None
            merged_data.append(merged_record)

    return merged_data


def main():
    """Load the three exported tables, merge them and write final.json."""
    print("="*60)
    print("优化版JSON文件合并工具")
    print("="*60)

    # Input and output locations, relative to the working directory.
    logical_json_path = "Data_Export_Json/远光数据架构逻辑模型表.json"
    physical_json_path = "Data_Export_Json/远光数据架构物理模型表.json"
    element_json_path = "Data_Export_Json/远光数据架构元素治理模板表.json"
    output_path = "Data_Export_Json/final.json"

    print("\n[INFO] 加载JSON文件...")
    logical_records = load_json_file(logical_json_path)
    physical_records = load_json_file(physical_json_path)
    element_records = load_json_file(element_json_path)

    # An empty table is treated the same as a failed load.
    if not (logical_records and physical_records and element_records):
        print("\n[ERROR] 无法加载所有JSON文件")
        return

    print("\n[INFO] 建立索引加速查找...")
    logical_index = build_index(logical_records, _FIELD_KEY)
    physical_index = build_index(physical_records, _FIELD_KEY)

    # Only fields present in the element-governance table are merged.
    # Unmatched logical/physical rows are deliberately not appended.
    merged_data = merge_records_optimized(
        logical_index, physical_index, element_records)

    try:
        print(f"\n[INFO] 保存合并数据到 {output_path}...")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, ensure_ascii=False, indent=2)

        file_size = os.path.getsize(output_path) / 1024  # KB
        print("\n[OK] 合并完成!")
        print(f" 输出文件: {output_path}")
        print(f" 合并记录: {len(merged_data)} 条")
        print(f" 文件大小: {file_size:.1f} KB")

        # Classify every row once by which table markers it carries:
        # (element, logical, physical).
        presence = Counter(
            (
                bool(r.get(_ELEMENT_TABLE_KEY)),
                bool(r.get(_LOGICAL_TABLE_KEY)),
                bool(r.get(_PHYSICAL_TABLE_KEY)),
            )
            for r in merged_data
        )
        three_table_match = presence[(True, True, True)]
        element_logical_match = presence[(True, True, False)]
        element_physical_match = presence[(True, False, True)]
        element_only_match = presence[(True, False, False)]
        # "Only" here means "without the element table"; the other model
        # table may or may not be present.
        logical_only_count = (
            presence[(False, True, True)] + presence[(False, True, False)])
        physical_only_count = (
            presence[(False, True, True)] + presence[(False, False, True)])

        print("\n[INFO] 统计信息:")
        print(f" 三表匹配: {three_table_match} 条")
        print(f" 元素治理+逻辑模型: {element_logical_match} 条")
        print(f" 元素治理+物理模型: {element_physical_match} 条")
        print(f" 仅元素治理: {element_only_match} 条")
        print(f" 仅逻辑模型: {logical_only_count} 条")
        print(f" 仅物理模型: {physical_only_count} 条")

        # Show the shape of the first merged row as a sample.
        if merged_data:
            print("\n[INFO] 合并记录示例:")
            sample_record = merged_data[0]
            print(f" 字段数量: {len(sample_record)}")
            # Only the first 10 field names are shown.
            print(f" 字段名: {list(sample_record.keys())[:10]}...")

    except Exception as e:
        # Top-level boundary: report the failure with a traceback rather
        # than letting the script die silently.
        print(f"\n[ERROR] 保存文件失败: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()