227 lines
9.6 KiB
Python
227 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
优化版JSON文件合并脚本
|
|
根据字段英文名匹配逻辑模型表、物理模型表和元素治理模板表的数据
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Any
|
|
|
|
def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON file that is expected to contain a list of records.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The parsed list on success. An empty list when the file cannot be
        opened, cannot be decoded/parsed, or when its top-level JSON value
        is not a list. Failures are reported on stdout, never raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else is a
        # programming error and is deliberately allowed to propagate.
        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
        return []

    if not isinstance(data, list):
        # A dict or scalar at the top level would be iterated as if it were
        # a record list later on and fail there with a confusing error.
        print(f"[ERROR] 加载文件失败 {file_path}: 顶层JSON必须是数组, 实际为 {type(data).__name__}")
        return []

    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
    return data
|
|
|
|
def build_index(records: List[Dict], field_name: str) -> Dict[str, List[Dict]]:
    """Group records by the value they hold under ``field_name``.

    Records whose value for ``field_name`` is missing or falsy are left out
    of the index. The number of distinct keys is printed before returning.

    Args:
        records: Record dictionaries to index.
        field_name: Key whose value is used as the grouping key.

    Returns:
        A ``defaultdict(list)`` mapping each value to the records carrying
        it, in their original order.
    """
    grouped = defaultdict(list)
    for row in records:
        key = row.get(field_name)
        if not key:
            # Nothing usable to index this record under.
            continue
        grouped[key].append(row)

    print(f"[INFO] 建立索引完成: {len(grouped)} 个唯一字段值")
    return grouped
|
|
|
|
def _copy_prefixed_fields(target: Dict, matches: List[Dict], prefix: str) -> None:
    """Copy each match's fields into ``target`` with ``prefix`` prepended to the key.

    The '表名' and '字段英文名' keys are skipped. Matches are applied in order,
    so a later match overwrites a same-named field from an earlier one.
    """
    for match in matches:
        for key, value in match.items():
            if key not in ('表名', '字段英文名'):
                target[f"{prefix}{key}"] = value


def merge_records_optimized(logical_index: Dict, physical_index: Dict, element_records: List[Dict]) -> List[Dict]:
    """Merge element-governance records with their logical/physical matches.

    The element records drive the merge: one output record is produced for
    the first occurrence of each truthy '字段英文名'; records without that
    field and later duplicates are skipped.

    Args:
        logical_index: Mapping of field name to matching logical-model records.
        physical_index: Mapping of field name to matching physical-model records.
        element_records: Element-governance records to iterate over.

    Returns:
        A new list of merged dictionaries. Each holds the element record's
        fields except '表名', then the '逻辑模型_'/'物理模型_' prefixed fields
        of any matches. A '<table>_表名' marker is added for each table that
        contributed; the element marker is always present.
    """
    merged_data = []
    processed_fields = set()
    total = len(element_records)

    print("\n[INFO] 开始合并数据...")
    for i, element_record in enumerate(element_records):
        if i % 5000 == 0:
            print(f" 处理进度: {i}/{total}")

        field_english_name = element_record.get('字段英文名')
        if not field_english_name or field_english_name in processed_fields:
            continue
        processed_fields.add(field_english_name)

        # Start from the element record itself, minus its own table name.
        merged_record = {
            key: value for key, value in element_record.items() if key != '表名'
        }

        logical_matches = logical_index.get(field_english_name, [])
        physical_matches = physical_index.get(field_english_name, [])

        # A table marker is only written when that table actually matched.
        if logical_matches:
            _copy_prefixed_fields(merged_record, logical_matches, "逻辑模型_")
            merged_record['逻辑模型表_表名'] = '远光数据架构逻辑模型表'

        if physical_matches:
            _copy_prefixed_fields(merged_record, physical_matches, "物理模型_")
            merged_record['物理模型表_表名'] = '远光数据架构物理模型表'

        # The element table marker is always present.
        merged_record['元素治理表_表名'] = '远光数据架构元素治理模板表'

        merged_data.append(merged_record)

    print(f" 完成合并: {len(merged_data)} 条记录")

    return merged_data
|
|
|
|
def _build_unmatched_record(field_name, source: Dict, prefix: str,
                            logical_table, physical_table) -> Dict:
    """Build one output record for a model-table row with no element match.

    The row's fields (except '表名' and '字段英文名') are copied with
    ``prefix`` prepended, followed by the three table markers; the element
    marker is always None.
    """
    record = {'字段英文名': field_name}
    for key, value in source.items():
        if key not in ('表名', '字段英文名'):
            record[f"{prefix}{key}"] = value
    record['逻辑模型表_表名'] = logical_table
    record['物理模型表_表名'] = physical_table
    record['元素治理表_表名'] = None
    return record


def add_unmatched_records_optimized(merged_data: List[Dict],
                                    logical_index: Dict,
                                    physical_index: Dict) -> List[Dict]:
    """Append records for fields that appear only in the model tables.

    A field counts as already handled when some record in ``merged_data``
    carries it as a truthy '字段英文名'. Each logical-model row of an
    unhandled field becomes a new record. Physical-model rows are added only
    for fields not already covered by the logical index.

    Index keys are expected to be truthy field names (which is what
    ``build_index`` produces).

    Args:
        merged_data: Records produced so far; extended in place.
        logical_index: Mapping of field name to logical-model records.
        physical_index: Mapping of field name to physical-model records.

    Returns:
        The same ``merged_data`` list object, with the new records appended.
    """
    print("\n[INFO] 处理未匹配的记录...")

    # Field names that already have a merged record.
    processed_fields = {
        record.get('字段英文名') for record in merged_data if record.get('字段英文名')
    }

    logical_unmatched = sum(1 for f in logical_index if f not in processed_fields)
    print(f" 逻辑模型表未匹配: {logical_unmatched} 条")

    # Remember what the logical pass adds so the physical pass can skip those
    # fields with a set lookup instead of rescanning merged_data every time.
    added_from_logical = set()
    for field_name, logical_matches in logical_index.items():
        if field_name in processed_fields:
            continue
        for logical_match in logical_matches:
            merged_data.append(_build_unmatched_record(
                field_name, logical_match, "逻辑模型_",
                '远光数据架构逻辑模型表', None))
            added_from_logical.add(field_name)

    # The reported count is per field and, as before, includes fields that
    # are skipped below because the logical pass already added them.
    physical_unmatched = sum(1 for f in physical_index if f not in processed_fields)
    print(f" 物理模型表未匹配: {physical_unmatched} 条")

    for field_name, physical_matches in physical_index.items():
        if field_name in processed_fields or field_name in added_from_logical:
            continue
        for physical_match in physical_matches:
            merged_data.append(_build_unmatched_record(
                field_name, physical_match, "物理模型_",
                None, '远光数据架构物理模型表'))

    return merged_data
|
|
|
|
def main():
    """Load the three exported tables, merge them and write the result.

    Reads the logical-model, physical-model and element-governance JSON
    exports, indexes the two model tables by '字段英文名', merges them into
    the element records, writes the merged list to ``final.json`` and prints
    summary statistics. Returns early when any input loads as empty.
    """
    separator = "=" * 60
    print(separator)
    print("优化版JSON文件合并工具")
    print(separator)

    # Input and output locations.
    logical_json_path = "Data_Export_Json/远光数据架构逻辑模型表.json"
    physical_json_path = "Data_Export_Json/远光数据架构物理模型表.json"
    element_json_path = "Data_Export_Json/远光数据架构元素治理模板表.json"
    output_path = "Data_Export_Json/final.json"

    # Load all three inputs before deciding whether to continue.
    print("\n[INFO] 加载JSON文件...")
    logical_records = load_json_file(logical_json_path)
    physical_records = load_json_file(physical_json_path)
    element_records = load_json_file(element_json_path)

    if not logical_records or not physical_records or not element_records:
        print("\n[ERROR] 无法加载所有JSON文件")
        return

    # Index the model tables by field name for direct lookup.
    print("\n[INFO] 建立索引加速查找...")
    logical_index = build_index(logical_records, '字段英文名')
    physical_index = build_index(physical_records, '字段英文名')

    # Only fields present in the element-governance table are merged;
    # unmatched model-table rows are intentionally not appended.
    merged_data = merge_records_optimized(logical_index, physical_index, element_records)

    try:
        print(f"\n[INFO] 保存合并数据到 {output_path}...")
        with open(output_path, 'w', encoding='utf-8') as out_file:
            json.dump(merged_data, out_file, ensure_ascii=False, indent=2)

        size_kb = os.path.getsize(output_path) / 1024
        print("\n[OK] 合并完成!")
        print(f" 输出文件: {output_path}")
        print(f" 合并记录: {len(merged_data)} 条")
        print(f" 文件大小: {size_kb:.1f} KB")

        # Classify every merged record by which table markers it carries.
        three_table_match = 0
        element_logical_match = 0
        element_physical_match = 0
        element_only_match = 0
        logical_only_count = 0
        physical_only_count = 0
        for row in merged_data:
            in_element = bool(row.get('元素治理表_表名'))
            in_logical = bool(row.get('逻辑模型表_表名'))
            in_physical = bool(row.get('物理模型表_表名'))
            if in_element:
                if in_logical and in_physical:
                    three_table_match += 1
                elif in_logical:
                    element_logical_match += 1
                elif in_physical:
                    element_physical_match += 1
                else:
                    element_only_match += 1
            else:
                # The two "only" counters are independent of each other.
                if in_logical:
                    logical_only_count += 1
                if in_physical:
                    physical_only_count += 1

        print("\n[INFO] 统计信息:")
        print(f" 三表匹配: {three_table_match} 条")
        print(f" 元素治理+逻辑模型: {element_logical_match} 条")
        print(f" 元素治理+物理模型: {element_physical_match} 条")
        print(f" 仅元素治理: {element_only_match} 条")
        print(f" 仅逻辑模型: {logical_only_count} 条")
        print(f" 仅物理模型: {physical_only_count} 条")

        # Show the shape of the first merged record as a sample.
        if merged_data:
            print("\n[INFO] 合并记录示例:")
            sample_record = merged_data[0]
            print(f" 字段数量: {len(sample_record)}")
            # Only the first 10 field names are shown.
            print(f" 字段名: {list(sample_record)[:10]}...")

    except Exception as e:
        print(f"\n[ERROR] 保存文件失败: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|