Files
YG_TDgenerator/merge_json_fast.py
DESKTOP-72TV0V4\caoxiaozhu 8088b59d30 1. 修改了问题表达
2. 缩减了表的数量
3. 可以选择生成多少个
2025-12-31 18:15:50 +08:00

227 lines
9.6 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
优化版JSON文件合并脚本
根据字段英文名匹配逻辑模型表、物理模型表和元素治理模板表的数据
"""
import json
import os
from collections import defaultdict
from typing import Dict, List, Any
def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON file that must contain a top-level array of records.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The parsed list. An empty list is returned, after printing an
        ``[ERROR]`` line, when the file cannot be opened or parsed, or when
        its top-level JSON value is not an array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Deliberate best-effort boundary: this function never raises, and
        # callers interpret an empty list as "loading failed".
        print(f"[ERROR] 加载文件失败 {file_path}: {e}")
        return []

    if not isinstance(data, list):
        # A top-level object or scalar would otherwise be handed to callers
        # that iterate the result as a list of dicts and fail far from here.
        print(f"[ERROR] 加载文件失败 {file_path}: 顶层JSON应为数组, 实际为 {type(data).__name__}")
        return []

    print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
    return data
def build_index(records: List[Dict], field_name: str) -> Dict[str, List[Dict]]:
    """Group records by the value they store under ``field_name``.

    Records whose value for ``field_name`` is missing or falsy (``None``,
    empty string, ``0`` ...) are left out of the index.

    Args:
        records: Dict records to index.
        field_name: Key whose value becomes the index key.

    Returns:
        A ``defaultdict(list)`` mapping each distinct value to the records
        that carry it, in their original order.
    """
    grouped = defaultdict(list)
    for rec in records:
        key = rec.get(field_name)
        if not key:
            # Nothing usable to index this record by.
            continue
        grouped[key].append(rec)

    print(f"[INFO] 建立索引完成: {len(grouped)} 个唯一字段值")
    return grouped
def merge_records_optimized(logical_index: Dict, physical_index: Dict, element_records: List[Dict]) -> List[Dict]:
    """Merge element-governance records with indexed logical/physical matches.

    Each element record with a truthy, not-yet-seen '字段英文名' yields one
    merged record; records without a name and repeated names are skipped.
    The merged record holds, in this order:

    * every element key except '表名';
    * the logical matches' keys (except '表名' / '字段英文名') prefixed with
      '逻辑模型_', followed by the '逻辑模型表_表名' marker;
    * the physical matches' keys prefixed with '物理模型_', followed by the
      '物理模型表_表名' marker;
    * the '元素治理表_表名' marker, which is always present.

    When a name has several matches in one index, later matches overwrite
    the same prefixed keys written by earlier ones.

    Args:
        logical_index: Maps a field name to its logical-model records.
        physical_index: Maps a field name to its physical-model records.
        element_records: Element-governance records driving the merge.

    Returns:
        The merged records, in order of first appearance.
    """
    excluded_keys = ('表名', '字段英文名')

    def absorb(target, matches, prefix, marker_key, marker_value):
        # Copy prefixed fields from every match; tag the source table only
        # when at least one match exists.
        if not matches:
            return
        for match in matches:
            target.update(
                (f"{prefix}{key}", value)
                for key, value in match.items()
                if key not in excluded_keys
            )
        target[marker_key] = marker_value

    results = []
    seen_names = set()

    print("\n[INFO] 开始合并数据...")
    for position, element in enumerate(element_records):
        # Progress line every 5000 input records.
        if position % 5000 == 0:
            print(f" 处理进度: {position}/{len(element_records)}")

        name = element.get('字段英文名')
        if not name:
            continue
        if name in seen_names:
            continue
        seen_names.add(name)

        # Start from the element record itself, minus its table name.
        combined = {key: value for key, value in element.items() if key != '表名'}

        absorb(combined, logical_index.get(name, []),
               '逻辑模型_', '逻辑模型表_表名', '远光数据架构逻辑模型表')
        absorb(combined, physical_index.get(name, []),
               '物理模型_', '物理模型表_表名', '远光数据架构物理模型表')

        # The element-governance marker is always present.
        combined['元素治理表_表名'] = '远光数据架构元素治理模板表'
        results.append(combined)

    print(f" 完成合并: {len(results)} 条记录")
    return results
def add_unmatched_records_optimized(merged_data: List[Dict],
                                    logical_index: Dict,
                                    physical_index: Dict) -> List[Dict]:
    """Append records for indexed fields that are absent from ``merged_data``.

    For every logical-index field name not already present in
    ``merged_data``, one record per logical match is appended. Physical
    matches are then appended the same way, but only for names that are
    neither in the original ``merged_data`` nor added from the logical
    index. A field that exists in both indexes (and not in ``merged_data``)
    therefore yields logical records only.

    Appended records hold '字段英文名', the source fields (except '表名' and
    '字段英文名') prefixed with '逻辑模型_' or '物理模型_', and the three
    ``*_表名`` markers, with ``None`` for the tables that did not contribute.

    Args:
        merged_data: Existing merged records; extended in place.
        logical_index: Maps a field name to its logical-model records.
        physical_index: Maps a field name to its physical-model records.

    Returns:
        The same ``merged_data`` list object, after extension.
    """
    print("\n[INFO] 处理未匹配的记录...")

    # Every name currently present in the result. It is kept up to date
    # while logical records are appended, so the physical pass can test
    # "already added" in O(1) instead of rescanning the growing list.
    known_names = {record.get('字段英文名') for record in merged_data}
    # Names counted as processed: the truthy ones present before extension.
    processed_fields = {name for name in known_names if name}

    # Logical-model fields that have no counterpart in merged_data.
    logical_unmatched = sum(1 for name in logical_index if name not in processed_fields)
    print(f" 逻辑模型表未匹配: {logical_unmatched}")
    for field_name, logical_matches in logical_index.items():
        if field_name in processed_fields:
            continue
        for logical_match in logical_matches:
            merged_record = {'字段英文名': field_name}
            for key, value in logical_match.items():
                if key not in ('表名', '字段英文名'):
                    merged_record[f"逻辑模型_{key}"] = value
            merged_record['逻辑模型表_表名'] = '远光数据架构逻辑模型表'
            merged_record['物理模型表_表名'] = None
            merged_record['元素治理表_表名'] = None
            merged_data.append(merged_record)
            known_names.add(field_name)

    # Physical-model fields that have no counterpart in merged_data. The
    # printed count is relative to the original content and so still
    # includes names that the logical pass has just added.
    physical_unmatched = sum(1 for name in physical_index if name not in processed_fields)
    print(f" 物理模型表未匹配: {physical_unmatched}")
    for field_name, physical_matches in physical_index.items():
        if field_name in known_names:
            # Already present, either originally or via the logical index.
            continue
        for physical_match in physical_matches:
            merged_record = {'字段英文名': field_name}
            for key, value in physical_match.items():
                if key not in ('表名', '字段英文名'):
                    merged_record[f"物理模型_{key}"] = value
            merged_record['逻辑模型表_表名'] = None
            merged_record['物理模型表_表名'] = '远光数据架构物理模型表'
            merged_record['元素治理表_表名'] = None
            merged_data.append(merged_record)

    return merged_data
def main():
    """Run the merge end to end.

    Loads the three exported JSON tables, indexes the logical and physical
    tables by '字段英文名', merges them onto the element-governance records,
    writes the result to ``Data_Export_Json/final.json`` and prints summary
    statistics. Returns early, without writing anything, if any of the
    three tables loads as empty.
    """
    banner = "=" * 60
    print(banner)
    print("优化版JSON文件合并工具")
    print(banner)

    # Input and output locations.
    logical_json_path = "Data_Export_Json/远光数据架构逻辑模型表.json"
    physical_json_path = "Data_Export_Json/远光数据架构物理模型表.json"
    element_json_path = "Data_Export_Json/远光数据架构元素治理模板表.json"
    output_path = "Data_Export_Json/final.json"

    # Load all three tables before checking any of them.
    print("\n[INFO] 加载JSON文件...")
    logical_records = load_json_file(logical_json_path)
    physical_records = load_json_file(physical_json_path)
    element_records = load_json_file(element_json_path)
    if not all((logical_records, physical_records, element_records)):
        print("\n[ERROR] 无法加载所有JSON文件")
        return

    # Index the two lookup tables by field name.
    print("\n[INFO] 建立索引加速查找...")
    logical_index = build_index(logical_records, '字段英文名')
    physical_index = build_index(physical_records, '字段英文名')

    # Only fields present in the element-governance table are merged;
    # unmatched logical/physical records are intentionally not appended.
    merged_data = merge_records_optimized(logical_index, physical_index, element_records)

    try:
        print(f"\n[INFO] 保存合并数据到 {output_path}...")
        with open(output_path, 'w', encoding='utf-8') as out_file:
            json.dump(merged_data, out_file, ensure_ascii=False, indent=2)

        size_kb = os.path.getsize(output_path) / 1024
        print("\n[OK] 合并完成!")
        print(f" 输出文件: {output_path}")
        print(f" 合并记录: {len(merged_data)}")
        print(f" 文件大小: {size_kb:.1f} KB")

        # One pass over the result: tally records by which of the three
        # table markers (element, logical, physical) are truthy.
        tally = defaultdict(int)
        for rec in merged_data:
            presence = (
                bool(rec.get('元素治理表_表名')),
                bool(rec.get('逻辑模型表_表名')),
                bool(rec.get('物理模型表_表名')),
            )
            tally[presence] += 1

        three_table_match = tally[(True, True, True)]
        element_logical_match = tally[(True, True, False)]
        element_physical_match = tally[(True, False, True)]
        element_only_match = tally[(True, False, False)]
        # The "only" counters ignore the remaining third table.
        logical_only_count = tally[(False, True, True)] + tally[(False, True, False)]
        physical_only_count = tally[(False, True, True)] + tally[(False, False, True)]

        print("\n[INFO] 统计信息:")
        print(f" 三表匹配: {three_table_match}")
        print(f" 元素治理+逻辑模型: {element_logical_match}")
        print(f" 元素治理+物理模型: {element_physical_match}")
        print(f" 仅元素治理: {element_only_match}")
        print(f" 仅逻辑模型: {logical_only_count}")
        print(f" 仅物理模型: {physical_only_count}")

        # Show the shape of the first merged record (first 10 keys only).
        if merged_data:
            print("\n[INFO] 合并记录示例:")
            first_record = merged_data[0]
            print(f" 字段数量: {len(first_record)}")
            print(f" 字段名: {list(first_record.keys())[:10]}...")
    except Exception as e:
        print(f"\n[ERROR] 保存文件失败: {e}")
        import traceback
        traceback.print_exc()
# Script entry point: run the merge only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()