# NOTE(review): the ten lines that originally preceded the shebang were
# GitHub web-UI residue captured by a copy-paste ("Files", file size,
# "Raw Blame History", and the ambiguous-Unicode warning banner). They are
# not part of the module and are not valid Python; they have been removed
# so the file can be imported.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据处理一体化工具
功能1Excel/CSV转JSON - 读取Excel/CSV文件并转换为JSON
功能2JSON合并 - 根据字段英文名匹配逻辑模型表、物理模型表和元素治理模板表的数据
功能3随机抽取 - 从合并后的JSON中随机抽取指定数量的记录
支持多种Excel读取方式自动处理复杂格式
"""
import pandas as pd
import json
import os
import glob
import subprocess
import xlwings as xw
import random
from datetime import datetime
from collections import defaultdict
from typing import Optional, Dict, List, Tuple, Any
class ExcelToJsonConverter:
    """Excel/CSV to JSON converter.

    Scans an input directory for ``*.xlsx`` (or ``*.csv``) tables and writes
    one JSON file per table into the output directory.  During conversion,
    ASCII-only column names and the ASCII letters inside string values are
    lower-cased, NaN cells become ``null``, and a ``表名`` (table name)
    field carrying the source file's base name is appended to every record.
    """

    def __init__(self, input_dir: str, output_dir: str):
        """
        Args:
            input_dir: directory that holds the source Excel files.
            output_dir: directory that receives the generated JSON files
                (created automatically when missing).
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        # Ensure the output directory exists (idempotent).
        os.makedirs(output_dir, exist_ok=True)
        # Temp CSV directory; lazily created, used only in Excel mode.
        self.temp_csv_dir: Optional[str] = None

    def find_excel_files(self) -> List[Tuple[str, str]]:
        """Return ``(path, base_name)`` for every ``*.xlsx`` in ``input_dir``."""
        excel_files: List[Tuple[str, str]] = []
        search_pattern = os.path.join(self.input_dir, "*.xlsx")
        for excel_path in glob.glob(search_pattern):
            filename = os.path.basename(excel_path)
            # Skip Excel lock/temp files (their names start with '~$').
            if filename.startswith('~$'):
                # BUG FIX: originally printed the literal "(unknown)"
                # instead of the skipped file's name.
                print(f"[SKIP] 跳过临时文件: {filename}")
                continue
            base_name = filename.replace('.xlsx', '')
            excel_files.append((excel_path, base_name))
        return excel_files

    def read_excel_with_xlwings(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Read the first sheet of *excel_path* via xlwings; None on failure."""
        app = None
        try:
            print(f" [TRY] 使用xlwings读取...")
            app = xw.App(visible=False)
            wb = app.books.open(excel_path)
            data = wb.sheets[0].range('A1').expand().value
            wb.close()
            if data and len(data) > 0:
                if isinstance(data[0], list):
                    # Regular table: first row is the header.
                    headers = data[0]
                    rows = data[1:] if len(data) > 1 else []
                    return pd.DataFrame(rows, columns=headers)
                # Degenerate layout: one value per row.
                return pd.DataFrame(data, columns=['内容'])
            return None
        except ImportError:
            print(f" [WARN] xlwings未安装")
            return None
        except Exception as e:
            print(f" [WARN] xlwings读取失败: {str(e)[:100]}")
            return None
        finally:
            # BUG FIX: quit the Excel instance even when reading fails,
            # otherwise background Excel processes leak.
            if app is not None:
                try:
                    app.quit()
                except Exception:
                    pass

    def read_excel_with_libreoffice(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Convert *excel_path* to CSV with headless LibreOffice, then load it."""
        try:
            print(f" [TRY] 使用LibreOffice转换...")
            # BUG FIX: ``--convert-to csv`` writes ``<base>.csv`` into
            # ``--outdir``; the original waited for ``<base>_temp.csv``, a
            # file LibreOffice never creates, so this reader could never
            # succeed.
            csv_path = os.path.splitext(excel_path)[0] + '.csv'
            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'csv',
                '--outdir', os.path.dirname(excel_path),
                excel_path,
            ]
            subprocess.run(cmd, capture_output=True, text=True, timeout=30)
            if not os.path.exists(csv_path):
                print(f" [WARN] LibreOffice转换失败")
                return None
            df = pd.read_csv(csv_path, encoding='utf-8')
            os.remove(csv_path)  # clean up the intermediate CSV
            print(f" [OK] LibreOffice转换成功")
            return df
        except FileNotFoundError:
            print(f" [WARN] LibreOffice未安装")
            return None
        except subprocess.TimeoutExpired:
            print(f" [WARN] LibreOffice转换超时")
            return None
        except Exception as e:
            print(f" [WARN] LibreOffice转换失败: {e}")
            return None

    def read_excel_with_pandas(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Read *excel_path* with pandas, trying openpyxl then xlrd."""
        for engine in ('openpyxl', 'xlrd'):
            try:
                print(f" [TRY] 使用pandas ({engine})读取...")
                df = pd.read_excel(excel_path, engine=engine)
                print(f" [OK] pandas ({engine}) 读取成功")
                return df
            except Exception as e:
                print(f" [WARN] pandas ({engine}) 失败: {str(e)[:100]}")
        return None

    def read_excel_file(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Try each reader in priority order until one yields non-empty data.

        Args:
            excel_path: path of the workbook to read.
        Returns:
            The loaded DataFrame, or None when every reader fails.
        """
        print(f"\n[INFO] 读取文件: {os.path.basename(excel_path)}")
        # CLEANUP: the original method table contained a dead
        # "pandas-openpyxl" lambda and a "pandas-xlrd" entry that was
        # unconditionally skipped inside the loop; this is the order it
        # effectively ran: xlwings, then pandas, then LibreOffice.
        methods = [
            ("xlwings", self.read_excel_with_xlwings),
            ("pandas", self.read_excel_with_pandas),
            ("LibreOffice", self.read_excel_with_libreoffice),
        ]
        for method_name, method_func in methods:
            try:
                df = method_func(excel_path)
                if df is not None and not df.empty:
                    print(f"[OK] {method_name} 成功读取!")
                    print(f" 数据形状: {df.shape[0]}× {df.shape[1]}")
                    return df
            except Exception as e:
                print(f"[WARN] {method_name} 失败: {str(e)[:100]}")
        print(f"[ERROR] 所有读取方法都失败了")
        return None

    def _dataframe_to_records(self, df: pd.DataFrame, base_name: str) -> List[Dict[str, Any]]:
        """Turn *df* into JSON-ready dicts (shared by both CSV->JSON paths).

        Rules (unchanged from the previously duplicated inline logic):
        * NaN cells become None, stored under the original column name.
        * String values containing ASCII letters are lower-cased.
        * Column names whose alphanumeric characters are all ASCII are
          lower-cased; names containing CJK characters are kept verbatim.
        * A '表名' field carrying *base_name* is appended to every record.
        """
        records: List[Dict[str, Any]] = []
        for _, row in df.iterrows():
            json_obj: Dict[str, Any] = {}
            for column in df.columns:
                value = row[column]
                if pd.isna(value):
                    json_obj[column] = None
                    continue
                if isinstance(value, str) and any(c.isalpha() and ord(c) < 128 for c in value):
                    value = value.lower()
                if all(ord(c) < 128 for c in column if c.isalnum() or c in '_'):
                    json_obj[column.lower()] = value
                else:
                    json_obj[column] = value
            json_obj['表名'] = base_name
            records.append(json_obj)
        return records

    def _write_json(self, json_data: List[Dict[str, Any]], base_name: str) -> str:
        """Write *json_data* to ``<output_dir>/<base_name>.json``; return its path."""
        json_filename = f"{base_name}.json"
        json_path = os.path.join(self.output_dir, json_filename)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        file_size = os.path.getsize(json_path) / 1024  # KB
        print(f" [OK] JSON已生成: {json_filename} ({file_size:.1f} KB)")
        print(f" 数据量: {len(json_data)} 条记录")
        return json_path

    def convert_to_csv(self, df: pd.DataFrame, base_name: str) -> str:
        """Save *df* as ``<temp_csv_dir>/<base_name>.csv`` and return the path."""
        if self.temp_csv_dir is None:
            self.temp_csv_dir = os.path.join(self.output_dir, "temp_csv")
        os.makedirs(self.temp_csv_dir, exist_ok=True)
        csv_filename = f"{base_name}.csv"
        csv_path = os.path.join(self.temp_csv_dir, csv_filename)
        # utf-8-sig so the intermediate CSV round-trips Chinese text.
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        file_size = os.path.getsize(csv_path) / 1024  # KB
        print(f" [OK] CSV已生成: {csv_filename} ({file_size:.1f} KB)")
        return csv_path

    def convert_csv_to_json(self, csv_path: str, base_name: str) -> str:
        """Convert a utf-8-sig CSV file to JSON; return '' on failure."""
        try:
            df = pd.read_csv(csv_path, encoding='utf-8-sig')
            if df.empty:
                print(f" [WARN] CSV文件为空")
                return ""
            return self._write_json(self._dataframe_to_records(df, base_name), base_name)
        except Exception as e:
            print(f" [ERROR] CSV转JSON失败: {e}")
            import traceback
            traceback.print_exc()
            return ""

    def convert_csv_to_json_direct(self, csv_path: str, base_name: str) -> str:
        """Convert a CSV of unknown encoding straight to JSON; '' on failure."""
        try:
            df = None
            # Encodings most likely for exported Chinese data, best first.
            for encoding in ('utf-8-sig', 'gb2312', 'gbk', 'utf-8'):
                try:
                    print(f" [TRY] 尝试编码: {encoding}")
                    df = pd.read_csv(csv_path, encoding=encoding)
                    print(f" [OK] 编码 {encoding} 读取成功")
                    break
                except (UnicodeDecodeError, UnicodeError):
                    print(f" [WARN] 编码 {encoding} 失败")
                except Exception as e:
                    print(f" [WARN] 编码 {encoding} 其他错误: {str(e)[:50]}")
            if df is None:
                print(f" [ERROR] 所有编码都失败无法读取CSV文件")
                return ""
            if df.empty:
                print(f" [WARN] CSV文件为空")
                return ""
            return self._write_json(self._dataframe_to_records(df, base_name), base_name)
        except Exception as e:
            print(f" [ERROR] CSV转JSON失败: {e}")
            import traceback
            traceback.print_exc()
            return ""

    def process_single_file(self, excel_path: str, base_name: str) -> bool:
        """Process one Excel file (Excel -> CSV -> JSON); return success."""
        print(f"\n{'='*60}")
        print(f"处理: {os.path.basename(excel_path)}")
        print(f"{'='*60}")
        df = self.read_excel_file(excel_path)
        if df is None:
            print(f"[ERROR] 读取失败,跳过此文件")
            return False
        print(f"\n[INFO] 数据预览:")
        print(df.head(3))
        csv_path = self.convert_to_csv(df, base_name)
        json_path = self.convert_csv_to_json(csv_path, base_name)
        if json_path:
            print(f"\n[OK] 转换完成!")
            return True
        print(f"\n[ERROR] 转换失败")
        return False

    def _list_generated_json(self) -> None:
        """Print every JSON file currently present in the output directory."""
        print(f"\n生成的JSON文件:")
        for json_file in sorted(glob.glob(os.path.join(self.output_dir, "*.json"))):
            file_size = os.path.getsize(json_file) / 1024  # KB
            filename = os.path.basename(json_file)
            # BUG FIX: originally printed the literal "(unknown)" here
            # instead of the file name.
            print(f" - {filename} ({file_size:.1f} KB)")

    def process_all(self) -> Dict:
        """Convert every Excel file in ``input_dir``; return a summary dict."""
        print("="*60)
        print("Excel转JSON一体化工具")
        print("="*60)
        print(f"输入目录: {self.input_dir}")
        print(f"输出目录: {self.output_dir}")
        excel_files = self.find_excel_files()
        if not excel_files:
            print(f"\n[WARN] 未找到任何Excel文件")
            return {'total': 0, 'success': 0, 'failed': 0}
        print(f"\n[INFO] 发现 {len(excel_files)} 个Excel文件")
        success_count = 0
        failed_count = 0
        results = []
        for excel_path, base_name in excel_files:
            if self.process_single_file(excel_path, base_name):
                success_count += 1
                results.append({'file': os.path.basename(excel_path), 'status': 'success'})
            else:
                failed_count += 1
                results.append({'file': os.path.basename(excel_path), 'status': 'failed'})
        print(f"\n{'='*60}")
        print("转换完成!")
        print(f"{'='*60}")
        print(f"总计: {len(excel_files)} 个文件")
        print(f"成功: {success_count} 个文件")
        print(f"失败: {failed_count} 个文件")
        if success_count > 0:
            self._list_generated_json()
        return {
            'total': len(excel_files),
            'success': success_count,
            'failed': failed_count,
            'results': results
        }

    def find_csv_files(self, csv_dir: str) -> List[Tuple[str, str]]:
        """Return ``(path, base_name)`` for every ``*.csv`` in *csv_dir*."""
        csv_files: List[Tuple[str, str]] = []
        for csv_path in glob.glob(os.path.join(csv_dir, "*.csv")):
            base_name = os.path.basename(csv_path).replace('.csv', '')
            csv_files.append((csv_path, base_name))
        return csv_files

    def process_single_csv(self, csv_path: str, base_name: str) -> bool:
        """Process one CSV file (preview, then CSV -> JSON); return success."""
        print(f"\n{'='*60}")
        print(f"处理: {os.path.basename(csv_path)}")
        print(f"{'='*60}")
        try:
            df = None
            for encoding in ('utf-8-sig', 'gb2312', 'gbk', 'utf-8'):
                try:
                    df = pd.read_csv(csv_path, encoding=encoding)
                    break
                except (UnicodeDecodeError, UnicodeError):
                    continue
                except Exception as e:
                    print(f"[ERROR] 编码 {encoding} 错误: {e}")
            if df is None or df.empty:
                print(f"[ERROR] CSV文件为空或读取失败")
                return False
            print(f"\n[INFO] 数据预览:")
            print(df.head(3))
            print(f"\n[INFO] 数据形状: {df.shape[0]}× {df.shape[1]}")
        except Exception as e:
            print(f"[ERROR] 读取CSV失败: {e}")
            return False
        json_path = self.convert_csv_to_json_direct(csv_path, base_name)
        if json_path:
            print(f"\n[OK] 转换完成!")
            return True
        print(f"\n[ERROR] 转换失败")
        return False

    def convert_csv_directory(self, csv_dir: str) -> Dict:
        """Convert every CSV file in *csv_dir* to JSON; return a summary dict."""
        print("="*60)
        print("CSV转JSON工具")
        print("="*60)
        print(f"CSV输入目录: {csv_dir}")
        print(f"JSON输出目录: {self.output_dir}")
        csv_files = self.find_csv_files(csv_dir)
        if not csv_files:
            print(f"\n[WARN] 未找到任何CSV文件")
            return {'total': 0, 'success': 0, 'failed': 0}
        print(f"\n[INFO] 发现 {len(csv_files)} 个CSV文件")
        success_count = 0
        failed_count = 0
        results = []
        for csv_path, base_name in csv_files:
            if self.process_single_csv(csv_path, base_name):
                success_count += 1
                results.append({'file': os.path.basename(csv_path), 'status': 'success'})
            else:
                failed_count += 1
                results.append({'file': os.path.basename(csv_path), 'status': 'failed'})
        print(f"\n{'='*60}")
        print("转换完成!")
        print(f"{'='*60}")
        print(f"总计: {len(csv_files)} 个文件")
        print(f"成功: {success_count} 个文件")
        print(f"失败: {failed_count} 个文件")
        if success_count > 0:
            self._list_generated_json()
        return {
            'total': len(csv_files),
            'success': success_count,
            'failed': failed_count,
            'results': results
        }
class JsonMerger:
    """Merge the logical-model, physical-model and element-governance JSON
    tables into one record set, keyed by the English field name (字段英文名)."""

    def __init__(self, output_dir: str):
        # Directory that holds the three input JSON files and the output.
        self.output_dir = output_dir

    def load_json_file(self, file_path: str) -> List[Dict[str, Any]]:
        """Load a JSON array from *file_path*; return an empty list on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                records = json.load(fh)
            print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(records)} 条记录")
            return records
        except Exception as exc:
            print(f"[ERROR] 加载文件失败 {file_path}: {exc}")
            return []

    def build_index(self, records: List[Dict], field_name: str) -> Dict[str, List[Dict]]:
        """Group *records* by the truthy value of *field_name* for O(1) lookup."""
        index: Dict[str, List[Dict]] = defaultdict(list)
        for rec in records:
            key = rec.get(field_name)
            if key:
                index[key].append(rec)
        print(f"[INFO] 建立索引完成: {len(index)} 个唯一字段值")
        return index

    def merge_records_optimized(self, logical_index: Dict, physical_index: Dict, element_records: List[Dict]) -> List[Dict]:
        """Merge the three tables, driven by the element-governance rows.

        Each distinct 字段英文名 is emitted exactly once; matching
        logical/physical rows contribute prefixed columns plus a
        table-name marker field.
        """
        merged: List[Dict] = []
        seen: set = set()
        print(f"\n[INFO] 开始合并数据...")
        total = len(element_records)
        for pos, element_row in enumerate(element_records):
            if pos % 5000 == 0:
                print(f" 处理进度: {pos}/{total}")
            eng_name = element_row.get('字段英文名')
            if not eng_name or eng_name in seen:
                continue
            seen.add(eng_name)
            # Start from the element-governance columns (minus its own 表名).
            combined = {k: v for k, v in element_row.items() if k != '表名'}
            # Fold in logical-model matches under a disambiguating prefix.
            logical_hits = logical_index.get(eng_name, [])
            if logical_hits:
                for hit in logical_hits:
                    for k, v in hit.items():
                        if k not in ('表名', '字段英文名'):
                            combined[f"逻辑模型_{k}"] = v
                # Table-name marker only when something actually matched.
                combined['逻辑模型表_表名'] = '远光数据架构逻辑模型表'
            # Fold in physical-model matches the same way.
            physical_hits = physical_index.get(eng_name, [])
            if physical_hits:
                for hit in physical_hits:
                    for k, v in hit.items():
                        if k not in ('表名', '字段英文名'):
                            combined[f"物理模型_{k}"] = v
                combined['物理模型表_表名'] = '远光数据架构物理模型表'
            # The element-governance marker is always present.
            combined['元素治理表_表名'] = '远光数据架构元素治理模板表'
            merged.append(combined)
        print(f" 完成合并: {len(merged)} 条记录")
        return merged

    def merge_all(self, logical_file: str, physical_file: str, element_file: str, output_file: str) -> Dict:
        """Run the full merge pipeline and write *output_file*; return a summary."""
        print("="*60)
        print("优化版JSON文件合并工具")
        print("="*60)
        # Resolve all four paths inside the working directory.
        logical_json_path = os.path.join(self.output_dir, logical_file)
        physical_json_path = os.path.join(self.output_dir, physical_file)
        element_json_path = os.path.join(self.output_dir, element_file)
        output_path = os.path.join(self.output_dir, output_file)
        print("\n[INFO] 加载JSON文件...")
        logical_records = self.load_json_file(logical_json_path)
        physical_records = self.load_json_file(physical_json_path)
        element_records = self.load_json_file(element_json_path)
        if not (logical_records and physical_records and element_records):
            print("\n[ERROR] 无法加载所有JSON文件")
            return {'success': False, 'merged_count': 0}
        print(f"\n[INFO] 建立索引加速查找...")
        logical_index = self.build_index(logical_records, '字段英文名')
        physical_index = self.build_index(physical_records, '字段英文名')
        merged_data = self.merge_records_optimized(logical_index, physical_index, element_records)
        try:
            print(f"\n[INFO] 保存合并数据到 {output_path}...")
            with open(output_path, 'w', encoding='utf-8') as fh:
                json.dump(merged_data, fh, ensure_ascii=False, indent=2)
            file_size = os.path.getsize(output_path) / 1024  # KB
            print(f"\n[OK] 合并完成!")
            print(f" 输出文件: {output_path}")
            print(f" 合并记录: {len(merged_data)}")
            print(f" 文件大小: {file_size:.1f} KB")

            # Tally how many records matched which combination of tables.
            def _count(pred) -> int:
                return sum(1 for rec in merged_data if pred(rec))

            three_table_match = _count(lambda r: r.get('元素治理表_表名') and r.get('逻辑模型表_表名') and r.get('物理模型表_表名'))
            element_logical_match = _count(lambda r: r.get('元素治理表_表名') and r.get('逻辑模型表_表名') and not r.get('物理模型表_表名'))
            element_physical_match = _count(lambda r: r.get('元素治理表_表名') and r.get('物理模型表_表名') and not r.get('逻辑模型表_表名'))
            element_only_match = _count(lambda r: r.get('元素治理表_表名') and not r.get('逻辑模型表_表名') and not r.get('物理模型表_表名'))
            print(f"\n[INFO] 统计信息:")
            print(f" 三表匹配: {three_table_match}")
            print(f" 元素治理+逻辑模型: {element_logical_match}")
            print(f" 元素治理+物理模型: {element_physical_match}")
            print(f" 仅元素治理: {element_only_match}")
            if merged_data:
                print(f"\n[INFO] 合并记录示例:")
                sample_record = merged_data[0]
                print(f" 字段数量: {len(sample_record)}")
                print(f" 字段名: {list(sample_record.keys())[:10]}...")  # first 10 fields only
            return {
                'success': True,
                'merged_count': len(merged_data),
                'output_file': output_path,
                'file_size_kb': file_size,
                'statistics': {
                    '三表匹配': three_table_match,
                    '元素治理+逻辑模型': element_logical_match,
                    '元素治理+物理模型': element_physical_match,
                    '仅元素治理': element_only_match
                }
            }
        except Exception as exc:
            print(f"\n[ERROR] 保存文件失败: {exc}")
            import traceback
            traceback.print_exc()
            return {'success': False, 'merged_count': 0}
class RandomSelector:
    """Randomly sample a fixed number of records from a merged JSON file."""

    def __init__(self, output_dir: str, random_seed: int = 42, select_count: int = 3000):
        self.output_dir = output_dir
        self.random_seed = random_seed    # fixed seed -> reproducible draw
        self.select_count = select_count  # number of records to extract

    def load_json_file(self, file_path: str) -> List[Dict[str, Any]]:
        """Load a JSON array from *file_path*; return an empty list on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                records = json.load(fh)
            print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(records)} 条记录")
            return records
        except Exception as exc:
            print(f"[ERROR] 加载文件失败 {file_path}: {exc}")
            return []

    def random_select(self, records: List[Dict[str, Any]], count: int) -> List[Dict[str, Any]]:
        """Draw *count* records from *records* using the configured seed.

        Args:
            records: pool of records to sample from.
            count: number of records to draw.
        Returns:
            The sampled records, or the whole pool when *count* covers it.
        """
        random.seed(self.random_seed)
        if count >= len(records):
            # Nothing to sample -- the request covers the entire pool.
            print(f"[WARN] 抽取数量 ({count}) 大于等于总记录数 ({len(records)}),返回所有记录")
            return records
        chosen = random.sample(records, count)
        print(f"[OK] 从 {len(records)} 条记录中随机抽取 {count}")
        return chosen

    def select_random(self, input_file: str, output_file: str) -> Dict:
        """Sample from *input_file* and write the result; return a summary dict."""
        print("="*60)
        print("随机抽取工具")
        print("="*60)
        input_path = os.path.join(self.output_dir, input_file)
        output_path = os.path.join(self.output_dir, output_file)
        print(f"\n[INFO] 配置:")
        print(f" 随机种子: {self.random_seed}")
        print(f" 抽取数量: {self.select_count}")
        if not os.path.exists(input_path):
            print(f"\n[ERROR] 输入文件不存在: {input_path}")
            return {'success': False, 'selected_count': 0}
        print(f"\n[INFO] 加载数据...")
        records = self.load_json_file(input_path)
        if not records:
            print(f"\n[ERROR] 无法加载数据或数据为空")
            return {'success': False, 'selected_count': 0}
        print(f"\n[INFO] 执行随机抽取...")
        selected_records = self.random_select(records, self.select_count)
        try:
            with open(output_path, 'w', encoding='utf-8') as fh:
                json.dump(selected_records, fh, ensure_ascii=False, indent=2)
            file_size = os.path.getsize(output_path) / 1024  # KB
            print(f"\n[OK] 抽取完成!")
            print(f" 输出文件: {output_path}")
            print(f" 记录数量: {len(selected_records)}")
            print(f" 文件大小: {file_size:.1f} KB")
            if selected_records:
                print(f"\n[INFO] 抽取记录示例:")
                sample = selected_records[0]
                print(f" 字段数量: {len(sample)}")
                print(f" 字段名: {list(sample.keys())[:10]}...")

            # Tally match combinations by key presence in each record.
            def _count(pred) -> int:
                return sum(1 for rec in selected_records if pred(rec))

            three_table_match = _count(lambda r: '元素治理表_表名' in r and '逻辑模型表_表名' in r and '物理模型表_表名' in r)
            element_logical_match = _count(lambda r: '元素治理表_表名' in r and '逻辑模型表_表名' in r and '物理模型表_表名' not in r)
            element_physical_match = _count(lambda r: '元素治理表_表名' in r and '物理模型表_表名' in r and '逻辑模型表_表名' not in r)
            element_only_match = _count(lambda r: '元素治理表_表名' in r and '逻辑模型表_表名' not in r and '物理模型表_表名' not in r)
            print(f"\n[INFO] 抽取记录统计:")
            print(f" 三表匹配: {three_table_match}")
            print(f" 元素治理+逻辑模型: {element_logical_match}")
            print(f" 元素治理+物理模型: {element_physical_match}")
            print(f" 仅元素治理: {element_only_match}")
            return {
                'success': True,
                'selected_count': len(selected_records),
                'output_file': output_path,
                'file_size_kb': file_size,
                'statistics': {
                    '三表匹配': three_table_match if selected_records else 0,
                    '元素治理+逻辑模型': element_logical_match if selected_records else 0,
                    '元素治理+物理模型': element_physical_match if selected_records else 0,
                    '仅元素治理': element_only_match if selected_records else 0
                }
            }
        except Exception as exc:
            print(f"\n[ERROR] 保存文件失败: {exc}")
            import traceback
            traceback.print_exc()
            return {'success': False, 'selected_count': 0}
def main():
    """Entry point: run convert -> merge -> random-sample as one pipeline."""
    # Fixed working directories for the three pipeline stages.
    input_dir = "Data"
    csv_input_dir = "Data_Export_CSV"
    output_dir = "Data_Export_Json"
    converter = ExcelToJsonConverter(input_dir, output_dir)

    # Step 1: Excel/CSV -> JSON (a populated CSV directory takes priority).
    print("\n" + "="*60)
    print("步骤1: Excel/CSV转JSON")
    print("="*60)
    if os.path.exists(csv_input_dir) and os.listdir(csv_input_dir):
        print(f"\n[INFO] 检测到CSV文件使用CSV模式")
        print(f"{csv_input_dir} 读取CSV文件")
        result = converter.convert_csv_directory(csv_input_dir)
    elif converter.find_excel_files():
        # Fallback: no CSVs, but Excel workbooks are available.
        print(f"\n[INFO] 未找到CSV文件使用Excel模式")
        print(f"{input_dir} 读取Excel文件")
        result = converter.process_all()
    else:
        print(f"\n[WARN] 未找到CSV文件和Excel文件")
        result = {'total': 0, 'success': 0, 'failed': 0}
    print(f"\n[INFO] 转换结果: {result}")

    # Step 2: merge the three converted tables into one JSON file.
    print("\n" + "="*60)
    print("步骤2: JSON合并")
    print("="*60)
    merger = JsonMerger(output_dir)
    merge_result = merger.merge_all(
        logical_file="远光数据架构逻辑模型表.json",
        physical_file="远光数据架构物理模型表.json",
        element_file="远光数据架构元素治理模板表.json",
        output_file="final.json"
    )

    # Step 3: draw a reproducible random sample from the merged file.
    print("\n" + "="*60)
    print("步骤3: 随机抽取")
    print("="*60)
    selector = RandomSelector(output_dir, random_seed=42, select_count=3000)
    select_result = selector.select_random(
        input_file="final.json",
        output_file="selected.json"
    )

    # Final report of all three stages.
    print("\n" + "="*60)
    print("处理完成!")
    print("="*60)
    print(f"Excel/CSV转JSON: {result}")
    print(f"JSON合并: {merge_result}")
    print(f"随机抽取: {select_result}")


if __name__ == "__main__":
    main()