修改了一些文件名和文件信息,增加了requirements
This commit is contained in:
970
csv2json.py
Normal file
970
csv2json.py
Normal file
@@ -0,0 +1,970 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据处理一体化工具
|
||||
功能1:Excel/CSV转JSON - 读取Excel/CSV文件并转换为JSON
|
||||
功能2:JSON合并 - 根据字段英文名匹配逻辑模型表、物理模型表和元素治理模板表的数据
|
||||
功能3:随机抽取 - 从合并后的JSON中随机抽取指定数量的记录
|
||||
支持多种Excel读取方式,自动处理复杂格式
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
import subprocess
|
||||
import xlwings as xw
|
||||
import random
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Dict, List, Tuple, Any
|
||||
|
||||
|
||||
class ExcelToJsonConverter:
    """Convert Excel/CSV source files into per-table JSON files.

    Excel reading strategy (in priority order): xlwings, pandas
    (openpyxl/xlrd), headless LibreOffice conversion.  All tabular data is
    normalised the same way before serialisation: ASCII letters in string
    values are lower-cased, pure-ASCII column names are lower-cased, NaN
    becomes None, and a '表名' (source table name) field is appended to
    every record.
    """

    def __init__(self, input_dir: str, output_dir: str):
        """Initialise the converter.

        Args:
            input_dir: directory holding the source .xlsx files.
            output_dir: directory receiving the generated .json files
                (created on demand).
        """
        self.input_dir = input_dir
        self.output_dir = output_dir

        # Make sure the output directory exists.
        os.makedirs(output_dir, exist_ok=True)

        # Temporary CSV directory, created lazily (Excel mode only).
        self.temp_csv_dir: Optional[str] = None

    def find_excel_files(self) -> List[Tuple[str, str]]:
        """Return (path, base_name) for every real .xlsx file in input_dir."""
        excel_files = []
        for excel_path in glob.glob(os.path.join(self.input_dir, "*.xlsx")):
            filename = os.path.basename(excel_path)

            # Excel lock/temp files start with '~$' and are never real workbooks.
            if filename.startswith('~$'):
                # BUG FIX: the original printed the literal '(unknown)'
                # instead of the file name.
                print(f"[SKIP] 跳过临时文件: {filename}")
                continue

            # splitext instead of str.replace: replace() would also mangle
            # a name that merely *contains* '.xlsx'.
            base_name = os.path.splitext(filename)[0]
            excel_files.append((excel_path, base_name))

        return excel_files

    def read_excel_with_xlwings(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Read the first sheet of a workbook via xlwings.

        Returns None on any failure (xlwings missing, file unreadable, ...).
        """
        app = None
        wb = None
        try:
            print(f" [TRY] 使用xlwings读取...")
            app = xw.App(visible=False)
            wb = app.books.open(excel_path)
            data = wb.sheets[0].range('A1').expand().value

            if data and len(data) > 0:
                if isinstance(data[0], list):
                    # Regular rectangular table: first row is the header.
                    headers = data[0]
                    rows = data[1:] if len(data) > 1 else []
                    return pd.DataFrame(rows, columns=headers)
                # Degenerate single-column sheet.
                return pd.DataFrame(data, columns=['内容'])
            return None

        except Exception as e:
            print(f" [WARN] xlwings读取失败: {str(e)[:100]}")
            return None
        finally:
            # BUG FIX: the original leaked the hidden Excel process whenever
            # an exception fired between App() and quit().
            if wb is not None:
                wb.close()
            if app is not None:
                app.quit()

    def read_excel_with_libreoffice(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Convert the workbook to CSV with headless LibreOffice, then read it."""
        try:
            print(f" [TRY] 使用LibreOffice转换...")

            # BUG FIX: 'libreoffice --convert-to csv --outdir D file.xlsx'
            # writes '<basename>.csv' into D; the original looked for
            # '<basename>_temp.csv', which never exists, so this reader
            # always reported failure.
            csv_path = os.path.splitext(excel_path)[0] + '.csv'

            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'csv',
                '--outdir', os.path.dirname(excel_path),
                excel_path,
            ]
            subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if os.path.exists(csv_path):
                df = pd.read_csv(csv_path, encoding='utf-8')
                os.remove(csv_path)  # delete the temporary artefact
                print(f" [OK] LibreOffice转换成功")
                return df

            print(f" [WARN] LibreOffice转换失败")
            return None

        except FileNotFoundError:
            print(f" [WARN] LibreOffice未安装")
            return None
        except subprocess.TimeoutExpired:
            print(f" [WARN] LibreOffice转换超时")
            return None
        except Exception as e:
            print(f" [WARN] LibreOffice转换失败: {e}")
            return None

    def read_excel_with_pandas(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Read the workbook with pandas, trying each installed engine."""
        for engine in ('openpyxl', 'xlrd'):
            try:
                print(f" [TRY] 使用pandas ({engine})读取...")
                df = pd.read_excel(excel_path, engine=engine)
                print(f" [OK] pandas ({engine}) 读取成功")
                return df
            except Exception as e:
                print(f" [WARN] pandas ({engine}) 失败: {str(e)[:100]}")
        return None

    def read_excel_file(self, excel_path: str) -> Optional[pd.DataFrame]:
        """Try every available reader in priority order.

        Args:
            excel_path: path to the .xlsx file.

        Returns:
            The first non-empty DataFrame, or None when every reader fails.
        """
        print(f"\n[INFO] 读取文件: {os.path.basename(excel_path)}")

        # BUG FIX: the original method list contained a dead lambda that
        # tested 'openpyxl' against the *file path* string and a
        # 'pandas-xlrd' entry that was unconditionally skipped; this is the
        # equivalent working priority order.
        methods = [
            ("xlwings", self.read_excel_with_xlwings),
            ("pandas", self.read_excel_with_pandas),
            ("LibreOffice", self.read_excel_with_libreoffice),
        ]

        for method_name, method_func in methods:
            try:
                df = method_func(excel_path)
            except Exception as e:
                print(f"[WARN] {method_name} 失败: {str(e)[:100]}")
                continue
            if df is not None and not df.empty:
                print(f"[OK] {method_name} 成功读取!")
                print(f" 数据形状: {df.shape[0]}行 × {df.shape[1]}列")
                return df

        print(f"[ERROR] 所有读取方法都失败了")
        return None

    def convert_to_csv(self, df: pd.DataFrame, base_name: str) -> str:
        """Write *df* to <output_dir>/temp_csv/<base_name>.csv.

        utf-8-sig keeps Excel happy with Chinese text.  Returns the CSV path.
        """
        if self.temp_csv_dir is None:
            self.temp_csv_dir = os.path.join(self.output_dir, "temp_csv")
        os.makedirs(self.temp_csv_dir, exist_ok=True)

        csv_filename = f"{base_name}.csv"
        csv_path = os.path.join(self.temp_csv_dir, csv_filename)
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')

        file_size = os.path.getsize(csv_path) / 1024  # KB
        print(f" [OK] CSV已生成: {csv_filename} ({file_size:.1f} KB)")
        return csv_path

    @staticmethod
    def _read_csv_any_encoding(csv_path: str, verbose: bool = True) -> Optional[pd.DataFrame]:
        """Read a CSV, trying common Chinese-friendly encodings in order.

        Returns None when every encoding fails.
        """
        for encoding in ('utf-8-sig', 'gb2312', 'gbk', 'utf-8'):
            try:
                if verbose:
                    print(f" [TRY] 尝试编码: {encoding}")
                df = pd.read_csv(csv_path, encoding=encoding)
                if verbose:
                    print(f" [OK] 编码 {encoding} 读取成功")
                return df
            except (UnicodeDecodeError, UnicodeError):
                if verbose:
                    print(f" [WARN] 编码 {encoding} 失败")
            except Exception as e:
                if verbose:
                    print(f" [WARN] 编码 {encoding} 其他错误: {str(e)[:50]}")
        return None

    @staticmethod
    def _normalize_record(row, columns, base_name: str) -> Dict[str, Any]:
        """Build one JSON record from a DataFrame row (shared normalisation)."""
        json_obj: Dict[str, Any] = {}
        for column in columns:
            # Pure-ASCII column names (letters/digits/underscore only) are
            # lower-cased; names containing Chinese are kept verbatim.
            if all(ord(c) < 128 for c in column if c.isalnum() or c in '_'):
                key = column.lower()
            else:
                key = column

            value = row[column]
            if pd.isna(value):
                # BUG FIX: the original kept the un-lowered column name for
                # null cells, producing inconsistent keys across records.
                json_obj[key] = None
            else:
                # ASCII letters inside string *values* are lower-cased too.
                if isinstance(value, str) and any(c.isalpha() and ord(c) < 128 for c in value):
                    value = value.lower()
                json_obj[key] = value

        # Record provenance: the source table name.
        json_obj['表名'] = base_name
        return json_obj

    def _dataframe_to_json_file(self, df: pd.DataFrame, base_name: str) -> str:
        """Serialise a DataFrame to <output_dir>/<base_name>.json and return the path."""
        json_data = [self._normalize_record(row, df.columns, base_name)
                     for _, row in df.iterrows()]

        json_filename = f"{base_name}.json"
        json_path = os.path.join(self.output_dir, json_filename)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)

        file_size = os.path.getsize(json_path) / 1024  # KB
        print(f" [OK] JSON已生成: {json_filename} ({file_size:.1f} KB)")
        print(f" 数据量: {len(json_data)} 条记录")
        return json_path

    def convert_csv_to_json(self, csv_path: str, base_name: str) -> str:
        """Convert a utf-8-sig CSV (produced by convert_to_csv) to JSON.

        Returns the JSON path, or '' on failure.
        """
        try:
            df = pd.read_csv(csv_path, encoding='utf-8-sig')
            if df.empty:
                print(f" [WARN] CSV文件为空")
                return ""
            return self._dataframe_to_json_file(df, base_name)
        except Exception as e:
            print(f" [ERROR] CSV转JSON失败: {e}")
            import traceback
            traceback.print_exc()
            return ""

    def convert_csv_to_json_direct(self, csv_path: str, base_name: str) -> str:
        """Convert an externally produced CSV (unknown encoding) to JSON.

        Returns the JSON path, or '' on failure.
        """
        try:
            df = self._read_csv_any_encoding(csv_path)
            if df is None:
                print(f" [ERROR] 所有编码都失败,无法读取CSV文件")
                return ""
            if df.empty:
                print(f" [WARN] CSV文件为空")
                return ""
            return self._dataframe_to_json_file(df, base_name)
        except Exception as e:
            print(f" [ERROR] CSV转JSON失败: {e}")
            import traceback
            traceback.print_exc()
            return ""

    def process_single_file(self, excel_path: str, base_name: str) -> bool:
        """Excel -> CSV -> JSON for a single workbook; True on success."""
        print(f"\n{'='*60}")
        print(f"处理: {os.path.basename(excel_path)}")
        print(f"{'='*60}")

        df = self.read_excel_file(excel_path)
        if df is None:
            print(f"[ERROR] 读取失败,跳过此文件")
            return False

        print(f"\n[INFO] 数据预览:")
        print(df.head(3))

        csv_path = self.convert_to_csv(df, base_name)
        json_path = self.convert_csv_to_json(csv_path, base_name)

        if json_path:
            print(f"\n[OK] 转换完成!")
            return True
        print(f"\n[ERROR] 转换失败")
        return False

    def _run_batch(self, files: List[Tuple[str, str]], handler) -> Dict:
        """Run *handler(path, base_name)* over every pair, print a summary.

        Shared by process_all and convert_csv_directory (the original
        duplicated this loop and report verbatim).
        """
        success_count = 0
        failed_count = 0
        results = []

        for path, base_name in files:
            if handler(path, base_name):
                success_count += 1
                results.append({'file': os.path.basename(path), 'status': 'success'})
            else:
                failed_count += 1
                results.append({'file': os.path.basename(path), 'status': 'failed'})

        print(f"\n{'='*60}")
        print("转换完成!")
        print(f"{'='*60}")
        print(f"总计: {len(files)} 个文件")
        print(f"成功: {success_count} 个文件")
        print(f"失败: {failed_count} 个文件")

        if success_count > 0:
            print(f"\n生成的JSON文件:")
            for json_file in sorted(glob.glob(os.path.join(self.output_dir, "*.json"))):
                file_size = os.path.getsize(json_file) / 1024  # KB
                filename = os.path.basename(json_file)
                # BUG FIX: the original printed the literal '(unknown)'
                # instead of the file name.
                print(f" - {filename} ({file_size:.1f} KB)")

        return {
            'total': len(files),
            'success': success_count,
            'failed': failed_count,
            'results': results,
        }

    def process_all(self) -> Dict:
        """Convert every Excel file in input_dir.

        Returns:
            Batch statistics: {'total', 'success', 'failed'[, 'results']}.
        """
        print("="*60)
        print("Excel转JSON一体化工具")
        print("="*60)
        print(f"输入目录: {self.input_dir}")
        print(f"输出目录: {self.output_dir}")

        excel_files = self.find_excel_files()
        if not excel_files:
            print(f"\n[WARN] 未找到任何Excel文件")
            return {'total': 0, 'success': 0, 'failed': 0}

        print(f"\n[INFO] 发现 {len(excel_files)} 个Excel文件")
        return self._run_batch(excel_files, self.process_single_file)

    def find_csv_files(self, csv_dir: str) -> List[Tuple[str, str]]:
        """Return (path, base_name) for every .csv file in *csv_dir*."""
        csv_files = []
        for csv_path in glob.glob(os.path.join(csv_dir, "*.csv")):
            base_name = os.path.splitext(os.path.basename(csv_path))[0]
            csv_files.append((csv_path, base_name))
        return csv_files

    def process_single_csv(self, csv_path: str, base_name: str) -> bool:
        """CSV -> JSON for a single file; True on success."""
        print(f"\n{'='*60}")
        print(f"处理: {os.path.basename(csv_path)}")
        print(f"{'='*60}")

        # Preview pass (encoding probed quietly here; the conversion step
        # prints the per-encoding diagnostics).
        df = self._read_csv_any_encoding(csv_path, verbose=False)
        if df is None or df.empty:
            print(f"[ERROR] CSV文件为空或读取失败")
            return False

        print(f"\n[INFO] 数据预览:")
        print(df.head(3))
        print(f"\n[INFO] 数据形状: {df.shape[0]}行 × {df.shape[1]}列")

        json_path = self.convert_csv_to_json_direct(csv_path, base_name)
        if json_path:
            print(f"\n[OK] 转换完成!")
            return True
        print(f"\n[ERROR] 转换失败")
        return False

    def convert_csv_directory(self, csv_dir: str) -> Dict:
        """Convert every CSV file in *csv_dir*.

        Returns:
            Batch statistics: {'total', 'success', 'failed'[, 'results']}.
        """
        print("="*60)
        print("CSV转JSON工具")
        print("="*60)
        print(f"CSV输入目录: {csv_dir}")
        print(f"JSON输出目录: {self.output_dir}")

        csv_files = self.find_csv_files(csv_dir)
        if not csv_files:
            print(f"\n[WARN] 未找到任何CSV文件")
            return {'total': 0, 'success': 0, 'failed': 0}

        print(f"\n[INFO] 发现 {len(csv_files)} 个CSV文件")
        return self._run_batch(csv_files, self.process_single_csv)
|
||||
|
||||
|
||||
class JsonMerger:
    """Merge the logical-model, physical-model and element-governance JSON
    tables into one record set, keyed on the English field name ('字段英文名')."""

    def __init__(self, output_dir: str):
        # All input and output files live under this directory.
        self.output_dir = output_dir

    def load_json_file(self, file_path: str) -> List[Dict[str, Any]]:
        """Load one JSON array file; return [] on any error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                records = json.load(fh)
            print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(records)} 条记录")
            return records
        except Exception as exc:
            print(f"[ERROR] 加载文件失败 {file_path}: {exc}")
            return []

    def build_index(self, records: List[Dict], field_name: str) -> Dict[str, List[Dict]]:
        """Group *records* by the value of *field_name* (falsy keys skipped)."""
        index: Dict[str, List[Dict]] = defaultdict(list)
        for rec in records:
            key = rec.get(field_name)
            if key:
                index[key].append(rec)
        print(f"[INFO] 建立索引完成: {len(index)} 个唯一字段值")
        return index

    def merge_records_optimized(self, logical_index: Dict, physical_index: Dict, element_records: List[Dict]) -> List[Dict]:
        """Merge the three tables, driven by the element-governance records.

        Each distinct '字段英文名' yields exactly one output record carrying
        the element fields plus prefixed copies of any matching logical- and
        physical-model fields.
        """
        merged_data: List[Dict] = []
        seen: set = set()
        total = len(element_records)

        print(f"\n[INFO] 开始合并数据...")
        for pos, element in enumerate(element_records):
            if pos % 5000 == 0:
                print(f" 处理进度: {pos}/{total}")

            name = element.get('字段英文名')
            # Skip records without a key and duplicates (first wins).
            if not name or name in seen:
                continue
            seen.add(name)

            # Start from the element record, minus its own table-name field.
            record = {k: v for k, v in element.items() if k != '表名'}

            # Fold in logical-model matches under a disambiguating prefix.
            logical_hits = logical_index.get(name, [])
            if logical_hits:
                for hit in logical_hits:
                    for k, v in hit.items():
                        if k not in ('表名', '字段英文名'):
                            record[f"逻辑模型_{k}"] = v
                # Table-name marker only when something matched.
                record['逻辑模型表_表名'] = '远光数据架构逻辑模型表'

            # Fold in physical-model matches the same way.
            physical_hits = physical_index.get(name, [])
            if physical_hits:
                for hit in physical_hits:
                    for k, v in hit.items():
                        if k not in ('表名', '字段英文名'):
                            record[f"物理模型_{k}"] = v
                record['物理模型表_表名'] = '远光数据架构物理模型表'

            # The element-governance marker is always present.
            record['元素治理表_表名'] = '远光数据架构元素治理模板表'
            merged_data.append(record)

        print(f" 完成合并: {len(merged_data)} 条记录")
        return merged_data

    def merge_all(self, logical_file: str, physical_file: str, element_file: str, output_file: str) -> Dict:
        """Load the three tables, merge them and write *output_file*."""
        print("="*60)
        print("优化版JSON文件合并工具")
        print("="*60)

        # Resolve all paths relative to the shared output directory.
        logical_json_path = os.path.join(self.output_dir, logical_file)
        physical_json_path = os.path.join(self.output_dir, physical_file)
        element_json_path = os.path.join(self.output_dir, element_file)
        output_path = os.path.join(self.output_dir, output_file)

        print("\n[INFO] 加载JSON文件...")
        logical_records = self.load_json_file(logical_json_path)
        physical_records = self.load_json_file(physical_json_path)
        element_records = self.load_json_file(element_json_path)

        if not (logical_records and physical_records and element_records):
            print("\n[ERROR] 无法加载所有JSON文件")
            return {'success': False, 'merged_count': 0}

        print(f"\n[INFO] 建立索引加速查找...")
        logical_index = self.build_index(logical_records, '字段英文名')
        physical_index = self.build_index(physical_records, '字段英文名')

        # Only fields present in the element-governance table are merged.
        merged_data = self.merge_records_optimized(logical_index, physical_index, element_records)

        try:
            print(f"\n[INFO] 保存合并数据到 {output_path}...")
            with open(output_path, 'w', encoding='utf-8') as fh:
                json.dump(merged_data, fh, ensure_ascii=False, indent=2)

            file_size = os.path.getsize(output_path) / 1024  # KB
            print(f"\n[OK] 合并完成!")
            print(f" 输出文件: {output_path}")
            print(f" 合并记录: {len(merged_data)} 条")
            print(f" 文件大小: {file_size:.1f} KB")

            # Classify every merged record by which source tables matched.
            def has(rec, key):
                return bool(rec.get(key))

            three_table_match = sum(1 for r in merged_data if has(r, '元素治理表_表名') and has(r, '逻辑模型表_表名') and has(r, '物理模型表_表名'))
            element_logical_match = sum(1 for r in merged_data if has(r, '元素治理表_表名') and has(r, '逻辑模型表_表名') and not has(r, '物理模型表_表名'))
            element_physical_match = sum(1 for r in merged_data if has(r, '元素治理表_表名') and has(r, '物理模型表_表名') and not has(r, '逻辑模型表_表名'))
            element_only_match = sum(1 for r in merged_data if has(r, '元素治理表_表名') and not has(r, '逻辑模型表_表名') and not has(r, '物理模型表_表名'))

            print(f"\n[INFO] 统计信息:")
            print(f" 三表匹配: {three_table_match} 条")
            print(f" 元素治理+逻辑模型: {element_logical_match} 条")
            print(f" 元素治理+物理模型: {element_physical_match} 条")
            print(f" 仅元素治理: {element_only_match} 条")

            if merged_data:
                print(f"\n[INFO] 合并记录示例:")
                sample_record = merged_data[0]
                print(f" 字段数量: {len(sample_record)}")
                print(f" 字段名: {list(sample_record.keys())[:10]}...")  # first 10 fields only

            return {
                'success': True,
                'merged_count': len(merged_data),
                'output_file': output_path,
                'file_size_kb': file_size,
                'statistics': {
                    '三表匹配': three_table_match,
                    '元素治理+逻辑模型': element_logical_match,
                    '元素治理+物理模型': element_physical_match,
                    '仅元素治理': element_only_match,
                },
            }

        except Exception as exc:
            print(f"\n[ERROR] 保存文件失败: {exc}")
            import traceback
            traceback.print_exc()
            return {'success': False, 'merged_count': 0}
|
||||
|
||||
|
||||
class RandomSelector:
    """Reproducibly sample a fixed number of records from a merged JSON file."""

    def __init__(self, output_dir: str, random_seed: int = 42, select_count: int = 3000):
        """Initialise the selector.

        Args:
            output_dir: directory holding both input and output JSON files.
            random_seed: seed that makes the sample reproducible.
            select_count: number of records to draw.
        """
        self.output_dir = output_dir
        self.random_seed = random_seed
        self.select_count = select_count

    def load_json_file(self, file_path: str) -> List[Dict[str, Any]]:
        """Load one JSON array file; return [] on any error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            print(f"[OK] 加载文件: {os.path.basename(file_path)} - {len(data)} 条记录")
            return data
        except Exception as e:
            print(f"[ERROR] 加载文件失败 {file_path}: {e}")
            return []

    def random_select(self, records: List[Dict[str, Any]], count: int) -> List[Dict[str, Any]]:
        """Randomly draw *count* records, seeded by self.random_seed.

        Args:
            records: source record list.
            count: number of records to draw.

        Returns:
            The sampled records, or *records* itself when count >= len(records).
        """
        # BUG FIX: the original called random.seed() on the module-level
        # RNG, silently reseeding global random state for the whole process.
        # A private Random(seed) instance yields the identical sample
        # sequence without that side effect.
        rng = random.Random(self.random_seed)

        # Asking for everything (or more): no sampling needed.
        if count >= len(records):
            print(f"[WARN] 抽取数量 ({count}) 大于等于总记录数 ({len(records)}),返回所有记录")
            return records

        selected = rng.sample(records, count)
        print(f"[OK] 从 {len(records)} 条记录中随机抽取 {count} 条")
        return selected

    def select_random(self, input_file: str, output_file: str) -> Dict:
        """Load *input_file*, sample self.select_count records, write *output_file*.

        Returns:
            {'success', 'selected_count'[, 'output_file', 'file_size_kb',
            'statistics']}.
        """
        print("="*60)
        print("随机抽取工具")
        print("="*60)

        # Resolve paths relative to the shared output directory.
        input_path = os.path.join(self.output_dir, input_file)
        output_path = os.path.join(self.output_dir, output_file)

        print(f"\n[INFO] 配置:")
        print(f" 随机种子: {self.random_seed}")
        print(f" 抽取数量: {self.select_count}")

        if not os.path.exists(input_path):
            print(f"\n[ERROR] 输入文件不存在: {input_path}")
            return {'success': False, 'selected_count': 0}

        print(f"\n[INFO] 加载数据...")
        records = self.load_json_file(input_path)
        if not records:
            print(f"\n[ERROR] 无法加载数据或数据为空")
            return {'success': False, 'selected_count': 0}

        print(f"\n[INFO] 执行随机抽取...")
        selected_records = self.random_select(records, self.select_count)

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(selected_records, f, ensure_ascii=False, indent=2)

            file_size = os.path.getsize(output_path) / 1024  # KB
            print(f"\n[OK] 抽取完成!")
            print(f" 输出文件: {output_path}")
            print(f" 记录数量: {len(selected_records)}")
            print(f" 文件大小: {file_size:.1f} KB")

            if selected_records:
                print(f"\n[INFO] 抽取记录示例:")
                sample = selected_records[0]
                print(f" 字段数量: {len(sample)}")
                print(f" 字段名: {list(sample.keys())[:10]}...")

            # Classify the sampled records by which source tables matched.
            three_table_match = sum(1 for r in selected_records if '元素治理表_表名' in r and '逻辑模型表_表名' in r and '物理模型表_表名' in r)
            element_logical_match = sum(1 for r in selected_records if '元素治理表_表名' in r and '逻辑模型表_表名' in r and '物理模型表_表名' not in r)
            element_physical_match = sum(1 for r in selected_records if '元素治理表_表名' in r and '物理模型表_表名' in r and '逻辑模型表_表名' not in r)
            element_only_match = sum(1 for r in selected_records if '元素治理表_表名' in r and '逻辑模型表_表名' not in r and '物理模型表_表名' not in r)

            print(f"\n[INFO] 抽取记录统计:")
            print(f" 三表匹配: {three_table_match} 条")
            print(f" 元素治理+逻辑模型: {element_logical_match} 条")
            print(f" 元素治理+物理模型: {element_physical_match} 条")
            print(f" 仅元素治理: {element_only_match} 条")

            return {
                'success': True,
                'selected_count': len(selected_records),
                'output_file': output_path,
                'file_size_kb': file_size,
                'statistics': {
                    '三表匹配': three_table_match if selected_records else 0,
                    '元素治理+逻辑模型': element_logical_match if selected_records else 0,
                    '元素治理+物理模型': element_physical_match if selected_records else 0,
                    '仅元素治理': element_only_match if selected_records else 0,
                },
            }

        except Exception as e:
            print(f"\n[ERROR] 保存文件失败: {e}")
            import traceback
            traceback.print_exc()
            return {'success': False, 'selected_count': 0}
|
||||
|
||||
|
||||
def main():
    """Demo driver: convert (step 1) -> merge (step 2) -> sample (step 3),
    printing progress throughout."""
    # Directory layout used by the demo pipeline.
    input_dir = "Data"
    csv_input_dir = "Data_Export_CSV"
    output_dir = "Data_Export_Json"

    converter = ExcelToJsonConverter(input_dir, output_dir)

    print("\n" + "="*60)
    print("步骤1: Excel/CSV转JSON")
    print("="*60)

    if os.path.exists(csv_input_dir) and os.listdir(csv_input_dir):
        # Prefer pre-exported CSV files when they exist.
        print(f"\n[INFO] 检测到CSV文件,使用CSV模式")
        print(f" 从 {csv_input_dir} 读取CSV文件")
        result = converter.convert_csv_directory(csv_input_dir)
    elif converter.find_excel_files():
        # Fallback: read the Excel workbooks directly.
        print(f"\n[INFO] 未找到CSV文件,使用Excel模式")
        print(f" 从 {input_dir} 读取Excel文件")
        result = converter.process_all()
    else:
        # Nothing to convert at all.
        print(f"\n[WARN] 未找到CSV文件和Excel文件")
        result = {'total': 0, 'success': 0, 'failed': 0}

    print(f"\n[INFO] 转换结果: {result}")

    print("\n" + "="*60)
    print("步骤2: JSON合并")
    print("="*60)

    merge_result = JsonMerger(output_dir).merge_all(
        logical_file="远光数据架构逻辑模型表.json",
        physical_file="远光数据架构物理模型表.json",
        element_file="远光数据架构元素治理模板表.json",
        output_file="final.json",
    )

    print("\n" + "="*60)
    print("步骤3: 随机抽取")
    print("="*60)

    select_result = RandomSelector(output_dir, random_seed=42, select_count=3000).select_random(
        input_file="final.json",
        output_file="selected.json",
    )

    print("\n" + "="*60)
    print("处理完成!")
    print("="*60)
    print(f"Excel/CSV转JSON: {result}")
    print(f"JSON合并: {merge_result}")
    print(f"随机抽取: {select_result}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user