如何将以下代码改写为几个代码文件,简单且运行结果一样import math
from datetime import datetime
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from collections import Counter
# --- 字体和样式配置 ---
def configure_fonts_and_styles():
chinese_font_name = None
try:
if os.name == 'nt':
test_font = 'SimSun'
try:
fm.FontProperties(family=test_font).get_name(); chinese_font_name = test_font
except RuntimeError:
print(f"警告:Windows系统中未直接找到 {test_font} 字体,尝试 'Microsoft YaHei' 或 'SimHei'。")
fallback_fonts = ['Microsoft YaHei', 'SimHei']
for f_name in fallback_fonts:
try: fm.FontProperties(family=f_name).get_name(); chinese_font_name = f_name; break
except RuntimeError: continue
elif os.name == 'posix':
common_fonts = ['SimSun', 'Songti SC', 'STSong', 'WenQuanYi Micro Hei', 'Noto Serif CJK SC', 'Noto Sans CJK SC']
for f_name in common_fonts:
try:
if fm.FontProperties(family=f_name).get_name(): chinese_font_name = f_name; break
except RuntimeError: continue
if chinese_font_name:
plt.rcParams['font.family'] = chinese_font_name
print(f"中文字体尝试设置为: {chinese_font_name}")
else:
print("警告:未能自动配置一个特定的中文字体 (如 SimSun - 宋体)。中文可能无法正确显示。")
except Exception as e: print(f"设置中文字体时出错: {e}")
try:
current_serif = plt.rcParams.get('font.serif', [])
if 'Times New Roman' not in current_serif:
try:
fm.FontProperties(family='Times New Roman').get_name()
plt.rcParams['font.serif'] = ['Times New Roman'] + current_serif
print(f"英文衬线字体 (serif) 将优先尝试: Times New Roman")
except RuntimeError: print("警告:系统中未找到 'Times New Roman' 字体。英文衬线字体可能使用默认设置。")
except Exception as e: print(f"设置Times New Roman时出错: {e}")
font_size = 14
plt.rcParams['font.size'] = font_size; plt.rcParams['axes.titlesize'] = font_size + 4
plt.rcParams['axes.labelsize'] = font_size + 2; plt.rcParams['xtick.labelsize'] = font_size
plt.rcParams['ytick.labelsize'] = font_size; plt.rcParams['legend.fontsize'] = font_size # Adjusted legend fontsize
plt.rcParams['figure.titlesize'] = font_size + 6; plt.rcParams['axes.unicode_minus'] = False
print(f"全局字体大小已设置为基础: {font_size}pt")
# --- 配置项 ---
BASE_FILE_PATH = r"C:\Users\byx\Desktop\Dataset\Dataset"
CHARGING_DATA_FILE = os.path.join(BASE_FILE_PATH, "Charging_Data.csv")
OUTPUT_DIR = "visualizations_layout_adjusted" # 新的输出文件夹
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
CHARGING_COLUMN_MAPPINGS = {
'power': 'Transaction power/kwh', 'start_time': 'Start Time', 'end_time': 'End Time',
'service_fee': 'Service charge/Yuan', 'end_cause': 'end_cause',
'temperature_direct': 'Temperature(℃)', 'district_charging': 'District Name'
}
# --- CSV读取与解析辅助函数 ---
def read_csv(filepath, column_selection_map=None):
data = []; header = []
try:
with open(filepath, 'r', encoding='utf-8-sig') as f:
lines = f.readlines()
if not lines: return [], []
header = [h.strip().replace('"', '') for h in lines[0].strip().split(',')]
col_indices = {internal_key: header.index(actual_col_name) if actual_col_name in header else -1
for internal_key, actual_col_name in column_selection_map.items()} if column_selection_map else \
{h: i for i, h in enumerate(header)}
for line in lines[1:]:
if not line.strip(): continue
values = [v.strip().replace('"', '') for v in line.split(',')]
if len(values) != len(header): continue
row_dict = {internal_key: values[idx] if idx != -1 and idx < len(values) else None
for internal_key, idx in col_indices.items()}
data.append(row_dict)
except FileNotFoundError: print(f"错误:文件 {filepath} 未找到。"); return [], []
except Exception as e: print(f"读取CSV文件 {filepath} 时发生错误: {e}"); return [], []
return data, header
def parse_datetime_custom(dt_str, context=""):
if not dt_str: return None
dt_str_cleaned = dt_str.strip()
formats = ["%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M", "%m/%d/%Y %H:%M",
"%Y-%m-%d %H:%M", "%Y/%m/%d %H:%M:%S", "%Y%m%d", "%Y-%m-%d"]
for fmt in formats:
try: return datetime.strptime(dt_str_cleaned, fmt)
except ValueError: continue
return None
# --- 统计计算辅助函数 ---
def calculate_mean(data_list):
valid_data = [x for x in data_list if x is not None and isinstance(x, (int, float))]
if not valid_data: return 0.0
return sum(valid_data) / len(valid_data)
def calculate_std_dev(data_list, ddof=1):
valid_data_float = []
for x in data_list:
if x is not None:
try: valid_data_float.append(float(x))
except ValueError: pass
if len(valid_data_float) < max(2, ddof): return 0.0
mean = calculate_mean(valid_data_float)
variance = sum([(x - mean) ** 2 for x in valid_data_float]) / (len(valid_data_float) - ddof)
return math.sqrt(variance)
def calculate_median(data_list):
valid_data = [x for x in data_list if x is not None and isinstance(x, (int, float))]
if not valid_data: return 0.0
sorted_list = sorted(valid_data)
n = len(sorted_list)
mid = n // 2
if n % 2 == 1: return sorted_list[mid]
else: return (sorted_list[mid - 1] + sorted_list[mid]) / 2.0
def calculate_pearson_correlation(list1, list2):
paired_values = []
for x, y in zip(list1, list2):
if x is not None and y is not None:
try:
val_x = float(x); val_y = float(y)
paired_values.append((val_x, val_y))
except ValueError: continue
if len(paired_values) < 2: return 0.0
x_vals = [p[0] for p in paired_values]; y_vals = [p[1] for p in paired_values]
n = len(x_vals)
if n == 0 : return 0.0
mean1, mean2 = calculate_mean(x_vals), calculate_mean(y_vals)
numerator = sum([(x - mean1) * (y - mean2) for x, y in zip(x_vals, y_vals)])
sum_sq_diff1 = sum([(x - mean1) ** 2 for x in x_vals])
sum_sq_diff2 = sum([(y - mean2) ** 2 for y in y_vals])
denominator = math.sqrt(sum_sq_diff1 * sum_sq_diff2)
if denominator == 0: return 0.0
return numerator / denominator
# --- 手动KDE相关函数 ---
def gaussian_kernel(u):
return (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * u**2)
def calculate_kde_bandwidth(data_points_np):
n = len(data_points_np)
if n == 0: return 0.1
std_dev = calculate_std_dev(list(data_points_np), ddof=1)
if std_dev == 0:
q75, q25 = np.percentile(data_points_np, [75, 25])
iqr = q75 - q25
A = iqr / 1.349 if iqr > 0 else ( (0.01 * (data_points_np.max() - data_points_np.min())) if data_points_np.max() != data_points_np.min() else 0.01 )
if A <= 1e-5 : A = 0.01
return 0.9 * A * (n ** (-0.2))
return 1.06 * std_dev * (n ** (-0.2))
def manual_kde(data_points_np, x_eval_points_np, bandwidth):
n_data = len(data_points_np)
if n_data == 0 or bandwidth <= 1e-6: return np.zeros_like(x_eval_points_np)
data_points_np = np.asarray(data_points_np).reshape(-1, 1)
x_eval_points_np = np.asarray(x_eval_points_np).reshape(1, -1)
u = (x_eval_points_np - data_points_np) / bandwidth
kernel_values = gaussian_kernel(u)
kde_y = np.sum(kernel_values, axis=0) / (n_data * bandwidth)
return kde_y
# --- 数据处理主函数 ---
def process_data():
print("--- 正在加载和处理数据 ---")
charging_data_raw, _ = read_csv(CHARGING_DATA_FILE, CHARGING_COLUMN_MAPPINGS)
if not charging_data_raw: print("错误:充电数据未能加载或为空。"); return [], {}
num_raw_charging_records = len(charging_data_raw)
cleaned_data = []
power_zero_skipped = 0; parse_errors_time_start = 0; parse_errors_time_end = 0
value_errors_power = 0; time_logic_errors = 0; missing_power=0; missing_start_time=0; missing_end_time=0
value_errors_fee=0; temp_parse_errors=0
for record in charging_data_raw:
power_str = record.get('power'); start_time_str = record.get('start_time'); end_time_str = record.get('end_time')
service_fee_str = record.get('service_fee'); end_cause_val = record.get('end_cause', "未知原因")
temp_direct_str = record.get('temperature_direct'); district_name_val = record.get('district_charging', "未知区域")
if power_str is None: missing_power+=1; continue
if start_time_str is None: missing_start_time+=1; continue
if end_time_str is None: missing_end_time+=1; continue
try: transaction_power = float(power_str)
except (ValueError, TypeError): value_errors_power+=1; continue
if transaction_power == 0: power_zero_skipped += 1; continue
start_time = parse_datetime_custom(start_time_str)
if not start_time: parse_errors_time_start+=1; continue
end_time = parse_datetime_custom(end_time_str)
if not end_time: parse_errors_time_end+=1; continue
if end_time < start_time: time_logic_errors+=1; continue
duration_hours = (end_time - start_time).total_seconds() / 3600.0
try: service_fee = float(service_fee_str) if service_fee_str else 0.0
except (ValueError, TypeError): service_fee = 0.0; value_errors_fee+=1
final_temperature = None
if temp_direct_str is not None and temp_direct_str != "":
try: final_temperature = float(temp_direct_str)
except (ValueError, TypeError): temp_parse_errors+=1
cleaned_data.append({
'transaction_power': transaction_power, 'duration_hours': duration_hours,
'district_name': district_name_val, 'service_fee': service_fee,
'end_cause': end_cause_val, 'temperature': final_temperature
})
print(f"\n数据清洗和转换完成。原始充电记录: {num_raw_charging_records}")
if missing_power > 0: print(f" - 因缺少功率数据跳过: {missing_power} 条")
print(f" - 因功率为0跳过: {power_zero_skipped} 条")
# ... (可以添加其他错误计数打印)
print(f"最终得到 {len(cleaned_data)} 条有效记录用于分析。")
district_stats_aggregated = {}
temp_district_data = {}
for record in cleaned_data:
district = record['district_name']
if district not in temp_district_data: temp_district_data[district] = {'powers': [], 'durations': [], 'service_fees': []}
if record['transaction_power'] is not None: temp_district_data[district]['powers'].append(record['transaction_power'])
if record['duration_hours'] is not None: temp_district_data[district]['durations'].append(record['duration_hours'])
if record['service_fee'] is not None: temp_district_data[district]['service_fees'].append(record['service_fee'])
for district, data in temp_district_data.items():
district_stats_aggregated[district] = {
'avg_power': calculate_mean(data['powers']), 'std_dev_duration': calculate_std_dev(data['durations']),
'median_service_fee': calculate_median(data['service_fees']), 'count': len(data['powers'])
}
return cleaned_data, district_stats_aggregated
# --- 可视化函数 ---
def plot_transaction_power_distribution(cleaned_data, output_dir):
if not cleaned_data: return
transaction_powers_list = [r['transaction_power'] for r in cleaned_data if r['transaction_power'] is not None and r['transaction_power'] > 0]
if not transaction_powers_list: print("无有效的正充电量数据用于绘制分布图。"); return
transaction_powers_np = np.array(transaction_powers_list)
fig, axes = plt.subplots(1, 2, figsize=(16, 7)) # 略微调整figsize
# 子图1: 密度直方图与KDE曲线
ax = axes[0]
ax.hist(transaction_powers_np, bins=50, density=True, alpha=0.6, color='skyblue', edgecolor='gray', label='密度直方图')
if len(transaction_powers_np) > 1:
bandwidth = calculate_kde_bandwidth(transaction_powers_np)
if bandwidth > 1e-5 :
plot_min = transaction_powers_np.min() - 1 * bandwidth
plot_max = transaction_powers_np.max() + 1 * bandwidth
x_range = np.linspace(max(0, plot_min), plot_max, 500)
kde_y = manual_kde(transaction_powers_np, x_range, bandwidth)
ax.plot(x_range, kde_y, color='darkviolet', linewidth=2.5, label='KDE曲线')
ax.set_title('充电量分布 (直方图与KDE)'); ax.set_xlabel('充电量 (kWh)'); ax.set_ylabel('密度')
ax.legend(); ax.grid(axis='y', alpha=0.75); ax.set_xlim(left=0)
# 子图2: 箱线图
ax = axes[1]
bp = ax.boxplot(transaction_powers_np, vert=False, widths=0.7, patch_artist=True,
boxprops=dict(facecolor='lightgreen', color='black'), medianprops=dict(color='red', linewidth=2))
ax.set_title('充电量分布 (箱线图)'); ax.set_xlabel('充电量 (kWh)'); ax.set_yticklabels([])
ax.grid(axis='x', alpha=0.75); ax.set_xlim(left=0)
fig.suptitle('充电量 Transaction Power 分布概览', fontweight='bold')
fig.tight_layout(rect=[0, 0.05, 1, 0.93]) # rect=[left, bottom, right, top]
save_path = os.path.join(output_dir, "transaction_power_distribution_kde.png")
plt.savefig(save_path); print(f"充电量分布图 (含KDE) 已保存至: {save_path}")
def plot_power_by_end_cause(cleaned_data, output_dir, top_n=5):
if not cleaned_data: return
end_cause_data = {}
for record in cleaned_data:
cause = record['end_cause'] if record['end_cause'] else "未知"; power = record['transaction_power']
if power is None: continue
if cause not in end_cause_data: end_cause_data[cause] = []
end_cause_data[cause].append(power)
if not end_cause_data: print("无有效的充电终止原因数据用于绘图。"); return
cause_counts = Counter({cause: len(powers) for cause, powers in end_cause_data.items()})
common_causes_original = [cause for cause, count in cause_counts.most_common(top_n)]
# 确保标签不会过长
common_causes = [ (c[:30] + '...' if len(c) > 30 else c) for c in common_causes_original]
plot_data = [end_cause_data[cause_orig] for cause_orig in common_causes_original if cause_orig in end_cause_data] # Use original cause for data lookup
if not plot_data: print(f"筛选后(前{top_n}个)无充电终止原因数据用于绘图。"); return
fig, ax = plt.subplots(figsize=(max(12, top_n * 1.5), 10)) # 动态宽度,增加高度
bp = ax.boxplot(plot_data, vert=True, patch_artist=True, labels=common_causes, widths=0.6)
try: # Matplotlib 3.7+
cmap = plt.colormaps.get_cmap('Pastel2')
color_list = cmap(np.linspace(0, 1, len(plot_data)))
except AttributeError: # Fallback for older Matplotlib
cmap = plt.cm.get_cmap('Pastel2', len(plot_data))
color_list = cmap.colors if hasattr(cmap, 'colors') else [cmap(i) for i in np.linspace(0, 1, len(plot_data))]
for patch, color in zip(bp['boxes'], color_list): patch.set_facecolor(color)
for median in bp['medians']: median.set_color('black'); median.set_linewidth(2)
ax.set_title(f'不同充电终止原因下的充电量分布 (Top {top_n})', fontweight='bold')
ax.set_ylabel('充电量 (kWh)'); ax.set_xlabel('充电终止原因')
tick_label_fontsize = plt.rcParams['xtick.labelsize'] - (2 if top_n > 5 else 0) # 如果类别多,稍微减小字号
if tick_label_fontsize < 8: tick_label_fontsize = 8 # 最小字号
plt.setp(ax.get_xticklabels(), rotation=30, ha="right", fontsize=tick_label_fontsize)
ax.grid(axis='y', linestyle='--', alpha=0.7)
# 调整 rect 的 bottom 值,给 x 轴标签更多空间
fig.tight_layout(rect=[0, 0.20 if top_n > 3 else 0.1, 1, 0.95]) # 如果标签多,底部空间给大一点
save_path = os.path.join(output_dir, f"power_by_end_cause_top{top_n}.png")
plt.savefig(save_path); print(f"按终止原因分布的充电量图 (Top {top_n}) 已保存至: {save_path}")
def plot_temp_vs_power_scatter(cleaned_data, output_dir):
if not cleaned_data: return
temperatures = [r['temperature'] for r in cleaned_data if r['temperature'] is not None and r['transaction_power'] is not None]
powers = [r['transaction_power'] for r in cleaned_data if r['temperature'] is not None and r['transaction_power'] is not None]
if not temperatures or not powers or len(temperatures) < 2:
print("无足够的有效温度和充电量数据用于绘制散点图。"); return
fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(temperatures, powers, alpha=0.4, edgecolors='w', s=40, c='steelblue')
ax.set_title('温度与充电量关系散点图', fontweight='bold'); ax.set_xlabel('温度 (℃)'); ax.set_ylabel('充电量 (kWh)')
ax.grid(True, linestyle='--', alpha=0.7); fig.tight_layout()
save_path = os.path.join(output_dir, "temperature_vs_power_scatter.png")
plt.savefig(save_path); print(f"温度与充电量散点图已保存至: {save_path}")
def plot_district_statistics(district_stats, output_dir):
if not district_stats: print("无有效的区域统计数据用于绘图。"); return
sorted_districts_items = sorted(district_stats.items())
if not sorted_districts_items: print("排序后无区域数据用于绘图。"); return
districts = [item[0] for item in sorted_districts_items]
avg_powers = [item[1]['avg_power'] for item in sorted_districts_items]
std_dev_durations = [item[1]['std_dev_duration'] for item in sorted_districts_items]
median_service_fees = [item[1]['median_service_fee'] for item in sorted_districts_items]
x = np.arange(len(districts)); width = 0.35
fig_base_width = 8
fig_width = max(fig_base_width, len(districts) * 1.5 + 2) # 动态计算宽度
fig_height = 8 # 增加默认高度
def _plot_bar_chart(ax, x_data, y_data, width, label_text, color, y_label_text, title_text, districts_labels, bar_label_padding=5):
rects = ax.bar(x_data, y_data, width, label=label_text, color=color)
ax.set_ylabel(y_label_text); ax.set_title(title_text, fontweight='bold')
ax.set_xticks(x_data); ax.set_xticklabels(districts_labels, rotation=45, ha="right")
ax.legend(loc='upper right'); ax.bar_label(rects, padding=bar_label_padding, fmt='%.2f')
# 自动调整y轴上限以确保标签可见
if y_data:
max_val_with_label = max(y_data) * 1.15 # 假设标签不会超出bar高度的15%
current_top = ax.get_ylim()[1]
ax.set_ylim(top=max(current_top, max_val_with_label))
# 图1: 平均充电量
fig1, ax1 = plt.subplots(figsize=(fig_width, fig_height));
_plot_bar_chart(ax1, x, avg_powers, width, '平均充电量', 'cornflowerblue', '平均充电量 (kWh)', '各区域平均充电量', districts)
fig1.tight_layout(rect=[0, 0.25, 1, 0.93]) # 增加底部和顶部边距
save_path1 = os.path.join(output_dir, "district_avg_power.png"); plt.savefig(save_path1); print(f"区域平均充电量图已保存至: {save_path1}")
# 图2: 充电时长标准差
fig2, ax2 = plt.subplots(figsize=(fig_width, fig_height));
_plot_bar_chart(ax2, x, std_dev_durations, width, '充电时长标准差', 'lightcoral', '充电时长标准差 (小时)', '各区域充电时长标准差', districts)
fig2.tight_layout(rect=[0, 0.25, 1, 0.93])
save_path2 = os.path.join(output_dir, "district_duration_stddev.png"); plt.savefig(save_path2); print(f"区域充电时长标准差图已保存至: {save_path2}")
# 图3: 服务费中位数
fig3, ax3 = plt.subplots(figsize=(fig_width, fig_height));
_plot_bar_chart(ax3, x, median_service_fees, width, '服务费中位数', 'mediumseagreen', '服务费中位数 (元)', '各区域服务费中位数', districts)
fig3.tight_layout(rect=[0, 0.25, 1, 0.93])
save_path3 = os.path.join(output_dir, "district_median_service_fee.png"); plt.savefig(save_path3); print(f"区域服务费中位数图已保存至: {save_path3}")
# --- 主函数 ---
if __name__ == '__main__':
configure_fonts_and_styles()
cleaned_data, district_stats_aggregated = process_data()
if cleaned_data:
print("\n--- 开始生成可视化图表与分析 ---")
print("\n--- Pearson 相关系数计算 ---")
temperatures_for_corr = [r['temperature'] for r in cleaned_data]
powers_for_corr = [r['transaction_power'] for r in cleaned_data]
pearson_corr = calculate_pearson_correlation(temperatures_for_corr, powers_for_corr)
valid_pairs_count = len([1 for t, p in zip(temperatures_for_corr, powers_for_corr) if t is not None and p is not None and isinstance(t, (int,float)) and isinstance(p, (int,float))])
if valid_pairs_count > 1:
print(f"Temperature与Transaction power的Pearson相关系数: {pearson_corr:.4f} (基于 {valid_pairs_count} 个有效数据点对)")
else:
print("没有足够的有效数据来计算Temperature与Transaction power的Pearson相关系数。")
print("\n--- 生成图表 ---")
plot_transaction_power_distribution(cleaned_data, OUTPUT_DIR)
plot_power_by_end_cause(cleaned_data, OUTPUT_DIR, top_n=5)
plot_temp_vs_power_scatter(cleaned_data, OUTPUT_DIR)
if district_stats_aggregated:
plot_district_statistics(district_stats_aggregated, OUTPUT_DIR)
else:
print("无区域统计数据,跳过相关图表绘制。")
print(f"\n所有图表已尝试保存到 '{OUTPUT_DIR}' 文件夹中。")
print("正在尝试显示所有图表...")
plt.show()
print("所有图表窗口已尝试显示完毕。")
else:
print("没有清洗后的数据可用于可视化。")
print("\n可视化任务结束。")
最新发布