# -*- coding: utf-8 -*-
"""
Created on Sun Jul 27 09:56:06 2025
@author: Sonia
"""
import os
import time
import logging
from typing import Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, explained_variance_score
import joblib
# Logging configuration: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Use a font with CJK glyph support so Chinese labels render in matplotlib;
# SimSun ships with Windows — substitute another CJK font on Linux/Mac.
plt.rcParams['font.sans-serif'] = ['SimSun']
# Keep the minus sign from rendering as an empty box under CJK fonts.
plt.rcParams['axes.unicode_minus'] = False
# Central configuration: paths, train/test split and SVR hyperparameters.
CONFIG = {
    'working_dir': r'E:\论文代码\data',
    'data_file': './data1.xlsx',
    'model_save_dir': '../data_result_model/SVR/',
    'result_save_dir': '../data_result/SVR/',
    'plot_save_dir': '../data_result/SVR/',  # directory for saved figures
    'test_size': 0.3,  # fraction of samples held out (chronological tail) for testing
    # Default SVR hyperparameters (used when no tuning is performed)
    'default_params': {
        'C': 50,
        'gamma': 0.01,
        'kernel': 'rbf',
        'epsilon': 0.1,
        'shrinking': True,
        'tol': 0.001,
        'max_iter': -1
    }
}
def set_working_directory():
    """Switch to the configured working directory and create every output directory."""
    os.chdir(CONFIG['working_dir'])
    pd.set_option('display.max_columns', 32)
    # Create all save directories up front so later writes never fail.
    for dir_key in ('plot_save_dir', 'model_save_dir', 'result_save_dir'):
        os.makedirs(CONFIG[dir_key], exist_ok=True)
    logging.info("工作目录及保存目录设置完成")
def process_data(path: str) -> pd.DataFrame:
    """Read the Excel file at *path*, drop NaN rows and rename columns to F1..F8.

    The first seven retained columns are features; the eighth
    (me_oil_consume, fuel consumption) is the regression target.
    """
    feature_and_target_cols = ['x', 'y', 'rel.wind_speed',
                               'wind_direction', 'speed', 'course', 'swh',
                               'me_oil_consume']
    frame = pd.read_excel(path)
    # Report how many rows contain missing values before removing them.
    missing_rows = frame[frame.isnull().any(axis=1)]
    if not missing_rows.empty:
        logging.warning(f"发现 {len(missing_rows)} 行含NaN数据,已剔除")
    frame = frame.dropna()
    # To enforce chronological order by a '时间' (time) column, uncomment:
    # if '时间' in frame.columns:
    #     frame['时间'] = pd.to_datetime(frame['时间'])
    #     frame = frame.sort_values('时间').reset_index(drop=True)
    # else:
    #     logging.warning("数据中未发现'时间'列,假设数据已按时间顺序排列")
    frame = frame[feature_and_target_cols]
    frame.columns = [f'F{i}' for i in range(1, 9)]  # rename to F1..F8
    return frame
def load_data() -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
    """Load the dataset and return (features, target, full DataFrame).

    Returns:
        x: ndarray of shape (n_samples, 7) — feature columns F1..F7.
        y: ndarray of shape (n_samples,) — target column F8 (fuel consumption).
        train_data: the cleaned, renamed DataFrame.

    Raises:
        ValueError: if the loaded feature or target arrays are empty.
    """
    train_data = process_data(CONFIG['data_file'])
    x = train_data.iloc[:, 0:7].values  # first 7 columns are features
    y = train_data.iloc[:, -1].values   # last column is the target
    # FIX: replaced a leftover debug print with a structured log line.
    logging.info(f"特征形状: {x.shape}, 目标形状: {y.shape}")
    if x.size == 0 or y.size == 0:
        raise ValueError("加载的数据为空,请检查数据源")
    return x, y, train_data
def _pso_optimize_svr(x_train: np.ndarray, y_train_scaled: np.ndarray,
                      pso_cfg: dict) -> dict:
    """Tune SVR hyperparameters (C, gamma, epsilon) with Particle Swarm Optimization.

    C and gamma are searched in log10 space, epsilon linearly. Fitness is the
    MSE on a chronological validation tail (last 20% of the training rows),
    so no future samples leak into the candidate fits.

    Args:
        x_train: standardized training features.
        y_train_scaled: standardized training target, shape (n, 1).
        pso_cfg: optional overrides — n_particles, n_iter, w, c1, c2,
            lower/upper bounds ([log10(C), log10(gamma), epsilon]), seed.

    Returns:
        dict with the best 'C', 'gamma' and 'epsilon' found.
    """
    rng = np.random.default_rng(pso_cfg.get('seed', 42))
    n_particles = pso_cfg.get('n_particles', 10)
    n_iter = pso_cfg.get('n_iter', 15)
    w = pso_cfg.get('w', 0.7)    # inertia weight
    c1 = pso_cfg.get('c1', 1.5)  # cognitive (personal-best) coefficient
    c2 = pso_cfg.get('c2', 1.5)  # social (global-best) coefficient
    # Per-dimension search bounds: [log10(C), log10(gamma), epsilon].
    lower = np.asarray(pso_cfg.get('lower', [-1.0, -4.0, 0.01]), dtype=float)
    upper = np.asarray(pso_cfg.get('upper', [3.0, 1.0, 0.5]), dtype=float)

    # Chronological validation split to respect the time ordering of samples.
    val_size = max(1, int(len(x_train) * 0.2))
    x_fit, x_val = x_train[:-val_size], x_train[-val_size:]
    y_fit = y_train_scaled[:-val_size].ravel()
    y_val = y_train_scaled[-val_size:].ravel()

    def fitness(position: np.ndarray) -> float:
        # Decode a particle position into SVR hyperparameters and score it.
        candidate = SVR(
            C=10.0 ** position[0],
            gamma=10.0 ** position[1],
            epsilon=float(position[2]),
            kernel=CONFIG['default_params']['kernel'],
            shrinking=CONFIG['default_params']['shrinking'],
            tol=CONFIG['default_params']['tol'],
            max_iter=CONFIG['default_params']['max_iter'],
        )
        candidate.fit(x_fit, y_fit)
        return mean_squared_error(y_val, candidate.predict(x_val))

    dim = 3
    positions = rng.uniform(lower, upper, size=(n_particles, dim))
    velocities = np.zeros((n_particles, dim))
    personal_best = positions.copy()
    personal_best_fit = np.array([fitness(p) for p in positions])
    best_idx = int(np.argmin(personal_best_fit))
    global_best = personal_best[best_idx].copy()
    global_best_fit = personal_best_fit[best_idx]

    for iteration in range(n_iter):
        r1 = rng.random((n_particles, dim))
        r2 = rng.random((n_particles, dim))
        # Standard PSO velocity/position update, clipped to the bounds.
        velocities = (w * velocities
                      + c1 * r1 * (personal_best - positions)
                      + c2 * r2 * (global_best - positions))
        positions = np.clip(positions + velocities, lower, upper)
        for i in range(n_particles):
            fit = fitness(positions[i])
            if fit < personal_best_fit[i]:
                personal_best_fit[i] = fit
                personal_best[i] = positions[i].copy()
                if fit < global_best_fit:
                    global_best_fit = fit
                    global_best = positions[i].copy()
        logging.info(f"PSO 迭代 {iteration + 1}/{n_iter}, 最优验证MSE: {global_best_fit:.6f}")

    return {
        'C': float(10.0 ** global_best[0]),
        'gamma': float(10.0 ** global_best[1]),
        'epsilon': float(global_best[2]),
    }


def train_and_evaluate(x: np.ndarray, y: np.ndarray, train_data: pd.DataFrame) -> Tuple[
        list, list, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Train an SVR model with PSO-tuned hyperparameters and evaluate it.

    By default the hyperparameters (C, gamma, epsilon) are optimized by
    Particle Swarm Optimization on a validation split of the training data;
    set CONFIG['use_pso'] = False to train with CONFIG['default_params']
    instead. *train_data* is accepted for interface compatibility only.

    Returns:
        (results, param_counts, y_train, y_train_predict, y_test,
        y_test_predict) — predictions are on the original target scale.
    """
    results = []
    param_counts = []
    n_features = x.shape[1]
    # Chronological train/test split (no shuffling — data is time ordered).
    train_size = int(len(x) * (1 - CONFIG['test_size']))
    x_train, x_test = x[:train_size], x[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    # Standardize, fitting the scalers on the training partition only.
    std_x = StandardScaler()
    std_y = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    y_train_scaled = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test_scaled = std_y.transform(y_test.reshape(-1, 1))

    # Hyperparameter selection: PSO search by default, CONFIG defaults otherwise.
    if CONFIG.get('use_pso', True):
        tuned = _pso_optimize_svr(x_train, y_train_scaled, CONFIG.get('pso_params', {}))
        model_label = 'SVR_PSO'
        logging.info(f"PSO最优超参数: {tuned}")
    else:
        tuned = {key: CONFIG['default_params'][key] for key in ('C', 'gamma', 'epsilon')}
        model_label = 'SVR_Default'

    model = SVR(
        C=tuned['C'],
        gamma=tuned['gamma'],
        epsilon=tuned['epsilon'],
        kernel=CONFIG['default_params']['kernel'],
        shrinking=CONFIG['default_params']['shrinking'],
        tol=CONFIG['default_params']['tol'],
        max_iter=CONFIG['default_params']['max_iter']
    )
    start_time = time.perf_counter()
    model.fit(x_train, y_train_scaled.ravel())
    end_time = time.perf_counter()
    logging.info(f"模型训练完成,耗时: {end_time - start_time:.4f}秒")
    # Predict in scaled space, then map back to the original target scale.
    y_train_predict = std_y.inverse_transform(model.predict(x_train).reshape(-1, 1))
    y_test_predict = std_y.inverse_transform(model.predict(x_test).reshape(-1, 1))
    result = calculate_metrics(
        y_train_scaled, y_test_scaled, y_train_predict, y_test_predict,
        std_y, end_time - start_time, n_features
    )
    result['Model'] = model_label
    results.append(result)
    param_counts.append(len(model.support_))  # number of support vectors
    # Persist the model and both scalers for later inference.
    model_path = f"{CONFIG['model_save_dir']}_SVR_default_data1.joblib"
    joblib.dump(model, model_path)
    joblib.dump(std_x, f"{CONFIG['model_save_dir']}_std_x_data1.joblib")
    joblib.dump(std_y, f"{CONFIG['model_save_dir']}_std_y_data1.joblib")
    logging.info(f"模型已保存至: {model_path}")
    return results, param_counts, y_train, y_train_predict, y_test, y_test_predict
def calculate_metrics(y_train, y_test, y_train_predict, y_test_predict, std_y, run_time, n_features):
    """Compute regression metrics on the original (unscaled) target scale.

    *y_train*/*y_test* arrive standardized and are inverse-transformed with
    *std_y*; the prediction arrays are already on the original scale.
    Returns a dict of train/test R2, adjusted R2, MSE, RMSE, MAE, EVS, MAPE
    plus the training run time.
    """
    y_train = std_y.inverse_transform(y_train).flatten()
    y_test = std_y.inverse_transform(y_test).flatten()
    y_train_predict = y_train_predict.flatten()
    y_test_predict = y_test_predict.flatten()

    def _split_metrics(truth, pred):
        # Metrics for one split: r2, adjusted r2, mse, mae, evs, mape.
        n = len(truth)
        r2 = r2_score(truth, pred)
        # Adjusted R² is defined only when the dof (n - n_features - 1) > 0.
        adj = 1 - (1 - r2) * (n - 1) / (n - n_features - 1) if n > n_features + 1 else 0
        mse = mean_squared_error(truth, pred)
        mae = mean_absolute_error(truth, pred)
        evs = explained_variance_score(truth, pred)
        # The 1e-8 offset guards the division against zero targets.
        mape = np.mean(np.abs((pred - truth) / (truth + 1e-8))) * 100
        return r2, adj, mse, mae, evs, mape

    tr_r2, tr_adj, tr_mse, tr_mae, tr_evs, tr_mape = _split_metrics(y_train, y_train_predict)
    te_r2, te_adj, te_mse, te_mae, te_evs, te_mape = _split_metrics(y_test, y_test_predict)
    # Key order matters downstream: it fixes the Excel column order.
    return {
        'Train R2': tr_r2, 'Test R2': te_r2,
        'Train R2 Adj': tr_adj, 'Test R2 Adj': te_adj,
        'Train MSE': tr_mse, 'Test MSE': te_mse,
        'Train RMSE': np.sqrt(tr_mse), 'Test RMSE': np.sqrt(te_mse),
        'Train MAE': tr_mae, 'Test MAE': te_mae,
        'Train EVS': tr_evs, 'Test EVS': te_evs,
        'Train MAPE': tr_mape, 'Test MAPE': te_mape,
        'Run Time': run_time
    }
def save_results(results: list, param_counts: np.ndarray, feature_names: list):
    """Write the evaluation results to an Excel file in CONFIG['result_save_dir'].

    Args:
        results: list of metric dicts (each containing a 'Model' key).
        param_counts: per-model support-vector counts.
        feature_names: feature column names (currently unused; kept for
            interface compatibility).
    """
    results_df = pd.DataFrame(results)
    # Put the 'Model' column first for readability.
    cols = ['Model'] + [col for col in results_df.columns if col != 'Model']
    results_df = results_df[cols]
    # Append the support-vector count column.
    param_counts_df = pd.DataFrame(param_counts, columns=['Support_Vectors_Count'])
    results_df = pd.concat([results_df, param_counts_df], axis=1)
    # BUG FIX: the save path previously embedded the literal "(unknown)"
    # instead of the data-file stem computed below (filename was dead code).
    filename = os.path.splitext(os.path.basename(CONFIG['data_file']))[0]
    save_path = f"{CONFIG['result_save_dir']}{filename}_SVR.xlsx"
    results_df.to_excel(save_path, index=False, engine='openpyxl')
    logging.info(f'结果已保存至: {save_path}')
def plot_predictions_vs_true(y_train, y_train_pred, y_test, y_test_pred):
    """Scatter-plot predicted vs. true values for the training and test sets.

    Non-finite pairs are filtered out before plotting and the figure is
    saved to CONFIG['plot_save_dir'].
    """
    # FIX: removed three leftover debug print statements.
    y_train = np.ravel(y_train)
    y_train_pred = np.ravel(y_train_pred)
    y_test = np.ravel(y_test)
    y_test_pred = np.ravel(y_test_pred)
    # Drop pairs containing NaN/inf so the axis limits stay finite.
    mask_train = np.isfinite(y_train) & np.isfinite(y_train_pred)
    y_train = y_train[mask_train]
    y_train_pred = y_train_pred[mask_train]
    mask_test = np.isfinite(y_test) & np.isfinite(y_test_pred)
    y_test = y_test[mask_test]
    y_test_pred = y_test_pred[mask_test]
    plt.figure(figsize=(12, 6))
    # Training-set panel with the ideal y = x reference line.
    plt.subplot(1, 2, 1)
    plt.scatter(y_train, y_train_pred, alpha=0.6, color='blue', label='训练集')
    min_val = np.min(np.concatenate([y_train, y_train_pred]))
    max_val = np.max(np.concatenate([y_train, y_train_pred]))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='理想线 y=x')
    plt.xlabel('真实值')
    plt.ylabel('预测值')
    plt.title('训练集:预测值 vs 真实值')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    # Test-set panel with the ideal y = x reference line.
    plt.subplot(1, 2, 2)
    plt.scatter(y_test, y_test_pred, alpha=0.6, color='green', label='测试集')
    min_val = np.min(np.concatenate([y_test, y_test_pred]))
    max_val = np.max(np.concatenate([y_test, y_test_pred]))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='理想线 y=x')
    plt.xlabel('真实值')
    plt.ylabel('预测值')
    plt.title('测试集:预测值 vs 真实值')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    save_path = f"{CONFIG['plot_save_dir']}predictions_vs_true.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    logging.info(f'预测值与真实值对比图已保存至: {save_path}')
def plot_error_distribution(y_train, y_train_predict, y_test, y_test_predict):
    """Plot histograms of prediction errors for the training and test sets."""
    # Signed error: prediction minus ground truth.
    train_errors = np.ravel(y_train_predict) - np.ravel(y_train)
    test_errors = np.ravel(y_test_predict) - np.ravel(y_test)
    plt.figure(figsize=(12, 6))
    panels = (
        (1, train_errors, 'blue', '训练集误差分布'),
        (2, test_errors, 'green', '测试集误差分布'),
    )
    for panel_idx, errors, color, title in panels:
        plt.subplot(1, 2, panel_idx)
        plt.hist(errors, bins=30, alpha=0.7, color=color, edgecolor='black')
        plt.axvline(x=0, color='red', linestyle='--', label='零误差')
        plt.xlabel('预测误差 (预测值 - 真实值)')
        plt.ylabel('频数')
        plt.title(f'{title} (均值: {errors.mean():.4f}, 标准差: {errors.std():.4f})')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    save_path = f"{CONFIG['plot_save_dir']}error_distribution.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    logging.info(f'误差分布直方图已保存至: {save_path}')
def plot_fitting_curve(y_train, y_train_predict, y_test, y_test_predict):
    """Plot true vs. predicted values in sample order across both partitions.

    Training samples occupy indices [0, len(y_train)); test samples follow
    immediately after, separated by a vertical marker line.
    """
    train_idx = np.arange(len(y_train))
    test_idx = np.arange(len(y_train), len(y_train) + len(y_test))
    plt.figure(figsize=(12, 6))
    # Training partition: true values (solid) vs. predictions (dashed).
    plt.plot(train_idx, y_train, 'b-', label='训练集真实值', alpha=0.7)
    plt.plot(train_idx, y_train_predict, 'r--', label='训练集预测值', alpha=0.7)
    # Test partition continues on the same x axis.
    plt.plot(test_idx, y_test, 'g-', label='测试集真实值', alpha=0.7)
    plt.plot(test_idx, y_test_predict, 'm--', label='测试集预测值', alpha=0.7)
    # Vertical marker at the train/test boundary.
    plt.axvline(x=len(y_train) - 1, color='gray', linestyle=':', label='训练集/测试集分割')
    plt.xlabel('样本索引')
    plt.ylabel('油耗值')
    plt.title('拟合曲线:预测值与真实值随样本顺序变化')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    save_path = f"{CONFIG['plot_save_dir']}fitting_curve.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    # FIX: the figure was never closed (leak) and no save-confirmation was
    # logged, unlike the sibling plot functions.
    plt.close()
    logging.info(f'拟合曲线图已保存至: {save_path}')
def main():
    """Entry point: load data, train the SVR model, save results and figures."""
    set_working_directory()
    x, y, train_data = load_data()
    logging.info(f"数据加载完成,特征维度:{x.shape},目标变量数量:{y.shape[0]}")
    # Train, evaluate and unpack predictions for the plotting steps below.
    outcome = train_and_evaluate(x, y, train_data)
    results, param_counts, y_train, y_train_predict, y_test, y_test_predict = outcome
    feature_names = [f'F{i}' for i in range(1, 1 + x.shape[1])]  # F1..Fn naming
    save_results(results, np.array(param_counts), feature_names)
    # Generate and persist every diagnostic figure.
    plot_predictions_vs_true(y_train, y_train_predict, y_test, y_test_predict)
    plot_error_distribution(y_train, y_train_predict, y_test, y_test_predict)
    plot_fitting_curve(y_train, y_train_predict, y_test, y_test_predict)
    logging.info("所有流程执行完毕!")
if __name__ == "__main__":
    # FIX: removed stray non-code text appended after main() that made the
    # file a SyntaxError.
    main()