关于svm_train的参数问题

这篇博客讲述了作者在初次使用svm_train时遇到的参数配置困惑。通过对svm_train源码的研读,作者理解了各种参数的作用,如svm类型、内核类型、度、gamma、coef0等,并最终成功解决了报错问题。尽管花费了一整天时间,但作者通过自己的探索得到了满意的结果。

    初次使用svm,不知道svm_train的参数该怎么设置,svm_train源码的注释里虽然有相关解释,但是看得云里雾里,如下:

Train an SVM model from data (y, x) or an svm_problem prob using
'options' or an svm_parameter param.
If '-v' is specified in 'options' (i.e., cross validation)
either accuracy (ACC) or mean-squared error (MSE) is returned.
options:
   -s svm_type : set type of SVM (default 0)
       0 -- C-SVC(multi-class classification)
       1 -- nu-SVC(multi-class classification)

import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV from imblearn.over_sampling import SMOTE from sklearn.svm import SVC from sklearn.metrics import accuracy_score # 读取特征文件,仅使用 T2_1_GH.xlsx t1_path = r'C:\Users\b\T2_1_GH.xlsx' t1_features = pd.read_excel(t1_path) # 假设最后一列是标签 X = t1_features.iloc[:, :-1].values y = t1_features.iloc[:, -1].values # 分出单独测试集 X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) # 数据标准化 scaler = StandardScaler() X_train_val_scaled = scaler.fit_transform(X_train_val) # SMOTE 平衡样本 smote = SMOTE() X_resampled, y_resampled = smote.fit_resample(X_train_val_scaled, y_train_val) # 特征降维,使用 PCA pca = PCA(n_components=0.95) # 保留 95% 的方差 X_reduced = pca.fit_transform(X_resampled) # 十折交叉验证 skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # 定义模型及超参数空间 svm_model = SVC() # 新的 SVM参数网格,减小 C 值和 gamma 值范围以避免过拟合 svm_param_grid = { 'C': [0.1], 'kernel': ['linear', 'rbf'], 'degree': [2], # 多项式核的度数 'gamma': [ 0.01, 0.1] } # 存储各折验证准确率 svm_fold_accuracies = [] svm_fold_train_accuracies = [] # 存储最佳超参数 best_svm_params = [] for train_index, val_index in skf.split(X_reduced, y_resampled): X_train, X_val = X_reduced[train_index], X_reduced[val_index] y_train, y_val = y_resampled[train_index], y_resampled[val_index] # 超参数优化 SVM grid_search_svm = GridSearchCV(svm_model, svm_param_grid, cv=5) grid_search_svm.fit(X_train, y_train) best_svm = grid_search_svm.best_estimator_ best_svm_params.append(grid_search_svm.best_params_) svm_pred = best_svm.predict(X_val) svm_accuracy = accuracy_score(y_val, svm_pred) svm_fold_accuracies.append(svm_accuracy) # 计算 SVM 训练集准确率 svm_train_pred = best_svm.predict(X_train) svm_train_accuracy = accuracy_score(y_train, svm_train_pred) svm_fold_train_accuracies.append(svm_train_accuracy) # 输出各折验证准确率 
print("SVM各折验证准确率:", svm_fold_accuracies) # 输出各折训练准确率 print("SVM各折训练准确率:", svm_fold_train_accuracies) # 输出平均准确率 print(f"SVM 平均交叉验证准确率: {np.mean(svm_fold_accuracies)}") # 输出平均训练准确率 print(f"SVM 平均训练准确率: {np.mean(svm_fold_train_accuracies)}") # 判断是否过拟合 print("SVM 是否过拟合:", np.mean(svm_fold_train_accuracies) - np.mean(svm_fold_accuracies) > 0.1) # 输出最佳超参数 print("SVM 最佳超参数:", best_svm_params) # 在测试集上评估最终模型 X_test_scaled = scaler.transform(X_test) X_test_reduced = pca.transform(X_test_scaled) # 用最佳参数SVM 模型在测试集评估 best_svm.fit(X_reduced, y_resampled) svm_test_pred = best_svm.predict(X_test_reduced) svm_test_accuracy = accuracy_score(y_test, svm_test_pred) print(f"SVM 测试集准确率: {svm_test_accuracy}") 保持原有结构不变,加入其他指标的计算 recall_score, f1_score, roc_auc_score,且加入异常值的检索和处理的代码
最新发布
12-10
以下是在原代码基础上加入 `recall_score`、`f1_score`、`roc_auc_score` 指标计算及异常值检索和处理代码的版本: ```python import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV from imblearn.over_sampling import SMOTE from sklearn.svm import SVC from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score # 读取特征文件,仅使用 T2_1_GH.xlsx t1_path = r'C:\Users\b\T2_1_GH.xlsx' t1_features = pd.read_excel(t1_path) # 异常值检索和处理:使用 IQR 方法 Q1 = t1_features.quantile(0.25) Q3 = t1_features.quantile(0.75) IQR = Q3 - Q1 t1_features = t1_features[~((t1_features < (Q1 - 1.5 * IQR)) | (t1_features > (Q3 + 1.5 * IQR))).any(axis=1)] t1_features = t1_features.dropna() # 假设最后一列是标签 X = t1_features.iloc[:, :-1].values y = t1_features.iloc[:, -1].values # 分出单独测试集 X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) # 数据标准化 scaler = StandardScaler() X_train_val_scaled = scaler.fit_transform(X_train_val) # SMOTE 平衡样本 smote = SMOTE() X_resampled, y_resampled = smote.fit_resample(X_train_val_scaled, y_train_val) # 特征降维,使用 PCA pca = PCA(n_components=0.95) # 保留 95% 的方差 X_reduced = pca.fit_transform(X_resampled) # 十折交叉验证 skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # 定义模型及超参数空间 svm_model = SVC(probability=True) # 新的 SVM参数网格,减小 C 值和 gamma 值范围以避免过拟合 svm_param_grid = { 'C': [0.1], 'kernel': ['linear', 'rbf'], 'degree': [2], # 多项式核的度数 'gamma': [0.01, 0.1] } # 存储各折验证准确率 svm_fold_accuracies = [] svm_fold_train_accuracies = [] # 存储各折验证的召回率、F1 分数、AUC svm_fold_recalls = [] svm_fold_f1s = [] svm_fold_aucs = [] # 存储最佳超参数 best_svm_params = [] for train_index, val_index in skf.split(X_reduced, y_resampled): X_train, X_val = X_reduced[train_index], X_reduced[val_index] y_train, y_val = y_resampled[train_index], y_resampled[val_index] # 超参数优化 SVM grid_search_svm = GridSearchCV(svm_model, svm_param_grid, 
cv=5) grid_search_svm.fit(X_train, y_train) best_svm = grid_search_svm.best_estimator_ best_svm_params.append(grid_search_svm.best_params_) svm_pred = best_svm.predict(X_val) svm_accuracy = accuracy_score(y_val, svm_pred) svm_fold_accuracies.append(svm_accuracy) # 计算 SVM 训练集准确率 svm_train_pred = best_svm.predict(X_train) svm_train_accuracy = accuracy_score(y_train, svm_train_pred) svm_fold_train_accuracies.append(svm_train_accuracy) # 计算其他评估指标 recall = recall_score(y_val, svm_pred) f1 = f1_score(y_val, svm_pred) y_pro = best_svm.predict_proba(X_val) auc = roc_auc_score(y_val, y_pro[:, 1]) svm_fold_recalls.append(recall) svm_fold_f1s.append(f1) svm_fold_aucs.append(auc) # 输出各折验证准确率 print("SVM各折验证准确率:", svm_fold_accuracies) # 输出各折训练准确率 print("SVM各折训练准确率:", svm_fold_train_accuracies) # 输出平均准确率 print(f"SVM 平均交叉验证准确率: {np.mean(svm_fold_accuracies)}") # 输出平均训练准确率 print(f"SVM 平均训练准确率: {np.mean(svm_fold_train_accuracies)}") # 判断是否过拟合 print("SVM 是否过拟合:", np.mean(svm_fold_train_accuracies) - np.mean(svm_fold_accuracies) > 0.1) # 输出最佳超参数 print("SVM 最佳超参数:", best_svm_params) # 输出各折验证的召回率、F1 分数、AUC print("SVM各折验证召回率:", svm_fold_recalls) print("SVM各折验证F1分数:", svm_fold_f1s) print("SVM各折验证AUC:", svm_fold_aucs) # 输出平均召回率、F1 分数、AUC print(f"SVM 平均交叉验证召回率: {np.mean(svm_fold_recalls)}") print(f"SVM 平均交叉验证F1分数: {np.mean(svm_fold_f1s)}") print(f"SVM 平均交叉验证AUC: {np.mean(svm_fold_aucs)}") # 在测试集上评估最终模型 X_test_scaled = scaler.transform(X_test) X_test_reduced = pca.transform(X_test_scaled) # 用最佳参数SVM 模型在测试集评估 best_svm.fit(X_reduced, y_resampled) svm_test_pred = best_svm.predict(X_test_reduced) svm_test_accuracy = accuracy_score(y_test, svm_test_pred) print(f"SVM 测试集准确率: {svm_test_accuracy}") # 计算测试集的召回率、F1 分数、AUC test_recall = recall_score(y_test, svm_test_pred) test_f1 = f1_score(y_test, svm_test_pred) y_test_pro = best_svm.predict_proba(X_test_reduced) test_auc = roc_auc_score(y_test, y_test_pro[:, 1]) print(f"SVM 测试集召回率: {test_recall}") print(f"SVM 测试集F1分数: {test_f1}") print(f"SVM 测试集AUC: 
{test_auc}") ```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43,前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值