xgb.Booster.predict crashes in PyCharm: Process finished with exit code -1073740791 (0xC0000409)

While training an xgboost model in PyCharm, the process died with exit code -1073740791 (0xC0000409). The crash occurred on the call to `xgb.Booster.predict`. Investigation traced it to insufficient GPU memory. The solution: keep training on the GPU, but switch prediction to the CPU by setting the `tree_method` parameter. After the fix, the code ran normally.


Running an xgboost training program in PyCharm crashed with:

Process finished with exit code -1073740791 (0xC0000409)

(On Windows, 0xC0000409 is STATUS_STACK_BUFFER_OVERRUN, a fail-fast abort raised from native code; with xgboost it typically points at the C++/CUDA layer rather than the Python code itself.)

  1. Debugging narrowed the crash down to this line:
    dataset3_predict['label'] = model.predict(predict_dmatrix)

  2. None of the fixes found online worked, but many blog posts say this error means the GPU has run out of memory (e.g. https://blog.youkuaiyun.com/cuicheng01/article/details/79774156).
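
  One quick way to test the GPU-memory hypothesis (assuming NVIDIA's driver tools, i.e. nvidia-smi, are on the PATH; this check is not from the original post) is to watch GPU memory while the script runs and see whether usage climbs to the card's limit right before the crash:

    import subprocess

    # Print GPU utilization and memory once per second until interrupted;
    # run this in a second process while the training script executes.
    subprocess.run(["nvidia-smi", "--loop=1"])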

  3. Further trial and error suggested that xgb.Booster.predict was itself running on the GPU and exhausting its memory, so the workaround is to train on the GPU but run predict on the CPU. Concretely:
    1) The parameters used for xgboost train:

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 0.1,
              'min_child_weight': 1.1,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.01,
              'tree_method': 'gpu_hist',
              }
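    2) At predict time, move the booster off the GPU. The post's own code stops at the parameter list above, so the following is a minimal sketch of the described fix, assuming XGBoost 1.x, where 'predictor': 'cpu_predictor' selects CPU inference (on XGBoost >= 2.0 the equivalent is model.set_param({'device': 'cpu'})). The synthetic data stands in for the post's real DMatrix objects:

    import numpy as np
    import xgboost as xgb

    # Synthetic stand-ins for the post's training/prediction data
    X = np.random.rand(1000, 20)
    y = np.random.randint(0, 2, size=1000)
    train_dmatrix = xgb.DMatrix(X, label=y)
    predict_dmatrix = xgb.DMatrix(np.random.rand(200, 20))

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'eta': 0.01,
              'max_depth': 5,
              'tree_method': 'gpu_hist'}    # training still runs on the GPU

    model = xgb.train(params, train_dmatrix, num_boost_round=100)

    # The fix: switch the trained booster to the CPU so that predict()
    # allocates no GPU memory. 'predictor' is the XGBoost 1.x knob; moving
    # tree_method back to the CPU 'hist' as well does no harm.
    model.set_param({'tree_method': 'hist', 'predictor': 'cpu_predictor'})
    preds = model.predict(predict_dmatrix)

    With this change, training keeps the speed of gpu_hist while prediction no longer makes a second GPU allocation, and the 0xC0000409 crash goes away.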