Titanic数据分析——KeyError: "None of [Int64Index([ 0, 1, 2,... dtype='int64')] are in the [columns]"

代码报错处:

#---------------------------------------------------modify the parameter------------------------------------------------
# Grid-search the random forest's max_depth with 5-fold cross-validation.
# Candidate depths are 2**2 .. 2**6, i.e. 4, 8, 16, 32, 64.
range_m = np.logspace(2, 6, 5, base = 2).astype(int)
best_m = 0  # best max_depth found so far
min_scores = 10000  # lowest mean loss found so far (starts at a large sentinel)
scores_m = []  # mean loss for each candidate depth
for m in range_m:
    # No random_state is passed, so the shuffled folds differ on every run.
    kf = KFold(n_splits=5,shuffle=True)
    clf = RandomForestClassifier(n_estimators = 1000 ,max_depth = m,random_state = 4)
    scores = 0
    # kf.split yields integer *position* arrays for the train/validation rows.
    for train_index, test_index in kf.split(X_train):
          #print("Train:", train_index, "Validation:",test_index)
        # NOTE: this is the line that fails. When X_train is a pandas DataFrame,
        # X_train[train_index] is a column lookup by label, so the row positions
        # are not found and pandas raises
        # KeyError: "None of [Int64Index([...])] are in the [columns]".
        clf.fit(X_train[train_index], Y_train[train_index])
#         pred = clf.predict(X_train[test_index])
#         scores += log_loss(Y_train[test_index], pred) / 5
#     scores_m.append(scores)
#     if scores < min_scores:
#         min_scores = scores
#         best_m = m
#
# print(best_m, min_scores)  # print the best max_depth and its loss
# print(scores_m)  # print the loss for each candidate max_depth

错误提示:

KeyError: "None of [Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,\n            ...\n            826, 828, 829, 830, 831, 833, 834, 835, 836, 837],\n           dtype='int64', length=670)] are in the [columns]"

解决方案:
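很明显是索引方式出了问题:kf.split 返回的 train_index 是行的位置(整数数组),而对 DataFrame 使用 X_train[train_index] 时,pandas 会把这些整数当作列名去查找,找不到就报 KeyError。对 DataFrame 取数据应使用以下两种索引方式之一: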

  1. .iloc[index, :],其中 index 是行的整数位置
  2. .loc[:, '列名'],其中引号内为列名(按标签索引)

KFold 返回的是位置索引,所以这里选择按位置索引的 .iloc:

# Select rows by position. The single-argument form works for a DataFrame and
# for a 1-D Series target, where `.iloc[train_index, :]` would raise
# "IndexingError: Too many indexers".
clf.fit(X_train.iloc[train_index], Y_train.iloc[train_index])

在这里插入图片描述

"""Titanic survival model tuned for recall (target: recall >= 90%).

Loads train.csv / test.csv, engineers a few features, cross-validates a
random forest while searching a decision threshold per fold, then fits a
final model and writes submission_recall90.csv.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    confusion_matrix,
    precision_recall_curve,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Raw input columns; Name is kept only so a title can be extracted from it.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Name']
X = train_data[features]
y = train_data['Survived']


def feature_engineering(df):
    """Return a copy of *df* with FamilySize, IsAlone and Title added.

    The Name column is dropped after the title has been extracted from it.
    The input frame is not modified.
    """
    df = df.copy()
    # Family size = siblings/spouses + parents/children + the passenger.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    # 1 when the passenger travels alone, else 0.
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    # Title is the word directly before the first period, e.g. "Mr", "Miss".
    # Raw string so that `\.` is a regex escape, not a string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Only Name needs dropping; Ticket and Cabin were never selected.
    return df.drop(columns=['Name'])


X = feature_engineering(X)

# Preprocessing pipeline
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Preprocess the full training set once, just to report its shape.
X_processed = preprocessor.fit_transform(X)
print('预处理后的训练数据形状:', X_processed.shape)

# Random forest, chosen for its interpretable feature importances.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    min_samples_split=5,
    class_weight='balanced',  # compensate for class imbalance
    random_state=42,
)

# Stratified 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
recall_scores = []
fold_thresholds = []

# Candidate decision thresholds, ascending from 0.1 to 0.5.
thresholds = np.linspace(0.1, 0.5, 50)

for train_index, val_index in kf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit the preprocessor on the training fold only, to avoid leakage.
    X_train_proc = preprocessor.fit_transform(X_train)
    X_val_proc = preprocessor.transform(X_val)

    model.fit(X_train_proc, y_train)

    # Predicted probability of the positive class (Survived == 1).
    y_probs = model.predict_proba(X_val_proc)[:, 1]

    # Recall can only fall as the threshold rises, so scan from the highest
    # candidate down and stop at the first one reaching recall >= 90%. That
    # is the largest qualifying threshold, which costs the least precision.
    # If none qualifies, the default of 0.5 is kept.
    best_threshold = 0.5
    for threshold in thresholds[::-1]:
        y_pred = (y_probs >= threshold).astype(int)
        if recall_score(y_val, y_pred) >= 0.90:
            best_threshold = threshold
            break

    # Score this fold with its own best threshold.
    y_pred = (y_probs >= best_threshold).astype(int)
    recall_scores.append(recall_score(y_val, y_pred))
    fold_thresholds.append(best_threshold)

print(f"平均召回率: {np.mean(recall_scores):.2%} ± {np.std(recall_scores):.2%}")

# Use the mean of the per-fold thresholds for the final model rather than
# whatever value the last fold happened to leave behind.
best_threshold = float(np.mean(fold_thresholds))

# Final model, trained on all of the training data.
X_processed = preprocessor.fit_transform(X)
model.fit(X_processed, y)

# Confusion matrix. NOTE: computed on the training data, so it is optimistic.
y_probs = model.predict_proba(X_processed)[:, 1]
y_pred = (y_probs >= best_threshold).astype(int)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Died', 'Survived'])
disp.plot(cmap='Blues')
plt.title('Confusion Matrix (Recall Optimized)')
plt.show()

# Feature importances. The transformed column names get their own variable
# so that `features` (the raw column list) is still intact for the test set.
importances = model.feature_importances_
feature_names = preprocessor.get_feature_names_out()
top_idx = np.argsort(importances)[-10:]
plt.barh(feature_names[top_idx], importances[top_idx])
plt.title('Top 10 Important Features')
plt.show()

# Final check (training-set recall)
final_recall = recall_score(y, y_pred)
print(f"最终召回率: {final_recall:.2%}")

# Predict the test set
X_test = feature_engineering(test_data[features])
X_test_proc = preprocessor.transform(X_test)
test_probs = model.predict_proba(X_test_proc)[:, 1]
test_preds = (test_probs >= best_threshold).astype(int)

# Write the submission file
output = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'Survived': test_preds,
})
output.to_csv('submission_recall90.csv', index=False)
最新发布
07-30
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值