Usage of the pandas functions sort_values, set_index, reset_index, cumsum, and groupby

This article walks through sorting, grouping, accumulating, and index manipulation with Pandas and NumPy, with concrete examples of Pandas' sort_values, groupby, set_index, and reset_index functions and NumPy's cumsum function.
import pandas as pd


# sort_values() sorts the rows by the values in the selected column(s)
df=pd.DataFrame({'A':[3,1,1,6,7],'B':['a','d','c','b','e'],'C':[123,343,122,978,459]})

print(df.sort_values('A'))
# Result
'''
   A  B    C
1  1  d  343
2  1  c  122
0  3  a  123
3  6  b  978
4  7  e  459

'''

# With multiple sort keys, the columns are applied in order: rows are first sorted
# by the first key, and ties within it are then broken by the next key
print(df.sort_values(['A','B']))
# Result
'''
   A  B    C
2  1  c  122
1  1  d  343
0  3  a  123
3  6  b  978
4  7  e  459
'''
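
# (Added example) sort_values also takes an ascending flag, either a single bool
# or one per sort key; a minimal sketch on the same df:
print(df.sort_values(['A','B'], ascending=[True, False]))
# Result: ties in 'A' are now broken by 'B' in descending order
'''
   A  B    C
1  1  d  343
2  1  c  122
0  3  a  123
3  6  b  978
4  7  e  459
'''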

# set_index / reset_index: set a column (or columns) as the index / move the index back into columns
df=pd.DataFrame({'A':['a','a','e','a','e'],'B':[4,6,5,7,5],'C':[i for i in range(23,28)]})
print(df.set_index('A'))
# Result
'''
   B   C
A       
a  4  23
a  6  24
e  5  25
a  7  26
e  5  27

'''
sets=df.set_index(['A','B'])
print(sets)
# Result
'''
      C
A B    
a 4  23
  6  24
e 5  25
a 7  26
e 5  27
'''
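# (Added example) once set, the MultiIndex supports label-based selection with
# .loc; a minimal sketch selecting on the first index level:
print(sets.loc['a'])
# Result: the rows where A == 'a', indexed by the remaining level B
'''
    C
B    
4  23
6  24
7  26
'''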
sets1=sets.reset_index(['A','B'])
print(sets1)
# Result
'''
   A  B   C
0  a  4  23
1  a  6  24
2  e  5  25
3  a  7  26
4  e  5  27

'''
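# (Added example) reset_index(drop=True) discards the index levels instead of
# restoring them as columns; a minimal sketch:
print(sets.reset_index(drop=True))
# Result
'''
    C
0  23
1  24
2  25
3  26
4  27
'''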

# cumsum: cumulative sum along an axis; returns an array of the intermediate running totals
import numpy as np
# a 2x2x3 array
arr = np.array([[[1,2,3],[8,9,12]],[[1,2,4],[2,4,5]]])
print(arr.cumsum(0))
# Result
'''
[[[ 1  2  3]
  [ 8  9 12]]

 [[ 2  4  7]
  [10 13 17]]]
'''
print(arr.cumsum(1))
# Result
'''
[[[ 1  2  3]
  [ 9 11 15]]

 [[ 1  2  4]
  [ 3  6  9]]]
'''
print(arr.cumsum(2))
# Result
'''
[[[ 1  3  6]
  [ 8 17 29]]

 [[ 1  3  7]
  [ 2  6 11]]]
'''
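
# (Added example) pandas Series and DataFrames expose the same cumsum method;
# a minimal sketch on column 'C' of the df defined above:
print(df['C'].cumsum())
# Result
'''
0     23
1     47
2     72
3     98
4    125
Name: C, dtype: int64
'''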

# groupby
df=pd.DataFrame({'A':['age','bwr','age','bwr','dfd'],'B':[1,2,1,5,6],'C':[345,23,345,35,33]})
# groupby alone performs no computation; below we take the mean of column 'B' within each group
print(df.groupby('A')['B'].mean())
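# Result
'''
A
age    1.0
bwr    3.5
dfd    6.0
Name: B, dtype: float64
'''

# (Added example) agg applies several reductions per group at once; a minimal
# sketch on column 'C':
print(df.groupby('A')['C'].agg(['sum', 'mean', 'count']))
# Result
'''
     sum   mean  count
A                     
age  690  345.0      2
bwr   58   29.0      2
dfd   33   33.0      1
'''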
What I'm referring to is the following code:

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid, train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import json
import joblib
import seaborn as sns
from scipy.stats import ks_2samp

# ======================
# 1. Special-value handling (optimized)
# ======================
def create_special_flags(X, special_values):
    """Create indicator features for variables with special values, replacing those values with np.nan"""
    X_processed = X.copy()
    # Add special-value indicator features
    for col, values in special_values.items():
        if col in X.columns:
            # Create the indicator column
            X_processed[f"{col}_IS_SPECIAL"] = X[col].isin(values).astype(int)
            # Replace special values with nan (XGBoost handles missing values natively)
            X_processed[col] = X[col].replace(values, np.nan)
    return X_processed

# ======================
# 2. Metric calculation (y=1 is the bad class)
# ======================
def calculate_ks(y_true, y_prob):
    """Compute the KS statistic"""
    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    df = df.sort_values('y_prob')
    df['cum_bad'] = df['y_true'].cumsum() / df['y_true'].sum()  # y=1 is bad
    df['cum_good'] = (1 - df['y_true']).cumsum() / (1 - df['y_true']).sum()
    return np.max(df['cum_bad'] - df['cum_good'])

def calculate_ar(y_true, y_prob):
    """Compute the AR (Accuracy Ratio)"""
    auc = roc_auc_score(y_true, y_prob)
    return 2 * auc - 1

def calculate_psi(expected, actual, buckets=10):
    """Compute the PSI (Population Stability Index)"""
    # Binning
    breakpoints = np.linspace(0, 1, buckets + 1)[1:-1]
    expected_bins = np.histogram(expected, bins=breakpoints)[0] / len(expected)
    actual_bins = np.histogram(actual, bins=breakpoints)[0] / len(actual)
    # Avoid zero values
    expected_bins = np.where(expected_bins == 0, 0.0001, expected_bins)
    actual_bins = np.where(actual_bins == 0, 0.0001, actual_bins)
    # Compute PSI
    psi = np.sum((actual_bins - expected_bins) * np.log(actual_bins / expected_bins))
    return psi

# ======================
# 3. Imbalanced-data handling (optimized)
# ======================
def apply_sampling(X, y, method):
    """Handle class imbalance (y=1 is the bad class, which needs to be boosted)"""
    # Target bad-sample ratio (10%)
    target_minority_ratio = 0.10
    n_minority_target = int(len(y) * target_minority_ratio)
    if method == 'smote':
        sampler = SMOTE(sampling_strategy={1: n_minority_target}, random_state=42)
    elif method == 'adasyn':
        sampler = ADASYN(sampling_strategy={1: n_minority_target}, random_state=42)
    elif method == 'undersample':
        # Keep all bad samples; undersample the good ones
        n_minority = y.sum()
        n_majority_target = int(n_minority * (1 - target_minority_ratio) / target_minority_ratio)
        sampler = RandomUnderSampler(sampling_strategy={0: n_majority_target}, random_state=42)
    elif method == 'smoteenn':
        sampler = SMOTEENN(sampling_strategy={1: n_minority_target}, random_state=42)
    elif method == 'weighted':
        # Sample-weight scheme
        weight_ratio = len(y[y==0]) / len(y[y==1])
        sample_weights = np.where(y == 1, weight_ratio, 1)
        return X, y, sample_weights
    else:  # raw
        return X, y, None
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res, None

# ======================
# 4. XGBoost parameter grid (optimized)
# ======================
def get_param_grid(y_train):
    pos_count = len(y_train[y_train==0])  # y=0 is the good class
    neg_count = len(y_train[y_train==1])  # y=1 is the bad class
    return {
        'objective': ['binary:logistic'],
        'eval_metric': ['auc'],
        'eta': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.3, 0.5, 0.7],
        'gamma': [0, 0.1, 0.2],
        'scale_pos_weight': [1, neg_count/pos_count, neg_count/pos_count*2],
        'lambda': [0.1, 1, 10],
        'alpha': [0, 0.1, 1],
        'n_estimators': [200, 500, 1000],
        'tree_method': ['hist', 'gpu_hist']  # GPU acceleration supported
    }

# ======================
# 5. Model training and evaluation (optimized)
# ======================
def train_and_evaluate(X_train, y_train, X_val, y_val, params, sample_weights=None):
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)
    dval = xgb.DMatrix(X_val, label=y_val)
    evals_result = {}
    model = xgb.train(
        params, dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=50,
        evals_result=evals_result,
        verbose_eval=False
    )
    return model, evals_result

# ======================
# 6. Scorecard generation (optimized)
# ======================
def create_score_table(y_true, y_prob, n_bins=20):
    """Build the scorecard binning table"""
    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    df = df.sort_values('y_prob', ascending=False)  # high risk to low risk
    # Equal-frequency binning
    df['bin'] = pd.qcut(df['y_prob'], q=n_bins, duplicates='drop')
    total_samples = len(df)
    total_bad = df['y_true'].sum()
    total_good = total_samples - total_bad
    bin_stats = df.groupby('bin').agg(
        total=('y_true', 'count'),
        bad=('y_true', 'sum'),                    # y=1 is bad
        good=('y_true', lambda x: len(x)-sum(x))  # y=0 is good
    ).reset_index()
    # Compute the various ratios
    bin_stats['bad_rate'] = bin_stats['bad'] / bin_stats['total']
    bin_stats['cum_total'] = bin_stats['total'].cumsum()
    bin_stats['cum_bad'] = bin_stats['bad'].cumsum()
    bin_stats['cum_good'] = bin_stats['good'].cumsum()
    bin_stats['cum_total_pct'] = bin_stats['cum_total'] / total_samples
    bin_stats['cum_bad_pct'] = bin_stats['cum_bad'] / total_bad
    bin_stats['cum_good_pct'] = bin_stats['cum_good'] / total_good
    bin_stats['reject_rate'] = bin_stats['cum_bad'] / bin_stats['cum_total']  # cumulative reject rate
    bin_stats['lift'] = bin_stats['bad_rate'] / (total_bad/total_samples)     # lift
    return bin_stats

# ======================
# 7. Scorecard visualization (new)
# ======================
def plot_score_card(bin_stats, title, save_path=None):
    """Visualize the scorecard results"""
    fig, ax1 = plt.subplots(figsize=(12, 8))
    # Cumulative capture curves
    ax1.plot(bin_stats.index, bin_stats['cum_bad_pct'], 'b-', label='cumulative bad capture rate')
    ax1.plot(bin_stats.index, bin_stats['cum_total_pct'], 'g-', label='cumulative sample share')
    ax1.set_xlabel('bin (high risk -> low risk)')
    ax1.set_ylabel('proportion')
    ax1.set_title(f'{title} - scorecard analysis')
    # Per-bin bad-rate curve
    ax2 = ax1.twinx()
    ax2.plot(bin_stats.index, bin_stats['bad_rate'], 'r-', label='per-bin bad rate')
    ax2.set_ylabel('bad rate')
    # Reference lines
    ax1.axhline(y=0.8, color='gray', linestyle='--', alpha=0.5)
    ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
    # Legend
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines + lines2, labels + labels2, loc='upper left')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.close()

# ======================
# 8. Main training flow (optimized)
# ======================
def main_training_flow(X_train, y_train, X_test, y_test, X_oot, y_oot, special_values):
    # Special-value handling
    X_train_proc = create_special_flags(X_train, special_values)
    X_test_proc = create_special_flags(X_test, special_values)
    X_oot_proc = create_special_flags(X_oot, special_values)

    # Sampling methods to compare
    sampling_methods = {
        'raw': None,
        'smote': 'smote',
        'adasyn': 'adasyn',
        'undersample': 'undersample',
        'smoteenn': 'smoteenn',
        'weighted': 'weighted'
    }

    all_results = []
    param_grid = get_param_grid(y_train)

    for method_name, method in sampling_methods.items():
        print(f"\n=== Sampling method: {method_name} ===")

        # Handle class imbalance
        if method:
            X_res, y_res, sample_weights = apply_sampling(X_train_proc, y_train, method)
        else:
            X_res, y_res, sample_weights = X_train_proc.copy(), y_train.copy(), None

        # Carve a validation set out of the training data
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

        # Parameter search
        for params in ParameterGrid(param_grid):
            print(f"Training params: {params}")

            # Train the model
            model, evals_result = train_and_evaluate(
                X_tr, y_tr, X_val, y_val, params,
                sample_weights=sample_weights
            )

            # Evaluation metrics
            def evaluate(X, y):
                y_prob = model.predict(xgb.DMatrix(X))
                return {
                    'auc': roc_auc_score(y, y_prob),
                    'ks': calculate_ks(y, y_prob),
                    'ar': calculate_ar(y, y_prob),
                    'accuracy': accuracy_score(y, (y_prob > 0.5).astype(int)),
                    'precision': precision_score(y, (y_prob > 0.5).astype(int), pos_label=1),
                    'recall': recall_score(y, (y_prob > 0.5).astype(int), pos_label=1),
                    'f1': f1_score(y, (y_prob > 0.5).astype(int), pos_label=1)
                }

            # Record results
            train_metrics = evaluate(X_tr, y_tr)
            val_metrics = evaluate(X_val, y_val)
            test_metrics = evaluate(X_test_proc, y_test)
            oot_metrics = evaluate(X_oot_proc, y_oot)

            # Stability metric
            y_prob_train = model.predict(xgb.DMatrix(X_tr))
            y_prob_oot = model.predict(xgb.DMatrix(X_oot_proc))
            psi = calculate_psi(y_prob_train, y_prob_oot)

            result = {
                'sampling_method': method_name,
                **params,
                **{'train_' + k: v for k, v in train_metrics.items()},
                **{'val_' + k: v for k, v in val_metrics.items()},
                **{'test_' + k: v for k, v in test_metrics.items()},
                **{'oot_' + k: v for k, v in oot_metrics.items()},
                'psi': psi,
                'best_iteration': model.best_iteration
            }
            all_results.append(result)

            # Save incrementally
            result_df = pd.DataFrame(all_results)
            result_df.to_csv('xgb_results_all.csv', index=False)

            # Report progress every 10 combinations
            if len(all_results) % 10 == 0:
                print(f"Finished {len(all_results)} parameter combinations")

    return pd.DataFrame(all_results)

# ======================
# 9. Result analysis and visualization (optimized)
# ======================
def analyze_results(results_df):
    # Pick the best models (based on test-set KS and stability)
    results_df['stability'] = 1 - abs(results_df['test_ks'] - results_df['oot_ks'])
    results_df['combined_score'] = results_df['test_ks'] * 0.7 + results_df['stability'] * 0.3
    best_by_ks = results_df.loc[results_df['test_ks'].idxmax()]
    best_by_ar = results_df.loc[results_df['test_ar'].idxmax()]
    best_by_stability = results_df.loc[results_df['combined_score'].idxmax()]

    # Retrain the final model (on all DEV data)
    def train_final_model(params, X_train, y_train, X_test, y_test, sampling_method):
        # Merge train and test sets
        X_full = pd.concat([X_train, X_test])
        y_full = np.concatenate([y_train, y_test])
        # Apply the sampling method
        if sampling_method != 'raw' and sampling_method != 'weighted':
            X_full, y_full, _ = apply_sampling(X_full, y_full, sampling_method)
        # Train the final model
        dtrain = xgb.DMatrix(X_full, label=y_full)
        return xgb.train(
            params, dtrain,
            num_boost_round=int(params['best_iteration'] * 1.1)  # slightly more rounds
        )

    # Generate outputs for each best model
    output = {}
    for name, best in [('ks', best_by_ks), ('ar', best_by_ar), ('stability', best_by_stability)]:
        params = {k: v for k, v in best.items() if k in get_param_grid([]).keys()}
        sampling_method = best['sampling_method']

        # Special-value handling
        X_train_proc = create_special_flags(X_train, special_values)
        X_test_proc = create_special_flags(X_test, special_values)
        X_oot_proc = create_special_flags(X_oot, special_values)

        # Train the final model
        model = train_final_model(
            params, X_train_proc, y_train,
            X_test_proc, y_test, sampling_method
        )

        # Save the model
        model.save_model(f'best_model_by_{name}.json')

        # Feature-importance analysis
        importance = model.get_score(importance_type='gain')
        importance_df = pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        }).sort_values('importance', ascending=False)
        importance_df.to_csv(f'feature_importance_{name}.csv', index=False)

        # Build the score tables
        score_tables = {}
        for data_name, X, y in [
            ('train', X_train, y_train),
            ('test', X_test, y_test),
            ('oot', X_oot, y_oot)
        ]:
            X_processed = create_special_flags(X, special_values)
            y_prob = model.predict(xgb.DMatrix(X_processed))
            score_table = create_score_table(y, y_prob)
            score_tables[data_name] = score_table
            # Save the score table
            score_table.to_csv(f'score_table_{name}_{data_name}.csv', index=False)
            # Plot the scorecard
            plot_score_card(
                score_table,
                f'{name.upper()} model - {data_name} set',
                f'score_card_{name}_{data_name}.png'
            )
            # Plot the predicted-probability distributions
            plt.figure(figsize=(12, 8))
            sns.kdeplot(y_prob[y==0], label='good (y=0)', fill=True, alpha=0.3)
            sns.kdeplot(y_prob[y==1], label='bad (y=1)', fill=True, alpha=0.3)
            plt.title(f'{data_name} set predicted probability distribution ({name.upper()} model)')
            plt.xlabel('predicted probability')
            plt.ylabel('density')
            plt.legend()
            plt.savefig(f'prob_dist_{name}_{data_name}.png')
            plt.close()

        output[name] = {
            'params': params,
            'sampling_method': sampling_method,
            'score_tables': score_tables,
            'feature_importance': importance_df.to_dict()
        }

    # Save the results
    with open('final_results.json', 'w') as f:
        json.dump(output, f, indent=2)

    # Model comparison report
    generate_model_comparison(output)
    return output

# ======================
# 10. Model comparison report (new)
# ======================
def generate_model_comparison(output):
    """Generate the model comparison report"""
    comparison = []
    for name, results in output.items():
        # Test-set performance
        test_ks = results['score_tables']['test'].iloc[0]['ks']  # assumes KS sits in the first row
        comparison.append({
            'model_name': name,
            'sampling_method': results['sampling_method'],
            'test_ks': results['score_tables']['test'].iloc[0]['ks'],
            'test_ar': results['score_tables']['test'].iloc[0]['ar'],
            'oot_ks': results['score_tables']['oot'].iloc[0]['ks'],
            'psi': results['score_tables']['oot'].iloc[0]['psi'],
            'stability': 1 - abs(results['score_tables']['test'].iloc[0]['ks'] -
                                 results['score_tables']['oot'].iloc[0]['ks']),
            'top_features': ', '.join(results['feature_importance']['feature'][:5])
        })

    # Save the comparison report
    comparison_df = pd.DataFrame(comparison)
    comparison_df.to_csv('model_comparison.csv', index=False)

    # Visual comparison
    plt.figure(figsize=(12, 8))
    comparison_df.set_index('model_name')[['test_ks', 'oot_ks', 'stability']].plot.bar()
    plt.title('Model performance comparison')
    plt.ylabel('score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()

# ======================
# Main entry point
# ======================
if __name__ == "__main__":
    # Assumes the data is already loaded
    # X_train, y_train, X_test, y_test, X_oot, y_oot = load_data()
    # special_values = {...}

    # Run training
    results = main_training_flow(
        X_train, y_train,
        X_test, y_test,
        X_oot, y_oot,
        special_values
    )

    # Analyze the results
    final_results = analyze_results(results)

But I've found that a complete run takes a very long time, and there are still bugs, so I'd like to add a preview version that runs the code once in a simplified but complete way, to make sure the code is correct. As for errors, `X_oot is not defined` is the one I hit most.
08-06
Your code is very complete and quite powerful; it covers the whole pipeline from data preprocessing, sampling, model training, evaluation, and parameter tuning through to result analysis. But as you said, a full run takes a long time and breaks easily, e.g. `X_oot` undefined, `y_oot` undefined, and so on.

---

### ✅ Solution: write a **preview version** of the code

We'll write a simplified version with the following properties:

- Uses a **small sample dataset** (via `sklearn.datasets.make_classification`)
- Shrinks the parameter search space
- Runs the complete flow exactly once (no sweep over all parameters)
- Fixes the undefined `X_oot` problem (supplies dummy data)
- Keeps only the core logic, so you can debug and verify that the code logic is correct

---

### ✅ Preview code (simplified verification version)

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE

# ======================
# 1. Generate a small sample dataset
# ======================
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, n_features=10, n_informative=5,
                           n_redundant=2, n_classes=2, random_state=42)
X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
y = pd.Series(y, name='target')

# Split into train, validation, and OOT sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
X_oot, y_oot = X_test.copy(), y_test.copy()  # simulated OOT set

# Define special values (example)
special_values = {
    "feature_0": [-2.5, 3.0],
    "feature_1": [2.0]
}

# ======================
# 2. Special-value handling (simplified)
# ======================
def create_special_flags(X, special_values):
    X_processed = X.copy()
    for col, values in special_values.items():
        if col in X.columns:
            X_processed[f"{col}_IS_SPECIAL"] = X[col].isin(values).astype(int)
            X_processed[col] = X[col].replace(values, np.nan)
    return X_processed

# Apply the processing
X_train_proc = create_special_flags(X_train, special_values)
X_val_proc = create_special_flags(X_val, special_values)
X_test_proc = create_special_flags(X_test, special_values)
X_oot_proc = create_special_flags(X_oot, special_values)

# ======================
# 3. Imbalance handling (simplified)
# ======================
def apply_sampling(X, y, method='smote'):
    if method == 'smote':
        sampler = SMOTE(random_state=42)
        X_res, y_res = sampler.fit_resample(X, y)
        return X_res, y_res
    else:
        return X, y

X_res, y_res = apply_sampling(X_train_proc, y_train, method='smote')

# ======================
# 4. Parameter settings
# ======================
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'gamma': 0,
    'scale_pos_weight': 1,
    'lambda': 1,
    'alpha': 0,
    'n_estimators': 100,
    'tree_method': 'hist'
}

# ======================
# 5. Model training and evaluation
# ======================
dtrain = xgb.DMatrix(X_res, label=y_res)
dval = xgb.DMatrix(X_val_proc, label=y_val)

evals_result = {}
model = xgb.train(
    params, dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=10,
    evals_result=evals_result,
    verbose_eval=False
)

# ======================
# 6. Evaluation function (simplified)
# ======================
def evaluate(X, y, model):
    y_prob = model.predict(xgb.DMatrix(X))
    return {
        'auc': roc_auc_score(y, y_prob),
        'accuracy': accuracy_score(y, (y_prob > 0.5).astype(int)),
    }

# Evaluate
print("Train Metrics:", evaluate(X_res, y_res, model))
print("Val Metrics:", evaluate(X_val_proc, y_val, model))
print("Test Metrics:", evaluate(X_test_proc, y_test, model))
print("OOT Metrics:", evaluate(X_oot_proc, y_oot, model))

# ======================
# 7. Feature importance (optional)
# ======================
import matplotlib.pyplot as plt

importance = model.get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

importance_df.plot(kind='barh', x='feature', y='importance', title='Feature Importance')
plt.show()
```

---

### ✅ Notes on the code:

- **Uses `make_classification` to generate data**, avoiding any dependency on external data files
- **Runs only one parameter combination**, saving time
- **Defines `X_oot` and `y_oot`**, so the NameError goes away
- **Keeps the complete flow**: data processing, sampling, training, evaluation, feature-importance analysis
- **Good for quickly verifying that the logic hangs together**
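One extra note beyond the answer above: in the full script, the `X_oot is not defined` NameError comes from the `__main__` block, which calls `main_training_flow(X_train, ..., X_oot, ...)` while the actual data loading is still commented out. A minimal guard (the `load_data` helper here is hypothetical, standing in for whatever loading routine you actually have) makes that failure explicit instead of crashing mid-call:

```python
# Sketch: fail fast with a clear message if the datasets were never loaded.
# `load_data` is a hypothetical placeholder for your own loading routine.
if __name__ == "__main__":
    try:
        X_train, y_train, X_test, y_test, X_oot, y_oot = load_data()
    except NameError as exc:
        raise SystemExit(
            "Datasets not loaded: define load_data() or assign X_train/y_train/"
            "X_test/y_test/X_oot/y_oot before running the pipeline."
        ) from exc
```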