Using itertools.product to generate high-dimensional indices

import itertools

print(list(itertools.product(range(3), range(3))))

Output: [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]
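For more dimensions, itertools.product takes a repeat keyword, which is handy for walking every index of an n-dimensional array. A minimal sketch (the array shape here is only an illustration):

import itertools
import numpy as np

# The same 3x3 grid, written with repeat
print(list(itertools.product(range(3), repeat=2)))

# Visit every index of a 3-D array without nested for-loops
arr = np.arange(24).reshape(2, 3, 4)
for idx in itertools.product(*(range(s) for s in arr.shape)):
    _ = arr[idx]  # idx is a tuple such as (0, 1, 2)

numpy's own np.ndindex(arr.shape) yields the same index tuples if you prefer a single call.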

 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, accuracy_score, f1_score,
                             precision_score, recall_score, confusion_matrix)
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.ensemble import BalancedRandomForestClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Dense, Dropout, BatchNormalization,
                                     Embedding, Flatten, Input, concatenate)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import json
import os
from datetime import datetime
from itertools import product


class SpecialValueEmbedding:
    def __init__(self, special_values):
        """
        Initialize the special-value mapping.
        :param special_values: dict, e.g. {'col1': [-999, 999], 'col2': [99999]}
        """
        self.special_values = special_values
        self.special_value_maps = {}  # per-column mapping tables

    def replace_special_values(self, df):
        """Map special values to integer labels; 0 stands for a normal value."""
        df = df.copy()
        for col, values in self.special_values.items():
            if col in df.columns:
                mapping = {val: i + 1 for i, val in enumerate(values)}
                mapping['normal'] = 0
                self.special_value_maps[col] = mapping

                def replace_func(x):
                    return mapping.get(x, mapping['normal'])

                df[col] = df[col].apply(replace_func)
        return df


def bin_scores_by_fixed_width(y_true, y_prob, n_bins=10):
    """
    Fixed-width score binning (e.g. one bin per 10 points).
    :param y_true: ground-truth labels (0/1)
    :param y_prob: predicted probabilities
    :param n_bins: number of bins (default 10)
    :return: per-bin statistics as a DataFrame
    """
    # Rescale probabilities to a 0-100 score
    scores = y_prob * 100
    bins = np.linspace(0, 100, n_bins + 1)

    df = pd.DataFrame({'score': scores, 'y_true': y_true})

    # Equal-width binning with pd.cut
    df['bin'] = pd.cut(df['score'], bins=bins, include_lowest=True)

    # observed=False keeps empty bins in the output
    result = df.groupby('bin', observed=False).agg(
        total=('y_true', 'size'),
        bads=('y_true', 'sum'),
        goods=('y_true', lambda x: (x == 0).sum()),
        avg_score=('score', 'mean')
    ).reset_index()

    result['bad_rate'] = result['bads'] / result['total']
    result['good_rate'] = result['goods'] / result['total']
    result['bin_start'] = result['bin'].apply(lambda x: float(x.left))
    result['bin_end'] = result['bin'].apply(lambda x: float(x.right))

    return result[['bin_start', 'bin_end', 'total', 'bads', 'goods',
                   'bad_rate', 'good_rate', 'avg_score']]


class DNNAutoSpecialValueModel:
    def __init__(self, special_values, output_dir='results'):
        self.special_values = special_values
        self.output_dir = output_dir
        self.special_value_maps = {}
        os.makedirs(output_dir, exist_ok=True)

    def evaluate_model(self, y_true, y_pred, y_prob):
        """Compute a battery of evaluation metrics."""
        metrics = {
            'KS': self._calc_ks(y_true, y_prob),
            'AR': 2 * roc_auc_score(y_true, y_prob) - 1,
            'AUC': roc_auc_score(y_true, y_prob),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred),
            'Recall': recall_score(y_true, y_pred),
            'Confusion_Matrix': confusion_matrix(y_true, y_pred).tolist()
        }
        return metrics

    def _calc_ks(self, y_true, y_prob):
        """Kolmogorov-Smirnov statistic."""
        df = pd.DataFrame({'y': y_true, 'prob': y_prob})
        df = df.sort_values('prob')
        df['bad_cum'] = df['y'].cumsum() / df['y'].sum()
        df['good_cum'] = (1 - df['y']).cumsum() / (1 - df['y']).sum()
        return np.max(np.abs(df['bad_cum'] - df['good_cum']))

    def handle_imbalance(self, X, y, method='smote'):
        """Resample to counter class imbalance."""
        if method == 'smote':
            sampler = SMOTE(random_state=42)
        elif method == 'adasyn':
            sampler = ADASYN(random_state=42)
        elif method == 'balanced_forest':
            return X, y  # the balanced forest needs no resampling
        else:
            raise ValueError(f"Unknown sampling method: {method}")
        X_res, y_res = sampler.fit_resample(X, y)
        return X_res, y_res

    def build_dnn_model(self, input_dim, special_flag_dims, params):
        """
        Build the DNN and attach one embedding layer per special-value column.
        :param input_dim: number of main features
        :param special_flag_dims: categories per special column
               (e.g. {'col1': 3} means col1 has 2 special values + 1 normal label)
        """
        main_input = Input(shape=(input_dim,), name='main_input')

        # Embedding layers for the special-value flags
        special_inputs = []
        special_embeddings = []
        for col, num_types in special_flag_dims.items():
            spec_input = Input(shape=(1,), name=f'{col}_flag', dtype='int32')
            embedding = Embedding(
                input_dim=num_types,
                output_dim=4,  # embedding size
                name=f'emb_{col}'
            )(spec_input)
            # Flatten (None, 1, 4) -> (None, 4) so it can be concatenated
            # with the 2-D output of the dense tower
            embedding = Flatten()(embedding)
            special_inputs.append(spec_input)
            special_embeddings.append(embedding)

        # Main tower
        x = Dense(params['units'][0], activation=params['activation'],
                  kernel_regularizer=l2(params['l2_reg']))(main_input)
        x = BatchNormalization()(x)
        x = Dropout(params['dropout'])(x)
        for units in params['units'][1:]:
            x = Dense(units, activation=params['activation'],
                      kernel_regularizer=l2(params['l2_reg']))(x)
            x = BatchNormalization()(x)
            x = Dropout(params['dropout'])(x)

        # Merge main features with the special-value embeddings
        if special_embeddings:
            merged = concatenate([x] + special_embeddings)
        else:
            merged = x

        # Output layer
        output = Dense(1, activation='sigmoid')(merged)

        # Assemble the full model
        model = Model(inputs=[main_input] + special_inputs, outputs=output)

        # Optimizer and loss
        optimizer = Adam(learning_rate=params.get('lr', 0.001))
        model.compile(optimizer=optimizer, loss='binary_crossentropy',
                      metrics=['accuracy', 'AUC'])
        return model

    def train_evaluate(self, X_train, y_train, X_test, y_test, params):
        # Resampling (skipped for the balanced forest)
        if params['balance_method'] != 'balanced_forest':
            X_train_res, y_train_res = self.handle_imbalance(
                X_train, y_train, params['balance_method'])
        else:
            X_train_res, y_train_res = X_train, y_train

        # Encode special values as integer flags
        special_value_processor = SpecialValueEmbedding(self.special_values)
        X_train_res = special_value_processor.replace_special_values(X_train_res)
        X_test_processed = special_value_processor.replace_special_values(X_test.copy())
        self.special_value_maps = special_value_processor.special_value_maps

        # Split main features from the special-value flag columns
        main_cols = [col for col in X_train_res.columns
                     if col not in self.special_values]
        special_cols = list(self.special_values.keys())

        X_train_main = X_train_res[main_cols].values
        X_test_main = X_test_processed[main_cols].values
        train_special = [X_train_res[col].values for col in special_cols]
        test_special = [X_test_processed[col].values for col in special_cols]
        special_flag_dims = {col: len(self.special_value_maps[col])
                             for col in special_cols}

        # Train the model
        if params['balance_method'] == 'balanced_forest':
            model = BalancedRandomForestClassifier(
                n_estimators=params.get('n_estimators', 100),
                random_state=42
            )
            model.fit(X_train_res, y_train_res)
            y_prob = model.predict_proba(X_test_processed)[:, 1]
        else:
            model = self.build_dnn_model(
                input_dim=len(main_cols),
                special_flag_dims=special_flag_dims,
                params=params
            )
            early_stop = EarlyStopping(monitor='val_loss', patience=5,
                                       restore_best_weights=True)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                          patience=3)

            # Carve out a validation set; splitting by index keeps the
            # main-feature array and every special-flag array aligned
            idx_train, idx_val = train_test_split(
                np.arange(len(y_train_res)), test_size=0.2, random_state=42)
            train_inputs = ([X_train_main[idx_train]] +
                            [arr[idx_train] for arr in train_special])
            val_inputs = ([X_train_main[idx_val]] +
                          [arr[idx_val] for arr in train_special])
            y_train_final = np.asarray(y_train_res)[idx_train]
            y_val = np.asarray(y_train_res)[idx_val]

            model.fit(
                train_inputs, y_train_final,
                validation_data=(val_inputs, y_val),
                epochs=params['epochs'],
                batch_size=params['batch_size'],
                callbacks=[early_stop, reduce_lr],
                verbose=0
            )
            y_prob = model.predict([X_test_main] + test_special).flatten()

        y_pred = (y_prob > 0.5).astype(int)
        metrics = self.evaluate_model(y_test, y_pred, y_prob)

        # Score binning statistics, one bin per 10 points
        score_bin_df = bin_scores_by_fixed_width(y_test, y_prob, n_bins=10)
        metrics['score_bins'] = score_bin_df.to_dict(orient='records')

        return metrics, model

    def run_experiments(self, X, y, param_grid):
        """Run one experiment per parameter combination."""
        results = []
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42)

        # Enumerate the full parameter grid
        param_names = list(param_grid.keys())
        param_values = list(param_grid.values())
        all_combinations = list(product(*param_values))

        for i, combination in enumerate(all_combinations):
            params = dict(zip(param_names, combination))
            print(f"\nRunning experiment {i+1}/{len(all_combinations)} "
                  f"with params: {params}")
            try:
                metrics, model = self.train_evaluate(
                    X_train, y_train, X_test, y_test, params)
                result = {
                    'params': params,
                    'metrics': metrics,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                results.append(result)
                # Persist each experiment as it finishes
                self._save_results(result, i)
            except Exception as e:
                print(f"Error in experiment {params}: {str(e)}")
                continue

        # Persist the aggregate results
        self._save_all_results(results)
        return results

    def _save_results(self, result, exp_id):
        """Persist a single experiment result as JSON."""
        filename = f"{self.output_dir}/exp_{exp_id}.json"
        with open(filename, 'w') as f:
            # default=str covers numpy scalars json cannot serialize natively
            json.dump(result, f, indent=2, default=str)

    def _save_all_results(self, results):
        """Persist all results to CSV plus a best-result summary."""
        df = pd.DataFrame([{
            **r['params'],
            **r['metrics']
        } for r in results])

        # Full results
        df.to_csv(f"{self.output_dir}/all_results.csv", index=False)

        # Best results by KS and by AUC
        best_by_ks = df.loc[df['KS'].idxmax()]
        best_by_auc = df.loc[df['AUC'].idxmax()]
        with open(f"{self.output_dir}/best_results.txt", 'w') as f:
            f.write("Best by KS:\n")
            f.write(best_by_ks.to_string())
            f.write("\n\nBest by AUC:\n")
            f.write(best_by_auc.to_string())


if __name__ == "__main__":
    # Example data (replace with your own)
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=10000, n_features=20,
                               n_informative=10, n_classes=2, random_state=42)
    X = pd.DataFrame(X)
    y = pd.Series(y)

    # The synthetic data has no special values; supply your own mapping,
    # e.g. {'col1': [-999, 999]}
    special_values = {}

    # Parameters tuned for a (21926, 2457) dataset with a
    # 98.82:1.18 positive/negative ratio
    param_grid = {
        'balance_method': ['smote'],   # augmentation better suited to a DNN
        'units': [
            [512, 256],                # larger towers for high-dimensional features
            [256, 128]
        ],
        'activation': ['relu'],        # stable, converges quickly
        'l2_reg': [0.001],             # L2 regularization against overfitting
        'dropout': [0.3],              # dropout for better generalization
        'lr': [0.001],                 # recommended learning rate for Adam
        'epochs': [100],               # high-dimensional data needs more epochs
        'batch_size': [128]            # larger batches for training stability
    }

    # Initialize and run the experiments
    evaluator = DNNAutoSpecialValueModel(special_values=special_values)
    results = evaluator.run_experiments(X, y, param_grid)
    print("All experiments completed. Results saved to the 'results' directory.")
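To see what the flag encoding actually produces, here is a minimal sketch against the SpecialValueEmbedding class above (the column name and special codes are made up for illustration):

import pandas as pd

# Hypothetical column where -999 and 999 are the special codes
df = pd.DataFrame({'col1': [-999, 0.5, 999, 1.2]})
enc = SpecialValueEmbedding({'col1': [-999, 999]})
print(enc.replace_special_values(df)['col1'].tolist())
# -> [1, 0, 2, 0]: special values map to 1..n, normal values to 0

These integer flags are exactly what feeds the per-column Embedding layers wired up in build_dnn_model.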