sklearn.feature_selection Explained

This article explains sklearn's SelectKBest class, which keeps the K features with the highest scores under a per-feature scoring function; its default scorer is intended for classification tasks. A worked example applies SelectKBest with the chi-squared test (chi2) to feature selection on the digits dataset.


class sklearn.feature_selection.SelectKBest(score_func=f_classif, k=10)
Purpose: select the k features with the highest scores.

Parameters:
score_func : callable
Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. Default is f_classif, which computes the ANOVA F-value and only works with classification tasks.
k : int or "all", optional, default=10
Number of top features to select; the K highest-scoring features are kept. The "all" option bypasses selection, for use in a parameter search.
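To make the score_func contract concrete, here is a minimal sketch (using the iris dataset purely for illustration). It first passes the default f_classif scorer explicitly, then a custom callable, abs_corr, a hypothetical helper that returns a single array of scores:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)

# Default scorer: f_classif computes the ANOVA F-value between each
# feature and the class labels (classification tasks only).
X_f = SelectKBest(score_func=f_classif, k=2).fit_transform(X, y)
print(X_f.shape)  # (150, 2)

# A score_func may instead return a single array of scores. abs_corr is
# a hypothetical helper that scores each feature by its absolute
# Pearson correlation with y.
def abs_corr(X, y):
    return np.abs([np.corrcoef(X[:, i], y)[0, 1] for i in range(X.shape[1])])

X_c = SelectKBest(score_func=abs_corr, k=2).fit_transform(X, y)
print(X_c.shape)  # (150, 2)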

Class methods:
fit(X, y): run the score function on (X, y), scoring each feature of X against y.
fit_transform(X[, y]): fit to the data, then transform it, keeping only the K highest-scoring features of X.
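The sketch below shows both methods on the digits dataset, along with the fitted attributes they populate; scores_, pvalues_, and get_support() are standard SelectKBest API:

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

X, y = load_digits(return_X_y=True)

selector = SelectKBest(chi2, k=20)
selector.fit(X, y)             # score every feature of X against y

print(selector.scores_[:5])    # chi-squared statistic per feature
print(selector.pvalues_[:5])   # corresponding p-values
mask = selector.get_support()  # boolean mask of the 20 kept columns
X_new = selector.transform(X)  # equivalent to fit_transform(X, y)
print(X_new.shape)             # (1797, 20)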

Example:

>>> from sklearn.datasets import load_digits
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> X, y = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
>>> X_new.shape
(1797, 20)
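As noted under the k parameter, the "all" option is mainly useful inside a parameter search. A minimal sketch of tuning k in a Pipeline follows; LogisticRegression is an arbitrary downstream classifier chosen for illustration, not part of the original example:

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = load_digits(return_X_y=True)

pipe = Pipeline([
    ("select", SelectKBest(chi2)),
    ("clf", LogisticRegression(max_iter=1000)),
])

# k="all" bypasses selection, giving a no-selection baseline in the grid.
param_grid = {"select__k": [10, 20, 40, "all"]}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X, y)
print(search.best_params_)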