Scikit-learn 常用函数及其用法_sklearn常用函数-CSDN博客

本文链接：https://blog.csdn.net/YOU_AND_LIFE/article/details/146986475

Scikit-learn 常用函数及其用法

数据预处理

标准化 (StandardScaler)

from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix X, then reuse the fitted scaler on new data.
# Create the scaler
scaler = StandardScaler()
# Fit on the data and transform it in one step
X_scaled = scaler.fit_transform(X)
# Apply the same (already fitted) transformation to new data
new_data_scaled = scaler.transform(new_data)

参数说明:

with_mean: 是否中心化数据 (默认为True)

with_std: 是否缩放数据到单位方差 (默认为True)

归一化 (MinMaxScaler)

from sklearn.preprocessing import MinMaxScaler

# Rescale the features of X into the requested range (here 0 to 1).
scaler = MinMaxScaler(feature_range=(0, 1))
X_normalized = scaler.fit_transform(X)

参数说明:

feature_range: 缩放范围 (默认为(0,1))

分类特征编码

LabelEncoder (标签编码)

from sklearn.preprocessing import LabelEncoder

# Encode the labels in y, then map the encoded values back again.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Reverse the transformation to recover the original labels
y_original = le.inverse_transform(y_encoded)

OneHotEncoder (独热编码)

from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns and return a dense array.
# `sparse_output` is the current name of the old `sparse` keyword (renamed in
# scikit-learn 1.2, old name removed in 1.4); on releases older than 1.2 use
# `sparse=False` instead.
# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X_categorical)

参数说明:

sparse_output (scikit-learn 1.2 之前名为 sparse): 是否返回稀疏矩阵 (默认为True)

handle_unknown: 遇到未知类别时的处理方式 ('error' 或 'ignore')

数据集划分

train_test_split

from sklearn.model_selection import train_test_split

# Split X and y into training and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,       # proportion of the data placed in the test set
    random_state=42,     # random seed
    stratify=y          # keep the class proportions of y in the splits
)

重要参数:

test_size: 可以是比例(0-1)或绝对数量

shuffle: 是否打乱数据 (默认为True)

stratify: 保持分类比例

监督学习模型

线性回归

from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression.
# The former `normalize` argument is intentionally not passed: it was removed
# in scikit-learn 1.2 and passing it raises TypeError. If features need
# scaling, apply StandardScaler before fitting.
model = LinearRegression(
    fit_intercept=True,   # whether to fit an intercept term
    copy_X=True           # whether to copy X instead of overwriting it
)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)  # R² score on the test set

逻辑回归

from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier fitted on the training split.
# NOTE(review): recent scikit-learn releases appear to spell "no penalty" as
# None rather than the string 'none' — confirm against the installed version.
model = LogisticRegression(
    penalty='l2',        # regularization type ('l1','l2','elasticnet','none')
    C=1.0,              # regularization strength (smaller means stronger)
    solver='lbfgs',     # optimization algorithm
    max_iter=100        # maximum number of iterations
)
model.fit(X_train, y_train)

决策树

from sklearn.tree import DecisionTreeClassifier

# Decision-tree classifier fitted on the training split.
model = DecisionTreeClassifier(
    criterion='gini',    # split criterion ('gini' or 'entropy')
    max_depth=None,      # maximum depth of the tree
    min_samples_split=2  # minimum number of samples required to split a node
)
model.fit(X_train, y_train)

随机森林

from sklearn.ensemble import RandomForestClassifier

# Random-forest classifier with out-of-bag evaluation enabled.
model = RandomForestClassifier(
    n_estimators=100,    # number of trees
    criterion='gini',    # split criterion
    max_depth=None,      # maximum depth of each tree
    bootstrap=True,      # whether to use bootstrap sampling
    oob_score=True       # whether to evaluate with out-of-bag samples
)
model.fit(X_train, y_train)
print(model.oob_score_)  # out-of-bag score

无监督学习

K-Means聚类

from sklearn.cluster import KMeans

# Cluster X into three groups with K-Means.
kmeans = KMeans(
    n_clusters=3,       # number of clusters
    init='k-means++',    # initialization method
    max_iter=300,        # maximum number of iterations
    random_state=42
)
kmeans.fit(X)
labels = kmeans.labels_  # cluster label of each sample
centers = kmeans.cluster_centers_  # cluster centers

PCA降维

from sklearn.decomposition import PCA

# Reduce X to two principal components.
pca = PCA(
    n_components=2,      # number of principal components to keep
    whiten=False,        # whether to whiten the data
    svd_solver='auto'    # SVD solver
)
X_reduced = pca.fit_transform(X)
print(pca.explained_variance_ratio_)  # explained variance ratio

模型评估

分类评估指标

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

# Accuracy
acc = accuracy_score(y_true, y_pred)

# Precision (the average parameter may be 'micro', 'macro' or 'weighted')
precision = precision_score(y_true, y_pred, average='macro')

# Recall — uses the same averaging as precision so the three scores are
# comparable; the default average='binary' raises on multiclass targets.
recall = recall_score(y_true, y_pred, average='macro')

# F1 score (same averaging as precision and recall)
f1 = f1_score(y_true, y_pred, average='macro')

# ROC AUC — as written this is the binary case, where y_scores holds the
# score/probability of the positive class. For multiclass targets pass
# per-class probabilities and multi_class='ovr' (or 'ovo').
roc_auc = roc_auc_score(y_true, y_scores)

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Text report of the per-class metrics
report = classification_report(y_true, y_pred)

回归评估指标

import numpy as np  # required by np.sqrt below; not imported anywhere else in this file
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

# Regression metrics computed from the true and predicted targets.
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # root mean squared error
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

模型选择与调参

交叉验证

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `model` on (X, y), scored by accuracy;
# `scores` holds one value per fold.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
# Report the mean and the spread across folds.
print(f"平均准确率: {scores.mean():.2f} (±{scores.std():.2f})")

网格搜索

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  # SVC() is used below and is not imported earlier in the file

# Candidate hyper-parameter values; every combination is evaluated.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.01, 0.1, 1]
}

# Exhaustive search over param_grid with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1  # use all CPU cores
)

grid.fit(X_train, y_train)
print("最佳参数:", grid.best_params_)
print("最佳分数:", grid.best_score_)
best_model = grid.best_estimator_

随机搜索

from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC  # SVC() is used below and is not imported earlier in the file
from scipy.stats import uniform, randint

# Distributions to sample from. scipy's uniform(loc, scale) covers
# [loc, loc + scale], so 'C' is drawn from [0.1, 10.1] and 'gamma' from
# [0.01, 1.01]; a plain list is sampled uniformly from its entries.
param_dist = {
    'C': uniform(0.1, 10),
    'gamma': uniform(0.01, 1),
    'kernel': ['linear', 'rbf']
}

# Evaluate n_iter randomly sampled parameter settings with 5-fold
# cross-validation.
random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42
)
random_search.fit(X_train, y_train)

特征选择

基于统计的方法

from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selection: keep the k highest-scoring features.
selector = SelectKBest(
    score_func=f_classif,  # scoring function (f_classif is for classification)
    k=10                  # select the top k features
)
X_new = selector.fit_transform(X, y)

递归特征消除 (RFE)

from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a logistic-regression model.
rfe = RFE(
    estimator=LogisticRegression(),  # base model
    n_features_to_select=5,         # number of features to keep
    step=1                         # number of features removed per iteration
)
X_rfe = rfe.fit_transform(X, y)

Pipeline (管道)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Chain imputation, scaling, feature selection and the classifier so they are
# fitted and applied as a single estimator.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),   # fill missing values
    ('scaler', StandardScaler()),                  # standardize
    ('selector', SelectKBest(k=10)),               # feature selection
    ('classifier', RandomForestClassifier()),      # classifier
])

# fit() returns the pipeline itself, so predict can be chained onto it.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

实用工具函数

数据集加载

# `load_boston` was removed in scikit-learn 1.2 and importing it raises
# ImportError; `fetch_california_housing` is the usual regression replacement
# (it downloads the data on first use).
from sklearn.datasets import (
    load_iris,                 # iris dataset
    fetch_california_housing,  # California housing regression dataset
    load_digits,               # handwritten digits dataset
    make_classification,       # generate classification data
    make_regression            # generate regression data
)

# Load a built-in dataset
iris = load_iris()
X, y = iris.data, iris.target

# Generate synthetic data
# n_informative is set explicitly: make_classification requires
# n_classes * n_clusters_per_class <= 2 ** n_informative, and the defaults
# (n_informative=2, n_clusters_per_class=2) give 3 * 2 > 4, which raises
# ValueError for three classes.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_classes=3,
    random_state=42
)

模型持久化

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; joblib is a
# standalone package installed alongside scikit-learn.
import joblib

# Save the model
joblib.dump(model, 'model.pkl')

# Load the model
# NOTE: joblib files are pickle-based — only load files from a trusted source.
model = joblib.load('model.pkl')

高级功能

自定义评估指标

from sklearn.metrics import make_scorer

def custom_loss(y_true, y_pred):
    """Return the mean absolute difference between *y_true* and *y_pred*.

    Both arguments may be any array-like (list, tuple, ndarray); they are
    converted to arrays first so plain Python sequences work as well.
    """
    # Imported locally because this file never imports numpy at module level.
    import numpy as np

    return np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))

# Wrap custom_loss as a scorer. greater_is_better=False marks it as a loss,
# i.e. lower values are better.
custom_scorer = make_scorer(custom_loss, greater_is_better=False)

# Grid search that ranks candidates with the custom scorer.
# NOTE(review): `model` and `param_grid` come from earlier snippets — confirm
# that the grid's keys are valid parameters of whatever `model` is here.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5
)

多标签分类

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# One-vs-rest wrapper around a linear-kernel SVC, fitted on the training split.
# NOTE(review): for multi-label use, y_train presumably has to be a binary
# indicator matrix — confirm against the caller's data.
model = OneVsRestClassifier(SVC(kernel='linear'))
model.fit(X_train, y_train)