Scikit-learn 常用函数及其用法
数据预处理
标准化 (StandardScaler)
from sklearn.preprocessing import StandardScaler
# Standardization example; `X` and `new_data` are feature matrices defined elsewhere.
# Create the scaler
scaler = StandardScaler()
# Fit on the data and transform it in one call
X_scaled = scaler.fit_transform(X)
# Apply the same, already fitted, transformation to new data (no re-fit)
new_data_scaled = scaler.transform(new_data)
参数说明:
with_mean: 是否中心化数据 (默认为True)
with_std: 是否缩放数据到单位方差 (默认为True)
归一化 (MinMaxScaler)
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling: the target interval is given by feature_range.
scaler = MinMaxScaler(feature_range=(0, 1))
# Fit on X and transform it in one call
X_normalized = scaler.fit_transform(X)
参数说明:
feature_range: 缩放范围 (默认为(0,1))
分类特征编码
LabelEncoder (标签编码)
from sklearn.preprocessing import LabelEncoder
# Label encoding: fit on the labels `y` and replace them with their encoded form.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Inverse transform: map the encoded values back to the original labels
y_original = le.inverse_transform(y_encoded)
OneHotEncoder (独热编码)
from sklearn.preprocessing import OneHotEncoder
# sparse_output=False returns a dense array instead of a sparse matrix.
# The parameter was named `sparse` before scikit-learn 1.2; the old name was
# removed in 1.4, so `sparse=False` raises TypeError on current releases.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X_categorical)
参数说明:
sparse_output: 是否返回稀疏矩阵 (默认为True；scikit-learn 1.2 之前该参数名为 sparse，旧名称已在 1.4 中移除)
handle_unknown: 遇到未知类别时的处理方式 ('error' 或 'ignore'，默认为 'error')
数据集划分
train_test_split
from sklearn.model_selection import train_test_split
# Split the features and labels into a training part and a held-out test part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,       # keep the class proportions of y in both parts
    test_size=0.2,    # fraction of the samples placed in the test set
    random_state=42,  # random seed, makes the split reproducible
)
重要参数:
test_size: 浮点数表示测试集所占比例 (0 到 1 之间)，整数表示测试样本的绝对数量
shuffle: 是否打乱数据 (默认为True)
stratify: 按给定的标签分层抽样，使训练集和测试集中的类别比例保持一致 (需要 shuffle=True)
监督学习模型
线性回归
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regression.
# The former `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2; passing it now raises TypeError. Scale the features beforehand
# (for example with StandardScaler) when scaling is required.
model = LinearRegression(
    fit_intercept=True,  # whether to fit an intercept term
    copy_X=True,         # whether to copy X rather than risk overwriting it
)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)  # R² score on the test set
逻辑回归
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with explicit regularization and solver settings.
model = LogisticRegression(
    solver='lbfgs',  # optimization algorithm
    penalty='l2',    # regularization type: 'l1', 'l2', 'elasticnet' or None
                     # (the string 'none' was removed in scikit-learn 1.4)
    C=1.0,           # inverse regularization strength (smaller = stronger)
    max_iter=100,    # maximum number of solver iterations
)
model.fit(X_train, y_train)
决策树
from sklearn.tree import DecisionTreeClassifier
# Single decision tree classifier.
model = DecisionTreeClassifier(
    max_depth=None,       # maximum tree depth (None = no limit)
    min_samples_split=2,  # minimum number of samples needed to split a node
    criterion='gini',     # split quality measure ('gini' or 'entropy')
)
model.fit(X_train, y_train)
随机森林
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier with out-of-bag evaluation enabled.
model = RandomForestClassifier(
    n_estimators=100,  # number of trees
    max_depth=None,    # maximum depth of each tree
    criterion='gini',  # split quality measure
    oob_score=True,    # evaluate on the out-of-bag samples
    bootstrap=True,    # draw bootstrap samples for each tree
)
model.fit(X_train, y_train)
print(model.oob_score_)  # out-of-bag score
无监督学习
K-Means聚类
from sklearn.cluster import KMeans
# K-Means clustering of X into three clusters.
kmeans = KMeans(
    n_clusters=3,      # number of clusters
    max_iter=300,      # maximum number of iterations
    init='k-means++',  # centroid initialization method
    random_state=42,
)
kmeans.fit(X)
centers = kmeans.cluster_centers_  # cluster centers
labels = kmeans.labels_            # cluster label of each sample
PCA降维
from sklearn.decomposition import PCA
# Principal component analysis: project X onto two components.
pca = PCA(
    n_components=2,     # number of principal components to keep
    svd_solver='auto',  # SVD solver
    whiten=False,       # whether to whiten the projected data
)
X_reduced = pca.fit_transform(X)
print(pca.explained_variance_ratio_)  # explained variance ratio per component
模型评估
分类评估指标
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
confusion_matrix,
classification_report
)
# Accuracy
acc = accuracy_score(y_true, y_pred)
# Precision, recall and F1 all take an `average` argument
# ('micro', 'macro', 'weighted'). The default, average='binary', only works for
# binary targets and raises ValueError on multiclass labels, so the same
# averaging is passed to all three to keep them comparable.
precision = precision_score(y_true, y_pred, average='macro')
# Recall
recall = recall_score(y_true, y_pred, average='macro')
# F1 score
f1 = f1_score(y_true, y_pred, average='macro')
# ROC AUC. As written this is the binary form: y_scores holds the score or
# probability of the positive class. For multiclass targets pass
# multi_class='ovr' (or 'ovo') together with per-class probabilities of shape
# (n_samples, n_classes); the default multi_class='raise' raises an error.
roc_auc = roc_auc_score(y_true, y_scores)
# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Text report with per-class precision, recall and F1
report = classification_report(y_true, y_pred)
回归评估指标
import numpy as np  # needed for np.sqrt below; it was used without being imported
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

# Regression metrics for ground truth y_true and predictions y_pred.
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # root mean squared error
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
模型选择与调参
交叉验证
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of `model` on (X, y), scored by accuracy.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
mean_score = scores.mean()
score_spread = scores.std()
print(f"平均准确率: {mean_score:.2f} (±{score_spread:.2f})")
网格搜索
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  # the estimator tuned below; it was used without being imported

# Exhaustive search: every combination of these values is evaluated.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.01, 0.1, 1]
}
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1  # use all CPU cores
)
grid.fit(X_train, y_train)
print("最佳参数:", grid.best_params_)
print("最佳分数:", grid.best_score_)
# Estimator refitted on the whole training set with the best parameters
best_model = grid.best_estimator_
随机搜索
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC  # the estimator tuned below; it was used without being imported
from scipy.stats import uniform, randint

# scipy's uniform(loc, scale) samples from [loc, loc + scale], so
# uniform(0.1, 10) covers [0.1, 10.1] and uniform(0.01, 1) covers [0.01, 1.01].
param_dist = {
    'C': uniform(0.1, 10),
    'gamma': uniform(0.01, 1),
    'kernel': ['linear', 'rbf']
}
# Randomized search: n_iter parameter settings are sampled from param_dist.
random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42
)
random_search.fit(X_train, y_train)
特征选择
基于统计的方法
from sklearn.feature_selection import SelectKBest, f_classif
# Univariate feature selection: keep the 10 best-scoring features.
selector = SelectKBest(
    k=10,                  # number of top features to keep
    score_func=f_classif,  # scoring function (f_classif is meant for classification)
)
X_new = selector.fit_transform(X, y)
递归特征消除 (RFE)
from sklearn.feature_selection import RFE
# Recursive feature elimination driven by a logistic regression model.
rfe = RFE(
    LogisticRegression(),    # base estimator
    step=1,                  # number of features removed per iteration
    n_features_to_select=5,  # number of features to keep
)
X_rfe = rfe.fit_transform(X, y)
Pipeline (管道)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Chain preprocessing, feature selection and the classifier into one estimator;
# the steps run in the order listed.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # fill in missing values
    ('scaler', StandardScaler()),                 # standardization
    ('selector', SelectKBest(k=10)),              # feature selection
    ('classifier', RandomForestClassifier()),     # classifier
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
实用工具函数
数据集加载
# load_boston was removed in scikit-learn 1.2, so importing it fails;
# fetch_california_housing is used as the housing-price dataset instead
# (it downloads the data on first use).
from sklearn.datasets import (
    load_iris,                 # iris flower dataset
    fetch_california_housing,  # California housing prices dataset
    load_digits,               # handwritten digits dataset
    make_classification,       # generate synthetic classification data
    make_regression            # generate synthetic regression data
)

# Load a built-in dataset
iris = load_iris()
X, y = iris.data, iris.target

# Generate synthetic data.
# n_informative must satisfy n_classes * n_clusters_per_class <= 2 ** n_informative.
# With the defaults (n_informative=2, n_clusters_per_class=2) three classes
# raise ValueError, so n_informative is set to 3.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_classes=3,
    random_state=42
)
模型持久化
# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib is a
# standalone package (installed as a scikit-learn dependency).
import joblib

# Save the model
joblib.dump(model, 'model.pkl')
# Load the model.
# NOTE: joblib files are pickle-based; loading can execute arbitrary code, so
# only load files from a trusted source.
model = joblib.load('model.pkl')
高级功能
自定义评估指标
from sklearn.metrics import make_scorer
def custom_loss(y_true, y_pred):
    """Return the mean absolute difference between y_true and y_pred.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same shape as y_true.

    Returns:
        The mean of the element-wise absolute differences.
    """
    # Imported locally so the snippet is self-contained.
    import numpy as np

    # asarray lets plain Python lists be passed as well as arrays.
    return np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
# Wrap the loss function as a scorer. greater_is_better=False marks it as a
# loss, so lower values are treated as better during model selection.
custom_scorer = make_scorer(custom_loss, greater_is_better=False)
# Grid search that ranks parameter settings with the custom scorer.
grid = GridSearchCV(model, param_grid, cv=5, scoring=custom_scorer)
多标签分类
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# Wrap a linear-kernel SVC in a one-vs-rest meta-classifier and fit it.
# NOTE(review): for multi-label targets y_train presumably has to be a binary
# indicator matrix of shape (n_samples, n_labels) — confirm against the data.
model = OneVsRestClassifier(SVC(kernel='linear'))
model.fit(X_train, y_train)