一、KNN(K近邻)算法
1、算法实现(sklearn.neighbors.KNeighborsClassifier)
# Main package
from sklearn.neighbors import KNeighborsClassifier
# Create the KNN classifier object
kNN_classifier = KNeighborsClassifier(n_neighbors=5) #n_neighbors is K
# Fit the training data (X_train/y_train assumed defined earlier in the notes -- TODO confirm)
kNN_classifier.fit(X_train,y_train)
# Predict; reshape(1,-1) turns one sample into the 2-D matrix sklearn expects
predict_y = kNN_classifier.predict(data_new.reshape(1,-1))
2、划分数据集(sklearn.model_selection.train_test_split)
# Dataset-splitting helper
from sklearn.model_selection import train_test_split
# 70% train; random_state fixes the shuffle; stratify=y keeps class proportions in both splits
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state = 233, stratify = y)
3、模型评价( sklearn.metrics.accuracy_score)
# Get to know the iris dataset
from sklearn import datasets
iris = datasets.load_iris()
x=iris.data #features (150x4 matrix)
y=iris.target #labels (3 classes)
# sklearn model-evaluation package
from sklearn.metrics import accuracy_score
# Evaluate -- y_predict assumed produced by a fitted model earlier; verify
accuracy_score(y_test,y_predict)
4、超参数搜索(sklearn.model_selection.GridSearchCV)
from sklearn.model_selection import GridSearchCV
# Parameter grid: GridSearchCV tries every combination via cross-validation
params = {
'n_neighbors': [n for n in range(1, 20)],
'weights': ['uniform', 'distance'],
'p': [p for p in range(1, 7)]
}
# Create the search object; n_jobs=-1 uses all CPU cores
grid = GridSearchCV(
estimator=KNeighborsClassifier(),
param_grid=params,
n_jobs=-1
)
# Fit the training data (runs the whole search)
grid.fit(x_train, y_train)
# Best parameter combination found
grid.best_params_
# Best cross-validated score
grid.best_score_
# Estimator refitted on all training data with the best parameters
grid.best_estimator_
# Predict with the best estimator
grid.best_estimator_.predict(x_test)
# Accuracy of the best estimator on the test set
grid.best_estimator_.score(x_test, y_test)
5、标准化(sklearn.preprocessing.StandardScaler;均值-方差标准化。注意:归一化应使用 MinMaxScaler)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,train_size=0.8,random_state=666)
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
# Learn mean/std from the TRAINING set only (avoids test-set leakage)
standard_scaler.fit(X_train)
standard_scaler.mean_ # per-feature means learned from X_train
standard_scaler.scale_ # per-feature standard deviations
# Apply the same learned transform to both sets
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_standard,y_train)
knn_classifier.score(X_test_standard, y_test)
6、KNN简单回归(预测值)( sklearn.neighbors.KNeighborsRegressor)
# Split (x/y assumed defined earlier -- TODO confirm)
x_train ,x_test, y_train, y_test = train_test_split(x, y ,train_size = 0.7, random_state=233)
from sklearn.neighbors import KNeighborsRegressor
# weights='distance': closer neighbors weigh more; p=2: Euclidean distance
knn_reg = KNeighborsRegressor(n_neighbors=5, weights='distance', p=2)
knn_reg.fit(x_train, y_train)
knn_reg.score(x_test, y_test) # R^2 score for regressors
二、线性回归算法
1、线性回归(sklearn.linear_model.LinearRegression)
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
# reshape(-1,1): sklearn expects a 2-D feature matrix even with a single feature
lin_reg.fit(x_train.reshape(-1,1), y_train)
y_predict = lin_reg.predict(x_test.reshape(-1,1))
# NOTE(review): plt assumed imported as matplotlib.pyplot elsewhere -- confirm
plt.scatter(x_test, y_test)
plt.plot(x_test, y_predict, c='r')
plt.show()
2、线性回归模型评价(MSE, RMSE, MAE, R2)
MSE: sklearn.metrics.mean_squared_error(RMSE 为 MSE 的平方根,可对结果开方得到);MAE: sklearn.metrics.mean_absolute_error
R方: from sklearn.metrics import r2_score
3、线性多项式回归(sklearn.preprocessing.PolynomialFeatures)
from sklearn.preprocessing import PolynomialFeatures
# Expand X with all polynomial terms up to degree 2 (bias, linear, squares, cross terms)
poly = PolynomialFeatures(degree=2)
poly.fit(X)
X_poly = poly.transform(X)
4、逻辑回归(sklearn.linear_model.LogisticRegression)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)
clf.score(x_train, y_train) # training accuracy
clf.score(x_test, y_test) # test accuracy
clf.predict(x_test) # hard class labels
# argmax over the per-class probabilities reproduces predict()
# NOTE(review): np assumed imported as numpy earlier -- confirm
np.argmax(clf.predict_proba(x_test), axis = 1)
5、复杂逻辑回归(多项式/多分类)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
# Build a toy dataset: label is 1 inside the circle x0^2 + x1^2 < 2, else 0,
# so the true decision boundary is non-linear.
np.random.seed(0)
X = np.random.normal(0,1,size=(200,2))
y = np.array((X[:,0]**2)+(X[:,1]**2)<2, dtype='int')
print(X)
print(y)
# FIX: the original used x_train/y_train below without ever splitting the data,
# which raises NameError -- create the train/test split first.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=233)
plt.scatter(x_train[:,0], x_train[:,1], c = y_train)
plt.show() # scatter plot of the training points
# Plain linear logistic regression -- a straight-line boundary underfits a circle
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)
clf.score(x_train, y_train) # ~0.71 in the original notes
clf.score(x_test, y_test) # ~0.67 in the original notes: clearly poor
# Polynomial logistic regression (uses clf/x_train from the previous section)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2) # maximum polynomial degree
poly.fit(x_train) # learn the feature-expansion layout from x_train
x2 = poly.transform(x_train) # expanded x_train
x2t = poly.transform(x_test) # expanded x_test
clf.fit(x2, y_train) # train the plain logistic regression on expanded features
clf.score(x2, y_train) # 1.0
clf.score(x2t, y_test) # 0.9666666666666667: polynomial features clearly help here
# Multiclass logistic regression via One-vs-Rest / One-vs-One meta-strategies
from sklearn import datasets
iris = datasets.load_iris() # load the iris dataset (3 classes)
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)
plt.scatter(x_train[:,0], x_train[:,1], c=y_train)
plt.show()
from sklearn.multiclass import OneVsRestClassifier # one-vs-rest: faster, slightly less accurate
ovr = OneVsRestClassifier(clf)
ovr.fit(x_train,y_train)
ovr.score(x_test, y_test) #0.9736842105263158
from sklearn.multiclass import OneVsOneClassifier
# NOTE(review): the name `ovr` is reused here for the one-vs-one model
ovr = OneVsOneClassifier(clf)
ovr.fit(x_train,y_train)
ovr.score(x_test, y_test) #1.0 one-vs-one: slower, but more accurate here