Some of these algorithms were introduced in earlier posts, so for those I go straight to the code; algorithms not covered before get a brief introduction here. Also, PCA was already introduced in the earlier machine learning series, and implementing PCA with sklearn was covered in Machine Learning Part 6: Dimensionality Reduction + Anomaly Detection.
Linear Regression
One variable, degree one
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
X=np.linspace(2,10,20)
X.shape=20,1
# f(x)=wx+b
y=np.random.randint(1,6,size=1)*X+np.random.randint(-5,5,size=1)# randomly generate y from X; w and b are random too
# add noise
y=y+np.random.randn(20,1)*0.8
#plt.scatter(X,y,color='red')# scatter plot of X and y
#plt.show()
lr=LinearRegression()# fit with linear regression
lr.fit(X,y)
w=lr.coef_[0,0]
b=lr.intercept_[0]
#print(w,b)
plt.scatter(X,y)
x=np.linspace(1,11,50)
plt.plot(x,w*x+b,color='green')# plot the fitted line
plt.show()
One variable, degree two (quadratic)
#f(x)=w1*x**2+w2*x+b, or equivalently f(x)=w1*x1+w2*x2+b with x1=x**2 and x2=x
X=np.linspace(0,10,50)
X.shape=50,1
X=np.concatenate((X**2,X),axis=1)
print(X.shape)#(50, 2)
w=np.random.randint(1,10,size=(2,1))
b=np.random.randint(-5,5,size=1)
y=X.dot(w)+b# matrix multiplication
print(y.shape)
#plt.plot(X[:,1],y,color='blue')
#plt.show()
lr=LinearRegression()
lr.fit(X,y)
w1=lr.coef_[0,0]
w2=lr.coef_[0,1]
b=lr.intercept_[0]
print(w1,w2,b)# print the fitted parameters
plt.scatter(X[:,1],y,marker='*')
x=np.linspace(-2,12,100)
plt.plot(x,x**2*w1+x*w2+b,color='blue')
plt.show()
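The quadratic fit above builds the feature matrix [x**2, x] by hand; sklearn's PolynomialFeatures can generate the same expansion automatically. A minimal sketch (include_bias=False keeps it equivalent to the manual version, since LinearRegression already fits the intercept; the coefficients 3, 2, 1 are just example ground-truth values):

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import numpy as np

x = np.linspace(0, 10, 50).reshape(50, 1)
y = 3 * x**2 + 2 * x + 1  # known ground-truth coefficients for checking

poly = PolynomialFeatures(degree=2, include_bias=False)  # expands x -> [x, x**2]
X_poly = poly.fit_transform(x)

lr = LinearRegression()
lr.fit(X_poly, y)
print(lr.coef_, lr.intercept_)  # should recover roughly [2, 3] and 1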
Logistic Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
X,y=datasets.load_iris(return_X_y=True)
cond=y!=2
X=X[cond]
y=y[cond]# drop class 2 to make this a binary classification problem for logistic regression
from sklearn.model_selection import train_test_split
result=train_test_split(X,y,test_size=0.2)# split the data into training and test sets
lr=LogisticRegression()
lr.fit(result[0],result[2])# fit on the training set
proba_=lr.predict_proba(result[1])
print(proba_)
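predict_proba returns one row per test sample with the probability of each class (the two columns sum to 1). To get hard labels and a score, a short follow-up reusing the split and model from above (result[1] is X_test, result[3] is y_test):

from sklearn.metrics import accuracy_score

y_pred = lr.predict(result[1])            # hard 0/1 labels from the same model
print(accuracy_score(result[3], y_pred))  # compare against the test labels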
Decision Tree
Principle: 1. Build a decision tree from the given training set, using some criterion to decide the order in which features become split nodes.
2. Criterion: entropy. The higher the entropy, the more mixed the classes; the lower, the purer. Entropy is computed as H(X) = -Σᵢ pᵢ·log₂(pᵢ), where pᵢ is the proportion of class i.
3. Information gain: the drop in entropy before and after splitting on feature X. Iterate over all features, pick the one with the largest information gain as the root node, then repeat on the remaining features until the tree is complete (see the sketch below).
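A minimal NumPy sketch of the entropy and information-gain computation described above (illustrative only, not what sklearn runs internally; labels are assumed to be integer class labels):

import numpy as np

def entropy(labels):
    # H = -sum(p_i * log2(p_i)) over the class proportions p_i
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -(p * np.log2(p)).sum()

def information_gain(labels, feature_values):
    # entropy before the split minus the weighted entropy after
    # splitting on each distinct value of the feature
    before = entropy(labels)
    after = 0.0
    for v in np.unique(feature_values):
        subset = labels[feature_values == v]
        after += len(subset) / len(labels) * entropy(subset)
    return before - after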
Code implementation:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import tree
X,y=datasets.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1024)
clf=DecisionTreeClassifier(criterion='entropy')# use entropy as the tree-building criterion
clf.fit(X_train,y_train)
y_=clf.predict(X_test)
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,y_)
print(score)
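The `from sklearn import tree` import above is otherwise unused; one natural use for it is drawing the fitted tree. A minimal sketch (tree.plot_tree requires sklearn >= 0.21):

plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True)  # draw the fitted tree, coloring nodes by class
plt.show()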
SVM (Support Vector Machine)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
digits=datasets.load_digits()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test,images_train,images_test=train_test_split(digits.data,digits.target,digits.images,test_size=0.33,random_state=42)
from sklearn import svm
svc_model=svm.SVC(C=1.0,kernel='linear')
svc_model.fit(X_train,y_train)
# next, test the model by visualizing some test images together with their predicted labels
predicted=svc_model.predict(X_test)
images_and_predictions=list(zip(images_test,predicted))
for index,(image,prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(1,4,index+1)
    plt.imshow(image,cmap=plt.cm.gray_r,interpolation='nearest')
    plt.title('Predicted:'+str(prediction))
plt.show()
# evaluate the model
from sklearn import metrics
print(metrics.accuracy_score(y_test,predicted))# accuracy: 0.9797979797979798
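Accuracy alone hides which digits get confused with which; sklearn's classification_report and confusion_matrix give a per-class breakdown. A short addition using the same predictions:

print(metrics.classification_report(y_test, predicted))  # per-class precision/recall/F1
print(metrics.confusion_matrix(y_test, predicted))       # rows: true digit, cols: predicted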
Neural Network
import sklearn
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pandas as pd
import numpy as np
X,y=datasets.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
model=MLPClassifier(activation='relu',solver='adam',alpha=0.0001,max_iter=10000)
model.fit(X_train,y_train)
predicted=model.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predicted))
Parameter notes
hidden_layer_sizes: tuple, one entry per hidden layer giving its size
activation: activation function, {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
solver: optimizer, {'lbfgs', 'sgd', 'adam'}
alpha: L2 penalty (regularization) parameter
learning_rate: learning-rate schedule, {'constant', 'invscaling', 'adaptive'}
learning_rate_init: initial learning rate, default 0.001
max_iter: maximum number of iterations, default 200
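For example, a network with two hidden layers (10 and 5 units here, hypothetical sizes chosen just for illustration) would be constructed as:

model = MLPClassifier(hidden_layer_sizes=(10, 5),  # two hidden layers: 10 then 5 units
                      activation='tanh',
                      solver='adam',
                      learning_rate_init=0.001,
                      max_iter=1000)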
KNN
Method: 1. Compute the distance between the test sample and every training sample;
2. Sort by increasing distance;
3. Take the K points with the smallest distances;
4. Count how often each class appears among those K points;
5. Return the most frequent class among the K points as the prediction for the test sample (a from-scratch sketch follows this list).
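A from-scratch NumPy sketch of these five steps (Euclidean distance, majority vote; assumes non-negative integer class labels):

import numpy as np

def knn_predict(X_train, y_train, x_test, k=5):
    # 1. distance from the test point to every training point
    dists = np.sqrt(((X_train - x_test) ** 2).sum(axis=1))
    # 2-3. indices of the k nearest training points
    nearest = np.argsort(dists)[:k]
    # 4-5. most frequent class among those k neighbors
    return np.bincount(y_train[nearest]).argmax()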
Code implementation:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
X,y=datasets.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
clf=KNeighborsClassifier()
clf.fit(X_train,y_train)
predicted=clf.predict(X_test)
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,predicted)
print(score)
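KNeighborsClassifier defaults to n_neighbors=5; a quick way to pick K is to sweep a few values on the same split and compare scores:

for k in range(1, 11):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    print(k, accuracy_score(y_test, clf.predict(X_test)))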