The previous posts covered the theory behind logistic regression and the maximum entropy model; here we revisit both methods through code.
Logistic Regression and the Maximum Entropy Model
The logistic regression model
Data
The data can be downloaded from https://github.com/canshang/-/blob/master/Logistics.txt. It is very simple: three columns, of which the first two are features and the last is the label.
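As a quick sanity check on the format (a minimal sketch, assuming the file has been downloaded into the working directory under the name used below):

with open('Logistics.txt') as fr:
    fields = fr.readline().strip().split()
print(len(fields))  # expect 3: two feature columns plus one label column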
Algorithm implementation
from numpy import *

filename = 'Logistics.txt'  # path to the data file

def loadDataSet():  # read the data (only two features here)
    dataMat = []
    labelMat = []
    fr = open(filename)
    for line in fr.readlines():
        lineArr = line.strip().split()
        # The leading 1.0 is the constant term of the model: with two features
        # X1, X2 we need three parameters, W1 + W2*X1 + W3*X2.
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):  # the sigmoid function
    return 1.0 / (1 + exp(-inX))

def costFunction(theta, X, y):
    # cross-entropy loss of logistic regression
    m = X.shape[0]  # number of training examples
    theta = reshape(theta, (len(theta), 1))
    J = (1. / m) * (-transpose(y).dot(log(sigmoid(X.dot(theta))))
                    - transpose(1 - y).dot(log(1 - sigmoid(X.dot(theta)))))
    return J[0, 0]

# compute the gradient of the log-likelihood
def compute_grad(theta, X, y):
    theta.shape = (1, 3)
    grad = zeros(3)
    h = sigmoid(X.dot(theta.T))
    delta = h - y
    for i in range(grad.size):
        sumdelta = delta.T.dot(X[:, i])
        grad[i] = -sumdelta[0, 0]  # i-th component of X^T (y - h)
    theta.shape = (3,)
    return grad

def gradAscent(dataMat, labelMat):
    dataMatrix = mat(dataMat)                # convert the list data to a matrix
    classLabels = mat(labelMat).transpose()  # column vector of labels
    m, n = shape(dataMatrix)
    alpha = 0.001    # step size (learning rate): the larger it is, the bigger each gradient step
    maxCycles = 500  # number of iterations; tune it to the data, 200 may already be enough
    weights = array([1., 1., 1.])  # initial parameters, one per column, all set to 1
    for k in range(maxCycles):
        print("iteration:")
        print(k + 1)
        # compute the gradient
        grad = compute_grad(weights, dataMatrix, classLabels)
        # update the weights along the gradient
        for i in range(3):
            weights[i] = weights[i] + alpha * grad[i]
        print("loss:")
        print(costFunction(weights, dataMatrix, classLabels))
    return weights

def plotBestFit(weights):  # plot the data and the fitted decision boundary
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3, 3)
    # on the decision boundary w0 + w1*x + w2*y = 0, hence y = (-w0 - w1*x) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

def main():
    dataMat, labelMat = loadDataSet()
    weights = gradAscent(dataMat, labelMat)
    print(weights)
    plotBestFit(weights)

if __name__ == '__main__':
    main()
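For reference, the update that compute_grad and gradAscent implement is batch gradient ascent on the log-likelihood. In matrix form (a standard derivation, not spelled out in the original code):

$$\nabla_w \ell(w) = X^\top \big(y - \sigma(Xw)\big), \qquad w \leftarrow w + \alpha\, X^\top \big(y - \sigma(Xw)\big)$$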
Results
iteration:
500
loss:
0.18622212
[ 4.12414349 0.48007329 -0.6168482 ]
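As an extra check that is not part of the original post, the result can be compared against scikit-learn's implementation. A minimal sketch, assuming scikit-learn is installed and loadDataSet from above is in scope; with regularization almost turned off (large C), the fitted coefficients should land close to the weights printed above:

from numpy import array
from sklearn.linear_model import LogisticRegression

dataMat, labelMat = loadDataSet()   # reuse the loader defined above
X = array(dataMat)[:, 1:]           # drop the hand-added intercept column
y = array(labelMat)

clf = LogisticRegression(C=1e6).fit(X, y)  # large C ~ (almost) no regularization
print(clf.intercept_, clf.coef_)           # compare with [w0, w1, w2] above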
The maximum entropy model
Dataset
The dataset is available at https://github.com/canshang/-/blob/master/最大熵.zip.
Implementation
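Before the code, it helps to state the two formulas from 《统计学习方法》 that the implementation keeps referring to. The model itself is the conditional maximum entropy distribution (formula 6.22, p. 85), and the training loop applies what reads as a GIS-style update with the constant M from p. 91 (calling it GIS is my reading of the code below, not a label from the original post):

$$P_w(y \mid x) = \frac{1}{Z_w(x)} \exp\Big(\sum_i w_i f_i(x, y)\Big), \qquad Z_w(x) = \sum_y \exp\Big(\sum_i w_i f_i(x, y)\Big)$$

$$w_i \leftarrow w_i + \frac{1}{M} \log \frac{E_{\tilde{P}}[f_i]}{E_P[f_i]}$$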
import pandas as pd
import numpy as np
import time
import math
from collections import defaultdict
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
from sklearn.metrics import accuracy_score

class MaxEnt(object):
    def init_params(self, X, Y):
        self.X_ = X
        self.Y_ = set()
        self.cal_Pxy_Px(X, Y)
        self.N = len(X)         # size of the training set
        self.n = len(self.Pxy)  # number of (x, y) pairs, in the book's notation
        self.M = 10000.0        # the constant M from p. 91 of the book; in practice the exact
                                # value is not used, it behaves like an inverse learning rate
        self.build_dict()
        self.cal_EPxy()

    def build_dict(self):
        self.id2xy = {}
        self.xy2id = {}
        for i, (x, y) in enumerate(self.Pxy):
            self.id2xy[i] = (x, y)
            self.xy2id[(x, y)] = i

    def cal_Pxy_Px(self, X, Y):
        self.Pxy = defaultdict(int)
        self.Px = defaultdict(int)
        for i in range(len(X)):
            x_, y = X[i], Y[i]
            self.Y_.add(y)
            for x in x_:
                self.Pxy[(x, y)] += 1
                self.Px[x] += 1

    def cal_EPxy(self):
        '''
        Empirical expectation of the features (the expectation at the bottom of p. 82 of the book).
        '''
        self.EPxy = defaultdict(float)
        for id in range(self.n):
            (x, y) = self.id2xy[id]
            self.EPxy[id] = float(self.Pxy[(x, y)]) / float(self.N)

    def cal_pyx(self, X, y):
        result = 0.0
        for x in X:
            if self.fxy(x, y):
                id = self.xy2id[(x, y)]
                result += self.w[id]
        return (math.exp(result), y)

    def cal_probality(self, X):
        '''
        The conditional model P_w(y|x), formula 6.22 on p. 85 of the book.
        '''
        Pyxs = [(self.cal_pyx(X, y)) for y in self.Y_]
        Z = sum([prob for prob, y in Pyxs])
        return [(prob / Z, y) for prob, y in Pyxs]

    def cal_EPx(self):
        '''
        Model expectation of the features (the expectation at the top of p. 83 of the book).
        '''
        self.EPx = [0.0 for i in range(self.n)]
        for i, X in enumerate(self.X_):
            Pyxs = self.cal_probality(X)
            for x in X:
                for Pyx, y in Pyxs:
                    if self.fxy(x, y):
                        id = self.xy2id[(x, y)]
                        self.EPx[id] += Pyx * (1.0 / self.N)

    def fxy(self, x, y):
        return (x, y) in self.xy2id

    def train(self, X, Y):
        self.init_params(X, Y)
        self.w = [0.0 for i in range(self.n)]
        max_iteration = 1000
        for times in range(max_iteration):
            print('iterate times %d' % times)
            sigmas = []
            self.cal_EPx()
            for i in range(self.n):
                # GIS-style update: sigma_i = (1/M) * log(E_~P[f_i] / E_P[f_i])
                sigma = 1 / self.M * math.log(self.EPxy[i] / self.EPx[i])
                sigmas.append(sigma)
            # if len([x for x in sigmas if abs(x) >= 0.01]) == 0:
            #     break
            self.w = [self.w[i] + sigmas[i] for i in range(self.n)]

    def predict(self, testset):
        results = []
        for test in testset:
            result = self.cal_probality(test)
            results.append(max(result, key=lambda x: x[0])[1])
        return results

def rebuild_features(features):
    '''
    Turn a raw feature vector (a0, a1, a2, a3, a4, ...)
    into the form (0_a0, 1_a1, 2_a2, 3_a3, 4_a4, ...).
    '''
    new_features = []
    for feature in features:
        new_feature = []
        for i, f in enumerate(feature):
            new_feature.append(str(i) + '_' + str(f))
        new_features.append(new_feature)
    return new_features

if __name__ == "__main__":
    print('Start read data')
    time_1 = time.time()
    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values
    imgs = data[0::, 1::]
    labels = data[::, 0]
    # use 2/3 of the data for training and 1/3 for testing
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    train_features = rebuild_features(train_features)
    test_features = rebuild_features(test_features)
    time_2 = time.time()
    print('read data cost ', time_2 - time_1, ' second', '\n')
    print('Start training')
    met = MaxEnt()
    met.train(train_features, train_labels)
    time_3 = time.time()
    print('training cost ', time_3 - time_2, ' second', '\n')
    print('Start predicting')
    test_predict = met.predict(test_features)
    time_4 = time.time()
    print('predicting cost ', time_4 - time_3, ' second', '\n')
    score = accuracy_score(test_labels, test_predict)
    print('The accuracy score is ', score)
Code
https://github.com/canshang/-/blob/master/Logistics.ipynb
https://github.com/canshang/-/blob/master/最大熵模型.ipynb
References
《统计学习方法》 (Li Hang, Statistical Learning Methods)
https://blog.youkuaiyun.com/wds2006sdo/article/details/53106579