Derivation of the Cost Function
First, fix the hypothesis:
$$h_{\theta}(x)=g\left(\theta^{T} x\right)=\frac{1}{1+e^{-\theta^{T} x}}$$
The function $h_{\theta}(x)$ is the logistic regression hypothesis, where $g$ is the sigmoid function.
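As a quick illustration, here is a minimal NumPy sketch of this hypothesis; the function name `sigmoid` and the numeric values are illustrative, not taken from the implementation further below:

import numpy as np

def sigmoid(z):
    # g(z) = 1 / (1 + exp(-z))
    return 1.0 / (1.0 + np.exp(-z))

theta = np.array([0.5, -0.25])  # illustrative parameters, not fitted values
x = np.array([1.0, 2.0])        # input with a leading bias feature x0 = 1
print(sigmoid(theta @ x))       # h_theta(x) = g(0.5*1 - 0.25*2) = g(0) = 0.5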
To eventually obtain a convex cost function, start from the likelihood function of logistic regression:
$$L(\theta)=\prod_{i=1}^{m} P\left(y_{i} \mid x_{i} ; \theta\right)=\prod_{i=1}^{m}\left(h_{\theta}\left(x_{i}\right)\right)^{y_{i}}\left(1-h_{\theta}\left(x_{i}\right)\right)^{1-y_{i}}$$
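Each factor is just the Bernoulli probability mass function written as a single expression; plugging in the two possible labels confirms this:

$$\left(h_{\theta}(x_i)\right)^{y_i}\left(1-h_{\theta}(x_i)\right)^{1-y_i}=\begin{cases}h_{\theta}(x_i), & y_i=1\\1-h_{\theta}(x_i), & y_i=0\end{cases}$$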
Taking the logarithm:
$$l(\theta)=\log L(\theta)=\sum_{i=1}^{m}\left(y_{i} \log h_{\theta}\left(x_{i}\right)+\left(1-y_{i}\right) \log \left(1-h_{\theta}\left(x_{i}\right)\right)\right)$$
where $m$ is the number of samples.
The cost function is $J(\theta)=-\frac{1}{m} l(\theta)$, so maximizing the log-likelihood is equivalent to minimizing $J(\theta)$.
To derive the gradient, first note the following identity (differentiating with respect to $\theta$): $h_{\theta}(x)' = g(\theta^{T}x)' = g(\theta^{T}x)\left(1-g(\theta^{T}x)\right)x$
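This identity is just the chain rule combined with the derivative of the sigmoid $g(z)=\frac{1}{1+e^{-z}}$ itself:

$$g'(z)=\frac{e^{-z}}{\left(1+e^{-z}\right)^{2}}=\frac{1}{1+e^{-z}} \cdot \frac{e^{-z}}{1+e^{-z}}=g(z)\left(1-g(z)\right)$$

Substituting $z=\theta^{T}x$ and applying the chain rule multiplies this by the inner derivative $x$.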
The derivative of the cost function is then:
$$\frac{\partial}{\partial \theta_{j}} J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left(y_i\left(1-h_\theta(x_i)\right)x_{i,j} - \left(1-y_i\right)h_\theta(x_i)\,x_{i,j}\right) = -\frac{1}{m}\sum_{i=1}^m\left(y_i - h_\theta(x_i)\right)x_{i,j} = \frac{1}{m}\sum_{i=1}^m\left(h_\theta(x_i)-y_i\right)x_{i,j}$$

where $x_{i,j}$ denotes the $j$-th feature of sample $x_i$.
Each gradient-descent step then only has to update $\theta_j = \theta_j - \alpha\,\frac{1}{m}\sum_{i=1}^m\left(h_\theta(x_i)-y_i\right)x_{i,j}$, where $\alpha$ is the learning rate.
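Stacking the $m$ samples as rows of a design matrix $X$ gives the equivalent vectorized form (this is what `get_gradient` in the code below computes component by component):

$$\nabla_{\theta} J(\theta)=\frac{1}{m} X^{T}\left(h_{\theta}(X)-y\right), \qquad \theta \leftarrow \theta-\alpha \nabla_{\theta} J(\theta)$$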
Below is a Python implementation:
# coding: utf-8
import numpy as np
import pandas as pd
# map the 3 Iris species to labels 1, 2, 3 and
# add a bias feature x0 = 1 (so w*x + b becomes (b, w)*(x0, x))
def transform_classification(Iris_df):
    Iris = Iris_df.copy()
    classification = {}
    for i, c in enumerate(Iris.iloc[:, -1]):
        if c not in classification:
            # first time this species appears: assign the next label
            classification[c] = len(classification) + 1
            Iris.iloc[i, -1] = len(classification)
        else:
            Iris.iloc[i, -1] = classification[Iris.iloc[i, -1]]
    # insert the bias column just before the label column
    Iris.insert(4, 'x0', np.ones(len(Iris)))
    return Iris
# prediction function: h_theta(x) = g(theta^T x) for every row of x
def predict(theta, x):
    z = x @ theta
    return 1.0 / (1.0 + np.exp(-z))
# define loss function: mean cross-entropy J(theta) plus classification accuracy
def eval_loss(theta, x, y):
    h = predict(theta, x)
    h = np.clip(h, 1e-12, 1 - 1e-12)  # avoid log(0) when the sigmoid saturates
    J_theta = -(y * np.log(h) + (1 - y) * np.log(1 - h))
    clf = np.where(h >= 0.5, 1, 0)    # threshold the prediction, not the loss
    correct_rate = (clf == y).mean()
    return J_theta.mean(), correct_rate
# calculate gradient: dJ/dtheta_j = mean((h - y) * x_j)
def get_gradient(y_pred, y_real, x):
    d_theta = np.array([((y_pred - y_real) * x[:, i]).mean() for i in range(x.shape[1])])
    return d_theta
# update theta
def update_theta(batch_x, batch_y, theta, lr):
    batch_y_pred = predict(theta, batch_x)
    d_theta = get_gradient(batch_y_pred, batch_y, batch_x)
    theta -= lr * d_theta
    return theta
# train function
def train(x, y, batch_size, epoch, lr):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # give theta a random initial value
    theta = np.random.random(x.shape[1])
    for epo in range(epoch):
        # choose a mini-batch of samples randomly
        ids = np.random.choice(len(x), batch_size)
        batch_x = x[ids]
        batch_y = y[ids]
        theta = update_theta(batch_x, batch_y, theta, lr)
        loss, accuracy = eval_loss(theta, batch_x, batch_y)
        print('epoch:{}\nθ:{}\nloss = {}\naccuracy = {}\n'.format(epo + 1, theta, loss, accuracy))
    return theta
def run():
    Iris = pd.read_csv('data/Iris.csv', index_col=0)
    Iris = transform_classification(Iris)
    # binary task: class 1 vs. the rest
    Iris.iloc[:, -1] = np.where(Iris.values[:, -1] == 1, 1, 0)
    # split data into train and test sets
    data = Iris.values.astype(float)
    idxs = np.arange(len(data))
    np.random.shuffle(idxs)
    k = int(0.8 * len(idxs))
    train_x = data[idxs[:k], :-1]
    train_y = data[idxs[:k], -1]
    test_x = data[idxs[k:], :-1]
    test_y = data[idxs[k:], -1]
    lr = 0.001
    batch_size = 100
    epoch = 100
    # train, then evaluate on the held-out test set
    theta = train(train_x, train_y, batch_size, epoch, lr)
    test_loss, test_accuracy = eval_loss(theta, test_x, test_y)
    print('test loss = {}\ntest accuracy = {}'.format(test_loss, test_accuracy))

if __name__ == '__main__':
    run()
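As a sanity check that the implementation matches the derivation, the analytic gradient can be compared against a centered finite-difference approximation of $J(\theta)$. This is a minimal sketch reusing `predict`, `eval_loss`, and `get_gradient` from above; `gradient_check`, the demo data, and the expected tolerance are my own illustrative additions:

def gradient_check(theta, x, y, eps=1e-5):
    # finite-difference estimate of dJ/dtheta_j for every j
    num_grad = np.zeros_like(theta)
    for j in range(len(theta)):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[j] += eps
        t_minus[j] -= eps
        num_grad[j] = (eval_loss(t_plus, x, y)[0] - eval_loss(t_minus, x, y)[0]) / (2 * eps)
    # analytic gradient from the derivation above
    ana_grad = get_gradient(predict(theta, x), y, x)
    return np.max(np.abs(num_grad - ana_grad))

# usage (hypothetical random data): the difference should be tiny, e.g. < 1e-6
# x_demo = np.random.random((20, 5))
# y_demo = np.random.randint(0, 2, 20).astype(float)
# print(gradient_check(np.random.random(5), x_demo, y_demo))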