Python - Online Learning (PLA and SGD)

When a dataset is too large for the machine's memory to hold at once, it can be loaded in chunks and fed to the algorithm sequentially, so that the model is trained online, one batch at a time.

1. Perceptron: the Perceptron Learning Algorithm (PLA). One of the oldest machine-learning algorithms, the basic perceptron can only separate linearly separable data; kernel perceptrons extend it to nonlinear datasets. For detailed usage, refer to the separate post on the perceptron algorithm (PLA), paying particular attention to the geometric (rotation) view of the update: the label y sets the direction of the weight update, while alpha, the learning rate, controls how far the weight vector rotates on each mistake. The goal is for the cost function to converge with the weight vector aligned so it classifies the data correctly. Do not set alpha too large, or the weight vector over-rotates; values between 0.1 and 0.4 are typical. A one-step numeric example follows this list.

2. Stochastic Gradient Descent (SGD). Stochastic gradient descent updates the model using one instance at a time.
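To make the PLA update rule concrete, here is a minimal one-step sketch (the vectors and the alpha value are made up for illustration): when sign(w·x) disagrees with the label y, the weights are nudged by alpha*y*x, rotating w toward the side that classifies x correctly.

import numpy as np

w = np.array([1.0, 0.0])   # current weight vector
x = np.array([0.0, 1.0])   # a misclassified instance
y = -1                     # its true label
alpha = 0.2                # learning rate

if np.sign(w @ x) != y:    # sign(0.0) = 0 != -1, so the update fires
    w = w + alpha * y * x  # w becomes [1.0, -0.2], rotated toward y's side
print(w)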


Perceptron:

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 10:43:15 2018

@author: Alvin AI
"""

from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale
import numpy as np

# generate data in batches (a generator simulates loading chunks of a large dataset)
def get_data(batch_size):
    b_size = 0
    no_features = 30
    redundant_features = int(0.1*no_features)
    informative_features = int(0.8*no_features)
    repeated_features = int(0.1*no_features)
    
    while b_size < batch_size:  # yield one batch per iteration
        x,y = make_classification(n_samples=1000,\
                                  n_features=no_features,\
                                  flip_y=0.03,\
                                  n_informative=informative_features,\
                                  n_redundant=redundant_features,\
                                  n_repeated=repeated_features,\
                                  random_state=51)
        # note: the fixed seed makes every batch identical here; with a real
        # dataset each chunk would contain new rows
        y_indx = y < 1  # relabel class 0 as -1 (the perceptron expects labels in {-1, +1})
        y[y_indx] = -1
        x = scale(x,with_mean=True,with_std=True)  # center and standardize the features
        
        yield x,y 
        b_size+=1
        
# build the perceptron model
def build_model(x,y,weights,epochs,alpha=0.5):
    for i in range(epochs):  # one full pass over the batch per epoch
        # shuffle the instances: np.random.permutation returns a shuffled
        # index array (np.random.shuffle shuffles in place and returns None,
        # which would break the indexing below)
        shuff_index = np.random.permutation(len(y))
        x_train = x[shuff_index,:]
        y_train = y[shuff_index]
        # update the weights one instance at a time
        for index in range(len(y)):
            # sign returns +1 when the dot product of weights and features
            # is positive, -1 when it is negative
            prediction = np.sign(np.sum(x_train[index,:] * weights))
            if prediction != y_train[index]:
                weights = weights + alpha*(y_train[index]*x_train[index,:])  # update: w <- w + alpha*y*x
    return weights

# evaluate the model
def model_worth(x,y,weights):
    prediction = np.sign(np.sum(x * weights,axis=1))
    print(classification_report(y,prediction))
    
# main
if __name__=="__main__":
    data = get_data(10)
    x,y = next(data)
    weights = np.zeros(x.shape[1])
    for i in range(10):
        epochs = 100
        weights = build_model(x,y,weights,epochs)
        print("model worth after receiving dataset batch %d" % (i+1))
        model_worth(x,y,weights)
        if i < 9:
            x,y = next(data)  # move on to the next batch; the full dataset is too large to load at once
    # the weights are never reinitialized between batches: they keep
    # accumulating updates, which is how PLA handles data that must be
    # loaded in chunks rather than all at once
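
For comparison, scikit-learn's own Perceptron class supports the same batch-by-batch workflow through partial_fit. A minimal sketch, reusing the get_data generator above (the classes argument is required so the first call knows the full label set):

from sklearn.linear_model import Perceptron

clf = Perceptron()
for i, (x, y) in enumerate(get_data(10)):
    clf.partial_fit(x, y, classes=[-1, 1])  # incremental update on this batch
    print("batch %d accuracy = %0.2f" % (i + 1, clf.score(x, y)))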

Stochastic gradient descent (SGD):

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 16:40:52 2018

@author: Alvin AI
"""

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier  # stochastic gradient descent
import numpy as np

def get_data():
    no_features = 30
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=1000,n_features=no_features,\
                              flip_y=0.03,n_informative=informative_features,\
                              n_redundant=redundant_features,\
                              n_repeated=repeated_features,\
                              random_state=7)
    return x,y

def build_model(x,y,x_dev,y_dev):
    # max_iter: number of passes over the training data used to update the weights
    # shuffle: shuffle the data between epochs
    # loss: the loss function; for regression use a squared loss, while for
    #       classification the response y is categorical, so "log_loss"
    #       (logistic loss, called "log" in older scikit-learn versions) is a good choice
    # learning_rate: "constant" keeps the rate eta0 fixed; "optimal" decreases it
    #       gradually, and "invscaling" scales it down over time
    # fit_intercept: fit an intercept (bias) term in addition to the feature weights
    # penalty: the regularization type; None means no shrinkage here, while
    #       penalty="l2" would add ridge-style L2 regularization
    estimator = SGDClassifier(max_iter=50,shuffle=True,loss="log_loss",\
                             learning_rate="constant",\
                             eta0=0.0001,\
                             fit_intercept=True,\
                             penalty=None)
    estimator.fit(x,y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y,train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev,dev_predicted)
    
    print "training accuracy = %0.2f,   dev accuracy = %0.2f" %\
                (train_score,dev_score)
                
if __name__=="__main__":
    x,y = get_data()
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
                                                test_size=0.3,random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
                                                 test_size=0.3,random_state=9)
    build_model(x_train,y_train,x_dev,y_dev)
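
The example above calls fit on data that already sits in memory. For true online learning in the spirit of this post, SGDClassifier also exposes partial_fit, so chunks can be streamed in just as in the PLA example. A minimal sketch, assuming a dataset generated up front and sliced into chunks of 1000 rows to mimic out-of-memory data:

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

# one large dataset generated up front; the loop below streams it in chunks
x_all, y_all = make_classification(n_samples=10000, n_features=30,
                                   flip_y=0.03, random_state=7)

clf = SGDClassifier(loss="log_loss", learning_rate="constant", eta0=0.0001)
for i in range(10):
    x, y = x_all[i*1000:(i+1)*1000], y_all[i*1000:(i+1)*1000]
    clf.partial_fit(x, y, classes=[0, 1])  # incremental update on this chunk
    print("after batch %d: accuracy on this chunk = %0.2f"
          % (i + 1, clf.score(x, y)))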
