When a dataset is too large for the computer's memory to hold all at once, one option is to load it in segments, feeding each piece into memory in turn so the algorithm is trained online.
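As a minimal sketch of this idea (the file path and label column name are hypothetical), pandas can stream a large CSV chunk by chunk so that only one segment sits in memory at a time:

import pandas as pd

# Stream a large CSV in chunks of 10,000 rows; only the current
# chunk is held in memory (path and column name are hypothetical)
for chunk in pd.read_csv("data.csv", chunksize=10000):
    x = chunk.drop(columns=["label"]).to_numpy()
    y = chunk["label"].to_numpy()
    # ...feed (x, y) to an online learner here...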
1. Perceptron: the Perceptron Learning Algorithm (PLA). One of the oldest machine learning algorithms, it can only solve linearly separable problems, though a kernel-based perceptron can handle non-linear datasets (a sketch follows the PLA code below). For a detailed walkthrough, see the separate write-up on the perceptron algorithm (PLA), especially the part about rotation: y sets the direction of each weight adjustment, and alpha sets the rotation angle, with the goal that the cost function converges to a state where the prediction direction agrees with y for every x. Just take care not to set the learning-rate parameter alpha too large, or each update rotates the weights too far; values between 0.1 and 0.4 are typical.
2. Stochastic gradient descent: Stochastic Gradient Descent, SGD. SGD operates on only one instance at a time. The single-instance update rules of both algorithms are sketched right after this list.
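A minimal sketch of the two update rules, with illustrative names that do not appear in the recipes below:

import numpy as np

def pla_update(weights, x_i, y_i, alpha=0.2):
    # One PLA step: on a misclassified point, rotate the weight vector
    # toward the correct side. y_i in {-1, +1} picks the direction and
    # alpha the rotation angle (step size).
    if np.sign(np.dot(weights, x_i)) != y_i:
        weights = weights + alpha * y_i * x_i
    return weights

def sgd_update(weights, x_i, y_i, eta=0.0001):
    # One SGD step on the logistic (log) loss for a single instance,
    # which is what "operating on one instance at a time" means
    margin = y_i * np.dot(weights, x_i)
    grad = -y_i * x_i / (1.0 + np.exp(margin))  # gradient of log(1+exp(-margin))
    return weights - eta * grad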
Perceptron:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 10:43:15 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale
import numpy as np
# generate the data
def get_data(batch_size):
    b_size = 0
    no_features = 30
    redundant_features = int(0.1 * no_features)
    informative_features = int(0.8 * no_features)
    repeated_features = int(0.1 * no_features)
    while b_size < batch_size:  # yield several batches of data
        x, y = make_classification(n_samples=1000,
                                   n_features=no_features,
                                   flip_y=0.03,
                                   n_informative=informative_features,
                                   n_redundant=redundant_features,
                                   n_repeated=repeated_features,
                                   random_state=51)  # fixed seed, so every batch is identical
        y_indx = y < 1  # relabel every 0 as -1
        y[y_indx] = -1
        x = scale(x, with_mean=True, with_std=True)  # center and standardize
        yield x, y
        b_size += 1
# build the perceptron model
def build_model(x, y, weights, epochs, alpha=0.5):
    for i in range(epochs):  # one weight-update pass per epoch
        # shuffle the dataset: np.random.permutation returns a shuffled
        # index array (np.random.shuffle would shuffle in place and return None)
        shuff_index = np.random.permutation(len(y))
        x_train = x[shuff_index, :]
        y_train = y[shuff_index]
        # update the weights one instance at a time
        for index in range(len(y)):
            # sign function: returns +1 if the product of the weights and the
            # features is positive, -1 if it is negative
            prediction = np.sign(np.sum(x_train[index, :] * weights))
            if prediction != y_train[index]:
                weights = weights + alpha * (y_train[index] * x_train[index, :])  # update
    return weights
# evaluate the model
def model_worth(x, y, weights):
    prediction = np.sign(np.sum(x * weights, axis=1))
    print(classification_report(y, prediction))
# main
if __name__ == "__main__":
    data = get_data(10)
    x, y = next(data)
    weights = np.zeros(x.shape[1])
    for i in range(10):
        epochs = 100
        weights = build_model(x, y, weights, epochs)
        print("model worth after receiving dataset batch %d" % (i + 1))
        model_worth(x, y, weights)
        if i < 9:
            x, y = next(data)  # advance to the next batch: the full dataset is too large, so it is loaded in pieces
# Note that the weights are never re-initialized between batches; they keep
# accumulating updates. This is how PLA works around a dataset too large to
# train on at once: by importing the data in multiple batches.
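As mentioned at the top, a kernel-based perceptron can handle non-linear data. Here is a minimal sketch of that idea, not part of the recipe above; the RBF kernel choice and all names are assumptions:

import numpy as np

def rbf_kernel(a, b, gamma=0.5):
    # RBF (Gaussian) kernel between two sample vectors
    return np.exp(-gamma * np.sum((a - b) ** 2))

def kernel_perceptron(x, y, epochs=10, gamma=0.5):
    # Kernelized PLA: instead of a weight vector, keep a count alpha[i]
    # of how often training point i was misclassified. The decision
    # function is sign(sum_i alpha[i] * y[i] * K(x[i], x_new)).
    n = len(y)
    alpha = np.zeros(n)
    # precompute the Gram matrix of pairwise kernel values
    gram = np.array([[rbf_kernel(x[i], x[j], gamma) for j in range(n)]
                     for i in range(n)])
    for _ in range(epochs):
        for i in range(n):
            prediction = np.sign(np.sum(alpha * y * gram[:, i]))
            if prediction != y[i]:
                alpha[i] += 1  # mistake-driven update, as in plain PLA
    return alpha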
Stochastic gradient descent (SGD):
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 16:40:52 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier  # stochastic gradient descent
import numpy as np
def get_data():
    no_features = 30
    redundant_features = int(0.1 * no_features)
    informative_features = int(0.6 * no_features)
    repeated_features = int(0.1 * no_features)
    x, y = make_classification(n_samples=1000, n_features=no_features,
                               flip_y=0.03, n_informative=informative_features,
                               n_redundant=redundant_features,
                               n_repeated=repeated_features,
                               random_state=7)
    return x, y
def build_model(x, y, x_dev, y_dev):
    # max_iter: number of passes over the dataset used to update the weights
    # shuffle: shuffle the data between passes
    # loss: the loss function. For regression use "squared_error"; for
    #   classification the response y is discrete, so "log_loss" (named
    #   "log" in older scikit-learn versions) is a natural choice
    # learning_rate: "constant" keeps the rate eta0 fixed; alternatives are
    #   "optimal" (gradually decreasing) and "invscaling" (scaled down over time)
    # fit_intercept: also fit an intercept (bias) term
    # penalty: type of shrinkage. None here since no regularization is needed;
    #   to add L2 (ridge) regularization, set penalty="l2"
    estimator = SGDClassifier(max_iter=50, shuffle=True, loss="log_loss",
                              learning_rate="constant",
                              eta0=0.0001,
                              fit_intercept=True,
                              penalty=None)
    estimator.fit(x, y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y, train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev, dev_predicted)
    print("training accuracy = %0.2f, dev accuracy = %0.2f" %
          (train_score, dev_score))
if __name__=="__main__":
x,y = get_data()
x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
test_size=0.3,random_state=9)
build_model(x_train,y_train,x_dev,y_dev)
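SGDClassifier also supports genuinely out-of-core training through its partial_fit method, which ties back to the batch-loading theme at the top of this section. A minimal sketch, assuming batches is any iterable that yields (x, y) chunks (for example the get_data generator from the perceptron recipe above):

from sklearn.linear_model import SGDClassifier
import numpy as np

def build_model_online(batches, classes=np.array([-1, 1])):
    estimator = SGDClassifier(loss="log_loss", learning_rate="constant",
                              eta0=0.0001, penalty=None)
    for x, y in batches:
        # partial_fit updates the weights with one batch at a time;
        # the full set of class labels must be supplied on the first call
        estimator.partial_fit(x, y, classes=classes)
    return estimator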