linear regression

最新推荐文章于 2022-03-22 18:37:00 发布

qwzhong1988

最新推荐文章于 2022-03-22 18:37:00 发布

阅读量566

点赞数

CC 4.0 BY-SA版权

分类专栏： machine learning 文章标签： machine learning linear regression 线性回归

本文链接：https://blog.youkuaiyun.com/qwzhong1988/article/details/21073791

machine learning 专栏收录该内容

4 篇文章

订阅专栏

本文介绍了一个线性回归的实际案例，包括梯度下降和随机梯度下降两种方法的应用，并讨论了线性回归过程中需要考虑的因素，如数据的相关性和参数的初始化等。

今天写线性回归的例子^_^

（1）代码没有实现数据归一化的操作；（2）包含了梯度下降和随机梯度下降两种方法；（3）适用于多元线性回归（多个自变量）。

体会：线性回归需要很强的先验知识，比如数据的相关性、数据的大致情况（会影响参数的初始值、学习率（步长）、迭代次数和收敛阈值等等）

训练的过程就是斜率来回摆动和截距上下移动的过程

#coding=gbk
'''
@author: qwzhong1988(qwzhong1988@163.com)
'''
class Model:
    """Multivariate linear regression fitted by gradient descent.

    Training data is read from an ARFF file whose data rows are
    comma-separated numbers; the last column of every row is the target
    value and all preceding columns are features.  No feature scaling is
    applied, so strongly correlated, similarly scaled inputs work best --
    otherwise the updates may overflow or converge to a poor fit.

    The code runs unchanged on Python 2 and Python 3.
    """

    def __init__(self):
        # Model parameters; the last entry is the constant (intercept) term b.
        self.theta = []
        # Raw data rows as lists of strings; the last column is the target.
        self.featureVectors = []
        # Attribute names collected from the @ATTRIBUTE header lines.
        self.featureNameList = []

    def __parseArff(self, arffFile):
        """Parse an ARFF file and return ``(attributeNames, dataRows)``.

        Blank lines and ``%`` comment lines are skipped.  Header keywords
        are matched case-insensitively.  Data rows are returned as lists of
        strings, split on commas.
        """
        names = []
        rows = []
        # The context manager closes the file even if parsing raises.
        with open(arffFile, 'r') as fr:
            for rawLine in fr:
                line = rawLine.strip()
                if not line or line.startswith('%'):
                    continue
                if line.startswith('@'):
                    tokens = line.split()
                    # Only @ATTRIBUTE lines carry a feature name; @RELATION,
                    # @DATA and any other header keyword are ignored.
                    if tokens[0].lower() == '@attribute' and len(tokens) > 1:
                        names.append(tokens[1])
                    continue
                rows.append(line.split(','))
        return names, rows

    def __readFile(self, arffFile):
        """Append the attribute names and data rows of *arffFile* to the model."""
        names, rows = self.__parseArff(arffFile)
        self.featureNameList.extend(names)
        self.featureVectors.extend(rows)

    def __numericRows(self):
        """Return the stored data rows converted to lists of floats."""
        return [[float(value) for value in row] for row in self.featureVectors]

    def __residual(self, row):
        """Return prediction minus target for one numeric data row."""
        return self.__predict(row) - row[-1]

    def train(self, arffFile, flg=True, alpha=0.01, epsilon=0.1, maxIter=1500):
        """Fit the model to the data in *arffFile* and print the parameters.

        Parameters:
            arffFile: path of the ARFF training file.
            flg: True selects batch gradient descent, False selects a single
                pass of stochastic gradient descent.
            alpha: learning rate (step size).
            epsilon: convergence threshold for batch gradient descent; the
                loop stops once the mean absolute change of the parameters
                in one iteration drops below it.
            maxIter: maximum number of batch gradient descent iterations.

        Raises:
            ValueError: if the file contains no data rows.

        The stopping rules are heuristic: an iteration cap plus a small
        change in the parameters between two iterations.
        NOTE(review): the default epsilon=0.1 is coarse relative to
        alpha=0.01, so training may stop well before the optimum; pass a
        smaller epsilon for a tighter fit.
        """
        # Start from a clean slate so repeated calls do not accumulate rows.
        self.featureVectors = []
        self.featureNameList = []
        self.__readFile(arffFile)
        if not self.featureVectors:
            raise ValueError('no data rows found in %s' % arffFile)

        # One parameter per feature column plus the trailing constant b,
        # which takes the slot of the target column.
        self.theta = [1.0] * len(self.featureVectors[0])
        if flg:
            self.__gradientDescent(alpha, epsilon, maxIter)
        else:
            self.__stochasticGradientDescent(alpha)

        print(self.theta)

    def __gradientDescent(self, alpha=0.01, epsilon=0.1, maxIter=1500):
        """Run batch gradient descent on the stored data, updating ``theta``.

        Every iteration computes the full gradient from the current
        parameters and then updates all parameters simultaneously.
        """
        rows = self.__numericRows()
        sampleCount = len(rows)
        if sampleCount == 0:
            return
        paramCount = len(self.theta)

        for _ in range(maxIter):
            gradient = [0.0] * paramCount
            for row in rows:
                # The residual is shared by every partial derivative, so it
                # is computed once per sample.
                residual = self.__residual(row)
                for k in range(paramCount - 1):
                    gradient[k] += residual * row[k]
                # The constant term behaves like a feature fixed at 1.
                gradient[-1] += residual

            # Build the new vector before storing it so that no gradient
            # component is evaluated with partially updated parameters.
            updated = [value - alpha * slope / sampleCount
                       for value, slope in zip(self.theta, gradient)]
            meanChange = sum(abs(new - old)
                             for new, old in zip(updated, self.theta))
            meanChange /= float(paramCount)
            self.theta[:] = updated
            if meanChange < epsilon:
                break

    def __stochasticGradientDescent(self, alpha=0.01):
        """Run one pass of stochastic gradient descent over the stored data.

        The parameters are updated after every sample, so plenty of data is
        needed for a single pass to get close to a good fit.
        NOTE(review): each step is additionally divided by the number of
        samples, which makes it much smaller than the textbook SGD step;
        kept as is so existing results do not change -- confirm intent.
        """
        rows = self.__numericRows()
        sampleCount = len(rows)
        paramCount = len(self.theta)

        for row in rows:
            # Evaluate the residual once, before any parameter moves.
            residual = self.__residual(row)
            for k in range(paramCount - 1):
                self.theta[k] -= alpha * (residual * row[k]) / sampleCount
            self.theta[-1] -= alpha * residual / sampleCount

    def test(self, arffFile):
        """Print and return the prediction for every data row of *arffFile*.

        Rows use the same layout as the training file; any column beyond the
        feature columns (such as the target) is ignored.

        Returns:
            list of predicted values, one per data row, in file order.
        """
        _, rows = self.__parseArff(arffFile)
        predictions = []
        for instance in rows:
            prediction = self.__predict(instance)
            print('houseprice: %s' % prediction)
            predictions.append(prediction)
        return predictions

    def __predict(self, instance):
        """Return theta . x + b for the leading feature values of *instance*.

        *instance* may hold strings or numbers; only its first
        ``len(theta) - 1`` entries are read.
        """
        total = 0.0
        for i in range(len(self.theta) - 1):
            total += self.theta[i] * float(instance[i])
        total += self.theta[-1]
        return total


if __name__ == '__main__':
    # Fit on the bundled house-price data, then print the predictions the
    # fitted model makes for those same rows.
    dataFile = 'houseprice.arff'
    regressor = Model()
    regressor.train(dataFile)
    regressor.test(dataFile)

数据来源

houseprice.arff

@RELATION house
@ATTRIBUTE houseSize NUMERIC
@ATTRIBUTE sellingPrice NUMERIC
@DATA
0.951,30
1.036,39.9
0.676,46.5
1.456,48.6
1.186,51.5
1.456,56.99
1.368,59.9
0.994,62.5
1.176,65.5
1.216,69
1.41,76.9
1.344,79
1.064,79.9
1.77,79.95
1.524,82.9
1.75,84.9
1.152,85
1.77,87.9
1.624,89.9
1.54,89.9
1.532,93.5
1.647,94.9
1.344,95.8
1.55,98.5
1.752,99.5
1.45,99.9
1.312,102
1.636,106
1.5,108.9
1.8,109.9
1.972,110
1.387,112.29
2.082,114.9
2.463,119.9
2.572,119.9
2.113,122.9
2.016,123.938
1.852,124.9
2.67,126.9
2.336,129.9
1.98,132.9
2.483,134.9
2.809,135.9
2.036,139.5
2.298,139.99
2.038,144.9
2.37,147.6
2.921,149.99
2.262,152.55
2.456,156.9
2.436,164
1.92,167.5
2.949,169.9
3.31,175
2.805,179
2.553,179.9
2.51,189.5
3.627,199

结果比较：