贝叶斯分类器、决策树和支持向量机在基于多个不同因素对数值型数据做预测时,并不是最好的选择。
构建数据集
#!/usr/bin/python
from random import random,randint
import math
#构建数据集:酒的等级越高,起始价格越高,并且价格会一直上涨,直到达到峰值年份(peak age)
#而酒的等级越低,起始价格越便宜,并且会变得越来越便宜
#生成酒的价格
def wineprice(rating, age):
    """Return a synthetic price for a wine.

    rating -- quality rating of the wine; the base price is rating / 2.
    age    -- age of the wine, in the same unit as the peak age.

    The peak age is ``rating - 50``.  Up to the peak the price grows
    towards roughly five times the base price; past the peak it loses one
    base-price unit per unit of age and reaches zero five units later.
    The result is never negative.
    """
    peak_age = rating - 50
    # Divide by a float so integer arguments do not truncate on Python 2.
    price = rating / 2.0
    # A peak age of exactly 0 would divide by zero in the growth branch,
    # so such a wine is treated as already being at/past its peak.
    if age > peak_age or peak_age == 0:
        # Past its peak: loses value and is worthless after 5 more units.
        price = price * (5 - (age - peak_age))
    else:
        # Approaching its peak: value climbs towards 5x the base price.
        price = price * (5 * (age + 1) / peak_age)
    if price < 0:
        price = 0
    return price
#生成数据集
def genwineset():
    """Build a random data set of 300 wines.

    Each row is a dict with two keys: 'input' holds the tuple
    (rating, age) and 'result' holds the noisy price.  Ratings are drawn
    from [50, 100) and ages from [0, 50).
    """
    rows = []
    for _ in range(300):
        rating = random() * 50 + 50
        # Fixed: the original called the undefined name ``radom``.
        age = random() * 50
        price = wineprice(rating, age)
        # Add noise: scale the price by a random factor in [0.8, 1.2).
        price *= random() * 0.4 + 0.8
        rows.append({'input': (rating, age), 'result': price})
    return rows
定义相似度
本文中使用rating和age组成的向量来计算相似度
#欧式距离
def euclidean(v1, v2):
    """Return the Euclidean distance between vectors v1 and v2.

    Only the first len(v1) components are compared; v2 must be at least
    as long as v1, otherwise an IndexError is raised.
    """
    total = 0.0
    for i, a in enumerate(v1):
        total += (a - v2[i]) ** 2
    return math.sqrt(total)
k-Nearest Neighbors
k-nearest neighbors(kNN):找到一组与待估价item最相似的item,把它们价格的平均值作为对该item价格的猜测。kNN中的k指参与求平均的邻居item的数目。若k为1,意味着只挑选最近的一个邻居,并直接使用它的价格。选取太多的邻居,会降低估值的精确度。
#取得向量v1最近的邻居
def getdistances(data, vec1):
    """Return (distance, index) pairs for every row of data, nearest first.

    data -- sequence of rows; each row's 'input' entry is the vector that
            is compared against vec1.
    vec1 -- the query vector.

    The index in each pair is the row's position in data.
    """
    distancelist = [
        (euclidean(vec1, row['input']), i) for i, row in enumerate(data)
    ]
    # Fixed: the original wrote ``distancelist.sort`` without calling it
    # and never returned the list, so callers received None.
    distancelist.sort()
    return distancelist
#kNN算法实现
def knnestimate(data, vec1, k=3):
    """Estimate the result for vec1 as the plain average of its k nearest rows.

    data -- sequence of rows with 'input' and 'result' entries.
    vec1 -- the query vector.
    k    -- number of nearest neighbours to average; data must contain at
            least k rows.
    """
    # (distance, index) pairs, nearest first.
    dlist = getdistances(data, vec1)
    total = 0.0
    for i in range(k):
        idx = dlist[i][1]
        total += data[idx]['result']
    return total / k
带权重的Neighbors
对kNN算法的补充:根据距离来确定权重。item越相似,它们之间的距离越小,因此需要一种把距离转换为权重的方法。
#反函数权重,dist为距离;缺点是对于非常相近的item,权重会变得很大
def inverseweight(dist, num=1.0, const=0.1):
    """Convert a distance into a weight using an inverse function.

    Returns num / (dist + const).  With a non-negative distance the
    weight is at most num / const, which is what const is for: it keeps
    the weight finite when dist is 0.
    """
    return num / (dist + const)
#减法权重
def subtractweight(dist, const=1.0):
    """Convert a distance into a weight by subtracting it from const.

    Returns const - dist, or 0 once the distance exceeds const, so the
    weight is never negative.
    """
    if dist > const:
        return 0
    return const - dist
#高斯
def gaussian(dist, sigma=10.0):
    """Convert a distance into a weight with a Gaussian (bell) curve.

    Returns exp(-dist**2 / (2 * sigma**2)): 1.0 at distance 0, falling
    towards 0 as the distance grows.  sigma controls how quickly the
    weight falls off and must be non-zero.
    """
    # 2.0 forces float division even for integer arguments on Python 2.
    return math.exp(-(dist ** 2) / (2.0 * sigma ** 2))
#带权重的kNN算法实现
def weightedknn(data, vec1, k=5, weightf=gaussian):
    """Estimate the result for vec1 as a weighted average of its k nearest rows.

    data    -- sequence of rows with 'input' and 'result' entries.
    vec1    -- the query vector.
    k       -- number of nearest neighbours to use; data must contain at
               least k rows.
    weightf -- function mapping a distance to a weight.

    Returns 0 when the weights of all k neighbours add up to zero.
    """
    # (distance, index) pairs, nearest first.
    dlist = getdistances(data, vec1)
    weighted_sum = 0.0
    totalweight = 0.0
    for i in range(k):
        dist, idx = dlist[i][0], dlist[i][1]
        weight = weightf(dist)
        weighted_sum += weight * data[idx]['result']
        totalweight += weight
    # Every neighbour may get zero weight (e.g. subtractweight with all
    # neighbours farther away than const); avoid dividing by zero.
    if totalweight == 0:
        return 0
    return weighted_sum / totalweight
异构变量
#加入多个变量因素,瓶子尺寸
def wineset2():
    """Build a random data set of 300 wines with heterogeneous variables.

    Each row is a dict whose 'input' entry is the tuple
    (rating, age, aisle, bottlesize) and whose 'result' entry is the
    noisy price.  The aisle is drawn at random and is not used when the
    price is computed; the bottle size scales the price relative to a
    750 bottle.
    """
    sizes = [375.0, 750.0, 1500.0, 3000.0]
    rows = []
    for _ in range(300):
        rating = random() * 50 + 50
        age = random() * 50
        aisle = float(randint(1, 20))
        bottlesize = sizes[randint(0, 3)]
        price = wineprice(rating, age)
        price *= (bottlesize / 750)
        # Add noise: scale the price by a random factor in [0.2, 1.1).
        price *= (random() * 0.9 + 0.2)
        rows.append({'input': (rating, age, aisle, bottlesize),
                     'result': price})
    # Fixed: the original never returned the generated rows.
    return rows
#通过scale系数,重新计算,此处的scale系数可以通过之前提过的优化方法找到最合适的值
def rescale(data, scale):
    """Return a copy of data with every input component multiplied by a factor.

    data  -- sequence of rows with 'input' and 'result' entries.
    scale -- one factor per input component; only the first len(scale)
             components of each input are kept.

    The input rows are not modified; each new row holds the scaled input
    as a list and the original 'result' value.
    """
    scaleddata = []
    for row in data:
        scaled = [factor * row['input'][i] for i, factor in enumerate(scale)]
        scaleddata.append({'input': scaled, 'result': row['result']})
    return scaleddata
def wineset3():
    """Build a wine data set in which an untracked variable distorts the price.

    Starts from genwineset() and, with probability 0.5 per row, multiplies
    the price by 0.6 to simulate a purchase at a discount store.  The
    discount is not recorded in the row's 'input', so it acts as a hidden
    variable.
    """
    rows = genwineset()
    for row in rows:
        if random() < 0.5:
            # Wine was bought at a discount store.
            row['result'] *= 0.6
    return rows