from math import sqrt
import pandas as pd
import numpy as np
# Banner line emitted once, before any of the similarity figures are printed.
print("同一个向量三个相关性计算如下:")
def multipl(a1, b):
    """Return the sum of element-wise products of ``a1`` and ``b`` as a float.

    Args:
        a1: Sized sequence of numbers (e.g. a list).
        b: Numbers to pair with ``a1`` by position. Anything exposing
            ``tolist()`` is converted through it; any other iterable
            (list, tuple, ...) is accepted as well.

    Returns:
        ``a1[0] * b[0] + a1[1] * b[1] + ...`` over the first ``len(a1)``
        positions, starting from ``0.0``.

    Raises:
        IndexError: if ``b`` has fewer elements than ``a1``.
    """
    # Flatten ``b`` to a plain list so elements are paired strictly by position.
    b1 = b.tolist() if hasattr(b, "tolist") else list(b)
    if len(b1) < len(a1):
        raise IndexError("b has fewer elements than a1")
    # zip stops at the end of a1, so surplus elements of b are ignored.
    sumofab = sum((left * right for left, right in zip(a1, b1)), 0.0)
    # Progress/debug trace of the computed product sum.
    print("sumofab:", sumofab)
    return sumofab
def corrcoef(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Uses the single-pass "sum of products" formula::

        r = (Sxy - Sx*Sy/n) / sqrt((Sxx - Sx^2/n) * (Syy - Sy^2/n))

    Args:
        x: Sized sequence of numbers; its length is used as ``n``.
        y: Numbers paired with ``x`` by position. It is passed as the second
            argument to ``multipl`` and iterated directly here. It is
            expected to have the same length as ``x``; this is not checked.

    Returns:
        The correlation as a float. ``0.0`` is returned when the correlation
        is undefined: empty input, or either input having zero variance.
    """
    n = len(x)
    if n == 0:
        # No observations: avoid dividing by zero below.
        return 0.0

    # Plain sums
    sum1 = sum(x)
    sum2 = sum(y)
    # Sum of products
    sumofxy = multipl(x, y)
    # Sums of squares
    sumofx2 = sum(pow(i, 2) for i in x)
    sumofy2 = sum(pow(j, 2) for j in y)

    num = sumofxy - (float(sum1) * float(sum2) / n)
    # n * variance of each input. For (near-)constant data floating-point
    # rounding can leave these a hair below zero, which would make sqrt()
    # raise "math domain error"; treat that the same as zero variance.
    spread_x = sumofx2 - float(sum1 ** 2) / n
    spread_y = sumofy2 - float(sum2 ** 2) / n
    if spread_x <= 0 or spread_y <= 0:
        return 0.0

    den = sqrt(spread_x * spread_y)
    if den == 0:
        # The product of two tiny spreads underflowed to zero.
        return 0.0
    return num / den
# Load the header-less CSV. The context manager closes the handle once the
# frame has been read. Malformed rows are dropped with a warning, which is
# what ``error_bad_lines=False`` did on its own.
with open('4-27-test.csv', 'rb') as textFile:
    try:
        data = pd.read_csv(textFile, header=None, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the older keyword.
        textFile.seek(0)
        data = pd.read_csv(textFile, header=None, error_bad_lines=False)
# Name the columns x0, x1, ... (the job the removed ``prefix='x'`` option did).
data = data.add_prefix('x')
data = data.dropna()
print(data)
# Shuffle the rows. The result must be assigned back: sample() returns a new
# frame and leaves ``data`` itself untouched.
data = data.sample(frac=1).reset_index(drop=True)
print(data.shape)
# Column 23 is the label; it is the same for every feature, so read it once.
# All column access is positional (iloc).
y = data.iloc[:, 23]
for i in range(46):
    # Column 19 contains "-" values and column 23 is the label itself.
    if i == 19 or i == 23:
        continue
    x = [int(x1) for x1 in data.iloc[:, i]]
    print('第%d个字段与label的皮尔斯相关系数为:%f' % (i, corrcoef(x, y)))
# 欧式距离
# -*-coding:utf-8-*-
def distance(vector1, vector2):
    """Return the Euclidean distance between two numeric vectors.

    Elements are paired by position with ``zip``, so if the inputs differ in
    length the extra elements of the longer one are ignored.

    Args:
        vector1: Iterable of numbers.
        vector2: Iterable of numbers.

    Returns:
        The square root of the sum of squared element-wise differences
        (``0.0`` for empty input).
    """
    squared_total = sum((a - b) ** 2 for a, b in zip(vector1, vector2))
    return squared_total ** 0.5
# Euclidean distance between x and y as left bound by the preceding code.
print('欧式距离:%f' % distance(x, y))
import numpy as np
# 自定义余弦相似度函数
def get_cossimi(x, y):
    """Return the cosine similarity of two numeric vectors.

    Args:
        x: Array-like of numbers.
        y: Array-like of numbers, multiplied element-wise with ``x``.

    Returns:
        ``sum(x*y) / (||x|| * ||y||)`` as a float, or ``0.0`` when either
        vector has zero norm (including empty input), where the similarity
        is undefined.
    """
    # Work in floating point: integer arrays would multiply in a fixed-width
    # integer dtype and silently wrap around for large values.
    myx = np.asarray(x, dtype=float)
    myy = np.asarray(y, dtype=float)
    dot = np.sum(myx * myy)
    norm_x = np.sqrt(np.sum(myx * myx))
    norm_y = np.sqrt(np.sum(myy * myy))
    norm_product = float(norm_x * norm_y)
    if norm_product == 0:
        # A zero vector has no direction; avoid dividing by zero (nan/inf).
        return 0.0
    return float(dot / norm_product)
# Cosine similarity of x and y as left bound by the preceding code.
print('余弦相似性:%f' % (get_cossimi(x, y),))