from numpy import *
import matplotlib.pyplot as plt
from scipy import *
from numpy import linalg as la
def loadDataSet(fileName, delim='\t'):
    #load a delimited text file into a NumPy matrix of floats
    with open(fileName) as fr:
        stringArr = [line.strip().split(delim) for line in fr.readlines()]
    #print(stringArr)
    datArr = [list(map(float, line)) for line in stringArr]
    return mat(datArr)
##dataMat = loadDataSet('testSet.txt')
def pca(dataMat, topNfeat=9999999):
    meanVals = mean(dataMat, axis=0)
    #print(meanVals)
    meanRemoved = dataMat - meanVals                    #remove mean
    covMat = mat(cov(meanRemoved, rowvar=0))
    #print(covMat)
    eigVals, eigVects = linalg.eig(covMat)
    ## print(eigVals)
    ## print(eigVects)
    eigValInd = argsort(eigVals)                        #argsort goes smallest to largest
    #print(eigValInd)
    eigValInd = eigValInd[:-(topNfeat+1):-1]            #cut off unwanted dimensions
    #print(eigValInd)
    redEigVects = eigVects[:, eigValInd]                #reorganize eig vects, largest to smallest
    #print(redEigVects)
    lowDDataMat = meanRemoved * redEigVects             #transform data into new dimensions
    #print(lowDDataMat)
    reconMat = (lowDDataMat * redEigVects.T) + meanVals #reconstruct original data for comparison
    ## print(lowDDataMat * redEigVects.T)
    ## print(reconMat)
    ## print(dataMat)
    return lowDDataMat, reconMat
##lowDMat,reconMat = pca(dataMat,1)
##print(shape(lowDMat))
##
##fig = plt.figure()
##ax = fig.add_subplot(111)
##ax.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],
## marker = '^',s = 40,c ='orange')
##ax.scatter(reconMat[:,0].flatten().A[0],reconMat[:,1].flatten().A[0])
##plt.show()
def replaceNanWithMean():
    datMat = loadDataSet('secom.data', ' ')
    #print(shape(datMat))
    numFeat = shape(datMat)[1]
    #print(numFeat)
    for i in range(numFeat):
        meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #mean of the non-NaN values
        datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal        #set NaN values to the column mean
    return datMat
dataMat = replaceNanWithMean()
#print(dataMat)
meanVals = mean(dataMat,axis = 0)
meanRemoved = dataMat - meanVals
covMat = cov(meanRemoved,rowvar = 0)
eigVals,eigVects = linalg.eig(mat(covMat))
##print(eigVals)
##print(eigVects)
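##A small sketch, not part of the original post: rank the SECOM eigenvalues and look at
##how much of the total variance the leading principal components explain (eig may return
##tiny imaginary parts, so only the real part is kept here).
##sortedEigVals = sort(real(eigVals))[::-1]
##varPercentage = sortedEigVals/sum(sortedEigVals)*100
##print(cumsum(varPercentage)[:20])   #cumulative % of variance for the top 20 components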
###Second preprocessing option: directly drop any feature whose missing rate exceeds 80%
##secom = loadDataSet('secom.data',' ')
###print(shape(secom))
##secom1 = secom.copy()
##nanInd = []
##for i in range(secom1.shape[1]):
##    nan = mean(isnan(secom1[:,i]))              #fraction of missing values in this column
##    #print(nan)
##    if nan*100 > 80:
##        print("column %d missing rate: %f" % (i, nan*100))
##        nanInd.append(i)
##        print(nanInd)
##
##del_secom1 = delete(secom1,nanInd,1)
###print(shape(del_secom1))
##numFeat = shape(del_secom1)[1]
##for i in range(numFeat):
##    meanVal = mean(del_secom1[nonzero(~isnan(del_secom1[:,i].A))[0],i]) #mean of the non-NaN values
##    del_secom1[nonzero(isnan(del_secom1[:,i].A))[0],i] = meanVal        #set NaN values to the mean
##U,Sigma,VT = la.svd([[1,1],[7,7]])
##print(U)
##print(Sigma)
##print(VT)
##--------------------------------------------------------------------------------
##--------------------------------------------------------------------------------
def loadExData():
    return [[0, 0, 0, 2, 2],
            [0, 0, 0, 3, 3],
            [0, 0, 0, 1, 1],
            [1, 1, 1, 0, 0],
            [2, 2, 2, 0, 0],
            [5, 5, 5, 0, 0],
            [1, 1, 1, 0, 0]]
##Data = loadExData()
##U,Sigma,VT = la.svd(Data)
##print(Sigma)
##
##Sig2 = mat([[Sigma[0], 0],[0, Sigma[1]]])
##Data2 = U[:,:2]*Sig2*VT[:2,:]
##print(Data2)
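##A quick check, assuming Data and Data2 from the commented lines above (not in the
##original post): the Frobenius norm of the difference shows how close the rank-2
##reconstruction is to the original matrix.
##print(la.norm(mat(Data) - Data2))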
def euclidSim(inA, inB):
    return 1.0/(1.0 + la.norm(inA - inB))
def pearsSim(inA, inB):
    if len(inA) < 3: return 1.0
    return 0.5 + 0.5*corrcoef(inA, inB, rowvar=0)[0][1]
def cosSim(inA, inB):
    num = float(inA.T*inB)
    denom = la.norm(inA)*la.norm(inB)
    return 0.5 + 0.5*(num/denom)
myMat = mat(loadExData())
##print(myMat)
##print(euclidSim(myMat[:,0],myMat[:,4]))
##print(euclidSim(myMat[:,0],myMat[:,0]))
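##The same sanity check with the other two measures (a sketch, not in the original post),
##reusing columns 0 and 4 of myMat; each measure should return 1.0 when a column is
##compared with itself.
##print(cosSim(myMat[:,0],myMat[:,4]))
##print(cosSim(myMat[:,0],myMat[:,0]))
##print(pearsSim(myMat[:,0],myMat[:,4]))
##print(pearsSim(myMat[:,0],myMat[:,0]))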
def standEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0: continue
        overLap = nonzero(logical_and(dataMat[:,item].A>0, \
                                      dataMat[:,j].A>0))[0]
        if len(overLap) == 0: similarity = 0
        else: similarity = simMeas(dataMat[overLap,item], \
                                   dataMat[overLap,j])
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    unratedItems = nonzero(dataMat[user,:].A==0)[1]     #find unrated items
    if len(unratedItems) == 0: return 'you rated everything'
    itemScores = []
    for item in unratedItems:
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        itemScores.append((item, estimatedScore))
    print(itemScores)
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]
myMat[0,1] = myMat[0,0] = myMat[1,0] = myMat[2,0] = 4
myMat[3,3] = 2
##print(myMat)
##print(recommend(myMat,2))
def loadExData2():
    return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
            [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
            [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
            [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
            [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
            [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
            [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
            [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
            [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]
U,Sigma,VT = la.svd(mat(loadExData2()))
##print(Sigma)
Sigma_2 = multiply(Sigma,Sigma)
Sigma_2_90 = 0.9*sum(Sigma_2)
n = sum(Sigma_2[:3])   #energy of the first three singular values, the ones we keep
##print(Sigma_2)
##print(sum(Sigma_2))
##print(Sigma_2_90)
##print(n)
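##A short sketch (an assumption, not in the original post): derive the number of singular
##values needed to reach the 90% energy threshold instead of hard-coding three.
##energy = cumsum(Sigma_2)
##numSV = int(argmax(energy >= Sigma_2_90)) + 1
##print('%d singular values retain at least 90%% of the total energy' % numSV)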
def svdEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    U,Sigma,VT = la.svd(dataMat)
    Sig4 = mat(eye(4)*Sigma[:4])                #arrange Sig4 into a diagonal matrix
    xformedItems = dataMat.T * U[:,:4] * Sig4.I #create transformed items in the low-dimensional space
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0 or j == item: continue
        similarity = simMeas(xformedItems[item,:].T,\
                             xformedItems[j,:].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal
myMat2 = mat(loadExData2())
print(recommend(myMat2, 1, estMethod=svdEst))
print(recommend(myMat2, 1, estMethod=svdEst, simMeas=pearsSim))
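##For comparison (a sketch, not part of the original run): the same user scored with the
##default standEst estimator and cosine similarity.
##print(recommend(myMat2, 1))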