《机器学习》第十三、十四章:利用 PCA、SVD 简化数据

本文探讨了主成分分析(PCA)和奇异值分解(SVD)在数据预处理和推荐系统中的实现与应用。PCA 用于降低数据维度,在去除噪声的同时保留主要特征;SVD 用于矩阵分解,以提高推荐系统的预测准确性。文章详细介绍了这两种方法的 Python 实现过程,包括数据集的加载、缺失值处理、相似度计算及推荐算法的优化。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

from numpy import *
import matplotlib.pyplot as plt
from scipy import *
from numpy import linalg as la


def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Args:
        fileName: path of the text file to read.
        delim: field separator used on every line (a tab by default).

    Returns:
        A numpy matrix with one row per non-blank line and one column per
        field, every field converted with ``float``.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    # The context manager closes the file even when a field fails to parse.
    with open(fileName) as fr:
        datArr = [list(map(float, line.strip().split(delim)))
                  for line in fr
                  if line.strip()]  # skip blank lines instead of crashing on float('')
    # asmatrix is the supported spelling; the `mat` alias was removed in NumPy 2.0.
    return asmatrix(datArr)

##dataMat = loadDataSet('testSet.txt')

def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: numpy matrix with one sample per row and one feature per
            column.
        topNfeat: number of principal components to keep. The default is
            larger than any realistic feature count, so all components are
            kept.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the kept components, and that data mapped back into the original
        feature space with the mean added back.
    """
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    covMat = asmatrix(cov(meanRemoved, rowvar=0))
    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and orthonormal eigenvectors, whereas the general eig
    # solver may return complex results for the same input.
    eigVals, eigVects = la.eigh(covMat)
    # argsort is ascending; the reversed slice keeps the topNfeat largest
    # eigenvalues, ordered largest first.
    eigValInd = argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects  # coordinates in the new basis
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

##lowDMat,reconMat = pca(dataMat,1)
##print(shape(lowDMat))
##
##fig = plt.figure()
##ax = fig.add_subplot(111)
##ax.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],
##           marker = '^',s = 40,c ='orange')
##ax.scatter(reconMat[:,0].flatten().A[0],reconMat[:,1].flatten().A[0])
##plt.show()


def replaceNanWithMean(fileName='secom.data', delim=' '):
    """Load a numeric data file and replace NaN entries with column means.

    Args:
        fileName: path of the file handed to ``loadDataSet``. Defaults to
            'secom.data', the file the original version always read.
        delim: field separator handed to ``loadDataSet``. Defaults to a
            single space.

    Returns:
        The matrix returned by ``loadDataSet``, modified in place so that
        each NaN is replaced by the mean of the non-NaN values in its column.
        A column with no non-NaN values is left unchanged.
    """
    datMat = loadDataSet(fileName, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        nanMask = isnan(datMat[:, i].A)
        nanRows = nonzero(nanMask)[0]
        validRows = nonzero(~nanMask)[0]
        # Nothing to fill, or no observed value to average: skip the column
        # (this also avoids NumPy's "mean of empty slice" warning).
        if len(nanRows) == 0 or len(validRows) == 0:
            continue
        datMat[nanRows, i] = mean(datMat[validRows, i])
    return datMat


# Module-level eigen-decomposition of the NaN-filled data set. This runs at
# import time and relies on replaceNanWithMean() being able to read its file.
dataMat = replaceNanWithMean()
meanVals = mean(dataMat, axis=0)      # per-feature means
meanRemoved = dataMat - meanVals      # centred data
covMat = cov(meanRemoved, rowvar=0)   # features are columns
# asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
eigVals, eigVects = linalg.eig(asmatrix(covMat))
##print(eigVals)
##print(eigVects)



###第二种数据处理方法:将缺失大于百分之八十的特征直接去掉
##secom = loadDataSet('secom.data',' ')
###print(shape(secom))
##secom1 = secom.copy()
##nanInd = []
##for i in range(secom1.shape[1]):
##    nan = mean(isnan(secom1[:,i]))#算缺失值比例
##    #print(nan)
##    if nan*100 > 80:
##        print("第%d列缺失值:%f" %(i,nan*100))
##        nanInd.append(i)
##        print(nanInd)
##
##del_secom1 = delete(secom1,nanInd,1)
###print(shape(del_secom1))
##numFeat = shape(del_secom1)[1]
##for i in range(numFeat):
##    meanVal = mean(del_secom1[nonzero(~isnan(del_secom1[:,i].A))[0],i]) #values that are not NaN (a number)
##    del_secom1[nonzero(isnan(del_secom1[:,i].A))[0],i] = meanVal  #set NaN 


##U,Sigma,VT = la.svd([[1,1],[7,7]])
##print(U)
##print(Sigma)
##print(VT)

##--------------------------------------------------------------------------------
##--------------------------------------------------------------------------------

def loadExData():
    """Return the small 7x5 example matrix as a fresh list of row lists.

    The first three rows are non-zero only in the last two columns; the
    remaining four rows are non-zero only in the first three columns.
    """
    upperRows = [[0, 0, 0, value, value] for value in (2, 3, 1)]
    lowerRows = [[value, value, value, 0, 0] for value in (1, 2, 5, 1)]
    return upperRows + lowerRows

##Data = loadExData()
##U,Sigma,VT = la.svd(Data)
##print(Sigma)
##
##Sig2 = mat([[Sigma[0], 0],[0, Sigma[1]]])
##Data2 = U[:,:2]*Sig2*VT[:2,:]
##print(Data2)


def euclidSim(inA, inB):
    """Return a similarity derived from the distance between two vectors.

    The result is ``1 / (1 + norm(inA - inB))``: 1.0 for identical inputs,
    approaching 0 as the inputs move apart.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA, inB):
    """Return the correlation coefficient of two vectors rescaled to [0, 1].

    Inputs with fewer than three entries are reported as perfectly similar
    (1.0) without computing a correlation.
    """
    if len(inA) >= 3:
        correlation = corrcoef(inA, inB, rowvar=0)[0][1]
        return 0.5 + 0.5 * correlation
    return 1.0

def cosSim(inA, inB):
    """Return the cosine similarity of two vectors rescaled to [0, 1].

    Args:
        inA: first vector; a column matrix, row matrix or 1-D array.
        inB: second vector with the same number of entries as ``inA``.

    Returns:
        ``0.5 + 0.5 * cos(angle)``: 1.0 for vectors pointing the same way,
        0.0 for opposite vectors. If either vector has zero length the
        cosine is undefined and the neutral value 0.5 is returned.
    """
    # Flatten to 1-D so dot() yields a true scalar. Calling float() on a 1x1
    # matrix product is deprecated in recent NumPy releases.
    vecA = asarray(inA).ravel()
    vecB = asarray(inB).ravel()
    denom = la.norm(vecA) * la.norm(vecB)
    if denom == 0:
        # Avoid a division by zero that would otherwise produce NaN.
        return 0.5
    num = float(dot(vecA, vecB))
    return 0.5 + 0.5 * (num / denom)


# Example matrix as a numpy matrix. asmatrix replaces the `mat` alias, which
# NumPy 2.0 removed.
myMat = asmatrix(loadExData())
##print(myMat)
##print(euclidSim(myMat[:,0],myMat[:,4]))
##print(euclidSim(myMat[:,0],myMat[:,0]))


def standEst(dataMat, user, simMeas, item):
    """Estimate a user's rating for an item as a similarity-weighted average.

    Args:
        dataMat: numpy matrix indexed as ``dataMat[user, item]``; a value of
            0 is treated as "not rated".
        user: row index of the user.
        simMeas: callable taking two column vectors and returning a
            similarity score.
        item: column index of the item to estimate.

    Returns:
        The user's own ratings averaged with weights given by how similar
        each rated item is to ``item``, or 0 when the weights sum to zero.
        One line is printed for every item the user has rated.
    """
    itemCount = shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(itemCount):
        rating = dataMat[user, other]
        if rating == 0:
            continue  # the user has not rated this item
        # Rows in which both items carry a positive rating.
        bothRated = logical_and(dataMat[:, item].A > 0,
                                dataMat[:, other].A > 0)
        sharedRows = nonzero(bothRated)[0]
        if len(sharedRows) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[sharedRows, item],
                                 dataMat[sharedRows, other])
        print('the %d and %d similarity is: %f' % (item, other, similarity))
        weightSum += similarity
        weightedRatings += similarity * rating
    if weightSum == 0:
        return 0
    return weightedRatings / weightSum


def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return the N unrated items with the highest estimated rating.

    Args:
        dataMat: numpy matrix indexed as ``dataMat[user, item]``; a value of
            0 is treated as "not rated".
        user: row index of the user to recommend for.
        N: maximum number of recommendations to return.
        simMeas: similarity function forwarded to ``estMethod``.
        estMethod: callable ``(dataMat, user, simMeas, item)`` returning an
            estimated rating.

    Returns:
        A list of ``(item, estimatedScore)`` tuples sorted by score, highest
        first, or the string 'you rated everything' when the user has no
        unrated item. The running score list is printed after each estimate.
    """
    userRatings = dataMat[user, :].A
    unratedItems = nonzero(userRatings == 0)[1]  # column indices with no rating
    if len(unratedItems) == 0:
        return 'you rated everything'
    itemScores = []
    for item in unratedItems:
        itemScores.append((item, estMethod(dataMat, user, simMeas, item)))
        print(itemScores)
    ranked = sorted(itemScores, key=lambda pair: pair[1], reverse=True)
    return ranked[:N]


# Overwrite a few entries of the example matrix in place:
# rating 4 at (0,0), (0,1), (1,0) and (2,0), and rating 2 at (3,3).
myMat[0, 0:2] = 4
myMat[1:3, 0] = 4
myMat[3, 3] = 2
##print(myMat)
##print(recommend(myMat,2))



def loadExData2():
    """Return the larger 11x11 example matrix as a fresh list of row lists."""
    rows = (
        (0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5),
        (0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3),
        (0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0),
        (3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0),
        (5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0),
        (0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0),
        (4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1),
        (0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4),
        (0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2),
        (0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0),
        (1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0),
    )
    # Build new lists on every call so callers can mutate the result freely.
    return [list(row) for row in rows]


# Decompose the larger example matrix and compare the squared singular values
# against 90% of their total.
# asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
U, Sigma, VT = la.svd(asmatrix(loadExData2()))
Sigma_2 = multiply(Sigma, Sigma)   # squared singular values
Sigma_2_90 = 0.9 * sum(Sigma_2)    # 90% of the total
n = sum(Sigma_2[:3])               # share carried by the first three singular values
##print(Sigma_2)
##print(sum(Sigma_2))
##print(Sigma_2_90)
##print(n)



def svdEst(dataMat, user, simMeas, item):
    """Estimate a user's rating for an item in an SVD-reduced item space.

    Args:
        dataMat: numpy matrix indexed as ``dataMat[user, item]``; a value of
            0 is treated as "not rated". It must be a matrix (not a plain
            array) because ``*`` is used as matrix multiplication.
        user: row index of the user.
        simMeas: callable taking two column vectors and returning a
            similarity score.
        item: column index of the item to estimate.

    Returns:
        The user's own ratings averaged with weights given by the similarity
        between each rated item and ``item`` in the reduced space, or 0 when
        the weights sum to zero. One line is printed per compared item.
    """
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # NOTE: the decomposition is recomputed on every call.
    U, Sigma, VT = la.svd(dataMat)
    # Keep up to 4 singular values. Small matrices have fewer, and a
    # hard-coded 4 would then fail to build the diagonal matrix.
    k = len(Sigma) if len(Sigma) < 4 else 4
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    SigK = asmatrix(diag(Sigma[:k]))
    xformedItems = dataMat.T * U[:, :k] * SigK.I  # one row per item, k columns
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T,
                             xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

# Demo run at import time: recommendations for user 1 from the SVD-based
# estimator, first with the default similarity and then with pearsSim.
# asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
myMat2 = asmatrix(loadExData2())
recommend(myMat2, 1, estMethod=svdEst)
recommend(myMat2, 1, estMethod=svdEst, simMeas=pearsSim)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值