import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def nomalizeData(X):
    """Standardize every column of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : object exposing ``mean(axis=0)`` and ``std(axis=0)``
        Typically a 2-D ``numpy.ndarray`` with samples in rows and features
        in columns.

    Returns
    -------
    The column-wise z-scores ``(X - mean) / std``. Columns whose standard
    deviation is exactly zero (constant features) are returned as zeros
    instead of NaN.
    """
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std == 0 and would produce 0/0 = NaN; dividing by
    # 1 instead leaves the (already zero) centered values untouched.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std
def zeroMean(dataMat):
    """Center ``dataMat`` by subtracting its mean along axis 0.

    Returns
    -------
    tuple
        ``(centered, column_means)`` where ``column_means`` is
        ``np.mean(dataMat, axis=0)`` and ``centered`` is ``dataMat`` with
        those means subtracted.
    """
    column_means = np.mean(dataMat, axis=0)
    return dataMat - column_means, column_means
def percentage2n(eigVals, percentage):
    """Return how many of the largest eigenvalues reach a share of the total.

    The eigenvalues are sorted in descending order and accumulated until the
    running sum is at least ``sum(eigVals) * percentage``.

    Parameters
    ----------
    eigVals : array-like of real numbers
        Eigenvalues, in any order.
    percentage : float
        Fraction of the total eigenvalue sum to reach (e.g. ``0.99``).

    Returns
    -------
    int
        The number of leading eigenvalues needed. If the threshold is never
        reached (for instance ``percentage > 1`` or an empty input), the
        total number of eigenvalues is returned instead of ``None``.
    """
    descending = np.sort(eigVals)[::-1]
    threshold = sum(descending) * percentage
    running_total = 0
    for count, value in enumerate(descending, start=1):
        running_total += value
        if running_total >= threshold:
            return count
    # Threshold unreachable: fall back to keeping every eigenvalue so that
    # callers always receive an integer.
    return len(descending)
def pca(dataMat, percentage=0.99):
    """Project ``dataMat`` onto its leading principal components.

    Parameters
    ----------
    dataMat : array-like
        Samples in rows, features in columns (``np.cov`` is called with
        ``rowvar=0``).
    percentage : float, optional
        Share of the total eigenvalue sum the kept components must reach;
        passed to ``percentage2n``.

    Returns
    -------
    tuple
        ``(lowDDataMat, reconMat)``: the centered data projected onto the
        selected eigenvectors, and that projection mapped back to feature
        space with the column means added again. Both are ``np.matrix``
        objects.
    """
    newData, meanVal = zeroMean(dataMat)
    # np.cov returns a 0-d array for a single feature; force a 2-D matrix so
    # the eigen-decomposition always sees a square input.
    covMat = np.atleast_2d(np.cov(newData, rowvar=0))
    # The covariance matrix is symmetric, so eigh is the appropriate solver:
    # it yields real eigenvalues and orthonormal eigenvectors, whereas the
    # general-purpose eig may return complex values.
    eigVals, eigVects = np.linalg.eigh(covMat)
    n = percentage2n(eigVals, percentage)
    # Indices of the n largest eigenvalues, largest first.
    n_eigValIndice = np.argsort(eigVals)[::-1][:n]
    # np.asmatrix replaces np.mat (removed in NumPy 2.0) and keeps ``*``
    # meaning matrix multiplication, so the returned types stay np.matrix.
    n_eigVect = np.asmatrix(eigVects[:, n_eigValIndice])
    lowDDataMat = newData * n_eigVect
    reconMat = (lowDDataMat * n_eigVect.T) + meanVal
    return lowDDataMat, reconMat
def _scatter_by_class(points, y, label_names, xlabel, ylabel):
    """Draw one scatter plot of the first two columns of ``points`` per class."""
    plt.figure(figsize=(8, 6))
    for label, c in zip(label_names, ['red', 'green', 'black']):
        mask = y == label
        plt.scatter(points[mask, 0], points[mask, 1],
                    c=c, label=label, alpha=0.6, s=20)
    leg = plt.legend(loc='best')
    leg.get_frame().set_alpha(0.6)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def main(csv_path='C:/Users/Administrator/Desktop/Irisdata.csv'):
    """Run the Iris PCA demo: histograms, standardized scatter, PCA scatter.

    Parameters
    ----------
    csv_path : str, optional
        Location of the Iris CSV file. Defaults to the path originally
        hard-coded in this script. The file is expected to have five
        columns, which are renamed to the four feature names plus
        ``classes``.
    """
    data = pd.read_csv(csv_path)
    # NOTE(review): read_csv treats the first row as a header by default; if
    # the file has no header row, the first sample is consumed here - confirm.
    data.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'classes']
    feature_names = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']
    X = data[feature_names].values
    y = data['classes'].values
    label_names = data['classes'].unique()

    # One histogram panel per feature, with the classes overlaid.
    for index, _ in enumerate(feature_names):
        plt.subplot(2, 2, index + 1)
        for label in label_names:
            plt.hist(X[y == label, index], bins=10, alpha=0.5, label=label)
        plt.legend(loc='best')
    plt.show()

    std_feature = nomalizeData(X)
    # pca() centers its input itself, so no separate zeroMean call is needed.
    lowDDataMat, reconMat = pca(std_feature)
    print(lowDDataMat)

    # Standardized data in the space of the first two original features.
    _scatter_by_class(std_feature, y, label_names,
                      feature_names[0], feature_names[1])

    # Projected data: the axes are principal components, not sepal
    # measurements, so they are labelled accordingly. np.asarray turns the
    # np.matrix result into a plain array for 1-D column indexing.
    _scatter_by_class(np.asarray(lowDDataMat), y, label_names,
                      'principal component 1', 'principal component 2')
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# Results: