# -*- coding: utf-8 -*-
"""
Created on Fri Dec 3 10:09:23 2021
@author: 继续革命
"""
import numpy as np
import pandas as pd
# Load the raw CSV and attach iris-style column names.
# NOTE(review): the path says "nba_data", yet the column names and class
# labels used below are clearly the iris dataset — confirm the file contents.
df = pd.read_csv(r'/nba_data本赛季.csv')
print(df.shape)

# Four measurement columns plus the class label.
feature_names = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']
df.columns = feature_names + ['class']
print(df.head(6))
print(df['class'].unique())

# Split into feature matrix X (n x 4) and label vector y.
X = df[feature_names].values
y = df['class'].values
print(X[y == 'setosa', 1])
import matplotlib.pyplot as plt
import math

# Axis labels for the four iris features.
# (Removed the unused `label_dict` — it was never referenced and contained
# its own typo, 'Virgnica'.)
feature_dict = {0: 'sepal leng [cm]',
                1: 'sepal wid [cm]',
                2: 'petal len [cm]',
                3: 'petal wid [cm]'}

# One histogram subplot per feature, overlaying the three classes.
plt.figure(figsize=(8, 6))
for cnt in range(4):
    plt.subplot(2, 2, cnt + 1)
    # BUG FIX: the third label was misspelled 'Ivirginica', so
    # `y == lab` matched no rows and the virginica histogram was
    # silently missing from every subplot.
    for lab in ('setosa', 'versicolor', 'virginica'):
        plt.hist(X[y == lab, cnt], label=lab,
                 bins=10, alpha=0.3)
    plt.xlabel(feature_dict[cnt])
    plt.legend(loc='upper right',
               fancybox=True, fontsize=8)
plt.tight_layout()
plt.show()
from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean / unit variance before PCA.
X_std = StandardScaler().fit_transform(X)
print(X_std.shape)

# Per-feature means of the standardized data (should be ~0).
mean_vec = np.mean(X_std, axis=0)
print('4个特征的均值向量:\n', mean_vec)

# Sample covariance matrix, first computed by hand for illustration
# (X_c^T · X_c / (n-1), a real symmetric matrix)...
centered = X_std - mean_vec
cov_mat = centered.T.dot(centered) / (X_std.shape[0] - 1)
print('协方差矩阵:\n', cov_mat)
# ...then recomputed with NumPy's built-in; this value feeds the eig below.
cov_mat = np.cov(X_std.T)

# Eigen-decomposition of the covariance matrix.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)
# Pair each eigenvalue (magnitude) with its eigenvector column.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
             for i in range(len(eig_vals))]
print(eig_pairs)
print('-------------')

# Sort pairs by eigenvalue, largest first.
eig_pairs.sort(key=lambda x: x[0], reverse=True)
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])

# Explained-variance ratio of each component (percent of total).
tot = sum(eig_vals)
var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
print(var_exp)

# Cumulative explained variance.
cum_var_exp = np.cumsum(var_exp)
# BUG FIX: the original had a bare `cum_var_exp` expression — a no-op in a
# script (it only echoes in a notebook) — so the cumulative percentages
# were never actually displayed.
print(cum_var_exp)
# Bar chart of per-component explained variance with cumulative overlay.
plt.figure(figsize=(6, 4))
plt.bar(range(4), var_exp, alpha=0.5,
        align='center', label='individual explained variance')
plt.step(range(4), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('explained variance ratio')
# FIX: axis-label typo 'pricipal' -> 'principal'.
plt.xlabel('principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Stack the top-2 eigenvectors (already sorted by eigenvalue) into the
# 4x2 projection matrix W.
matrix_w = np.hstack([pair[1].reshape(4, 1) for pair in eig_pairs[:2]])
print('matrix W:\n', matrix_w)

# Project the standardized data onto the first two principal components.
Y = X_std.dot(matrix_w)
print(Y)
# Scatter of the first two ORIGINAL features, colored by class, for visual
# comparison against the PCA projection plotted below.
plt.figure(figsize=(6, 4))
class_colors = {'setosa': 'blue', 'versicolor': 'red', 'virginica': 'green'}
for lab, col in class_colors.items():
    mask = y == lab
    plt.scatter(X[mask, 0], X[mask, 1], label=lab, c=col)
plt.xlabel('sepal_len')
plt.ylabel('sepal_wid')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Scatter of the data projected onto the first two principal components.
plt.figure(figsize=(6, 4))
for lab, col in (('setosa', 'blue'),
                 ('versicolor', 'red'),
                 ('virginica', 'green')):
    sel = y == lab
    plt.scatter(Y[sel, 0], Y[sel, 1], label=lab, c=col)
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.legend(loc='lower center')
plt.show()
# Principal Component Analysis (PCA)
# (blog-scrape artifact, original footer: "最新推荐文章于 2024-06-16 12:33:12 发布"
#  — these trailing lines were bare prose and made the script a syntax error)