1 2D PCA
1.1 load data
import numpy as np
import pandas as pd
import scipy.io as sio
# Load the 2-D example dataset from the MATLAB .mat file.
mat = sio.loadmat(r'D:\python_try\5. AndrewNg_ML\data\kmeans and pca\ex7data1.mat')
# print(mat.keys())  # inspect the keys stored in the .mat file
X = mat.get('X')
# X.shape  # inspect the data dimensions
1.2 visualize data
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter-plot the raw 2-D data (no regression line).
sns.set(context='notebook', style='white')
# seaborn >= 0.12 removed positional x/y arguments; pass them as keywords.
sns.lmplot(x='X1', y='X2', data=pd.DataFrame(X, columns=['X1', 'X2']), fit_reg=False)
plt.show()
1.3 Algorithm
1.3.1 normalize the features
数据标准化处理:$X^* = \dfrac{X_j - \overline{X}}{\sigma}$
def nomalizeData(X):
    """Standardize each feature column to zero mean and unit variance."""
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    return (X - mu) / sigma
1.3.2 covariance matrix
计算协方差矩阵:$\Sigma = \dfrac{1}{m} X^T X$
def covarianceMatrix(X):
    """Return the covariance matrix Sigma = (1/m) * X^T X, m = number of samples."""
    m = X.shape[0]
    return (X.T @ X) / m
1.3.3 eigenvector
直接计算特征值和特征向量np.linalg.eig,注意,第i个特征值对应的特征向量是第i列
def eigenValVec(X):
    """Eigen-decompose X; eigenvector i is column i of the returned matrix."""
    return np.linalg.eig(X)
调用查看情况
# Sanity check: eigenvalues/eigenvectors of the normalized data's covariance matrix.
eigenValVec( covarianceMatrix(nomalizeData(X)))
1.3.4 project data to lower dimension
def projectData(X, eigVector, k):
    """Project X onto the first k eigenvectors (the first k columns of eigVector)."""
    topK = eigVector[:, :k]
    return X @ topK
数据降到1维
# Reduce the normalized data to 1 dimension.
Z = projectData(nomalizeData(X), eigenValVec( covarianceMatrix(nomalizeData(X)))[1], 1)
## Plot: original 2-D data next to the 1-D projection.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))
# seaborn >= 0.12 removed positional x/y arguments; pass them as keywords.
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(nomalizeData(X), columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax1)
ax1.set_title('Original dimension')
# rugplot now takes its data via the keyword `x` and expects a 1-D array.
sns.rugplot(x=Z[:, 0], ax=ax2)
ax2.set_xlabel('Z')
ax2.set_title('Z dimension')
plt.show()
1.3.5 recover data to original dimension
def recoverData(Z, eigVector, k):
    """Map k-dimensional projections Z back to the original feature space."""
    topK = eigVector[:, :k]
    return Z @ topK.T
将数据转换到原来的维度
# Recover the 1-D projection back to the original 2-D space.
newX = recoverData(Z, eigenValVec( covarianceMatrix(nomalizeData(X)))[1], 1)
## Plot recovered points against the original normalized data.
# seaborn >= 0.12 removed positional x/y arguments; pass them as keywords.
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(newX, columns=['X1', 'X2']),
            fit_reg=True,
            #scatter=False,
            label='2D projection from Z')
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(nomalizeData(X), columns=['X1', 'X2']),
            fit_reg=False,
            scatter=True,
            label='Original dimension')
plt.legend()
plt.show()
可以看出,通过主成分提取后转换的数据拟合误差小了许多
1.4 use sklearn
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(nomalizeData(X))
# Reduce to 1-D using sklearn's own projection.
Zsk = pca.transform(nomalizeData(X))
# Recover with sklearn's projection, NOT the hand-rolled Z: eigenvector signs
# from np.linalg.eig may differ from sklearn's components, which would flip
# the reconstructed points.
pca.inverse_transform(Zsk)
2 PCA on face data
2.1 load data
import scipy.io as sio
# Load the faces dataset; each row is one flattened square grayscale image
# (presumably 32x32 -> 1024 features — confirm via faceX.shape).
faceMat = sio.loadmat(r'D:\python_try\5. AndrewNg_ML\data\kmeans and pca\ex7faces.mat')
# faceMat.keys()
faceX = faceMat.get('X')
# faceX.shape
2.2 visualize the top100 of these faces image
import numpy as np
import matplotlib.pyplot as plt


def plotNImage(X, n=100):
    """Display the first n rows of X as square images on a sqrt(n) x sqrt(n) grid.

    Each row of X is reshaped to a square (side = sqrt of the feature count)
    and transposed before display, matching the column-major .mat layout.
    """
    imaSize = int(np.sqrt(X.shape[1]))   # side length of one square image
    gridSize = int(np.sqrt(n))           # images per row and per column
    topNImage = X[:n, :]                 # keep only the first n images
    fig, ax = plt.subplots(nrows=gridSize, ncols=gridSize,
                           sharex=True, sharey=True, figsize=(8, 8))
    for idx in range(gridSize * gridSize):
        row, col = divmod(idx, gridSize)
        ax[row][col].imshow(topNImage[idx].reshape(imaSize, imaSize).T)
    plt.xticks(np.array([]))  # hide tick marks
    plt.yticks(np.array([]))
# Display the first 100 faces.
plotNImage(faceX)
2.3 use sklearn deal PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing  # used for standardization

# NOTE: the stray markdown lines that were interleaved here made this cell
# invalid Python; they are kept below as comments.
# Standardize the face data.
faceXNomal = preprocessing.scale(faceX)
# Keep the top 100 principal components.
facePCA = PCA(n_components=100)
# Fit the PCA model and project the data in one step.
projectFace = facePCA.fit_transform(faceXNomal)
# Map the 100-D projection back to the original pixel space.
reduceFace = facePCA.inverse_transform(projectFace)
# Show the first 36 reconstructed faces.
plotNImage(reduceFace, 36)