R语言手工实现主成分分析 PCA | 奇异值分解(svd) 与PCA | PCA的疑问和解答

本文链接：https://blog.youkuaiyun.com/wangjunliang/article/details/142871996

几个问题：

pca可以用相关系数矩阵做吗？效果比协方差矩阵比怎么样？
pca做完后变量和样本的新坐标怎么旋转获得？
pca做不做scale和center对结果有影响吗？
pca用因子分解和奇异值分解有啥区别？后者怎么获得变量和样本的新坐标？

1. 用R全手工实现 PCA(对比 prcomp() )

不借助包，按照《机器学习实战》P246的伪代码进行操作.

1减去列平均数
2计算协方差矩阵
3计算协方差矩阵的特征值和特征向量
4将特征值从大到小排列
5保留最上面的N个特征值
6将数据转换到上述N个特征向量构建的新空间中。

例1: 针对iris数据集

head(iris)
df1=iris[,1:4]
#1) 减去平均值
df1=sweep(x=df1, 
          MARGIN=2, 
          STATS=apply(df1, 2, mean),
          FUN="-")
head(df1)
#2) 计算协方差矩阵
cor.df1=cov(df1)
#3) 计算协方差矩阵的特征值和特征向量
eigen.df1=eigen(cor.df1)
#4) 特征值默认降序
eigen.df1
#5) 保留最前面的几个特征值
#6) 原center后的坐标 * 旋转矩阵
coord.df1=as.matrix(df1) %*% eigen.df1$vectors
dim(coord.df1)
head(coord.df1)
# plot
coord.df1_=as.data.frame(coord.df1)
colnames(coord.df1_)=paste0("PC_", 1:4)
coord.df1_$type=iris$Species
library(ggplot2)
ggplot(coord.df1_, aes(PC_1, PC_2, color=type))+
  geom_point()

# prcomp() 做PCA
pca.iris=prcomp(iris[,1:4])
pca.iris

# 对比旋转矩阵
> pca.iris$rotation #prcomp()的计算结果
                     PC1         PC2         PC3        PC4
Sepal.Length  0.36138659 -0.65658877  0.58202985  0.3154872
Sepal.Width  -0.08452251 -0.73016143 -0.59791083 -0.3197231
Petal.Length  0.85667061  0.17337266 -0.07623608 -0.4798390
Petal.Width   0.35828920  0.07548102 -0.54583143  0.7536574

> eigen.df1$vectors #协方差矩阵的特征向量构成的矩阵
            [,1]        [,2]        [,3]       [,4]
[1,]  0.36138659 -0.65658877 -0.58202985  0.3154872
[2,] -0.08452251 -0.73016143  0.59791083 -0.3197231
[3,]  0.85667061  0.17337266  0.07623608 -0.4798390
[4,]  0.35828920  0.07548102  0.54583143  0.7536574


# 对比方差
# 主成分的标准差，文档说是 协方差矩阵的特征值的平方根，虽然是通过SVD分解实现的
# square roots of the eigenvalues of the covariance/correlation matrix
# though the calculation is actually done with the singular values of the data matrix
> pca.iris$sdev
[1] 2.0562689 0.4926162 0.2796596 0.1543862

> eigen.df1$values #特征根
[1] 4.22824171 0.24267075 0.07820950 0.02383509

#开方后确实等于 pca.iris$sdev
> sqrt(eigen.df1$values)
[1] 2.0562689 0.4926162 0.2796596 0.1543862

(2)包装成自定义PCA函数

#' 按列求CPA，结果模仿 prcomp
#'
#' @param df0 input a df, PCA on column
#' @param ndims the total eigenvalue to use
#'
#' @return
#' @export
#'
#' @examples
#' 
my_PCA=function(df0, ndims=NULL, scale=F){
  # check params
  if(!is.null(ndims)){
    ndims=min(ndims, ncol(df0));
  }else{
    ndims=ncol(df0)
  }
  
  #1) 减去平均值
  center=apply(df0, 2, mean)
  df1=sweep(x=df0, MARGIN=2, STATS=center, FUN="-")
  # whether scale?
  if(scale){
    df1=scale(df1)
  }
  #2) 计算协方差矩阵
  cor.df1=cov(df1)
  #3) 计算协方差矩阵的特征值和特征向量
  eigen.df1=eigen(cor.df1)
  #4) 特征值默认降序
  #eigen.df1
  #5) 保留最前面的几个特征值
  if(!is.null(ndims)){
    message(">> Use the first ", ndims, " PCs.");
    eigen.df1$values=eigen.df1$values[1:ndims]
    eigen.df1$vectors=eigen.df1$vectors[,1:ndims]
  }
  #6) 原center后的坐标 * 旋转矩阵
  coord.df1=as.matrix(df1) %*% eigen.df1$vectors
  
  # 对比旋转矩阵
  rotation=eigen.df1$vectors #协方差矩阵的特征向量构成的矩阵
  #
  colnames(rotation)=paste0("PC_", 1:ndims)
  rownames(rotation)= colnames(df1)
  #
  colnames(coord.df1)=colnames(rotation)
  rownames(coord.df1)=rownames(df0)

  #eigen.df1$values #特征根
  sdev=sqrt(eigen.df1$values) #开方后确实等于 pca.iris$sdev
  # return
  obj=list(
    sdev=sdev,
    rotation=rotation,
    center=center,
    scale=scale,
    x=coord.df1
  )
  class(obj)="my_PCA"
  return(obj)
}

# test
pca2.iris=my_PCA(iris[,1:4])
# prcomp() 做PCA
pca.iris=prcomp(iris[,1:4])

# compare result
str(pca.iris)
str(pca2.iris)


# test patial eigenvalue
my_PCA(iris[,1:4], ndims = 2)
prcomp(iris[,1:4], rank. = 2)

# plot 
coord.df1_=as.data.frame(pca2.iris$x)
colnames(coord.df1_)=paste0("PC_", 1:4)
coord.df1_$type=iris$Species
library(ggplot2)
ggplot(coord.df1_, aes(PC_1, PC_2, color=type))+
  geom_point()

我计算的样品的新坐标x：原center后的数据矩阵乘旋转矩阵。
如果比10e-15小，可记作0，所以和上文的x没差异。

2. svd分解( A = u.d.v^T )与PCA

(1) svd 分解后，旋转矩阵是v；在新空间位置就是u，也可以 centered_data * v

iris2=sweep(iris[,1:4], MARGIN = 2, STATS = apply(iris[,1:4], 2, mean), FUN = "-") |> as.matrix()
head(iris2)

par(mfrow=c(1,2))
plot(svd(iris[, 1:4])$u, col=iris$Species, pch=19, main="u") #俩坐标类似
plot(iris2 %*% svd(iris[, 1:4])$v, col=iris$Species, pch=19, main="centeredA * v") #俩坐标类似

在这里插入图片描述
可见，虽然坐标尺度不同，但是点的位置一模一样。

(2) PCA( prcomp() )和svd的对应关系

结论：PCA相当于对于经过列中心化(centering on columns) 的表达谱进行 svd()

prcomp()返回的rotation相当于svd()返回的v；
prcomp()返回的x相当于svd()返回的u

dat=iris[,1:4]
rownames(dat)=paste0("sample", 1:nrow(dat))
colnames(dat)=paste0("gene", 1:ncol(dat))
head(dat)
dim(dat) #150样本行，4个基因列

## PCA ====
pca_dat = prcomp(dat) #列为变量
str(pca_dat)
# sdev 为主成分的标准差
# rotation 为变量的矩阵，即其列包括特征向量
# x 为主成分

## svd ====
# 对列中心化后做奇异值分解
dat_centered=scale(dat, center = T, scale = F) #因为prcomp 默认只做center，不做scale
svd_dat=svd( dat_centered )
str(svd_dat) #A=u.d.v^T
# d 为奇异值（singular value）的向量
# u 相当于主成分
# v 相当于变量的矩阵，即其列包括特征向量

# 也就是说，
# prcomp()返回的rotation相当于svd()返回的v；
# prcomp()返回的x相当于svd()返回的u。

## 比较 ====
pca_dat$rotation
svd_dat$v #同上

pca_dat$x |> head()
svd_dat$u |> head()

# 数值虽然不同，但是样本的坐标是一致的
par(mfrow=c(2,2))
plot(pca_dat$x, main="pca_dat$x", col=iris$Species, pch=19) #1行1列
plot(dat_centered %*% pca_dat$rotation, main="centered * pca_dat$rotation", 
     col=iris$Species, pch=19) #1行2列
#
plot(svd_dat$u, main="svd_dat$u", col=iris$Species, pch=19) #2行1列
plot( dat_centered %*% svd_dat$v, main="centered * svd_dat$v", 
      col=iris$Species, pch=19) #2行2列

在这里插入图片描述
结论：以后做PCA可以简化为

对列为变量的矩阵做列中心化：减去每列的均值
做svd分解：A=u.d.v^T
取u矩阵作为原始矩阵在新空间的投影，取v作为旋转矩阵

每个PC对变异的解释百分比: PCA vs svd

> pca_dat$sdev^2 / sum(pca_dat$sdev^2)
[1] 0.924618723 0.053066483 0.017102610 0.005212184

> svd_dat$d^2 / sum(svd_dat$d^2)
[1] 0.924618723 0.053066483 0.017102610 0.005212184

可见，PCA和svd的百分比结果是完全一样的。

pca_dat$sdev / svd_dat$d /0.08192319 #说明完全成比例
pca_dat$sdev^2 / svd_dat$d^2 /0.006711409 #说明完全成比例
协方差矩阵的平方根，和奇异值成比例。

(3) PCA: 对“协方差矩阵”做奇异值分解，vs 特征值分解

dat=iris[,1:4]
rownames(dat)=paste0("sample", 1:nrow(dat))
colnames(dat)=paste0("gene", 1:ncol(dat))
head(dat)
dim(dat) #150样本行，4个基因列

dat_centered=scale(dat, center = T, scale = F) #因为prcomp 默认只做center，不做scale

## (1) 协方差矩阵 ====
cov_dat = cov(dat_centered)
dim(cov_dat)
cov_dat
# 特征值分解
de_eigen = eigen(cov_dat)
de_eigen
# 奇异值分解
de_svd=svd(cov_dat)
de_svd
# 因为是对称矩阵，所以协方差矩阵的 u和v完全一致
all( abs(de_svd$u - de_svd$v)<1e-10 ) #T


## (2) 特征值和奇异值一致 ====
de_eigen$values
#[1] 4.22824171 0.24267075 0.07820950 0.02383509
de_svd$d
#[1] 4.22824171 0.24267075 0.07820950 0.02383509

## (3) 旋转矩阵一致 ====
de_eigen$vectors
de_svd$v
de_eigen$vectors-de_svd$v # 除第一列符号不同，其他完全一致(abs<1e-10认为是0)

## (4) 新空间的投影 ====
dat_centered %*% de_eigen$vectors |> head()
dat_centered %*% de_svd$v |> head()
# 绘图查看
par(mfrow=c(1,2))
df1_e=dat_centered %*% de_eigen$vectors
plot(df1_e[,1], df1_e[,2], col=iris$Species, main="COV | eigen project", pch=19)

df1_s=dat_centered %*% de_svd$v
plot(-df1_s[,1], df1_s[,2], col=iris$Species, main="COV | svd project", pch=19) #x乘以-1

在这里插入图片描述

(4) PCA: 对“相关性矩阵”做奇异值分解，vs 特征值分解

代码除了求相关系数用函数cor()替换了(3)的cov()，其余没变化。

dat=iris[,1:4]
rownames(dat)=paste0("sample", 1:nrow(dat))
colnames(dat)=paste0("gene", 1:ncol(dat))
head(dat)
dim(dat) #150样本行，4个基因列

dat_centered=scale(dat, center = T, scale = F) #因为prcomp 默认只做center，不做scale

## (1) 协方差矩阵 ====
cor_dat = cor(dat_centered)
dim(cor_dat)
cor_dat
# 特征值分解
de_eigen = eigen(cor_dat)
de_eigen
# 奇异值分解
de_svd=svd(cor_dat)
de_svd
# 因为是对称矩阵，所以协方差矩阵的 u和v完全一致
all( abs(de_svd$u - de_svd$v)<1e-10 ) #T


## (2) 特征值和奇异值一致 ====
de_eigen$values
#[1] 2.91849782 0.91403047 0.14675688 0.02071484
de_svd$d
#[1] 2.91849782 0.91403047 0.14675688 0.02071484

## (3) 旋转矩阵一致 ====
de_eigen$vectors
de_svd$v
de_eigen$vectors-de_svd$v # 除第一列符号不同，其他完全一致(abs<1e-10认为是0)
de_eigen$vectors + de_svd$v


## (4) 新空间的投影 ====
dat_centered %*% de_eigen$vectors |> head()
dat_centered %*% de_svd$v |> head()
# 绘图查看
par(mfrow=c(1,2))
df1_e=dat_centered %*% de_eigen$vectors
plot(df1_e[,1], df1_e[,2], col=iris$Species, main="COR | eigen project", pch=19)

df1_s=dat_centered %*% de_svd$v
plot(-df1_s[,1], df1_s[,2], col=iris$Species, main="COR | svd project", pch=19) #x乘以-1

在这里插入图片描述

3. PCA的疑问与解答

已知：

PCA prcomp() 默认输入是列为变量，行为样品的矩阵。
prcomp() 默认对列做中心化（减去列均值），默认不做列scale；

未知：

pca可以用相关系数矩阵做吗？效果比协方差矩阵比怎么样？
可以。比如 SYNCSA::pca()；见上文2(3), 2(4)
pca做完后变量和样本的新坐标怎么旋转获得？
矩阵分解前的矩阵(一般做过center) %*% 旋转矩阵(特征向量或者svd$v矩阵)
pca做不做scale和center对结果有影响吗？
必须做 center，就是减去列的平均值
scale可选，是否忽略每列的量纲的影响。基因认为是微小等效的，可以做scale。
pca用因子分解和奇异值分解有啥区别？后者怎么获得变量和样本的新坐标？
旋转矩阵：特征向量，或奇异值分解的右矩阵v。
映射坐标：centered * 旋转矩阵