PCA算法

原创已于 2022-06-05 18:44:21 修改 · 689 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#机器学习

于 2022-05-07 18:57:29 首次发布

machine learning 专栏收录该内容

12 篇文章

订阅专栏

本文介绍了使用Python和C++实现PCA（主成分分析）的原理与代码实例，包括数据预处理、协方差矩阵计算、特征值和特征向量提取，以及降维方法。通过PCA自动或指定维度减少数据复杂性，适合数据可视化和特征提取。

部署运行你感兴趣的模型镜像

原理参考：
PCA 原理及 Python 实现

代码传送门：https://github.com/taifyang/machine-learning

python实现：

import numpy as np
import pandas as pd

class PCA(object):
    """定义PCA类"""
    def __init__(self, x, n_components=None):
        """x的数据结构应为ndarray"""
        self.x = x
        self.dimension = x.shape[1]       
        if n_components and n_components >= self.dimension:
            print("n_components error")        
        self.n_components = n_components      
    
    def get_feature(self):
        """求协方差矩阵C的特征值和特征向量"""
        x_T = np.transpose(self.x)                           #矩阵转秩
        x_cov = np.cov(x_T)                                  #协方差矩阵
        a, b = np.linalg.eig(x_cov)                          #a为特征值，b为特征向量
        m = a.shape[0]
        c = np.hstack((a.reshape((m,1)), b))
        #print(a,b,c)
        c_df = pd.DataFrame(c)
        c_df_sort = c_df.sort_values(0, ascending=False)    #按照特征值大小降序排列特征向量
        return c_df_sort
        
    def explained_variance_(self):
        c_df_sort = self.get_feature()
        return c_df_sort.values[:, 0]       
              
    def reduce_dimension(self):
        """指定维度降维和根据方差贡献率自动降维"""
        c_df_sort = self.get_feature()
        variance = self.explained_variance_()        
        #print(c_df_sort,variance)
        if self.n_components:                                #指定降维维度
            p = c_df_sort.values[0:self.n_components, 1:]
            y = np.dot(p, np.transpose(self.x))              #矩阵叉乘
            return np.transpose(y)      
        variance_sum = sum(variance)                         #利用方差贡献度自动选择降维维度
        variance_radio = variance / variance_sum       
        variance_contribution = 0
        for R in range(self.dimension):
            variance_contribution += variance_radio[R]       #前R个方差贡献度之和
            if variance_contribution >= 0.99:
                break            
        p = c_df_sort.values[0:R+1, 1:]                      #取前R个特征向量
        y = np.dot(p, np.transpose(self.x))                  #矩阵叉乘
        return np.transpose(y)
  

if __name__ == '__main__':
    x = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    pca = PCA(x)
    print(x)
    print(pca.reduce_dimension())
    print(pca.explained_variance_())

python调包：

import numpy as np
from sklearn.decomposition import PCA

x = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components='mle')
pca.fit(x)
print(pca.explained_variance_)

C++实现：

#include <iostream>
#include <vector>
#include <Eigen/Dense>

//计算协方差矩阵
Eigen::MatrixXf calc_cov(Eigen::MatrixXf mat)
{
	Eigen::MatrixXf meanVec = mat.colwise().mean();
	Eigen::RowVectorXf meanVecRow(Eigen::RowVectorXf::Map(meanVec.data(), mat.cols()));

	Eigen::MatrixXf zeroMeanMat = mat;
	zeroMeanMat.rowwise() -= meanVecRow;
	Eigen::MatrixXf covMat;
	if (mat.rows() == 1)
		covMat = (zeroMeanMat.adjoint()*zeroMeanMat) / float(mat.rows());
	else
		covMat = (zeroMeanMat.adjoint()*zeroMeanMat) / float(mat.rows() - 1);

	return covMat;
}

//返回从小到大排序的索引
std::vector<int> sort_index(std::vector<float> vec)
{
	std::vector<int>  vec_sorted(vec.size());
	for (size_t i = 0; i != vec_sorted.size(); ++i)
		vec_sorted[i] = i;
	std::sort(vec_sorted.begin(), vec_sorted.end(), [&vec](size_t i, size_t j) {return vec[i] >  vec[j]; });
	return vec_sorted;
}

class PCA
{
public:
	PCA(std::vector<std::vector<float>> x, int n_components = -1)
	{
		m_x.resize(x.size(), x[0].size());
		for (size_t i = 0; i < m_x.rows(); i++)
		{
			for (size_t j = 0; j < m_x.cols(); j++)
			{
				m_x(i, j) = x[i][j];
			}
		}

		m_dimension = x[0].size();

		if (n_components >= m_dimension)
			std::cout << "n_components error" << std::endl;

		m_components = n_components;
	}

	Eigen::MatrixXf get_feature()
	{
		Eigen::MatrixXf x_cov = calc_cov(m_x);
		Eigen::EigenSolver<Eigen::MatrixXf> eigen_solver(x_cov);
		Eigen::MatrixXcf eigen_value = eigen_solver.eigenvalues();
		Eigen::MatrixXcf eigen_vector = eigen_solver.eigenvectors();
		//std::cout << eigen_value << std::endl;
	    //std::cout << eigen_vector << std::endl;

		Eigen::MatrixXf c(eigen_value.rows(), eigen_vector.cols() + 1);
		std::vector<float> vec(eigen_value.rows());
		for (size_t i = 0; i < c.rows(); i++)
		{
			c(i, 0) = eigen_value(i, 0).real();
			vec[i] = c(i, 0);
			for (size_t j = 1; j < c.cols(); j++)
			{
				c(i, j) = eigen_vector(i, j - 1).real();
			}
		}
		//std::cout << c << std::endl;

		std::vector<int> index = sort_index(vec);
		Eigen::MatrixXf c_df_sort(c.rows(), c.cols());
		for (size_t i = 0; i < c.rows(); i++)
		{
			c_df_sort(i, 0) = c(index[i], 0);
			for (size_t j = 1; j < c.cols(); j++)
			{
				c_df_sort(i, j) = c(index[i], j);
			}
		}
		//std::cout << c_df_sort << std::endl;
		return c_df_sort;
	}

	Eigen::VectorXf explained_variance_()
	{
		Eigen::MatrixXf c_df_sort = get_feature();
		Eigen::VectorXf variance(c_df_sort.rows());
		for (size_t i = 0; i < variance.size(); i++)
		{
			variance[i] = c_df_sort(i, 0);
		}
		return variance;
	}

	Eigen::MatrixXf reduce_dimension()
	{
		Eigen::MatrixXf c_df_sort = get_feature();
		Eigen::VectorXf variance = explained_variance_();

		if (m_components != -1)
		{
			Eigen::MatrixXf p = c_df_sort.topRightCorner(m_components, c_df_sort.cols() - 1);
			Eigen::MatrixXf y = p*m_x.transpose();
			return y.transpose();		
		}

		float variance_sum = variance.sum();
		Eigen::VectorXf variance_radio = variance / variance_sum;
		float  variance_contribution = 0;

		int R = 0;
		for (; R < m_dimension; R++)
		{
			variance_contribution += variance_radio[R];
			if (variance_contribution >= 0.99)
				break;
		}

		Eigen::MatrixXf p = c_df_sort.topRightCorner(R + 1, c_df_sort.cols() - 1);
		Eigen::MatrixXf y = p*m_x.transpose();
		return y.transpose();
	}

private:
	Eigen::MatrixXf m_x;
	int m_dimension;
	int m_components;
};

int main(int argc, char* argv[])
{
	std::vector<std::vector<float>> x = { {-1, -1},{-2, -1},{-3, -2},{1, 1},{2, 1},{3, 2} };
	PCA pca = PCA(x);
	//pca.get_feature();
	std::cout << pca.reduce_dimension().transpose() << std::endl;
	std::cout << pca.explained_variance_().transpose() << std::endl;

	system("pause");
	return EXIT_SUCCESS;
}