原理参考:
PCA 原理及 Python 实现
代码传送门:https://github.com/taifyang/machine-learning
python实现:
import numpy as np
import pandas as pd
class PCA(object):
"""定义PCA类"""
def __init__(self, x, n_components=None):
"""x的数据结构应为ndarray"""
self.x = x
self.dimension = x.shape[1]
if n_components and n_components >= self.dimension:
print("n_components error")
self.n_components = n_components
def get_feature(self):
"""求协方差矩阵C的特征值和特征向量"""
x_T = np.transpose(self.x) #矩阵转秩
x_cov = np.cov(x_T) #协方差矩阵
a, b = np.linalg.eig(x_cov) #a为特征值,b为特征向量
m = a.shape[0]
c = np.hstack((a.reshape((m,1)), b))
#print(a,b,c)
c_df = pd.DataFrame(c)
c_df_sort = c_df.sort_values(0, ascending=False) #按照特征值大小降序排列特征向量
return c_df_sort
def explained_variance_(self):
c_df_sort = self.get_feature()
return c_df_sort.values[:, 0]
def reduce_dimension(self):
"""指定维度降维和根据方差贡献率自动降维"""
c_df_sort = self.get_feature()
variance = self.explained_variance_()
#print(c_df_sort,variance)
if self.n_components: #指定降维维度
p = c_df_sort.values[0:self.n_components, 1:]
y = np.dot(p, np.transpose(self.x)) #矩阵叉乘
return np.transpose(y)
variance_sum = sum(variance) #利用方差贡献度自动选择降维维度
variance_radio = variance / variance_sum
variance_contribution = 0
for R in range(self.dimension):
variance_contribution += variance_radio[R] #前R个方差贡献度之和
if variance_contribution >= 0.99:
break
p = c_df_sort.values[0:R+1, 1:] #取前R个特征向量
y = np.dot(p, np.transpose(self.x)) #矩阵叉乘
return np.transpose(y)
if __name__ == '__main__':
x = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(x)
print(x)
print(pca.reduce_dimension())
print(pca.explained_variance_())
python调包:
import numpy as np
from sklearn.decomposition import PCA
x = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components='mle')
pca.fit(x)
print(pca.explained_variance_)
C++实现:
#include <iostream>
#include <vector>
#include <Eigen/Dense>
//计算协方差矩阵
Eigen::MatrixXf calc_cov(Eigen::MatrixXf mat)
{
Eigen::MatrixXf meanVec = mat.colwise().mean();
Eigen::RowVectorXf meanVecRow(Eigen::RowVectorXf::Map(meanVec.data(), mat.cols()));
Eigen::MatrixXf zeroMeanMat = mat;
zeroMeanMat.rowwise() -= meanVecRow;
Eigen::MatrixXf covMat;
if (mat.rows() == 1)
covMat = (zeroMeanMat.adjoint()*zeroMeanMat) / float(mat.rows());
else
covMat = (zeroMeanMat.adjoint()*zeroMeanMat) / float(mat.rows() - 1);
return covMat;
}
//返回从小到大排序的索引
std::vector<int> sort_index(std::vector<float> vec)
{
std::vector<int> vec_sorted(vec.size());
for (size_t i = 0; i != vec_sorted.size(); ++i)
vec_sorted[i] = i;
std::sort(vec_sorted.begin(), vec_sorted.end(), [&vec](size_t i, size_t j) {return vec[i] > vec[j]; });
return vec_sorted;
}
class PCA
{
public:
PCA(std::vector<std::vector<float>> x, int n_components = -1)
{
m_x.resize(x.size(), x[0].size());
for (size_t i = 0; i < m_x.rows(); i++)
{
for (size_t j = 0; j < m_x.cols(); j++)
{
m_x(i, j) = x[i][j];
}
}
m_dimension = x[0].size();
if (n_components >= m_dimension)
std::cout << "n_components error" << std::endl;
m_components = n_components;
}
Eigen::MatrixXf get_feature()
{
Eigen::MatrixXf x_cov = calc_cov(m_x);
Eigen::EigenSolver<Eigen::MatrixXf> eigen_solver(x_cov);
Eigen::MatrixXcf eigen_value = eigen_solver.eigenvalues();
Eigen::MatrixXcf eigen_vector = eigen_solver.eigenvectors();
//std::cout << eigen_value << std::endl;
//std::cout << eigen_vector << std::endl;
Eigen::MatrixXf c(eigen_value.rows(), eigen_vector.cols() + 1);
std::vector<float> vec(eigen_value.rows());
for (size_t i = 0; i < c.rows(); i++)
{
c(i, 0) = eigen_value(i, 0).real();
vec[i] = c(i, 0);
for (size_t j = 1; j < c.cols(); j++)
{
c(i, j) = eigen_vector(i, j - 1).real();
}
}
//std::cout << c << std::endl;
std::vector<int> index = sort_index(vec);
Eigen::MatrixXf c_df_sort(c.rows(), c.cols());
for (size_t i = 0; i < c.rows(); i++)
{
c_df_sort(i, 0) = c(index[i], 0);
for (size_t j = 1; j < c.cols(); j++)
{
c_df_sort(i, j) = c(index[i], j);
}
}
//std::cout << c_df_sort << std::endl;
return c_df_sort;
}
Eigen::VectorXf explained_variance_()
{
Eigen::MatrixXf c_df_sort = get_feature();
Eigen::VectorXf variance(c_df_sort.rows());
for (size_t i = 0; i < variance.size(); i++)
{
variance[i] = c_df_sort(i, 0);
}
return variance;
}
Eigen::MatrixXf reduce_dimension()
{
Eigen::MatrixXf c_df_sort = get_feature();
Eigen::VectorXf variance = explained_variance_();
if (m_components != -1)
{
Eigen::MatrixXf p = c_df_sort.topRightCorner(m_components, c_df_sort.cols() - 1);
Eigen::MatrixXf y = p*m_x.transpose();
return y.transpose();
}
float variance_sum = variance.sum();
Eigen::VectorXf variance_radio = variance / variance_sum;
float variance_contribution = 0;
int R = 0;
for (; R < m_dimension; R++)
{
variance_contribution += variance_radio[R];
if (variance_contribution >= 0.99)
break;
}
Eigen::MatrixXf p = c_df_sort.topRightCorner(R + 1, c_df_sort.cols() - 1);
Eigen::MatrixXf y = p*m_x.transpose();
return y.transpose();
}
private:
Eigen::MatrixXf m_x;
int m_dimension;
int m_components;
};
int main(int argc, char* argv[])
{
std::vector<std::vector<float>> x = { {-1, -1},{-2, -1},{-3, -2},{1, 1},{2, 1},{3, 2} };
PCA pca = PCA(x);
//pca.get_feature();
std::cout << pca.reduce_dimension().transpose() << std::endl;
std::cout << pca.explained_variance_().transpose() << std::endl;
system("pause");
return EXIT_SUCCESS;
}
本文介绍了使用Python和C++实现PCA(主成分分析)的原理与代码实例,包括数据预处理、协方差矩阵计算、特征值和特征向量提取,以及降维方法。通过PCA自动或指定维度减少数据复杂性,适合数据可视化和特征提取。
6924

被折叠的 条评论
为什么被折叠?



