Take a look at a kernelized version of PCA, or kernel PCA, which relates to the concepts of kernel SVM.
Using kernel PCA, we will learn how to transform data that is not linearly separable onto a new, lower-dimensional subspace that is suitable for linear classifiers.
1. Implementing a kernel principal component analysis.
import numpy as np
from scipy.linalg import eigh
from scipy.spatial.distance import pdist, squareform


def rbf_kernel_pca(X, gamma, n_components):
    """Project a dataset onto its top RBF kernel principal components.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input samples, one row per sample.
    gamma : float
        Coefficient of the RBF kernel ``exp(-gamma * ||x_i - x_j||^2)``.
    n_components : int
        Number of principal components to return.

    Returns
    -------
    X_pc : numpy.ndarray, shape (n_samples, n_components)
        Eigenvectors of the centered kernel matrix, ordered by decreasing
        eigenvalue. Each column is one component; each row is the
        projection of the corresponding input sample.
    """
    # Pairwise squared Euclidean distances in condensed form, expanded
    # into a symmetric (n_samples, n_samples) matrix.
    sq_dists = pdist(X, 'sqeuclidean')
    mat_sq_dists = squareform(sq_dists)

    # Symmetric RBF kernel matrix. np.exp replaces the deprecated
    # scipy.exp alias, which is unavailable in recent SciPy releases.
    K = np.exp(-gamma * mat_sq_dists)

    # Center the kernel matrix in feature space.
    N = K.shape[0]
    one_n = np.ones((N, N)) / N
    K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)

    # scipy.linalg.eigh returns eigenpairs sorted by ascending eigenvalue,
    # so the leading components are the last columns of eigvecs.
    _, eigvecs = eigh(K)

    # Collect the top k eigenvectors (projected samples). column_stack
    # needs a real sequence; passing a generator is rejected by NumPy.
    X_pc = np.column_stack(
        [eigvecs[:, -i] for i in range(1, n_components + 1)]
    )
    return X_pc
2. Separating half-moon shapes
Let's apply rbf_kernel_pca to a nonlinear example dataset.
2.1 Creating a two-dimensional dataset
from sklearn.datasets import make_moons

# Build the two interleaving half-moon classes (100 samples in total,
# fixed seed so the figure is reproducible).
X, y = make_moons(n_samples=100, random_state=123)

# Draw each class with its own colour and marker.
for label, color, marker in ((0, 'red', '^'), (1, 'blue', 'o')):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1],
                color=color, marker=marker, alpha=0.5)

plt.show()
2.2 See what the dataset looks like if we project it onto the principal components via standard PCA
from sklearn.decomposition import PCA

# Project the dataset onto its first two principal components with
# standard (linear) PCA.
scikit_pca = PCA(n_components=2)
X_spca = scikit_pca.fit_transform(X)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3))

# Left panel: PC1 vs PC2. Right panel: PC1 only, with the two classes
# nudged slightly above/below zero so overlapping points stay visible.
for label, color, marker, offset in ((0, 'red', '^', 0.02),
                                     (1, 'blue', 'o', -0.02)):
    pc1 = X_spca[y == label, 0]
    pc2 = X_spca[y == label, 1]
    ax[0].scatter(pc1, pc2, color=color, marker=marker, alpha=0.5)
    # Size the constant y-values from the selected samples instead of a
    # hard-coded 50, so the plot still works for other sample counts.
    ax[1].scatter(pc1, np.zeros(pc1.shape) + offset,
                  color=color, marker=marker, alpha=0.5)

ax[0].set_xlabel('PC1')
ax[0].set_ylabel('PC2')
ax[1].set_ylim([-1, 1])
ax[1].set_yticks([])
ax[1].set_xlabel('PC1')

plt.show()
from matplotlib.ticker import FormatStrFormatter

# Project the dataset onto its first two RBF kernel principal components.
X_kpca = rbf_kernel_pca(X, gamma=15, n_components=2)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3))

# Left panel: PC1 vs PC2. Right panel: PC1 only, with the two classes
# nudged slightly above/below zero so overlapping points stay visible.
for label, color, marker, offset in ((0, 'red', '^', 0.02),
                                     (1, 'blue', 'o', -0.02)):
    pc1 = X_kpca[y == label, 0]
    pc2 = X_kpca[y == label, 1]
    ax[0].scatter(pc1, pc2, color=color, marker=marker, alpha=0.5)
    # Size the constant y-values from the selected samples instead of a
    # hard-coded 50, so the plot still works for other sample counts.
    ax[1].scatter(pc1, np.zeros(pc1.shape) + offset,
                  color=color, marker=marker, alpha=0.5)

ax[0].set_xlabel('PC1')
ax[0].set_ylabel('PC2')
ax[1].set_ylim([-1, 1])
ax[1].set_yticks([])
ax[1].set_xlabel('PC1')

# Show the x-axis tick labels with one decimal place on both panels.
for axis in ax:
    axis.xaxis.set_major_formatter(FormatStrFormatter('%0.1f'))

plt.show()
Reference: 《Python Machine Learning》