Kernel PCA
==========

-This example shows that Kernel PCA is able to find a projection of the data
-that makes data linearly separable.
+This example shows the difference between Principal Component Analysis
+(:class:`~sklearn.decomposition.PCA`) and its kernelized version
+(:class:`~sklearn.decomposition.KernelPCA`).

+First, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find
+a projection of the data which makes it linearly separable, which is not the
+case with :class:`~sklearn.decomposition.PCA`.
+
+Then, we show that inverting this projection is only an approximation with
+:class:`~sklearn.decomposition.KernelPCA`, while it is exact with
+:class:`~sklearn.decomposition.PCA`.
"""

# Authors: Mathieu Blondel
#          Andreas Mueller
+#          Guillaume Lemaitre
# License: BSD 3 clause

-import numpy as np
+# %%
+# Projecting data: `PCA` vs. `KernelPCA`
+# --------------------------------------
+#
+# In this section, we show the advantages of using a kernel when
+# projecting data using a Principal Component Analysis (PCA). We create a
+# dataset made of two nested circles.
+from sklearn.datasets import make_circles
+from sklearn.model_selection import train_test_split
+
+X, y = make_circles(n_samples=1_000, factor=0.3, noise=0.05, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
+
+# %%
+# Let's have a quick first look at the generated dataset.
import matplotlib.pyplot as plt

+_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4))
+
+train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
+train_ax.set_ylabel("Feature #1")
+train_ax.set_xlabel("Feature #0")
+train_ax.set_title("Training data")
+
+test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+test_ax.set_xlabel("Feature #0")
+_ = test_ax.set_title("Testing data")
+
+# %%
+# The samples from each class cannot be linearly separated: there is no
+# straight line that can split the samples of the inner set from the outer
+# set.
+#
+# Now, we will use PCA with and without a kernel to see the effect of using
+# such a kernel. The kernel used here is a radial basis function (RBF)
+# kernel.
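+#
+# As a reminder, an RBF kernel measures the similarity between two samples
+# :math:`x` and :math:`y` as :math:`k(x, y) = \exp(-\gamma \|x - y\|^2)`,
+# so that two samples which are far apart in the original space have a
+# similarity close to zero; the `gamma` parameter passed below controls how
+# fast this similarity decays with the distance.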
from sklearn.decomposition import PCA, KernelPCA
-from sklearn.datasets import make_circles

-np.random.seed(0)
-
-X, y = make_circles(n_samples=400, factor=0.3, noise=0.05)
-
-kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
-X_kpca = kpca.fit_transform(X)
-X_back = kpca.inverse_transform(X_kpca)
-pca = PCA()
-X_pca = pca.fit_transform(X)
-
-# Plot results
-
-plt.figure()
-plt.subplot(2, 2, 1, aspect="equal")
-plt.title("Original space")
-reds = y == 0
-blues = y == 1
-
-plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k")
-plt.xlabel("$x_1$")
-plt.ylabel("$x_2$")
-
-X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
-X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
-# projection on the first principal component (in the phi space)
-Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
-plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower")
-
-plt.subplot(2, 2, 2, aspect="equal")
-plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Projection by PCA")
-plt.xlabel("1st principal component")
-plt.ylabel("2nd component")
-
-plt.subplot(2, 2, 3, aspect="equal")
-plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Projection by KPCA")
-plt.xlabel(r"1st principal component in space induced by $\phi$")
-plt.ylabel("2nd component")
-
-plt.subplot(2, 2, 4, aspect="equal")
-plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Original space after inverse transform")
-plt.xlabel("$x_1$")
-plt.ylabel("$x_2$")
-
-plt.tight_layout()
-plt.show()
+pca = PCA(n_components=2)
+kernel_pca = KernelPCA(
+    n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1
+)
+
+X_test_pca = pca.fit(X_train).transform(X_test)
+X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test)
+
+# %%
+fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(
+    ncols=3, figsize=(14, 4)
+)
+
+orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+orig_data_ax.set_ylabel("Feature #1")
+orig_data_ax.set_xlabel("Feature #0")
+orig_data_ax.set_title("Testing data")
+
+pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test)
+pca_proj_ax.set_ylabel("Principal component #1")
+pca_proj_ax.set_xlabel("Principal component #0")
+pca_proj_ax.set_title("Projection of testing data\n using PCA")
+
+kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test)
+kernel_pca_proj_ax.set_ylabel("Principal component #1")
+kernel_pca_proj_ax.set_xlabel("Principal component #0")
+_ = kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA")
+
+# %%
+# We recall that PCA transforms the data linearly. Intuitively, it means that
+# the coordinate system will be centered, rescaled on each component
+# with respect to its variance and finally rotated.
+# The data obtained from this transformation is isotropic and can now be
+# projected onto its *principal components*.
+#
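+# For the non-whitened PCA fitted here, this boils down to centering the data
+# and rotating it with the orthonormal `components_` matrix, as the small
+# sketch below illustrates.
+import numpy as np
+
+# Re-build the PCA projection by hand: subtract the mean learned on the
+# training data, then rotate with the matrix of principal components.
+manual_projection = (X_test - pca.mean_) @ pca.components_.T
+print(np.allclose(manual_projection, X_test_pca))
+
+# %%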
+# Thus, looking at the projection made using PCA (i.e. the middle figure), we
+# see that there is no change regarding the scaling; indeed, the data being two
+# concentric circles centered at zero, the original data is already isotropic.
+# However, we can see that the data have been rotated. As a conclusion, such a
+# projection would not help if we were to define a linear classifier to
+# distinguish samples from both classes.
+#
+# Using a kernel makes it possible to apply a non-linear projection. Here, by
+# using an RBF kernel, we expect that the projection will unfold the dataset
+# while approximately preserving the relative distances of pairs of data points
+# that are close to one another in the original space.
+#
+# We observe such behaviour in the figure on the right: the samples of a given
+# class are closer to each other than the samples from the opposite class,
+# untangling both sample sets. Now, we can use a linear classifier to separate
+# the samples from the two classes.
+#
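+# For instance, one could fit a plain
+# :class:`~sklearn.linear_model.LogisticRegression` on both projections of the
+# training data and compare its accuracy on the projected test data; the
+# snippet below is only a quick sketch of such a check.
+from sklearn.linear_model import LogisticRegression
+
+# Train the same linear classifier on the PCA and the kernel PCA projections.
+clf_pca = LogisticRegression(max_iter=1_000).fit(pca.transform(X_train), y_train)
+clf_kernel_pca = LogisticRegression(max_iter=1_000).fit(
+    kernel_pca.transform(X_train), y_train
+)
+print(f"Accuracy with the PCA projection: {clf_pca.score(X_test_pca, y_test):.2f}")
+print(
+    "Accuracy with the kernel PCA projection: "
+    f"{clf_kernel_pca.score(X_test_kernel_pca, y_test):.2f}"
+)
+
+# %%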
+# Projecting into the original feature space
+# ------------------------------------------
+#
+# One particularity to keep in mind when using
+# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction
+# (i.e. the back projection in the original feature space). With
+# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if
+# `n_components` is the same as the number of original features.
+# This is the case in this example.
+#
+# We can investigate whether we get the original dataset when back projecting
+# with :class:`~sklearn.decomposition.KernelPCA`.
+X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test))
+X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test))
+
+# %%
+fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(
+    ncols=3, sharex=True, sharey=True, figsize=(13, 4)
+)
+
+orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+orig_data_ax.set_ylabel("Feature #1")
+orig_data_ax.set_xlabel("Feature #0")
+orig_data_ax.set_title("Original test data")
+
+pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test)
+pca_back_proj_ax.set_xlabel("Feature #0")
+pca_back_proj_ax.set_title("Reconstruction via PCA")
+
+kernel_pca_back_proj_ax.scatter(
+    X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test
+)
+kernel_pca_back_proj_ax.set_xlabel("Feature #0")
+_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA")
+
+# %%
+# While we see a perfect reconstruction with
+# :class:`~sklearn.decomposition.PCA`, we observe a different result for
+# :class:`~sklearn.decomposition.KernelPCA`.
+#
+# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot
+# rely on an analytical back-projection, and thus cannot provide an exact
+# reconstruction. Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is
+# internally trained to learn a mapping from the kernelized PCA basis to the
+# original feature space. This method therefore comes with an approximation
+# that introduces small differences when back projecting into the original
+# feature space.
+#
+# To improve the reconstruction using
+# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune
+# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term
+# that controls how strongly the learned mapping relies on the training data.
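+
+# %%
+# As a rough way to quantify this, one can for instance compare the mean
+# squared reconstruction error of both methods; the check below is only an
+# illustrative sketch.
+from sklearn.metrics import mean_squared_error
+
+# The PCA error is essentially zero (exact reconstruction up to numerical
+# precision), while the kernel PCA error stays small but non-zero.
+error_pca = mean_squared_error(X_test, X_reconstructed_pca)
+error_kernel_pca = mean_squared_error(X_test, X_reconstructed_kernel_pca)
+print(f"PCA reconstruction error (MSE): {error_pca:.2e}")
+print(f"Kernel PCA reconstruction error (MSE): {error_kernel_pca:.2e}")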