Gaussian Mixture Model Selection
================================

This example shows that model selection can be performed with Gaussian Mixture
Models (GMM) using :ref:`information-theoretic criteria <aic_bic>`. Model
selection concerns both the covariance type and the number of components in
the model.

In this case, both the Akaike Information Criterion (AIC) and the Bayesian
Information Criterion (BIC) provide the right result, but we only demonstrate
the latter, as BIC is better suited to identify the true model among a set of
candidates. Unlike Bayesian procedures, such inferences are prior-free.

"""

# %%
# Data generation
# ---------------
#
# We generate two components (each one containing `n_samples`) by randomly
# sampling the standard normal distribution as returned by `numpy.random.randn`.
# One component is kept spherical yet shifted and re-scaled. The other one is
# deformed to have a more general covariance matrix.

import numpy as np

n_samples = 500
np.random.seed(0)
C = np.array([[0.0, -0.1], [1.7, 0.4]])
component_1 = np.dot(np.random.randn(n_samples, 2), C)  # general
component_2 = 0.7 * np.random.randn(n_samples, 2) + np.array([-4, 1])  # spherical

X = np.concatenate([component_1, component_2])

# %%
# We can visualize the different components:

import matplotlib.pyplot as plt

plt.scatter(component_1[:, 0], component_1[:, 1], s=0.8)
plt.scatter(component_2[:, 0], component_2[:, 1], s=0.8)
plt.title("Gaussian Mixture components")
plt.axis("equal")
plt.show()

# %%
# Model training and selection
# ----------------------------
#
# We vary the number of components from 1 to 6 and the type of covariance
# parameters to use:
#
# - `"full"`: each component has its own general covariance matrix.
# - `"tied"`: all components share the same general covariance matrix.
# - `"diag"`: each component has its own diagonal covariance matrix.
# - `"spherical"`: each component has its own single variance.
#
# We score the different models and keep the best model (the lowest BIC). This
# is done by using :class:`~sklearn.model_selection.GridSearchCV` and a
# user-defined score function which returns the negative BIC score, as
# :class:`~sklearn.model_selection.GridSearchCV` is designed to **maximize** a
# score (maximizing the negative BIC is equivalent to minimizing the BIC).
#
# The best set of parameters and the best estimator are stored in
# `best_params_` and `best_estimator_`, respectively.

from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV


def gmm_bic_score(estimator, X):
    """Callable to pass to GridSearchCV that will use the BIC score."""
    # Make it negative since GridSearchCV expects a score to maximize
    return -estimator.bic(X)


param_grid = {
    "n_components": range(1, 7),
    "covariance_type": ["spherical", "tied", "diag", "full"],
}
grid_search = GridSearchCV(
    GaussianMixture(), param_grid=param_grid, scoring=gmm_bic_score
)
grid_search.fit(X)
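
# %%
# The parameters of the winning combination can be inspected directly on the
# fitted search object via its standard `best_params_` attribute:

print(grid_search.best_params_)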

# %%
# Plot the BIC scores
# -------------------
#
# To ease the plotting we can create a `pandas.DataFrame` from the results of
# the cross-validation done by the grid search. We re-invert the sign of the
# BIC score to show the effect of minimizing it.

import pandas as pd

df = pd.DataFrame(grid_search.cv_results_)[
    ["param_n_components", "param_covariance_type", "mean_test_score"]
]
df["mean_test_score"] = -df["mean_test_score"]
df = df.rename(
    columns={
        "param_n_components": "Number of components",
        "param_covariance_type": "Type of covariance",
        "mean_test_score": "BIC score",
    }
)
df.sort_values(by="BIC score").head()

# %%
import seaborn as sns

sns.catplot(
    data=df,
    kind="bar",
    x="Number of components",
    y="BIC score",
    hue="Type of covariance",
)
plt.show()

# %%
# In the present case, the model with 2 components and full covariance (which
# corresponds to the true generative model) has the lowest BIC score and is
# therefore selected by the grid search.
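#
# We can also report the AIC mentioned in the introduction for the selected
# model; `aic` and `bic` are built-in methods of
# :class:`~sklearn.mixture.GaussianMixture`:

best_gmm = grid_search.best_estimator_
print(f"BIC of the selected model: {best_gmm.bic(X):.1f}")
print(f"AIC of the selected model: {best_gmm.aic(X):.1f}")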

# %%
# Plot the best model
# -------------------
#
# We plot an ellipse to show each Gaussian component of the selected model. For
# this purpose, one needs to find the eigenvalues of the covariance matrices as
# returned by the `covariances_` attribute. The shape of such matrices depends
# on the `covariance_type` (see the sketch after this list for expanding each
# shape to a full matrix):
#
# - `"full"`: (`n_components`, `n_features`, `n_features`)
# - `"tied"`: (`n_features`, `n_features`)
# - `"diag"`: (`n_components`, `n_features`)
# - `"spherical"`: (`n_components`,)
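
# %%
# The selected model stores `"full"` covariances, so each matrix handled in the
# plotting loop below is already (`n_features`, `n_features`). For the other
# types, one would first expand the stored parameters to that shape; a minimal,
# hypothetical helper (not needed for the model selected here) could read:


def to_full_covariance(cov, covariance_type, n_features):
    """Expand a stored GMM covariance to an (n_features, n_features) matrix."""
    if covariance_type == "diag":
        return np.diag(cov)  # vector of per-feature variances on the diagonal
    if covariance_type == "spherical":
        return cov * np.eye(n_features)  # single variance, isotropic
    return cov  # "full" and "tied" already store full matrices


# %%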
from matplotlib.patches import Ellipse
from scipy import linalg

color_iter = sns.color_palette("tab10", 2)[::-1]
Y_ = grid_search.predict(X)

fig, ax = plt.subplots()

for i, (mean, cov, color) in enumerate(
    zip(
        grid_search.best_estimator_.means_,
        grid_search.best_estimator_.covariances_,
        color_iter,
    )
):
    v, w = linalg.eigh(cov)
    if not np.any(Y_ == i):
        continue
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], 0.8, color=color)

    # Plot an ellipse to show the Gaussian component
    angle = np.arctan2(w[0][1], w[0][0])
    angle = 180.0 * angle / np.pi  # convert to degrees
    # Full axis lengths 2 * sqrt(2) * std, i.e. the contour of the component
    # at Mahalanobis distance sqrt(2)
    v = 2.0 * np.sqrt(2.0) * np.sqrt(v)
    ellipse = Ellipse(mean, v[0], v[1], angle=180.0 + angle, color=color)
    ellipse.set_clip_box(fig.bbox)
    ellipse.set_alpha(0.5)
    ax.add_artist(ellipse)

plt.title(
    f"Selected GMM: {grid_search.best_params_['covariance_type']} model, "
    f"{grid_search.best_params_['n_components']} components"
)
plt.axis("equal")
plt.show()