"""
-========================================
-Lasso and Elastic Net for Sparse Signals
-========================================
+==================================
+L1-based models for Sparse Signals
+==================================

-Estimates Lasso and Elastic-Net regression models on a manually generated
-sparse signal corrupted with an additive noise. Estimated coefficients are
-compared with the ground-truth.
+The present example compares three L1-based regression models on a synthetic
+signal obtained from sparse and correlated features that are further corrupted
+with additive Gaussian noise:
+
+ - a :ref:`lasso`;
+ - an :ref:`automatic_relevance_determination`;
+ - an :ref:`elastic_net`.
+
+It is known that the Lasso estimates turn out to be close to the model
+selection estimates when the data dimensions grow, given that the irrelevant
+variables are not too correlated with the relevant ones. In the presence of
+correlated features, Lasso itself cannot select the correct sparsity
+pattern [1]_.

+Here we compare the performance of the three models in terms of the :math:`R^2`
+score, the fitting time and the sparsity of the estimated coefficients when
+compared with the ground-truth.
"""

+# Author: Arturo Amor <[email protected]>
+
# %%
-# Data Generation
-# ---------------------------------------------------
+# Generate synthetic dataset
+# --------------------------
+#
+# We generate a dataset where the number of samples is lower than the total
+# number of features. This leads to an underdetermined system, i.e. the solution
+# is not unique, and thus we cannot apply an :ref:`ordinary_least_squares` by
+# itself. Regularization introduces a penalty term to the objective function,
+# which modifies the optimization problem and can help alleviate the
+# underdetermined nature of the system.
+#
+# The target `y` is a linear combination, with alternating signs, of sinusoidal
+# signals. Only the 10 lowest out of the 100 frequencies in `X` are used to
+# generate `y`, while the rest of the features are not informative. This results
+# in a high-dimensional sparse feature space, where some degree of
+# L1-penalization is necessary.

import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn.metrics import r2_score

-np.random.seed(42)
+rng = np.random.RandomState(0)
+n_samples, n_features, n_informative = 50, 100, 10
+time_step = np.linspace(-2, 2, n_samples)
+freqs = 2 * np.pi * np.sort(rng.rand(n_features)) / 0.01
+X = np.zeros((n_samples, n_features))

-n_samples, n_features = 50, 100
-X = np.random.randn(n_samples, n_features)
+for i in range(n_features):
+    X[:, i] = np.sin(freqs[i] * time_step)

-# Decreasing coef w. alternated signs for visualization
idx = np.arange(n_features)
-coef = (-1) ** idx * np.exp(-idx / 10)
-coef[10:] = 0  # sparsify coef
-y = np.dot(X, coef)
+true_coef = (-1) ** idx * np.exp(-idx / 10)
+true_coef[n_informative:] = 0  # sparsify coef
+y = np.dot(X, true_coef)

-# Add noise
-y += 0.01 * np.random.normal(size=n_samples)
+# %%
+# Some of the informative features have close frequencies to induce
+# (anti-)correlations.

-# Split data in train set and test set
-n_samples = X.shape[0]
-X_train, y_train = X[: n_samples // 2], y[: n_samples // 2]
-X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :]
+freqs[:n_informative]

# %%
-# Lasso
-# ---------------------------------------------------
+# A random phase is introduced using :func:`numpy.random.random_sample`
+# and some Gaussian noise (implemented by :func:`numpy.random.normal`)
+# is added to both the features and the target.
+
+for i in range(n_features):
+    X[:, i] = np.sin(freqs[i] * time_step + 2 * (rng.random_sample() - 0.5))
+    X[:, i] += 0.2 * rng.normal(0, 1, n_samples)
+
+y += 0.2 * rng.normal(0, 1, n_samples)
+
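+# %%
+# As a quick, illustrative check (not needed by the rest of the example), we can
+# look at the empirical correlations between the informative features; the close
+# frequencies and random phases are expected to make some columns of `X`
+# (anti-)correlated.
+
+corr = np.corrcoef(X[:, :n_informative], rowvar=False)
+max_abs_corr = np.abs(corr - np.eye(n_informative)).max()
+print(f"max absolute off-diagonal correlation: {max_abs_corr:.2f}")
+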
+# %%
+# Such sparse, noisy and correlated features can be obtained, for instance, from
+# sensor nodes monitoring some environmental variables, as they typically
+# register similar values depending on their positions (spatial correlations).
+# We can visualize the target.
+
+import matplotlib.pyplot as plt
+
+plt.plot(time_step, y)
+plt.ylabel("target signal")
+plt.xlabel("time")
+_ = plt.title("Superposition of sinusoidal signals")

+# %%
+# We split the data into train and test sets for simplicity. In practice one
+# should use a :class:`~sklearn.model_selection.TimeSeriesSplit`
+# cross-validation to estimate the variance of the test score. Here we set
+# `shuffle=False` because, when the data have a temporal relationship, we must
+# not train on samples that come after the test samples.
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)
+
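+# %%
+# As a sketch of the cross-validation strategy mentioned above (the number of
+# splits here is an arbitrary choice), one can inspect the train/test folds that
+# :class:`~sklearn.model_selection.TimeSeriesSplit` would generate while
+# respecting the temporal ordering:
+
+from sklearn.model_selection import TimeSeriesSplit
+
+for train_index, test_index in TimeSeriesSplit(n_splits=5).split(X):
+    print(f"train: 0..{train_index[-1]}, test: {test_index[0]}..{test_index[-1]}")
+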
+# %%
+# In the following, we compute the performance of the three L1-based models in
+# terms of the goodness of fit :math:`R^2` score and the fitting time. Then we
+# make a plot to compare the sparsity of the estimated coefficients with that of
+# the ground-truth coefficients, and finally we analyze the results.
+#
+# Lasso
+# -----
+#
+# In this example, we demonstrate a :class:`~sklearn.linear_model.Lasso` with a
+# fixed value of the regularization parameter `alpha`. In practice, the optimal
+# parameter `alpha` should be selected by passing a
+# :class:`~sklearn.model_selection.TimeSeriesSplit` cross-validation strategy to
+# a :class:`~sklearn.linear_model.LassoCV`. To keep the example simple and fast
+# to execute, we directly set the optimal value for `alpha` here.
from sklearn.linear_model import Lasso
+from sklearn.metrics import r2_score
+from time import time

-alpha = 0.1
-lasso = Lasso(alpha=alpha)
+t0 = time()
+lasso = Lasso(alpha=0.14).fit(X_train, y_train)
+print(f"Lasso fit done in {(time() - t0):.3f}s")

-y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
+y_pred_lasso = lasso.predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
-print(lasso)
-print("r^2 on test data : %f" % r2_score_lasso)
+print(f"Lasso r^2 on test data : {r2_score_lasso:.3f}")
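+
+# %%
+# As a sketch of the tuning strategy mentioned above (the number of splits is an
+# arbitrary choice, and the selected value may differ from the `alpha=0.14` used
+# above), one could write:
+
+from sklearn.linear_model import LassoCV
+from sklearn.model_selection import TimeSeriesSplit
+
+lasso_cv = LassoCV(cv=TimeSeriesSplit(n_splits=5)).fit(X_train, y_train)
+print(f"alpha selected by LassoCV: {lasso_cv.alpha_:.3f}")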
+
+# %%
+# Automatic Relevance Determination (ARD)
+# ---------------------------------------
+#
+# An ARD regression is the Bayesian version of the Lasso. It can produce
+# interval estimates for all of the parameters, including the error variance, if
+# required. It is a suitable option when the signals have Gaussian noise. See
+# the example :ref:`sphx_glr_auto_examples_linear_model_plot_ard.py` for a
+# comparison of :class:`~sklearn.linear_model.ARDRegression` and
+# :class:`~sklearn.linear_model.BayesianRidge` regressors.
+
+from sklearn.linear_model import ARDRegression
+
+t0 = time()
+ard = ARDRegression().fit(X_train, y_train)
+print(f"ARD fit done in {(time() - t0):.3f}s")
+
+y_pred_ard = ard.predict(X_test)
+r2_score_ard = r2_score(y_test, y_pred_ard)
+print(f"ARD r^2 on test data : {r2_score_ard:.3f}")
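+
+# %%
+# Since :class:`~sklearn.linear_model.ARDRegression` is Bayesian, it can also
+# report a standard deviation alongside each prediction; the following lines are
+# only a small illustration of this and are not used in the rest of the
+# comparison:
+
+_, y_std_ard = ard.predict(X_test, return_std=True)
+print(f"mean predictive standard deviation: {y_std_ard.mean():.3f}")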

# %%
# ElasticNet
-# ---------------------------------------------------
+# ----------
+#
+# :class:`~sklearn.linear_model.ElasticNet` is a middle ground between
+# :class:`~sklearn.linear_model.Lasso` and :class:`~sklearn.linear_model.Ridge`,
+# as it combines an L1 and an L2 penalty. The amount of regularization is
+# controlled by the two hyperparameters `l1_ratio` and `alpha`. For
+# `l1_ratio = 0` the penalty is pure L2 and the model is equivalent to a
+# :class:`~sklearn.linear_model.Ridge`. Similarly, `l1_ratio = 1` is a pure L1
+# penalty and the model is equivalent to a :class:`~sklearn.linear_model.Lasso`.
+# For `0 < l1_ratio < 1`, the penalty is a combination of L1 and L2.
+#
+# As done before, we train the model with fixed values for `alpha` and
+# `l1_ratio`. Their optimal values were selected with an
+# :class:`~sklearn.linear_model.ElasticNetCV`, which is not shown here to keep
+# the example simple.

from sklearn.linear_model import ElasticNet

-enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
+t0 = time()
+enet = ElasticNet(alpha=0.08, l1_ratio=0.5).fit(X_train, y_train)
+print(f"ElasticNet fit done in {(time() - t0):.3f}s")

-y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
+y_pred_enet = enet.predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
-print(enet)
-print("r^2 on test data : %f" % r2_score_enet)
-
+print(f"ElasticNet r^2 on test data : {r2_score_enet:.3f}")
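+
+# %%
+# That search could look roughly like the following sketch (the `l1_ratio` grid
+# and the number of splits are illustrative assumptions, not necessarily the
+# settings that led to `alpha=0.08` and `l1_ratio=0.5`):
+
+from sklearn.linear_model import ElasticNetCV
+from sklearn.model_selection import TimeSeriesSplit
+
+enet_cv = ElasticNetCV(
+    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9], cv=TimeSeriesSplit(n_splits=5)
+).fit(X_train, y_train)
+print(f"ElasticNetCV selected alpha={enet_cv.alpha_:.3f}, l1_ratio={enet_cv.l1_ratio_}")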

# %%
-# Plot
-# ---------------------------------------------------
-
-m, s, _ = plt.stem(
-    np.where(enet.coef_)[0],
-    enet.coef_[enet.coef_ != 0],
-    markerfmt="x",
-    label="Elastic net coefficients",
-)
-plt.setp([m, s], color="#2ca02c")
-m, s, _ = plt.stem(
-    np.where(lasso.coef_)[0],
-    lasso.coef_[lasso.coef_ != 0],
-    markerfmt="x",
-    label="Lasso coefficients",
-)
-plt.setp([m, s], color="#ff7f0e")
-plt.stem(
-    np.where(coef)[0],
-    coef[coef != 0],
-    label="true coefficients",
-    markerfmt="bx",
+# Plot and analysis of the results
+# --------------------------------
+#
+# In this section, we use a heatmap to visualize the sparsity of the true
+# and estimated coefficients of the respective linear models.
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from matplotlib.colors import SymLogNorm
+
+df = pd.DataFrame(
+    {
+        "True coefficients": true_coef,
+        "Lasso": lasso.coef_,
+        "ARDRegression": ard.coef_,
+        "ElasticNet": enet.coef_,
+    }
)

-plt.legend(loc="best")
+plt.figure(figsize=(10, 6))
+ax = sns.heatmap(
+    df.T,
+    norm=SymLogNorm(linthresh=10e-4, vmin=-1, vmax=1),
+    cbar_kws={"label": "coefficients' values"},
+    cmap="seismic_r",
+)
+plt.ylabel("linear model")
+plt.xlabel("coefficients")
plt.title(
-    "Lasso $R^2$: %.3f, Elastic Net $R^2$: %.3f" % (r2_score_lasso, r2_score_enet)
+    f"Models' coefficients\nLasso $R^2$: {r2_score_lasso:.3f}, "
+    f"ARD $R^2$: {r2_score_ard:.3f}, "
+    f"ElasticNet $R^2$: {r2_score_enet:.3f}"
)
-plt.show()
+plt.tight_layout()
+
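+# %%
+# A quick, purely illustrative complement to the heatmap is to count the
+# non-zero coefficients of each model (the analysis below relies on the visual
+# comparison rather than on this count):
+
+for name in df.columns:
+    print(f"{name}: {np.count_nonzero(df[name])} non-zero coefficients")
+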
+# %%
+# In the present example :class:`~sklearn.linear_model.ElasticNet` yields the
+# best score and captures most of the predictive features, yet it still fails to
+# recover all of the true components. Notice that both
+# :class:`~sklearn.linear_model.ElasticNet` and
+# :class:`~sklearn.linear_model.ARDRegression` result in a less sparse model
+# than a :class:`~sklearn.linear_model.Lasso`.
+#
+# Conclusions
+# -----------
+#
+# :class:`~sklearn.linear_model.Lasso` is known to recover sparse coefficients
+# effectively but does not perform well with highly correlated features. Indeed,
+# if several correlated features contribute to the target,
+# :class:`~sklearn.linear_model.Lasso` would end up selecting a single one of
+# them. In the case of sparse yet non-correlated features, a
+# :class:`~sklearn.linear_model.Lasso` model would be more suitable.
+#
+# :class:`~sklearn.linear_model.ElasticNet` introduces some sparsity on the
+# coefficients and shrinks their values toward zero. Thus, in the presence of
+# correlated features that contribute to the target, the model is still able to
+# reduce their weights without setting them exactly to zero. This results in a
+# less sparse model than a pure :class:`~sklearn.linear_model.Lasso` and may
+# capture non-predictive features as well.
+#
+# :class:`~sklearn.linear_model.ARDRegression` is better at handling Gaussian
+# noise, but is still unable to handle correlated features and requires more
+# time due to fitting a prior.
+#
+# References
+# ----------
+#
+# .. [1] :doi:`"Lasso-type recovery of sparse representations for
+#     high-dimensional data" N. Meinshausen, B. Yu - The Annals of Statistics
+#     2009, Vol. 37, No. 1, 246–270 <10.1214/07-AOS582>`