"""
============================================
Model-based and sequential feature selection
============================================

This example illustrates and compares two approaches for feature selection:
:class:`~sklearn.feature_selection.SelectFromModel` which is based on feature
importance, and
:class:`~sklearn.feature_selection.SequentialFeatureSelector` which relies
on a greedy approach.

We use the Diabetes dataset, which consists of 10 features collected from 442
diabetes patients.

Authors: `Manoj Kumar <[email protected]>`_,
`Maria Telenczuk <https://github.com/maikia>`_, Nicolas Hug.

License: BSD 3 clause
"""

print(__doc__)

# %%
# Loading the data
# ----------------
#
# We first load the diabetes dataset which is available from within
# scikit-learn, and print its description:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
print(diabetes.DESCR)

# %%
# Feature importance from coefficients
# ------------------------------------
#
# To get an idea of the importance of the features, we are going to use the
# :class:`~sklearn.linear_model.LassoCV` estimator. The features with the
# highest absolute `coef_` value are considered the most important.
# We can observe the coefficients directly without needing to scale them (or
# scale the data) because from the description above, we know that the features
# were already standardized.
# For a more complete example on the interpretations of the coefficients of
# linear models, you may refer to
# :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py`.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LassoCV

lasso = LassoCV().fit(X, y)
importance = np.abs(lasso.coef_)
feature_names = np.array(diabetes.feature_names)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()
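
# %%
# As a quick sanity check, we can confirm what the dataset description says:
# the features share the same scale, so the raw coefficients can be compared
# directly without rescaling:
print(X.std(axis=0))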

# %%
# Selecting features based on importance
# --------------------------------------
#
# Now we want to select the two features which are the most important according
# to the coefficients. The :class:`~sklearn.feature_selection.SelectFromModel`
# is meant just for that. :class:`~sklearn.feature_selection.SelectFromModel`
# accepts a `threshold` parameter and will select the features whose importance
# (defined by the coefficients) is above this threshold.
#
# Since we want to select only 2 features, we will set this threshold slightly
# above the coefficient of the third most important feature.
from sklearn.feature_selection import SelectFromModel
from time import time

threshold = np.sort(importance)[-3] + 0.01

tic = time()
sfm = SelectFromModel(lasso, threshold=threshold).fit(X, y)
toc = time()
print("Features selected by SelectFromModel: "
      f"{feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

# %%
# Selecting features with Sequential Feature Selection
# ----------------------------------------------------
#
# Another way of selecting features is to use
# :class:`~sklearn.feature_selection.SequentialFeatureSelector`
# (SFS). SFS is a greedy procedure where, at each iteration, we choose the best
# new feature to add to our selected features based on a cross-validation
# score. That is, we start with 0 features and choose the best single feature
# with the highest score. The procedure is repeated until we reach the desired
# number of selected features.
#
# We can also go in the reverse direction (backward SFS), *i.e.* start with all
# the features and greedily choose features to remove one by one. We illustrate
# both approaches here.
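#
# To make the greedy procedure concrete, here is a minimal sketch of what the
# very first forward iteration amounts to: each feature is scored on its own
# with cross-validation, and the best one would be selected first (this only
# illustrates the idea, not the selector's internal implementation):
from sklearn.model_selection import cross_val_score

single_feature_scores = [cross_val_score(lasso, X[:, [i]], y).mean()
                         for i in range(X.shape[1])]
print("Best single feature: "
      f"{feature_names[np.argmax(single_feature_scores)]}")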

from sklearn.feature_selection import SequentialFeatureSelector

tic_fwd = time()
sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=2,
                                        direction='forward').fit(X, y)
toc_fwd = time()

tic_bwd = time()
sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select=2,
                                         direction='backward').fit(X, y)
toc_bwd = time()

print("Features selected by forward sequential selection: "
      f"{feature_names[sfs_forward.get_support()]}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print("Features selected by backward sequential selection: "
      f"{feature_names[sfs_backward.get_support()]}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")

# %%
# Discussion
# ----------
#
# Interestingly, forward and backward selection have selected the same set of
# features. In general, this isn't the case and the two methods would lead to
# different results.
#
# We also note that the features selected by SFS differ from those selected by
# feature importance: SFS selects `bmi` instead of `s1`. This is reasonable
# though, since `bmi` is the third most important feature according to the
# coefficients, which is quite remarkable considering that SFS makes no use of
# the coefficients at all.
#
# Finally, we should note that
# :class:`~sklearn.feature_selection.SelectFromModel` is significantly faster
# than SFS. Indeed, :class:`~sklearn.feature_selection.SelectFromModel` only
# needs to fit a model once, while SFS needs to cross-validate many different
# models at each of its iterations. SFS, however, works with any model, while
# :class:`~sklearn.feature_selection.SelectFromModel` requires the underlying
# estimator to expose a `coef_` or a `feature_importances_` attribute. Forward
# SFS is faster than backward SFS because it only needs to perform
# `n_features_to_select = 2` iterations, while backward SFS needs to perform
# `n_features - n_features_to_select = 8` iterations.
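
# %%
# As a last illustration of that point, here is a minimal sketch showing SFS
# with an estimator that exposes neither `coef_` nor `feature_importances_`,
# such as a nearest-neighbors regressor:
from sklearn.neighbors import KNeighborsRegressor

sfs_knn = SequentialFeatureSelector(
    KNeighborsRegressor(), n_features_to_select=2).fit(X, y)
print("Features selected by forward SFS with KNeighborsRegressor: "
      f"{feature_names[sfs_knn.get_support()]}")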