
Commit 92a6b80

committed
Pushing the docs to dev/ for branch: master, commit 6324e408c9df35d8e0e6a9a7586d1c9e10c28fec
1 parent fde4166 commit 92a6b80

File tree

1,220 files changed: +4772, -4405 lines changed

Binary file not shown.
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
"""
===================================================
Feature selection using SelectFromModel and LassoCV
===================================================

Use the SelectFromModel meta-transformer along with Lasso to select the best
couple of features from the diabetes dataset.

Since the L1 norm promotes sparsity of features, we might be interested in
selecting only a subset of the most interesting features from the dataset. This
example shows how to select the two most interesting features from the diabetes
dataset.

The diabetes dataset consists of 10 variables (features) collected from 442
diabetes patients. This example shows how to use SelectFromModel and LassoCV to
find the best two features predicting disease progression one year after
baseline.

Authors: Manoj Kumar <[email protected]>
         Maria Telenczuk <https://github.com/maikia>
License: BSD 3 clause
"""
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

##############################################################################
# Load the data
# ---------------------------------------------------------
#
# First, let's load the diabetes dataset, which is available from within
# sklearn. Then, we will look at what features are collected for the diabetes
# patients:

diabetes = load_diabetes()

X = diabetes.data
y = diabetes.target

feature_names = diabetes.feature_names
print(feature_names)

##############################################################################
# Find importance of the features
# ---------------------------------------------------------
#
# To decide on the importance of the features we are going to use the LassoCV
# estimator. The features with the highest absolute coef_ value are considered
# the most important.

clf = LassoCV().fit(X, y)
importance = np.abs(clf.coef_)
print(importance)

##############################################################################
# Select from the model the features with the highest score
# ---------------------------------------------------------
#
# Now we want to select the two features which are the most important.
# SelectFromModel() allows for setting the threshold. Only the features with
# a coef_ higher than the threshold will remain. Here, we want to set the
# threshold slightly above the third highest coef_ calculated by LassoCV() from
# our data.

idx_third = importance.argsort()[-3]
threshold = importance[idx_third] + 0.01

idx_features = (-importance).argsort()[:2]
name_features = np.array(feature_names)[idx_features]
print('Selected features: {}'.format(name_features))

sfm = SelectFromModel(clf, threshold=threshold)
sfm.fit(X, y)
X_transform = sfm.transform(X)

n_features = sfm.transform(X).shape[1]

##############################################################################
# Plot the two most important features
# ---------------------------------------------------------
#
# Finally we will plot the two selected features from the data.

plt.title(
    "Features from diabetes using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("First feature: {}".format(name_features[0]))
plt.ylabel("Second feature: {}".format(name_features[1]))
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
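
A quick way to double-check which columns the new example keeps is SelectFromModel's get_support() method. The snippet below is a minimal sketch, not part of the commit, and assumes the variables sfm, feature_names and X_transform are defined exactly as in the listing above:

# Hypothetical verification snippet (not in the commit): inspect the mask of
# selected columns and confirm it matches the printed name_features.
mask = sfm.get_support()                    # boolean mask over the 10 features
print(np.array(feature_names)[mask])

kept_idx = sfm.get_support(indices=True)    # integer indices of the kept columns
print(X_transform.shape[1], len(kept_idx))  # both should be 2 here (assuming no ties)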
Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Feature selection using SelectFromModel and LassoCV\n\n\nUse the SelectFromModel meta-transformer along with Lasso to select the best\ncouple of features from the diabetes dataset.\n\nSince the L1 norm promotes sparsity of features, we might be interested in\nselecting only a subset of the most interesting features from the dataset. This\nexample shows how to select the two most interesting features from the diabetes\ndataset.\n\nThe diabetes dataset consists of 10 variables (features) collected from 442\ndiabetes patients. This example shows how to use SelectFromModel and LassoCV to\nfind the best two features predicting disease progression one year after\nbaseline.\n\nAuthors: Manoj Kumar <[email protected]>\n         Maria Telenczuk <https://github.com/maikia>\nLicense: BSD 3 clause\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import load_diabetes\nfrom sklearn.feature_selection import SelectFromModel\nfrom sklearn.linear_model import LassoCV"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Load the data\n---------------------------------------------------------\n\nFirst, let's load the diabetes dataset, which is available from within\nsklearn. Then, we will look at what features are collected for the diabetes\npatients:\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "diabetes = load_diabetes()\n\nX = diabetes.data\ny = diabetes.target\n\nfeature_names = diabetes.feature_names\nprint(feature_names)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Find importance of the features\n---------------------------------------------------------\n\nTo decide on the importance of the features we are going to use the LassoCV\nestimator. The features with the highest absolute coef_ value are considered\nthe most important.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "clf = LassoCV().fit(X, y)\nimportance = np.abs(clf.coef_)\nprint(importance)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Select from the model the features with the highest score\n---------------------------------------------------------\n\nNow we want to select the two features which are the most important.\nSelectFromModel() allows for setting the threshold. Only the features with\na coef_ higher than the threshold will remain. Here, we want to set the\nthreshold slightly above the third highest coef_ calculated by LassoCV() from\nour data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "idx_third = importance.argsort()[-3]\nthreshold = importance[idx_third] + 0.01\n\nidx_features = (-importance).argsort()[:2]\nname_features = np.array(feature_names)[idx_features]\nprint('Selected features: {}'.format(name_features))\n\nsfm = SelectFromModel(clf, threshold=threshold)\nsfm.fit(X, y)\nX_transform = sfm.transform(X)\n\nn_features = sfm.transform(X).shape[1]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Plot the two most important features\n---------------------------------------------------------\n\nFinally we will plot the two selected features from the data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "plt.title(\n    \"Features from diabetes using SelectFromModel with \"\n    \"threshold %0.3f.\" % sfm.threshold)\nfeature1 = X_transform[:, 0]\nfeature2 = X_transform[:, 1]\nplt.plot(feature1, feature2, 'r.')\nplt.xlabel(\"First feature: {}\".format(name_features[0]))\nplt.ylabel(\"Second feature: {}\".format(name_features[1]))\nplt.ylim([np.min(feature2), np.max(feature2)])\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
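
A side note on the threshold logic used in both new files: taking the third highest absolute coefficient and adding a small offset guarantees that exactly the top two coefficients clear the cut, provided there are no ties. The standalone sketch below uses made-up numbers, purely for illustration and not from the commit, to show the arithmetic:

import numpy as np

# Hypothetical absolute coefficients, for illustration only.
importance = np.array([0.2, 55.1, 508.5, 276.2, 3.1])

idx_third = importance.argsort()[-3]       # index of the third highest value (55.1)
threshold = importance[idx_third] + 0.01   # 55.11, just above it

print(np.sum(importance > threshold))      # 2 -> only the top two coefficients survive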

dev/_downloads/c65d78a64b1ce8a1acbf28e24f9f348c/plot_select_from_model_boston.ipynb

Lines changed: 0 additions & 54 deletions
This file was deleted.
Binary file not shown.

dev/_downloads/ec16caf0c2a5d196169a662373beba02/plot_select_from_model_boston.py

Lines changed: 0 additions & 50 deletions
This file was deleted.

dev/_downloads/scikit-learn-docs.pdf

6.07 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes -> 32 Bytes

0 commit comments
