
Commit 92a6b80

committed
Pushing the docs to dev/ for branch: master, commit 6324e408c9df35d8e0e6a9a7586d1c9e10c28fec
1 parent fde4166 commit 92a6b80

File tree

1,220 files changed: +4772, -4405 lines changed

Binary file not shown.
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
"""
===================================================
Feature selection using SelectFromModel and LassoCV
===================================================

Use the SelectFromModel meta-transformer along with Lasso to select the best
couple of features from the diabetes dataset.

Since the L1 norm promotes sparsity of features, we might be interested in
selecting only a subset of the most interesting features from the dataset. This
example shows how to select the two most interesting features from the diabetes
dataset.

The diabetes dataset consists of 10 variables (features) collected from 442
diabetes patients. This example shows how to use SelectFromModel and LassoCV to
find the best two features predicting disease progression one year after
baseline.

Authors: Manoj Kumar <[email protected]>
         Maria Telenczuk <https://github.com/maikia>
License: BSD 3 clause
"""
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

##############################################################################
# Load the data
# ---------------------------------------------------------
#
# First, let's load the diabetes dataset, which is available from within
# sklearn. Then, we will look at what features are collected for the diabetes
# patients:

diabetes = load_diabetes()

X = diabetes.data
y = diabetes.target

feature_names = diabetes.feature_names
print(feature_names)

##############################################################################
# Find importance of the features
# ---------------------------------------------------------
#
# To decide on the importance of the features we are going to use the LassoCV
# estimator. The features with the highest absolute coef_ value are considered
# the most important.

clf = LassoCV().fit(X, y)
importance = np.abs(clf.coef_)
print(importance)

##############################################################################
# Select from the model the features with the highest score
# ---------------------------------------------------------
#
# Now we want to select the two features which are the most important.
# SelectFromModel() allows for setting the threshold. Only the features with
# a coef_ higher than the threshold will remain. Here, we want to set the
# threshold slightly above the third highest coef_ calculated by LassoCV() from
# our data.

idx_third = importance.argsort()[-3]
threshold = importance[idx_third] + 0.01

idx_features = (-importance).argsort()[:2]
name_features = np.array(feature_names)[idx_features]
print('Selected features: {}'.format(name_features))

sfm = SelectFromModel(clf, threshold=threshold)
sfm.fit(X, y)
X_transform = sfm.transform(X)

n_features = sfm.transform(X).shape[1]

##############################################################################
# Plot the two most important features
# ---------------------------------------------------------
#
# Finally we will plot the two selected features from the data.

plt.title(
    "Features from diabetes using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("First feature: {}".format(name_features[0]))
plt.ylabel("Second feature: {}".format(name_features[1]))
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
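
A quick way to double-check which columns the new example keeps is SelectFromModel's get_support() method. The snippet below is a minimal sketch, not part of the commit, and assumes the variables sfm, feature_names and X_transform are defined exactly as in the listing above:

# Hypothetical verification snippet (not in the commit): inspect the mask of
# selected columns and confirm it matches the printed name_features.
mask = sfm.get_support()                    # boolean mask over the 10 features
print(np.array(feature_names)[mask])

kept_idx = sfm.get_support(indices=True)    # integer indices of the kept columns
print(X_transform.shape[1], len(kept_idx))  # both should be 2 here (assuming no ties)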
Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Feature selection using SelectFromModel and LassoCV\n\n\nUse the SelectFromModel meta-transformer along with Lasso to select the best\ncouple of features from the diabetes dataset.\n\nSince the L1 norm promotes sparsity of features, we might be interested in\nselecting only a subset of the most interesting features from the dataset. This\nexample shows how to select the two most interesting features from the diabetes\ndataset.\n\nThe diabetes dataset consists of 10 variables (features) collected from 442\ndiabetes patients. This example shows how to use SelectFromModel and LassoCV to\nfind the best two features predicting disease progression one year after\nbaseline.\n\nAuthors: Manoj Kumar <[email protected]>\n         Maria Telenczuk <https://github.com/maikia>\nLicense: BSD 3 clause\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import load_diabetes\nfrom sklearn.feature_selection import SelectFromModel\nfrom sklearn.linear_model import LassoCV"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Load the data\n---------------------------------------------------------\n\nFirst, let's load the diabetes dataset, which is available from within\nsklearn. Then, we will look at what features are collected for the diabetes\npatients:\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "diabetes = load_diabetes()\n\nX = diabetes.data\ny = diabetes.target\n\nfeature_names = diabetes.feature_names\nprint(feature_names)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Find importance of the features\n---------------------------------------------------------\n\nTo decide on the importance of the features we are going to use the LassoCV\nestimator. The features with the highest absolute coef_ value are considered\nthe most important.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "clf = LassoCV().fit(X, y)\nimportance = np.abs(clf.coef_)\nprint(importance)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Select from the model the features with the highest score\n---------------------------------------------------------\n\nNow we want to select the two features which are the most important.\nSelectFromModel() allows for setting the threshold. Only the features with\na coef_ higher than the threshold will remain. Here, we want to set the\nthreshold slightly above the third highest coef_ calculated by LassoCV() from\nour data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "idx_third = importance.argsort()[-3]\nthreshold = importance[idx_third] + 0.01\n\nidx_features = (-importance).argsort()[:2]\nname_features = np.array(feature_names)[idx_features]\nprint('Selected features: {}'.format(name_features))\n\nsfm = SelectFromModel(clf, threshold=threshold)\nsfm.fit(X, y)\nX_transform = sfm.transform(X)\n\nn_features = sfm.transform(X).shape[1]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Plot the two most important features\n---------------------------------------------------------\n\nFinally we will plot the two selected features from the data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "plt.title(\n    \"Features from diabetes using SelectFromModel with \"\n    \"threshold %0.3f.\" % sfm.threshold)\nfeature1 = X_transform[:, 0]\nfeature2 = X_transform[:, 1]\nplt.plot(feature1, feature2, 'r.')\nplt.xlabel(\"First feature: {}\".format(name_features[0]))\nplt.ylabel(\"Second feature: {}\".format(name_features[1]))\nplt.ylim([np.min(feature2), np.max(feature2)])\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
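
A side note on the threshold logic used in both new files: taking the third highest absolute coefficient and adding a small offset guarantees that exactly the top two coefficients clear the cut, provided there are no ties. The standalone sketch below uses made-up numbers, purely for illustration and not from the commit, to show the arithmetic:

import numpy as np

# Hypothetical absolute coefficients, for illustration only.
importance = np.array([0.2, 55.1, 508.5, 276.2, 3.1])

idx_third = importance.argsort()[-3]       # index of the third highest value (55.1)
threshold = importance[idx_third] + 0.01   # 55.11, just above it

print(np.sum(importance > threshold))      # 2 -> only the top two coefficients survive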

dev/_downloads/c65d78a64b1ce8a1acbf28e24f9f348c/plot_select_from_model_boston.ipynb

Lines changed: 0 additions & 54 deletions
This file was deleted.
Binary file not shown.

dev/_downloads/ec16caf0c2a5d196169a662373beba02/plot_select_from_model_boston.py

Lines changed: 0 additions & 50 deletions
This file was deleted.

dev/_downloads/scikit-learn-docs.pdf

6.07 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes -> 32 Bytes

0 commit comments
