Commit 16a6755

Pushing the docs to dev/ for branch: main, commit e70fb3f91b00f59899c63dc32cd8fc9162284b60
1 parent d7cd1ad commit 16a6755

1,363 files changed (+5051 / -4856 lines)


dev/_downloads/592b2521e44501266ca5339d1fb123cb/plot_rfe_with_cross_validation.py

Lines changed: 53 additions & 23 deletions
@@ -3,57 +3,87 @@
 Recursive feature elimination with cross-validation
 ===================================================
 
-A recursive feature elimination example with automatic tuning of the
+A Recursive Feature Elimination (RFE) example with automatic tuning of the
 number of features selected with cross-validation.
 
 """
 
-import numpy as np
-import matplotlib.pyplot as plt
-from sklearn.svm import SVC
-from sklearn.model_selection import StratifiedKFold
-from sklearn.feature_selection import RFECV
+# %%
+# Data generation
+# ---------------
+#
+# We build a classification task using 3 informative features. The introduction
+# of 2 additional redundant (i.e. correlated) features has the effect that the
+# selected features vary depending on the cross-validation fold. The remaining
+# features are non-informative as they are drawn at random.
+
 from sklearn.datasets import make_classification
 
-# Build a classification task using 3 informative features
 X, y = make_classification(
-    n_samples=1000,
-    n_features=25,
+    n_samples=500,
+    n_features=15,
     n_informative=3,
     n_redundant=2,
     n_repeated=0,
     n_classes=8,
     n_clusters_per_class=1,
+    class_sep=0.8,
     random_state=0,
 )
 
-# Create the RFE object and compute a cross-validated score.
-svc = SVC(kernel="linear")
-# The "accuracy" scoring shows the proportion of correct classifications
+# %%
+# Model training and selection
+# ----------------------------
+#
+# We create the RFE object and compute the cross-validated scores. The scoring
+# strategy "accuracy" optimizes the proportion of correctly classified samples.
+
+from sklearn.feature_selection import RFECV
+from sklearn.model_selection import StratifiedKFold
+from sklearn.linear_model import LogisticRegression
 
 min_features_to_select = 1  # Minimum number of features to consider
+clf = LogisticRegression()
+cv = StratifiedKFold(5)
+
 rfecv = RFECV(
-    estimator=svc,
+    estimator=clf,
     step=1,
-    cv=StratifiedKFold(2),
+    cv=cv,
     scoring="accuracy",
     min_features_to_select=min_features_to_select,
+    n_jobs=2,
 )
 rfecv.fit(X, y)
 
-print("Optimal number of features : %d" % rfecv.n_features_)
-
-n_scores = len(rfecv.cv_results_["split0_test_score"])
-cv_scores = np.vstack(
-    (rfecv.cv_results_["split0_test_score"], rfecv.cv_results_["split1_test_score"])
-).T
+print(f"Optimal number of features: {rfecv.n_features_}")
 
+# %%
+# In the present case, the model with 3 features (which corresponds to the true
+# generative model) is found to be the most optimal.
+#
 # Plot number of features VS. cross-validation scores
+# ---------------------------------------------------
+
+import matplotlib.pyplot as plt
+
+n_scores = len(rfecv.cv_results_["mean_test_score"])
 plt.figure()
 plt.xlabel("Number of features selected")
-plt.ylabel("Cross validation score (accuracy)")
-plt.plot(
+plt.ylabel("Mean test accuracy")
+plt.errorbar(
     range(min_features_to_select, n_scores + min_features_to_select),
-    cv_scores,
+    rfecv.cv_results_["mean_test_score"],
+    yerr=rfecv.cv_results_["std_test_score"],
 )
+plt.title("Recursive Feature Elimination \nwith correlated features")
 plt.show()
+
+# %%
+# From the plot above one can further notice a plateau of equivalent scores
+# (similar mean value and overlapping errorbars) for 3 to 5 selected features.
+# This is the result of introducing correlated features. Indeed, the optimal
+# model selected by the RFE can lie within this range, depending on the
+# cross-validation technique. The test accuracy decreases above 5 selected
+# features, this is, keeping non-informative features leads to over-fitting and
+# is therefore detrimental for the statistical performance of the models.
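Not part of this commit, but useful context when reviewing the diff above: a minimal sketch of how the fitted RFECV object from the updated example could be inspected to see which features were kept, using the support_ and ranking_ attributes. The data-generation and estimator settings below simply mirror the new version of the example.

# Illustrative only; mirrors the settings in the new plot_rfe_with_cross_validation.py above.
import numpy as np

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(
    n_samples=500,
    n_features=15,
    n_informative=3,
    n_redundant=2,
    n_repeated=0,
    n_classes=8,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

rfecv = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=StratifiedKFold(5),
    scoring="accuracy",
    min_features_to_select=1,
    n_jobs=2,
)
rfecv.fit(X, y)

# support_ is a boolean mask over the columns of X; ranking_ assigns 1 to the
# selected features and increasing integers to features eliminated earlier.
print("Selected feature indices:", np.where(rfecv.support_)[0])
print("Feature ranking:", rfecv.ranking_)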

dev/_downloads/949ed208b2147ed2b3e348e81fef52be/plot_rfe_with_cross_validation.ipynb

Lines changed: 52 additions & 2 deletions
@@ -15,7 +15,32 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Recursive feature elimination with cross-validation\n\nA recursive feature elimination example with automatic tuning of the\nnumber of features selected with cross-validation.\n"
+"\n# Recursive feature elimination with cross-validation\n\nA Recursive Feature Elimination (RFE) example with automatic tuning of the\nnumber of features selected with cross-validation.\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data generation\n\nWe build a classification task using 3 informative features. The introduction\nof 2 additional redundant (i.e. correlated) features has the effect that the\nselected features vary depending on the cross-validation fold. The remaining\nfeatures are non-informative as they are drawn at random.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.datasets import make_classification\n\nX, y = make_classification(\n n_samples=500,\n n_features=15,\n n_informative=3,\n n_redundant=2,\n n_repeated=0,\n n_classes=8,\n n_clusters_per_class=1,\n class_sep=0.8,\n random_state=0,\n)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Model training and selection\n\nWe create the RFE object and compute the cross-validated scores. The scoring\nstrategy \"accuracy\" optimizes the proportion of correctly classified samples.\n\n"
 ]
 },
 {
@@ -26,7 +51,32 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.svm import SVC\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.datasets import make_classification\n\n# Build a classification task using 3 informative features\nX, y = make_classification(\n n_samples=1000,\n n_features=25,\n n_informative=3,\n n_redundant=2,\n n_repeated=0,\n n_classes=8,\n n_clusters_per_class=1,\n random_state=0,\n)\n\n# Create the RFE object and compute a cross-validated score.\nsvc = SVC(kernel=\"linear\")\n# The \"accuracy\" scoring shows the proportion of correct classifications\n\nmin_features_to_select = 1  # Minimum number of features to consider\nrfecv = RFECV(\n estimator=svc,\n step=1,\n cv=StratifiedKFold(2),\n scoring=\"accuracy\",\n min_features_to_select=min_features_to_select,\n)\nrfecv.fit(X, y)\n\nprint(\"Optimal number of features : %d\" % rfecv.n_features_)\n\nn_scores = len(rfecv.cv_results_[\"split0_test_score\"])\ncv_scores = np.vstack(\n (rfecv.cv_results_[\"split0_test_score\"], rfecv.cv_results_[\"split1_test_score\"])\n).T\n\n# Plot number of features VS. cross-validation scores\nplt.figure()\nplt.xlabel(\"Number of features selected\")\nplt.ylabel(\"Cross validation score (accuracy)\")\nplt.plot(\n range(min_features_to_select, n_scores + min_features_to_select),\n cv_scores,\n)\nplt.show()"
+"from sklearn.feature_selection import RFECV\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\nmin_features_to_select = 1  # Minimum number of features to consider\nclf = LogisticRegression()\ncv = StratifiedKFold(5)\n\nrfecv = RFECV(\n estimator=clf,\n step=1,\n cv=cv,\n scoring=\"accuracy\",\n min_features_to_select=min_features_to_select,\n n_jobs=2,\n)\nrfecv.fit(X, y)\n\nprint(f\"Optimal number of features: {rfecv.n_features_}\")"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"In the present case, the model with 3 features (which corresponds to the true\ngenerative model) is found to be the most optimal.\n\n## Plot number of features VS. cross-validation scores\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nn_scores = len(rfecv.cv_results_[\"mean_test_score\"])\nplt.figure()\nplt.xlabel(\"Number of features selected\")\nplt.ylabel(\"Mean test accuracy\")\nplt.errorbar(\n range(min_features_to_select, n_scores + min_features_to_select),\n rfecv.cv_results_[\"mean_test_score\"],\n yerr=rfecv.cv_results_[\"std_test_score\"],\n)\nplt.title(\"Recursive Feature Elimination \\nwith correlated features\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"From the plot above one can further notice a plateau of equivalent scores\n(similar mean value and overlapping errorbars) for 3 to 5 selected features.\nThis is the result of introducing correlated features. Indeed, the optimal\nmodel selected by the RFE can lie within this range, depending on the\ncross-validation technique. The test accuracy decreases above 5 selected\nfeatures, this is, keeping non-informative features leads to over-fitting and\nis therefore detrimental for the statistical performance of the models.\n\n"
 ]
 }
 ],
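Also not part of the commit: a short sketch of how the plateau described in the new closing markdown cell could be checked numerically, by printing the mean and standard deviation of the test scores stored in cv_results_ for each candidate number of features. It continues from a fitted rfecv as in the sketch above; the overlap criterion used here is just one possible heuristic, not anything from the scikit-learn example.

# Assumes `rfecv` has already been fitted as in the previous sketch.
import numpy as np

mean_scores = rfecv.cv_results_["mean_test_score"]
std_scores = rfecv.cv_results_["std_test_score"]
# min_features_to_select is 1 in the example, so index i corresponds to i + 1 features.
n_features_grid = np.arange(1, len(mean_scores) + 1)

for n, m, s in zip(n_features_grid, mean_scores, std_scores):
    print(f"{n:2d} features: accuracy = {m:.3f} +/- {s:.3f}")

# Flag feature counts whose error bar overlaps the error bar of the best score
# (an illustrative heuristic for the "plateau of equivalent scores").
best_idx = mean_scores.argmax()
overlap = (mean_scores + std_scores) >= (mean_scores[best_idx] - std_scores[best_idx])
print("Plateau candidates (n_features):", n_features_grid[overlap])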

dev/_downloads/scikit-learn-docs.zip

-1.13 KB
Binary file not shown.
