Commit ba169a6 (1 parent: 3fdb804)

Pushing the docs to dev/ for branch: master, commit 0a5af0d2a11c64d59381110f3967acbe7d88a031
File tree: 1,232 files changed (+8415 additions, -3889 deletions)
Binary file not shown.
Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Successive Halving Iterations\n\nThis example illustrates how a successive halving search (\n:class:`~sklearn.model_selection.HalvingGridSearchCV` and\n:class:`~sklearn.model_selection.HalvingRandomSearchCV`) iteratively chooses\nthe best parameter combination out of multiple candidates.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import pandas as pd\nfrom sklearn import datasets\nimport matplotlib.pyplot as plt\nfrom scipy.stats import randint\nimport numpy as np\n\nfrom sklearn.experimental import enable_successive_halving  # noqa\nfrom sklearn.model_selection import HalvingRandomSearchCV\nfrom sklearn.ensemble import RandomForestClassifier\n\n\nprint(__doc__)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We first define the parameter space and train a\n:class:`~sklearn.model_selection.HalvingRandomSearchCV` instance.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "rng = np.random.RandomState(0)\n\nX, y = datasets.make_classification(n_samples=700, random_state=rng)\n\nclf = RandomForestClassifier(n_estimators=20, random_state=rng)\n\nparam_dist = {\"max_depth\": [3, None],\n              \"max_features\": randint(1, 11),\n              \"min_samples_split\": randint(2, 11),\n              \"bootstrap\": [True, False],\n              \"criterion\": [\"gini\", \"entropy\"]}\n\nrsh = HalvingRandomSearchCV(\n    estimator=clf,\n    param_distributions=param_dist,\n    factor=2,\n    random_state=rng)\nrsh.fit(X, y)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We can now use the `cv_results_` attribute of the search estimator to inspect\nand plot the evolution of the search.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "results = pd.DataFrame(rsh.cv_results_)\nresults['params_str'] = results.params.apply(str)\nresults.drop_duplicates(subset=('params_str', 'iter'), inplace=True)\nmean_scores = results.pivot(index='iter', columns='params_str',\n                            values='mean_test_score')\nax = mean_scores.plot(legend=False, alpha=.6)\n\nlabels = [\n    f'iter={i}\\nn_samples={rsh.n_resources_[i]}\\n'\n    f'n_candidates={rsh.n_candidates_[i]}'\n    for i in range(rsh.n_iterations_)\n]\nax.set_xticklabels(labels, rotation=45, multialignment='left')\nax.set_title('Scores of candidates over iterations')\nax.set_ylabel('mean test score', fontsize=15)\nax.set_xlabel('iterations', fontsize=15)\nplt.tight_layout()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Number of candidates and amount of resource at each iteration\n\nAt the first iteration, a small amount of resources is used. The resource\nhere is the number of samples that the estimators are trained on. All\ncandidates are evaluated.\n\nAt the second iteration, only the best half of the candidates is evaluated.\nThe number of allocated resources is doubled: candidates are evaluated on\ntwice as many samples.\n\nThis process is repeated until the last iteration, where only 2 candidates\nare left. The best candidate is the candidate that has the best score at the\nlast iteration.\n\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
"""
Successive Halving Iterations
=============================

This example illustrates how a successive halving search (
:class:`~sklearn.model_selection.HalvingGridSearchCV` and
:class:`~sklearn.model_selection.HalvingRandomSearchCV`) iteratively chooses
the best parameter combination out of multiple candidates.

"""
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy.stats import randint
import numpy as np

from sklearn.experimental import enable_successive_halving  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.ensemble import RandomForestClassifier


print(__doc__)

# %%
# We first define the parameter space and train a
# :class:`~sklearn.model_selection.HalvingRandomSearchCV` instance.

rng = np.random.RandomState(0)

X, y = datasets.make_classification(n_samples=700, random_state=rng)

clf = RandomForestClassifier(n_estimators=20, random_state=rng)

param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

rsh = HalvingRandomSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    factor=2,
    random_state=rng)
rsh.fit(X, y)

# %%
# We can now use the `cv_results_` attribute of the search estimator to
# inspect and plot the evolution of the search.

results = pd.DataFrame(rsh.cv_results_)
results['params_str'] = results.params.apply(str)
results.drop_duplicates(subset=('params_str', 'iter'), inplace=True)
mean_scores = results.pivot(index='iter', columns='params_str',
                            values='mean_test_score')
ax = mean_scores.plot(legend=False, alpha=.6)

labels = [
    f'iter={i}\nn_samples={rsh.n_resources_[i]}\n'
    f'n_candidates={rsh.n_candidates_[i]}'
    for i in range(rsh.n_iterations_)
]
ax.set_xticklabels(labels, rotation=45, multialignment='left')
ax.set_title('Scores of candidates over iterations')
ax.set_ylabel('mean test score', fontsize=15)
ax.set_xlabel('iterations', fontsize=15)
plt.tight_layout()
plt.show()

# %%
# Number of candidates and amount of resource at each iteration
# -------------------------------------------------------------
#
# At the first iteration, a small amount of resources is used. The resource
# here is the number of samples that the estimators are trained on. All
# candidates are evaluated.
#
# At the second iteration, only the best half of the candidates is evaluated.
# The number of allocated resources is doubled: candidates are evaluated on
# twice as many samples.
#
# This process is repeated until the last iteration, where only 2 candidates
# are left. The best candidate is the candidate that has the best score at the
# last iteration.
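
The halving schedule described in those closing comments can be read directly off the fitted search object. A minimal sketch (not part of this commit), assuming the script above has run so that `rsh` is fitted; `n_iterations_`, `n_candidates_` and `n_resources_` are the same attributes the plot labels already use:

# Sketch (not part of this commit): print the halving schedule of the
# fitted HalvingRandomSearchCV. With factor=2, the number of candidates
# roughly halves while the sample budget doubles at each iteration.
for i in range(rsh.n_iterations_):
    print(f"iter {i}: {rsh.n_candidates_[i]} candidates trained on "
          f"{rsh.n_resources_[i]} samples")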
Lines changed: 122 additions & 0 deletions

@@ -0,0 +1,122 @@
"""
Comparison between grid search and successive halving
=====================================================

This example compares the parameter search performed by
:class:`~sklearn.model_selection.HalvingGridSearchCV` and
:class:`~sklearn.model_selection.GridSearchCV`.

"""
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_successive_halving  # noqa
from sklearn.model_selection import HalvingGridSearchCV


print(__doc__)

# %%
# We first define the parameter space for an :class:`~sklearn.svm.SVC`
# estimator, and compute the time required to train a
# :class:`~sklearn.model_selection.HalvingGridSearchCV` instance, as well as a
# :class:`~sklearn.model_selection.GridSearchCV` instance.

rng = np.random.RandomState(0)
X, y = datasets.make_classification(n_samples=1000, random_state=rng)

gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
Cs = [1, 10, 100, 1e3, 1e4, 1e5]
param_grid = {'gamma': gammas, 'C': Cs}

clf = SVC(random_state=rng)

tic = time()
gsh = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, factor=2,
                          random_state=rng)
gsh.fit(X, y)
gsh_time = time() - tic

tic = time()
gs = GridSearchCV(estimator=clf, param_grid=param_grid)
gs.fit(X, y)
gs_time = time() - tic

# %%
# We now plot heatmaps for both search estimators.


def make_heatmap(ax, gs, is_sh=False, make_cbar=False):
    """Helper to make a heatmap."""
    results = pd.DataFrame.from_dict(gs.cv_results_)
    results['params_str'] = results.params.apply(str)
    if is_sh:
        # SH dataframe: get mean_test_score values for the highest iter
        scores_matrix = results.sort_values('iter').pivot_table(
            index='param_gamma', columns='param_C',
            values='mean_test_score', aggfunc='last'
        )
    else:
        scores_matrix = results.pivot(index='param_gamma', columns='param_C',
                                      values='mean_test_score')

    im = ax.imshow(scores_matrix)

    ax.set_xticks(np.arange(len(Cs)))
    ax.set_xticklabels(['{:.0E}'.format(x) for x in Cs])
    ax.set_xlabel('C', fontsize=15)

    ax.set_yticks(np.arange(len(gammas)))
    ax.set_yticklabels(['{:.0E}'.format(x) for x in gammas])
    ax.set_ylabel('gamma', fontsize=15)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    if is_sh:
        iterations = results.pivot_table(index='param_gamma',
                                         columns='param_C', values='iter',
                                         aggfunc='max').values
        for i in range(len(gammas)):
            for j in range(len(Cs)):
                ax.text(j, i, iterations[i, j],
                        ha="center", va="center", color="w", fontsize=20)

    if make_cbar:
        fig.subplots_adjust(right=0.8)
        cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
        fig.colorbar(im, cax=cbar_ax)
        cbar_ax.set_ylabel('mean_test_score', rotation=-90, va="bottom",
                           fontsize=15)


fig, axes = plt.subplots(ncols=2, sharey=True)
ax1, ax2 = axes

make_heatmap(ax1, gsh, is_sh=True)
make_heatmap(ax2, gs, make_cbar=True)

ax1.set_title('Successive Halving\ntime = {:.3f}s'.format(gsh_time),
              fontsize=15)
ax2.set_title('GridSearch\ntime = {:.3f}s'.format(gs_time), fontsize=15)

plt.show()

# %%
# The heatmaps show the mean test score of the parameter combinations for an
# :class:`~sklearn.svm.SVC` instance. The
# :class:`~sklearn.model_selection.HalvingGridSearchCV` also shows the
# iteration at which the combinations were last used. The combinations marked
# as ``0`` were only evaluated at the first iteration, while the ones with
# ``5`` are the parameter combinations that are considered the best ones.
#
# We can see that the :class:`~sklearn.model_selection.HalvingGridSearchCV`
# class is able to find parameter combinations that are just as accurate as
# :class:`~sklearn.model_selection.GridSearchCV`, in much less time.
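
To see why the best cells in the left heatmap end up labelled ``5``: the grid above has 7 gammas x 6 Cs = 42 candidates, and with factor=2 the halving rule keeps roughly half of them per iteration. A back-of-the-envelope sketch (not part of this commit; it assumes the sample budget does not cut the schedule short, and the authoritative counts live in `gsh.n_candidates_`):

import math

# Sketch (not part of this commit): replay the halving rule by hand.
# Keeping ceil(n / factor) candidates per step, the 42 grid combinations
# reach the final pair at iteration 5.
n, factor, i = 7 * 6, 2, 0
while True:
    print(f"iter {i}: {n} candidates")
    if n <= factor:
        break
    n = math.ceil(n / factor)
    i += 1
# expected output: 42, 21, 11, 6, 3, 2 candidates at iterations 0-5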
