Commit 5614bc3

Pushing the docs to dev/ for branch: main, commit 0a075179c69eb2edfc8cea44feb01ce28dfbf95b
1 parent c9698e8 commit 5614bc3

1,229 files changed: 5,140 additions, 4,596 deletions


dev/_downloads/2c8a162a0e436f4ca9af35453585fc81/plot_adaboost_hastie_10_2.py

Lines changed: 71 additions & 23 deletions
@@ -3,7 +3,7 @@
 Discrete versus Real AdaBoost
 =============================
 
-This example is based on Figure 10.2 from Hastie et al 2009 [1]_ and
+This notebook is based on Figure 10.2 from Hastie et al 2009 [1]_ and
 illustrates the difference in performance between the discrete SAMME [2]_
 boosting algorithm and real SAMME.R boosting algorithm. Both algorithms are
 evaluated on a binary classification task where the target Y is a non-linear
@@ -15,32 +15,44 @@
 .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
     Learning Ed. 2", Springer, 2009.
 
-.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
+.. [2] J Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost",
+    Statistics and Its Interface, 2009.
 
 """
 
-# Author: Peter Prettenhofer <[email protected]>,
-#         Noel Dawe <[email protected]>
+# %%
+# Preparing the data and baseline models
+# --------------------------------------
+# We start by generating the binary classification dataset
+# used in Hastie et al. 2009, Example 10.2.
+
+# Authors: Peter Prettenhofer <[email protected]>,
+#          Noel Dawe <[email protected]>
 #
 # License: BSD 3 clause
 
-import numpy as np
-import matplotlib.pyplot as plt
-
 from sklearn import datasets
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.metrics import zero_one_loss
-from sklearn.ensemble import AdaBoostClassifier
 
+X, y = datasets.make_hastie_10_2(n_samples=12_000, random_state=1)
+
+# %%
+# Now, we set the hyperparameters for our AdaBoost classifiers.
+# Be aware, a learning rate of 1.0 may not be optimal for both SAMME and SAMME.R
 
 n_estimators = 400
-# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
 learning_rate = 1.0
 
-X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
+# %%
+# We split the data into a training and a test set.
+# Then, we train our baseline classifiers, a `DecisionTreeClassifier` with `depth=9`
+# and a "stump" `DecisionTreeClassifier` with `depth=1` and compute the test error.
 
-X_test, y_test = X[2000:], y[2000:]
-X_train, y_train = X[:2000], y[:2000]
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=2_000, shuffle=False
+)
 
 dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
 dt_stump.fit(X_train, y_train)
@@ -50,6 +62,14 @@
 dt.fit(X_train, y_train)
 dt_err = 1.0 - dt.score(X_test, y_test)
 
+# %%
+# Adaboost with discrete SAMME and real SAMME.R
+# ---------------------------------------------
+# We now define the discrete and real AdaBoost classifiers
+# and fit them to the training set.
+
+from sklearn.ensemble import AdaBoostClassifier
+
 ada_discrete = AdaBoostClassifier(
     base_estimator=dt_stump,
     learning_rate=learning_rate,
@@ -58,6 +78,8 @@
 )
 ada_discrete.fit(X_train, y_train)
 
+# %%
+
 ada_real = AdaBoostClassifier(
     base_estimator=dt_stump,
     learning_rate=learning_rate,
@@ -66,11 +88,13 @@
 )
 ada_real.fit(X_train, y_train)
 
-fig = plt.figure()
-ax = fig.add_subplot(111)
+# %%
+# Now, let's compute the test error of the discrete and
+# real AdaBoost classifiers for each new stump in `n_estimators`
+# added to the ensemble.
 
-ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error")
-ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error")
+import numpy as np
+from sklearn.metrics import zero_one_loss
 
 ada_discrete_err = np.zeros((n_estimators,))
 for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
@@ -88,36 +112,60 @@
 for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
     ada_real_err_train[i] = zero_one_loss(y_pred, y_train)
 
+# %%
+# Plotting the results
+# --------------------
+# Finally, we plot the train and test errors of our baselines
+# and of the discrete and real AdaBoost classifiers
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+fig = plt.figure()
+ax = fig.add_subplot(111)
+
+ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error")
+ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error")
+
+colors = sns.color_palette("colorblind")
+
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_discrete_err,
     label="Discrete AdaBoost Test Error",
-    color="red",
+    color=colors[0],
 )
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_discrete_err_train,
     label="Discrete AdaBoost Train Error",
-    color="blue",
+    color=colors[1],
 )
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_real_err,
     label="Real AdaBoost Test Error",
-    color="orange",
+    color=colors[2],
 )
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_real_err_train,
     label="Real AdaBoost Train Error",
-    color="green",
+    color=colors[4],
 )
 
 ax.set_ylim((0.0, 0.5))
-ax.set_xlabel("n_estimators")
+ax.set_xlabel("Number of weak learners")
 ax.set_ylabel("error rate")
 
 leg = ax.legend(loc="upper right", fancybox=True)
 leg.get_frame().set_alpha(0.7)
 
 plt.show()
+
+# %%
+#
+# Concluding remarks
+# ------------------
+#
+# We observe that the error rate for both train and test sets of real AdaBoost
+# is lower than that of discrete AdaBoost.
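
The new cells above collect the staged train and test errors by preallocating arrays and filling them from `staged_predict`. As a quick numerical check of the concluding remark, the same comparison can be written with list comprehensions. The snippet below is only a sketch against the updated script, not part of the commit; it assumes the fitted `ada_discrete` and `ada_real` estimators and the `X_test`/`y_test` split defined there.

from sklearn.metrics import zero_one_loss

# staged_predict yields the ensemble's predictions after each boosting
# iteration, so the last entry is the error of the full 400-stump ensemble.
discrete_test_errors = [
    zero_one_loss(y_test, y_pred) for y_pred in ada_discrete.staged_predict(X_test)
]
real_test_errors = [
    zero_one_loss(y_test, y_pred) for y_pred in ada_real.staged_predict(X_test)
]

print(f"Discrete SAMME final test error: {discrete_test_errors[-1]:.3f}")
print(f"Real SAMME.R final test error:   {real_test_errors[-1]:.3f}")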

dev/_downloads/97c9b8aba1989fb600a73f3afb354726/plot_adaboost_hastie_10_2.ipynb

Lines changed: 117 additions & 2 deletions
@@ -15,7 +15,79 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Discrete versus Real AdaBoost\n\nThis example is based on Figure 10.2 from Hastie et al 2009 [1]_ and\nillustrates the difference in performance between the discrete SAMME [2]_\nboosting algorithm and real SAMME.R boosting algorithm. Both algorithms are\nevaluated on a binary classification task where the target Y is a non-linear\nfunction of 10 input features.\n\nDiscrete SAMME AdaBoost adapts based on errors in predicted class labels\nwhereas real SAMME.R uses the predicted class probabilities.\n\n.. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n    Learning Ed. 2\", Springer, 2009.\n\n.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n"
+"\n# Discrete versus Real AdaBoost\n\nThis notebook is based on Figure 10.2 from Hastie et al 2009 [1]_ and\nillustrates the difference in performance between the discrete SAMME [2]_\nboosting algorithm and real SAMME.R boosting algorithm. Both algorithms are\nevaluated on a binary classification task where the target Y is a non-linear\nfunction of 10 input features.\n\nDiscrete SAMME AdaBoost adapts based on errors in predicted class labels\nwhereas real SAMME.R uses the predicted class probabilities.\n\n.. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n    Learning Ed. 2\", Springer, 2009.\n\n.. [2] J Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\",\n   Statistics and Its Interface, 2009.\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Preparing the data and baseline models\nWe start by generating the binary classification dataset\nused in Hastie et al. 2009, Example 10.2.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"# Authors: Peter Prettenhofer <[email protected]>,\n# Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom sklearn import datasets\n\nX, y = datasets.make_hastie_10_2(n_samples=12_000, random_state=1)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Now, we set the hyperparameters for our AdaBoost classifiers.\nBe aware, a learning rate of 1.0 may not be optimal for both SAMME and SAMME.R\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"n_estimators = 400\nlearning_rate = 1.0"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We split the data into a training and a test set.\nThen, we train our baseline classifiers, a `DecisionTreeClassifier` with `depth=9`\nand a \"stump\" `DecisionTreeClassifier` with `depth=1` and compute the test error.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeClassifier\n\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=2_000, shuffle=False\n)\n\ndt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)\ndt_stump.fit(X_train, y_train)\ndt_stump_err = 1.0 - dt_stump.score(X_test, y_test)\n\ndt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)\ndt.fit(X_train, y_train)\ndt_err = 1.0 - dt.score(X_test, y_test)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Adaboost with discrete SAMME and real SAMME.R\nWe now define the discrete and real AdaBoost classifiers\nand fit them to the training set.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.ensemble import AdaBoostClassifier\n\nada_discrete = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME\",\n)\nada_discrete.fit(X_train, y_train)"
 ]
 },
 {
@@ -26,7 +98,50 @@
 },
 "outputs": [],
 "source": [
-"# Author: Peter Prettenhofer <[email protected]>,\n# Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import zero_one_loss\nfrom sklearn.ensemble import AdaBoostClassifier\n\n\nn_estimators = 400\n# A learning rate of 1. may not be optimal for both SAMME and SAMME.R\nlearning_rate = 1.0\n\nX, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)\n\nX_test, y_test = X[2000:], y[2000:]\nX_train, y_train = X[:2000], y[:2000]\n\ndt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)\ndt_stump.fit(X_train, y_train)\ndt_stump_err = 1.0 - dt_stump.score(X_test, y_test)\n\ndt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)\ndt.fit(X_train, y_train)\ndt_err = 1.0 - dt.score(X_test, y_test)\n\nada_discrete = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME\",\n)\nada_discrete.fit(X_train, y_train)\n\nada_real = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME.R\",\n)\nada_real.fit(X_train, y_train)\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot([1, n_estimators], [dt_stump_err] * 2, \"k-\", label=\"Decision Stump Error\")\nax.plot([1, n_estimators], [dt_err] * 2, \"k--\", label=\"Decision Tree Error\")\n\nada_discrete_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):\n    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)\n\nada_discrete_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):\n    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)\n\nada_real_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_test)):\n    ada_real_err[i] = zero_one_loss(y_pred, y_test)\n\nada_real_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_train)):\n    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)\n\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err,\n    label=\"Discrete AdaBoost Test Error\",\n    color=\"red\",\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err_train,\n    label=\"Discrete AdaBoost Train Error\",\n    color=\"blue\",\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err,\n    label=\"Real AdaBoost Test Error\",\n    color=\"orange\",\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err_train,\n    label=\"Real AdaBoost Train Error\",\n    color=\"green\",\n)\n\nax.set_ylim((0.0, 0.5))\nax.set_xlabel(\"n_estimators\")\nax.set_ylabel(\"error rate\")\n\nleg = ax.legend(loc=\"upper right\", fancybox=True)\nleg.get_frame().set_alpha(0.7)\n\nplt.show()"
+"ada_real = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME.R\",\n)\nada_real.fit(X_train, y_train)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Now, let's compute the test error of the discrete and\nreal AdaBoost classifiers for each new stump in `n_estimators`\nadded to the ensemble.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.metrics import zero_one_loss\n\nada_discrete_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):\n    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)\n\nada_discrete_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):\n    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)\n\nada_real_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_test)):\n    ada_real_err[i] = zero_one_loss(y_pred, y_test)\n\nada_real_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_train)):\n    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plotting the results\nFinally, we plot the train and test errors of our baselines\nand of the discrete and real AdaBoost classifiers\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\nimport seaborn as sns\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot([1, n_estimators], [dt_stump_err] * 2, \"k-\", label=\"Decision Stump Error\")\nax.plot([1, n_estimators], [dt_err] * 2, \"k--\", label=\"Decision Tree Error\")\n\ncolors = sns.color_palette(\"colorblind\")\n\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err,\n    label=\"Discrete AdaBoost Test Error\",\n    color=colors[0],\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err_train,\n    label=\"Discrete AdaBoost Train Error\",\n    color=colors[1],\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err,\n    label=\"Real AdaBoost Test Error\",\n    color=colors[2],\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err_train,\n    label=\"Real AdaBoost Train Error\",\n    color=colors[4],\n)\n\nax.set_ylim((0.0, 0.5))\nax.set_xlabel(\"Number of weak learners\")\nax.set_ylabel(\"error rate\")\n\nleg = ax.legend(loc=\"upper right\", fancybox=True)\nleg.get_frame().set_alpha(0.7)\n\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Concluding remarks\n\nWe observe that the error rate for both train and test sets of real AdaBoost\nis lower than that of discrete AdaBoost.\n\n"
 ]
 }
 ],

dev/_downloads/scikit-learn-docs.zip

Binary file not shown.
