
Commit c171049

Pushing the docs to dev/ for branch: main, commit d7ddffe9c1770596f5d5194486f763ac667fa64c
1 parent 868255d commit c171049

1,239 files changed: +4,393 additions, −4,336 deletions


dev/_downloads/4e46f015ab8300f262e6e8775bcdcf8a/plot_adaboost_multiclass.py

Lines changed: 2 additions & 2 deletions
@@ -46,12 +46,12 @@
 y_train, y_test = y[:n_split], y[n_split:]
 
 bdt_real = AdaBoostClassifier(
-    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1
+    DecisionTreeClassifier(max_depth=2), n_estimators=300, learning_rate=1
 )
 
 bdt_discrete = AdaBoostClassifier(
     DecisionTreeClassifier(max_depth=2),
-    n_estimators=600,
+    n_estimators=300,
     learning_rate=1.5,
     algorithm="SAMME",
 )

dev/_downloads/607c99671400a5055ef516d1aabd00c1/plot_adaboost_multiclass.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_gaussian_quantiles\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nX, y = make_gaussian_quantiles(\n n_samples=13000, n_features=10, n_classes=3, random_state=1\n)\n\nn_split = 3000\n\nX_train, X_test = X[:n_split], X[n_split:]\ny_train, y_test = y[:n_split], y[n_split:]\n\nbdt_real = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1\n)\n\nbdt_discrete = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=600,\n learning_rate=1.5,\n algorithm=\"SAMME\",\n)\n\nbdt_real.fit(X_train, y_train)\nbdt_discrete.fit(X_train, y_train)\n\nreal_test_errors = []\ndiscrete_test_errors = []\n\nfor real_test_predict, discrete_train_predict in zip(\n bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)\n):\n real_test_errors.append(1.0 - accuracy_score(real_test_predict, y_test))\n discrete_test_errors.append(1.0 - accuracy_score(discrete_train_predict, y_test))\n\nn_trees_discrete = len(bdt_discrete)\nn_trees_real = len(bdt_real)\n\n# Boosting might terminate early, but the following arrays are always\n# n_estimators long. We crop them to the actual number of trees here:\ndiscrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\nreal_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\ndiscrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]\n\nplt.figure(figsize=(15, 5))\n\nplt.subplot(131)\nplt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c=\"black\", label=\"SAMME\")\nplt.plot(\n range(1, n_trees_real + 1),\n real_test_errors,\n c=\"black\",\n linestyle=\"dashed\",\n label=\"SAMME.R\",\n)\nplt.legend()\nplt.ylim(0.18, 0.62)\nplt.ylabel(\"Test Error\")\nplt.xlabel(\"Number of Trees\")\n\nplt.subplot(132)\nplt.plot(\n range(1, n_trees_discrete + 1),\n discrete_estimator_errors,\n \"b\",\n label=\"SAMME\",\n alpha=0.5,\n)\nplt.plot(\n range(1, n_trees_real + 1), real_estimator_errors, \"r\", label=\"SAMME.R\", alpha=0.5\n)\nplt.legend()\nplt.ylabel(\"Error\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0.2, max(real_estimator_errors.max(), discrete_estimator_errors.max()) * 1.2))\nplt.xlim((-20, len(bdt_discrete) + 20))\n\nplt.subplot(133)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, \"b\", label=\"SAMME\")\nplt.legend()\nplt.ylabel(\"Weight\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0, discrete_estimator_weights.max() * 1.2))\nplt.xlim((-20, n_trees_discrete + 20))\n\n# prevent overlapping y-axis labels\nplt.subplots_adjust(wspace=0.25)\nplt.show()"
+"# Author: Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_gaussian_quantiles\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nX, y = make_gaussian_quantiles(\n n_samples=13000, n_features=10, n_classes=3, random_state=1\n)\n\nn_split = 3000\n\nX_train, X_test = X[:n_split], X[n_split:]\ny_train, y_test = y[:n_split], y[n_split:]\n\nbdt_real = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2), n_estimators=300, learning_rate=1\n)\n\nbdt_discrete = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=300,\n learning_rate=1.5,\n algorithm=\"SAMME\",\n)\n\nbdt_real.fit(X_train, y_train)\nbdt_discrete.fit(X_train, y_train)\n\nreal_test_errors = []\ndiscrete_test_errors = []\n\nfor real_test_predict, discrete_train_predict in zip(\n bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)\n):\n real_test_errors.append(1.0 - accuracy_score(real_test_predict, y_test))\n discrete_test_errors.append(1.0 - accuracy_score(discrete_train_predict, y_test))\n\nn_trees_discrete = len(bdt_discrete)\nn_trees_real = len(bdt_real)\n\n# Boosting might terminate early, but the following arrays are always\n# n_estimators long. We crop them to the actual number of trees here:\ndiscrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\nreal_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\ndiscrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]\n\nplt.figure(figsize=(15, 5))\n\nplt.subplot(131)\nplt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c=\"black\", label=\"SAMME\")\nplt.plot(\n range(1, n_trees_real + 1),\n real_test_errors,\n c=\"black\",\n linestyle=\"dashed\",\n label=\"SAMME.R\",\n)\nplt.legend()\nplt.ylim(0.18, 0.62)\nplt.ylabel(\"Test Error\")\nplt.xlabel(\"Number of Trees\")\n\nplt.subplot(132)\nplt.plot(\n range(1, n_trees_discrete + 1),\n discrete_estimator_errors,\n \"b\",\n label=\"SAMME\",\n alpha=0.5,\n)\nplt.plot(\n range(1, n_trees_real + 1), real_estimator_errors, \"r\", label=\"SAMME.R\", alpha=0.5\n)\nplt.legend()\nplt.ylabel(\"Error\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0.2, max(real_estimator_errors.max(), discrete_estimator_errors.max()) * 1.2))\nplt.xlim((-20, len(bdt_discrete) + 20))\n\nplt.subplot(133)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, \"b\", label=\"SAMME\")\nplt.legend()\nplt.ylabel(\"Weight\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0, discrete_estimator_weights.max() * 1.2))\nplt.xlim((-20, n_trees_discrete + 20))\n\n# prevent overlapping y-axis labels\nplt.subplots_adjust(wspace=0.25)\nplt.show()"
 ]
 }
 ],

dev/_downloads/733ff7845fe2f197ecd0c72afcf23651/plot_randomized_search.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\n\nfrom time import time\nimport scipy.stats as stats\nfrom sklearn.utils.fixes import loguniform\n\nfrom sklearn.model_selection import GridSearchCV, RandomizedSearchCV\nfrom sklearn.datasets import load_digits\nfrom sklearn.linear_model import SGDClassifier\n\n# get some data\nX, y = load_digits(return_X_y=True)\n\n# build a classifier\nclf = SGDClassifier(loss=\"hinge\", penalty=\"elasticnet\", fit_intercept=True)\n\n\n# Utility function to report best scores\ndef report(results, n_top=3):\n for i in range(1, n_top + 1):\n candidates = np.flatnonzero(results[\"rank_test_score\"] == i)\n for candidate in candidates:\n print(\"Model with rank: {0}\".format(i))\n print(\n \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n results[\"mean_test_score\"][candidate],\n results[\"std_test_score\"][candidate],\n )\n )\n print(\"Parameters: {0}\".format(results[\"params\"][candidate]))\n print(\"\")\n\n\n# specify parameters and distributions to sample from\nparam_dist = {\n \"average\": [True, False],\n \"l1_ratio\": stats.uniform(0, 1),\n \"alpha\": loguniform(1e-4, 1e0),\n}\n\n# run randomized search\nn_iter_search = 20\nrandom_search = RandomizedSearchCV(\n clf, param_distributions=param_dist, n_iter=n_iter_search\n)\n\nstart = time()\nrandom_search.fit(X, y)\nprint(\n \"RandomizedSearchCV took %.2f seconds for %d candidates parameter settings.\"\n % ((time() - start), n_iter_search)\n)\nreport(random_search.cv_results_)\n\n# use a full grid over all parameters\nparam_grid = {\n \"average\": [True, False],\n \"l1_ratio\": np.linspace(0, 1, num=10),\n \"alpha\": np.power(10, np.arange(-4, 1, dtype=float)),\n}\n\n# run grid search\ngrid_search = GridSearchCV(clf, param_grid=param_grid)\nstart = time()\ngrid_search.fit(X, y)\n\nprint(\n \"GridSearchCV took %.2f seconds for %d candidate parameter settings.\"\n % (time() - start, len(grid_search.cv_results_[\"params\"]))\n)\nreport(grid_search.cv_results_)"
+"import numpy as np\n\nfrom time import time\nimport scipy.stats as stats\nfrom sklearn.utils.fixes import loguniform\n\nfrom sklearn.model_selection import GridSearchCV, RandomizedSearchCV\nfrom sklearn.datasets import load_digits\nfrom sklearn.linear_model import SGDClassifier\n\n# get some data\nX, y = load_digits(return_X_y=True, n_class=3)\n\n# build a classifier\nclf = SGDClassifier(loss=\"hinge\", penalty=\"elasticnet\", fit_intercept=True)\n\n\n# Utility function to report best scores\ndef report(results, n_top=3):\n for i in range(1, n_top + 1):\n candidates = np.flatnonzero(results[\"rank_test_score\"] == i)\n for candidate in candidates:\n print(\"Model with rank: {0}\".format(i))\n print(\n \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n results[\"mean_test_score\"][candidate],\n results[\"std_test_score\"][candidate],\n )\n )\n print(\"Parameters: {0}\".format(results[\"params\"][candidate]))\n print(\"\")\n\n\n# specify parameters and distributions to sample from\nparam_dist = {\n \"average\": [True, False],\n \"l1_ratio\": stats.uniform(0, 1),\n \"alpha\": loguniform(1e-2, 1e0),\n}\n\n# run randomized search\nn_iter_search = 15\nrandom_search = RandomizedSearchCV(\n clf, param_distributions=param_dist, n_iter=n_iter_search\n)\n\nstart = time()\nrandom_search.fit(X, y)\nprint(\n \"RandomizedSearchCV took %.2f seconds for %d candidates parameter settings.\"\n % ((time() - start), n_iter_search)\n)\nreport(random_search.cv_results_)\n\n# use a full grid over all parameters\nparam_grid = {\n \"average\": [True, False],\n \"l1_ratio\": np.linspace(0, 1, num=10),\n \"alpha\": np.power(10, np.arange(-2, 1, dtype=float)),\n}\n\n# run grid search\ngrid_search = GridSearchCV(clf, param_grid=param_grid)\nstart = time()\ngrid_search.fit(X, y)\n\nprint(\n \"GridSearchCV took %.2f seconds for %d candidate parameter settings.\"\n % (time() - start, len(grid_search.cv_results_[\"params\"]))\n)\nreport(grid_search.cv_results_)"
 ]
 }
 ],
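The distribution that changes in this diff, loguniform(1e-2, 1e0) for alpha, draws values uniformly on a log scale between 0.01 and 1, which is what lets RandomizedSearchCV cover several orders of magnitude with only a handful of candidates. A minimal sketch of what such a draw looks like follows; the sample size and random_state are illustrative assumptions, not part of the commit.

# Illustrative only: inspect a few draws from the narrowed alpha distribution
# used in the example above. loguniform behaves like a frozen scipy.stats
# distribution, so .rvs() is available on it.
from sklearn.utils.fixes import loguniform

alpha_dist = loguniform(1e-2, 1e0)
# values are spread evenly in log space, all within [0.01, 1]
print(alpha_dist.rvs(size=5, random_state=0))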

dev/_downloads/74caedf3eb449b80f3f00e66c1c576bd/plot_discretization_classification.py

Lines changed: 40 additions & 21 deletions
@@ -61,33 +61,53 @@ def get_name(estimator):
 
 
 # list of (estimator, param_grid), where param_grid is used in GridSearchCV
+# The parameter spaces in this example are limited to a narrow band to reduce
+# its runtime. In a real use case, a broader search space for the algorithms
+# should be used.
 classifiers = [
-    (LogisticRegression(random_state=0), {"C": np.logspace(-2, 7, 10)}),
-    (LinearSVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
+    (
+        make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),
+        {"logisticregression__C": np.logspace(-1, 1, 3)},
+    ),
+    (
+        make_pipeline(StandardScaler(), LinearSVC(random_state=0)),
+        {"linearsvc__C": np.logspace(-1, 1, 3)},
+    ),
     (
         make_pipeline(
-            KBinsDiscretizer(encode="onehot"), LogisticRegression(random_state=0)
+            StandardScaler(),
+            KBinsDiscretizer(encode="onehot"),
+            LogisticRegression(random_state=0),
         ),
         {
-            "kbinsdiscretizer__n_bins": np.arange(2, 10),
-            "logisticregression__C": np.logspace(-2, 7, 10),
+            "kbinsdiscretizer__n_bins": np.arange(5, 8),
+            "logisticregression__C": np.logspace(-1, 1, 3),
         },
     ),
     (
-        make_pipeline(KBinsDiscretizer(encode="onehot"), LinearSVC(random_state=0)),
+        make_pipeline(
+            StandardScaler(),
+            KBinsDiscretizer(encode="onehot"),
+            LinearSVC(random_state=0),
+        ),
         {
-            "kbinsdiscretizer__n_bins": np.arange(2, 10),
-            "linearsvc__C": np.logspace(-2, 7, 10),
+            "kbinsdiscretizer__n_bins": np.arange(5, 8),
+            "linearsvc__C": np.logspace(-1, 1, 3),
         },
     ),
     (
-        GradientBoostingClassifier(n_estimators=50, random_state=0),
-        {"learning_rate": np.logspace(-4, 0, 10)},
+        make_pipeline(
+            StandardScaler(), GradientBoostingClassifier(n_estimators=5, random_state=0)
+        ),
+        {"gradientboostingclassifier__learning_rate": np.logspace(-2, 0, 5)},
+    ),
+    (
+        make_pipeline(StandardScaler(), SVC(random_state=0)),
+        {"svc__C": np.logspace(-1, 1, 3)},
     ),
-    (SVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
 ]
 
-names = [get_name(e) for e, g in classifiers]
+names = [get_name(e).replace("StandardScaler + ", "") for e, _ in classifiers]
 
 n_samples = 100
 datasets = [
@@ -107,15 +127,14 @@ def get_name(estimator):
     nrows=len(datasets), ncols=len(classifiers) + 1, figsize=(21, 9)
 )
 
-cm = plt.cm.PiYG
+cm_piyg = plt.cm.PiYG
 cm_bright = ListedColormap(["#b30065", "#178000"])
 
 # iterate over datasets
 for ds_cnt, (X, y) in enumerate(datasets):
-    print("\ndataset %d\n---------" % ds_cnt)
+    print(f"\ndataset {ds_cnt}\n---------")
 
-    # preprocess dataset, split into training and test part
-    X = StandardScaler().fit_transform(X)
+    # split into training and test part
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.5, random_state=42
     )
@@ -148,18 +167,18 @@ def get_name(estimator):
         with ignore_warnings(category=ConvergenceWarning):
             clf.fit(X_train, y_train)
         score = clf.score(X_test, y_test)
-        print("%s: %.2f" % (name, score))
+        print(f"{name}: {score:.2f}")
 
         # plot the decision boundary. For that, we will assign a color to each
         # point in the mesh [x_min, x_max]*[y_min, y_max].
         if hasattr(clf, "decision_function"):
-            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+            Z = clf.decision_function(np.column_stack([xx.ravel(), yy.ravel()]))
         else:
-            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+            Z = clf.predict_proba(np.column_stack([xx.ravel(), yy.ravel()]))[:, 1]
 
         # put the result into a color plot
         Z = Z.reshape(xx.shape)
-        ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
+        ax.contourf(xx, yy, Z, cmap=cm_piyg, alpha=0.8)
 
         # plot the training points
         ax.scatter(
@@ -184,7 +203,7 @@ def get_name(estimator):
         ax.text(
             0.95,
             0.06,
-            ("%.2f" % score).lstrip("0"),
+            (f"{score:.2f}").lstrip("0"),
             size=15,
             bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"),
             transform=ax.transAxes,
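The pattern that recurs throughout this diff is moving StandardScaler into the pipeline that GridSearchCV tunes, so scaling is refit on each training fold rather than applied to the whole dataset up front, with hyperparameters addressed through the step-name prefix (for example logisticregression__C). Below is a minimal, self-contained sketch of that pattern; the make_classification toy data and the tiny parameter grid are assumptions chosen only to keep the sketch short, not values from the commit.

# Sketch of the pipeline-inside-GridSearchCV pattern adopted in the example
# above; the dataset and grid here are illustrative only.
import numpy as np

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# The scaler is a pipeline step, so GridSearchCV refits it on every training
# fold; step parameters use the "<step name>__<parameter>" convention.
search = GridSearchCV(
    estimator=make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),
    param_grid={"logisticregression__C": np.logspace(-1, 1, 3)},
)
search.fit(X_train, y_train)
print(search.best_params_, search.score(X_test, y_test))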
