
Commit b46b2c2

Pushing the docs to dev/ for branch: main, commit c9e5067cb14de578ab48b64f399743b994e3ca94
1 parent 7587d43 commit b46b2c2

1,230 files changed: +4607 -4496 lines changed


dev/_downloads/6c50dbd9c6dc52f3da913f8d8f82274d/plot_ensemble_oob.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Kian Ho <[email protected]>\n# Gilles Louppe <[email protected]>\n# Andreas Mueller <[email protected]>\n#\n# License: BSD 3 Clause\n\nimport matplotlib.pyplot as plt\n\nfrom collections import OrderedDict\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\n\nRANDOM_STATE = 123\n\n# Generate a binary classification dataset.\nX, y = make_classification(\n n_samples=500,\n n_features=25,\n n_clusters_per_class=1,\n n_informative=15,\n random_state=RANDOM_STATE,\n)\n\n# NOTE: Setting the `warm_start` construction parameter to `True` disables\n# support for parallelized ensembles but is necessary for tracking the OOB\n# error trajectory during training.\nensemble_clfs = [\n (\n \"RandomForestClassifier, max_features='sqrt'\",\n RandomForestClassifier(\n warm_start=True,\n oob_score=True,\n max_features=\"sqrt\",\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features='log2'\",\n RandomForestClassifier(\n warm_start=True,\n max_features=\"log2\",\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features=None\",\n RandomForestClassifier(\n warm_start=True,\n max_features=None,\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n]\n\n# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.\nerror_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)\n\n# Range of `n_estimators` values to explore.\nmin_estimators = 15\nmax_estimators = 175\n\nfor label, clf in ensemble_clfs:\n for i in range(min_estimators, max_estimators + 1):\n clf.set_params(n_estimators=i)\n clf.fit(X, y)\n\n # Record the OOB error for each `n_estimators=i` setting.\n oob_error = 1 - clf.oob_score_\n error_rate[label].append((i, oob_error))\n\n# Generate the \"OOB error rate\" vs. \"n_estimators\" plot.\nfor label, clf_err in error_rate.items():\n xs, ys = zip(*clf_err)\n plt.plot(xs, ys, label=label)\n\nplt.xlim(min_estimators, max_estimators)\nplt.xlabel(\"n_estimators\")\nplt.ylabel(\"OOB error rate\")\nplt.legend(loc=\"upper right\")\nplt.show()"
+"# Author: Kian Ho <[email protected]>\n# Gilles Louppe <[email protected]>\n# Andreas Mueller <[email protected]>\n#\n# License: BSD 3 Clause\n\nimport matplotlib.pyplot as plt\n\nfrom collections import OrderedDict\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\n\nRANDOM_STATE = 123\n\n# Generate a binary classification dataset.\nX, y = make_classification(\n n_samples=500,\n n_features=25,\n n_clusters_per_class=1,\n n_informative=15,\n random_state=RANDOM_STATE,\n)\n\n# NOTE: Setting the `warm_start` construction parameter to `True` disables\n# support for parallelized ensembles but is necessary for tracking the OOB\n# error trajectory during training.\nensemble_clfs = [\n (\n \"RandomForestClassifier, max_features='sqrt'\",\n RandomForestClassifier(\n warm_start=True,\n oob_score=True,\n max_features=\"sqrt\",\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features='log2'\",\n RandomForestClassifier(\n warm_start=True,\n max_features=\"log2\",\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features=None\",\n RandomForestClassifier(\n warm_start=True,\n max_features=None,\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n]\n\n# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.\nerror_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)\n\n# Range of `n_estimators` values to explore.\nmin_estimators = 15\nmax_estimators = 150\n\nfor label, clf in ensemble_clfs:\n for i in range(min_estimators, max_estimators + 1, 5):\n clf.set_params(n_estimators=i)\n clf.fit(X, y)\n\n # Record the OOB error for each `n_estimators=i` setting.\n oob_error = 1 - clf.oob_score_\n error_rate[label].append((i, oob_error))\n\n# Generate the \"OOB error rate\" vs. \"n_estimators\" plot.\nfor label, clf_err in error_rate.items():\n xs, ys = zip(*clf_err)\n plt.plot(xs, ys, label=label)\n\nplt.xlim(min_estimators, max_estimators)\nplt.xlabel(\"n_estimators\")\nplt.ylabel(\"OOB error rate\")\nplt.legend(loc=\"upper right\")\nplt.show()"
 ]
 }
],

dev/_downloads/75191b2eb3b4aa13066927321dd3fdcf/plot_ensemble_oob.py

Lines changed: 2 additions & 2 deletions
@@ -81,10 +81,10 @@
 
 # Range of `n_estimators` values to explore.
 min_estimators = 15
-max_estimators = 175
+max_estimators = 150
 
 for label, clf in ensemble_clfs:
-    for i in range(min_estimators, max_estimators + 1):
+    for i in range(min_estimators, max_estimators + 1, 5):
         clf.set_params(n_estimators=i)
         clf.fit(X, y)
 
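
The change above only thins out the sweep: with warm_start=True each call to fit() adds trees on top of the already-fitted ensemble, so stepping n_estimators by 5 (15, 20, ..., 150) means 28 refits per classifier instead of the 161 needed for every value from 15 to 175. A minimal, self-contained sketch of that warm-start pattern (toy data and variable names are illustrative, not taken verbatim from the example):

# Sketch: incrementally growing a random forest with warm_start and tracking
# the out-of-bag (OOB) error after each increment. Toy data; illustrative names.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=25, random_state=123)

clf = RandomForestClassifier(warm_start=True, oob_score=True, random_state=123)

oob_errors = []
for n_trees in range(15, 151, 5):
    clf.set_params(n_estimators=n_trees)
    clf.fit(X, y)  # with warm_start=True, only the newly added trees are fitted
    oob_errors.append((n_trees, 1 - clf.oob_score_))

print(oob_errors[-1])  # OOB error with 150 trees
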
dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py

Lines changed: 55 additions & 18 deletions
@@ -26,14 +26,44 @@
 # %%
 # Load Ames Housing dataset
 # -------------------------
-# First, we load the ames housing data as a pandas dataframe. The features
+# First, we load the Ames Housing data as a pandas dataframe. The features
 # are either categorical or numerical:
 from sklearn.datasets import fetch_openml
 
 X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
 
-n_categorical_features = (X.dtypes == "category").sum()
-n_numerical_features = (X.dtypes == "float").sum()
+# Select only a subset of features of X to make the example faster to run
+categorical_columns_subset = [
+    "Bldg_Type",
+    "Garage_Finish",
+    "Lot_Config",
+    "Functional",
+    "Mas_Vnr_Type",
+    "House_Style",
+    "Fireplace_Qu",
+    "Exter_Cond",
+    "Exter_Qual",
+    "Pool_QC",
+]
+
+numerical_columns_subset = [
+    "Three_season_porch",
+    "Fireplaces",
+    "Bsmt_Half_Bath",
+    "Half_Bath",
+    "Garage_Cars",
+    "TotRms_AbvGrd",
+    "BsmtFin_SF_1",
+    "BsmtFin_SF_2",
+    "Gr_Liv_Area",
+    "Screen_Porch",
+]
+
+X = X[categorical_columns_subset + numerical_columns_subset]
+
+n_categorical_features = X.select_dtypes(include="category").shape[1]
+n_numerical_features = X.select_dtypes(include="number").shape[1]
+
 print(f"Number of samples: {X.shape[0]}")
 print(f"Number of features: {X.shape[1]}")
 print(f"Number of categorical features: {n_categorical_features}")
@@ -114,6 +144,7 @@
 
 # The ordinal encoder will first output the categorical features, and then the
 # continuous (passed-through) features
+
 categorical_mask = [True] * n_categorical_features + [False] * n_numerical_features
 hist_native = make_pipeline(
     ordinal_encoder,
@@ -134,18 +165,20 @@
 import matplotlib.pyplot as plt
 
 scoring = "neg_mean_absolute_percentage_error"
-dropped_result = cross_validate(hist_dropped, X, y, cv=3, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=3, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=3, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=3, scoring=scoring)
+n_cv_folds = 3
+
+dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
+one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
+ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
+native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
 
 
 def plot_results(figure_title):
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
 
     plot_info = [
         ("fit_time", "Fit times (s)", ax1, None),
-        ("test_score", "Mean Absolute Percentage Error", ax2, (0, 0.20)),
+        ("test_score", "Mean Absolute Percentage Error", ax2, None),
     ]
 
     x, width = np.arange(4), 0.9
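
For reference, cross_validate returns a dict of per-fold arrays (fit_time, score_time, test_score), so with n_cv_folds = 3 each *_result above holds three values that plot_results later averages and turns into error bars; because the scorer is negated, np.abs recovers the MAPE. A small sketch with a placeholder estimator and synthetic data (not the pipelines benchmarked here):

# Placeholder estimator and synthetic data; only meant to show the shape of
# the cross_validate output consumed by plot_results.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate

X_demo, y_demo = make_regression(n_samples=90, n_features=5, random_state=0)
result = cross_validate(
    Ridge(), X_demo, y_demo, cv=3, scoring="neg_mean_absolute_percentage_error"
)

print(sorted(result))                         # ['fit_time', 'score_time', 'test_score']
print(result["test_score"].shape)             # (3,) -- one negated MAPE per fold
print(np.mean(np.abs(result["test_score"])))  # mean MAPE across folds
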
@@ -156,11 +189,15 @@ def plot_results(figure_title):
             ordinal_result[key],
             native_result[key],
         ]
+
+        mape_cv_mean = [np.mean(np.abs(item)) for item in items]
+        mape_cv_std = [np.std(item) for item in items]
+
         ax.bar(
-            x,
-            [np.mean(np.abs(item)) for item in items],
-            width,
-            yerr=[np.std(item) for item in items],
+            x=x,
+            height=mape_cv_mean,
+            width=width,
+            yerr=mape_cv_std,
             color=["C0", "C1", "C2", "C3"],
         )
         ax.set(
@@ -173,7 +210,7 @@ def plot_results(figure_title):
     fig.suptitle(figure_title)
 
 
-plot_results("Gradient Boosting on Adult Census")
+plot_results("Gradient Boosting on Ames Housing")
 
 # %%
 # We see that the model with one-hot-encoded data is by far the slowest. This
@@ -219,12 +256,12 @@ def plot_results(figure_title):
     histgradientboostingregressor__max_iter=15,
 )
 
-dropped_result = cross_validate(hist_dropped, X, y, cv=3, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=3, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=3, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=3, scoring=scoring)
+dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
+one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
+ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
+native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
 
-plot_results("Gradient Boosting on Adult Census (few and small trees)")
+plot_results("Gradient Boosting on Ames Housing (few and small trees)")
 
 plt.show()
 
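
The ax.bar refactor in this file is behavior-preserving: positional arguments become keyword arguments and the per-model mean/std of the cross-validated MAPE are hoisted into named variables before the call. A standalone sketch of that call with made-up numbers (not results from this benchmark):

# Standalone sketch of the keyword-argument Axes.bar call; numbers are made up.
import matplotlib.pyplot as plt
import numpy as np

x, width = np.arange(4), 0.9
mape_cv_mean = [0.12, 0.18, 0.11, 0.10]  # illustrative per-model means
mape_cv_std = [0.01, 0.02, 0.01, 0.01]   # illustrative per-model stds

fig, ax = plt.subplots()
ax.bar(
    x=x,
    height=mape_cv_mean,
    width=width,
    yerr=mape_cv_std,
    color=["C0", "C1", "C2", "C3"],
)
ax.set(
    xticks=x,
    xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
    title="Mean Absolute Percentage Error",
)
plt.show()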