
Commit b46b2c2

Pushing the docs to dev/ for branch: main, commit c9e5067cb14de578ab48b64f399743b994e3ca94
1 parent 7587d43 commit b46b2c2

1,230 files changed: +4607 -4496 lines changed


dev/_downloads/6c50dbd9c6dc52f3da913f8d8f82274d/plot_ensemble_oob.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Kian Ho <[email protected]>\n# Gilles Louppe <[email protected]>\n# Andreas Mueller <[email protected]>\n#\n# License: BSD 3 Clause\n\nimport matplotlib.pyplot as plt\n\nfrom collections import OrderedDict\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\n\nRANDOM_STATE = 123\n\n# Generate a binary classification dataset.\nX, y = make_classification(\n n_samples=500,\n n_features=25,\n n_clusters_per_class=1,\n n_informative=15,\n random_state=RANDOM_STATE,\n)\n\n# NOTE: Setting the `warm_start` construction parameter to `True` disables\n# support for parallelized ensembles but is necessary for tracking the OOB\n# error trajectory during training.\nensemble_clfs = [\n (\n \"RandomForestClassifier, max_features='sqrt'\",\n RandomForestClassifier(\n warm_start=True,\n oob_score=True,\n max_features=\"sqrt\",\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features='log2'\",\n RandomForestClassifier(\n warm_start=True,\n max_features=\"log2\",\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features=None\",\n RandomForestClassifier(\n warm_start=True,\n max_features=None,\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n]\n\n# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.\nerror_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)\n\n# Range of `n_estimators` values to explore.\nmin_estimators = 15\nmax_estimators = 175\n\nfor label, clf in ensemble_clfs:\n for i in range(min_estimators, max_estimators + 1):\n clf.set_params(n_estimators=i)\n clf.fit(X, y)\n\n # Record the OOB error for each `n_estimators=i` setting.\n oob_error = 1 - clf.oob_score_\n error_rate[label].append((i, oob_error))\n\n# Generate the \"OOB error rate\" vs. \"n_estimators\" plot.\nfor label, clf_err in error_rate.items():\n xs, ys = zip(*clf_err)\n plt.plot(xs, ys, label=label)\n\nplt.xlim(min_estimators, max_estimators)\nplt.xlabel(\"n_estimators\")\nplt.ylabel(\"OOB error rate\")\nplt.legend(loc=\"upper right\")\nplt.show()"
+"# Author: Kian Ho <[email protected]>\n# Gilles Louppe <[email protected]>\n# Andreas Mueller <[email protected]>\n#\n# License: BSD 3 Clause\n\nimport matplotlib.pyplot as plt\n\nfrom collections import OrderedDict\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\n\nRANDOM_STATE = 123\n\n# Generate a binary classification dataset.\nX, y = make_classification(\n n_samples=500,\n n_features=25,\n n_clusters_per_class=1,\n n_informative=15,\n random_state=RANDOM_STATE,\n)\n\n# NOTE: Setting the `warm_start` construction parameter to `True` disables\n# support for parallelized ensembles but is necessary for tracking the OOB\n# error trajectory during training.\nensemble_clfs = [\n (\n \"RandomForestClassifier, max_features='sqrt'\",\n RandomForestClassifier(\n warm_start=True,\n oob_score=True,\n max_features=\"sqrt\",\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features='log2'\",\n RandomForestClassifier(\n warm_start=True,\n max_features=\"log2\",\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n (\n \"RandomForestClassifier, max_features=None\",\n RandomForestClassifier(\n warm_start=True,\n max_features=None,\n oob_score=True,\n random_state=RANDOM_STATE,\n ),\n ),\n]\n\n# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.\nerror_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)\n\n# Range of `n_estimators` values to explore.\nmin_estimators = 15\nmax_estimators = 150\n\nfor label, clf in ensemble_clfs:\n for i in range(min_estimators, max_estimators + 1, 5):\n clf.set_params(n_estimators=i)\n clf.fit(X, y)\n\n # Record the OOB error for each `n_estimators=i` setting.\n oob_error = 1 - clf.oob_score_\n error_rate[label].append((i, oob_error))\n\n# Generate the \"OOB error rate\" vs. \"n_estimators\" plot.\nfor label, clf_err in error_rate.items():\n xs, ys = zip(*clf_err)\n plt.plot(xs, ys, label=label)\n\nplt.xlim(min_estimators, max_estimators)\nplt.xlabel(\"n_estimators\")\nplt.ylabel(\"OOB error rate\")\nplt.legend(loc=\"upper right\")\nplt.show()"
 ]
 }
],

dev/_downloads/75191b2eb3b4aa13066927321dd3fdcf/plot_ensemble_oob.py

Lines changed: 2 additions & 2 deletions
@@ -81,10 +81,10 @@
 
 # Range of `n_estimators` values to explore.
 min_estimators = 15
-max_estimators = 175
+max_estimators = 150
 
 for label, clf in ensemble_clfs:
-    for i in range(min_estimators, max_estimators + 1):
+    for i in range(min_estimators, max_estimators + 1, 5):
         clf.set_params(n_estimators=i)
         clf.fit(X, y)
 
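
The change above only thins out the sweep: with warm_start=True each call to fit() adds trees on top of the already-fitted ensemble, so stepping n_estimators by 5 (15, 20, ..., 150) means 28 refits per classifier instead of the 161 needed for every value from 15 to 175. A minimal, self-contained sketch of that warm-start pattern (toy data and variable names are illustrative, not taken verbatim from the example):

# Sketch: incrementally growing a random forest with warm_start and tracking
# the out-of-bag (OOB) error after each increment. Toy data; illustrative names.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=25, random_state=123)

clf = RandomForestClassifier(warm_start=True, oob_score=True, random_state=123)

oob_errors = []
for n_trees in range(15, 151, 5):
    clf.set_params(n_estimators=n_trees)
    clf.fit(X, y)  # with warm_start=True, only the newly added trees are fitted
    oob_errors.append((n_trees, 1 - clf.oob_score_))

print(oob_errors[-1])  # OOB error with 150 trees
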
dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py

Lines changed: 55 additions & 18 deletions
@@ -26,14 +26,44 @@
 # %%
 # Load Ames Housing dataset
 # -------------------------
-# First, we load the ames housing data as a pandas dataframe. The features
+# First, we load the Ames Housing data as a pandas dataframe. The features
 # are either categorical or numerical:
 from sklearn.datasets import fetch_openml
 
 X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
 
-n_categorical_features = (X.dtypes == "category").sum()
-n_numerical_features = (X.dtypes == "float").sum()
+# Select only a subset of features of X to make the example faster to run
+categorical_columns_subset = [
+    "Bldg_Type",
+    "Garage_Finish",
+    "Lot_Config",
+    "Functional",
+    "Mas_Vnr_Type",
+    "House_Style",
+    "Fireplace_Qu",
+    "Exter_Cond",
+    "Exter_Qual",
+    "Pool_QC",
+]
+
+numerical_columns_subset = [
+    "Three_season_porch",
+    "Fireplaces",
+    "Bsmt_Half_Bath",
+    "Half_Bath",
+    "Garage_Cars",
+    "TotRms_AbvGrd",
+    "BsmtFin_SF_1",
+    "BsmtFin_SF_2",
+    "Gr_Liv_Area",
+    "Screen_Porch",
+]
+
+X = X[categorical_columns_subset + numerical_columns_subset]
+
+n_categorical_features = X.select_dtypes(include="category").shape[1]
+n_numerical_features = X.select_dtypes(include="number").shape[1]
+
 print(f"Number of samples: {X.shape[0]}")
 print(f"Number of features: {X.shape[1]}")
 print(f"Number of categorical features: {n_categorical_features}")
@@ -114,6 +144,7 @@
 
 # The ordinal encoder will first output the categorical features, and then the
 # continuous (passed-through) features
+
 categorical_mask = [True] * n_categorical_features + [False] * n_numerical_features
 hist_native = make_pipeline(
     ordinal_encoder,
@@ -134,18 +165,20 @@
 import matplotlib.pyplot as plt
 
 scoring = "neg_mean_absolute_percentage_error"
-dropped_result = cross_validate(hist_dropped, X, y, cv=3, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=3, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=3, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=3, scoring=scoring)
+n_cv_folds = 3
+
+dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
+one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
+ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
+native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
 
 
 def plot_results(figure_title):
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
 
     plot_info = [
         ("fit_time", "Fit times (s)", ax1, None),
-        ("test_score", "Mean Absolute Percentage Error", ax2, (0, 0.20)),
+        ("test_score", "Mean Absolute Percentage Error", ax2, None),
     ]
 
     x, width = np.arange(4), 0.9
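
For reference, cross_validate returns a dict of per-fold arrays (fit_time, score_time, test_score), so with n_cv_folds = 3 each *_result above holds three values that plot_results later averages and turns into error bars; because the scorer is negated, np.abs recovers the MAPE. A small sketch with a placeholder estimator and synthetic data (not the pipelines benchmarked here):

# Placeholder estimator and synthetic data; only meant to show the shape of
# the cross_validate output consumed by plot_results.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate

X_demo, y_demo = make_regression(n_samples=90, n_features=5, random_state=0)
result = cross_validate(
    Ridge(), X_demo, y_demo, cv=3, scoring="neg_mean_absolute_percentage_error"
)

print(sorted(result))                         # ['fit_time', 'score_time', 'test_score']
print(result["test_score"].shape)             # (3,) -- one negated MAPE per fold
print(np.mean(np.abs(result["test_score"])))  # mean MAPE across folds
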
@@ -156,11 +189,15 @@ def plot_results(figure_title):
             ordinal_result[key],
             native_result[key],
         ]
+
+        mape_cv_mean = [np.mean(np.abs(item)) for item in items]
+        mape_cv_std = [np.std(item) for item in items]
+
         ax.bar(
-            x,
-            [np.mean(np.abs(item)) for item in items],
-            width,
-            yerr=[np.std(item) for item in items],
+            x=x,
+            height=mape_cv_mean,
+            width=width,
+            yerr=mape_cv_std,
             color=["C0", "C1", "C2", "C3"],
         )
         ax.set(
@@ -173,7 +210,7 @@ def plot_results(figure_title):
     fig.suptitle(figure_title)
 
 
-plot_results("Gradient Boosting on Adult Census")
+plot_results("Gradient Boosting on Ames Housing")
 
 # %%
 # We see that the model with one-hot-encoded data is by far the slowest. This
@@ -219,12 +256,12 @@ def plot_results(figure_title):
     histgradientboostingregressor__max_iter=15,
 )
 
-dropped_result = cross_validate(hist_dropped, X, y, cv=3, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=3, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=3, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=3, scoring=scoring)
+dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
+one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
+ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
+native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
 
-plot_results("Gradient Boosting on Adult Census (few and small trees)")
+plot_results("Gradient Boosting on Ames Housing (few and small trees)")
 
 plt.show()
 
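
The ax.bar refactor in this file is behavior-preserving: positional arguments become keyword arguments and the per-model mean/std of the cross-validated MAPE are hoisted into named variables before the call. A standalone sketch of that call with made-up numbers (not results from this benchmark):

# Standalone sketch of the keyword-argument Axes.bar call; numbers are made up.
import matplotlib.pyplot as plt
import numpy as np

x, width = np.arange(4), 0.9
mape_cv_mean = [0.12, 0.18, 0.11, 0.10]  # illustrative per-model means
mape_cv_std = [0.01, 0.02, 0.01, 0.01]   # illustrative per-model stds

fig, ax = plt.subplots()
ax.bar(
    x=x,
    height=mape_cv_mean,
    width=width,
    yerr=mape_cv_std,
    color=["C0", "C1", "C2", "C3"],
)
ax.set(
    xticks=x,
    xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
    title="Mean Absolute Percentage Error",
)
plt.show()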