Commit ee9dcf1

Pushing the docs to dev/ for branch: main, commit 4cf13d2b5fcf68a656cfdf273ac636d7dd6038f1
1 parent 748cf02 commit ee9dcf1

File tree

1,335 files changed (+5985 / -6080 lines)


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: c37d70b1df65eda08d42126bbea8fc94
+config: 6ed36f680b4f5aacbe97178a71e9529c
 tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/7012baed63b9a27f121bae611b8285c2/plot_cyclical_feature_engineering.ipynb

Lines changed: 4 additions & 4 deletions
@@ -275,7 +275,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"All is well. We are now ready to do some predictive modeling!\n\n## Gradient Boosting\n\nGradient Boosting Regression with decision trees is often flexible enough to\nefficiently handle heterogenous tabular data with a mix of categorical and\nnumerical features as long as the number of samples is large enough.\n\nHere, we use the modern\n:class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support\nfor categorical features. Therefore, we only do minimal ordinal encoding for\nthe categorical variables and then\nlet the model know that it should treat those as categorical variables by\nusing a dedicated tree splitting rule. Since we use an ordinal encoder, we\npass the list of categorical values explicitly to use a logical order when\nencoding the categories as integers instead of the lexicographical order.\nThis also has the added benefit of preventing any issue with unknown\ncategories when using cross-validation.\n\nThe numerical variables need no preprocessing and, for the sake of simplicity,\nwe only try the default hyper-parameters for this model:\n\n"
+"All is well. We are now ready to do some predictive modeling!\n\n## Gradient Boosting\n\nGradient Boosting Regression with decision trees is often flexible enough to\nefficiently handle heterogenous tabular data with a mix of categorical and\nnumerical features as long as the number of samples is large enough.\n\nHere, we use the modern\n:class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support\nfor categorical features. Therefore, we only need to set\n`categorical_features=\"from_dtype\"` such that features with categorical dtype\nare considered categorical features. For reference, we extract the categorical\nfeatures from the dataframe based on the dtype. The internal trees use a dedicated\ntree splitting rule for these features.\n\nThe numerical variables need no preprocessing and, for the sake of simplicity,\nwe only try the default hyper-parameters for this model:\n\n"
 ]
 },
 {
@@ -286,7 +286,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\n\ncategorical_columns = [\n \"weather\",\n \"season\",\n \"holiday\",\n \"workingday\",\n]\ncategories = [\n [\"clear\", \"misty\", \"rain\"],\n [\"spring\", \"summer\", \"fall\", \"winter\"],\n [\"False\", \"True\"],\n [\"False\", \"True\"],\n]\nordinal_encoder = OrdinalEncoder(categories=categories)\n\n\ngbrt_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", ordinal_encoder, categorical_columns),\n ],\n remainder=\"passthrough\",\n # Use short feature names to make it easier to specify the categorical\n # variables in the HistGradientBoostingRegressor in the next\n # step of the pipeline.\n verbose_feature_names_out=False,\n ),\n HistGradientBoostingRegressor(\n max_iter=300,\n early_stopping=True,\n validation_fraction=0.1,\n categorical_features=categorical_columns,\n random_state=42,\n ),\n).set_output(transform=\"pandas\")"
+"from sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\nfrom sklearn.pipeline import make_pipeline\n\ngbrt = HistGradientBoostingRegressor(categorical_features=\"from_dtype\", random_state=42)\ncategorical_columns = X.columns[X.dtypes == \"category\"]\nprint(\"Categorical features:\", categorical_columns.tolist())"
 ]
 },
 {
@@ -304,7 +304,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\n\n\ndef evaluate(model, X, y, cv, model_prop=None, model_step=None):\n cv_results = cross_validate(\n model,\n X,\n y,\n cv=cv,\n scoring=[\"neg_mean_absolute_error\", \"neg_root_mean_squared_error\"],\n return_estimator=model_prop is not None,\n )\n if model_prop is not None:\n if model_step is not None:\n values = [\n getattr(m[model_step], model_prop) for m in cv_results[\"estimator\"]\n ]\n else:\n values = [getattr(m, model_prop) for m in cv_results[\"estimator\"]]\n print(f\"Mean model.{model_prop} = {np.mean(values)}\")\n mae = -cv_results[\"test_neg_mean_absolute_error\"]\n rmse = -cv_results[\"test_neg_root_mean_squared_error\"]\n print(\n f\"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}\\n\"\n f\"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}\"\n )\n\n\nevaluate(\n gbrt_pipeline,\n X,\n y,\n cv=ts_cv,\n model_prop=\"n_iter_\",\n model_step=\"histgradientboostingregressor\",\n)"
+"import numpy as np\n\n\ndef evaluate(model, X, y, cv, model_prop=None, model_step=None):\n cv_results = cross_validate(\n model,\n X,\n y,\n cv=cv,\n scoring=[\"neg_mean_absolute_error\", \"neg_root_mean_squared_error\"],\n return_estimator=model_prop is not None,\n )\n if model_prop is not None:\n if model_step is not None:\n values = [\n getattr(m[model_step], model_prop) for m in cv_results[\"estimator\"]\n ]\n else:\n values = [getattr(m, model_prop) for m in cv_results[\"estimator\"]]\n print(f\"Mean model.{model_prop} = {np.mean(values)}\")\n mae = -cv_results[\"test_neg_mean_absolute_error\"]\n rmse = -cv_results[\"test_neg_root_mean_squared_error\"]\n print(\n f\"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}\\n\"\n f\"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}\"\n )\n\n\nevaluate(gbrt, X, y, cv=ts_cv, model_prop=\"n_iter_\")"
 ]
 },
 {
@@ -657,7 +657,7 @@
 },
 "outputs": [],
 "source": [
-"gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\ngbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])\n\none_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\none_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])\n\ncyclic_spline_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\ncyclic_spline_poly_predictions = cyclic_spline_poly_pipeline.predict(X.iloc[test_0])"
+"gbrt.fit(X.iloc[train_0], y.iloc[train_0])\ngbrt_predictions = gbrt.predict(X.iloc[test_0])\n\none_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\none_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])\n\ncyclic_spline_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\ncyclic_spline_poly_predictions = cyclic_spline_poly_pipeline.predict(X.iloc[test_0])"
 ]
 },
 {

dev/_downloads/9fcbbc59ab27a20d07e209a711ac4f50/plot_cyclical_feature_engineering.py

Lines changed: 11 additions & 53 deletions
@@ -172,57 +172,22 @@
 #
 # Here, we use the modern
 # :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
-# for categorical features. Therefore, we only do minimal ordinal encoding for
-# the categorical variables and then
-# let the model know that it should treat those as categorical variables by
-# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
-# pass the list of categorical values explicitly to use a logical order when
-# encoding the categories as integers instead of the lexicographical order.
-# This also has the added benefit of preventing any issue with unknown
-# categories when using cross-validation.
+# for categorical features. Therefore, we only need to set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features. For reference, we extract the categorical
+# features from the dataframe based on the dtype. The internal trees use a dedicated
+# tree splitting rule for these features.
 #
 # The numerical variables need no preprocessing and, for the sake of simplicity,
 # we only try the default hyper-parameters for this model:
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import cross_validate
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OrdinalEncoder
 
-categorical_columns = [
-    "weather",
-    "season",
-    "holiday",
-    "workingday",
-]
-categories = [
-    ["clear", "misty", "rain"],
-    ["spring", "summer", "fall", "winter"],
-    ["False", "True"],
-    ["False", "True"],
-]
-ordinal_encoder = OrdinalEncoder(categories=categories)
-
-
-gbrt_pipeline = make_pipeline(
-    ColumnTransformer(
-        transformers=[
-            ("categorical", ordinal_encoder, categorical_columns),
-        ],
-        remainder="passthrough",
-        # Use short feature names to make it easier to specify the categorical
-        # variables in the HistGradientBoostingRegressor in the next
-        # step of the pipeline.
-        verbose_feature_names_out=False,
-    ),
-    HistGradientBoostingRegressor(
-        max_iter=300,
-        early_stopping=True,
-        validation_fraction=0.1,
-        categorical_features=categorical_columns,
-        random_state=42,
-    ),
-).set_output(transform="pandas")
+gbrt = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=42)
+categorical_columns = X.columns[X.dtypes == "category"]
+print("Categorical features:", categorical_columns.tolist())
 
 # %%
 #
@@ -256,14 +221,7 @@ def evaluate(model, X, y, cv, model_prop=None, model_step=None):
 )
 
 
-evaluate(
-    gbrt_pipeline,
-    X,
-    y,
-    cv=ts_cv,
-    model_prop="n_iter_",
-    model_step="histgradientboostingregressor",
-)
+evaluate(gbrt, X, y, cv=ts_cv, model_prop="n_iter_")
 
 # %%
 # We see that we set `max_iter` large enough such that early stopping took place.
@@ -735,8 +693,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # Let us now have a qualitative look at the predictions of the kernel models
 # and of the gradient boosted trees that should be able to better model
 # non-linear interactions between features:
-gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
-gbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])
+gbrt.fit(X.iloc[train_0], y.iloc[train_0])
+gbrt_predictions = gbrt.predict(X.iloc[test_0])
 
 one_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
 one_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])
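
The effect of this change is easier to see outside the diff. Below is a minimal, hypothetical sketch (the toy dataframe and column names are invented here, not the bike-sharing data from the example; it assumes a scikit-learn version that supports `categorical_features="from_dtype"`, i.e. 1.4 or later) of how columns with a pandas `category` dtype are picked up automatically, replacing the old `OrdinalEncoder` + `ColumnTransformer` wrapper:

# Minimal sketch with invented toy data: any column with a pandas "category"
# dtype is treated as categorical by the tree splitter, so no encoder or
# ColumnTransformer is needed in front of the estimator.
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

X_toy = pd.DataFrame(
    {
        "weather": pd.Categorical(["clear", "misty", "rain", "clear"]),
        "temp": [0.3, 0.4, 0.2, 0.5],  # numerical column, left untouched
    }
)
y_toy = [120, 90, 30, 150]

gbrt_toy = HistGradientBoostingRegressor(
    categorical_features="from_dtype", random_state=42
)
gbrt_toy.fit(X_toy, y_toy)

# Mirrors the `X.columns[X.dtypes == "category"]` check in the updated example.
print(X_toy.columns[X_toy.dtypes == "category"].tolist())  # ['weather']

Because the numerical column is passed through unchanged, the updated example no longer needs `remainder="passthrough"` or the ordinal encoding of category values.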

dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py

Lines changed: 16 additions & 20 deletions
@@ -138,26 +138,17 @@
 # -----------------------------------------------------------
 # We now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator
 # that will natively handle categorical features. This estimator will not treat
-# categorical features as ordered quantities.
+# categorical features as ordered quantities. We set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features.
 #
-# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
-# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
-# :class:`~preprocessing.OrdinalEncoder` to pre-process the data.
-#
-# The main difference between this pipeline and the previous one is that in
+# The main difference between this estimator and the previous one is that in
 # this one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know
 # which features are categorical.
 
-# The ordinal encoder will first output the categorical features, and then the
-# continuous (passed-through) features
-
-hist_native = make_pipeline(
-    ordinal_encoder,
-    HistGradientBoostingRegressor(
-        random_state=42,
-        categorical_features=categorical_columns,
-    ),
-).set_output(transform="pandas")
+hist_native = HistGradientBoostingRegressor(
+    random_state=42, categorical_features="from_dtype"
+)
 
 # %%
 # Model comparison
@@ -256,10 +247,15 @@ def plot_results(figure_title):
 # of trees and the depth of each tree.
 
 for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
-    pipe.set_params(
-        histgradientboostingregressor__max_depth=3,
-        histgradientboostingregressor__max_iter=15,
-    )
+    if pipe is hist_native:
+        # The native model does not use a pipeline so, we can set the parameters
+        # directly.
+        pipe.set_params(max_depth=3, max_iter=15)
+    else:
+        pipe.set_params(
+            histgradientboostingregressor__max_depth=3,
+            histgradientboostingregressor__max_iter=15,
+        )
 
 dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
 one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
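
The `pipe is hist_native` branch is needed because parameter names differ between a bare estimator and a pipeline. A minimal sketch of the two `set_params` spellings (the `bare` and `piped` objects are illustrative, not the `hist_*` estimators from the example):

# Steps inside a Pipeline are addressed as "<step_name>__<param>", where
# make_pipeline names each step after its lowercased class name; a bare
# estimator takes the parameter names directly.
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

bare = HistGradientBoostingRegressor()
piped = make_pipeline(OrdinalEncoder(), HistGradientBoostingRegressor())

bare.set_params(max_depth=3, max_iter=15)
piped.set_params(
    histgradientboostingregressor__max_depth=3,
    histgradientboostingregressor__max_iter=15,
)

print(bare.max_depth, piped[-1].max_depth)  # 3 3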

dev/_downloads/cd5de29451c4f8624f47d18def81839c/plot_gradient_boosting_categorical.ipynb

Lines changed: 3 additions & 3 deletions
@@ -83,7 +83,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Gradient boosting estimator with native categorical support\nWe now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator\nthat will natively handle categorical features. This estimator will not treat\ncategorical features as ordered quantities.\n\nSince the :class:`~ensemble.HistGradientBoostingRegressor` requires category\nvalues to be encoded in `[0, n_unique_categories - 1]`, we still rely on an\n:class:`~preprocessing.OrdinalEncoder` to pre-process the data.\n\nThe main difference between this pipeline and the previous one is that in\nthis one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know\nwhich features are categorical.\n\n"
+"## Gradient boosting estimator with native categorical support\nWe now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator\nthat will natively handle categorical features. This estimator will not treat\ncategorical features as ordered quantities. We set\n`categorical_features=\"from_dtype\"` such that features with categorical dtype\nare considered categorical features.\n\nThe main difference between this estimator and the previous one is that in\nthis one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know\nwhich features are categorical.\n\n"
 ]
 },
 {
@@ -94,7 +94,7 @@
 },
 "outputs": [],
 "source": [
-"# The ordinal encoder will first output the categorical features, and then the\n# continuous (passed-through) features\n\nhist_native = make_pipeline(\n ordinal_encoder,\n HistGradientBoostingRegressor(\n random_state=42,\n categorical_features=categorical_columns,\n ),\n).set_output(transform=\"pandas\")"
+"hist_native = HistGradientBoostingRegressor(\n random_state=42, categorical_features=\"from_dtype\"\n)"
 ]
 },
 {
@@ -137,7 +137,7 @@
 },
 "outputs": [],
 "source": [
-"for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):\n pipe.set_params(\n histgradientboostingregressor__max_depth=3,\n histgradientboostingregressor__max_iter=15,\n )\n\ndropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)\none_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)\nordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)\nnative_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)\n\nplot_results(\"Gradient Boosting on Ames Housing (few and small trees)\")\n\nplt.show()"
+"for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):\n if pipe is hist_native:\n # The native model does not use a pipeline so, we can set the parameters\n # directly.\n pipe.set_params(max_depth=3, max_iter=15)\n else:\n pipe.set_params(\n histgradientboostingregressor__max_depth=3,\n histgradientboostingregressor__max_iter=15,\n )\n\ndropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)\none_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)\nordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)\nnative_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)\n\nplot_results(\"Gradient Boosting on Ames Housing (few and small trees)\")\n\nplt.show()"
 ]
 },
 {

dev/_downloads/scikit-learn-docs.zip

33.4 KB (binary file not shown)

dev/_images/chanel.png

11.1 KB (-29 Bytes)

0 commit comments
