Commit ee9dcf1

Pushing the docs to dev/ for branch: main, commit 4cf13d2b5fcf68a656cfdf273ac636d7dd6038f1
1 parent 748cf02 commit ee9dcf1

File tree

1,335 files changed (+5985 / -6080 lines)


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: c37d70b1df65eda08d42126bbea8fc94
+config: 6ed36f680b4f5aacbe97178a71e9529c
 tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/7012baed63b9a27f121bae611b8285c2/plot_cyclical_feature_engineering.ipynb

Lines changed: 4 additions & 4 deletions
@@ -275,7 +275,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"All is well. We are now ready to do some predictive modeling!\n\n## Gradient Boosting\n\nGradient Boosting Regression with decision trees is often flexible enough to\nefficiently handle heterogenous tabular data with a mix of categorical and\nnumerical features as long as the number of samples is large enough.\n\nHere, we use the modern\n:class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support\nfor categorical features. Therefore, we only do minimal ordinal encoding for\nthe categorical variables and then\nlet the model know that it should treat those as categorical variables by\nusing a dedicated tree splitting rule. Since we use an ordinal encoder, we\npass the list of categorical values explicitly to use a logical order when\nencoding the categories as integers instead of the lexicographical order.\nThis also has the added benefit of preventing any issue with unknown\ncategories when using cross-validation.\n\nThe numerical variables need no preprocessing and, for the sake of simplicity,\nwe only try the default hyper-parameters for this model:\n\n"
+"All is well. We are now ready to do some predictive modeling!\n\n## Gradient Boosting\n\nGradient Boosting Regression with decision trees is often flexible enough to\nefficiently handle heterogenous tabular data with a mix of categorical and\nnumerical features as long as the number of samples is large enough.\n\nHere, we use the modern\n:class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support\nfor categorical features. Therefore, we only need to set\n`categorical_features=\"from_dtype\"` such that features with categorical dtype\nare considered categorical features. For reference, we extract the categorical\nfeatures from the dataframe based on the dtype. The internal trees use a dedicated\ntree splitting rule for these features.\n\nThe numerical variables need no preprocessing and, for the sake of simplicity,\nwe only try the default hyper-parameters for this model:\n\n"
 ]
 },
 {
@@ -286,7 +286,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\n\ncategorical_columns = [\n \"weather\",\n \"season\",\n \"holiday\",\n \"workingday\",\n]\ncategories = [\n [\"clear\", \"misty\", \"rain\"],\n [\"spring\", \"summer\", \"fall\", \"winter\"],\n [\"False\", \"True\"],\n [\"False\", \"True\"],\n]\nordinal_encoder = OrdinalEncoder(categories=categories)\n\n\ngbrt_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", ordinal_encoder, categorical_columns),\n ],\n remainder=\"passthrough\",\n # Use short feature names to make it easier to specify the categorical\n # variables in the HistGradientBoostingRegressor in the next\n # step of the pipeline.\n verbose_feature_names_out=False,\n ),\n HistGradientBoostingRegressor(\n max_iter=300,\n early_stopping=True,\n validation_fraction=0.1,\n categorical_features=categorical_columns,\n random_state=42,\n ),\n).set_output(transform=\"pandas\")"
+"from sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\nfrom sklearn.pipeline import make_pipeline\n\ngbrt = HistGradientBoostingRegressor(categorical_features=\"from_dtype\", random_state=42)\ncategorical_columns = X.columns[X.dtypes == \"category\"]\nprint(\"Categorical features:\", categorical_columns.tolist())"
 ]
 },
 {
@@ -304,7 +304,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\n\n\ndef evaluate(model, X, y, cv, model_prop=None, model_step=None):\n cv_results = cross_validate(\n model,\n X,\n y,\n cv=cv,\n scoring=[\"neg_mean_absolute_error\", \"neg_root_mean_squared_error\"],\n return_estimator=model_prop is not None,\n )\n if model_prop is not None:\n if model_step is not None:\n values = [\n getattr(m[model_step], model_prop) for m in cv_results[\"estimator\"]\n ]\n else:\n values = [getattr(m, model_prop) for m in cv_results[\"estimator\"]]\n print(f\"Mean model.{model_prop} = {np.mean(values)}\")\n mae = -cv_results[\"test_neg_mean_absolute_error\"]\n rmse = -cv_results[\"test_neg_root_mean_squared_error\"]\n print(\n f\"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}\\n\"\n f\"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}\"\n )\n\n\nevaluate(\n gbrt_pipeline,\n X,\n y,\n cv=ts_cv,\n model_prop=\"n_iter_\",\n model_step=\"histgradientboostingregressor\",\n)"
+"import numpy as np\n\n\ndef evaluate(model, X, y, cv, model_prop=None, model_step=None):\n cv_results = cross_validate(\n model,\n X,\n y,\n cv=cv,\n scoring=[\"neg_mean_absolute_error\", \"neg_root_mean_squared_error\"],\n return_estimator=model_prop is not None,\n )\n if model_prop is not None:\n if model_step is not None:\n values = [\n getattr(m[model_step], model_prop) for m in cv_results[\"estimator\"]\n ]\n else:\n values = [getattr(m, model_prop) for m in cv_results[\"estimator\"]]\n print(f\"Mean model.{model_prop} = {np.mean(values)}\")\n mae = -cv_results[\"test_neg_mean_absolute_error\"]\n rmse = -cv_results[\"test_neg_root_mean_squared_error\"]\n print(\n f\"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}\\n\"\n f\"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}\"\n )\n\n\nevaluate(gbrt, X, y, cv=ts_cv, model_prop=\"n_iter_\")"
 ]
 },
 {
@@ -657,7 +657,7 @@
 },
 "outputs": [],
 "source": [
-"gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\ngbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])\n\none_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\none_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])\n\ncyclic_spline_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\ncyclic_spline_poly_predictions = cyclic_spline_poly_pipeline.predict(X.iloc[test_0])"
+"gbrt.fit(X.iloc[train_0], y.iloc[train_0])\ngbrt_predictions = gbrt.predict(X.iloc[test_0])\n\none_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\none_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])\n\ncyclic_spline_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])\ncyclic_spline_poly_predictions = cyclic_spline_poly_pipeline.predict(X.iloc[test_0])"
 ]
 },
 {

dev/_downloads/9fcbbc59ab27a20d07e209a711ac4f50/plot_cyclical_feature_engineering.py

Lines changed: 11 additions & 53 deletions
@@ -172,57 +172,22 @@
 #
 # Here, we use the modern
 # :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
-# for categorical features. Therefore, we only do minimal ordinal encoding for
-# the categorical variables and then
-# let the model know that it should treat those as categorical variables by
-# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
-# pass the list of categorical values explicitly to use a logical order when
-# encoding the categories as integers instead of the lexicographical order.
-# This also has the added benefit of preventing any issue with unknown
-# categories when using cross-validation.
+# for categorical features. Therefore, we only need to set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features. For reference, we extract the categorical
+# features from the dataframe based on the dtype. The internal trees use a dedicated
+# tree splitting rule for these features.
 #
 # The numerical variables need no preprocessing and, for the sake of simplicity,
 # we only try the default hyper-parameters for this model:
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import cross_validate
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OrdinalEncoder
 
-categorical_columns = [
-    "weather",
-    "season",
-    "holiday",
-    "workingday",
-]
-categories = [
-    ["clear", "misty", "rain"],
-    ["spring", "summer", "fall", "winter"],
-    ["False", "True"],
-    ["False", "True"],
-]
-ordinal_encoder = OrdinalEncoder(categories=categories)
-
-
-gbrt_pipeline = make_pipeline(
-    ColumnTransformer(
-        transformers=[
-            ("categorical", ordinal_encoder, categorical_columns),
-        ],
-        remainder="passthrough",
-        # Use short feature names to make it easier to specify the categorical
-        # variables in the HistGradientBoostingRegressor in the next
-        # step of the pipeline.
-        verbose_feature_names_out=False,
-    ),
-    HistGradientBoostingRegressor(
-        max_iter=300,
-        early_stopping=True,
-        validation_fraction=0.1,
-        categorical_features=categorical_columns,
-        random_state=42,
-    ),
-).set_output(transform="pandas")
+gbrt = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=42)
+categorical_columns = X.columns[X.dtypes == "category"]
+print("Categorical features:", categorical_columns.tolist())
 
 # %%
 #
@@ -256,14 +221,7 @@ def evaluate(model, X, y, cv, model_prop=None, model_step=None):
 )
 
 
-evaluate(
-    gbrt_pipeline,
-    X,
-    y,
-    cv=ts_cv,
-    model_prop="n_iter_",
-    model_step="histgradientboostingregressor",
-)
+evaluate(gbrt, X, y, cv=ts_cv, model_prop="n_iter_")
 
 # %%
 # We see that we set `max_iter` large enough such that early stopping took place.
@@ -735,8 +693,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # Let us now have a qualitative look at the predictions of the kernel models
 # and of the gradient boosted trees that should be able to better model
 # non-linear interactions between features:
-gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
-gbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])
+gbrt.fit(X.iloc[train_0], y.iloc[train_0])
+gbrt_predictions = gbrt.predict(X.iloc[test_0])
 
 one_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
 one_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])
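
The effect of this change is easier to see outside the diff. Below is a minimal, hypothetical sketch (the toy dataframe and column names are invented here, not the bike-sharing data from the example; it assumes a scikit-learn version that supports `categorical_features="from_dtype"`, i.e. 1.4 or later) of how columns with a pandas `category` dtype are picked up automatically, replacing the old `OrdinalEncoder` + `ColumnTransformer` wrapper:

# Minimal sketch with invented toy data: any column with a pandas "category"
# dtype is treated as categorical by the tree splitter, so no encoder or
# ColumnTransformer is needed in front of the estimator.
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

X_toy = pd.DataFrame(
    {
        "weather": pd.Categorical(["clear", "misty", "rain", "clear"]),
        "temp": [0.3, 0.4, 0.2, 0.5],  # numerical column, left untouched
    }
)
y_toy = [120, 90, 30, 150]

gbrt_toy = HistGradientBoostingRegressor(
    categorical_features="from_dtype", random_state=42
)
gbrt_toy.fit(X_toy, y_toy)

# Mirrors the `X.columns[X.dtypes == "category"]` check in the updated example.
print(X_toy.columns[X_toy.dtypes == "category"].tolist())  # ['weather']

Because the numerical column is passed through unchanged, the updated example no longer needs `remainder="passthrough"` or the ordinal encoding of category values.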

dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py

Lines changed: 16 additions & 20 deletions
@@ -138,26 +138,17 @@
 # -----------------------------------------------------------
 # We now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator
 # that will natively handle categorical features. This estimator will not treat
-# categorical features as ordered quantities.
+# categorical features as ordered quantities. We set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features.
 #
-# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
-# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
-# :class:`~preprocessing.OrdinalEncoder` to pre-process the data.
-#
-# The main difference between this pipeline and the previous one is that in
+# The main difference between this estimator and the previous one is that in
 # this one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know
 # which features are categorical.
 
-# The ordinal encoder will first output the categorical features, and then the
-# continuous (passed-through) features
-
-hist_native = make_pipeline(
-    ordinal_encoder,
-    HistGradientBoostingRegressor(
-        random_state=42,
-        categorical_features=categorical_columns,
-    ),
-).set_output(transform="pandas")
+hist_native = HistGradientBoostingRegressor(
+    random_state=42, categorical_features="from_dtype"
+)
 
 # %%
 # Model comparison
@@ -256,10 +247,15 @@ def plot_results(figure_title):
 # of trees and the depth of each tree.
 
 for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
-    pipe.set_params(
-        histgradientboostingregressor__max_depth=3,
-        histgradientboostingregressor__max_iter=15,
-    )
+    if pipe is hist_native:
+        # The native model does not use a pipeline so, we can set the parameters
+        # directly.
+        pipe.set_params(max_depth=3, max_iter=15)
+    else:
+        pipe.set_params(
+            histgradientboostingregressor__max_depth=3,
+            histgradientboostingregressor__max_iter=15,
+        )
 
 dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
 one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
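
The `pipe is hist_native` branch is needed because parameter names differ between a bare estimator and a pipeline. A minimal sketch of the two `set_params` spellings (the `bare` and `piped` objects are illustrative, not the `hist_*` estimators from the example):

# Steps inside a Pipeline are addressed as "<step_name>__<param>", where
# make_pipeline names each step after its lowercased class name; a bare
# estimator takes the parameter names directly.
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

bare = HistGradientBoostingRegressor()
piped = make_pipeline(OrdinalEncoder(), HistGradientBoostingRegressor())

bare.set_params(max_depth=3, max_iter=15)
piped.set_params(
    histgradientboostingregressor__max_depth=3,
    histgradientboostingregressor__max_iter=15,
)

print(bare.max_depth, piped[-1].max_depth)  # 3 3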

dev/_downloads/cd5de29451c4f8624f47d18def81839c/plot_gradient_boosting_categorical.ipynb

Lines changed: 3 additions & 3 deletions
@@ -83,7 +83,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Gradient boosting estimator with native categorical support\nWe now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator\nthat will natively handle categorical features. This estimator will not treat\ncategorical features as ordered quantities.\n\nSince the :class:`~ensemble.HistGradientBoostingRegressor` requires category\nvalues to be encoded in `[0, n_unique_categories - 1]`, we still rely on an\n:class:`~preprocessing.OrdinalEncoder` to pre-process the data.\n\nThe main difference between this pipeline and the previous one is that in\nthis one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know\nwhich features are categorical.\n\n"
+"## Gradient boosting estimator with native categorical support\nWe now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator\nthat will natively handle categorical features. This estimator will not treat\ncategorical features as ordered quantities. We set\n`categorical_features=\"from_dtype\"` such that features with categorical dtype\nare considered categorical features.\n\nThe main difference between this estimator and the previous one is that in\nthis one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know\nwhich features are categorical.\n\n"
 ]
 },
 {
@@ -94,7 +94,7 @@
 },
 "outputs": [],
 "source": [
-"# The ordinal encoder will first output the categorical features, and then the\n# continuous (passed-through) features\n\nhist_native = make_pipeline(\n ordinal_encoder,\n HistGradientBoostingRegressor(\n random_state=42,\n categorical_features=categorical_columns,\n ),\n).set_output(transform=\"pandas\")"
+"hist_native = HistGradientBoostingRegressor(\n random_state=42, categorical_features=\"from_dtype\"\n)"
 ]
 },
 {
@@ -137,7 +137,7 @@
 },
 "outputs": [],
 "source": [
-"for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):\n pipe.set_params(\n histgradientboostingregressor__max_depth=3,\n histgradientboostingregressor__max_iter=15,\n )\n\ndropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)\none_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)\nordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)\nnative_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)\n\nplot_results(\"Gradient Boosting on Ames Housing (few and small trees)\")\n\nplt.show()"
+"for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):\n if pipe is hist_native:\n # The native model does not use a pipeline so, we can set the parameters\n # directly.\n pipe.set_params(max_depth=3, max_iter=15)\n else:\n pipe.set_params(\n histgradientboostingregressor__max_depth=3,\n histgradientboostingregressor__max_iter=15,\n )\n\ndropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)\none_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)\nordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)\nnative_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)\n\nplot_results(\"Gradient Boosting on Ames Housing (few and small trees)\")\n\nplt.show()"
 ]
 },
 {

dev/_downloads/scikit-learn-docs.zip

33.4 KB (binary file not shown)

dev/_images/chanel.png

11.1 KB (-29 Bytes)

0 commit comments
