Skip to content

Commit d575bc1

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 84a7a7a80249be8d7881777a26f43cb29b2d7fe4
1 parent 06aaa50 commit d575bc1

File tree

1,366 files changed

+4853
-4791
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,366 files changed

+4853
-4791
lines changed
Binary file not shown.
Binary file not shown.

dev/_downloads/7012baed63b9a27f121bae611b8285c2/plot_cyclical_feature_engineering.ipynb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -297,7 +297,7 @@
297297
},
298298
"outputs": [],
299299
"source": [
300-
"from sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\n\n\ncategorical_columns = [\n \"weather\",\n \"season\",\n \"holiday\",\n \"workingday\",\n]\ncategories = [\n [\"clear\", \"misty\", \"rain\"],\n [\"spring\", \"summer\", \"fall\", \"winter\"],\n [\"False\", \"True\"],\n [\"False\", \"True\"],\n]\nordinal_encoder = OrdinalEncoder(categories=categories)\n\n\ngbrt_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", ordinal_encoder, categorical_columns),\n ],\n remainder=\"passthrough\",\n ),\n HistGradientBoostingRegressor(\n categorical_features=range(4),\n ),\n)"
300+
"from sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\n\n\ncategorical_columns = [\n \"weather\",\n \"season\",\n \"holiday\",\n \"workingday\",\n]\ncategories = [\n [\"clear\", \"misty\", \"rain\"],\n [\"spring\", \"summer\", \"fall\", \"winter\"],\n [\"False\", \"True\"],\n [\"False\", \"True\"],\n]\nordinal_encoder = OrdinalEncoder(categories=categories)\n\n\ngbrt_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", ordinal_encoder, categorical_columns),\n ],\n remainder=\"passthrough\",\n # Use short feature names to make it easier to specify the categorical\n # variables in the HistGradientBoostingRegressor in the next\n # step of the pipeline.\n verbose_feature_names_out=False,\n ),\n HistGradientBoostingRegressor(\n categorical_features=categorical_columns,\n ),\n).set_output(transform=\"pandas\")"
301301
]
302302
},
303303
{

dev/_downloads/9fcbbc59ab27a20d07e209a711ac4f50/plot_cyclical_feature_engineering.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -209,11 +209,15 @@
209209
("categorical", ordinal_encoder, categorical_columns),
210210
],
211211
remainder="passthrough",
212+
# Use short feature names to make it easier to specify the categorical
213+
# variables in the HistGradientBoostingRegressor in the next
214+
# step of the pipeline.
215+
verbose_feature_names_out=False,
212216
),
213217
HistGradientBoostingRegressor(
214-
categorical_features=range(4),
218+
categorical_features=categorical_columns,
215219
),
216-
)
220+
).set_output(transform="pandas")
217221

218222
# %%
219223
#

dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,8 @@
6262
X = X[categorical_columns_subset + numerical_columns_subset]
6363
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
6464

65-
n_categorical_features = X.select_dtypes(include="category").shape[1]
65+
categorical_columns = X.select_dtypes(include="category").columns
66+
n_categorical_features = len(categorical_columns)
6667
n_numerical_features = X.select_dtypes(include="number").shape[1]
6768

6869
print(f"Number of samples: {X.shape[0]}")
@@ -122,6 +123,10 @@
122123
make_column_selector(dtype_include="category"),
123124
),
124125
remainder="passthrough",
126+
# Use short feature names to make it easier to specify the categorical
127+
# variables in the HistGradientBoostingRegressor in the next step
128+
# of the pipeline.
129+
verbose_feature_names_out=False,
125130
)
126131

127132
hist_ordinal = make_pipeline(
@@ -146,13 +151,13 @@
146151
# The ordinal encoder will first output the categorical features, and then the
147152
# continuous (passed-through) features
148153

149-
categorical_mask = [True] * n_categorical_features + [False] * n_numerical_features
150154
hist_native = make_pipeline(
151155
ordinal_encoder,
152156
HistGradientBoostingRegressor(
153-
random_state=42, categorical_features=categorical_mask
157+
random_state=42,
158+
categorical_features=categorical_columns,
154159
),
155-
)
160+
).set_output(transform="pandas")
156161

157162
# %%
158163
# Model comparison

dev/_downloads/cd5de29451c4f8624f47d18def81839c/plot_gradient_boosting_categorical.ipynb

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
},
3434
"outputs": [],
3535
"source": [
36-
"from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser=\"pandas\")\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n \"BldgType\",\n \"GarageFinish\",\n \"LotConfig\",\n \"Functional\",\n \"MasVnrType\",\n \"HouseStyle\",\n \"FireplaceQu\",\n \"ExterCond\",\n \"ExterQual\",\n \"PoolQC\",\n]\n\nnumerical_columns_subset = [\n \"3SsnPorch\",\n \"Fireplaces\",\n \"BsmtHalfBath\",\n \"HalfBath\",\n \"GarageCars\",\n \"TotRmsAbvGrd\",\n \"BsmtFinSF1\",\n \"BsmtFinSF2\",\n \"GrLivArea\",\n \"ScreenPorch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\nX[categorical_columns_subset] = X[categorical_columns_subset].astype(\"category\")\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
36+
"from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser=\"pandas\")\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n \"BldgType\",\n \"GarageFinish\",\n \"LotConfig\",\n \"Functional\",\n \"MasVnrType\",\n \"HouseStyle\",\n \"FireplaceQu\",\n \"ExterCond\",\n \"ExterQual\",\n \"PoolQC\",\n]\n\nnumerical_columns_subset = [\n \"3SsnPorch\",\n \"Fireplaces\",\n \"BsmtHalfBath\",\n \"HalfBath\",\n \"GarageCars\",\n \"TotRmsAbvGrd\",\n \"BsmtFinSF1\",\n \"BsmtFinSF2\",\n \"GrLivArea\",\n \"ScreenPorch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\nX[categorical_columns_subset] = X[categorical_columns_subset].astype(\"category\")\n\ncategorical_columns = X.select_dtypes(include=\"category\").columns\nn_categorical_features = len(categorical_columns)\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
3737
]
3838
},
3939
{
@@ -87,7 +87,7 @@
8787
},
8888
"outputs": [],
8989
"source": [
90-
"from sklearn.preprocessing import OrdinalEncoder\nimport numpy as np\n\nordinal_encoder = make_column_transformer(\n (\n OrdinalEncoder(handle_unknown=\"use_encoded_value\", unknown_value=np.nan),\n make_column_selector(dtype_include=\"category\"),\n ),\n remainder=\"passthrough\",\n)\n\nhist_ordinal = make_pipeline(\n ordinal_encoder, HistGradientBoostingRegressor(random_state=42)\n)"
90+
"from sklearn.preprocessing import OrdinalEncoder\nimport numpy as np\n\nordinal_encoder = make_column_transformer(\n (\n OrdinalEncoder(handle_unknown=\"use_encoded_value\", unknown_value=np.nan),\n make_column_selector(dtype_include=\"category\"),\n ),\n remainder=\"passthrough\",\n # Use short feature names to make it easier to specify the categorical\n # variables in the HistGradientBoostingRegressor in the next step\n # of the pipeline.\n verbose_feature_names_out=False,\n)\n\nhist_ordinal = make_pipeline(\n ordinal_encoder, HistGradientBoostingRegressor(random_state=42)\n)"
9191
]
9292
},
9393
{
@@ -105,7 +105,7 @@
105105
},
106106
"outputs": [],
107107
"source": [
108-
"# The ordinal encoder will first output the categorical features, and then the\n# continuous (passed-through) features\n\ncategorical_mask = [True] * n_categorical_features + [False] * n_numerical_features\nhist_native = make_pipeline(\n ordinal_encoder,\n HistGradientBoostingRegressor(\n random_state=42, categorical_features=categorical_mask\n ),\n)"
108+
"# The ordinal encoder will first output the categorical features, and then the\n# continuous (passed-through) features\n\nhist_native = make_pipeline(\n ordinal_encoder,\n HistGradientBoostingRegressor(\n random_state=42,\n categorical_features=categorical_columns,\n ),\n).set_output(transform=\"pandas\")"
109109
]
110110
},
111111
{

0 commit comments

Comments (0)