scikit-learn
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
47 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
47 Bytes
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
51 Bytes b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
51 Bytes
diff --git a/‎dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py
Lines changed: 20 additions & 21 deletions b/‎dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py
Lines changed: 20 additions & 21 deletions
diff --git a/‎dev/_downloads/cd5de29451c4f8624f47d18def81839c/plot_gradient_boosting_categorical.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/cd5de29451c4f8624f47d18def81839c/plot_gradient_boosting_categorical.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
2.5 KB b/‎dev/_downloads/scikit-learn-docs.zip
2.5 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
58 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
58 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
82 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
82 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
-343 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
-343 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-249 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-249 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
11 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
11 Bytes
@@ -30,36 +30,37 @@
 # are either categorical or numerical:
 from sklearn.datasets import fetch_openml
 
-X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
+X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
 
 # Select only a subset of features of X to make the example faster to run
 categorical_columns_subset = [
-    "Bldg_Type",
-    "Garage_Finish",
-    "Lot_Config",
+    "BldgType",
+    "GarageFinish",
+    "LotConfig",
     "Functional",
-    "Mas_Vnr_Type",
-    "House_Style",
-    "Fireplace_Qu",
-    "Exter_Cond",
-    "Exter_Qual",
-    "Pool_QC",
+    "MasVnrType",
+    "HouseStyle",
+    "FireplaceQu",
+    "ExterCond",
+    "ExterQual",
+    "PoolQC",
 ]
 
 numerical_columns_subset = [
-    "Three_season_porch",
+    "3SsnPorch",
     "Fireplaces",
-    "Bsmt_Half_Bath",
-    "Half_Bath",
-    "Garage_Cars",
-    "TotRms_AbvGrd",
-    "BsmtFin_SF_1",
-    "BsmtFin_SF_2",
-    "Gr_Liv_Area",
-    "Screen_Porch",
+    "BsmtHalfBath",
+    "HalfBath",
+    "GarageCars",
+    "TotRmsAbvGrd",
+    "BsmtFinSF1",
+    "BsmtFinSF2",
+    "GrLivArea",
+    "ScreenPorch",
 ]
 
 X = X[categorical_columns_subset + numerical_columns_subset]
+X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
 
 n_categorical_features = X.select_dtypes(include="category").shape[1]
 n_numerical_features = X.select_dtypes(include="number").shape[1]
@@ -153,7 +154,6 @@
     ),
 )
 
-
 # %%
 # Model comparison
 # ----------------
@@ -230,7 +230,6 @@ def plot_results(figure_title):
 # %%
 # Limitting the number of splits
 # ------------------------------
-#
 # In general, one can expect poorer predictions from one-hot-encoded data,
 # especially when the tree depths or the number of nodes are limited: with
 # one-hot-encoded data, one needs more split points, i.e. more depth, in order
 
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n    \"Bldg_Type\",\n    \"Garage_Finish\",\n    \"Lot_Config\",\n    \"Functional\",\n    \"Mas_Vnr_Type\",\n    \"House_Style\",\n    \"Fireplace_Qu\",\n    \"Exter_Cond\",\n    \"Exter_Qual\",\n    \"Pool_QC\",\n]\n\nnumerical_columns_subset = [\n    \"Three_season_porch\",\n    \"Fireplaces\",\n    \"Bsmt_Half_Bath\",\n    \"Half_Bath\",\n    \"Garage_Cars\",\n    \"TotRms_AbvGrd\",\n    \"BsmtFin_SF_1\",\n    \"BsmtFin_SF_2\",\n    \"Gr_Liv_Area\",\n    \"Screen_Porch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
+        "from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n    \"BldgType\",\n    \"GarageFinish\",\n    \"LotConfig\",\n    \"Functional\",\n    \"MasVnrType\",\n    \"HouseStyle\",\n    \"FireplaceQu\",\n    \"ExterCond\",\n    \"ExterQual\",\n    \"PoolQC\",\n]\n\nnumerical_columns_subset = [\n    \"3SsnPorch\",\n    \"Fireplaces\",\n    \"BsmtHalfBath\",\n    \"HalfBath\",\n    \"GarageCars\",\n    \"TotRmsAbvGrd\",\n    \"BsmtFinSF1\",\n    \"BsmtFinSF2\",\n    \"GrLivArea\",\n    \"ScreenPorch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\nX[categorical_columns_subset] = X[categorical_columns_subset].astype(\"category\")\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
       ]
     },
     {
@@ -137,7 +137,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Limitting the number of splits\n\nIn general, one can expect poorer predictions from one-hot-encoded data,\nespecially when the tree depths or the number of nodes are limited: with\none-hot-encoded data, one needs more split points, i.e. more depth, in order\nto recover an equivalent split that could be obtained in one single split\npoint with native handling.\n\nThis is also true when categories are treated as ordinal quantities: if\ncategories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder\nmodel will need 3 split points (one per category in the left node), and the\nordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split\nto isolate `F`, and 2 splits to isolate `C` from `BCDE`.\n\nHow strongly the models' performances differ in practice will depend on the\ndataset and on the flexibility of the trees.\n\nTo see this, let us re-run the same analysis with under-fitting models where\nwe artificially limit the total number of splits by both limitting the number\nof trees and the depth of each tree.\n\n"
+        "## Limitting the number of splits\nIn general, one can expect poorer predictions from one-hot-encoded data,\nespecially when the tree depths or the number of nodes are limited: with\none-hot-encoded data, one needs more split points, i.e. more depth, in order\nto recover an equivalent split that could be obtained in one single split\npoint with native handling.\n\nThis is also true when categories are treated as ordinal quantities: if\ncategories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder\nmodel will need 3 split points (one per category in the left node), and the\nordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split\nto isolate `F`, and 2 splits to isolate `C` from `BCDE`.\n\nHow strongly the models' performances differ in practice will depend on the\ndataset and on the flexibility of the trees.\n\nTo see this, let us re-run the same analysis with under-fitting models where\nwe artificially limit the total number of splits by both limitting the number\nof trees and the depth of each tree.\n\n"
       ]
     },
     {
Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`},`
`34`	`34`	`"outputs": [],`
`35`	`35`	`"source": [`
`36`		- "from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n \"Bldg_Type\",\n \"Garage_Finish\",\n \"Lot_Config\",\n \"Functional\",\n \"Mas_Vnr_Type\",\n \"House_Style\",\n \"Fireplace_Qu\",\n \"Exter_Cond\",\n \"Exter_Qual\",\n \"Pool_QC\",\n]\n\nnumerical_columns_subset = [\n \"Three_season_porch\",\n \"Fireplaces\",\n \"Bsmt_Half_Bath\",\n \"Half_Bath\",\n \"Garage_Cars\",\n \"TotRms_AbvGrd\",\n \"BsmtFin_SF_1\",\n \"BsmtFin_SF_2\",\n \"Gr_Liv_Area\",\n \"Screen_Porch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
	`36`	+ "from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n \"BldgType\",\n \"GarageFinish\",\n \"LotConfig\",\n \"Functional\",\n \"MasVnrType\",\n \"HouseStyle\",\n \"FireplaceQu\",\n \"ExterCond\",\n \"ExterQual\",\n \"PoolQC\",\n]\n\nnumerical_columns_subset = [\n \"3SsnPorch\",\n \"Fireplaces\",\n \"BsmtHalfBath\",\n \"HalfBath\",\n \"GarageCars\",\n \"TotRmsAbvGrd\",\n \"BsmtFinSF1\",\n \"BsmtFinSF2\",\n \"GrLivArea\",\n \"ScreenPorch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\nX[categorical_columns_subset] = X[categorical_columns_subset].astype(\"category\")\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
`37`	`37`	`]`
`38`	`38`	`},`
`39`	`39`	`{`
`@@ -137,7 +137,7 @@`
`137`	`137`	`"cell_type": "markdown",`
`138`	`138`	`"metadata": {},`
`139`	`139`	`"source": [`
`140`		- "## Limitting the number of splits\n\nIn general, one can expect poorer predictions from one-hot-encoded data,\nespecially when the tree depths or the number of nodes are limited: with\none-hot-encoded data, one needs more split points, i.e. more depth, in order\nto recover an equivalent split that could be obtained in one single split\npoint with native handling.\n\nThis is also true when categories are treated as ordinal quantities: if\ncategories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder\nmodel will need 3 split points (one per category in the left node), and the\nordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split\nto isolate `F`, and 2 splits to isolate `C` from `BCDE`.\n\nHow strongly the models' performances differ in practice will depend on the\ndataset and on the flexibility of the trees.\n\nTo see this, let us re-run the same analysis with under-fitting models where\nwe artificially limit the total number of splits by both limitting the number\nof trees and the depth of each tree.\n\n"
	`140`	+ "## Limitting the number of splits\nIn general, one can expect poorer predictions from one-hot-encoded data,\nespecially when the tree depths or the number of nodes are limited: with\none-hot-encoded data, one needs more split points, i.e. more depth, in order\nto recover an equivalent split that could be obtained in one single split\npoint with native handling.\n\nThis is also true when categories are treated as ordinal quantities: if\ncategories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder\nmodel will need 3 split points (one per category in the left node), and the\nordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split\nto isolate `F`, and 2 splits to isolate `C` from `BCDE`.\n\nHow strongly the models' performances differ in practice will depend on the\ndataset and on the flexibility of the trees.\n\nTo see this, let us re-run the same analysis with under-fitting models where\nwe artificially limit the total number of splits by both limitting the number\nof trees and the depth of each tree.\n\n"
`141`	`141`	`]`
`142`	`142`	`},`
`143`	`143`	`{`