Skip to content

Commit 49c1559

Browse files
committed
Pushing the docs to dev/ for branch: main, commit f3b36e830b892820385ca3c1e8b960da0abab5c2
1 parent 721c936 commit 49c1559

File tree

1,226 files changed

+4453
-4458
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,226 files changed

+4453
-4458
lines changed
Binary file not shown.
Binary file not shown.

dev/_downloads/acc6f0183d4b7293ae5914724f55bc28/plot_gradient_boosting_categorical.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,36 +30,37 @@
3030
# are either categorical or numerical:
3131
from sklearn.datasets import fetch_openml
3232

33-
X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
33+
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
3434

3535
# Select only a subset of features of X to make the example faster to run
3636
categorical_columns_subset = [
37-
"Bldg_Type",
38-
"Garage_Finish",
39-
"Lot_Config",
37+
"BldgType",
38+
"GarageFinish",
39+
"LotConfig",
4040
"Functional",
41-
"Mas_Vnr_Type",
42-
"House_Style",
43-
"Fireplace_Qu",
44-
"Exter_Cond",
45-
"Exter_Qual",
46-
"Pool_QC",
41+
"MasVnrType",
42+
"HouseStyle",
43+
"FireplaceQu",
44+
"ExterCond",
45+
"ExterQual",
46+
"PoolQC",
4747
]
4848

4949
numerical_columns_subset = [
50-
"Three_season_porch",
50+
"3SsnPorch",
5151
"Fireplaces",
52-
"Bsmt_Half_Bath",
53-
"Half_Bath",
54-
"Garage_Cars",
55-
"TotRms_AbvGrd",
56-
"BsmtFin_SF_1",
57-
"BsmtFin_SF_2",
58-
"Gr_Liv_Area",
59-
"Screen_Porch",
52+
"BsmtHalfBath",
53+
"HalfBath",
54+
"GarageCars",
55+
"TotRmsAbvGrd",
56+
"BsmtFinSF1",
57+
"BsmtFinSF2",
58+
"GrLivArea",
59+
"ScreenPorch",
6060
]
6161

6262
X = X[categorical_columns_subset + numerical_columns_subset]
63+
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
6364

6465
n_categorical_features = X.select_dtypes(include="category").shape[1]
6566
n_numerical_features = X.select_dtypes(include="number").shape[1]
@@ -153,7 +154,6 @@
153154
),
154155
)
155156

156-
157157
# %%
158158
# Model comparison
159159
# ----------------
@@ -230,7 +230,6 @@ def plot_results(figure_title):
230230
# %%
231231
# Limiting the number of splits
232232
# ------------------------------
233-
#
234233
# In general, one can expect poorer predictions from one-hot-encoded data,
235234
# especially when the tree depths or the number of nodes are limited: with
236235
# one-hot-encoded data, one needs more split points, i.e. more depth, in order

dev/_downloads/cd5de29451c4f8624f47d18def81839c/plot_gradient_boosting_categorical.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
},
3434
"outputs": [],
3535
"source": [
36-
"from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n \"Bldg_Type\",\n \"Garage_Finish\",\n \"Lot_Config\",\n \"Functional\",\n \"Mas_Vnr_Type\",\n \"House_Style\",\n \"Fireplace_Qu\",\n \"Exter_Cond\",\n \"Exter_Qual\",\n \"Pool_QC\",\n]\n\nnumerical_columns_subset = [\n \"Three_season_porch\",\n \"Fireplaces\",\n \"Bsmt_Half_Bath\",\n \"Half_Bath\",\n \"Garage_Cars\",\n \"TotRms_AbvGrd\",\n \"BsmtFin_SF_1\",\n \"BsmtFin_SF_2\",\n \"Gr_Liv_Area\",\n \"Screen_Porch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
36+
"from sklearn.datasets import fetch_openml\n\nX, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)\n\n# Select only a subset of features of X to make the example faster to run\ncategorical_columns_subset = [\n \"BldgType\",\n \"GarageFinish\",\n \"LotConfig\",\n \"Functional\",\n \"MasVnrType\",\n \"HouseStyle\",\n \"FireplaceQu\",\n \"ExterCond\",\n \"ExterQual\",\n \"PoolQC\",\n]\n\nnumerical_columns_subset = [\n \"3SsnPorch\",\n \"Fireplaces\",\n \"BsmtHalfBath\",\n \"HalfBath\",\n \"GarageCars\",\n \"TotRmsAbvGrd\",\n \"BsmtFinSF1\",\n \"BsmtFinSF2\",\n \"GrLivArea\",\n \"ScreenPorch\",\n]\n\nX = X[categorical_columns_subset + numerical_columns_subset]\nX[categorical_columns_subset] = X[categorical_columns_subset].astype(\"category\")\n\nn_categorical_features = X.select_dtypes(include=\"category\").shape[1]\nn_numerical_features = X.select_dtypes(include=\"number\").shape[1]\n\nprint(f\"Number of samples: {X.shape[0]}\")\nprint(f\"Number of features: {X.shape[1]}\")\nprint(f\"Number of categorical features: {n_categorical_features}\")\nprint(f\"Number of numerical features: {n_numerical_features}\")"
3737
]
3838
},
3939
{
@@ -137,7 +137,7 @@
137137
"cell_type": "markdown",
138138
"metadata": {},
139139
"source": [
140-
"## Limitting the number of splits\n\nIn general, one can expect poorer predictions from one-hot-encoded data,\nespecially when the tree depths or the number of nodes are limited: with\none-hot-encoded data, one needs more split points, i.e. more depth, in order\nto recover an equivalent split that could be obtained in one single split\npoint with native handling.\n\nThis is also true when categories are treated as ordinal quantities: if\ncategories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder\nmodel will need 3 split points (one per category in the left node), and the\nordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split\nto isolate `F`, and 2 splits to isolate `C` from `BCDE`.\n\nHow strongly the models' performances differ in practice will depend on the\ndataset and on the flexibility of the trees.\n\nTo see this, let us re-run the same analysis with under-fitting models where\nwe artificially limit the total number of splits by both limitting the number\nof trees and the depth of each tree.\n\n"
140+
"## Limitting the number of splits\nIn general, one can expect poorer predictions from one-hot-encoded data,\nespecially when the tree depths or the number of nodes are limited: with\none-hot-encoded data, one needs more split points, i.e. more depth, in order\nto recover an equivalent split that could be obtained in one single split\npoint with native handling.\n\nThis is also true when categories are treated as ordinal quantities: if\ncategories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder\nmodel will need 3 split points (one per category in the left node), and the\nordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split\nto isolate `F`, and 2 splits to isolate `C` from `BCDE`.\n\nHow strongly the models' performances differ in practice will depend on the\ndataset and on the flexibility of the trees.\n\nTo see this, let us re-run the same analysis with under-fitting models where\nwe artificially limit the total number of splits by both limitting the number\nof trees and the depth of each tree.\n\n"
141141
]
142142
},
143143
{

dev/_downloads/scikit-learn-docs.zip

2.5 KB
Binary file not shown.
58 Bytes
82 Bytes
-343 Bytes
-249 Bytes
11 Bytes

0 commit comments

Comments
 (0)