
Commit 9805d7f

Pushing the docs to dev/ for branch: master, commit 9d677b56f4244eb08f1c7e2061a1e86fd0aee748
1 parent d7f66b7 commit 9805d7f

File tree

1,202 files changed: +4409 lines added, -3772 lines removed


dev/_downloads/b5a4a1546e908b944c14370f9e7e2a25/plot_column_transformer_mixed_types.ipynb

Lines changed: 108 additions & 7 deletions
@@ -33,7 +33,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Use ``ColumnTransformer`` by selecting columns by name\n##############################################################################\n We will train our classifier with the following features:\n\n Numeric Features:\n\n * ``age``: float;\n * ``fare``: float.\n\n Categorical Features:\n\n * ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;\n * ``sex``: categories encoded as strings ``{'female', 'male'}``;\n * ``pclass``: ordinal integers ``{1, 2, 3}``.\n\n We create the preprocessing pipelines for both numeric and categorical data.\n\n"
+"Use ``ColumnTransformer`` by selecting columns by name\n##############################################################################\n We will train our classifier with the following features:\n\n Numeric Features:\n\n * ``age``: float;\n * ``fare``: float.\n\n Categorical Features:\n\n * ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;\n * ``sex``: categories encoded as strings ``{'female', 'male'}``;\n * ``pclass``: ordinal integers ``{1, 2, 3}``.\n\n We create the preprocessing pipelines for both numeric and categorical data.\n Note that ``pclass`` could be treated as either a categorical or a numeric\n feature.\n\n"
 ]
 },
 {
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"numeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)])\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
+"numeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)])\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n random_state=0)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
 ]
 },
 {
@@ -62,7 +62,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn import set_config\nset_config(display='diagram')\nclf"
+"from sklearn import set_config\n\nset_config(display='diagram')\nclf"
 ]
 },
 {
@@ -80,7 +80,7 @@
 },
 "outputs": [],
 "source": [
-"subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']\nX = X[subset_feature]"
+"subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']\nX_train, X_test = X_train[subset_feature], X_test[subset_feature]"
 ]
 },
 {
@@ -98,7 +98,7 @@
 },
 "outputs": [],
 "source": [
-"X.info()"
+"X_train.info()"
 ]
 },
 {
@@ -123,7 +123,36 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import make_column_selector as selector\n\npreprocessor = ColumnTransformer(transformers=[\n ('num', numeric_transformer, selector(dtype_exclude=\"category\")),\n ('cat', categorical_transformer, selector(dtype_include=\"category\"))\n])\n\n# Reproduce the identical fit/score process\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
+"from sklearn.compose import make_column_selector as selector\n\npreprocessor = ColumnTransformer(transformers=[\n ('num', numeric_transformer, selector(dtype_exclude=\"category\")),\n ('cat', categorical_transformer, selector(dtype_include=\"category\"))\n])\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"The resulting score is not exactly the same as the one from the previous\npipeline because the dtype-based selector treats the ``pclass`` column as\na numeric feature instead of a categorical feature as previously:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"selector(dtype_exclude=\"category\")(X_train)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"selector(dtype_include=\"category\")(X_train)"
 ]
 },
 {
@@ -141,7 +170,79 @@
 },
 "outputs": [],
 "source": [
-"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
+"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Calling 'fit' triggers the cross-validated search for the best\nhyper-parameter combination:\n\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"grid_search.fit(X_train, y_train)\n\nprint(\"Best params:\")\nprint(grid_search.best_params_)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"The internal cross-validation score obtained with those parameters is:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"print(f\"Internal CV score: {grid_search.best_score_:.3f}\")"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We can also introspect the top grid search results as a pandas dataframe:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import pandas as pd\n\ncv_results = pd.DataFrame(grid_search.cv_results_)\ncv_results = cv_results.sort_values(\"mean_test_score\", ascending=False)\ncv_results[[\"mean_test_score\", \"std_test_score\",\n \"param_preprocessor__num__imputer__strategy\",\n \"param_classifier__C\"\n ]].head(5)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"The best hyper-parameters have been used to re-fit a final model on the full\ntraining set. We can evaluate that final model on held-out test data that was\nnot used for hyperparameter tuning.\n\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"print((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
 ]
 }
 ],
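An aside on the recurring change in the hunks above: both pipelines now share a single split created with ``random_state=0``, which is what makes the two printed scores directly comparable. A minimal sketch of the effect, using hypothetical toy arrays rather than the example's Titanic dataframe:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)
y = np.arange(10) % 2

# Same seed -> identical split on every run, so downstream scores are
# reproducible and comparable across pipelines.
X_a, _, y_a, _ = train_test_split(X, y, test_size=0.2, random_state=0)
X_b, _, y_b, _ = train_test_split(X, y, test_size=0.2, random_state=0)
assert (X_a == X_b).all() and (y_a == y_b).all()
# Without a seed, the split (and hence any reported score) may change
# between runs.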

dev/_downloads/ec7916875965bf7f54b7cfe8e6dc4cc2/plot_column_transformer_mixed_types.py

Lines changed: 49 additions & 5 deletions
@@ -61,6 +61,8 @@
 # * ``pclass``: ordinal integers ``{1, 2, 3}``.
 #
 # We create the preprocessing pipelines for both numeric and categorical data.
+# Note that ``pclass`` could be treated as either a categorical or a numeric
+# feature.
 
 numeric_features = ['age', 'fare']
 numeric_transformer = Pipeline(steps=[
@@ -82,7 +84,8 @@
 clf = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression())])
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
+                                                    random_state=0)
 
 clf.fit(X_train, y_train)
 print("model score: %.3f" % clf.score(X_test, y_test))
@@ -93,6 +96,7 @@
 # When the ``Pipeline`` is printed out in a jupyter notebook an HTML
 # representation of the estimator is displayed as follows:
 from sklearn import set_config
+
 set_config(display='diagram')
 clf
 
@@ -107,12 +111,12 @@
 # example.
 
 subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']
-X = X[subset_feature]
+X_train, X_test = X_train[subset_feature], X_test[subset_feature]
 
 ###############################################################################
 # Then, we introspect the information regarding each column data type.
 
-X.info()
+X_train.info()
 
 ###############################################################################
 # We can observe that the `embarked` and `sex` columns were tagged as
@@ -134,13 +138,24 @@
     ('num', numeric_transformer, selector(dtype_exclude="category")),
     ('cat', categorical_transformer, selector(dtype_include="category"))
 ])
+clf = Pipeline(steps=[('preprocessor', preprocessor),
+                      ('classifier', LogisticRegression())])
 
-# Reproduce the identical fit/score process
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
 clf.fit(X_train, y_train)
 print("model score: %.3f" % clf.score(X_test, y_test))
 
+###############################################################################
+# The resulting score is not exactly the same as the one from the previous
+# pipeline because the dtype-based selector treats the ``pclass`` column as
+# a numeric feature instead of a categorical feature as previously:
+
+selector(dtype_exclude="category")(X_train)
+
+###############################################################################
+
+selector(dtype_include="category")(X_train)
+
 ###############################################################################
 # Using the prediction pipeline in a grid search
 ###############################################################################
@@ -157,7 +172,36 @@
 }
 
 grid_search = GridSearchCV(clf, param_grid, cv=10)
+grid_search
+
+###############################################################################
+# Calling 'fit' triggers the cross-validated search for the best
+# hyper-parameter combination:
+#
 grid_search.fit(X_train, y_train)
 
+print("Best params:")
+print(grid_search.best_params_)
+
+###############################################################################
+# The internal cross-validation score obtained with those parameters is:
+print(f"Internal CV score: {grid_search.best_score_:.3f}")
+
+###############################################################################
+# We can also introspect the top grid search results as a pandas dataframe:
+import pandas as pd
+
+cv_results = pd.DataFrame(grid_search.cv_results_)
+cv_results = cv_results.sort_values("mean_test_score", ascending=False)
+cv_results[["mean_test_score", "std_test_score",
+            "param_preprocessor__num__imputer__strategy",
+            "param_classifier__C"
+            ]].head(5)
+
+###############################################################################
+# The best hyper-parameters have been used to re-fit a final model on the full
+# training set. We can evaluate that final model on held-out test data that was
+# not used for hyperparameter tuning.
+#
 print(("best logistic regression from grid search: %.3f"
        % grid_search.score(X_test, y_test)))
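An aside on the key API this commit switches to, ``make_column_selector``: the selector is a callable that maps a dataframe to the list of column names matching a dtype filter, which is exactly why the integer-coded ``pclass`` now lands on the numeric side. A minimal sketch on a hypothetical two-row frame (not the Titanic data used by the example):

import pandas as pd
from sklearn.compose import make_column_selector as selector

X = pd.DataFrame({
    "age": [22.0, 38.0],    # float64 -> selected as numeric
    "fare": [7.25, 71.28],  # float64 -> selected as numeric
    "pclass": [3, 1],       # int64 -> numeric here, unlike the name-based setup
    "sex": pd.Series(["male", "female"], dtype="category"),
    "embarked": pd.Series(["S", "C"], dtype="category"),
})

# ColumnTransformer applies the selector at fit time; calling it directly
# shows which columns each transformer would receive.
print(selector(dtype_exclude="category")(X))  # ['age', 'fare', 'pclass']
print(selector(dtype_include="category")(X))  # ['sex', 'embarked']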

dev/_downloads/scikit-learn-docs.pdf

18.7 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes
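Finally, for readers skimming the grid-search hunks: keys such as ``preprocessor__num__imputer__strategy`` follow scikit-learn's double-underscore convention for addressing parameters of nested estimators. A minimal sketch (the layout mirrors the example; data and fitting are omitted, so the pieces shown are illustrative):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                      ('scaler', StandardScaler())])
pre = ColumnTransformer(transformers=[('num', num, ['age', 'fare'])])
clf = Pipeline(steps=[('preprocessor', pre),
                      ('classifier', LogisticRegression())])

# get_params() exposes every nested parameter under its double-underscore
# path, which is the namespace GridSearchCV's param_grid keys address.
print(clf.get_params()['preprocessor__num__imputer__strategy'])  # 'median'
clf.set_params(preprocessor__num__imputer__strategy='mean')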
