Commit 7e65402

Pushing the docs to dev/ for branch: master, commit 69ea066412abc0c4a4c59c03e532ffe26b96c9c1

1 parent 42d26bb · commit 7e65402

1,257 files changed: 12,914 additions, 4,651 deletions


dev/_downloads/00a5ddd24a9ad44708f4ab3b157ef0ff/plot_stack_predictors.py

Lines changed: 172 additions & 39 deletions
@@ -3,6 +3,8 @@
 Combine predictors using stacking
 =================================
 
+.. currentmodule:: sklearn
+
 Stacking refers to a method to blend estimators. In this strategy, some
 estimators are individually fitted on some training data while a final
 estimator is trained using the stacked predictions of these base estimators.
@@ -16,42 +18,128 @@
 print(__doc__)
 
 # Authors: Guillaume Lemaitre <[email protected]>
+#          Maria Telenczuk <https://github.com/maikia>
 # License: BSD 3 clause
 
+
 ###############################################################################
-# The function ``plot_regression_results`` is used to plot the predicted and
-# true targets.
+# Download the dataset
+###############################################################################
+#
+# We will use the `Ames Housing`_ dataset, first compiled by Dean De Cock and
+# made better known after it was used in a Kaggle challenge. It is a set of
+# 1460 residential homes in Ames, Iowa, each described by 80 features. We
+# will use it to predict the final logarithmic price of the houses. In this
+# example we will use only the 20 most interesting features, chosen using
+# GradientBoostingRegressor(), and limit the number of entries (here we won't
+# go into the details on how to select the most interesting features).
+#
+# The Ames housing dataset is not shipped with scikit-learn and therefore we
+# will fetch it from `OpenML`_.
+#
+# .. _`Ames Housing`: http://jse.amstat.org/v19n3/decock.pdf
+# .. _`OpenML`: https://www.openml.org/d/42165
 
-import matplotlib.pyplot as plt
+import numpy as np
 
+from sklearn.datasets import fetch_openml
+from sklearn.utils import shuffle
 
-def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
-    """Scatter plot of the predicted vs true targets."""
-    ax.plot([y_true.min(), y_true.max()],
-            [y_true.min(), y_true.max()],
-            '--r', linewidth=2)
-    ax.scatter(y_true, y_pred, alpha=0.2)
 
-    ax.spines['top'].set_visible(False)
-    ax.spines['right'].set_visible(False)
-    ax.get_xaxis().tick_bottom()
-    ax.get_yaxis().tick_left()
-    ax.spines['left'].set_position(('outward', 10))
-    ax.spines['bottom'].set_position(('outward', 10))
-    ax.set_xlim([y_true.min(), y_true.max()])
-    ax.set_ylim([y_true.min(), y_true.max()])
-    ax.set_xlabel('Measured')
-    ax.set_ylabel('Predicted')
-    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
-                          edgecolor='none', linewidth=0)
-    ax.legend([extra], [scores], loc='upper left')
-    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
-    ax.set_title(title)
+def load_ames_housing():
+    df = fetch_openml(name="house_prices", as_frame=True)
+    X = df.data
+    y = df.target
+
+    features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating',
+                'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea',
+                'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars',
+                'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1',
+                'HouseStyle', 'MiscFeature', 'MoSold']
+
+    X = X[features]
+    X, y = shuffle(X, y, random_state=0)
+
+    X = X[:600]
+    y = y[:600]
+    return X, np.log(y)
+
+
+X, y = load_ames_housing()
+
+
+###############################################################################
+# Make pipeline to preprocess the data
+###############################################################################
+#
+# Before we can use the Ames dataset we still need to do some preprocessing.
+# First, the dataset has many missing values. To impute them, we will replace
+# categorical missing values with the new category 'missing' and numerical
+# missing values with the 'mean' of the column. We will also encode the
+# categories with either :class:`sklearn.preprocessing.OneHotEncoder
+# <sklearn.preprocessing.OneHotEncoder>` or
+# :class:`sklearn.preprocessing.OrdinalEncoder
+# <sklearn.preprocessing.OrdinalEncoder>`, depending on which type of model
+# we will use them with (linear or non-linear). To facilitate this
+# preprocessing we will make two pipelines.
+# You can skip this section if your data is ready to use and does not need
+# preprocessing.
+
+
+from sklearn.compose import make_column_transformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import StandardScaler
+
+
+cat_cols = X.columns[X.dtypes == 'O']
+num_cols = X.columns[X.dtypes == 'float64']
+
+categories = [
+    X[column].unique() for column in X[cat_cols]]
+
+for cat in categories:
+    cat[cat == None] = 'missing'  # noqa
+
+cat_proc_nlin = make_pipeline(
+    SimpleImputer(missing_values=None, strategy='constant',
+                  fill_value='missing'),
+    OrdinalEncoder(categories=categories)
+)
+
+num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))
+
+cat_proc_lin = make_pipeline(
+    SimpleImputer(missing_values=None,
+                  strategy='constant',
+                  fill_value='missing'),
+    OneHotEncoder(categories=categories)
+)
+
+num_proc_lin = make_pipeline(
+    SimpleImputer(strategy='mean'),
+    StandardScaler()
+)
+
+# transformation to use for non-linear estimators
+processor_nlin = make_column_transformer(
+    (cat_proc_nlin, cat_cols),
+    (num_proc_nlin, num_cols),
+    remainder='passthrough')
+
+# transformation to use for linear estimators
+processor_lin = make_column_transformer(
+    (cat_proc_lin, cat_cols),
+    (num_proc_lin, num_cols),
+    remainder='passthrough')
 
 
 ###############################################################################
 # Stack of predictors on a single data set
 ###############################################################################
+#
 # It is sometimes tedious to find the model which will best perform on a given
 # dataset. Stacking provides an alternative by combining the outputs of several
 # learners, without the need to choose a model specifically. The performance of
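
The comment block in the hunk above says the 20 features were chosen with GradientBoostingRegressor() but deliberately skips how. As a hedged aside (not part of this commit), one way such a ranking could be produced is to integer-encode the categorical columns, impute the remaining missing values, fit a GradientBoostingRegressor and sort its impurity-based feature importances; the encoding shortcut and the n_top name below are illustrative assumptions, not the procedure the authors used.

import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.ensemble import GradientBoostingRegressor

df = fetch_openml(name="house_prices", as_frame=True)
X_full, y_full = df.data, df.target

# Integer-encode object columns and fill remaining NaNs so a single
# GradientBoostingRegressor can be fitted on every column at once.
X_enc = X_full.copy()
for col in X_enc.columns:
    if X_enc[col].dtype == object:
        X_enc[col] = X_enc[col].astype('category').cat.codes
X_enc = X_enc.fillna(X_enc.median())

# Rank features by impurity-based importance and keep the top 20.
gbr = GradientBoostingRegressor(random_state=0).fit(X_enc, np.log(y_full))
n_top = 20
ranking = np.argsort(gbr.feature_importances_)[::-1][:n_top]
print(X_enc.columns[ranking].tolist())
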
@@ -60,35 +148,79 @@ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
 #
 # Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
 # to combine their outputs.
+#
+# Note: although we will make new pipelines with the processors written in
+# the previous section for the 3 learners, the final estimator RidgeCV()
+# does not need preprocessing of the data as it will be fed the already
+# preprocessed output of the 3 learners.
+
 
-from sklearn.ensemble import StackingRegressor
-from sklearn.ensemble import RandomForestRegressor
 from sklearn.experimental import enable_hist_gradient_boosting  # noqa
 from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import StackingRegressor
 from sklearn.linear_model import LassoCV
 from sklearn.linear_model import RidgeCV
 
-estimators = [
-    ('Random Forest', RandomForestRegressor(random_state=42)),
-    ('Lasso', LassoCV()),
-    ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0))
-]
-stacking_regressor = StackingRegressor(
-    estimators=estimators, final_estimator=RidgeCV()
-)
 
+lasso_pipeline = make_pipeline(processor_lin,
+                               LassoCV())
+
+rf_pipeline = make_pipeline(processor_nlin,
+                            RandomForestRegressor(random_state=42))
 
+gradient_pipeline = make_pipeline(
+    processor_nlin,
+    HistGradientBoostingRegressor(random_state=0))
+
+estimators = [('Random Forest', rf_pipeline),
+              ('Lasso', lasso_pipeline),
+              ('Gradient Boosting', gradient_pipeline)]
+
+stacking_regressor = StackingRegressor(estimators=estimators,
+                                       final_estimator=RidgeCV())
+
+
+###############################################################################
+# Measure and plot the results
 ###############################################################################
-# We used the Boston data set (prediction of house prices). We check the
-# performance of each individual predictor as well as the stack of the
+#
+# Now we can use the Ames Housing dataset to make the predictions. We check
+# the performance of each individual predictor as well as of the stack of the
 # regressors.
+#
+# The function ``plot_regression_results`` is used to plot the predicted and
+# true targets.
+
 
 import time
-import numpy as np
-from sklearn.datasets import load_boston
+import matplotlib.pyplot as plt
 from sklearn.model_selection import cross_validate, cross_val_predict
 
-X, y = load_boston(return_X_y=True)
+
+def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
+    """Scatter plot of the predicted vs true targets."""
+    ax.plot([y_true.min(), y_true.max()],
+            [y_true.min(), y_true.max()],
+            '--r', linewidth=2)
+    ax.scatter(y_true, y_pred, alpha=0.2)
+
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.get_xaxis().tick_bottom()
+    ax.get_yaxis().tick_left()
+    ax.spines['left'].set_position(('outward', 10))
+    ax.spines['bottom'].set_position(('outward', 10))
+    ax.set_xlim([y_true.min(), y_true.max()])
+    ax.set_ylim([y_true.min(), y_true.max()])
+    ax.set_xlabel('Measured')
+    ax.set_ylabel('Predicted')
+    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
+                          edgecolor='none', linewidth=0)
+    ax.legend([extra], [scores], loc='upper left')
+    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
+    ax.set_title(title)
+
 
 fig, axs = plt.subplots(2, 2, figsize=(9, 7))
 axs = np.ravel(axs)
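
Another hedged aside, not in the commit: once the stacking_regressor defined in the hunk above has been fitted, the weight that the RidgeCV final estimator assigns to each base pipeline can be read from the fitted final_estimator_ attribute. The snippet reuses estimators, X and y from the example code shown above.

# Fit the stack, then print one ridge coefficient per base estimator,
# in the order the estimators were passed in.
stacking_regressor.fit(X, y)
for (name, _), coef in zip(estimators,
                           stacking_regressor.final_estimator_.coef_):
    print('{}: {:.3f}'.format(name, coef))
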
@@ -102,6 +234,7 @@ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
     elapsed_time = time.time() - start_time
 
     y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
+
     plot_regression_results(
         ax, y, y_pred,
         name,
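
Finally, a hedged sketch (not part of the commit) of how cross-validated scores like the ones annotated on the plots can be computed for the stacked model on its own, using the cross_validate import the example already relies on; the two scorer strings are assumptions consistent with the R2 and MAE values the plot reports.

from sklearn.model_selection import cross_validate

# Cross-validated R2 and mean absolute error for the stacked model alone.
cv_results = cross_validate(stacking_regressor, X, y,
                            scoring=['r2', 'neg_mean_absolute_error'],
                            n_jobs=-1, verbose=0)
print('R2: {:.2f} (+/- {:.2f})'.format(cv_results['test_r2'].mean(),
                                       cv_results['test_r2'].std()))
print('MAE: {:.2f} (+/- {:.2f})'.format(
    -cv_results['test_neg_mean_absolute_error'].mean(),
    cv_results['test_neg_mean_absolute_error'].std()))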
