Combine predictors using stacking
=================================

+ .. currentmodule:: sklearn
+
Stacking refers to a method to blend estimators. In this strategy, some
estimators are individually fitted on some training data, while a final
estimator is trained using the stacked predictions of these base estimators.
print(__doc__)

# Authors: Guillaume Lemaitre <[email protected]>
+ #          Maria Telenczuk <https://github.com/maikia>
# License: BSD 3 clause

+
###############################################################################
- # The function ``plot_regression_results`` is used to plot the predicted and
- # true targets.
+ # Download the dataset
+ ###############################################################################
+ #
+ # We will use the `Ames Housing`_ dataset, which was first compiled by Dean
+ # De Cock and became better known after it was used in a Kaggle challenge. It
+ # is a set of 1460 residential homes in Ames, Iowa, each described by 80
+ # features. We will use it to predict the final logarithmic price of the
+ # houses. In this example we will use only the 20 most interesting features,
+ # chosen using GradientBoostingRegressor(), and limit the number of entries
+ # (here we won't go into the details on how to select the most interesting
+ # features).
+ #
+ # The Ames housing dataset is not shipped with scikit-learn and therefore we
+ # will fetch it from `OpenML`_.
+ #
+ # .. _`Ames Housing`: http://jse.amstat.org/v19n3/decock.pdf
+ # .. _`OpenML`: https://www.openml.org/d/42165

- import matplotlib.pyplot as plt
+ import numpy as np

+ from sklearn.datasets import fetch_openml
+ from sklearn.utils import shuffle

- def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
-     """Scatter plot of the predicted vs true targets."""
-     ax.plot([y_true.min(), y_true.max()],
-             [y_true.min(), y_true.max()],
-             '--r', linewidth=2)
-     ax.scatter(y_true, y_pred, alpha=0.2)

-     ax.spines['top'].set_visible(False)
-     ax.spines['right'].set_visible(False)
-     ax.get_xaxis().tick_bottom()
-     ax.get_yaxis().tick_left()
-     ax.spines['left'].set_position(('outward', 10))
-     ax.spines['bottom'].set_position(('outward', 10))
-     ax.set_xlim([y_true.min(), y_true.max()])
-     ax.set_ylim([y_true.min(), y_true.max()])
-     ax.set_xlabel('Measured')
-     ax.set_ylabel('Predicted')
-     extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
-                           edgecolor='none', linewidth=0)
-     ax.legend([extra], [scores], loc='upper left')
-     title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
-     ax.set_title(title)
+ def load_ames_housing():
+     df = fetch_openml(name="house_prices", as_frame=True)
+     X = df.data
+     y = df.target
+
+     features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating',
+                 'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea',
+                 'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars',
+                 'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1',
+                 'HouseStyle', 'MiscFeature', 'MoSold']
+
+     X = X[features]
+     X, y = shuffle(X, y, random_state=0)
+
+     X = X[:600]
+     y = y[:600]
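+     # model the sale price on a log scale, as described in the introduction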
+     return X, np.log(y)
+
+
+ X, y = load_ames_housing()
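+
+ # A quick sanity check (an illustrative addition, not part of the original
+ # example): after subsampling we expect 600 rows and the 20 selected
+ # features, with missing values still present in some columns, which is
+ # exactly what the preprocessing section below deals with.
+ print(X.shape)                # (600, 20)
+ print(X.isna().any().any())   # True: some features contain missing values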
+
+
+ ###############################################################################
+ # Make pipeline to preprocess the data
+ ###############################################################################
+ #
+ # Before we can use the Ames dataset, we still need to do some preprocessing.
+ # First, the dataset has many missing values. To impute them, we will replace
+ # categorical missing values with the new category 'missing' and numerical
+ # missing values with the mean of the column. We will also encode the
+ # categories with either :class:`sklearn.preprocessing.OneHotEncoder
+ # <sklearn.preprocessing.OneHotEncoder>` or
+ # :class:`sklearn.preprocessing.OrdinalEncoder
+ # <sklearn.preprocessing.OrdinalEncoder>`, depending on the type of model
+ # (linear or non-linear) we will use them with. To facilitate this
+ # preprocessing we will make two pipelines.
+ # You can skip this section if your data is ready to use and does not need
+ # preprocessing.
+
+
+ from sklearn.compose import make_column_transformer
+ from sklearn.impute import SimpleImputer
+ from sklearn.pipeline import make_pipeline
+ from sklearn.preprocessing import OneHotEncoder
+ from sklearn.preprocessing import OrdinalEncoder
+ from sklearn.preprocessing import StandardScaler
+
+
+ cat_cols = X.columns[X.dtypes == 'O']
+ num_cols = X.columns[X.dtypes == 'float64']
+
+ categories = [
+     X[column].unique() for column in X[cat_cols]]
+
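+ # 'None' entries in the categorical columns are missing values; rename them
+ # to the explicit category 'missing' so that the encoders below are
+ # configured with every category they may encounter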
+ for cat in categories:
+     cat[cat == None] = 'missing'  # noqa
+
+ cat_proc_nlin = make_pipeline(
+     SimpleImputer(missing_values=None, strategy='constant',
+                   fill_value='missing'),
+     OrdinalEncoder(categories=categories)
+ )
+
+ num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))
+
+ cat_proc_lin = make_pipeline(
+     SimpleImputer(missing_values=None,
+                   strategy='constant',
+                   fill_value='missing'),
+     OneHotEncoder(categories=categories)
+ )
+
+ num_proc_lin = make_pipeline(
+     SimpleImputer(strategy='mean'),
+     StandardScaler()
+ )
+
+ # transformation to use for non-linear estimators
+ processor_nlin = make_column_transformer(
+     (cat_proc_nlin, cat_cols),
+     (num_proc_nlin, num_cols),
+     remainder='passthrough')
+
+ # transformation to use for linear estimators
+ processor_lin = make_column_transformer(
+     (cat_proc_lin, cat_cols),
+     (num_proc_lin, num_cols),
+     remainder='passthrough')
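+
+ # An illustrative check (not part of the original example): the one-hot
+ # encoding used for linear models expands the 20 raw features into a much
+ # wider design matrix, while the ordinal encoding keeps one column per
+ # feature:
+ #
+ #   print(processor_lin.fit_transform(X).shape)   # (600, n) with n > 20
+ #   print(processor_nlin.fit_transform(X).shape)  # (600, 20)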


###############################################################################
# Stack of predictors on a single data set
###############################################################################
+ #
# It is sometimes tedious to find the model which will best perform on a given
# dataset. Stacking provides an alternative by combining the outputs of several
# learners, without the need to choose a model specifically. The performance of
@@ -60,35 +148,79 @@ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
#
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs together.
+ #
+ # Note: although we will make new pipelines with the processors which we
+ # wrote in the previous section for the 3 learners, the final estimator
+ # RidgeCV() does not need preprocessing of the data as it will be fed the
+ # already preprocessed output from the 3 learners.
+

- from sklearn.ensemble import StackingRegressor
- from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

- estimators = [
-     ('Random Forest', RandomForestRegressor(random_state=42)),
-     ('Lasso', LassoCV()),
-     ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0))
- ]
- stacking_regressor = StackingRegressor(
-     estimators=estimators, final_estimator=RidgeCV()
- )

+ lasso_pipeline = make_pipeline(processor_lin,
+                                LassoCV())
+
+ rf_pipeline = make_pipeline(processor_nlin,
+                             RandomForestRegressor(random_state=42))

+ gradient_pipeline = make_pipeline(
+     processor_nlin,
+     HistGradientBoostingRegressor(random_state=0))
+
+ estimators = [('Random Forest', rf_pipeline),
+               ('Lasso', lasso_pipeline),
+               ('Gradient Boosting', gradient_pipeline)]
+
+ stacking_regressor = StackingRegressor(estimators=estimators,
+                                        final_estimator=RidgeCV())
+
+
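+ # A minimal sketch of what the stacking strategy does internally (an
+ # illustrative addition, not the actual StackingRegressor code): each base
+ # learner's out-of-fold predictions become the input features on which the
+ # final estimator is trained.
+ #
+ #   from sklearn.model_selection import cross_val_predict
+ #   stacked = np.column_stack([
+ #       cross_val_predict(est, X, y) for _, est in estimators])
+ #   final = RidgeCV().fit(stacked, y)
+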
+ ###############################################################################
+ # Measure and plot the results
###############################################################################
- # We used the Boston data set (prediction of house prices). We check the
- # performance of each individual predictor as well as the stack of the
+ #
+ # Now we can use the Ames Housing dataset to make the predictions. We check
+ # the performance of each individual predictor as well as of the stack of the
# regressors.
+ #
+ # The function ``plot_regression_results`` is used to plot the predicted and
+ # true targets.
+

import time
- import numpy as np
- from sklearn.datasets import load_boston
+ import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_predict

- X, y = load_boston(return_X_y=True)
+
+ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
+     """Scatter plot of the predicted vs true targets."""
+     ax.plot([y_true.min(), y_true.max()],
+             [y_true.min(), y_true.max()],
+             '--r', linewidth=2)
+     ax.scatter(y_true, y_pred, alpha=0.2)
+
+     ax.spines['top'].set_visible(False)
+     ax.spines['right'].set_visible(False)
+     ax.get_xaxis().tick_bottom()
+     ax.get_yaxis().tick_left()
+     ax.spines['left'].set_position(('outward', 10))
+     ax.spines['bottom'].set_position(('outward', 10))
+     ax.set_xlim([y_true.min(), y_true.max()])
+     ax.set_ylim([y_true.min(), y_true.max()])
+     ax.set_xlabel('Measured')
+     ax.set_ylabel('Predicted')
+     extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
+                           edgecolor='none', linewidth=0)
+     ax.legend([extra], [scores], loc='upper left')
+     title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
+     ax.set_title(title)
+

fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)
@@ -102,6 +234,7 @@ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
+
    plot_regression_results(
        ax, y, y_pred,
        name,