Demonstrate how model complexity influences both prediction accuracy and
computational performance.

- The dataset is the Boston Housing dataset (resp. 20 Newsgroups) for
- regression (resp. classification).
+ We will be using two datasets:
+ - :ref:`diabetes_dataset` for regression.
+   This dataset consists of 10 measurements taken from diabetes patients.
+   The task is to predict disease progression;
+ - :ref:`20newsgroups_dataset` for classification. This dataset consists of
+   newsgroup posts. The task is to predict which topic (out of 20 topics)
+   the post is written about.
+
+ We will model the complexity influence on three different estimators:
+ - :class:`~sklearn.linear_model.SGDClassifier` (for classification data)
+   which implements stochastic gradient descent learning;
+
+ - :class:`~sklearn.svm.NuSVR` (for regression data) which implements
+   Nu support vector regression;
+
+ - :class:`~sklearn.ensemble.GradientBoostingRegressor` (for regression
+   data) which builds an additive model in a forward stage-wise fashion.
+
+
+ We make the model complexity vary through the choice of relevant model
+ parameters in each of our selected models. Next, we will measure the influence
+ on both computational performance (latency) and predictive power (MSE or
+ Hamming Loss).

- For each class of models we make the model complexity vary through the choice
- of relevant model parameters and measure the influence on both computational
- performance (latency) and predictive power (MSE or Hamming Loss).
"""

print(__doc__)

- # Author: Eustache Diemert <[email protected]>
+ # Authors: Eustache Diemert <[email protected]>
+ #          Maria Telenczuk <https://github.com/maikia>
+ #          Guillaume Lemaitre <[email protected]>
# License: BSD 3 clause

import time
import numpy as np
import matplotlib.pyplot as plt
- from mpl_toolkits.axes_grid1.parasite_axes import host_subplot
- from mpl_toolkits.axisartist.axislines import Axes
- from scipy.sparse.csr import csr_matrix

from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import hamming_loss
53
37
- # #############################################################################
38
- # Routines
39
-
40
54
41
55
# Initialize random generator
42
56
np .random .seed (0 )
43
57
58
+ ##############################################################################
+ # Load the data
+ # -------------
+ #
+ # First we load both datasets.
+ #
+ # .. note:: We are using
+ #    :func:`~sklearn.datasets.fetch_20newsgroups_vectorized` to download the
+ #    20 newsgroups dataset. It returns ready-to-use features.
+ #
+ # .. note:: ``X`` of the 20 newsgroups dataset is a sparse matrix, while ``X``
+ #    of the diabetes dataset is a numpy array.
+ #

- def generate_data(case, sparse=False):
+
+ def generate_data(case):
    """Generate regression/classification data."""
    if case == 'regression':
-         X, y = datasets.load_boston(return_X_y=True)
+         X, y = datasets.load_diabetes(return_X_y=True)
    elif case == 'classification':
        X, y = datasets.fetch_20newsgroups_vectorized(subset='all',
                                                      return_X_y=True)
    X, y = shuffle(X, y)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
-     if sparse:
-         X_train = csr_matrix(X_train)
-         X_test = csr_matrix(X_test)
-     else:
-         X_train = np.array(X_train)
-         X_test = np.array(X_test)
-     y_test = np.array(y_test)
-     y_train = np.array(y_train)
+
    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
            'y_test': y_test}
    return data


+ regression_data = generate_data('regression')
+ classification_data = generate_data('classification')
+
+
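+ # A minimal sanity check of the returned containers (an illustrative sketch,
+ # not part of the upstream example): the 20 newsgroups features come back as
+ # a SciPy sparse matrix, while the diabetes features are a dense NumPy array.
+ from scipy import sparse
+ assert sparse.issparse(classification_data['X_train'])
+ assert isinstance(regression_data['X_train'], np.ndarray)
+
+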
+ ##############################################################################
+ # Benchmark influence
+ # -------------------
+ # Next, we can calculate the influence of the parameters on the given
+ # estimator. In each round, we will set the estimator with the new value of
+ # ``changing_param`` and we will collect the prediction times, prediction
+ # performance, and complexities to see how those changes affect the estimator.
+ # We will calculate the complexity using ``complexity_computer`` passed as a
+ # parameter.
+ #
+
+
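+ # For classification, prediction performance is reported with the Hamming
+ # loss (imported above), i.e. the fraction of labels that are incorrectly
+ # predicted. A small sketch of its behaviour, on made-up labels::
+ #
+ #     >>> hamming_loss([1, 2, 3, 4], [1, 2, 4, 4])  # one of four labels wrong
+ #     0.25
+
+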
def benchmark_influence(conf):
    """
-     Benchmark influence of :changing_param: on both MSE and latency.
+     Benchmark influence of `changing_param` on both MSE and latency.
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        conf['tuned_params'][conf['changing_param']] = param_value
        estimator = conf['estimator'](**conf['tuned_params'])
+
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])
        conf['postfit_hook'](estimator)
@@ -95,37 +133,25 @@ def benchmark_influence(conf):
    return prediction_powers, prediction_times, complexities


- def plot_influence(conf, mse_values, prediction_times, complexities):
-     """
-     Plot influence of model complexity on both accuracy and latency.
-     """
-     plt.figure(figsize=(12, 6))
-     host = host_subplot(111, axes_class=Axes)
-     plt.subplots_adjust(right=0.75)
-     par1 = host.twinx()
-     host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
-     y1_label = conf['prediction_performance_label']
-     y2_label = "Time (s)"
-     host.set_ylabel(y1_label)
-     par1.set_ylabel(y2_label)
-     p1, = host.plot(complexities, mse_values, 'b-', label="prediction error")
-     p2, = par1.plot(complexities, prediction_times, 'r-',
-                     label="latency")
-     host.legend(loc='upper right')
-     host.axis["left"].label.set_color(p1.get_color())
-     par1.axis["right"].label.set_color(p2.get_color())
-     plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
-     plt.show()
-
+ ##############################################################################
+ # Choose parameters
+ # -----------------
+ #
+ # We choose the parameters for each of our estimators by making
+ # a dictionary with all the necessary values.
+ # ``changing_param`` is the name of the parameter which will vary in each
+ # estimator.
+ # Complexity will be defined by the ``complexity_label`` and calculated using
+ # ``complexity_computer``.
+ # Also note that depending on the estimator type we are passing
+ # different data.
+ #
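+ # Putting it together, each configuration is a dictionary using the keys seen
+ # throughout this example (a sketch with placeholder values; the full
+ # configurations follow below)::
+ #
+ #     {'estimator': ...,                # estimator class
+ #      'tuned_params': {...},           # fixed constructor parameters
+ #      'changing_param': '...',         # name of the varied parameter
+ #      'changing_param_values': [...],  # values it will take
+ #      'complexity_computer': ...,      # callable(fitted estimator) -> scalar
+ #      'data': ...,                     # regression or classification data
+ #      ...}
+ #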


def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)
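+
+ # Any callable with the same signature works as a complexity computer; for
+ # instance, for the support vector model one could count the support vectors
+ # of the fitted estimator (a sketch; ``support_vectors_`` only exists after
+ # fitting)::
+ #
+ #     def _count_support_vectors(estimator):
+ #         return len(estimator.support_vectors_)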

- # #############################################################################
- # Main code
- regression_data = generate_data('regression')
- classification_data = generate_data('classification', sparse=True)
+
configurations = [
    {'estimator': SGDClassifier,
     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
@@ -162,8 +188,81 @@ def _count_nonzero_coefficients(estimator):
     'prediction_performance_label': 'MSE',
     'n_samples': 30},
]
+
+
+ ##############################################################################
+ # Run the code and plot the results
+ # ---------------------------------
+ #
+ # We defined all the functions required to run our benchmark. Now, we will loop
+ # over the different configurations that we defined previously. Subsequently,
+ # we can analyze the plots obtained from the benchmark:
+ # Relaxing the `L1` penalty in the SGD classifier reduces the prediction error
+ # but leads to an increase in the training time.
+ # We can draw a similar analysis regarding the training time, which increases
+ # with the number of support vectors with a Nu-SVR. However, we observe that
+ # there is an optimal number of support vectors which reduces the prediction
+ # error. Indeed, too few support vectors lead to an under-fitted model while
+ # too many support vectors lead to an over-fitted model.
+ # The exact same conclusion can be drawn for the gradient-boosting model. The
+ # only difference with the Nu-SVR is that having too many trees in the
+ # ensemble is not as detrimental.
+ #
+
+ def plot_influence(conf, mse_values, prediction_times, complexities):
+     """
+     Plot influence of model complexity on both accuracy and latency.
+     """
+
+     fig = plt.figure()
+     fig.subplots_adjust(right=0.75)
+
+     # first axes (prediction error)
+     ax1 = fig.add_subplot(111)
+     line1 = ax1.plot(complexities, mse_values, c='tab:blue', ls='-')[0]
+     ax1.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
+     y1_label = conf['prediction_performance_label']
+     ax1.set_ylabel(y1_label)
+
+     ax1.spines['left'].set_color(line1.get_color())
+     ax1.yaxis.label.set_color(line1.get_color())
+     ax1.tick_params(axis='y', colors=line1.get_color())
+
+     # second axes (latency)
+     ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
+     line2 = ax2.plot(complexities, prediction_times, c='tab:orange', ls='-')[0]
+     ax2.yaxis.tick_right()
+     ax2.yaxis.set_label_position("right")
+     y2_label = "Time (s)"
+     ax2.set_ylabel(y2_label)
+     ax1.spines['right'].set_color(line2.get_color())
+     ax2.yaxis.label.set_color(line2.get_color())
+     ax2.tick_params(axis='y', colors=line2.get_color())
+
+     plt.legend((line1, line2), ("prediction error", "latency"),
+                loc='upper right')
+
+     plt.title("Influence of varying '%s' on %s" % (conf['changing_param'],
+                                                    conf['estimator'].__name__))
+
+
for conf in configurations:
    prediction_performances, prediction_times, complexities = \
        benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times,
                   complexities)
+     plt.show()
+
+
+ ##############################################################################
+ # Conclusion
+ # ----------
+ #
+ # As a conclusion, we can deduce the following insights:
+ #
+ # * a model which is more complex (or expressive) will require a longer
+ #   training time;
+ # * a more complex model is not guaranteed to reduce the prediction error.
+ #
+ # These aspects are related to model generalization and avoiding model
+ # under-fitting or over-fitting.