Commit 1449d1c

Pushing the docs to dev/ for branch: master, commit 1986c89a12203a2df02f65e0764acea2bcd027cc
1 parent cab04c0 commit 1449d1c

1,214 files changed: +4,316 / −3,839 lines


dev/_downloads/09342387020f5abd8190ad409affdd7b/plot_model_complexity_influence.py

Lines changed: 148 additions & 49 deletions
@@ -6,25 +6,42 @@
 Demonstrate how model complexity influences both prediction accuracy and
 computational performance.
 
-The dataset is the Boston Housing dataset (resp. 20 Newsgroups) for
-regression (resp. classification).
+We will be using two datasets:
+    - :ref:`diabetes_dataset` for regression.
+      This dataset consists of 10 measurements taken from diabetes patients.
+      The task is to predict disease progression;
+    - :ref:`20newsgroups_dataset` for classification. This dataset consists of
+      newsgroup posts. The task is to predict on which topic (out of 20 topics)
+      the post is written about.
+
+We will model the complexity influence on three different estimators:
+    - :class:`~sklearn.linear_model.SGDClassifier` (for classification data)
+      which implements stochastic gradient descent learning;
+
+    - :class:`~sklearn.svm.NuSVR` (for regression data) which implements
+      Nu support vector regression;
+
+    - :class:`~sklearn.ensemble.GradientBoostingRegressor` (for regression
+      data) which builds an additive model in a forward stage-wise fashion.
+
+
+We make the model complexity vary through the choice of relevant model
+parameters in each of our selected models. Next, we will measure the influence
+on both computational performance (latency) and predictive power (MSE or
+Hamming Loss).
 
-For each class of models we make the model complexity vary through the choice
-of relevant model parameters and measure the influence on both computational
-performance (latency) and predictive power (MSE or Hamming Loss).
 """
 
 print(__doc__)
 
-# Author: Eustache Diemert <[email protected]>
+# Authors: Eustache Diemert <[email protected]>
+#          Maria Telenczuk <https://github.com/maikia>
+#          Guillaume Lemaitre <[email protected]>
 # License: BSD 3 clause
 
 import time
 import numpy as np
 import matplotlib.pyplot as plt
-from mpl_toolkits.axes_grid1.parasite_axes import host_subplot
-from mpl_toolkits.axisartist.axislines import Axes
-from scipy.sparse.csr import csr_matrix
 
 from sklearn import datasets
 from sklearn.utils import shuffle
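
As a rough illustration of the complexity knobs the new docstring refers to, the following sketch instantiates the three estimators with plausible complexity-controlling parameters. The parameter names and values here are illustrative assumptions; the example's actual choices are set in the `configurations` list further down in the diff.

    # Illustrative sketch only: plausible complexity-controlling parameters for
    # the three estimators discussed in the docstring above.
    from sklearn.linear_model import SGDClassifier
    from sklearn.svm import NuSVR
    from sklearn.ensemble import GradientBoostingRegressor

    # For an elastic-net SGDClassifier, a larger l1_ratio drives more
    # coefficients to zero, i.e. lowers the model complexity.
    sgd = SGDClassifier(penalty='elasticnet', alpha=0.001, l1_ratio=0.25)

    # For NuSVR, nu bounds the fraction of training samples kept as support
    # vectors, so it acts directly as a complexity knob.
    svr = NuSVR(nu=0.5, C=1e3)

    # For GradientBoostingRegressor, each extra boosting stage adds one tree
    # to the additive model.
    gbr = GradientBoostingRegressor(n_estimators=100)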
@@ -34,48 +51,69 @@
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import hamming_loss
 
-# #############################################################################
-# Routines
-
 
 # Initialize random generator
 np.random.seed(0)
 
+##############################################################################
+# Load the data
+# -------------
+#
+# First we load both datasets.
+#
+# .. note:: We are using
+#    :func:`~sklearn.datasets.fetch_20newsgroups_vectorized` to download 20
+#    newsgroups dataset. It returns ready-to-use features.
+#
+# .. note:: ``X`` of the 20 newsgroups dataset is a sparse matrix while ``X``
+#    of diabetes dataset is a numpy array.
+#
 
-def generate_data(case, sparse=False):
+
+def generate_data(case):
     """Generate regression/classification data."""
     if case == 'regression':
-        X, y = datasets.load_boston(return_X_y=True)
+        X, y = datasets.load_diabetes(return_X_y=True)
     elif case == 'classification':
         X, y = datasets.fetch_20newsgroups_vectorized(subset='all',
                                                       return_X_y=True)
     X, y = shuffle(X, y)
     offset = int(X.shape[0] * 0.8)
     X_train, y_train = X[:offset], y[:offset]
     X_test, y_test = X[offset:], y[offset:]
-    if sparse:
-        X_train = csr_matrix(X_train)
-        X_test = csr_matrix(X_test)
-    else:
-        X_train = np.array(X_train)
-        X_test = np.array(X_test)
-    y_test = np.array(y_test)
-    y_train = np.array(y_train)
+
     data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
             'y_test': y_test}
     return data
 
 
+regression_data = generate_data('regression')
+classification_data = generate_data('classification')
+
+
+##############################################################################
+# Benchmark influence
+# -------------------
+# Next, we can calculate the influence of the parameters on the given
+# estimator. In each round, we will set the estimator with the new value of
+# ``changing_param`` and we will be collecting the prediction times, prediction
+# performance and complexities to see how those changes affect the estimator.
+# We will calculate the complexity using ``complexity_computer`` passed as a
+# parameter.
+#
+
+
 def benchmark_influence(conf):
     """
-    Benchmark influence of :changing_param: on both MSE and latency.
+    Benchmark influence of `changing_param` on both MSE and latency.
     """
     prediction_times = []
     prediction_powers = []
     complexities = []
     for param_value in conf['changing_param_values']:
         conf['tuned_params'][conf['changing_param']] = param_value
         estimator = conf['estimator'](**conf['tuned_params'])
+
         print("Benchmarking %s" % estimator)
         estimator.fit(conf['data']['X_train'], conf['data']['y_train'])
         conf['postfit_hook'](estimator)
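
To make the shape of the `conf` argument concrete, a minimal configuration along the lines expected by `benchmark_influence` could look like the sketch below. Only keys visible in this hunk are shown and all values are assumptions; the example's real configurations, defined later in the script, carry additional entries (for instance the prediction-performance computer and labels).

    # Minimal sketch of a configuration dictionary for benchmark_influence.
    # Keys mirror those read in the hunk above; values are assumptions only.
    from sklearn.ensemble import GradientBoostingRegressor

    example_conf = {
        'estimator': GradientBoostingRegressor,
        'tuned_params': {},                       # fixed hyper-parameters
        'changing_param': 'n_estimators',         # the complexity knob to sweep
        'changing_param_values': [10, 50, 100, 200],
        'complexity_computer': lambda est: est.n_estimators,
        'postfit_hook': lambda est: None,         # no-op after fitting
        'data': regression_data,                  # produced by generate_data()
    }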
@@ -95,37 +133,25 @@ def benchmark_influence(conf):
     return prediction_powers, prediction_times, complexities
 
 
-def plot_influence(conf, mse_values, prediction_times, complexities):
-    """
-    Plot influence of model complexity on both accuracy and latency.
-    """
-    plt.figure(figsize=(12, 6))
-    host = host_subplot(111, axes_class=Axes)
-    plt.subplots_adjust(right=0.75)
-    par1 = host.twinx()
-    host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
-    y1_label = conf['prediction_performance_label']
-    y2_label = "Time (s)"
-    host.set_ylabel(y1_label)
-    par1.set_ylabel(y2_label)
-    p1, = host.plot(complexities, mse_values, 'b-', label="prediction error")
-    p2, = par1.plot(complexities, prediction_times, 'r-',
-                    label="latency")
-    host.legend(loc='upper right')
-    host.axis["left"].label.set_color(p1.get_color())
-    par1.axis["right"].label.set_color(p2.get_color())
-    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
-    plt.show()
-
+##############################################################################
+# Choose parameters
+# -----------------
+#
+# We choose the parameters for each of our estimators by making
+# a dictionary with all the necessary values.
+# ``changing_param`` is the name of the parameter which will vary in each
+# estimator.
+# Complexity will be defined by the ``complexity_label`` and calculated using
+# ``complexity_computer``.
+# Also note that depending on the estimator type we are passing
+# different data.
+#
 
 def _count_nonzero_coefficients(estimator):
     a = estimator.coef_.toarray()
     return np.count_nonzero(a)
 
-# #############################################################################
-# Main code
-regression_data = generate_data('regression')
-classification_data = generate_data('classification', sparse=True)
+
 configurations = [
     {'estimator': SGDClassifier,
      'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
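
In this hunk, `_count_nonzero_coefficients` serves as the complexity computer for the linear classifier. For the two regressors, comparable complexity measures could plausibly count support vectors or boosting stages, as in the sketch below; whether these match the lambdas in the truncated part of `configurations` is not visible here, so treat them as assumptions.

    # Hedged sketch: possible complexity computers for the other estimators.
    def _count_support_vectors(estimator):
        # NuSVR exposes its support vectors after fitting.
        return len(estimator.support_vectors_)


    def _count_boosting_stages(estimator):
        # Complexity of a gradient-boosted ensemble grows with its stages.
        return estimator.n_estimators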
@@ -162,8 +188,81 @@ def _count_nonzero_coefficients(estimator):
      'prediction_performance_label': 'MSE',
      'n_samples': 30},
 ]
+
+
+##############################################################################
+# Run the code and plot the results
+# ---------------------------------
+#
+# We defined all the functions required to run our benchmark. Now, we will loop
+# over the different configurations that we defined previously. Subsequently,
+# we can analyze the plots obtained from the benchmark:
+# Relaxing the `L1` penalty in the SGD classifier reduces the prediction error
+# but leads to an increase in the training time.
+# We can draw a similar analysis regarding the training time which increases
+# with the number of support vectors with a Nu-SVR. However, we observed that
+# there is an optimal number of support vectors which reduces the prediction
+# error. Indeed, too few support vectors lead to an under-fitted model while
+# too many support vectors lead to an over-fitted model.
+# The exact same conclusion can be drawn for the gradient-boosting model. The
+# only difference with the Nu-SVR is that having too many trees in the
+# ensemble is not as detrimental.
+#
+
+def plot_influence(conf, mse_values, prediction_times, complexities):
+    """
+    Plot influence of model complexity on both accuracy and latency.
+    """
+
+    fig = plt.figure()
+    fig.subplots_adjust(right=0.75)
+
+    # first axes (prediction error)
+    ax1 = fig.add_subplot(111)
+    line1 = ax1.plot(complexities, mse_values, c='tab:blue', ls='-')[0]
+    ax1.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
+    y1_label = conf['prediction_performance_label']
+    ax1.set_ylabel(y1_label)
+
+    ax1.spines['left'].set_color(line1.get_color())
+    ax1.yaxis.label.set_color(line1.get_color())
+    ax1.tick_params(axis='y', colors=line1.get_color())
+
+    # second axes (latency)
+    ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
+    line2 = ax2.plot(complexities, prediction_times, c='tab:orange', ls='-')[0]
+    ax2.yaxis.tick_right()
+    ax2.yaxis.set_label_position("right")
+    y2_label = "Time (s)"
+    ax2.set_ylabel(y2_label)
+    ax1.spines['right'].set_color(line2.get_color())
+    ax2.yaxis.label.set_color(line2.get_color())
+    ax2.tick_params(axis='y', colors=line2.get_color())
+
+    plt.legend((line1, line2), ("prediction error", "latency"),
+               loc='upper right')
+
+    plt.title("Influence of varying '%s' on %s" % (conf['changing_param'],
+                                                   conf['estimator'].__name__))
+
+
 for conf in configurations:
     prediction_performances, prediction_times, complexities = \
         benchmark_influence(conf)
     plot_influence(conf, prediction_performances, prediction_times,
                    complexities)
+    plt.show()
+
+
+##############################################################################
+# Conclusion
+# ----------
+#
+# As a conclusion, we can deduce the following insights:
+#
+# * a model which is more complex (or expressive) will require a larger
+#   training time;
+# * a more complex model is not guaranteed to reduce the prediction error.
+#
+# These aspects are related to model generalization and avoiding model
+# under-fitting or over-fitting.
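
The timing logic inside `benchmark_influence` falls outside the hunks shown above. As an assumption, the prediction latency it reports could be measured roughly as below, averaging wall-clock time over repeated `predict` calls on the held-out data; the example's actual measurement (for instance per-sample timing over `n_samples`) may differ in detail.

    # Minimal sketch, assuming latency = mean wall-clock time of predict();
    # this is an editorial assumption, not the commit's own timing code.
    import time
    import numpy as np

    def measure_latency(estimator, X_test, n_repeats=30):
        durations = []
        for _ in range(n_repeats):
            start = time.time()
            estimator.predict(X_test)
            durations.append(time.time() - start)
        return np.mean(durations)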
