Commit 17a72b8

Pushing the docs to dev/ for branch: main, commit 6a6217f15f654284d227c27c9179bb02a793e811
1 parent d0dbbf1 commit 17a72b8

1,256 files changed (+6,458 / -4,166 lines)

dev/_downloads/2f3ef774a6d7e52e1e6b7ccbb75d25f0/plot_gradient_boosting_quantile.py

Lines changed: 300 additions & 47 deletions
@@ -3,77 +3,330 @@
 Prediction Intervals for Gradient Boosting Regression
 =====================================================

-This example shows how quantile regression can be used
-to create prediction intervals.
+This example shows how quantile regression can be used to create prediction
+intervals.
 """
-
+# %%
+# Generate some data for a synthetic regression problem by applying the
+# function f to uniformly sampled random inputs.
 import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn.ensemble import GradientBoostingRegressor
-
-np.random.seed(1)
+from sklearn.model_selection import train_test_split


 def f(x):
     """The function to predict."""
     return x * np.sin(x)

-#----------------------------------------------------------------------
-# First the noiseless case
-X = np.atleast_2d(np.random.uniform(0, 10.0, size=100)).T
-X = X.astype(np.float32)

-# Observations
-y = f(X).ravel()
+rng = np.random.RandomState(42)
+X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T
+expected_y = f(X).ravel()
+
+# %%
+# To make the problem interesting, we generate observations of the target y as
+# the sum of a deterministic term computed by the function f and a random noise
+# term that follows a centered `log-normal
+# <https://en.wikipedia.org/wiki/Log-normal_distribution>`_. To make this even
+# more interesting we consider the case where the amplitude of the noise
+# depends on the input variable x (heteroscedastic noise).
+#
+# The lognormal distribution is non-symmetric and long tailed: observing large
+# outliers is likely but it is impossible to observe small outliers.
+sigma = 0.5 + X.ravel() / 10
+noise = rng.lognormal(sigma=sigma) - np.exp(sigma ** 2 / 2)
+y = expected_y + noise
+
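An aside, not part of the committed example: a quick numerical check of the centering term used above. For a log-normal variable with mu=0 and shape sigma, the mean is exp(sigma**2 / 2), so subtracting that constant approximately centers the noise while leaving its median below zero (the names rng_check and sigma_check are illustrative only):

# Aside: sanity check of the log-normal centering term (not from the example).
import numpy as np

rng_check = np.random.RandomState(0)
sigma_check = 0.8  # an arbitrary value inside the 0.5..1.5 range used above
sample = rng_check.lognormal(sigma=sigma_check, size=1_000_000)

print(sample.mean())                      # close to the theoretical mean below
print(np.exp(sigma_check ** 2 / 2))       # theoretical mean of the log-normal
print(np.median(sample) - sample.mean())  # negative: the centered noise is skewed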
+# %%
+# Split into train, test datasets:
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+# %%
+# Fitting non-linear quantile and least squares regressors
+# --------------------------------------------------------
+#
+# Fit gradient boosting models trained with the quantile loss and
+# alpha=0.05, 0.5, 0.95.
+#
+# The models obtained for alpha=0.05 and alpha=0.95 produce a 90% confidence
+# interval (95% - 5% = 90%).
+#
+# The model trained with alpha=0.5 produces a regression of the median: on
+# average, there should be the same number of target observations above and
+# below the predicted values.
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.metrics import mean_pinball_loss, mean_squared_error
+

-dy = 1.5 + 1.0 * np.random.random(y.shape)
-noise = np.random.normal(0, dy)
-y += noise
-y = y.astype(np.float32)
+all_models = {}
+common_params = dict(
+    learning_rate=0.05,
+    n_estimators=250,
+    max_depth=2,
+    min_samples_leaf=9,
+    min_samples_split=9,
+)
+for alpha in [0.05, 0.5, 0.95]:
+    gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha,
+                                    **common_params)
+    all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)
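An aside, not part of the committed example: the quantile loss these three models minimize is also known as the pinball loss, and its asymmetric weighting is what pushes each model towards its target percentile. A minimal NumPy sketch, checked against sklearn.metrics.mean_pinball_loss, makes this explicit (the helper name pinball_loss is ours, for illustration):

# Aside: a minimal pinball (quantile) loss, checked against scikit-learn.
import numpy as np
from sklearn.metrics import mean_pinball_loss


def pinball_loss(y_true, y_pred, alpha):
    # alpha * residual for under-predictions, (1 - alpha) * |residual| otherwise
    residual = y_true - y_pred
    return np.mean(np.maximum(alpha * residual, (alpha - 1) * residual))


y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.5, 1.5, 1.5, 1.5])
for a in [0.05, 0.5, 0.95]:
    assert np.isclose(pinball_loss(y_true, y_pred, a),
                      mean_pinball_loss(y_true, y_pred, alpha=a))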

-# Mesh the input space for evaluations of the real function, the prediction and
-# its MSE
+# %%
+# For the sake of comparison, also fit a baseline model trained with the usual
+# least squares loss (ls), also known as the mean squared error (MSE).
+gbr_ls = GradientBoostingRegressor(loss='ls', **common_params)
+all_models["ls"] = gbr_ls.fit(X_train, y_train)
+
+# %%
+# Create an evenly spaced evaluation set of input values spanning the [0, 10]
+# range.
 xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
-xx = xx.astype(np.float32)

-alpha = 0.95
+# %%
+# Plot the true conditional mean function f, the prediction of the conditional
+# mean (least squares loss), the conditional median and the conditional 90%
+# interval (from 5th to 95th conditional percentiles).
+import matplotlib.pyplot as plt
+
+
+y_pred = all_models['ls'].predict(xx)
+y_lower = all_models['q 0.05'].predict(xx)
+y_upper = all_models['q 0.95'].predict(xx)
+y_med = all_models['q 0.50'].predict(xx)
+
+fig = plt.figure(figsize=(10, 10))
+plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
+plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
+plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange")
+plt.plot(xx, y_pred, 'r-', label='Predicted mean')
+plt.plot(xx, y_upper, 'k-')
+plt.plot(xx, y_lower, 'k-')
+plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
+                 label='Predicted 90% interval')
+plt.xlabel('$x$')
+plt.ylabel('$f(x)$')
+plt.ylim(-10, 25)
+plt.legend(loc='upper left')
+plt.show()
+
+# %%
+# Comparing the predicted median with the predicted mean, we note that the
+# median is on average below the mean as the noise is skewed towards high
+# values (large outliers). The median estimate also seems to be smoother
+# because of its natural robustness to outliers.
+#
+# Also observe that the inductive bias of gradient boosting trees is
+# unfortunately preventing our 0.05 quantile from fully capturing the
+# sinusoidal shape of the signal, in particular around x=8. Tuning
+# hyper-parameters can reduce this effect as shown in the last part of this
+# notebook.
+#
+# Analysis of the error metrics
+# -----------------------------
+#
+# Measure the models with :func:`mean_squared_error` and
+# :func:`mean_pinball_loss` metrics on the training dataset.
+import pandas as pd
+
+
+def highlight_min(x):
+    x_min = x.min()
+    return ['font-weight: bold' if v == x_min else ''
+            for v in x]
+
+
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_train)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_train, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_train, y_pred)
+    results.append(metrics)
+
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+
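Before reading the table, an aside that is not part of the committed example: on a skewed sample, the constant that minimizes the mean pinball loss at alpha=0.5 is approximately the sample median, while the constant that minimizes the MSE is approximately the sample mean, which is why a model trained on one loss should not be expected to be optimal for the other. A small grid-search sketch over constant predictors (variable names are illustrative):

# Aside: constant predictors minimizing each metric on a skewed sample.
import numpy as np
from sklearn.metrics import mean_pinball_loss, mean_squared_error

rng_demo = np.random.RandomState(0)
skewed = rng_demo.lognormal(sigma=1.0, size=10_000)
candidates = np.linspace(skewed.min(), skewed.max(), 500)

pbl_50 = [mean_pinball_loss(skewed, np.full_like(skewed, c), alpha=0.5)
          for c in candidates]
mse = [mean_squared_error(skewed, np.full_like(skewed, c)) for c in candidates]

print(candidates[np.argmin(pbl_50)], np.median(skewed))  # both near the median
print(candidates[np.argmin(mse)], np.mean(skewed))       # both near the mean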
+# %%
+# One column shows all models evaluated by the same metric. The minimum number
+# on a column should be obtained when the model is trained and measured with
+# the same metric. This should always be the case on the training set if the
+# training converged.
+#
+# Note that because the target distribution is asymmetric, the expected
+# conditional mean and conditional median are significantly different and
+# therefore one could not use the least squares model to get a good
+# estimation of the conditional median nor the converse.
+#
+# If the target distribution were symmetric and had no outliers (e.g. with a
+# Gaussian noise), then the median estimator and the least squares estimator
+# would have yielded similar predictions.
+#
+# We then do the same on the test set.
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_test)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_test, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_test, y_pred)
+    results.append(metrics)

-clf = GradientBoostingRegressor(loss='quantile', alpha=alpha,
-                                n_estimators=250, max_depth=3,
-                                learning_rate=.1, min_samples_leaf=9,
-                                min_samples_split=9)
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)

-clf.fit(X, y)

-# Make the prediction on the meshed x-axis
-y_upper = clf.predict(xx)
+# %%
+# Errors are higher, meaning the models slightly overfitted the data. It still
+# shows that the best test metric is obtained when the model is trained by
+# minimizing this same metric.
+#
+# Note that the conditional median estimator is competitive with the least
+# squares estimator in terms of MSE on the test set: this can be explained by
+# the fact that the least squares estimator is very sensitive to large
+# outliers, which can cause significant overfitting. This can be seen on the
+# right hand side of the previous plot. The conditional median estimator is
+# biased (underestimation for this asymmetric noise) but is also naturally
+# robust to outliers and overfits less.
+#
+# Calibration of the confidence interval
+# --------------------------------------
+#
+# We can also evaluate the ability of the two extreme quantile estimators to
+# produce a well-calibrated conditional 90% confidence interval.
+#
+# To do this we can compute the fraction of observations that fall between the
+# predictions:
+def coverage_fraction(y, y_low, y_high):
+    return np.mean(np.logical_and(y >= y_low, y <= y_high))

-clf.set_params(alpha=1.0 - alpha)
-clf.fit(X, y)

-# Make the prediction on the meshed x-axis
-y_lower = clf.predict(xx)
+coverage_fraction(y_train,
+                  all_models['q 0.05'].predict(X_train),
+                  all_models['q 0.95'].predict(X_train))

-clf.set_params(loss='ls')
-clf.fit(X, y)
+# %%
+# On the training set the calibration is very close to the expected coverage
+# value for a 90% confidence interval.
+coverage_fraction(y_test,
+                  all_models['q 0.05'].predict(X_test),
+                  all_models['q 0.95'].predict(X_test))

-# Make the prediction on the meshed x-axis
-y_pred = clf.predict(xx)

-# Plot the function, the prediction and the 95% confidence interval based on
-# the MSE
-fig = plt.figure()
-plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$')
-plt.plot(X, y, 'b.', markersize=10, label=u'Observations')
-plt.plot(xx, y_pred, 'r-', label=u'Prediction')
+# %%
+# On the test set, the estimated confidence interval is slightly too narrow.
+# Note, however, that we would need to wrap those metrics in a cross-validation
+# loop to assess their variability under data resampling.
+#
+# Tuning the hyper-parameters of the quantile regressors
+# ------------------------------------------------------
+#
+# In the plot above, we observed that the 5th percentile regressor seems to
+# underfit and could not adapt to the sinusoidal shape of the signal.
+#
+# The hyper-parameters of the model were approximately hand-tuned for the
+# median regressor and there is no reason that the same hyper-parameters are
+# suitable for the 5th percentile regressor.
+#
+# To confirm this hypothesis, we tune the hyper-parameters of a new regressor
+# of the 5th percentile by selecting the best model parameters by
+# cross-validation on the pinball loss with alpha=0.05:
+
+# %%
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.metrics import make_scorer
+from pprint import pprint
+
+
+param_grid = dict(
+    learning_rate=[0.01, 0.05, 0.1],
+    n_estimators=[100, 150, 200, 250, 300],
+    max_depth=[2, 5, 10, 15, 20],
+    min_samples_leaf=[1, 5, 10, 20, 30, 50],
+    min_samples_split=[2, 5, 10, 20, 30, 50],
+)
+alpha = 0.05
+neg_mean_pinball_loss_05p_scorer = make_scorer(
+    mean_pinball_loss,
+    alpha=alpha,
+    greater_is_better=False,  # maximize the negative loss
+)
+gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0)
+search_05p = RandomizedSearchCV(
+    gbr,
+    param_grid,
+    n_iter=10,  # increase this if computational budget allows
+    scoring=neg_mean_pinball_loss_05p_scorer,
+    n_jobs=2,
+    random_state=0,
+).fit(X_train, y_train)
+pprint(search_05p.best_params_)
+
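A note on the sign convention, as an aside that is not part of the committed example: with greater_is_better=False, the scorer built by make_scorer returns the negated metric, so RandomizedSearchCV, which always maximizes the score, ends up selecting the candidate with the smallest pinball loss. A quick check reusing the objects defined above:

# Aside: the scorer is the negated pinball loss (reuses objects defined above).
best_gbr_05p = search_05p.best_estimator_
score = neg_mean_pinball_loss_05p_scorer(best_gbr_05p, X_test, y_test)
loss = mean_pinball_loss(y_test, best_gbr_05p.predict(X_test), alpha=0.05)
print(score, -loss)  # identical values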
+# %%
+# We observe that the search procedure identifies that deeper trees are needed
+# to get a good fit for the 5th percentile regressor. Deeper trees are more
+# expressive and less likely to underfit.
+#
+# Let's now tune the hyper-parameters for the 95th percentile regressor. We
+# need to redefine the `scoring` metric used to select the best model, along
+# with adjusting the alpha parameter of the inner gradient boosting estimator
+# itself:
+from sklearn.base import clone
+
+alpha = 0.95
+neg_mean_pinball_loss_95p_scorer = make_scorer(
+    mean_pinball_loss,
+    alpha=alpha,
+    greater_is_better=False,  # maximize the negative loss
+)
+search_95p = clone(search_05p).set_params(
+    estimator__alpha=alpha,
+    scoring=neg_mean_pinball_loss_95p_scorer,
+)
+search_95p.fit(X_train, y_train)
+pprint(search_95p.best_params_)
+
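As an aside (not part of the committed example), the estimator__alpha name above follows scikit-learn's <component>__<parameter> convention for nested parameters: estimator is the constructor argument of RandomizedSearchCV that holds the gradient boosting model, so the double underscore routes alpha to it. The available names can be inspected with get_params():

# Aside: nested parameter names follow the "<component>__<parameter>" scheme
# (reuses the search object defined above).
print("estimator__alpha" in search_95p.get_params())  # True
print(search_95p.get_params()["estimator__alpha"])    # 0.95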
+# %%
+# This time, shallower trees are selected and lead to a more
+# piecewise-constant and therefore more robust estimation of the 95th
+# percentile. This is beneficial as it avoids overfitting the large outliers
+# of the log-normal additive noise.
+#
+# We can confirm this intuition by displaying the predicted 90% confidence
+# interval delimited by the predictions of those two tuned quantile
+# regressors: the prediction of the upper 95th percentile has a much coarser
+# shape than the prediction of the lower 5th percentile:
+y_lower = search_05p.predict(xx)
+y_upper = search_95p.predict(xx)
+
+fig = plt.figure(figsize=(10, 10))
+plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
+plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
 plt.plot(xx, y_upper, 'k-')
 plt.plot(xx, y_lower, 'k-')
-plt.fill(np.concatenate([xx, xx[::-1]]),
-         np.concatenate([y_upper, y_lower[::-1]]),
-         alpha=.5, fc='b', ec='None', label='95% prediction interval')
+plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
+                 label='Predicted 90% interval')
 plt.xlabel('$x$')
 plt.ylabel('$f(x)$')
-plt.ylim(-10, 20)
+plt.ylim(-10, 25)
 plt.legend(loc='upper left')
+plt.title("Prediction with tuned hyper-parameters")
 plt.show()
+
+# %%
+# The plot looks qualitatively better than for the untuned models, especially
+# for the shape of the lower quantile.
+#
+# We now quantitatively evaluate the joint calibration of the pair of
+# estimators:
+coverage_fraction(y_train,
+                  search_05p.predict(X_train),
+                  search_95p.predict(X_train))
+# %%
+coverage_fraction(y_test,
+                  search_05p.predict(X_test),
+                  search_95p.predict(X_test))
+# %%
+# The calibration of the tuned pair is sadly not better on the test set: the
+# width of the estimated confidence interval is still too narrow.
+#
+# Again, we would need to wrap this study in a cross-validation loop to
+# better assess the variability of those estimates.
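The committed example stops here. As a hedged sketch of what such a cross-validation loop could look like (not part of the file above; it simply reuses X, y, common_params, coverage_fraction and the imports defined in the script), one could refit the pair of untuned quantile models on several folds and look at the spread of the held-out coverage:

# Aside: a possible cross-validation loop for the coverage estimate
# (reuses X, y, common_params and coverage_fraction from the script above).
from sklearn.model_selection import KFold

coverages = []
cv = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in cv.split(X):
    gbr_05 = GradientBoostingRegressor(loss='quantile', alpha=0.05,
                                       **common_params)
    gbr_95 = GradientBoostingRegressor(loss='quantile', alpha=0.95,
                                       **common_params)
    gbr_05.fit(X[train_idx], y[train_idx])
    gbr_95.fit(X[train_idx], y[train_idx])
    coverages.append(coverage_fraction(y[test_idx],
                                       gbr_05.predict(X[test_idx]),
                                       gbr_95.predict(X[test_idx])))

print(np.mean(coverages), np.std(coverages))  # mean and spread of the coverage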