 Prediction Intervals for Gradient Boosting Regression
 =====================================================
 
-This example shows how quantile regression can be used
-to create prediction intervals.
+This example shows how quantile regression can be used to create prediction
+intervals.
 """
-
+# %%
+# Generate some data for a synthetic regression problem by applying the
+# function f to uniformly sampled random inputs.
 import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn.ensemble import GradientBoostingRegressor
-
-np.random.seed(1)
+from sklearn.model_selection import train_test_split
 
 
 def f(x):
     """The function to predict."""
     return x * np.sin(x)
 
-#----------------------------------------------------------------------
-# First the noiseless case
-X = np.atleast_2d(np.random.uniform(0, 10.0, size=100)).T
-X = X.astype(np.float32)
 
-# Observations
-y = f(X).ravel()
+rng = np.random.RandomState(42)
+X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T
+expected_y = f(X).ravel()
+
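As an illustrative aside (a sketch using the variables above, not part of the commit itself): `np.atleast_2d(...).T` turns the 1,000 sampled values into a single-feature column matrix, the 2D input layout scikit-learn estimators expect::

    # Shapes of the generated design matrix and noise-free target.
    print(X.shape)           # (1000, 1)
    print(expected_y.shape)  # (1000,)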
+# %%
+# To make the problem interesting, we generate observations of the target y as
+# the sum of a deterministic term computed by the function f and a random noise
+# term that follows a centered `log-normal
+# <https://en.wikipedia.org/wiki/Log-normal_distribution>`_ distribution. To
+# make this even more interesting, we consider the case where the amplitude of
+# the noise depends on the input variable x (heteroscedastic noise).
+#
+# The log-normal distribution is non-symmetric and long-tailed: observing large
+# outliers is likely but it is impossible to observe small outliers.
+sigma = 0.5 + X.ravel() / 10
+noise = rng.lognormal(sigma=sigma) - np.exp(sigma ** 2 / 2)
+y = expected_y + noise
+
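A quick sanity check of this construction, sketched here with the variables defined above (not from the commit): the subtracted term `np.exp(sigma ** 2 / 2)` is the mean of a log-normal variable with zero location and scale `sigma`, so the shifted noise should be approximately centered while its spread still grows with x::

    # The centering term is the log-normal mean, so the average noise is ~0,
    # while the noise amplitude increases with x (heteroscedasticity).
    print(f"noise mean: {noise.mean():.3f}")
    print(f"noise std for x < 5:  {noise[X.ravel() < 5].std():.3f}")
    print(f"noise std for x >= 5: {noise[X.ravel() >= 5].std():.3f}")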
+# %%
+# Split into train and test datasets:
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+# %%
+# Fitting non-linear quantile and least squares regressors
+# --------------------------------------------------------
+#
+# Fit gradient boosting models trained with the quantile loss and
+# alpha=0.05, 0.5, 0.95.
+#
+# The models obtained for alpha=0.05 and alpha=0.95 produce a 90% confidence
+# interval (95% - 5% = 90%).
+#
+# The model trained with alpha=0.5 produces a regression of the median: on
+# average, there should be the same number of target observations above and
+# below the predicted values.
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.metrics import mean_pinball_loss, mean_squared_error
+
 
-dy = 1.5 + 1.0 * np.random.random(y.shape)
-noise = np.random.normal(0, dy)
-y += noise
-y = y.astype(np.float32)
+all_models = {}
+common_params = dict(
+    learning_rate=0.05,
+    n_estimators=250,
+    max_depth=2,
+    min_samples_leaf=9,
+    min_samples_split=9,
+)
+for alpha in [0.05, 0.5, 0.95]:
+    gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha,
+                                    **common_params)
+    all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)
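To illustrate the median property described in the comment above, a quick check (a sketch reusing `all_models`, `X_train` and `y_train`, not from the commit) can count the fraction of training targets that fall below the predictions of the alpha=0.5 model::

    # The quantile loss with alpha=0.5 targets the conditional median, so about
    # 50% of the training observations should lie below its predictions.
    y_med_train = all_models["q 0.50"].predict(X_train)
    print("fraction below the predicted median: %.3f"
          % np.mean(y_train <= y_med_train))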
 
-# Mesh the input space for evaluations of the real function, the prediction and
-# its MSE
+# %%
+# For the sake of comparison, also fit a baseline model trained with the usual
+# least squares loss (ls), also known as the mean squared error (MSE).
+gbr_ls = GradientBoostingRegressor(loss='ls', **common_params)
+all_models["ls"] = gbr_ls.fit(X_train, y_train)
+
+# %%
+# Create an evenly spaced evaluation set of input values spanning the [0, 10]
+# range.
 xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
-xx = xx.astype(np.float32)
 
-alpha = 0.95
+# %%
+# Plot the true conditional mean function f, the prediction of the conditional
+# mean (least squares loss), the conditional median and the conditional 90%
+# interval (from 5th to 95th conditional percentiles).
+import matplotlib.pyplot as plt
+
+
+y_pred = all_models['ls'].predict(xx)
+y_lower = all_models['q 0.05'].predict(xx)
+y_upper = all_models['q 0.95'].predict(xx)
+y_med = all_models['q 0.50'].predict(xx)
+
+fig = plt.figure(figsize=(10, 10))
+plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
+plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
+plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange")
+plt.plot(xx, y_pred, 'r-', label='Predicted mean')
+plt.plot(xx, y_upper, 'k-')
+plt.plot(xx, y_lower, 'k-')
+plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
+                 label='Predicted 90% interval')
+plt.xlabel('$x$')
+plt.ylabel('$f(x)$')
+plt.ylim(-10, 25)
+plt.legend(loc='upper left')
+plt.show()
+
+# %%
+# Comparing the predicted median with the predicted mean, we note that the
+# median is on average below the mean as the noise is skewed towards high
+# values (large outliers). The median estimate also seems to be smoother
+# because of its natural robustness to outliers.
+#
+# Also observe that the inductive bias of gradient boosting trees is
+# unfortunately preventing our 0.05 quantile from fully capturing the
+# sinusoidal shape of the signal, in particular around x=8. Tuning
+# hyper-parameters can reduce this effect as shown in the last part of this
+# notebook.
+#
+# Analysis of the error metrics
+# -----------------------------
+#
+# Measure the models with :func:`mean_squared_error` and
+# :func:`mean_pinball_loss` metrics on the training dataset.
+import pandas as pd
+
+
+def highlight_min(x):
+    x_min = x.min()
+    return ['font-weight: bold' if v == x_min else ''
+            for v in x]
+
+
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_train)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_train, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_train, y_pred)
+    results.append(metrics)
+
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+
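The pinball loss itself is only used through `mean_pinball_loss` above; as a reminder of what it computes, here is a minimal sketch of the formula (not from the commit), checked against the scikit-learn implementation on the median model fitted earlier::

    # Residuals above the prediction are weighted by alpha, residuals below it
    # by (1 - alpha); the pinball loss is the average of these weighted errors.
    def pinball_loss(y_true, y_pred, alpha):
        diff = y_true - y_pred
        return np.mean(np.maximum(alpha * diff, (alpha - 1) * diff))

    y_pred_med = all_models["q 0.50"].predict(X_train)
    print(pinball_loss(y_train, y_pred_med, alpha=0.5))       # hand-rolled
    print(mean_pinball_loss(y_train, y_pred_med, alpha=0.5))  # scikit-learn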
+# %%
+# Each column shows all models evaluated with the same metric. The minimum
+# value in a column should be obtained when the model is trained and measured
+# with that same metric. This should always be the case on the training set if
+# the training has converged.
+#
+# Note that because the target distribution is asymmetric, the expected
+# conditional mean and conditional median are significantly different and
+# therefore one cannot use the least squares model to get a good estimate of
+# the conditional median, nor the converse.
+#
+# If the target distribution were symmetric and had no outliers (e.g. with
+# Gaussian noise), then the median estimator and the least squares estimator
+# would have yielded similar predictions.
+#
+# We then do the same on the test set.
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_test)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_test, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_test, y_pred)
+    results.append(metrics)
 
-clf = GradientBoostingRegressor(loss='quantile', alpha=alpha,
-                                n_estimators=250, max_depth=3,
-                                learning_rate=.1, min_samples_leaf=9,
-                                min_samples_split=9)
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
 
-clf.fit(X, y)
 
-# Make the prediction on the meshed x-axis
-y_upper = clf.predict(xx)
+# %%
+# Errors are higher on the test set, meaning the models slightly overfitted
+# the data. It still shows that the best test metric is obtained when the
+# model is trained by minimizing this same metric.
+#
+# Note that the conditional median estimator is competitive with the least
+# squares estimator in terms of MSE on the test set: this can be explained by
+# the fact that the least squares estimator is very sensitive to large
+# outliers, which can cause significant overfitting. This can be seen on the
+# right-hand side of the previous plot. The conditional median estimator is
+# biased (underestimation for this asymmetric noise) but is also naturally
+# robust to outliers and overfits less.
+#
+# Calibration of the confidence interval
+# --------------------------------------
+#
+# We can also evaluate the ability of the two extreme quantile estimators to
+# produce a well-calibrated conditional 90% confidence interval.
+#
+# To do this we can compute the fraction of observations that fall between the
+# predictions:
+def coverage_fraction(y, y_low, y_high):
+    return np.mean(np.logical_and(y >= y_low, y <= y_high))
 
-clf.set_params(alpha=1.0 - alpha)
-clf.fit(X, y)
 
-# Make the prediction on the meshed x-axis
-y_lower = clf.predict(xx)
+coverage_fraction(y_train,
+                  all_models['q 0.05'].predict(X_train),
+                  all_models['q 0.95'].predict(X_train))
 
-clf.set_params(loss='ls')
-clf.fit(X, y)
+# %%
+# On the training set the calibration is very close to the expected coverage
+# value for a 90% confidence interval.
+coverage_fraction(y_test,
+                  all_models['q 0.05'].predict(X_test),
+                  all_models['q 0.95'].predict(X_test))
 
-# Make the prediction on the meshed x-axis
-y_pred = clf.predict(xx)
 
-# Plot the function, the prediction and the 95% confidence interval based on
-# the MSE
-fig = plt.figure()
-plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$')
-plt.plot(X, y, 'b.', markersize=10, label=u'Observations')
-plt.plot(xx, y_pred, 'r-', label=u'Prediction')
+# %%
+# On the test set, the estimated confidence interval is slightly too narrow.
+# Note, however, that we would need to wrap those metrics in a cross-validation
+# loop to assess their variability under data resampling.
+#
+# Tuning the hyper-parameters of the quantile regressors
+# ------------------------------------------------------
+#
+# In the plot above, we observed that the 5th percentile regressor seems to
+# underfit and could not adapt to the sinusoidal shape of the signal.
+#
+# The hyper-parameters of the model were approximately hand-tuned for the
+# median regressor and there is no reason that the same hyper-parameters are
+# suitable for the 5th percentile regressor.
+#
+# To confirm this hypothesis, we tune the hyper-parameters of a new regressor
+# of the 5th percentile by selecting the best model parameters by
+# cross-validation on the pinball loss with alpha=0.05:
+
+# %%
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.metrics import make_scorer
+from pprint import pprint
+
+
+param_grid = dict(
+    learning_rate=[0.01, 0.05, 0.1],
+    n_estimators=[100, 150, 200, 250, 300],
+    max_depth=[2, 5, 10, 15, 20],
+    min_samples_leaf=[1, 5, 10, 20, 30, 50],
+    min_samples_split=[2, 5, 10, 20, 30, 50],
+)
+alpha = 0.05
+neg_mean_pinball_loss_05p_scorer = make_scorer(
+    mean_pinball_loss,
+    alpha=alpha,
+    greater_is_better=False,  # maximize the negative loss
+)
+gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0)
+search_05p = RandomizedSearchCV(
+    gbr,
+    param_grid,
+    n_iter=10,  # increase this if computational budget allows
+    scoring=neg_mean_pinball_loss_05p_scorer,
+    n_jobs=2,
+    random_state=0,
+).fit(X_train, y_train)
+pprint(search_05p.best_params_)
+
+# %%
+# We observe that the search procedure identifies that deeper trees are needed
+# to get a good fit for the 5th percentile regressor. Deeper trees are more
+# expressive and less likely to underfit.
+#
+# Let's now tune the hyper-parameters for the 95th percentile regressor. We
+# need to redefine the `scoring` metric used to select the best model, along
+# with adjusting the alpha parameter of the inner gradient boosting estimator
+# itself:
+from sklearn.base import clone
+
+alpha = 0.95
+neg_mean_pinball_loss_95p_scorer = make_scorer(
+    mean_pinball_loss,
+    alpha=alpha,
+    greater_is_better=False,  # maximize the negative loss
+)
+search_95p = clone(search_05p).set_params(
+    estimator__alpha=alpha,
+    scoring=neg_mean_pinball_loss_95p_scorer,
+)
+search_95p.fit(X_train, y_train)
+pprint(search_95p.best_params_)
+
+# %%
+# This time, shallower trees are selected and lead to a more piecewise-constant
+# and therefore more robust estimation of the 95th percentile. This is
+# beneficial as it avoids overfitting the large outliers of the log-normal
+# additive noise.
+#
+# We can confirm this intuition by displaying the predicted 90% confidence
+# interval delimited by the predictions of those two tuned quantile regressors:
+# the prediction of the upper 95th percentile has a much coarser shape than the
+# prediction of the lower 5th percentile:
+y_lower = search_05p.predict(xx)
+y_upper = search_95p.predict(xx)
+
+fig = plt.figure(figsize=(10, 10))
+plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
+plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
 plt.plot(xx, y_upper, 'k-')
 plt.plot(xx, y_lower, 'k-')
-plt.fill(np.concatenate([xx, xx[::-1]]),
-         np.concatenate([y_upper, y_lower[::-1]]),
-         alpha=.5, fc='b', ec='None', label='95% prediction interval')
+plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
+                 label='Predicted 90% interval')
 plt.xlabel('$x$')
 plt.ylabel('$f(x)$')
-plt.ylim(-10, 20)
+plt.ylim(-10, 25)
 plt.legend(loc='upper left')
+plt.title("Prediction with tuned hyper-parameters")
 plt.show()
+
+# %%
+# The plot looks qualitatively better than for the untuned models, especially
+# for the shape of the lower quantile.
+#
+# We now quantitatively evaluate the joint calibration of the pair of
+# estimators:
+coverage_fraction(y_train,
+                  search_05p.predict(X_train),
+                  search_95p.predict(X_train))
+# %%
+coverage_fraction(y_test,
+                  search_05p.predict(X_test),
+                  search_95p.predict(X_test))
+# %%
+# The calibration of the tuned pair is sadly not better on the test set: the
+# width of the estimated confidence interval is still too narrow.
+#
+# Again, we would need to wrap this study in a cross-validation loop to
+# better assess the variability of those estimates.
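One way such a cross-validation loop could look, sketched under the assumption that the two tuned parameter sets are simply refit on each fold (not part of the commit itself)::

    # Refit the tuned 5th/95th percentile configurations on several folds and
    # report the spread of the test coverage of the resulting 90% interval.
    from sklearn.model_selection import KFold

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    coverages = []
    for train_idx, test_idx in cv.split(X):
        gbr_05p = GradientBoostingRegressor(
            loss="quantile", alpha=0.05, random_state=0,
            **search_05p.best_params_).fit(X[train_idx], y[train_idx])
        gbr_95p = GradientBoostingRegressor(
            loss="quantile", alpha=0.95, random_state=0,
            **search_95p.best_params_).fit(X[train_idx], y[train_idx])
        coverages.append(coverage_fraction(
            y[test_idx],
            gbr_05p.predict(X[test_idx]),
            gbr_95p.predict(X[test_idx])))
    print("coverage: %.3f +/- %.3f" % (np.mean(coverages), np.std(coverages)))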