 Prediction Intervals for Gradient Boosting Regression
 =====================================================
 
-This example shows how quantile regression can be used
-to create prediction intervals.
+This example shows how quantile regression can be used to create prediction
+intervals.
 """
-
+# %%
+# Generate some data for a synthetic regression problem by applying the
+# function f to uniformly sampled random inputs.
 import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn.ensemble import GradientBoostingRegressor
-
-np.random.seed(1)
+from sklearn.model_selection import train_test_split
 
 
 def f(x):
     """The function to predict."""
     return x * np.sin(x)
 
-#----------------------------------------------------------------------
-# First the noiseless case
-X = np.atleast_2d(np.random.uniform(0, 10.0, size=100)).T
-X = X.astype(np.float32)
 
-# Observations
-y = f(X).ravel()
+rng = np.random.RandomState(42)
+X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T
+expected_y = f(X).ravel()
+
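As an illustrative aside (a sketch using the variables above, not part of the commit itself): `np.atleast_2d(...).T` turns the 1,000 sampled values into a single-feature column matrix, the 2D input layout scikit-learn estimators expect::

    # Shapes of the generated design matrix and noise-free target.
    print(X.shape)           # (1000, 1)
    print(expected_y.shape)  # (1000,)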
+# %%
+# To make the problem interesting, we generate observations of the target y as
+# the sum of a deterministic term computed by the function f and a random noise
+# term that follows a centered `log-normal
+# <https://en.wikipedia.org/wiki/Log-normal_distribution>`_ distribution. To
+# make this even more interesting, we consider the case where the amplitude of
+# the noise depends on the input variable x (heteroscedastic noise).
+#
+# The log-normal distribution is non-symmetric and long-tailed: observing large
+# outliers is likely but it is impossible to observe small outliers.
+sigma = 0.5 + X.ravel() / 10
+noise = rng.lognormal(sigma=sigma) - np.exp(sigma ** 2 / 2)
+y = expected_y + noise
+
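A quick sanity check of this construction, sketched here with the variables defined above (not from the commit): the subtracted term `np.exp(sigma ** 2 / 2)` is the mean of a log-normal variable with zero location and scale `sigma`, so the shifted noise should be approximately centered while its spread still grows with x::

    # The centering term is the log-normal mean, so the average noise is ~0,
    # while the noise amplitude increases with x (heteroscedasticity).
    print(f"noise mean: {noise.mean():.3f}")
    print(f"noise std for x < 5:  {noise[X.ravel() < 5].std():.3f}")
    print(f"noise std for x >= 5: {noise[X.ravel() >= 5].std():.3f}")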
+# %%
+# Split into train and test datasets:
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+# %%
+# Fitting non-linear quantile and least squares regressors
+# --------------------------------------------------------
+#
+# Fit gradient boosting models trained with the quantile loss and
+# alpha=0.05, 0.5, 0.95.
+#
+# The models obtained for alpha=0.05 and alpha=0.95 produce a 90% confidence
+# interval (95% - 5% = 90%).
+#
+# The model trained with alpha=0.5 produces a regression of the median: on
+# average, there should be the same number of target observations above and
+# below the predicted values.
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.metrics import mean_pinball_loss, mean_squared_error
+
 
-dy = 1.5 + 1.0 * np.random.random(y.shape)
-noise = np.random.normal(0, dy)
-y += noise
-y = y.astype(np.float32)
+all_models = {}
+common_params = dict(
+    learning_rate=0.05,
+    n_estimators=250,
+    max_depth=2,
+    min_samples_leaf=9,
+    min_samples_split=9,
+)
+for alpha in [0.05, 0.5, 0.95]:
+    gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha,
+                                    **common_params)
+    all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)
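To illustrate the median property described in the comment above, a quick check (a sketch reusing `all_models`, `X_train` and `y_train`, not from the commit) can count the fraction of training targets that fall below the predictions of the alpha=0.5 model::

    # The quantile loss with alpha=0.5 targets the conditional median, so about
    # 50% of the training observations should lie below its predictions.
    y_med_train = all_models["q 0.50"].predict(X_train)
    print("fraction below the predicted median: %.3f"
          % np.mean(y_train <= y_med_train))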
 
-# Mesh the input space for evaluations of the real function, the prediction and
-# its MSE
+# %%
+# For the sake of comparison, also fit a baseline model trained with the usual
+# least squares loss (ls), also known as the mean squared error (MSE).
+gbr_ls = GradientBoostingRegressor(loss='ls', **common_params)
+all_models["ls"] = gbr_ls.fit(X_train, y_train)
+
+# %%
+# Create an evenly spaced evaluation set of input values spanning the [0, 10]
+# range.
 xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
-xx = xx.astype(np.float32)
 
-alpha = 0.95
+# %%
+# Plot the true conditional mean function f, the prediction of the conditional
+# mean (least squares loss), the conditional median and the conditional 90%
+# interval (from 5th to 95th conditional percentiles).
+import matplotlib.pyplot as plt
+
+
+y_pred = all_models['ls'].predict(xx)
+y_lower = all_models['q 0.05'].predict(xx)
+y_upper = all_models['q 0.95'].predict(xx)
+y_med = all_models['q 0.50'].predict(xx)
+
+fig = plt.figure(figsize=(10, 10))
+plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
+plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
+plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange")
+plt.plot(xx, y_pred, 'r-', label='Predicted mean')
+plt.plot(xx, y_upper, 'k-')
+plt.plot(xx, y_lower, 'k-')
+plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
+                 label='Predicted 90% interval')
+plt.xlabel('$x$')
+plt.ylabel('$f(x)$')
+plt.ylim(-10, 25)
+plt.legend(loc='upper left')
+plt.show()
+
+# %%
+# Comparing the predicted median with the predicted mean, we note that the
+# median is on average below the mean as the noise is skewed towards high
+# values (large outliers). The median estimate also seems to be smoother
+# because of its natural robustness to outliers.
+#
+# Also observe that the inductive bias of gradient boosting trees is
+# unfortunately preventing our 0.05 quantile from fully capturing the
+# sinusoidal shape of the signal, in particular around x=8. Tuning
+# hyper-parameters can reduce this effect as shown in the last part of this
+# notebook.
+#
+# Analysis of the error metrics
+# -----------------------------
+#
+# Measure the models with :func:`mean_squared_error` and
+# :func:`mean_pinball_loss` metrics on the training dataset.
+import pandas as pd
+
+
+def highlight_min(x):
+    x_min = x.min()
+    return ['font-weight: bold' if v == x_min else ''
+            for v in x]
+
+
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_train)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_train, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_train, y_pred)
+    results.append(metrics)
+
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+
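The pinball loss itself is only used through `mean_pinball_loss` above; as a reminder of what it computes, here is a minimal sketch of the formula (not from the commit), checked against the scikit-learn implementation on the median model fitted earlier::

    # Residuals above the prediction are weighted by alpha, residuals below it
    # by (1 - alpha); the pinball loss is the average of these weighted errors.
    def pinball_loss(y_true, y_pred, alpha):
        diff = y_true - y_pred
        return np.mean(np.maximum(alpha * diff, (alpha - 1) * diff))

    y_pred_med = all_models["q 0.50"].predict(X_train)
    print(pinball_loss(y_train, y_pred_med, alpha=0.5))       # hand-rolled
    print(mean_pinball_loss(y_train, y_pred_med, alpha=0.5))  # scikit-learn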
+# %%
+# Each column shows all models evaluated with the same metric. The minimum
+# value in a column should be obtained when the model is trained and measured
+# with that same metric. This should always be the case on the training set if
+# the training has converged.
+#
+# Note that because the target distribution is asymmetric, the expected
+# conditional mean and conditional median are significantly different and
+# therefore one cannot use the least squares model to get a good estimate of
+# the conditional median, nor the converse.
+#
+# If the target distribution were symmetric and had no outliers (e.g. with
+# Gaussian noise), then the median estimator and the least squares estimator
+# would have yielded similar predictions.
+#
+# We then do the same on the test set.
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_test)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_test, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_test, y_pred)
+    results.append(metrics)
 
-clf = GradientBoostingRegressor(loss='quantile', alpha=alpha,
-                                n_estimators=250, max_depth=3,
-                                learning_rate=.1, min_samples_leaf=9,
-                                min_samples_split=9)
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
 
-clf.fit(X, y)
 
-# Make the prediction on the meshed x-axis
-y_upper = clf.predict(xx)
+# %%
+# Errors are higher on the test set, meaning the models slightly overfitted
+# the data. It still shows that the best test metric is obtained when the
+# model is trained by minimizing this same metric.
+#
+# Note that the conditional median estimator is competitive with the least
+# squares estimator in terms of MSE on the test set: this can be explained by
+# the fact that the least squares estimator is very sensitive to large
+# outliers, which can cause significant overfitting. This can be seen on the
+# right-hand side of the previous plot. The conditional median estimator is
+# biased (underestimation for this asymmetric noise) but is also naturally
+# robust to outliers and overfits less.
+#
+# Calibration of the confidence interval
+# --------------------------------------
+#
+# We can also evaluate the ability of the two extreme quantile estimators to
+# produce a well-calibrated conditional 90% confidence interval.
+#
+# To do this we can compute the fraction of observations that fall between the
+# predictions:
+def coverage_fraction(y, y_low, y_high):
+    return np.mean(np.logical_and(y >= y_low, y <= y_high))
 
-clf.set_params(alpha=1.0 - alpha)
-clf.fit(X, y)
 
-# Make the prediction on the meshed x-axis
-y_lower = clf.predict(xx)
+coverage_fraction(y_train,
+                  all_models['q 0.05'].predict(X_train),
+                  all_models['q 0.95'].predict(X_train))
 
-clf.set_params(loss='ls')
-clf.fit(X, y)
+# %%
+# On the training set the calibration is very close to the expected coverage
+# value for a 90% confidence interval.
+coverage_fraction(y_test,
+                  all_models['q 0.05'].predict(X_test),
+                  all_models['q 0.95'].predict(X_test))
 
-# Make the prediction on the meshed x-axis
-y_pred = clf.predict(xx)
 
-# Plot the function, the prediction and the 95% confidence interval based on
-# the MSE
-fig = plt.figure()
-plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$')
-plt.plot(X, y, 'b.', markersize=10, label=u'Observations')
-plt.plot(xx, y_pred, 'r-', label=u'Prediction')
+# %%
+# On the test set, the estimated confidence interval is slightly too narrow.
+# Note, however, that we would need to wrap those metrics in a cross-validation
+# loop to assess their variability under data resampling.
+#
+# Tuning the hyper-parameters of the quantile regressors
+# ------------------------------------------------------
+#
+# In the plot above, we observed that the 5th percentile regressor seems to
+# underfit and could not adapt to the sinusoidal shape of the signal.
+#
+# The hyper-parameters of the model were approximately hand-tuned for the
+# median regressor and there is no reason that the same hyper-parameters are
+# suitable for the 5th percentile regressor.
+#
+# To confirm this hypothesis, we tune the hyper-parameters of a new regressor
+# of the 5th percentile by selecting the best model parameters by
+# cross-validation on the pinball loss with alpha=0.05:
+
+# %%
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.metrics import make_scorer
+from pprint import pprint
+
+
+param_grid = dict(
+    learning_rate=[0.01, 0.05, 0.1],
+    n_estimators=[100, 150, 200, 250, 300],
+    max_depth=[2, 5, 10, 15, 20],
+    min_samples_leaf=[1, 5, 10, 20, 30, 50],
+    min_samples_split=[2, 5, 10, 20, 30, 50],
+)
+alpha = 0.05
+neg_mean_pinball_loss_05p_scorer = make_scorer(
+    mean_pinball_loss,
+    alpha=alpha,
+    greater_is_better=False,  # maximize the negative loss
+)
+gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0)
+search_05p = RandomizedSearchCV(
+    gbr,
+    param_grid,
+    n_iter=10,  # increase this if computational budget allows
+    scoring=neg_mean_pinball_loss_05p_scorer,
+    n_jobs=2,
+    random_state=0,
+).fit(X_train, y_train)
+pprint(search_05p.best_params_)
+
+# %%
+# We observe that the search procedure identifies that deeper trees are needed
+# to get a good fit for the 5th percentile regressor. Deeper trees are more
+# expressive and less likely to underfit.
+#
+# Let's now tune the hyper-parameters for the 95th percentile regressor. We
+# need to redefine the `scoring` metric used to select the best model, along
+# with adjusting the alpha parameter of the inner gradient boosting estimator
+# itself:
+from sklearn.base import clone
+
+alpha = 0.95
+neg_mean_pinball_loss_95p_scorer = make_scorer(
+    mean_pinball_loss,
+    alpha=alpha,
+    greater_is_better=False,  # maximize the negative loss
+)
+search_95p = clone(search_05p).set_params(
+    estimator__alpha=alpha,
+    scoring=neg_mean_pinball_loss_95p_scorer,
+)
+search_95p.fit(X_train, y_train)
+pprint(search_95p.best_params_)
+
+# %%
+# This time, shallower trees are selected and lead to a more piecewise-constant
+# and therefore more robust estimation of the 95th percentile. This is
+# beneficial as it avoids overfitting the large outliers of the log-normal
+# additive noise.
+#
+# We can confirm this intuition by displaying the predicted 90% confidence
+# interval delimited by the predictions of those two tuned quantile regressors:
+# the prediction of the upper 95th percentile has a much coarser shape than the
+# prediction of the lower 5th percentile:
+y_lower = search_05p.predict(xx)
+y_upper = search_95p.predict(xx)
+
+fig = plt.figure(figsize=(10, 10))
+plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
+plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
 plt.plot(xx, y_upper, 'k-')
 plt.plot(xx, y_lower, 'k-')
-plt.fill(np.concatenate([xx, xx[::-1]]),
-         np.concatenate([y_upper, y_lower[::-1]]),
-         alpha=.5, fc='b', ec='None', label='95% prediction interval')
+plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
+                 label='Predicted 90% interval')
 plt.xlabel('$x$')
 plt.ylabel('$f(x)$')
-plt.ylim(-10, 20)
+plt.ylim(-10, 25)
 plt.legend(loc='upper left')
+plt.title("Prediction with tuned hyper-parameters")
 plt.show()
+
+# %%
+# The plot looks qualitatively better than for the untuned models, especially
+# for the shape of the lower quantile.
+#
+# We now quantitatively evaluate the joint calibration of the pair of
+# estimators:
+coverage_fraction(y_train,
+                  search_05p.predict(X_train),
+                  search_95p.predict(X_train))
+# %%
+coverage_fraction(y_test,
+                  search_05p.predict(X_test),
+                  search_95p.predict(X_test))
+# %%
+# The calibration of the tuned pair is sadly not better on the test set: the
+# width of the estimated confidence interval is still too narrow.
+#
+# Again, we would need to wrap this study in a cross-validation loop to
+# better assess the variability of those estimates.
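One way such a cross-validation loop could look, sketched under the assumption that the two tuned parameter sets are simply refit on each fold (not part of the commit itself)::

    # Refit the tuned 5th/95th percentile configurations on several folds and
    # report the spread of the test coverage of the resulting 90% interval.
    from sklearn.model_selection import KFold

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    coverages = []
    for train_idx, test_idx in cv.split(X):
        gbr_05p = GradientBoostingRegressor(
            loss="quantile", alpha=0.05, random_state=0,
            **search_05p.best_params_).fit(X[train_idx], y[train_idx])
        gbr_95p = GradientBoostingRegressor(
            loss="quantile", alpha=0.95, random_state=0,
            **search_95p.best_params_).fit(X[train_idx], y[train_idx])
        coverages.append(coverage_fraction(
            y[test_idx],
            gbr_05p.predict(X[test_idx]),
            gbr_95p.predict(X[test_idx])))
    print("coverage: %.3f +/- %.3f" % (np.mean(coverages), np.std(coverages)))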