@@ -6,25 +6,25 @@
 This example demonstrates Gradient Boosting to produce a predictive
 model from an ensemble of weak predictive models. Gradient boosting can be used
 for regression and classification problems. Here, we will train a model to
-tackle a diabetes regression task.
-
-We will obtain the results from
+tackle a diabetes regression task. We will obtain the results from
 :class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss
 and 500 regression trees of depth 4.
 
+Note: For larger datasets (n_samples >= 10000), please refer to
+:class:`sklearn.ensemble.HistGradientBoostingRegressor`
 """
 print(__doc__)
 
 # Author: Peter Prettenhofer <[email protected]>
 #         Maria Telenczuk <https://github.com/maikia>
+#         Katrina Ni <https://github.com/nilichen>
 #
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
-
-from sklearn import ensemble
-from sklearn import datasets
+import numpy as np
+from sklearn import datasets, ensemble
+from sklearn.inspection import permutation_importance
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import train_test_split
 
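The note added above recommends :class:`sklearn.ensemble.HistGradientBoostingRegressor` for larger datasets (n_samples >= 10000). That variant is not shown anywhere in this commit; the following is only a hedged sketch of what the swap could look like on the same diabetes data. The parameter values are illustrative, and scikit-learn releases before 0.24 also require the ``enable_hist_gradient_boosting`` experimental import shown in the comment.

    # Hypothetical alternative (not part of this commit): the histogram-based
    # estimator scales much better once n_samples >= 10000 because it bins
    # the input features before growing the trees.
    # On scikit-learn < 0.24 the experimental flag must be imported first:
    # from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.model_selection import train_test_split

    X, y = load_diabetes(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

    # max_iter is the analogue of n_estimators; these values are illustrative.
    hist_gbr = HistGradientBoostingRegressor(max_iter=500, max_depth=4,
                                             learning_rate=0.01)
    hist_gbr.fit(X_train, y_train)
    print("Test R^2: {:.3f}".format(hist_gbr.score(X_test, y_test)))

The diabetes dataset itself is far below that size threshold, so this is purely to show the shape of the API swap.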
@@ -47,23 +47,25 @@
 # regression model. You can play with those parameters to see how the
 # results change:
 #
-# Here:
-# n_estimators : is the number of boosting stages which will be performed.
-# Later, we will plot and see how the deviance changes with those boosting
-# operations.
-# max_depth : this limits the number of nodes in the tree. The best value
-# depends on the interaction of the input variables.
-# min_samples_split : is the minimum number of samples required to split an
-# internal node.
-# learning_rate: tells how much the contribution of each tree will shrink
-# loss: here, we decided to use least squeares as a loss function, however
-# there are many other options (check
-# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are
-# other possibilities)
-
-X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                    test_size=0.1,
-                                                    random_state=13)
+# n_estimators : the number of boosting stages which will be performed.
+# Later, we will plot and see how the deviance changes with those boosting
+# operations.
+#
+# max_depth : limits the number of nodes in the tree.
+# The best value depends on the interaction of the input variables.
+#
+# min_samples_split : the minimum number of samples required to split an
+# internal node.
+#
+# learning_rate : how much the contribution of each tree will shrink.
+#
+# loss : here, we decided to use least squares as a loss function.
+# However, there are many other options (check
+# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see the
+# other possibilities).
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.1, random_state=13)
 
 params = {'n_estimators': 500,
           'max_depth': 4,
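The hunk above stops inside the ``params`` dict; the lines where the estimator is built and fit are unchanged and therefore collapsed out of the diff. For readers following along, here is a minimal sketch of what that omitted step presumably looks like, reusing the names (``params``, ``X_train``, ``y_train``, ``clf``) that the surrounding hunks rely on. The exact omitted code and the remaining ``params`` values are not visible here, so treat this as an assumption.

    # Assumed shape of the omitted, unchanged section: the ``params`` dict
    # described by the comments above (n_estimators, max_depth,
    # min_samples_split, learning_rate, loss) is unpacked into the estimator,
    # the model is fit, and the test error is reported.
    clf = ensemble.GradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)

    mse = mean_squared_error(y_test, clf.predict(X_test))
    print("The mean squared error (MSE) on test set: {:.4f}".format(mse))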
@@ -92,13 +94,11 @@
 # test set deviance and then plot it.
 
 test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
-
 for i, y_pred in enumerate(clf.staged_predict(X_test)):
     test_score[i] = clf.loss_(y_test, y_pred)
 
-fig = plt.figure(figsize=(12, 8))
-
-plt.subplot(1, 2, 1)
+fig = plt.figure(figsize=(6, 6))
+plt.subplot(1, 1, 1)
 plt.title('Deviance')
 plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
          label='Training Set Deviance')
@@ -107,25 +107,39 @@
 plt.legend(loc='upper right')
 plt.xlabel('Boosting Iterations')
 plt.ylabel('Deviance')
+fig.tight_layout()
+plt.show()
 
 ##############################################################################
-# Plot impurity-based feature importance
+# Plot feature importance
 # -------------------------------------
 #
 # Careful, impurity-based feature importances can be misleading for
-# high cardinality features (many unique values). See
-# :func:`sklearn.inspection.permutation_importance` as an alternative.
+# high cardinality features (many unique values). As an alternative,
+# the permutation importances of ``clf`` are computed on a
+# held-out test set. See :ref:`permutation_importance` for more details.
+#
+# In this case, the two methods identify the same top 2 features as the most
+# strongly predictive, but not in the same order. The third most predictive
+# feature, "bp", is also the same for both methods. The remaining features are
+# less predictive, and the error bars of the permutation plot show that they
+# overlap with 0.
 
 feature_importance = clf.feature_importances_
-# make importances relative to max importance
-feature_importance = 100.0 * (feature_importance / feature_importance.max())
 sorted_idx = np.argsort(feature_importance)
 pos = np.arange(sorted_idx.shape[0]) + .5
-plt.subplot(1, 2, 2)
+fig = plt.figure(figsize=(12, 6))
+plt.subplot(1, 2, 1)
 plt.barh(pos, feature_importance[sorted_idx], align='center')
 plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
-plt.xlabel('Relative Importance')
-plt.title('Variable Importance')
-fig.tight_layout()
+plt.title('Feature Importance (MDI)')
 
+result = permutation_importance(clf, X_test, y_test, n_repeats=10,
+                                random_state=42, n_jobs=2)
+sorted_idx = result.importances_mean.argsort()
+plt.subplot(1, 2, 2)
+plt.boxplot(result.importances[sorted_idx].T,
+            vert=False, labels=np.array(diabetes.feature_names)[sorted_idx])
+plt.title("Permutation Importance (test set)")
+fig.tight_layout()
 plt.show()
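The added commentary states that the error bars of the less predictive features overlap with 0. Not part of this commit, but as a small follow-up sketch building on ``result``, ``sorted_idx`` and ``diabetes`` from the code above, one rough way to check that claim numerically is to use the mean plus or minus two standard deviations as a proxy for the error bar, since ``permutation_importance`` returns ``importances_mean`` and ``importances_std``.

    # Illustrative follow-up (not in the commit): flag features whose mean
    # permutation importance is within two standard deviations of zero.
    for name, mean, std in zip(np.array(diabetes.feature_names)[sorted_idx],
                               result.importances_mean[sorted_idx],
                               result.importances_std[sorted_idx]):
        flag = "  (error bar overlaps 0)" if mean - 2 * std <= 0 else ""
        print("{:>4}: {:.4f} +/- {:.4f}{}".format(name, mean, std, flag))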