Commit 1277a22

Pushing the docs to dev/ for branch: master, commit 104b736d4446d1d3a996bceb8340776dff1882a5
1 parent 92e7e2b commit 1277a22

1,202 files changed (+3841 / -3816 lines)


dev/_downloads/679566501b743cb339497968edb9d62f/plot_gradient_boosting_regression.py

Lines changed: 51 additions & 37 deletions
@@ -6,25 +6,25 @@
 This example demonstrates Gradient Boosting to produce a predictive
 model from an ensemble of weak predictive models. Gradient boosting can be used
 for regression and classification problems. Here, we will train a model to
-tackle a diabetes regression task.
-
-We will obtain the results from
+tackle a diabetes regression task. We will obtain the results from
 :class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss
 and 500 regression trees of depth 4.
 
+Note: For larger datasets (n_samples >= 10000), please refer to
+:class:`sklearn.ensemble.HistGradientBoostingRegressor`
 """
 print(__doc__)
 
 # Author: Peter Prettenhofer <[email protected]>
 #         Maria Telenczuk <https://github.com/maikia>
+#         Katrina Ni <https://github.com/nilichen>
 #
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
-
-from sklearn import ensemble
-from sklearn import datasets
+import numpy as np
+from sklearn import datasets, ensemble
+from sklearn.inspection import permutation_importance
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import train_test_split
 
@@ -47,23 +47,25 @@
 # regression model. You can play with those parameters to see how the
 # results change:
 #
-# Here:
-# n_estimators : is the number of boosting stages which will be performed.
-#     Later, we will plot and see how the deviance changes with those boosting
-#     operations.
-# max_depth : this limits the number of nodes in the tree. The best value
-#     depends on the interaction of the input variables.
-# min_samples_split : is the minimum number of samples required to split an
-#     internal node.
-# learning_rate: tells how much the contribution of each tree will shrink
-# loss: here, we decided to use least squeares as a loss function, however
-#     there are many other options (check
-#     :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are
-#     other possibilities)
-
-X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                    test_size=0.1,
-                                                    random_state=13)
+# n_estimators : the number of boosting stages which will be performed.
+#     Later, we will plot and see how the deviance changes with those boosting
+#     operations.
+#
+# max_depth : limits the number of nodes in the tree.
+#     The best value depends on the interaction of the input variables.
+#
+# min_samples_split : the minimum number of samples required to split an
+#     internal node.
+#
+# learning_rate : how much the contribution of each tree will shrink
+#
+# loss : here, we decided to use least squares as a loss function.
+#     However there are many other options (check
+#     :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are
+#     other possibilities)
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.1, random_state=13)
 
 params = {'n_estimators': 500,
           'max_depth': 4,
@@ -92,13 +94,11 @@
 # test set deviance and then plot it.
 
 test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
-
 for i, y_pred in enumerate(clf.staged_predict(X_test)):
     test_score[i] = clf.loss_(y_test, y_pred)
 
-fig = plt.figure(figsize=(12, 8))
-
-plt.subplot(1, 2, 1)
+fig = plt.figure(figsize=(6, 6))
+plt.subplot(1, 1, 1)
 plt.title('Deviance')
 plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
          label='Training Set Deviance')
@@ -107,25 +107,39 @@
 plt.legend(loc='upper right')
 plt.xlabel('Boosting Iterations')
 plt.ylabel('Deviance')
+fig.tight_layout()
+plt.show()
 
 ##############################################################################
-# Plot impurity-based feature importance
+# Plot feature importance
 # -------------------------------------
 #
 # Careful, impurity-based feature importances can be misleading for
-# high cardinality features (many unique values). See
-# :func:`sklearn.inspection.permutation_importance` as an alternative.
+# high cardinality features (many unique values). As an alternative,
+# the permutation importances of ``clf`` are computed on a
+# held out test set. See :ref:`permutation_importance` for more details.
+#
+# In this case, the two methods agree to identify the same top 2 features
+# as strongly predictive features but not in the same order. The third most
+# predictive feature, "bp", is also the same for the 2 methods. The remaining
+# features are less predictive and the error bars of the permutation plot
+# show that they overlap with 0.
 
 feature_importance = clf.feature_importances_
-# make importances relative to max importance
-feature_importance = 100.0 * (feature_importance / feature_importance.max())
 sorted_idx = np.argsort(feature_importance)
 pos = np.arange(sorted_idx.shape[0]) + .5
-plt.subplot(1, 2, 2)
+fig = plt.figure(figsize=(12, 6))
+plt.subplot(1, 2, 1)
 plt.barh(pos, feature_importance[sorted_idx], align='center')
 plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
-plt.xlabel('Relative Importance')
-plt.title('Variable Importance')
-fig.tight_layout()
+plt.title('Feature Importance (MDI)')
 
+result = permutation_importance(clf, X_test, y_test, n_repeats=10,
+                                random_state=42, n_jobs=2)
+sorted_idx = result.importances_mean.argsort()
+plt.subplot(1, 2, 2)
+plt.boxplot(result.importances[sorted_idx].T,
+            vert=False, labels=np.array(diabetes.feature_names)[sorted_idx])
+plt.title("Permutation Importance (test set)")
+fig.tight_layout()
 plt.show()
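
A side note on the docstring addition above: the commit points readers to HistGradientBoostingRegressor for larger datasets but does not show it in use. The following is a minimal, illustrative sketch (not part of this commit) of fitting that estimator on the same diabetes split; the parameter choices are arbitrary, and on scikit-learn versions older than 1.0 the experimental enable import noted in the comment is required first.

# Illustrative sketch (not part of this commit): the histogram-based
# estimator that the new docstring note recommends for larger datasets.
# On scikit-learn < 1.0, this experimental import is required first:
# from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn import datasets
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.1, random_state=13)

# max_iter plays the role of n_estimators; histogram binning makes this
# estimator much faster once n_samples grows large (>= 10000 or so).
hist_reg = HistGradientBoostingRegressor(max_iter=500, learning_rate=0.01,
                                         max_depth=4, random_state=13)
hist_reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, hist_reg.predict(X_test))
print("HistGradientBoostingRegressor test MSE: {:.4f}".format(mse))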

dev/_downloads/cdc6134a701824f26cc08df7bd1e479a/plot_gradient_boosting_regression.ipynb

Lines changed: 7 additions & 7 deletions
@@ -15,7 +15,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\n# Gradient Boosting regression\n\n\nThis example demonstrates Gradient Boosting to produce a predictive\nmodel from an ensemble of weak predictive models. Gradient boosting can be used\nfor regression and classification problems. Here, we will train a model to\ntackle a diabetes regression task.\n\nWe will obtain the results from\n:class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss\nand 500 regression trees of depth 4.\n"
+    "\n# Gradient Boosting regression\n\n\nThis example demonstrates Gradient Boosting to produce a predictive\nmodel from an ensemble of weak predictive models. Gradient boosting can be used\nfor regression and classification problems. Here, we will train a model to\ntackle a diabetes regression task. We will obtain the results from\n:class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss\nand 500 regression trees of depth 4.\n\nNote: For larger datasets (n_samples >= 10000), please refer to\n:class:`sklearn.ensemble.HistGradientBoostingRegressor`\n"
   ]
  },
  {
@@ -26,7 +26,7 @@
  },
   "outputs": [],
   "source": [
-    "print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn import datasets\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.model_selection import train_test_split"
+    "print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Katrina Ni <https://github.com/nilichen>\n#\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn import datasets, ensemble\nfrom sklearn.inspection import permutation_importance\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.model_selection import train_test_split"
   ]
  },
  {
@@ -51,7 +51,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Data preprocessing\n-------------------------------------\n\nNext, we will split our dataset to use 90% for training and leave the rest\nfor testing. We will also prepare the parameters we want to use to fit our\nregression model. You can play with those parameters to see how the\nresults change:\n\nHere:\nn_estimators : is the number of boosting stages which will be performed.\n Later, we will plot and see how the deviance changes with those boosting\n operations.\nmax_depth : this limits the number of nodes in the tree. The best value\n depends on the interaction of the input variables.\nmin_samples_split : is the minimum number of samples required to split an\n internal node.\nlearning_rate: tells how much the contribution of each tree will shrink\nloss: here, we decided to use least squeares as a loss function, however\n there are many other options (check\n :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are\n other possibilities)\n\n"
+    "Data preprocessing\n-------------------------------------\n\nNext, we will split our dataset to use 90% for training and leave the rest\nfor testing. We will also prepare the parameters we want to use to fit our\nregression model. You can play with those parameters to see how the\nresults change:\n\nn_estimators : the number of boosting stages which will be performed.\nLater, we will plot and see how the deviance changes with those boosting\noperations.\n\nmax_depth : limits the number of nodes in the tree.\nThe best value depends on the interaction of the input variables.\n\nmin_samples_split : the minimum number of samples required to split an\ninternal node.\n\nlearning_rate : how much the contribution of each tree will shrink\n\nloss : here, we decided to use least squares as a loss function.\nHowever there are many other options (check\n:class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are\nother possibilities)\n\n"
   ]
  },
  {
@@ -62,7 +62,7 @@
  },
   "outputs": [],
   "source": [
-    "X_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.1,\n random_state=13)\n\nparams = {'n_estimators': 500,\n 'max_depth': 4,\n 'min_samples_split': 5,\n 'learning_rate': 0.01,\n 'loss': 'ls'}"
+    "X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.1, random_state=13)\n\nparams = {'n_estimators': 500,\n 'max_depth': 4,\n 'min_samples_split': 5,\n 'learning_rate': 0.01,\n 'loss': 'ls'}"
   ]
  },
  {
@@ -98,14 +98,14 @@
  },
   "outputs": [],
   "source": [
-    "test_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n\nfor i, y_pred in enumerate(clf.staged_predict(X_test)):\n test_score[i] = clf.loss_(y_test, y_pred)\n\nfig = plt.figure(figsize=(12, 8))\n\nplt.subplot(1, 2, 1)\nplt.title('Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',\n label='Training Set Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n label='Test Set Deviance')\nplt.legend(loc='upper right')\nplt.xlabel('Boosting Iterations')\nplt.ylabel('Deviance')"
+    "test_score = np.zeros((params['n_estimators'],), dtype=np.float64)\nfor i, y_pred in enumerate(clf.staged_predict(X_test)):\n test_score[i] = clf.loss_(y_test, y_pred)\n\nfig = plt.figure(figsize=(6, 6))\nplt.subplot(1, 1, 1)\nplt.title('Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',\n label='Training Set Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n label='Test Set Deviance')\nplt.legend(loc='upper right')\nplt.xlabel('Boosting Iterations')\nplt.ylabel('Deviance')\nfig.tight_layout()\nplt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Plot impurity-based feature importance\n-------------------------------------\n\nCareful, impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n\n"
+    "Plot feature importance\n-------------------------------------\n\nCareful, impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). As an alternative,\nthe permutation importances of ``clf`` are computed on a\nheld out test set. See `permutation_importance` for more details.\n\nIn this case, the two methods agree to identify the same top 2 features\nas strongly predictive features but not in the same order. The third most\npredictive feature, \"bp\", is also the same for the 2 methods. The remaining\nfeatures are less predictive and the error bars of the permutation plot\nshow that they overlap with 0.\n\n"
   ]
  },
  {
@@ -116,7 +116,7 @@
  },
   "outputs": [],
   "source": [
-    "feature_importance = clf.feature_importances_\n# make importances relative to max importance\nfeature_importance = 100.0 * (feature_importance / feature_importance.max())\nsorted_idx = np.argsort(feature_importance)\npos = np.arange(sorted_idx.shape[0]) + .5\nplt.subplot(1, 2, 2)\nplt.barh(pos, feature_importance[sorted_idx], align='center')\nplt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])\nplt.xlabel('Relative Importance')\nplt.title('Variable Importance')\nfig.tight_layout()\n\nplt.show()"
+    "feature_importance = clf.feature_importances_\nsorted_idx = np.argsort(feature_importance)\npos = np.arange(sorted_idx.shape[0]) + .5\nfig = plt.figure(figsize=(12, 6))\nplt.subplot(1, 2, 1)\nplt.barh(pos, feature_importance[sorted_idx], align='center')\nplt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])\nplt.title('Feature Importance (MDI)')\n\nresult = permutation_importance(clf, X_test, y_test, n_repeats=10,\n random_state=42, n_jobs=2)\nsorted_idx = result.importances_mean.argsort()\nplt.subplot(1, 2, 2)\nplt.boxplot(result.importances[sorted_idx].T,\n vert=False, labels=np.array(diabetes.feature_names)[sorted_idx])\nplt.title(\"Permutation Importance (test set)\")\nfig.tight_layout()\nplt.show()"
   ]
  }
 ],
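
As background for the permutation-importance cell added above: sklearn.inspection.permutation_importance returns a Bunch whose importances_mean, importances_std, and importances (shape (n_features, n_repeats)) attributes are exactly what the new boxplot sorts and draws. The sketch below is illustrative only and not part of this commit; the smaller n_estimators is an arbitrary choice to keep it quick.

# Minimal sketch of inspecting a permutation_importance result
# (illustrative only; not part of this commit).
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.1, random_state=13)

reg = GradientBoostingRegressor(n_estimators=100, random_state=13)
reg.fit(X_train, y_train)

result = permutation_importance(reg, X_test, y_test, n_repeats=10,
                                random_state=42)
# result.importances has shape (n_features, n_repeats); the mean over
# repeats is what the example sorts the features by.
for idx in result.importances_mean.argsort()[::-1]:
    print("{:>8s}  mean={:+.4f}  std={:.4f}".format(
        diabetes.feature_names[idx],
        result.importances_mean[idx],
        result.importances_std[idx]))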

dev/_downloads/scikit-learn-docs.pdf

18.8 KB
Binary file not shown.

dev/_images/iris.png

Binary file not shown.

0 commit comments
