
Commit b1ee500

Pushing the docs to dev/ for branch: master, commit b65c53d292097bf2382344cfdab8e7c126d215df
1 parent 193396a commit b1ee500

1,217 files changed: +4339 / -3942 lines


dev/_downloads/679566501b743cb339497968edb9d62f/plot_gradient_boosting_regression.py

Lines changed: 73 additions & 23 deletions
@@ -3,14 +3,20 @@
 Gradient Boosting regression
 ============================

-Demonstrate Gradient Boosting on the Boston housing dataset.
+This example demonstrates Gradient Boosting to produce a predictive
+model from an ensemble of weak predictive models. Gradient boosting can be used
+for regression and classification problems. Here, we will train a model to
+tackle a diabetes regression task.
+
+We will obtain the results from
+:class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss
+and 500 regression trees of depth 4.

-This example fits a Gradient Boosting model with least squares loss and
-500 regression trees of depth 4.
 """
 print(__doc__)

 # Author: Peter Prettenhofer <[email protected]>
+# Maria Telenczuk <https://github.com/maikia>
 #
 # License: BSD 3 clause
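
The new docstring above frames gradient boosting as building a predictive model from an ensemble of weak predictive models. As a rough, hedged sketch (not part of this commit), the following hand-rolled loop shows what that means for the least-squares loss used here: each shallow tree is fit to the residuals of the running ensemble and added with a shrinkage factor. All names below (n_stages, train_pred, ...) are illustrative only.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=13)

learning_rate, n_stages = 0.01, 500
# Start from a constant model (the training mean), as least-squares boosting does.
train_pred = np.full(y_train.shape, y_train.mean())
test_pred = np.full(y_test.shape, y_train.mean())
for _ in range(n_stages):
    residuals = y_train - train_pred          # negative gradient of the squared loss
    tree = DecisionTreeRegressor(max_depth=4).fit(X_train, residuals)
    train_pred += learning_rate * tree.predict(X_train)
    test_pred += learning_rate * tree.predict(X_test)

print("hand-rolled boosting, test MSE: {:.4f}".format(
    mean_squared_error(y_test, test_pred)))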

@@ -19,38 +25,79 @@

 from sklearn import ensemble
 from sklearn import datasets
-from sklearn.utils import shuffle
 from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+##############################################################################
+# Load the data
+# -------------------------------------
+#
+# First we need to load the data. We set random state to be consistent with the
+# result.
+
+diabetes = datasets.load_diabetes()
+X, y = diabetes.data, diabetes.target
+
+##############################################################################
+# Data preprocessing
+# -------------------------------------
+#
+# Next, we will split our dataset to use 90% for training and leave the rest
+# for testing. We will also prepare the parameters we want to use to fit our
+# regression model. You can play with those parameters to see how the
+# results change:
+#
+# Here:
+# n_estimators : is the number of boosting stages which will be performed.
+#     Later, we will plot and see how the deviance changes with those boosting
+#     operations.
+# max_depth : this limits the number of nodes in the tree. The best value
+#     depends on the interaction of the input variables.
+# min_samples_split : is the minimum number of samples required to split an
+#     internal node.
+# learning_rate: tells how much the contribution of each tree will shrink
+# loss: here, we decided to use least squares as a loss function, however
+#     there are many other options (check
+#     :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are
+#     other possibilities)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                    test_size=0.1,
+                                                    random_state=13)

-# #############################################################################
-# Load data
-boston = datasets.load_boston()
-X, y = shuffle(boston.data, boston.target, random_state=13)
-X = X.astype(np.float32)
-offset = int(X.shape[0] * 0.9)
-X_train, y_train = X[:offset], y[:offset]
-X_test, y_test = X[offset:], y[offset:]
+params = {'n_estimators': 500,
+          'max_depth': 4,
+          'min_samples_split': 5,
+          'learning_rate': 0.01,
+          'loss': 'ls'}

-# #############################################################################
+##############################################################################
 # Fit regression model
-params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
-          'learning_rate': 0.01, 'loss': 'ls'}
-clf = ensemble.GradientBoostingRegressor(**params)
+# -------------------------------------
+#
+# Now we will initialize the gradient boosting regressor and fit it with our
+# training data. Let's also look at the mean squared error on the test data.

+clf = ensemble.GradientBoostingRegressor(**params)
 clf.fit(X_train, y_train)
+
 mse = mean_squared_error(y_test, clf.predict(X_test))
-print("MSE: %.4f" % mse)
+print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

-# #############################################################################
+##############################################################################
 # Plot training deviance
+# -------------------------------------
+#
+# Finally, we will visualize the results. To do that we will first compute the
+# test set deviance and then plot it.

-# compute test set deviance
 test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

 for i, y_pred in enumerate(clf.staged_predict(X_test)):
     test_score[i] = clf.loss_(y_test, y_pred)

-plt.figure(figsize=(12, 6))
+fig = plt.figure(figsize=(12, 8))
+
 plt.subplot(1, 2, 1)
 plt.title('Deviance')
 plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
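
The comment block above invites the reader to play with n_estimators, max_depth, min_samples_split, learning_rate and the loss. As a hedged illustration only (not part of this commit), one systematic way to do that is a small grid search; the grid values and the 3-fold cross-validation below are arbitrary choices, and the loss is left at its default.

from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=13)

# Illustrative grid over the parameters discussed in the comment block.
param_grid = {'n_estimators': [100, 500],
              'max_depth': [2, 4],
              'min_samples_split': [2, 5],
              'learning_rate': [0.01, 0.1]}
search = GridSearchCV(GradientBoostingRegressor(random_state=13), param_grid,
                      cv=3, scoring='neg_mean_squared_error')
search.fit(X_train, y_train)

print("best parameters:", search.best_params_)
print("best CV MSE: {:.4f}".format(-search.best_score_))
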
@@ -61,10 +108,11 @@
 plt.xlabel('Boosting Iterations')
 plt.ylabel('Deviance')

-# #############################################################################
+##############################################################################
 # Plot impurity-based feature importance
+# -------------------------------------
 #
-# Warning: impurity-based feature importances can be misleading for
+# Careful, impurity-based feature importances can be misleading for
 # high cardinality features (many unique values). See
 # :func:`sklearn.inspection.permutation_importance` as an alternative.
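
The comment above points to sklearn.inspection.permutation_importance as the alternative to impurity-based importances. Below is a minimal, self-contained sketch of that alternative on the same diabetes split (not part of this commit; n_repeats=10 is an arbitrary choice).

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.1, random_state=13)
clf = ensemble.GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                         min_samples_split=5,
                                         learning_rate=0.01)
clf.fit(X_train, y_train)

# Permutation importance on the held-out split (model-agnostic and not
# biased towards high-cardinality features).
result = permutation_importance(clf, X_test, y_test, n_repeats=10,
                                random_state=13)
for idx in result.importances_mean.argsort()[::-1]:
    print("{:<4} {:.3f} +/- {:.3f}".format(diabetes.feature_names[idx],
                                           result.importances_mean[idx],
                                           result.importances_std[idx]))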

@@ -75,7 +123,9 @@
 pos = np.arange(sorted_idx.shape[0]) + .5
 plt.subplot(1, 2, 2)
 plt.barh(pos, feature_importance[sorted_idx], align='center')
-plt.yticks(pos, boston.feature_names[sorted_idx])
+plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
 plt.xlabel('Relative Importance')
 plt.title('Variable Importance')
+fig.tight_layout()
+
 plt.show()
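
The deviance plot built above shows where the test-set deviance stops improving across the 500 boosting iterations. As a hedged aside (not part of this commit), GradientBoostingRegressor can also stop boosting automatically via its built-in early-stopping parameters; the n_iter_no_change, validation_fraction and tol values below are arbitrary illustrations.

from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=13)

# Stop adding trees once the internal validation score has not improved
# for 10 consecutive iterations (values are illustrative).
reg = ensemble.GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                         min_samples_split=5,
                                         learning_rate=0.01,
                                         validation_fraction=0.1,
                                         n_iter_no_change=10, tol=1e-4)
reg.fit(X_train, y_train)

print("boosting stages actually fitted:", reg.n_estimators_)
print("test MSE: {:.4f}".format(mean_squared_error(y_test, reg.predict(X_test))))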

dev/_downloads/cdc6134a701824f26cc08df7bd1e479a/plot_gradient_boosting_regression.ipynb

Lines changed: 92 additions & 2 deletions
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Gradient Boosting regression\n\n\nDemonstrate Gradient Boosting on the Boston housing dataset.\n\nThis example fits a Gradient Boosting model with least squares loss and\n500 regression trees of depth 4.\n"
+"\n# Gradient Boosting regression\n\n\nThis example demonstrates Gradient Boosting to produce a predictive\nmodel from an ensemble of weak predictive models. Gradient boosting can be used\nfor regression and classification problems. Here, we will train a model to\ntackle a diabetes regression task.\n\nWe will obtain the results from\n:class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss\nand 500 regression trees of depth 4.\n"
 ]
 },
 {
@@ -26,7 +26,97 @@
 },
 "outputs": [],
 "source": [
-"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\n\n# #############################################################################\n# Load data\nboston = datasets.load_boston()\nX, y = shuffle(boston.data, boston.target, random_state=13)\nX = X.astype(np.float32)\noffset = int(X.shape[0] * 0.9)\nX_train, y_train = X[:offset], y[:offset]\nX_test, y_test = X[offset:], y[offset:]\n\n# #############################################################################\n# Fit regression model\nparams = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,\n          'learning_rate': 0.01, 'loss': 'ls'}\nclf = ensemble.GradientBoostingRegressor(**params)\n\nclf.fit(X_train, y_train)\nmse = mean_squared_error(y_test, clf.predict(X_test))\nprint(\"MSE: %.4f\" % mse)\n\n# #############################################################################\n# Plot training deviance\n\n# compute test set deviance\ntest_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n\nfor i, y_pred in enumerate(clf.staged_predict(X_test)):\n    test_score[i] = clf.loss_(y_test, y_pred)\n\nplt.figure(figsize=(12, 6))\nplt.subplot(1, 2, 1)\nplt.title('Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',\n         label='Training Set Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n         label='Test Set Deviance')\nplt.legend(loc='upper right')\nplt.xlabel('Boosting Iterations')\nplt.ylabel('Deviance')\n\n# #############################################################################\n# Plot impurity-based feature importance\n#\n# Warning: impurity-based feature importances can be misleading for\n# high cardinality features (many unique values). See\n# :func:`sklearn.inspection.permutation_importance` as an alternative.\n\nfeature_importance = clf.feature_importances_\n# make importances relative to max importance\nfeature_importance = 100.0 * (feature_importance / feature_importance.max())\nsorted_idx = np.argsort(feature_importance)\npos = np.arange(sorted_idx.shape[0]) + .5\nplt.subplot(1, 2, 2)\nplt.barh(pos, feature_importance[sorted_idx], align='center')\nplt.yticks(pos, boston.feature_names[sorted_idx])\nplt.xlabel('Relative Importance')\nplt.title('Variable Importance')\nplt.show()"
+"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn import datasets\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.model_selection import train_test_split"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Load the data\n-------------------------------------\n\nFirst we need to load the data. We set random state to be consistent with the\nresult.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"diabetes = datasets.load_diabetes()\nX, y = diabetes.data, diabetes.target"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Data preprocessing\n-------------------------------------\n\nNext, we will split our dataset to use 90% for training and leave the rest\nfor testing. We will also prepare the parameters we want to use to fit our\nregression model. You can play with those parameters to see how the\nresults change:\n\nHere:\nn_estimators : is the number of boosting stages which will be performed.\n    Later, we will plot and see how the deviance changes with those boosting\n    operations.\nmax_depth : this limits the number of nodes in the tree. The best value\n    depends on the interaction of the input variables.\nmin_samples_split : is the minimum number of samples required to split an\n    internal node.\nlearning_rate: tells how much the contribution of each tree will shrink\nloss: here, we decided to use least squares as a loss function, however\n    there are many other options (check\n    :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are\n    other possibilities)\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X_train, X_test, y_train, y_test = train_test_split(X, y,\n                                                    test_size=0.1,\n                                                    random_state=13)\n\nparams = {'n_estimators': 500,\n          'max_depth': 4,\n          'min_samples_split': 5,\n          'learning_rate': 0.01,\n          'loss': 'ls'}"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Fit regression model\n-------------------------------------\n\nNow we will initialize the gradient boosting regressor and fit it with our\ntraining data. Let's also look at the mean squared error on the test data.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"clf = ensemble.GradientBoostingRegressor(**params)\nclf.fit(X_train, y_train)\n\nmse = mean_squared_error(y_test, clf.predict(X_test))\nprint(\"The mean squared error (MSE) on test set: {:.4f}\".format(mse))"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Plot training deviance\n-------------------------------------\n\nFinally, we will visualize the results. To do that we will first compute the\ntest set deviance and then plot it.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"test_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n\nfor i, y_pred in enumerate(clf.staged_predict(X_test)):\n    test_score[i] = clf.loss_(y_test, y_pred)\n\nfig = plt.figure(figsize=(12, 8))\n\nplt.subplot(1, 2, 1)\nplt.title('Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',\n         label='Training Set Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n         label='Test Set Deviance')\nplt.legend(loc='upper right')\nplt.xlabel('Boosting Iterations')\nplt.ylabel('Deviance')"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Plot impurity-based feature importance\n-------------------------------------\n\nCareful, impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"feature_importance = clf.feature_importances_\n# make importances relative to max importance\nfeature_importance = 100.0 * (feature_importance / feature_importance.max())\nsorted_idx = np.argsort(feature_importance)\npos = np.arange(sorted_idx.shape[0]) + .5\nplt.subplot(1, 2, 2)\nplt.barh(pos, feature_importance[sorted_idx], align='center')\nplt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])\nplt.xlabel('Relative Importance')\nplt.title('Variable Importance')\nfig.tight_layout()\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/scikit-learn-docs.pdf

-21.5 KB
Binary file not shown.

dev/_images/iris.png

781 Bytes / 649 Bytes (binary image not shown)
