Skip to content

Commit 4821369

Browse files
committed
Pushing the docs to dev/ for branch: master, commit cd3d502de08d4f95b2d56306a3ecf3e8f56c3236
1 parent 3fc66b8 commit 4821369

File tree

1,203 files changed

+5299
-3768
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,203 files changed

+5299
-3768
lines changed
Binary file not shown.
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""
2+
========================================
3+
Release Highlights for scikit-learn 0.22
4+
========================================
5+
6+
We are pleased to announce the release of scikit-learn 0.22, which comes
7+
with many bug fixes and new features! We detail below a few of the major
8+
features of this release. For an exhaustive list of all the changes, please
9+
refer to the :ref:`release notes <changes_0_22>`.
10+
11+
To install the latest version (with pip)::
12+
13+
pip install --upgrade scikit-learn
14+
15+
or with conda::
16+
17+
conda install scikit-learn
18+
"""
19+
20+
##############################################################################
21+
# Permutation-based feature importance
22+
# ------------------------------------
23+
#
24+
# The :func:`~sklearn.inspection.permutation_importance` can be used to get an
25+
# estimate of the importance of each feature, for any fitted estimator:
26+
27+
from sklearn.ensemble import RandomForestClassifier
28+
from sklearn.datasets import make_classification
29+
from sklearn.inspection import permutation_importance
30+
import matplotlib.pyplot as plt
31+
32+
X, y = make_classification(random_state=0, n_features=5, n_informative=3)
33+
rf = RandomForestClassifier(random_state=0).fit(X, y)
34+
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0,
35+
n_jobs=-1)
36+
37+
fig, ax = plt.subplots()
38+
sorted_idx = result.importances_mean.argsort()
39+
ax.boxplot(result.importances[sorted_idx].T,
40+
vert=False, labels=range(X.shape[1]))
41+
ax.set_title("Permutation Importance of each feature")
42+
ax.set_ylabel("Features")
43+
fig.tight_layout()
44+
plt.show()
45+
46+
##############################################################################
47+
# Native support for missing values for gradient boosting
48+
# -------------------------------------------------------
49+
#
50+
# The :class:`~sklearn.ensemble.HistGradientBoostingClassifier`
51+
# and :class:`~sklearn.ensemble.HistGradientBoostingRegressor` now have native
52+
# support for missing values (NaNs). This means that there is no need for
53+
# imputing data when training or predicting.
54+
55+
from sklearn.experimental import enable_hist_gradient_boosting # noqa
56+
from sklearn.ensemble import HistGradientBoostingClassifier
57+
import numpy as np
58+
59+
X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
60+
y = [0, 0, 1, 1]
61+
62+
gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
63+
print(gbdt.predict(X))
64+
65+
##############################################################################
66+
# New plotting API
67+
# ----------------
68+
#
69+
# A new plotting API is available for creating visualizations. This new API
70+
# allows for quickly adjusting the visuals of a plot without involving any
71+
# recomputation. It is also possible to add different plots to the same
72+
# figure. See more examples in the :ref:`User Guide <visualizations>`.
73+
74+
from sklearn.model_selection import train_test_split
75+
from sklearn.svm import SVC
76+
from sklearn.metrics import plot_roc_curve
77+
78+
X, y = make_classification(random_state=0)
79+
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
80+
81+
svc = SVC(random_state=42)
82+
svc.fit(X_train, y_train)
83+
rfc = RandomForestClassifier(random_state=42)
84+
rfc.fit(X_train, y_train)
85+
86+
svc_disp = plot_roc_curve(svc, X_test, y_test)
87+
rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
88+
rfc_disp.figure_.suptitle("ROC curve comparison")
89+
90+
plt.show()
91+
92+
#############################################################################
93+
# Tree pruning
94+
# ------------
95+
#
96+
# It is now possible to prune most tree-based estimators once the trees are
97+
# built. The pruning is based on minimal cost-complexity. Read more in the
98+
# :ref:`User Guide <minimal_cost_complexity_pruning>` for details.
99+
100+
X, y = make_classification(random_state=0)
101+
102+
rf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y)
103+
print("Average number of nodes without pruning {:.1f}".format(
104+
np.mean([e.tree_.node_count for e in rf.estimators_])))
105+
106+
rf = RandomForestClassifier(random_state=0, ccp_alpha=0.05).fit(X, y)
107+
print("Average number of nodes with pruning {:.1f}".format(
108+
np.mean([e.tree_.node_count for e in rf.estimators_])))
109+
110+
############################################################################
111+
# Retrieve dataframes from OpenML
112+
# -------------------------------
113+
# :func:`~sklearn.datasets.fetch_openml` can now return a pandas dataframe and thus
114+
# properly handle datasets with heterogeneous data:
115+
116+
from sklearn.datasets import fetch_openml
117+
118+
titanic = fetch_openml('titanic', version=1, as_frame=True)
119+
print(titanic.data.head()[['pclass', 'embarked']])
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n========================================\nRelease Highlights for scikit-learn 0.22\n========================================\n\nWe are pleased to announce the release of scikit-learn 0.22, which comes\nwith many bug fixes and new features! We detail below a few of the major\nfeatures of this release. For an exhaustive list of all the changes, please\nrefer to the `release notes <changes_0_22>`.\n\nTo install the latest version (with pip)::\n\n pip install -U scikit-learn --upgrade\n\nor with conda::\n\n conda install scikit-learn\n"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"Permutation-based feature importance\n------------------------------------\n\nThe :func:`~sklearn.inspection.permutation_importance` can be used to get an\nestimate of the importance of each feature, for any fitted estimator:\n\n"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {
32+
"collapsed": false
33+
},
34+
"outputs": [],
35+
"source": [
36+
"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.datasets import make_classification\nfrom sklearn.inspection import permutation_importance\nimport matplotlib.pyplot as plt\n\nX, y = make_classification(random_state=0, n_features=5, n_informative=3)\nrf = RandomForestClassifier(random_state=0).fit(X, y)\nresult = permutation_importance(rf, X, y, n_repeats=10, random_state=0,\n n_jobs=-1)\n\nfig, ax = plt.subplots()\nsorted_idx = result.importances_mean.argsort()\nax.boxplot(result.importances[sorted_idx].T,\n vert=False, labels=range(X.shape[1]))\nax.set_title(\"Permutation Importance of each feature\")\nax.set_ylabel(\"Features\")\nfig.tight_layout()\nplt.show()"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"Native support for missing values for gradient boosting\n-------------------------------------------------------\n\nThe :class:`~sklearn.ensemble.HistGradientBoostingClassifier`\nand :class:`~sklearn.ensemble.HistGradientBoostingRegressor` now have native\nsupport for missing values (NaNs). This means that there is no need for\nimputing data when training or predicting.\n\n"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": null,
49+
"metadata": {
50+
"collapsed": false
51+
},
52+
"outputs": [],
53+
"source": [
54+
"from sklearn.experimental import enable_hist_gradient_boosting # noqa\nfrom sklearn.ensemble import HistGradientBoostingClassifier\nimport numpy as np\n\nX = np.array([0, 1, 2, np.nan]).reshape(-1, 1)\ny = [0, 0, 1, 1]\n\ngbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)\nprint(gbdt.predict(X))"
55+
]
56+
},
57+
{
58+
"cell_type": "markdown",
59+
"metadata": {},
60+
"source": [
61+
"New plotting API\n----------------\n\nA new plotting API is available for creating visualizations. This new API\nallows for quickly adjusting the visuals of a plot without involving any\nrecomputation. It is also possible to add different plots to the same\nfigure. See more examples in the `User Guide <visualizations>`.\n\n"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": null,
67+
"metadata": {
68+
"collapsed": false
69+
},
70+
"outputs": [],
71+
"source": [
72+
"from sklearn.model_selection import train_test_split\nfrom sklearn.svm import SVC\nfrom sklearn.metrics import plot_roc_curve\n\nX, y = make_classification(random_state=0)\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n\nsvc = SVC(random_state=42)\nsvc.fit(X_train, y_train)\nrfc = RandomForestClassifier(random_state=42)\nrfc.fit(X_train, y_train)\n\nsvc_disp = plot_roc_curve(svc, X_test, y_test)\nrfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)\nrfc_disp.figure_.suptitle(\"ROC curve comparison\")\n\nplt.show()"
73+
]
74+
},
75+
{
76+
"cell_type": "markdown",
77+
"metadata": {},
78+
"source": [
79+
"Tree pruning\n------------\n\nIt is now possible to prune most tree-based estimators once the trees are\nbuilt. The pruning is based on minimal cost-complexity. Read more in the\n`User Guide <minimal_cost_complexity_pruning>` for details.\n\n"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"metadata": {
86+
"collapsed": false
87+
},
88+
"outputs": [],
89+
"source": [
90+
"X, y = make_classification(random_state=0)\n\nrf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y)\nprint(\"Average number of nodes without pruning {:.1f}\".format(\n np.mean([e.tree_.node_count for e in rf.estimators_])))\n\nrf = RandomForestClassifier(random_state=0, ccp_alpha=0.05).fit(X, y)\nprint(\"Average number of nodes with pruning {:.1f}\".format(\n np.mean([e.tree_.node_count for e in rf.estimators_])))"
91+
]
92+
},
93+
{
94+
"cell_type": "markdown",
95+
"metadata": {},
96+
"source": [
97+
"Retrieve dataframes from OpenML\n-------------------------------\n:func:`datasets.fetch_openml` can now return pandas dataframe and thus\nproperly handle datasets with heterogeneous data:\n\n"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": null,
103+
"metadata": {
104+
"collapsed": false
105+
},
106+
"outputs": [],
107+
"source": [
108+
"from sklearn.datasets import fetch_openml\n\ntitanic = fetch_openml('titanic', version=1, as_frame=True)\nprint(titanic.data.head()[['pclass', 'embarked']])"
109+
]
110+
}
111+
],
112+
"metadata": {
113+
"kernelspec": {
114+
"display_name": "Python 3",
115+
"language": "python",
116+
"name": "python3"
117+
},
118+
"language_info": {
119+
"codemirror_mode": {
120+
"name": "ipython",
121+
"version": 3
122+
},
123+
"file_extension": ".py",
124+
"mimetype": "text/x-python",
125+
"name": "python",
126+
"nbconvert_exporter": "python",
127+
"pygments_lexer": "ipython3",
128+
"version": "3.7.4"
129+
}
130+
},
131+
"nbformat": 4,
132+
"nbformat_minor": 0
133+
}
Binary file not shown.

dev/_downloads/scikit-learn-docs.pdf

35.4 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes
-268 Bytes
-268 Bytes
-290 Bytes
-290 Bytes

0 commit comments

Comments
 (0)