
Commit 47540ce

Pushing the docs to dev/ for branch: master, commit faaeba4e0fcd8b2a8548deb95261ae934171d9d1
1 parent 227200a commit 47540ce


1,209 files changed (+4116, -4105 lines)


dev/_downloads/51a82a09a4aa0f703f69fb5d4f15104f/plot_partial_dependence_visualization_api.ipynb

Lines changed: 3 additions & 3 deletions

@@ -62,7 +62,7 @@
 },
 "outputs": [],
 "source": [
- "fig, ax = plt.subplots(figsize=(12, 6))\nax.set_title(\"Decision Tree\")\ntree_disp = plot_partial_dependence(tree, X, [\"LSTAT\", \"RM\"],\n feature_names=X.columns.tolist(), ax=ax)"
+ "fig, ax = plt.subplots(figsize=(12, 6))\nax.set_title(\"Decision Tree\")\ntree_disp = plot_partial_dependence(tree, X, [\"LSTAT\", \"RM\"], ax=ax)"
 ]
 },
 {
@@ -80,7 +80,7 @@
 },
 "outputs": [],
 "source": [
- "fig, ax = plt.subplots(figsize=(12, 6))\nax.set_title(\"Multi-layer Perceptron\")\nmlp_disp = plot_partial_dependence(mlp, X, [\"LSTAT\", \"RM\"],\n feature_names=X.columns.tolist(), ax=ax,\n line_kw={\"c\": \"red\"})"
+ "fig, ax = plt.subplots(figsize=(12, 6))\nax.set_title(\"Multi-layer Perceptron\")\nmlp_disp = plot_partial_dependence(mlp, X, [\"LSTAT\", \"RM\"], ax=ax,\n line_kw={\"c\": \"red\"})"
 ]
 },
 {
@@ -152,7 +152,7 @@
 },
 "outputs": [],
 "source": [
- "tree_disp = plot_partial_dependence(tree, X, [\"LSTAT\"],\n feature_names=X.columns.tolist())\nmlp_disp = plot_partial_dependence(mlp, X, [\"LSTAT\"],\n feature_names=X.columns.tolist(),\n ax=tree_disp.axes_, line_kw={\"c\": \"red\"})"
+ "tree_disp = plot_partial_dependence(tree, X, [\"LSTAT\"])\nmlp_disp = plot_partial_dependence(mlp, X, [\"LSTAT\"],\n ax=tree_disp.axes_, line_kw={\"c\": \"red\"})"
 ]
 }
 ],
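All of these hunks drop the explicit `feature_names=X.columns.tolist()` argument from the `plot_partial_dependence` calls. A minimal, hedged sketch of the behaviour this appears to rely on, assuming the function infers feature names from a pandas DataFrame's columns when `feature_names` is omitted (the toy data below is hypothetical and not part of this commit):

    # Hypothetical illustration, not taken from the commit: with a DataFrame
    # input, the axis labels are expected to come from X.columns, making an
    # explicit feature_names argument redundant.
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.inspection import plot_partial_dependence

    X = pd.DataFrame({"LSTAT": [4.98, 9.14, 4.03, 2.94, 5.33],
                      "RM": [6.575, 6.421, 7.185, 6.998, 7.147]})
    y = [24.0, 21.6, 34.7, 33.4, 36.2]

    tree = DecisionTreeRegressor(max_depth=2).fit(X, y)
    # No feature_names here: the plot should still be labelled LSTAT and RM.
    disp = plot_partial_dependence(tree, X, ["LSTAT", "RM"])
    plt.show()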

dev/_downloads/5a693c97e821586539ab9d250762742c/plot_partial_dependence.ipynb

Lines changed: 2 additions & 2 deletions

@@ -80,7 +80,7 @@
 },
 "outputs": [],
 "source": [
- "print('Computing partial dependence plots...')\ntic = time()\n# We don't compute the 2-way PDP (5, 1) here, because it is a lot slower\n# with the brute method.\nfeatures = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms']\nplot_partial_dependence(est, X_train, features,\n feature_names=X_train.columns.tolist(),\n n_jobs=3, grid_resolution=20)\nprint(\"done in {:.3f}s\".format(time() - tic))\nfig = plt.gcf()\nfig.suptitle('Partial dependence of house value on non-location features\\n'\n 'for the California housing dataset, with MLPRegressor')\nfig.subplots_adjust(hspace=0.3)"
+ "print('Computing partial dependence plots...')\ntic = time()\n# We don't compute the 2-way PDP (5, 1) here, because it is a lot slower\n# with the brute method.\nfeatures = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms']\nplot_partial_dependence(est, X_train, features,\n n_jobs=3, grid_resolution=20)\nprint(\"done in {:.3f}s\".format(time() - tic))\nfig = plt.gcf()\nfig.suptitle('Partial dependence of house value on non-location features\\n'\n 'for the California housing dataset, with MLPRegressor')\nfig.subplots_adjust(hspace=0.3)"
 ]
 },
 {
@@ -116,7 +116,7 @@
 },
 "outputs": [],
 "source": [
- "print('Computing partial dependence plots...')\ntic = time()\nfeatures = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms',\n ('AveOccup', 'HouseAge')]\nplot_partial_dependence(est, X_train, features,\n feature_names=X_train.columns.tolist(),\n n_jobs=3, grid_resolution=20)\nprint(\"done in {:.3f}s\".format(time() - tic))\nfig = plt.gcf()\nfig.suptitle('Partial dependence of house value on non-location features\\n'\n 'for the California housing dataset, with Gradient Boosting')\nfig.subplots_adjust(wspace=0.4, hspace=0.3)"
+ "print('Computing partial dependence plots...')\ntic = time()\nfeatures = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms',\n ('AveOccup', 'HouseAge')]\nplot_partial_dependence(est, X_train, features,\n n_jobs=3, grid_resolution=20)\nprint(\"done in {:.3f}s\".format(time() - tic))\nfig = plt.gcf()\nfig.suptitle('Partial dependence of house value on non-location features\\n'\n 'for the California housing dataset, with Gradient Boosting')\nfig.subplots_adjust(wspace=0.4, hspace=0.3)"
 ]
 },
 {

dev/_downloads/781bb5a2dc85df6b75ee78d2eb118b0b/plot_partial_dependence_visualization_api.py

Lines changed: 3 additions & 7 deletions

@@ -55,8 +55,7 @@
 # defined by `ax`.
 fig, ax = plt.subplots(figsize=(12, 6))
 ax.set_title("Decision Tree")
-tree_disp = plot_partial_dependence(tree, X, ["LSTAT", "RM"],
-                                    feature_names=X.columns.tolist(), ax=ax)
+tree_disp = plot_partial_dependence(tree, X, ["LSTAT", "RM"], ax=ax)

 ##############################################################################
 # The partial dependence curves can be plotted for the multi-layer perceptron.
@@ -65,8 +64,7 @@
 # the curve.
 fig, ax = plt.subplots(figsize=(12, 6))
 ax.set_title("Multi-layer Perceptron")
-mlp_disp = plot_partial_dependence(mlp, X, ["LSTAT", "RM"],
-                                    feature_names=X.columns.tolist(), ax=ax,
+mlp_disp = plot_partial_dependence(mlp, X, ["LSTAT", "RM"], ax=ax,
                                    line_kw={"c": "red"})

 ##############################################################################
@@ -134,8 +132,6 @@
 # Here, we plot the partial dependence curves for a single feature, "LSTAT", on
 # the same axes. In this case, `tree_disp.axes_` is passed into the second
 # plot function.
-tree_disp = plot_partial_dependence(tree, X, ["LSTAT"],
-                                    feature_names=X.columns.tolist())
+tree_disp = plot_partial_dependence(tree, X, ["LSTAT"])
 mlp_disp = plot_partial_dependence(mlp, X, ["LSTAT"],
-                                   feature_names=X.columns.tolist(),
                                    ax=tree_disp.axes_, line_kw={"c": "red"})
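This example exercises the new plotting API, so a short hedged follow-up may help readers of the diff: the display objects returned above can be redrawn without recomputing the partial dependence values. The sketch assumes `tree_disp` and `mlp_disp` from the lines above are in scope and that the returned display objects expose a `plot` method accepting `ax` and `line_kw` (an assumption, not shown in these hunks):

    # Hedged sketch, not part of the commit: redraw the stored curves on fresh
    # Axes; calling .plot() only redraws, it does not refit or recompute.
    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    tree_disp.plot(ax=ax1)
    mlp_disp.plot(ax=ax2, line_kw={"c": "red"})
    plt.show()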

dev/_downloads/7ee55c12f8d3eb1dd8d2005d9dd7b6f1/plot_release_highlights_0_22_0.py

Lines changed: 111 additions & 109 deletions
@@ -20,27 +20,73 @@
 """

 ##############################################################################
-# KNN Based Imputation
-# ------------------------------------
-# We now support imputation for completing missing values using k-Nearest
-# Neighbors.
+# New plotting API
+# ----------------
 #
-# Each sample's missing values are imputed using the mean value from
-# ``n_neighbors`` nearest neighbors found in the training set. Two samples are
-# close if the features that neither is missing are close.
-# By default, a euclidean distance metric
-# that supports missing values,
-# :func:`~metrics.nan_euclidean_distances`, is used to find the nearest
-# neighbors.
+# A new plotting API is available for creating visualizations. This new API
+# allows for quickly adjusting the visuals of a plot without involving any
+# recomputation. It is also possible to add different plots to the same
+# figure. See more examples in the :ref:`User Guide <visualizations>`.
+
+from sklearn.model_selection import train_test_split
+from sklearn.svm import SVC
+from sklearn.metrics import plot_roc_curve
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.datasets import make_classification
+import matplotlib.pyplot as plt
+
+X, y = make_classification(random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
+svc = SVC(random_state=42)
+svc.fit(X_train, y_train)
+rfc = RandomForestClassifier(random_state=42)
+rfc.fit(X_train, y_train)
+
+svc_disp = plot_roc_curve(svc, X_test, y_test)
+rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
+rfc_disp.figure_.suptitle("ROC curve comparison")
+
+plt.show()
+
+############################################################################
+# Stacking Classifier and Regressor
+# ---------------------------------
+# :class:`~ensemble.StackingClassifier` and
+# :class:`~ensemble.StackingRegressor`
+# allow you to have a stack of estimators with a final classifier or
+# a regressor.
+# Stacked generalization consists in stacking the output of individual
+# estimators and use a classifier to compute the final prediction. Stacking
+# allows to use the strength of each individual estimator by using their output
+# as input of a final estimator.
+# Base estimators are fitted on the full ``X`` while
+# the final estimator is trained using cross-validated predictions of the
+# base estimators using ``cross_val_predict``.
 #
-# Read more in the :ref:`User Guide <knnimpute>`.
+# Read more in the :ref:`User Guide <stacking>`.

-import numpy as np
-from sklearn.impute import KNNImputer
+from sklearn.datasets import load_iris
+from sklearn.svm import LinearSVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.ensemble import StackingClassifier
+from sklearn.model_selection import train_test_split

-X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
-imputer = KNNImputer(n_neighbors=2)
-print(imputer.fit_transform(X))
+X, y = load_iris(return_X_y=True)
+estimators = [
+    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
+    ('svr', make_pipeline(StandardScaler(),
+                          LinearSVC(random_state=42)))
+]
+clf = StackingClassifier(
+    estimators=estimators, final_estimator=LogisticRegression()
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, stratify=y, random_state=42
+)
+clf.fit(X_train, y_train).score(X_test, y_test)

 ##############################################################################
 # Permutation-based feature importance
@@ -50,9 +96,7 @@
 # estimate of the importance of each feature, for any fitted estimator:

 from sklearn.ensemble import RandomForestClassifier
-from sklearn.datasets import make_classification
 from sklearn.inspection import permutation_importance
-import matplotlib.pyplot as plt

 X, y = make_classification(random_state=0, n_features=5, n_informative=3)
 rf = RandomForestClassifier(random_state=0).fit(X, y)
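The hunk above only shows imports being consolidated; the call that actually computes the importances falls outside the diff context. For readers skimming, a minimal hedged sketch, assuming `sklearn.inspection.permutation_importance` with its `n_repeats` and `random_state` parameters and reusing `rf`, `X`, `y` from the context lines above:

    # Hedged sketch, not part of the commit: permutation importance of a fitted
    # estimator; importances_mean holds one averaged importance per feature.
    from sklearn.inspection import permutation_importance

    result = permutation_importance(rf, X, y, n_repeats=10, random_state=0)
    print(result.importances_mean)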
@@ -87,32 +131,60 @@
 gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
 print(gbdt.predict(X))

-##############################################################################
-# New plotting API
-# ----------------
-#
-# A new plotting API is available for creating visualizations. This new API
-# allows for quickly adjusting the visuals of a plot without involving any
-# recomputation. It is also possible to add different plots to the same
-# figure. See more examples in the :ref:`User Guide <visualizations>`.
+############################################################################
+# Precomputed sparse nearest neighbors graph
+# ------------------------------------------
+# Most estimators based on nearest neighbors graphs now accept precomputed
+# sparse graphs as input, to reuse the same graph for multiple estimator fits.
+# To use this feature in a pipeline, one can use the `memory` parameter, along
+# with one of the two new transformers,
+# :class:`neighbors.KNeighborsTransformer` and
+# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
+# can also be performed by custom estimators to use alternative
+# implementations, such as approximate nearest neighbors methods.
+# See more details in the :ref:`User Guide <neighbors_transformer>`.

-from sklearn.model_selection import train_test_split
-from sklearn.svm import SVC
-from sklearn.metrics import plot_roc_curve
+from tempfile import TemporaryDirectory
+from sklearn.neighbors import KNeighborsTransformer
+from sklearn.manifold import Isomap
+from sklearn.pipeline import make_pipeline

 X, y = make_classification(random_state=0)
-X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

-svc = SVC(random_state=42)
-svc.fit(X_train, y_train)
-rfc = RandomForestClassifier(random_state=42)
-rfc.fit(X_train, y_train)
+with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
+    estimator = make_pipeline(
+        KNeighborsTransformer(n_neighbors=10, mode='distance'),
+        Isomap(n_neighbors=10, metric='precomputed'),
+        memory=tmpdir)
+    estimator.fit(X)

-svc_disp = plot_roc_curve(svc, X_test, y_test)
-rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
-rfc_disp.figure_.suptitle("ROC curve comparison")
+    # We can decrease the number of neighbors and the graph will not be
+    # recomputed.
+    estimator.set_params(isomap__n_neighbors=5)
+    estimator.fit(X)

-plt.show()
+##############################################################################
+# KNN Based Imputation
+# ------------------------------------
+# We now support imputation for completing missing values using k-Nearest
+# Neighbors.
+#
+# Each sample's missing values are imputed using the mean value from
+# ``n_neighbors`` nearest neighbors found in the training set. Two samples are
+# close if the features that neither is missing are close.
+# By default, a euclidean distance metric
+# that supports missing values,
+# :func:`~metrics.nan_euclidean_distances`, is used to find the nearest
+# neighbors.
+#
+# Read more in the :ref:`User Guide <knnimpute>`.
+
+import numpy as np
+from sklearn.impute import KNNImputer
+
+X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
+imputer = KNNImputer(n_neighbors=2)
+print(imputer.fit_transform(X))

 #############################################################################
 # Tree pruning
@@ -143,76 +215,6 @@
 titanic = fetch_openml('titanic', version=1, as_frame=True)
 print(titanic.data.head()[['pclass', 'embarked']])

-############################################################################
-# Precomputed sparse nearest neighbors graph
-# ------------------------------------------
-# Most estimators based on nearest neighbors graphs now accept precomputed
-# sparse graphs as input, to reuse the same graph for multiple estimator fits.
-# To use this feature in a pipeline, one can use the `memory` parameter, along
-# with one of the two new transformers,
-# :class:`neighbors.KNeighborsTransformer` and
-# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
-# can also be performed by custom estimators to use alternative
-# implementations, such as approximate nearest neighbors methods.
-# See more details in the :ref:`User Guide <neighbors_transformer>`.
-
-from tempfile import TemporaryDirectory
-from sklearn.neighbors import KNeighborsTransformer
-from sklearn.manifold import Isomap
-from sklearn.pipeline import make_pipeline
-
-with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
-    estimator = make_pipeline(
-        KNeighborsTransformer(n_neighbors=10, mode='distance'),
-        Isomap(n_neighbors=10, metric='precomputed'),
-        memory=tmpdir)
-    estimator.fit(X)
-
-    # We can decrease the number of neighbors and the graph will not be
-    # recomputed.
-    estimator.set_params(isomap__n_neighbors=5)
-    estimator.fit(X)
-
-############################################################################
-# Stacking Classifier and Regressor
-# ---------------------------------
-# :class:`~ensemble.StackingClassifier` and
-# :class:`~ensemble.StackingRegressor`
-# allow you to have a stack of estimators with a final classifier or
-# a regressor.
-# Stacked generalization consists in stacking the output of individual
-# estimators and use a classifier to compute the final prediction. Stacking
-# allows to use the strength of each individual estimator by using their output
-# as input of a final estimator.
-# Base estimators are fitted on the full ``X`` while
-# the final estimator is trained using cross-validated predictions of the
-# base estimators using ``cross_val_predict``.
-#
-# Read more in the :ref:`User Guide <stacking>`.
-
-from sklearn.datasets import load_iris
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.svm import LinearSVC
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler
-from sklearn.pipeline import make_pipeline
-from sklearn.ensemble import StackingClassifier
-from sklearn.model_selection import train_test_split
-
-X, y = load_iris(return_X_y=True)
-estimators = [
-    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
-    ('svr', make_pipeline(StandardScaler(),
-                          LinearSVC(random_state=42)))
-]
-clf = StackingClassifier(
-    estimators=estimators, final_estimator=LogisticRegression()
-)
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, stratify=y, random_state=42
-)
-clf.fit(X_train, y_train).score(X_test, y_test)
-
 ############################################################################
 # Checking scikit-learn compatibility of an estimator
 # ---------------------------------------------------
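The diff is truncated before the body of this last section. As a hedged illustration of what checking compatibility refers to, a minimal sketch using `sklearn.utils.estimator_checks.check_estimator` (an assumption about the section's content; the actual example code is not shown in this commit):

    # Hypothetical sketch: run scikit-learn's estimator API checks against an
    # estimator instance; check_estimator raises if a check fails and
    # completes silently for a compliant estimator.
    from sklearn.svm import LinearSVC
    from sklearn.utils.estimator_checks import check_estimator

    check_estimator(LinearSVC())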
