20 | 20 | """
21 | 21 |
22 | 22 | ##############################################################################
23 | | -# KNN Based Imputation
24 | | -# ------------------------------------
25 | | -# We now support imputation for completing missing values using k-Nearest
26 | | -# Neighbors.
| 23 | +# New plotting API
| 24 | +# ----------------
27 | 25 | #
28 | | -# Each sample's missing values are imputed using the mean value from
29 | | -# ``n_neighbors`` nearest neighbors found in the training set. Two samples are
30 | | -# close if the features that neither is missing are close.
31 | | -# By default, a euclidean distance metric
32 | | -# that supports missing values,
33 | | -# :func:`~metrics.nan_euclidean_distances`, is used to find the nearest
34 | | -# neighbors.
| 26 | +# A new plotting API is available for creating visualizations. This new API
| 27 | +# allows for quickly adjusting the visuals of a plot without involving any
| 28 | +# recomputation. It is also possible to add different plots to the same
| 29 | +# figure. See more examples in the :ref:`User Guide <visualizations>`.
| 30 | +
| 31 | +from sklearn.model_selection import train_test_split
| 32 | +from sklearn.svm import SVC
| 33 | +from sklearn.metrics import plot_roc_curve
| 34 | +from sklearn.ensemble import RandomForestClassifier
| 35 | +from sklearn.datasets import make_classification
| 36 | +import matplotlib.pyplot as plt
| 37 | +
| 38 | +X, y = make_classification(random_state=0)
| 39 | +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
| 40 | +
| 41 | +svc = SVC(random_state=42)
| 42 | +svc.fit(X_train, y_train)
| 43 | +rfc = RandomForestClassifier(random_state=42)
| 44 | +rfc.fit(X_train, y_train)
| 45 | +
| 46 | +svc_disp = plot_roc_curve(svc, X_test, y_test)
| 47 | +rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
| 48 | +rfc_disp.figure_.suptitle("ROC curve comparison")
| 49 | +
| 50 | +plt.show()
| 51 | +
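A minimal sketch of the "no recomputation" point above, continuing the snippet just added; it assumes the ``RocCurveDisplay`` objects returned by ``plot_roc_curve`` expose a ``plot`` method that redraws the stored curve (the styling keywords are purely illustrative):

# Redraw the stored curves with new styling; nothing is refitted or re-predicted.
_, ax = plt.subplots()
svc_disp.plot(ax=ax, name="SVC", alpha=0.8)
rfc_disp.plot(ax=ax, name="Random Forest", linestyle="--")
plt.show()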
| 52 | +############################################################################
| 53 | +# Stacking Classifier and Regressor
| 54 | +# ---------------------------------
| 55 | +# :class:`~ensemble.StackingClassifier` and
| 56 | +# :class:`~ensemble.StackingRegressor`
| 57 | +# allow you to have a stack of estimators with a final classifier or
| 58 | +# a regressor.
| 59 | +# Stacked generalization consists in stacking the output of individual
| 60 | +# estimators and using a classifier to compute the final prediction. Stacking
| 61 | +# leverages the strength of each individual estimator by using their output
| 62 | +# as the input of a final estimator.
| 63 | +# Base estimators are fitted on the full ``X`` while
| 64 | +# the final estimator is trained on cross-validated predictions of the
| 65 | +# base estimators, obtained with ``cross_val_predict``.
35 | 66 | #
36 | | -# Read more in the :ref:`User Guide <knnimpute>`.
| 67 | +# Read more in the :ref:`User Guide <stacking>`.
37 | 68 |
38 | | -import numpy as np
39 | | -from sklearn.impute import KNNImputer
| 69 | +from sklearn.datasets import load_iris
| 70 | +from sklearn.svm import LinearSVC
| 71 | +from sklearn.linear_model import LogisticRegression
| 72 | +from sklearn.preprocessing import StandardScaler
| 73 | +from sklearn.pipeline import make_pipeline
| 74 | +from sklearn.ensemble import StackingClassifier
| 75 | +from sklearn.model_selection import train_test_split
40 | 76 |
41 | | -X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
42 | | -imputer = KNNImputer(n_neighbors=2)
43 | | -print(imputer.fit_transform(X))
| 77 | +X, y = load_iris(return_X_y=True)
| 78 | +estimators = [
| 79 | +    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
| 80 | +    ('svr', make_pipeline(StandardScaler(),
| 81 | +                          LinearSVC(random_state=42)))
| 82 | +]
| 83 | +clf = StackingClassifier(
| 84 | +    estimators=estimators, final_estimator=LogisticRegression()
| 85 | +)
| 86 | +X_train, X_test, y_train, y_test = train_test_split(
| 87 | +    X, y, stratify=y, random_state=42
| 88 | +)
| 89 | +clf.fit(X_train, y_train).score(X_test, y_test)
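The regressor counterpart mentioned above follows the same pattern. A minimal sketch, with dataset and estimator choices that are illustrative rather than taken from this diff:

from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV

# Stack a gradient boosting regressor and a ridge model, blended by RidgeCV.
X_diab, y_diab = load_diabetes(return_X_y=True)
reg = StackingRegressor(
    estimators=[('gbr', GradientBoostingRegressor(random_state=42)),
                ('ridge', RidgeCV())],
    final_estimator=RidgeCV())
X_tr, X_te, y_tr, y_te = train_test_split(X_diab, y_diab, random_state=42)
reg.fit(X_tr, y_tr)
print(reg.score(X_te, y_te))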
44 | 90 |
45 | 91 | ##############################################################################
46 | 92 | # Permutation-based feature importance
50 | 96 | # estimate of the importance of each feature, for any fitted estimator:
51 | 97
52 | 98 | from sklearn.ensemble import RandomForestClassifier
53 | | -from sklearn.datasets import make_classification
54 | 99 | from sklearn.inspection import permutation_importance
55 | | -import matplotlib.pyplot as plt
56 | 100
57 | 101 | X, y = make_classification(random_state=0, n_features=5, n_informative=3)
58 | 102 | rf = RandomForestClassifier(random_state=0).fit(X, y)
87 | 131 | gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
88 | 132 | print(gbdt.predict(X))
89 | 133
90 | | -##############################################################################
91 | | -# New plotting API
92 | | -# ----------------
93 | | -#
94 | | -# A new plotting API is available for creating visualizations. This new API
95 | | -# allows for quickly adjusting the visuals of a plot without involving any
96 | | -# recomputation. It is also possible to add different plots to the same
97 | | -# figure. See more examples in the :ref:`User Guide <visualizations>`.
| 134 | +############################################################################
| 135 | +# Precomputed sparse nearest neighbors graph
| 136 | +# ------------------------------------------
| 137 | +# Most estimators based on nearest neighbors graphs now accept precomputed
| 138 | +# sparse graphs as input, to reuse the same graph for multiple estimator fits.
| 139 | +# To use this feature in a pipeline, one can use the `memory` parameter, along
| 140 | +# with one of the two new transformers,
| 141 | +# :class:`neighbors.KNeighborsTransformer` and
| 142 | +# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
| 143 | +# can also be performed by custom estimators to use alternative
| 144 | +# implementations, such as approximate nearest neighbors methods.
| 145 | +# See more details in the :ref:`User Guide <neighbors_transformer>`.
98 | 146 |
99 | | -from sklearn.model_selection import train_test_split
100 | | -from sklearn.svm import SVC
101 | | -from sklearn.metrics import plot_roc_curve
| 147 | +from tempfile import TemporaryDirectory
| 148 | +from sklearn.neighbors import KNeighborsTransformer
| 149 | +from sklearn.manifold import Isomap
| 150 | +from sklearn.pipeline import make_pipeline
102 | 151 |
103 | 152 | X, y = make_classification(random_state=0)
104 | | -X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
105 | 153
106 | | -svc = SVC(random_state=42)
107 | | -svc.fit(X_train, y_train)
108 | | -rfc = RandomForestClassifier(random_state=42)
109 | | -rfc.fit(X_train, y_train)
| 154 | +with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
| 155 | +    estimator = make_pipeline(
| 156 | +        KNeighborsTransformer(n_neighbors=10, mode='distance'),
| 157 | +        Isomap(n_neighbors=10, metric='precomputed'),
| 158 | +        memory=tmpdir)
| 159 | +    estimator.fit(X)
110 | 160 |
111 | | -svc_disp = plot_roc_curve(svc, X_test, y_test)
112 | | -rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
113 | | -rfc_disp.figure_.suptitle("ROC curve comparison")
| 161 | +    # We can decrease the number of neighbors and the graph will not be
| 162 | +    # recomputed.
| 163 | +    estimator.set_params(isomap__n_neighbors=5)
| 164 | +    estimator.fit(X)
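A rough sketch of feeding a precomputed sparse graph straight to an estimator, outside the pipeline/caching approach above; it assumes ``DBSCAN`` accepts a sparse distance matrix with ``metric='precomputed'``, and the ``eps`` value is arbitrary, chosen only for illustration:

from sklearn.cluster import DBSCAN

# Build the sparse k-neighbors distance graph once, then reuse it directly.
graph = KNeighborsTransformer(n_neighbors=10, mode='distance').fit_transform(X)
clustering = DBSCAN(eps=3.0, min_samples=5, metric='precomputed').fit(graph)
print(clustering.labels_[:10])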
114 | 165 |
115 | | -plt.show()
| 166 | +##############################################################################
| 167 | +# KNN Based Imputation
| 168 | +# ------------------------------------
| 169 | +# We now support imputation for completing missing values using k-Nearest
| 170 | +# Neighbors.
| 171 | +#
| 172 | +# Each sample's missing values are imputed using the mean value from
| 173 | +# ``n_neighbors`` nearest neighbors found in the training set. Two samples are
| 174 | +# close if the features that neither is missing are close.
| 175 | +# By default, a euclidean distance metric
| 176 | +# that supports missing values,
| 177 | +# :func:`~metrics.nan_euclidean_distances`, is used to find the nearest
| 178 | +# neighbors.
| 179 | +#
| 180 | +# Read more in the :ref:`User Guide <knnimpute>`.
| 181 | +
| 182 | +import numpy as np
| 183 | +from sklearn.impute import KNNImputer
| 184 | +
| 185 | +X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
| 186 | +imputer = KNNImputer(n_neighbors=2)
| 187 | +print(imputer.fit_transform(X))
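A small follow-on sketch, assuming the ``weights`` parameter of :class:`~impute.KNNImputer`: neighbor values can be weighted by inverse distance instead of averaged uniformly, reusing the toy matrix above.

# Weight each neighbor's contribution by inverse distance rather than uniformly.
imputer_w = KNNImputer(n_neighbors=2, weights="distance")
print(imputer_w.fit_transform(X))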
116 | 188 |
117 | 189 | #############################################################################
118 | 190 | # Tree pruning
143 | 215 | titanic = fetch_openml('titanic', version=1, as_frame=True)
144 | 216 | print(titanic.data.head()[['pclass', 'embarked']])
145 | 217
146 | | -############################################################################
147 | | -# Precomputed sparse nearest neighbors graph
148 | | -# ------------------------------------------
149 | | -# Most estimators based on nearest neighbors graphs now accept precomputed
150 | | -# sparse graphs as input, to reuse the same graph for multiple estimator fits.
151 | | -# To use this feature in a pipeline, one can use the `memory` parameter, along
152 | | -# with one of the two new transformers,
153 | | -# :class:`neighbors.KNeighborsTransformer` and
154 | | -# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
155 | | -# can also be performed by custom estimators to use alternative
156 | | -# implementations, such as approximate nearest neighbors methods.
157 | | -# See more details in the :ref:`User Guide <neighbors_transformer>`.
158 | | -
159 | | -from tempfile import TemporaryDirectory
160 | | -from sklearn.neighbors import KNeighborsTransformer
161 | | -from sklearn.manifold import Isomap
162 | | -from sklearn.pipeline import make_pipeline
163 | | -
164 | | -with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
165 | | -    estimator = make_pipeline(
166 | | -        KNeighborsTransformer(n_neighbors=10, mode='distance'),
167 | | -        Isomap(n_neighbors=10, metric='precomputed'),
168 | | -        memory=tmpdir)
169 | | -    estimator.fit(X)
170 | | -
171 | | -    # We can decrease the number of neighbors and the graph will not be
172 | | -    # recomputed.
173 | | -    estimator.set_params(isomap__n_neighbors=5)
174 | | -    estimator.fit(X)
175 | | -
176 | | -############################################################################
177 | | -# Stacking Classifier and Regressor
178 | | -# ---------------------------------
179 | | -# :class:`~ensemble.StackingClassifier` and
180 | | -# :class:`~ensemble.StackingRegressor`
181 | | -# allow you to have a stack of estimators with a final classifier or
182 | | -# a regressor.
183 | | -# Stacked generalization consists in stacking the output of individual
184 | | -# estimators and use a classifier to compute the final prediction. Stacking
185 | | -# allows to use the strength of each individual estimator by using their output
186 | | -# as input of a final estimator.
187 | | -# Base estimators are fitted on the full ``X`` while
188 | | -# the final estimator is trained using cross-validated predictions of the
189 | | -# base estimators using ``cross_val_predict``.
190 | | -#
191 | | -# Read more in the :ref:`User Guide <stacking>`.
192 | | -
193 | | -from sklearn.datasets import load_iris
194 | | -from sklearn.ensemble import RandomForestClassifier
195 | | -from sklearn.svm import LinearSVC
196 | | -from sklearn.linear_model import LogisticRegression
197 | | -from sklearn.preprocessing import StandardScaler
198 | | -from sklearn.pipeline import make_pipeline
199 | | -from sklearn.ensemble import StackingClassifier
200 | | -from sklearn.model_selection import train_test_split
201 | | -
202 | | -X, y = load_iris(return_X_y=True)
203 | | -estimators = [
204 | | -    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
205 | | -    ('svr', make_pipeline(StandardScaler(),
206 | | -                          LinearSVC(random_state=42)))
207 | | -]
208 | | -clf = StackingClassifier(
209 | | -    estimators=estimators, final_estimator=LogisticRegression()
210 | | -)
211 | | -X_train, X_test, y_train, y_test = train_test_split(
212 | | -    X, y, stratify=y, random_state=42
213 | | -)
214 | | -clf.fit(X_train, y_train).score(X_test, y_test)
215 | | -
216 | 218 | ############################################################################
217 | 219 | # Checking scikit-learn compatibility of an estimator
218 | 220 | # ---------------------------------------------------