|
# flake8: noqa
"""
========================================
Release Highlights for scikit-learn 0.23
========================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 0.23! Many bug fixes
and improvements were added, as well as some new key features. We detail
below a few of the major features of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <changes_0_23>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install scikit-learn
"""

##############################################################################
# Generalized Linear Models, and Poisson loss for gradient boosting
# -----------------------------------------------------------------
# Long-awaited Generalized Linear Models with non-normal loss functions are now
# available. In particular, three new regressors were implemented:
# :class:`~sklearn.linear_model.PoissonRegressor`,
# :class:`~sklearn.linear_model.GammaRegressor`, and
# :class:`~sklearn.linear_model.TweedieRegressor`. The Poisson regressor can be
# used to model non-negative integer counts or relative frequencies. Read more
# in the :ref:`User Guide <Generalized_linear_regression>`. Additionally,
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` supports a new
# 'poisson' loss as well.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
# this import is required to enable the still-experimental
# HistGradientBoostingRegressor:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# non-negative integer target correlated with X[:, 5], with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))   # D^2, the fraction of deviance explained
print(gbdt.score(X_test, y_test))  # R^2 of the gradient boosting model
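
# As a minimal sketch, the same data can also be fitted with the new
# TweedieRegressor mentioned above (the `power` value below is only an
# illustration, not a recommendation). With 1 < power < 2, it models a
# compound Poisson-Gamma distribution and accepts the non-negative targets
# generated above:
from sklearn.linear_model import TweedieRegressor

tweedie = TweedieRegressor(power=1.5)
tweedie.fit(X_train, y_train)
print(tweedie.score(X_test, y_test))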
##############################################################################
# Rich HTML representation for estimators
# ---------------------------------------
# Estimators can now be rendered in HTML in notebooks by enabling the
# `display='diagram'` option. This is particularly useful to visualize
# pipelines and composite estimators. Click on the entries to expand and see
# details.
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
set_config(display='diagram')

num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

cat_proc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
                                       (cat_proc, ('feat0', 'feat2')))

clf = make_pipeline(preprocessor, LogisticRegression())
clf
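
##############################################################################
# The same diagram can also be obtained as a plain HTML string. This is a
# minimal sketch using :func:`~sklearn.utils.estimator_html_repr`, e.g. to
# embed the diagram in a custom report:
from sklearn.utils import estimator_html_repr

html_snippet = estimator_html_repr(clf)
print(html_snippet[:80])  # beginning of the generated HTML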
##############################################################################
# Scalability and stability improvements to KMeans
# ------------------------------------------------
# The :class:`~sklearn.cluster.KMeans` estimator was entirely re-worked, and it
# is now significantly faster and more stable. In addition, the Elkan algorithm
# is now compatible with sparse matrices. The estimator uses OpenMP based
# parallelism instead of relying on joblib, so the `n_jobs` parameter has no
# effect anymore. For more details on how to control the number of threads,
# please refer to our :ref:`parallelism` notes.
import scipy.sparse
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import completeness_score

rng = np.random.RandomState(0)
X, y = make_blobs(random_state=rng)
X = scipy.sparse.csr_matrix(X)
X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
kmeans = KMeans(algorithm='elkan').fit(X_train)
# completeness_score expects the ground-truth labels as the first argument:
print(completeness_score(y_test, kmeans.predict(X_test)))
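
# The number of threads used by KMeans can be capped at runtime. As a sketch
# of one option, the threadpoolctl package (which scikit-learn itself relies
# on) provides a context manager; the limit of 2 threads below is an
# arbitrary illustration. Setting the OMP_NUM_THREADS environment variable
# before starting Python is another option.
from threadpoolctl import threadpool_limits

with threadpool_limits(limits=2):
    KMeans(algorithm='elkan').fit(X_train)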
##############################################################################
# Improvements to the histogram-based Gradient Boosting estimators
# ----------------------------------------------------------------
# Various improvements were made to
# :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. On top of the
# Poisson loss mentioned above, these estimators now support :ref:`sample
# weights <sw_hgbdt>`. Also, an automatic early-stopping criterion was added:
# early-stopping is enabled by default when the number of samples exceeds 10k.
# Finally, users can now define :ref:`monotonic constraints
# <monotonic_cst_gbdt>` to constrain the predictions based on the variations of
# specific features. In the following example, we construct a target that is
# generally positively correlated with the first feature, with some noise.
# Applying monotonic constraints allows the prediction to capture the global
# effect of the first feature, instead of fitting the noise.
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.inspection import plot_partial_dependence
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples = 500
rng = np.random.RandomState(0)
X = rng.randn(n_samples, 2)
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise)

gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
# a value of 1 enforces a monotonically increasing constraint on the
# corresponding feature, 0 means no constraint:
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y)

disp = plot_partial_dependence(
    gbdt_no_cst, X, features=[0], feature_names=['feature 0'],
    line_kw={'linewidth': 4, 'label': 'unconstrained'})
plot_partial_dependence(gbdt_cst, X, features=[0],
                        line_kw={'linewidth': 4, 'label': 'constrained'},
                        ax=disp.axes_)
disp.axes_[0, 0].plot(X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples')
disp.axes_[0, 0].set_ylim(-3, 3)
disp.axes_[0, 0].set_xlim(-1, 1)
plt.legend()
plt.show()
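
# The sample weight and early stopping features mentioned above can be
# sketched as follows (the weights and `early_stopping=True` are arbitrary
# illustrations, not recommendations): each sample's weight scales its
# contribution to the loss, and `n_iter_` reports how many boosting
# iterations were actually performed.
sample_weight = np.ones(n_samples)
sample_weight[::2] = 2  # give every other sample twice the weight
gbdt_sw = HistGradientBoostingRegressor(early_stopping=True, random_state=0)
gbdt_sw.fit(X, y, sample_weight=sample_weight)
print(gbdt_sw.n_iter_)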
##############################################################################
# Sample-weight support for Lasso and ElasticNet
# ----------------------------------------------
# The two linear regressors :class:`~sklearn.linear_model.Lasso` and
# :class:`~sklearn.linear_model.ElasticNet` now support sample weights.

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
import numpy as np

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X, y = make_regression(n_samples, n_features, random_state=rng)
sample_weight = rng.rand(n_samples)
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X, y, sample_weight, random_state=rng)
reg = Lasso()
reg.fit(X_train, y_train, sample_weight=sw_train)
print(reg.score(X_test, y_test, sample_weight=sw_test))
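
# ElasticNet accepts sample weights through the same interface. A minimal
# sketch reusing the split above (`alpha` and `l1_ratio` are arbitrary
# illustrative values, not recommendations):
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=0.1, l1_ratio=0.5)
enet.fit(X_train, y_train, sample_weight=sw_train)
print(enet.score(X_test, y_test, sample_weight=sw_test))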