Commit dc1d144

Pushing the docs to dev/ for branch: main, commit 24d58668da03239cb1a45a1bd713d5485478a675

1 parent (16a7dfb); commit dc1d144

File tree: 1,227 files changed (+6,897, −5,311 lines)


dev/.buildinfo

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: fb929b3c6a9413164e1ac03fbe586798
+config: 44afcf8dd215cc5d065a44ea3a818dd0
 tags: 645f666f9bcd5a90fca523b33c5a78b7
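The changed config hash records that the Sphinx configuration differs from the previous build. As a minimal sketch of the idea (an illustration only, not Sphinx's actual implementation), the fingerprint can be thought of as a digest over the sorted configuration values:

import hashlib

def config_fingerprint(config: dict) -> str:
    # Serialize the configuration deterministically, then hash it; any
    # changed value yields a new fingerprint and hence a full rebuild.
    stable = ",".join(f"{key}={value!r}" for key, value in sorted(config.items()))
    return hashlib.md5(stable.encode("utf-8")).hexdigest()

print(config_fingerprint({"project": "scikit-learn", "language": "en"}))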
Lines changed: 163 additions & 0 deletions

@@ -0,0 +1,163 @@
# flake8: noqa
"""
=======================================
Release Highlights for scikit-learn 1.1
=======================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 1.1! Many bug fixes
and improvements were added, as well as some new key features. We detail
below a few of the major features of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <changes_1_1>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install -c conda-forge scikit-learn

"""

# %%
# Quantile loss in :class:`ensemble.HistGradientBoostingRegressor`
# ----------------------------------------------------------------
# :class:`ensemble.HistGradientBoostingRegressor` can model quantiles with
# `loss="quantile"` and the new parameter `quantile`.
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt

# Simple regression function for X * cos(X)
rng = np.random.RandomState(42)
X_1d = np.linspace(0, 10, num=2000)
X = X_1d.reshape(-1, 1)
y = X_1d * np.cos(X_1d) + rng.normal(scale=X_1d / 3)

quantiles = [0.95, 0.5, 0.05]
parameters = dict(loss="quantile", max_bins=32, max_iter=50)
hist_quantiles = {
    f"quantile={quantile:.2f}": HistGradientBoostingRegressor(
        **parameters, quantile=quantile
    ).fit(X, y)
    for quantile in quantiles
}

fig, ax = plt.subplots()
ax.plot(X_1d, y, "o", alpha=0.5, markersize=1)
for quantile, hist in hist_quantiles.items():
    ax.plot(X_1d, hist.predict(X), label=quantile)
ax.legend(loc="lower left")

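# %%
# A quick sanity check (a minimal sketch reusing the models fitted above):
# the empirical coverage of each quantile model should sit close to its
# nominal level.
for name, hist in hist_quantiles.items():
    # Fraction of targets falling below the predicted quantile curve.
    frac_below = np.mean(y < hist.predict(X))
    print(f"{name}: empirical coverage = {frac_below:.3f}")
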
# %%
# `get_feature_names_out` Available in all Transformers
# -----------------------------------------------------
# :term:`get_feature_names_out` is now available in all Transformers. This enables
# :class:`pipeline.Pipeline` to construct the output feature names for more complex
# pipelines:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
numeric_features = ["age", "fare"]
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
categorical_features = ["embarked", "pclass"]

preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse=False),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=False,
)
log_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())
log_reg.fit(X, y)

# %%
# Here we slice the pipeline to include all the steps but the last one. The output
# feature names of this pipeline slice are the features put into logistic
# regression. These names correspond directly to the coefficients in the logistic
# regression:
import pandas as pd

log_reg_input_features = log_reg[:-1].get_feature_names_out()
pd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features).plot.bar()

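# %%
# The same method works on any fitted step in isolation (a minimal sketch
# reusing the `preprocessor` fitted inside the pipeline above), which helps
# when inspecting intermediate feature names:
preprocessor.get_feature_names_out()
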
# %%
# Grouping infrequent categories in :class:`OneHotEncoder`
# --------------------------------------------------------
# :class:`OneHotEncoder` supports aggregating infrequent categories into a single
# output for each feature. The parameters to enable the gathering of infrequent
# categories are `min_frequency` and `max_categories`. See the
# :ref:`User Guide <one_hot_encoder_infrequent_categories>` for more details.
from sklearn.preprocessing import OneHotEncoder
import numpy as np

X = np.array(
    [["dog"] * 5 + ["cat"] * 20 + ["rabbit"] * 10 + ["snake"] * 3], dtype=object
).T
enc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)
enc.infrequent_categories_

# %%
# Since dog and snake are infrequent categories, they are grouped together when
# transformed:
encoded = enc.transform(np.array([["dog"], ["snake"], ["cat"], ["rabbit"]]))
pd.DataFrame(encoded, columns=enc.get_feature_names_out())

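# %%
# A minimal sketch of the other knob mentioned above: `max_categories` caps
# the number of output features per column, grouping the remainder into the
# infrequent bucket (`enc_max` is just an illustrative name).
enc_max = OneHotEncoder(max_categories=3, sparse=False).fit(X)
enc_max.get_feature_names_out()
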
# %%
# Performance improvements
# ------------------------
# Reductions on pairwise distances for dense float64 datasets have been
# refactored to better take advantage of non-blocking thread parallelism. For
# example, :meth:`neighbors.NearestNeighbors.kneighbors` and
# :meth:`neighbors.NearestNeighbors.radius_neighbors` can respectively be up
# to ×20 and ×5 faster than before (a rough timing sketch follows at the end
# of this section). In summary, the following functions and estimators now
# benefit from improved performance:
#
# - :func:`metrics.pairwise_distances_argmin`
# - :func:`metrics.pairwise_distances_argmin_min`
# - :class:`cluster.AffinityPropagation`
# - :class:`cluster.Birch`
# - :class:`cluster.MeanShift`
# - :class:`cluster.OPTICS`
# - :class:`cluster.SpectralClustering`
# - :func:`feature_selection.mutual_info_regression`
# - :class:`neighbors.KNeighborsClassifier`
# - :class:`neighbors.KNeighborsRegressor`
# - :class:`neighbors.RadiusNeighborsClassifier`
# - :class:`neighbors.RadiusNeighborsRegressor`
# - :class:`neighbors.LocalOutlierFactor`
# - :class:`neighbors.NearestNeighbors`
# - :class:`manifold.Isomap`
# - :class:`manifold.LocallyLinearEmbedding`
# - :class:`manifold.TSNE`
# - :func:`manifold.trustworthiness`
# - :class:`semi_supervised.LabelPropagation`
# - :class:`semi_supervised.LabelSpreading`
#
# To learn more about the technical details of this work, you can read
# `this suite of blog posts <https://blog.scikit-learn.org/technical/performances/>`_.
#
# Moreover, the computation of loss functions has been refactored using
# Cython, resulting in performance improvements for the following estimators:
#
# - :class:`linear_model.LogisticRegression`
# - :class:`linear_model.GammaRegressor`
# - :class:`linear_model.PoissonRegressor`
# - :class:`linear_model.TweedieRegressor`
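
# %%
# A rough timing sketch for the pairwise-distance speedups (illustrative
# only; absolute timings depend on hardware and the number of threads):
from time import perf_counter
from sklearn.neighbors import NearestNeighbors

X_dense = rng.random_sample((20_000, 50))
nn = NearestNeighbors(n_neighbors=10).fit(X_dense)
tic = perf_counter()
nn.kneighbors(X_dense)
print(f"kneighbors on 20000 x 50 float64 data: {perf_counter() - tic:.2f}s")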
Lines changed: 140 additions & 0 deletions

@@ -0,0 +1,140 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Release Highlights for scikit-learn 1.1\n\n.. currentmodule:: sklearn\n\nWe are pleased to announce the release of scikit-learn 1.1! Many bug fixes\nand improvements were added, as well as some new key features. We detail\nbelow a few of the major features of this release. **For an exhaustive list of\nall the changes**, please refer to the `release notes <changes_1_1>`.\n\nTo install the latest version (with pip)::\n\n    pip install --upgrade scikit-learn\n\nor with conda::\n\n    conda install -c conda-forge scikit-learn\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Quantile loss in :class:`ensemble.HistGradientBoostingRegressor`\n:class:`ensemble.HistGradientBoostingRegressor` can model quantiles with\n`loss=\"quantile\"` and the new parameter `quantile`.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.datasets import make_regression\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Simple regression function for X * cos(X)\nrng = np.random.RandomState(42)\nX_1d = np.linspace(0, 10, num=2000)\nX = X_1d.reshape(-1, 1)\ny = X_1d * np.cos(X_1d) + rng.normal(scale=X_1d / 3)\n\nquantiles = [0.95, 0.5, 0.05]\nparameters = dict(loss=\"quantile\", max_bins=32, max_iter=50)\nhist_quantiles = {\n    f\"quantile={quantile:.2f}\": HistGradientBoostingRegressor(\n        **parameters, quantile=quantile\n    ).fit(X, y)\n    for quantile in quantiles\n}\n\nfig, ax = plt.subplots()\nax.plot(X_1d, y, \"o\", alpha=0.5, markersize=1)\nfor quantile, hist in hist_quantiles.items():\n    ax.plot(X_1d, hist.predict(X), label=quantile)\nax.legend(loc=\"lower left\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## `get_feature_names_out` Available in all Transformers\n:term:`get_feature_names_out` is now available in all Transformers. This enables\n:class:`pipeline.Pipeline` to construct the output feature names for more complex\npipelines:\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder, StandardScaler\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.linear_model import LogisticRegression\n\nX, y = fetch_openml(\"titanic\", version=1, as_frame=True, return_X_y=True)\nnumeric_features = [\"age\", \"fare\"]\nnumeric_transformer = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())\ncategorical_features = [\"embarked\", \"pclass\"]\n\npreprocessor = ColumnTransformer(\n    [\n        (\"num\", numeric_transformer, numeric_features),\n        (\n            \"cat\",\n            OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n            categorical_features,\n        ),\n    ],\n    verbose_feature_names_out=False,\n)\nlog_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())\nlog_reg.fit(X, y)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Here we slice the pipeline to include all the steps but the last one. The output\nfeature names of this pipeline slice are the features put into logistic\nregression. These names correspond directly to the coefficients in the logistic\nregression:\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n\nlog_reg_input_features = log_reg[:-1].get_feature_names_out()\npd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features).plot.bar()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Grouping infrequent categories in :class:`OneHotEncoder`\n:class:`OneHotEncoder` supports aggregating infrequent categories into a single\noutput for each feature. The parameters to enable the gathering of infrequent\ncategories are `min_frequency` and `max_categories`. See the\n`User Guide <one_hot_encoder_infrequent_categories>` for more details.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.preprocessing import OneHotEncoder\nimport numpy as np\n\nX = np.array(\n    [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)\nenc.infrequent_categories_"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Since dog and snake are infrequent categories, they are grouped together when\ntransformed:\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "encoded = enc.transform(np.array([[\"dog\"], [\"snake\"], [\"cat\"], [\"rabbit\"]]))\npd.DataFrame(encoded, columns=enc.get_feature_names_out())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Performance improvements\nReductions on pairwise distances for dense float64 datasets have been refactored\nto better take advantage of non-blocking thread parallelism. For example,\n:meth:`neighbors.NearestNeighbors.kneighbors` and\n:meth:`neighbors.NearestNeighbors.radius_neighbors` can respectively be up to \u00d720 and\n\u00d75 faster than before. In summary, the following functions and estimators\nnow benefit from improved performance:\n\n- :func:`metrics.pairwise_distances_argmin`\n- :func:`metrics.pairwise_distances_argmin_min`\n- :class:`cluster.AffinityPropagation`\n- :class:`cluster.Birch`\n- :class:`cluster.MeanShift`\n- :class:`cluster.OPTICS`\n- :class:`cluster.SpectralClustering`\n- :func:`feature_selection.mutual_info_regression`\n- :class:`neighbors.KNeighborsClassifier`\n- :class:`neighbors.KNeighborsRegressor`\n- :class:`neighbors.RadiusNeighborsClassifier`\n- :class:`neighbors.RadiusNeighborsRegressor`\n- :class:`neighbors.LocalOutlierFactor`\n- :class:`neighbors.NearestNeighbors`\n- :class:`manifold.Isomap`\n- :class:`manifold.LocallyLinearEmbedding`\n- :class:`manifold.TSNE`\n- :func:`manifold.trustworthiness`\n- :class:`semi_supervised.LabelPropagation`\n- :class:`semi_supervised.LabelSpreading`\n\nTo learn more about the technical details of this work, you can read\n`this suite of blog posts <https://blog.scikit-learn.org/technical/performances/>`_.\n\nMoreover, the computation of loss functions has been refactored using\nCython, resulting in performance improvements for the following estimators:\n\n- :class:`linear_model.LogisticRegression`\n- :class:`linear_model.GammaRegressor`\n- :class:`linear_model.PoissonRegressor`\n- :class:`linear_model.TweedieRegressor`\n\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
dev/_downloads/scikit-learn-docs.zip
113 KB
Binary file not shown.