Skip to content

Commit 7fc3b35

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 6818c9baecaa8634da654e8fb6c3716df148b0b4
1 parent 6a861f3 commit 7fc3b35

File tree

1,275 files changed

+10924
-5006
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,275 files changed

+10924
-5006
lines changed
Binary file not shown.
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""
2+
================================
3+
Introducing the `set_output` API
4+
================================
5+
6+
.. currentmodule:: sklearn
7+
8+
This example will demonstrate the `set_output` API to configure transformers to
9+
output pandas DataFrames. `set_output` can be configured per estimator by calling
10+
the `set_output` method or globally by setting `set_config(transform_output="pandas")`.
11+
For details, see
12+
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
13+
""" # noqa
14+
15+
# %%
16+
# First, we load the iris dataset as a DataFrame to demonstrate the `set_output` API.
17+
from sklearn.datasets import load_iris
18+
from sklearn.model_selection import train_test_split
19+
20+
X, y = load_iris(as_frame=True, return_X_y=True)
21+
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
22+
X_train.head()
23+
24+
# %%
25+
# To configure an estimator such as :class:`preprocessing.StandardScaler` to return
26+
# DataFrames, call `set_output`. This feature requires pandas to be installed.
27+
28+
from sklearn.preprocessing import StandardScaler
29+
30+
scaler = StandardScaler().set_output(transform="pandas")
31+
32+
scaler.fit(X_train)
33+
X_test_scaled = scaler.transform(X_test)
34+
X_test_scaled.head()
35+
36+
# %%
37+
# `set_output` can be called after `fit` to configure `transform` after the fact.
38+
scaler2 = StandardScaler()
39+
40+
scaler2.fit(X_train)
41+
X_test_np = scaler2.transform(X_test)
42+
print(f"Default output type: {type(X_test_np).__name__}")
43+
44+
scaler2.set_output(transform="pandas")
45+
X_test_df = scaler2.transform(X_test)
46+
print(f"Configured pandas output type: {type(X_test_df).__name__}")
47+
48+
# %%
49+
# In a :class:`pipeline.Pipeline`, `set_output` configures all steps to output
50+
# DataFrames.
51+
from sklearn.pipeline import make_pipeline
52+
from sklearn.linear_model import LogisticRegression
53+
from sklearn.feature_selection import SelectPercentile
54+
55+
clf = make_pipeline(
56+
StandardScaler(), SelectPercentile(percentile=75), LogisticRegression()
57+
)
58+
clf.set_output(transform="pandas")
59+
clf.fit(X_train, y_train)
60+
61+
# %%
62+
# Each transformer in the pipeline is configured to return DataFrames. This
63+
# means that the final logistic regression step contains the feature names.
64+
clf[-1].feature_names_in_
65+
66+
# %%
67+
# Next we load the titanic dataset to demonstrate `set_output` with
68+
# :class:`compose.ColumnTransformer` and heterogeneous data.
69+
from sklearn.datasets import fetch_openml
70+
71+
X, y = fetch_openml(
72+
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
73+
)
74+
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
75+
76+
# %%
77+
# The `set_output` API can be configured globally by using :func:`set_config` and
78+
# setting the `transform_output` to `"pandas"`.
79+
from sklearn.compose import ColumnTransformer
80+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
81+
from sklearn.impute import SimpleImputer
82+
from sklearn import set_config
83+
84+
set_config(transform_output="pandas")
85+
86+
num_pipe = make_pipeline(SimpleImputer(), StandardScaler())
87+
ct = ColumnTransformer(
88+
(
89+
("numerical", num_pipe, ["age", "fare"]),
90+
(
91+
"categorical",
92+
OneHotEncoder(
93+
sparse_output=False, drop="if_binary", handle_unknown="ignore"
94+
),
95+
["embarked", "sex", "pclass"],
96+
),
97+
),
98+
verbose_feature_names_out=False,
99+
)
100+
clf = make_pipeline(ct, SelectPercentile(percentile=50), LogisticRegression())
101+
clf.fit(X_train, y_train)
102+
clf.score(X_test, y_test)
103+
104+
# %%
105+
# With the global configuration, all transformers output DataFrames. This allows us to
106+
# easily plot the logistic regression coefficients with the corresponding feature names.
107+
import pandas as pd
108+
109+
log_reg = clf[-1]
110+
coef = pd.Series(log_reg.coef_.ravel(), index=log_reg.feature_names_in_)
111+
_ = coef.sort_values().plot.barh()
112+
113+
# %%
114+
# This resets `transform_output` to its default value to avoid impacting other
115+
# examples when generating the scikit-learn documentation.
116+
set_config(transform_output="default")

dev/_downloads/40f4aad91af595a370d7582e3a23bed7/plot_roc.ipynb

Lines changed: 242 additions & 8 deletions
Large diffs are not rendered by default.
Binary file not shown.

0 commit comments

Comments
 (0)