.. currentmodule:: sklearn.preprocessing

The :class:`TargetEncoder` replaces each category of a categorical feature with
the shrunk mean of the target variable for that category. This method is useful
in cases where there is a strong relationship between the categorical feature
and the target. To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses
an internal :term:`cross fitting` scheme to encode the training data to be used
by a downstream model. This scheme involves splitting the data into *k* folds
and encoding each fold with the encodings learnt from the other *k-1* folds.
In this example, we demonstrate the importance of the cross fitting procedure
to prevent overfitting.
"""
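
# %%
# As a minimal illustration of the shrunk mean (a hand-rolled sketch, not
# scikit-learn's exact implementation), the encoding of a category blends the
# per-category target mean with the global target mean, weighted by the
# category count and a smoothing strength (`smooth` below is a made-up value):
import numpy as np

toy_categories = np.array(["a", "a", "a", "b", "b"])
toy_y = np.array([10.0, 12.0, 14.0, 0.0, 2.0])
smooth = 5.0  # assumed smoothing strength, for illustration only

global_mean = toy_y.mean()
for category in np.unique(toy_categories):
    mask = toy_categories == category
    n, category_mean = mask.sum(), toy_y[mask].mean()
    # Blend the per-category mean with the global mean: small categories are
    # pulled more strongly toward the global mean.
    shrunk_mean = (n * category_mean + smooth * global_mean) / (n + smooth)
    print(f"{category}: mean={category_mean:.2f}, shrunk mean={shrunk_mean:.2f}")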

# %%
# Create Synthetic Dataset
# ========================
# For this example, we build a dataset with three categorical features:
#
# * an informative feature with medium cardinality ("informative")
# * an uninformative feature with medium cardinality ("shuffled")
# * an uninformative feature with high cardinality ("near_unique")
#
# First, we generate the informative feature:
import numpy as np

from sklearn.preprocessing import KBinsDiscretizer

n_samples = 50_000

rng = np.random.RandomState(42)
y = rng.randn(n_samples)
noise = 0.5 * rng.randn(n_samples)
n_categories = 100

kbins = KBinsDiscretizer(
    n_bins=n_categories,
    encode="ordinal",
    strategy="uniform",
    random_state=rng,
    subsample=None,
)
X_informative = kbins.fit_transform((y + noise).reshape(-1, 1))

# Remove the linear relationship between y and the bin index by permuting the
# values of X_informative:
permuted_categories = rng.permutation(n_categories)
X_informative = permuted_categories[X_informative.astype(np.int32)]

# %%
# The uninformative feature with medium cardinality is generated by permuting
# the informative feature, removing its relationship with the target:
X_shuffled = rng.permutation(X_informative)

# %%
# The uninformative feature with high cardinality is generated so that it is
# independent of the target variable. We will show that target encoding without
# :term:`cross fitting` will cause catastrophic overfitting for the downstream
# regressor. These high cardinality features are basically unique identifiers
# for samples, which should generally be removed from machine learning datasets.
# In this example, we generate them to show how :class:`TargetEncoder`'s default
# :term:`cross fitting` behavior mitigates the overfitting issue automatically.
X_near_unique_categories = rng.choice(
    int(0.9 * n_samples), size=n_samples, replace=True
).reshape(-1, 1)

# %%
# Finally, we assemble the dataset and perform a train test split:
import pandas as pd

from sklearn.model_selection import train_test_split

X = pd.DataFrame(
    np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    ),
    columns=["informative", "shuffled", "near_unique"],
)
y = pd.Series(y, name="target")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# %%
# Training a Ridge Regressor
# ==========================
# In this section, we train a ridge regressor on the dataset with and without
# encoding and explore the influence of the target encoder with and without the
# internal :term:`cross fitting`. First, we see that the Ridge model trained on
# the raw features will have low performance. This is because we permuted the
# order of the informative feature, meaning `X_informative` is not informative
# when used raw:
import sklearn
from sklearn.linear_model import Ridge

# Configure transformers to output DataFrames
sklearn.set_config(transform_output="pandas")

ridge = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

raw_model = ridge.fit(X_train, y_train)
print("Raw Model score on training set: ", raw_model.score(X_train, y_train))
print("Raw Model score on test set: ", raw_model.score(X_test, y_test))

# %%
# Next, we create a pipeline with the target encoder and ridge model. The
# pipeline uses :meth:`TargetEncoder.fit_transform`, which uses
# :term:`cross fitting`. We see that the model fits the data well and
# generalizes to the test set:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import TargetEncoder

model_with_cf = make_pipeline(TargetEncoder(random_state=0), ridge)
model_with_cf.fit(X_train, y_train)
print("Model with CF on training set: ", model_with_cf.score(X_train, y_train))
print("Model with CF on test set: ", model_with_cf.score(X_test, y_test))
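
# %%
# To make the :term:`cross fitting` scheme concrete, here is a rough,
# self-contained sketch of the idea on a toy column (simplified to plain
# per-category means, without shrinkage), using 2 folds:
from sklearn.model_selection import KFold

toy_x = np.array(["a", "b", "a", "b", "a", "b"])
toy_target = np.array([1.0, 0.0, 3.0, 2.0, 5.0, 4.0])
toy_encoded = np.empty_like(toy_target)
for fit_idx, encode_idx in KFold(n_splits=2).split(toy_x):
    # Learn per-category means on the other fold(s)...
    fold_means = {
        category: toy_target[fit_idx][toy_x[fit_idx] == category].mean()
        for category in np.unique(toy_x[fit_idx])
    }
    # ...and use them to encode the held-out fold.
    toy_encoded[encode_idx] = [fold_means[c] for c in toy_x[encode_idx]]
print(toy_encoded)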

# %%
# The coefficients of the linear model show that most of the weight is on the
# informative feature:
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams["figure.constrained_layout.use"] = True

coefs_cf = pd.Series(
    model_with_cf[-1].coef_, index=model_with_cf[-1].feature_names_in_
).sort_values()
ax = coefs_cf.plot(kind="barh")
_ = ax.set(
    title="Target encoded with cross fitting",
    xlabel="Ridge coefficient",
    ylabel="Feature",
)

# %%
# While :meth:`TargetEncoder.fit_transform` uses an internal
# :term:`cross fitting` scheme to learn encodings for the training set,
# :meth:`TargetEncoder.transform` itself does not. Instead, it applies the
# encodings learnt from the complete training set to transform the categorical
# features. Thus, we can use :meth:`TargetEncoder.fit` followed by
# :meth:`TargetEncoder.transform` to disable the :term:`cross fitting`. This
# encoding is then passed to the ridge model.
target_encoder = TargetEncoder(random_state=0)
target_encoder.fit(X_train, y_train)
X_train_no_cf_encoding = target_encoder.transform(X_train)
X_test_no_cf_encoding = target_encoder.transform(X_test)

model_no_cf = ridge.fit(X_train_no_cf_encoding, y_train)
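
# %%
# To make the difference concrete, we can check that the cross-fitted training
# encoding returned by :meth:`TargetEncoder.fit_transform` does not match the
# full-data encoding returned by :meth:`TargetEncoder.transform`:
X_train_cf_encoding = TargetEncoder(random_state=0).fit_transform(X_train, y_train)
print(np.allclose(X_train_cf_encoding, X_train_no_cf_encoding))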

# %%
# We evaluate the model that did not use :term:`cross fitting` when encoding and
# see that it overfits:
print(
    "Model without CF on training set: ",
    model_no_cf.score(X_train_no_cf_encoding, y_train),
)
print(
    "Model without CF on test set: ",
    model_no_cf.score(X_test_no_cf_encoding, y_test),
)

# %%
# The ridge model overfits because it assigns much more weight to the
# uninformative, extremely high cardinality ("near_unique") and medium
# cardinality ("shuffled") features than when the model used
# :term:`cross fitting` to encode the features.
coefs_no_cf = pd.Series(
    model_no_cf.coef_, index=model_no_cf.feature_names_in_
).sort_values()
ax = coefs_no_cf.plot(kind="barh")
_ = ax.set(
    title="Target encoded without cross fitting",
    xlabel="Ridge coefficient",
    ylabel="Feature",
)

# %%
# Conclusion
# ==========
# This example demonstrates the importance of :class:`TargetEncoder`'s internal
# :term:`cross fitting`. It is important to use
# :meth:`TargetEncoder.fit_transform` to encode training data before passing it
# to a machine learning model. When a :class:`TargetEncoder` is part of a
# :class:`~sklearn.pipeline.Pipeline` and the pipeline is fitted, the pipeline
# will correctly call :meth:`TargetEncoder.fit_transform` and use
# :term:`cross fitting` when encoding the training data.