Column Transformer with Mixed Types
===================================

-This example illustrates how to apply different preprocessing and
-feature extraction pipelines to different subsets of features,
-using :class:`sklearn.compose.ColumnTransformer`.
-This is particularly handy for the case of datasets that contain
-heterogeneous data types, since we may want to scale the
-numeric features and one-hot encode the categorical ones.
-
-In this example, the numeric data is standard-scaled after
-mean-imputation, while the categorical data is one-hot
-encoded after imputing missing values with a new category
-(``'missing'``).
-
-Finally, the preprocessing pipeline is integrated in a
-full prediction pipeline using :class:`sklearn.pipeline.Pipeline`,
-together with a simple classification model.
+This example illustrates how to apply different preprocessing and feature
+extraction pipelines to different subsets of features, using
+:class:`sklearn.compose.ColumnTransformer`. This is particularly handy for the
+case of datasets that contain heterogeneous data types, since we may want to
+scale the numeric features and one-hot encode the categorical ones.
+
+In this example, the numeric data is standard-scaled after mean-imputation,
+while the categorical data is one-hot encoded after imputing missing values
+with a new category (``'missing'``).
+
+In addition, we show two different ways to dispatch the columns to the
+corresponding preprocessor: by column names and by column data types.
+
+Finally, the preprocessing pipeline is integrated in a full prediction pipeline
+using :class:`sklearn.pipeline.Pipeline`, together with a simple classification
+model.
"""

# Author: Pedro Morales <[email protected]>

# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

+###############################################################################
+# Use ``ColumnTransformer`` by selecting columns by name
+###############################################################################
# We will train our classifier with the following features:
+#
# Numeric Features:
-# - age: float.
-# - fare: float.
+#
+# * ``age``: float;
+# * ``fare``: float.
+#
# Categorical Features:
-# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
-# - sex: categories encoded as strings {'female', 'male'}.
-# - pclass: ordinal integers {1, 2, 3}.
-
+#
+# * ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;
+# * ``sex``: categories encoded as strings ``{'female', 'male'}``;
+# * ``pclass``: ordinal integers ``{1, 2, 3}``.
+#
# We create the preprocessing pipelines for both numeric and categorical data.
+
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

+###############################################################################
+# Use ``ColumnTransformer`` by selecting columns by data type
+###############################################################################
+# When dealing with a cleaned dataset, the preprocessing can be made automatic
+# by using the data types of the columns to decide whether to treat a column
+# as a numerical or a categorical feature.
+# :func:`sklearn.compose.make_column_selector` gives this possibility.
+# First, let's only select a subset of columns to simplify our example.
+
+subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']
+X = X[subset_feature]
+
+###############################################################################
+# Then, we introspect the information regarding each column data type.
+
+X.info()
+
+###############################################################################
+# We can observe that the ``embarked`` and ``sex`` columns were tagged as
+# ``category`` columns when loading the data with ``fetch_openml``. Therefore,
+# we can use this information to dispatch the categorical columns to the
+# ``categorical_transformer`` and the remaining columns to the
+# ``numeric_transformer``.
+
+###############################################################################
+# .. note:: In practice, you will have to handle the column data types
+#    yourself. If you want some columns to be considered as ``category``, you
+#    will have to convert them into categorical columns. If you are using
+#    pandas, you can refer to their documentation regarding `Categorical data
+#    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_.
+
+from sklearn.compose import make_column_selector as selector
+
+preprocessor = ColumnTransformer(transformers=[
+    ('num', numeric_transformer, selector(dtype_exclude="category")),
+    ('cat', categorical_transformer, selector(dtype_include="category"))
+])
+
+# Reproduce the identical fit/score process
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+clf.fit(X_train, y_train)
+print("model score: %.3f" % clf.score(X_test, y_test))

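The dtype conversion mentioned in the note above can be sketched as follows; this is a minimal illustration with a hand-made frame (the column names mirror the Titanic ones, but the data is made up):

```python
import pandas as pd

# A small frame whose string column we want treated as categorical.
df = pd.DataFrame({
    'embarked': ['S', 'C', 'Q'],
    'age': [22.0, 38.0, 26.0],
})

# Convert the string column to the pandas 'category' dtype so that a
# dtype-based selector such as make_column_selector(dtype_include="category")
# would pick it up, while 'age' stays numeric.
df['embarked'] = df['embarked'].astype('category')

print(df.dtypes)
```

After the conversion, `df.dtypes` reports `category` for ``embarked`` and ``float64`` for ``age``, which is exactly the distinction the selector relies on.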
###############################################################################
# Using the prediction pipeline in a grid search

# and the regularization parameter of the logistic regression using
# :class:`sklearn.model_selection.GridSearchCV`.

-
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
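The grid-search snippet is cut off above. A self-contained sketch of how such a search typically runs end to end is shown below; it rebuilds the pipeline on synthetic data (the column names and `param_grid` keys follow the example, everything else is illustrative):

```python
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Synthetic stand-in for the Titanic features used in the example.
rng = np.random.RandomState(0)
X = pd.DataFrame({
    'age': rng.uniform(1, 80, 100),
    'fare': rng.uniform(0, 500, 100),
    'sex': rng.choice(['female', 'male'], 100),
})
y = rng.randint(0, 2, 100)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, ['age', 'fare']),
    ('cat', categorical_transformer, ['sex'])])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

# Parameter names reach into nested steps via '__': pipeline step ->
# ColumnTransformer entry -> inner pipeline step -> parameter.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X, y)
print("best params:", grid_search.best_params_)
```

The `'step__substep__param'` naming is what lets the search tune the imputer strategy and the classifier's regularization strength jointly, refitting the whole preprocessing chain for each candidate.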