Commit 41ca403 (1 parent: ba3cf1c)

Pushing the docs to dev/ for branch: main, commit 94b81ab2e7f9b0170b2d6ba6d84c1cc913367d8b

File tree: 711 files changed (+2477 / -1948 lines)


dev/_downloads/3c9b7bcd0b16f172ac12ffad61f3b5f0/plot_stack_predictors.ipynb

Lines changed: 85 additions & 5 deletions
@@ -26,7 +26,7 @@
    },
    "outputs": [],
    "source": [
-    "print(__doc__)\n\n# Authors: Guillaume Lemaitre <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# License: BSD 3 clause"
+    "# Authors: Guillaume Lemaitre <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# License: BSD 3 clause\n\nprint(__doc__)\n\nfrom sklearn import set_config\nset_config(display='diagram')"
    ]
   },
   {
@@ -51,7 +51,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Make pipeline to preprocess the data\n\n Before we can use Ames dataset we still need to do some preprocessing.\n First, the dataset has many missing values. To impute them, we will exchange\n categorical missing values with the new category 'missing' while the\n numerical missing values with the 'mean' of the column. We will also encode\n the categories with either :class:`~sklearn.preprocessing.OneHotEncoder\n <sklearn.preprocessing.OneHotEncoder>` or\n :class:`~sklearn.preprocessing.OrdinalEncoder\n <sklearn.preprocessing.OrdinalEncoder>` depending for which type of model we\n will use them (linear or non-linear model). To facilitate this preprocessing\n we will make two pipelines.\n You can skip this section if your data is ready to use and does\n not need preprocessing\n\n"
+    "## Make pipeline to preprocess the data\n\n Before we can use Ames dataset we still need to do some preprocessing.\n First, we will select the categorical and numerical columns of the dataset to\n construct the first step of the pipeline.\n\n"
    ]
   },
   {
@@ -62,14 +62,94 @@
    },
    "outputs": [],
    "source": [
-    "from sklearn.compose import make_column_transformer\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import OrdinalEncoder\nfrom sklearn.preprocessing import StandardScaler\n\n\ncat_cols = X.columns[X.dtypes == 'O']\nnum_cols = X.columns[X.dtypes == 'float64']\n\ncategories = [\n    X[column].unique() for column in X[cat_cols]]\n\nfor cat in categories:\n    cat[cat == None] = 'missing'  # noqa\n\ncat_proc_nlin = make_pipeline(\n    SimpleImputer(missing_values=None, strategy='constant',\n                  fill_value='missing'),\n    OrdinalEncoder(categories=categories)\n    )\n\nnum_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))\n\ncat_proc_lin = make_pipeline(\n    SimpleImputer(missing_values=None,\n                  strategy='constant',\n                  fill_value='missing'),\n    OneHotEncoder(categories=categories)\n)\n\nnum_proc_lin = make_pipeline(\n    SimpleImputer(strategy='mean'),\n    StandardScaler()\n)\n\n# transformation to use for non-linear estimators\nprocessor_nlin = make_column_transformer(\n    (cat_proc_nlin, cat_cols),\n    (num_proc_nlin, num_cols),\n    remainder='passthrough')\n\n# transformation to use for linear estimators\nprocessor_lin = make_column_transformer(\n    (cat_proc_lin, cat_cols),\n    (num_proc_lin, num_cols),\n    remainder='passthrough')"
+    "from sklearn.compose import make_column_selector\n\ncat_selector = make_column_selector(dtype_include=object)\nnum_selector = make_column_selector(dtype_include=np.number)\ncat_selector(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "num_selector(X)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, we will need to design preprocessing pipelines which depends on the\nending regressor. If the ending regressor is a linear model, one needs to\none-hot encode the categories. If the ending regressor is a tree-based model\nan ordinal encoder will be sufficient. Besides, numerical values need to be\nstandardized for a linear model while the raw numerical data can be treated\nas is by a tree-based model. However, both models need an imputer to\nhandle missing values.\n\nWe will first design the pipeline required for the tree-based models.\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.compose import make_column_transformer\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\n\ncat_tree_processor = OrdinalEncoder(\n    handle_unknown=\"use_encoded_value\", unknown_value=-1)\nnum_tree_processor = SimpleImputer(strategy=\"mean\", add_indicator=True)\n\ntree_preprocessor = make_column_transformer(\n    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector))\ntree_preprocessor"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, we will now define the preprocessor used when the ending regressor\nis a linear model.\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler\n\ncat_linear_processor = OneHotEncoder(handle_unknown=\"ignore\")\nnum_linear_processor = make_pipeline(\n    StandardScaler(), SimpleImputer(strategy=\"mean\", add_indicator=True))\n\nlinear_preprocessor = make_column_transformer(\n    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))\nlinear_preprocessor"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Stack of predictors on a single data set\n\n It is sometimes tedious to find the model which will best perform on a given\n dataset. Stacking provide an alternative by combining the outputs of several\n learners, without the need to choose a model specifically. The performance of\n stacking is usually close to the best model and sometimes it can outperform\n the prediction performance of each individual model.\n\n Here, we combine 3 learners (linear and non-linear) and use a ridge regressor\n to combine their outputs together.\n\n Note: although we will make new pipelines with the processors which we wrote\n in the previous section for the 3 learners, the final estimator RidgeCV()\n does not need preprocessing of the data as it will be fed with the already\n preprocessed output from the 3 learners.\n\n"
+    "## Stack of predictors on a single data set\n\n It is sometimes tedious to find the model which will best perform on a given\n dataset. Stacking provide an alternative by combining the outputs of several\n learners, without the need to choose a model specifically. The performance of\n stacking is usually close to the best model and sometimes it can outperform\n the prediction performance of each individual model.\n\n Here, we combine 3 learners (linear and non-linear) and use a ridge regressor\n to combine their outputs together.\n\n .. note::\n    Although we will make new pipelines with the processors which we wrote in\n    the previous section for the 3 learners, the final estimator\n    :class:`~sklearn.linear_model.RidgeCV()` does not need preprocessing of\n    the data as it will be fed with the already preprocessed output from the 3\n    learners.\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LassoCV\n\nlasso_pipeline = make_pipeline(linear_preprocessor, LassoCV())\nlasso_pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestRegressor\n\nrf_pipeline = make_pipeline(\n    tree_preprocessor, RandomForestRegressor(random_state=42))\nrf_pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.experimental import enable_hist_gradient_boosting  # noqa\nfrom sklearn.ensemble import HistGradientBoostingRegressor\n\ngbdt_pipeline = make_pipeline(\n    tree_preprocessor, HistGradientBoostingRegressor(random_state=0))\ngbdt_pipeline"
    ]
   },
   {
@@ -80,7 +160,7 @@
    },
    "outputs": [],
    "source": [
-    "from sklearn.experimental import enable_hist_gradient_boosting  # noqa\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.ensemble import StackingRegressor\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.linear_model import RidgeCV\n\n\nlasso_pipeline = make_pipeline(processor_lin,\n                               LassoCV())\n\nrf_pipeline = make_pipeline(processor_nlin,\n                            RandomForestRegressor(random_state=42))\n\ngradient_pipeline = make_pipeline(\n    processor_nlin,\n    HistGradientBoostingRegressor(random_state=0))\n\nestimators = [('Random Forest', rf_pipeline),\n              ('Lasso', lasso_pipeline),\n              ('Gradient Boosting', gradient_pipeline)]\n\nstacking_regressor = StackingRegressor(estimators=estimators,\n                                       final_estimator=RidgeCV())"
+    "from sklearn.ensemble import StackingRegressor\nfrom sklearn.linear_model import RidgeCV\n\nestimators = [('Random Forest', rf_pipeline),\n              ('Lasso', lasso_pipeline),\n              ('Gradient Boosting', gbdt_pipeline)]\n\nstacking_regressor = StackingRegressor(\n    estimators=estimators, final_estimator=RidgeCV())\nstacking_regressor"
    ]
   },
   {
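
The notebook diff above replaces manual dtype-based column indexing (X.columns[X.dtypes == 'O']) with make_column_selector, and turns on diagram display of estimators. Here is a minimal, self-contained sketch of how those two pieces behave; the toy DataFrame and its column names below are invented for illustration and are not part of the commit or the Ames data:

import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.compose import make_column_selector

# Render estimator reprs as HTML diagrams in notebook front-ends.
set_config(display='diagram')

# Toy stand-in for the Ames frame: one categorical, one numeric column.
X_toy = pd.DataFrame({"neighborhood": ["CollgCr", "Veenker", None],
                      "lot_area": [8450.0, 9600.0, np.nan]})

# make_column_selector returns a callable; applied to a DataFrame, it
# yields the list of column names matching the requested dtypes.
cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)

print(cat_selector(X_toy))  # ['neighborhood']
print(num_selector(X_toy))  # ['lot_area']

Passing the selector itself (rather than a resolved column list) to make_column_transformer, as the new version of the example does, defers column resolution to fit time, so the same preprocessor definition works on any frame with compatible dtypes.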

dev/_downloads/c6ccb1a9c5f82321f082e9767a2706f3/plot_stack_predictors.py

Lines changed: 69 additions & 71 deletions
@@ -15,12 +15,15 @@
 stacking strategy. Stacking slightly improves the overall performance.
 
 """
-print(__doc__)
 
 # Authors: Guillaume Lemaitre <[email protected]>
 # Maria Telenczuk <https://github.com/maikia>
 # License: BSD 3 clause
 
+print(__doc__)
+
+from sklearn import set_config
+set_config(display='diagram')
 
 # %%
 # Download the dataset
@@ -73,68 +76,56 @@ def load_ames_housing():
 ##############################################################################
 #
 # Before we can use Ames dataset we still need to do some preprocessing.
-# First, the dataset has many missing values. To impute them, we will exchange
-# categorical missing values with the new category 'missing' while the
-# numerical missing values with the 'mean' of the column. We will also encode
-# the categories with either :class:`~sklearn.preprocessing.OneHotEncoder
-# <sklearn.preprocessing.OneHotEncoder>` or
-# :class:`~sklearn.preprocessing.OrdinalEncoder
-# <sklearn.preprocessing.OrdinalEncoder>` depending for which type of model we
-# will use them (linear or non-linear model). To facilitate this preprocessing
-# we will make two pipelines.
-# You can skip this section if your data is ready to use and does
-# not need preprocessing
+# First, we will select the categorical and numerical columns of the dataset to
+# construct the first step of the pipeline.
+
+from sklearn.compose import make_column_selector
+
+cat_selector = make_column_selector(dtype_include=object)
+num_selector = make_column_selector(dtype_include=np.number)
+cat_selector(X)
 
+# %%
+num_selector(X)
+
+# %%
+# Then, we will need to design preprocessing pipelines which depends on the
+# ending regressor. If the ending regressor is a linear model, one needs to
+# one-hot encode the categories. If the ending regressor is a tree-based model
+# an ordinal encoder will be sufficient. Besides, numerical values need to be
+# standardized for a linear model while the raw numerical data can be treated
+# as is by a tree-based model. However, both models need an imputer to
+# handle missing values.
+#
+# We will first design the pipeline required for the tree-based models.
 
 from sklearn.compose import make_column_transformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
-from sklearn.preprocessing import StandardScaler
-
-
-cat_cols = X.columns[X.dtypes == 'O']
-num_cols = X.columns[X.dtypes == 'float64']
 
-categories = [
-    X[column].unique() for column in X[cat_cols]]
+cat_tree_processor = OrdinalEncoder(
+    handle_unknown="use_encoded_value", unknown_value=-1)
+num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)
 
-for cat in categories:
-    cat[cat == None] = 'missing'  # noqa
+tree_preprocessor = make_column_transformer(
+    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector))
+tree_preprocessor
 
-cat_proc_nlin = make_pipeline(
-    SimpleImputer(missing_values=None, strategy='constant',
-                  fill_value='missing'),
-    OrdinalEncoder(categories=categories)
-    )
-
-num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))
-
-cat_proc_lin = make_pipeline(
-    SimpleImputer(missing_values=None,
-                  strategy='constant',
-                  fill_value='missing'),
-    OneHotEncoder(categories=categories)
-)
-
-num_proc_lin = make_pipeline(
-    SimpleImputer(strategy='mean'),
-    StandardScaler()
-)
+# %%
+# Then, we will now define the preprocessor used when the ending regressor
+# is a linear model.
 
-# transformation to use for non-linear estimators
-processor_nlin = make_column_transformer(
-    (cat_proc_nlin, cat_cols),
-    (num_proc_nlin, num_cols),
-    remainder='passthrough')
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import StandardScaler
 
-# transformation to use for linear estimators
-processor_lin = make_column_transformer(
-    (cat_proc_lin, cat_cols),
-    (num_proc_lin, num_cols),
-    remainder='passthrough')
+cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
+num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True))
 
+linear_preprocessor = make_column_transformer(
+    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))
+linear_preprocessor
 
 # %%
 # Stack of predictors on a single data set
@@ -149,37 +140,44 @@ def load_ames_housing():
 # Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
 # to combine their outputs together.
 #
-# Note: although we will make new pipelines with the processors which we wrote
-# in the previous section for the 3 learners, the final estimator RidgeCV()
-# does not need preprocessing of the data as it will be fed with the already
-# preprocessed output from the 3 learners.
+# .. note::
+#    Although we will make new pipelines with the processors which we wrote in
+#    the previous section for the 3 learners, the final estimator
+#    :class:`~sklearn.linear_model.RidgeCV()` does not need preprocessing of
+#    the data as it will be fed with the already preprocessed output from the 3
+#    learners.
 
+from sklearn.linear_model import LassoCV
 
-from sklearn.experimental import enable_hist_gradient_boosting  # noqa
-from sklearn.ensemble import HistGradientBoostingRegressor
+lasso_pipeline = make_pipeline(linear_preprocessor, LassoCV())
+lasso_pipeline
+
+# %%
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.ensemble import StackingRegressor
-from sklearn.linear_model import LassoCV
-from sklearn.linear_model import RidgeCV
 
+rf_pipeline = make_pipeline(
+    tree_preprocessor, RandomForestRegressor(random_state=42))
+rf_pipeline
 
-lasso_pipeline = make_pipeline(processor_lin,
-                               LassoCV())
+# %%
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor
 
-rf_pipeline = make_pipeline(processor_nlin,
-                            RandomForestRegressor(random_state=42))
+gbdt_pipeline = make_pipeline(
+    tree_preprocessor, HistGradientBoostingRegressor(random_state=0))
+gbdt_pipeline
 
-gradient_pipeline = make_pipeline(
-    processor_nlin,
-    HistGradientBoostingRegressor(random_state=0))
+# %%
+from sklearn.ensemble import StackingRegressor
+from sklearn.linear_model import RidgeCV
 
 estimators = [('Random Forest', rf_pipeline),
              ('Lasso', lasso_pipeline),
-              ('Gradient Boosting', gradient_pipeline)]
-
-stacking_regressor = StackingRegressor(estimators=estimators,
-                                       final_estimator=RidgeCV())
+              ('Gradient Boosting', gbdt_pipeline)]
 
+stacking_regressor = StackingRegressor(
+    estimators=estimators, final_estimator=RidgeCV())
+stacking_regressor
 
 # %%
 # Measure and plot the results
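
For readers who want to run the refactored pipeline end to end, the following condensed sketch assembles the pieces shown in both diffs. It assumes scikit-learn >= 0.24 (where OrdinalEncoder gained handle_unknown="use_encoded_value"); fetching "house_prices" from OpenML is an assumed stand-in for the example's load_ames_housing() helper, which lies outside this diff:

import numpy as np

from sklearn import set_config
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

set_config(display='diagram')

# Assumed stand-in for load_ames_housing(), which this diff does not show.
X, y = fetch_openml(name="house_prices", as_frame=True, return_X_y=True)

cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)

# Tree-based models: ordinal-encode categories, impute numerics, no scaling.
tree_preprocessor = make_column_transformer(
    (SimpleImputer(strategy="mean", add_indicator=True), num_selector),
    (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
     cat_selector))

# Linear models: one-hot encode categories, scale then impute numerics.
linear_preprocessor = make_column_transformer(
    (make_pipeline(StandardScaler(),
                   SimpleImputer(strategy="mean", add_indicator=True)),
     num_selector),
    (OneHotEncoder(handle_unknown="ignore"), cat_selector))

estimators = [
    ('Random Forest', make_pipeline(
        tree_preprocessor, RandomForestRegressor(random_state=42))),
    ('Lasso', make_pipeline(linear_preprocessor, LassoCV())),
    ('Gradient Boosting', make_pipeline(
        tree_preprocessor, HistGradientBoostingRegressor(random_state=0)))]

stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV())
stacking_regressor  # rendered as an HTML diagram under set_config above

The example itself goes on to cross-validate this stack in the "Measure and plot the results" section, which is unchanged by this commit.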

Binary files changed (content not shown):

dev/_downloads/scikit-learn-docs.zip (43 Bytes)
dev/_images/binder_badge_logo.png (0 Bytes)
dev/_images/iris.png (0 Bytes)
