|
33 | 33 | "cell_type": "markdown",
|
34 | 34 | "metadata": {},
|
35 | 35 | "source": [
|
36 |
| - "Use ``ColumnTransformer`` by selecting column by names\n##############################################################################\n We will train our classifier with the following features:\n\n Numeric Features:\n\n * ``age``: float;\n * ``fare``: float.\n\n Categorical Features:\n\n * ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;\n * ``sex``: categories encoded as strings ``{'female', 'male'}``;\n * ``pclass``: ordinal integers ``{1, 2, 3}``.\n\n We create the preprocessing pipelines for both numeric and categorical data.\n\n" |
| 36 | + "Use ``ColumnTransformer`` by selecting column by names\n##############################################################################\n We will train our classifier with the following features:\n\n Numeric Features:\n\n * ``age``: float;\n * ``fare``: float.\n\n Categorical Features:\n\n * ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;\n * ``sex``: categories encoded as strings ``{'female', 'male'}``;\n * ``pclass``: ordinal integers ``{1, 2, 3}``.\n\n We create the preprocessing pipelines for both numeric and categorical data.\n Note that ``pclass`` could either be treated as a categorical or numeric\n feature.\n\n" |
37 | 37 | ]
|
38 | 38 | },
|
39 | 39 | {
|
|
44 | 44 | },
|
45 | 45 | "outputs": [],
|
46 | 46 | "source": [
|
47 |
| - "numeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)])\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))" |
| 47 | + "numeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)])\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n random_state=0)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))" |
48 | 48 | ]
|
49 | 49 | },
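| + {
| + "cell_type": "markdown",
| + "metadata": {},
| + "source": [
| + "As noted above, ``pclass`` could instead be treated as a numeric feature.\nA minimal sketch of that alternative grouping (the ``_alt`` names are\nillustrative only):\n\n"
| + ]
| + },
| + {
| + "cell_type": "code",
| + "execution_count": null,
| + "metadata": {
| + "collapsed": false
| + },
| + "outputs": [],
| + "source": [
| + "# Sketch: dispatch ``pclass`` to the numeric pipeline instead of the\n# categorical one (the ``_alt`` names are illustrative).\nnumeric_features_alt = ['age', 'fare', 'pclass']\ncategorical_features_alt = ['embarked', 'sex']\n\npreprocessor_alt = ColumnTransformer(\n    transformers=[\n        ('num', numeric_transformer, numeric_features_alt),\n        ('cat', categorical_transformer, categorical_features_alt)])"
| + ]
| + },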
|
50 | 50 | {
|
|
62 | 62 | },
|
63 | 63 | "outputs": [],
|
64 | 64 | "source": [
|
65 |
| - "from sklearn import set_config\nset_config(display='diagram')\nclf" |
| 65 | + "from sklearn import set_config\n\nset_config(display='diagram')\nclf" |
66 | 66 | ]
|
67 | 67 | },
|
68 | 68 | {
|
|
80 | 80 | },
|
81 | 81 | "outputs": [],
|
82 | 82 | "source": [
|
83 |
| - "subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']\nX = X[subset_feature]" |
| 83 | + "subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']\nX_train, X_test = X_train[subset_feature], X_test[subset_feature]" |
84 | 84 | ]
|
85 | 85 | },
|
86 | 86 | {
|
|
98 | 98 | },
|
99 | 99 | "outputs": [],
|
100 | 100 | "source": [
|
101 |
| - "X.info()" |
| 101 | + "X_train.info()" |
102 | 102 | ]
|
103 | 103 | },
|
104 | 104 | {
|
|
123 | 123 | },
|
124 | 124 | "outputs": [],
|
125 | 125 | "source": [
|
126 |
| - "from sklearn.compose import make_column_selector as selector\n\npreprocessor = ColumnTransformer(transformers=[\n ('num', numeric_transformer, selector(dtype_exclude=\"category\")),\n ('cat', categorical_transformer, selector(dtype_include=\"category\"))\n])\n\n# Reproduce the identical fit/score process\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))" |
| 126 | + "from sklearn.compose import make_column_selector as selector\n\npreprocessor = ColumnTransformer(transformers=[\n ('num', numeric_transformer, selector(dtype_exclude=\"category\")),\n ('cat', categorical_transformer, selector(dtype_include=\"category\"))\n])\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))" |
| 127 | + ] |
| 128 | + }, |
| 129 | + { |
| 130 | + "cell_type": "markdown", |
| 131 | + "metadata": {}, |
| 132 | + "source": [ |
| 133 | + "The resulting score is not exactly the same as the one from the previous\npipeline becase the dtype-based selector treats the ``pclass`` columns as\na numeric features instead of a categorical feature as previously:\n\n" |
| 134 | + ] |
| 135 | + }, |
| 136 | + { |
| 137 | + "cell_type": "code", |
| 138 | + "execution_count": null, |
| 139 | + "metadata": { |
| 140 | + "collapsed": false |
| 141 | + }, |
| 142 | + "outputs": [], |
| 143 | + "source": [ |
| 144 | + "selector(dtype_exclude=\"category\")(X_train)" |
| 145 | + ] |
| 146 | + }, |
| 147 | + { |
| 148 | + "cell_type": "code", |
| 149 | + "execution_count": null, |
| 150 | + "metadata": { |
| 151 | + "collapsed": false |
| 152 | + }, |
| 153 | + "outputs": [], |
| 154 | + "source": [ |
| 155 | + "selector(dtype_include=\"category\")(X_train)" |
127 | 156 | ]
|
128 | 157 | },
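| + {
| + "cell_type": "markdown",
| + "metadata": {},
| + "source": [
| + "If the dtype-based dispatch should treat ``pclass`` as categorical, like\nthe explicit column lists above, the column can first be cast to a pandas\n``category`` dtype. A minimal sketch (``X_train_cat`` is an illustrative\nname):\n\n"
| + ]
| + },
| + {
| + "cell_type": "code",
| + "execution_count": null,
| + "metadata": {
| + "collapsed": false
| + },
| + "outputs": [],
| + "source": [
| + "# Sketch: casting ``pclass`` to a pandas ``category`` dtype makes the\n# dtype-based selector route it to the categorical pipeline.\nX_train_cat = X_train.copy()\nX_train_cat['pclass'] = X_train_cat['pclass'].astype('category')\nselector(dtype_include=\"category\")(X_train_cat)"
| + ]
| + },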
|
129 | 158 | {
|
|
141 | 170 | },
|
142 | 171 | "outputs": [],
|
143 | 172 | "source": [
|
144 |
| - "param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))" |
| 173 | + "param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search" |
| 174 | + ] |
| 175 | + }, |
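| + {
| + "cell_type": "markdown",
| + "metadata": {},
| + "source": [
| + "The ``<step>__<parameter>`` syntax used in ``param_grid`` addresses the\nparameters of nested pipeline steps. Calling ``get_params`` on the pipeline\nis one way to list the valid names for such a grid (shown here as an\nillustrative aside):\n\n"
| + ]
| + },
| + {
| + "cell_type": "code",
| + "execution_count": null,
| + "metadata": {
| + "collapsed": false
| + },
| + "outputs": [],
| + "source": [
| + "# The double-underscore keys below are what ``param_grid`` refers to;\n# list the tunable parameter names of the full pipeline.\nsorted(clf.get_params().keys())"
| + ]
| + },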
| 176 | + { |
| 177 | + "cell_type": "markdown", |
| 178 | + "metadata": {}, |
| 179 | + "source": [ |
| 180 | + "Calling 'fit' triggers the cross-validated search for the best\nhyper-parameters combination:\n\n\n" |
| 181 | + ] |
| 182 | + }, |
| 183 | + { |
| 184 | + "cell_type": "code", |
| 185 | + "execution_count": null, |
| 186 | + "metadata": { |
| 187 | + "collapsed": false |
| 188 | + }, |
| 189 | + "outputs": [], |
| 190 | + "source": [ |
| 191 | + "grid_search.fit(X_train, y_train)\n\nprint(f\"Best params:\")\nprint(grid_search.best_params_)" |
| 192 | + ] |
| 193 | + }, |
| 194 | + { |
| 195 | + "cell_type": "markdown", |
| 196 | + "metadata": {}, |
| 197 | + "source": [ |
| 198 | + "The internal cross-validation scores obtained by those parameters is:\n\n" |
| 199 | + ] |
| 200 | + }, |
| 201 | + { |
| 202 | + "cell_type": "code", |
| 203 | + "execution_count": null, |
| 204 | + "metadata": { |
| 205 | + "collapsed": false |
| 206 | + }, |
| 207 | + "outputs": [], |
| 208 | + "source": [ |
| 209 | + "print(f\"Internal CV score: {grid_search.best_score_:.3f}\")" |
| 210 | + ] |
| 211 | + }, |
| 212 | + { |
| 213 | + "cell_type": "markdown", |
| 214 | + "metadata": {}, |
| 215 | + "source": [ |
| 216 | + "We can also introspect the top grid search results as a pandas dataframe:\n\n" |
| 217 | + ] |
| 218 | + }, |
| 219 | + { |
| 220 | + "cell_type": "code", |
| 221 | + "execution_count": null, |
| 222 | + "metadata": { |
| 223 | + "collapsed": false |
| 224 | + }, |
| 225 | + "outputs": [], |
| 226 | + "source": [ |
| 227 | + "import pandas as pd\n\ncv_results = pd.DataFrame(grid_search.cv_results_)\ncv_results = cv_results.sort_values(\"mean_test_score\", ascending=False)\ncv_results[[\"mean_test_score\", \"std_test_score\",\n \"param_preprocessor__num__imputer__strategy\",\n \"param_classifier__C\"\n ]].head(5)" |
| 228 | + ] |
| 229 | + }, |
| 230 | + { |
| 231 | + "cell_type": "markdown", |
| 232 | + "metadata": {}, |
| 233 | + "source": [ |
| 234 | + "The best hyper-parameters have be used to re-fit a final model on the full\ntraining set. We can evaluate that final model on held out test data that was\nnot used for hyparameter tuning.\n\n\n" |
| 235 | + ] |
| 236 | + }, |
| 237 | + { |
| 238 | + "cell_type": "code", |
| 239 | + "execution_count": null, |
| 240 | + "metadata": { |
| 241 | + "collapsed": false |
| 242 | + }, |
| 243 | + "outputs": [], |
| 244 | + "source": [ |
| 245 | + "print((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))" |
145 | 246 | ]
|
146 | 247 | }
|
147 | 248 | ],
|
|