
Commit 630ed0c

Pushing the docs to dev/ for branch: main, commit 2fd6e34f2b1788503c649da47d0b2fb7267cbe41
1 parent 8d0afae commit 630ed0c

1,383 files changed: +5462 additions, -5156 deletions


dev/_downloads/26f110ad6cff1a8a7c58b1a00d8b8b5a/plot_column_transformer_mixed_types.ipynb

Lines changed: 9 additions & 9 deletions

@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Column Transformer with Mixed Types\n\n.. currentmodule:: sklearn\n\nThis example illustrates how to apply different preprocessing and feature\nextraction pipelines to different subsets of features, using\n:class:`~compose.ColumnTransformer`. This is particularly handy for the\ncase of datasets that contain heterogeneous data types, since we may want to\nscale the numeric features and one-hot encode the categorical ones.\n\nIn this example, the numeric data is standard-scaled after mean-imputation. The\ncategorical data is one-hot encoded via ``OneHotEncoder``, which\ncreates a new category for missing values.\n\nIn addition, we show two different ways to dispatch the columns to the\nparticular pre-processor: by column names and by column data types.\n\nFinally, the preprocessing pipeline is integrated in a full prediction pipeline\nusing :class:`~pipeline.Pipeline`, together with a simple classification\nmodel.\n"
+"\n# Column Transformer with Mixed Types\n\n.. currentmodule:: sklearn\n\nThis example illustrates how to apply different preprocessing and feature\nextraction pipelines to different subsets of features, using\n:class:`~compose.ColumnTransformer`. This is particularly handy for the\ncase of datasets that contain heterogeneous data types, since we may want to\nscale the numeric features and one-hot encode the categorical ones.\n\nIn this example, the numeric data is standard-scaled after mean-imputation. The\ncategorical data is one-hot encoded via ``OneHotEncoder``, which\ncreates a new category for missing values. We further reduce the dimensionality\nby selecting categories using a chi-squared test.\n\nIn addition, we show two different ways to dispatch the columns to the\nparticular pre-processor: by column names and by column data types.\n\nFinally, the preprocessing pipeline is integrated in a full prediction pipeline\nusing :class:`~pipeline.Pipeline`, together with a simple classification\nmodel.\n"
 ]
 },
 {
@@ -37,7 +37,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\nnp.random.seed(0)"
+"import numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, RandomizedSearchCV\nfrom sklearn.feature_selection import SelectPercentile, chi2\n\nnp.random.seed(0)"
 ]
 },
 {
@@ -73,7 +73,7 @@
 },
 "outputs": [],
 "source": [
-"numeric_features = [\"age\", \"fare\"]\nnumeric_transformer = Pipeline(\n steps=[(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]\n)\n\ncategorical_features = [\"embarked\", \"sex\", \"pclass\"]\ncategorical_transformer = OneHotEncoder(handle_unknown=\"ignore\")\n\npreprocessor = ColumnTransformer(\n transformers=[\n (\"num\", numeric_transformer, numeric_features),\n (\"cat\", categorical_transformer, categorical_features),\n ]\n)"
+"numeric_features = [\"age\", \"fare\"]\nnumeric_transformer = Pipeline(\n steps=[(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]\n)\n\ncategorical_features = [\"embarked\", \"sex\", \"pclass\"]\ncategorical_transformer = Pipeline(\n steps=[\n (\"encoder\", OneHotEncoder(handle_unknown=\"ignore\")),\n (\"selector\", SelectPercentile(chi2, percentile=50)),\n ]\n)\npreprocessor = ColumnTransformer(\n transformers=[\n (\"num\", numeric_transformer, numeric_features),\n (\"cat\", categorical_transformer, categorical_features),\n ]\n)"
 ]
 },
 {
@@ -206,7 +206,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Using the prediction pipeline in a grid search\n\nGrid search can also be performed on the different preprocessing steps\ndefined in the ``ColumnTransformer`` object, together with the classifier's\nhyperparameters as part of the ``Pipeline``.\nWe will search for both the imputer strategy of the numeric preprocessing\nand the regularization parameter of the logistic regression using\n:class:`~sklearn.model_selection.GridSearchCV`.\n\n"
+"Using the prediction pipeline in a grid search\n\nGrid search can also be performed on the different preprocessing steps\ndefined in the ``ColumnTransformer`` object, together with the classifier's\nhyperparameters as part of the ``Pipeline``.\nWe will search for both the imputer strategy of the numeric preprocessing\nand the regularization parameter of the logistic regression using\n:class:`~sklearn.model_selection.RandomizedSearchCV`. This\nhyperparameter search randomly selects a fixed number of parameter\nsettings configured by `n_iter`. Alternatively, one can use\n:class:`~sklearn.model_selection.GridSearchCV` but the cartesian product of\nthe parameter space will be evaluated.\n\n"
 ]
 },
 {
@@ -217,7 +217,7 @@
 },
 "outputs": [],
 "source": [
-"param_grid = {\n \"preprocessor__num__imputer__strategy\": [\"mean\", \"median\"],\n \"classifier__C\": [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search"
+"param_grid = {\n \"preprocessor__num__imputer__strategy\": [\"mean\", \"median\"],\n \"preprocessor__cat__selector__percentile\": [10, 30, 50, 70],\n \"classifier__C\": [0.1, 1.0, 10, 100],\n}\n\nsearch_cv = RandomizedSearchCV(clf, param_grid, n_iter=10, random_state=0)\nsearch_cv"
 ]
 },
 {
@@ -235,7 +235,7 @@
 },
 "outputs": [],
 "source": [
-"grid_search.fit(X_train, y_train)\n\nprint(\"Best params:\")\nprint(grid_search.best_params_)"
+"search_cv.fit(X_train, y_train)\n\nprint(\"Best params:\")\nprint(search_cv.best_params_)"
 ]
 },
 {
@@ -253,7 +253,7 @@
 },
 "outputs": [],
 "source": [
-"print(f\"Internal CV score: {grid_search.best_score_:.3f}\")"
+"print(f\"Internal CV score: {search_cv.best_score_:.3f}\")"
 ]
 },
 {
@@ -271,7 +271,7 @@
 },
 "outputs": [],
 "source": [
-"import pandas as pd\n\ncv_results = pd.DataFrame(grid_search.cv_results_)\ncv_results = cv_results.sort_values(\"mean_test_score\", ascending=False)\ncv_results[\n [\n \"mean_test_score\",\n \"std_test_score\",\n \"param_preprocessor__num__imputer__strategy\",\n \"param_classifier__C\",\n ]\n].head(5)"
+"import pandas as pd\n\ncv_results = pd.DataFrame(search_cv.cv_results_)\ncv_results = cv_results.sort_values(\"mean_test_score\", ascending=False)\ncv_results[\n [\n \"mean_test_score\",\n \"std_test_score\",\n \"param_preprocessor__num__imputer__strategy\",\n \"param_preprocessor__cat__selector__percentile\",\n \"param_classifier__C\",\n ]\n].head(5)"
 ]
 },
 {
@@ -289,7 +289,7 @@
 },
 "outputs": [],
 "source": [
-"print(\n (\n \"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)\n )\n)"
+"print(\n \"accuracy of the best model from randomized search: \"\n f\"{search_cv.score(X_test, y_test):.3f}\"\n)"
 ]
 }
 ],
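
A note on the new categorical branch in this diff: ``OneHotEncoder`` is now followed by ``SelectPercentile(chi2, percentile=50)``, so only the encoded columns most associated with the target are kept, and the selector therefore needs ``y`` at fit time. Below is a minimal, self-contained sketch of that preprocessor; the tiny dataframe is a made-up stand-in for the Titanic data fetched in the example.

# Sketch of the updated preprocessing, on illustrative toy data.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = pd.DataFrame(
    {
        "age": [22.0, 38.0, 26.0, None, 35.0, 54.0],
        "fare": [7.25, 71.28, 7.92, 8.05, 53.10, 51.86],
        "embarked": ["S", "C", "S", "S", "S", "C"],
        "sex": ["male", "female", "female", "male", "female", "male"],
    }
)
y = [0, 1, 1, 0, 1, 0]

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
# One-hot encode, then keep the 50% of encoded columns most associated with y.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ["age", "fare"]),
        ("cat", categorical_transformer, ["embarked", "sex"]),
    ]
)
# SelectPercentile is supervised, so the target must be passed when fitting.
print(preprocessor.fit_transform(X, y).shape)  # 6 rows, numeric + selected dummy columns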

dev/_downloads/41973816d3932cd07b75d8825fd2c13d/plot_svm_anova.py

Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@
 # Create the pipeline
 # -------------------
 from sklearn.pipeline import Pipeline
-from sklearn.feature_selection import SelectPercentile, chi2
+from sklearn.feature_selection import SelectPercentile, f_classif
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
@@ -35,7 +35,7 @@
 
 clf = Pipeline(
     [
-        ("anova", SelectPercentile(chi2)),
+        ("anova", SelectPercentile(f_classif)),
         ("scaler", StandardScaler()),
         ("svc", SVC(gamma="auto")),
     ]
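
For context on this swap: ``chi2`` only accepts non-negative feature values, while ``f_classif`` (the ANOVA F-test) works on any real-valued features, so it is the safer univariate score before scaling. A small sketch assuming synthetic, partly negative data rather than the dataset used in the example:

# Illustrative sketch: f_classif handles features with negative values,
# where chi2 would raise an error.
import numpy as np
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 20))  # real-valued features, some negative
y = (X[:, 0] + 0.5 * rng.normal(size=100) > 0).astype(int)

clf = Pipeline(
    [
        ("anova", SelectPercentile(f_classif, percentile=10)),  # keep 2 of 20 features
        ("scaler", StandardScaler()),
        ("svc", SVC(gamma="auto")),
    ]
)
print(clf.fit(X, y).score(X, y))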

dev/_downloads/6f4a6a0d8063b616c4aa4db2865de57c/plot_svm_anova.ipynb

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.pipeline import Pipeline\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\n\n# Create a feature-selection transform, a scaler and an instance of SVM that we\n# combine together to have a full-blown estimator\n\nclf = Pipeline(\n [\n (\"anova\", SelectPercentile(chi2)),\n (\"scaler\", StandardScaler()),\n (\"svc\", SVC(gamma=\"auto\")),\n ]\n)"
+"from sklearn.pipeline import Pipeline\nfrom sklearn.feature_selection import SelectPercentile, f_classif\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\n\n# Create a feature-selection transform, a scaler and an instance of SVM that we\n# combine together to have a full-blown estimator\n\nclf = Pipeline(\n [\n (\"anova\", SelectPercentile(f_classif)),\n (\"scaler\", StandardScaler()),\n (\"svc\", SVC(gamma=\"auto\")),\n ]\n)"
 ]
 },
 {

dev/_downloads/79c38d2f2cb1f2ef7d68e0cc7ea7b4e4/plot_column_transformer_mixed_types.py

Lines changed: 25 additions & 15 deletions

@@ -13,7 +13,8 @@
 
 In this example, the numeric data is standard-scaled after mean-imputation. The
 categorical data is one-hot encoded via ``OneHotEncoder``, which
-creates a new category for missing values.
+creates a new category for missing values. We further reduce the dimensionality
+by selecting categories using a chi-squared test.
 
 In addition, we show two different ways to dispatch the columns to the
 particular pre-processor: by column names and by column data types.
@@ -37,7 +38,8 @@
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.feature_selection import SelectPercentile, chi2
 
 np.random.seed(0)
 
@@ -77,8 +79,12 @@
 )
 
 categorical_features = ["embarked", "sex", "pclass"]
-categorical_transformer = OneHotEncoder(handle_unknown="ignore")
-
+categorical_transformer = Pipeline(
+    steps=[
+        ("encoder", OneHotEncoder(handle_unknown="ignore")),
+        ("selector", SelectPercentile(chi2, percentile=50)),
+    ]
+)
 preprocessor = ColumnTransformer(
     transformers=[
         ("num", numeric_transformer, numeric_features),
@@ -173,40 +179,46 @@
 # hyperparameters as part of the ``Pipeline``.
 # We will search for both the imputer strategy of the numeric preprocessing
 # and the regularization parameter of the logistic regression using
-# :class:`~sklearn.model_selection.GridSearchCV`.
+# :class:`~sklearn.model_selection.RandomizedSearchCV`. This
+# hyperparameter search randomly selects a fixed number of parameter
+# settings configured by `n_iter`. Alternatively, one can use
+# :class:`~sklearn.model_selection.GridSearchCV` but the cartesian product of
+# the parameter space will be evaluated.
 
 param_grid = {
     "preprocessor__num__imputer__strategy": ["mean", "median"],
+    "preprocessor__cat__selector__percentile": [10, 30, 50, 70],
     "classifier__C": [0.1, 1.0, 10, 100],
 }
 
-grid_search = GridSearchCV(clf, param_grid, cv=10)
-grid_search
+search_cv = RandomizedSearchCV(clf, param_grid, n_iter=10, random_state=0)
+search_cv
 
 # %%
 # Calling 'fit' triggers the cross-validated search for the best
 # hyper-parameters combination:
 #
-grid_search.fit(X_train, y_train)
+search_cv.fit(X_train, y_train)
 
 print("Best params:")
-print(grid_search.best_params_)
+print(search_cv.best_params_)
 
 # %%
 # The internal cross-validation scores obtained by those parameters is:
-print(f"Internal CV score: {grid_search.best_score_:.3f}")
+print(f"Internal CV score: {search_cv.best_score_:.3f}")
 
 # %%
 # We can also introspect the top grid search results as a pandas dataframe:
 import pandas as pd
 
-cv_results = pd.DataFrame(grid_search.cv_results_)
+cv_results = pd.DataFrame(search_cv.cv_results_)
 cv_results = cv_results.sort_values("mean_test_score", ascending=False)
 cv_results[
     [
         "mean_test_score",
         "std_test_score",
         "param_preprocessor__num__imputer__strategy",
+        "param_preprocessor__cat__selector__percentile",
         "param_classifier__C",
     ]
 ].head(5)
@@ -217,8 +229,6 @@
 # not used for hyperparameter tuning.
 #
 print(
-    (
-        "best logistic regression from grid search: %.3f"
-        % grid_search.score(X_test, y_test)
-    )
+    "accuracy of the best model from randomized search: "
+    f"{search_cv.score(X_test, y_test):.3f}"
 )
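
For scale: the updated search space has 2 imputer strategies x 4 selector percentiles x 4 values of C, i.e. 32 combinations, so ``GridSearchCV`` would fit all 32 candidates per CV split while ``RandomizedSearchCV`` with ``n_iter=10`` fits only 10 sampled ones. A tiny sketch of that difference on a synthetic stand-in (parameter names echo the example's pipeline, but the data and classifier are placeholders):

# Candidate-count sketch on synthetic data.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=0)
clf = Pipeline(
    [("scaler", StandardScaler()), ("classifier", LogisticRegression(max_iter=1000))]
)

param_grid = {"classifier__C": [0.1, 1.0, 10, 100]}

grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X, y)
random_search = RandomizedSearchCV(clf, param_grid, n_iter=2, cv=5, random_state=0).fit(X, y)

# The grid evaluates every combination (4 here); the randomized search only
# evaluates the n_iter sampled settings (2 here).
print(len(grid_search.cv_results_["params"]), len(random_search.cv_results_["params"]))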

dev/_downloads/e38f4849bd47832b7b365f2fa9d31dd6/plot_compare_reduction.ipynb

Lines changed: 23 additions & 1 deletion

@@ -18,6 +18,17 @@
 "\n# Selecting dimensionality reduction with Pipeline and GridSearchCV\n\nThis example constructs a pipeline that does dimensionality\nreduction followed by prediction with a support vector\nclassifier. It demonstrates the use of ``GridSearchCV`` and\n``Pipeline`` to optimize over different classes of estimators in a\nsingle CV run -- unsupervised ``PCA`` and ``NMF`` dimensionality\nreductions are compared to univariate feature selection during\nthe grid search.\n\nAdditionally, ``Pipeline`` can be instantiated with the ``memory``\nargument to memoize the transformers within the pipeline, avoiding to fit\nagain the same transformers over and over.\n\nNote that the use of ``memory`` to enable caching becomes interesting when the\nfitting of a transformer is costly.\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"# Authors: Robert McGibbon\n# Joel Nothman\n# Guillaume Lemaitre"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -33,7 +44,18 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\npipe = Pipeline(\n [\n # the reduce_dim stage is populated by the param_grid\n (\"reduce_dim\", \"passthrough\"),\n (\"classify\", LinearSVC(dual=False, max_iter=10000)),\n ]\n)\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n \"reduce_dim\": [PCA(iterated_power=7), NMF()],\n \"reduce_dim__n_components\": N_FEATURES_OPTIONS,\n \"classify__C\": C_OPTIONS,\n },\n {\n \"reduce_dim\": [SelectKBest(chi2)],\n \"reduce_dim__k\": N_FEATURES_OPTIONS,\n \"classify__C\": C_OPTIONS,\n },\n]\nreducer_labels = [\"PCA\", \"NMF\", \"KBest(chi2)\"]\n\ngrid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)\nX, y = load_digits(return_X_y=True)\ngrid.fit(X, y)\n\nmean_scores = np.array(grid.cv_results_[\"mean_test_score\"])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = np.arange(len(N_FEATURES_OPTIONS)) * (len(reducer_labels) + 1) + 0.5\n\nplt.figure()\nCOLORS = \"bgrcmyk\"\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel(\"Reduced number of features\")\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel(\"Digit classification accuracy\")\nplt.ylim((0, 1))\nplt.legend(loc=\"upper left\")\n\nplt.show()"
+"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, mutual_info_classif\nfrom sklearn.preprocessing import MinMaxScaler\n\nX, y = load_digits(return_X_y=True)\n\npipe = Pipeline(\n [\n (\"scaling\", MinMaxScaler()),\n # the reduce_dim stage is populated by the param_grid\n (\"reduce_dim\", \"passthrough\"),\n (\"classify\", LinearSVC(dual=False, max_iter=10000)),\n ]\n)\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n \"reduce_dim\": [PCA(iterated_power=7), NMF(max_iter=1_000)],\n \"reduce_dim__n_components\": N_FEATURES_OPTIONS,\n \"classify__C\": C_OPTIONS,\n },\n {\n \"reduce_dim\": [SelectKBest(mutual_info_classif)],\n \"reduce_dim__k\": N_FEATURES_OPTIONS,\n \"classify__C\": C_OPTIONS,\n },\n]\nreducer_labels = [\"PCA\", \"NMF\", \"KBest(mutual_info_classif)\"]\n\ngrid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)\ngrid.fit(X, y)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import pandas as pd\n\nmean_scores = np.array(grid.cv_results_[\"mean_test_score\"])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\n# create a dataframe to ease plotting\nmean_scores = pd.DataFrame(\n mean_scores.T, index=N_FEATURES_OPTIONS, columns=reducer_labels\n)\n\nax = mean_scores.plot.bar()\nax.set_title(\"Comparing feature reduction techniques\")\nax.set_xlabel(\"Reduced number of features\")\nax.set_ylabel(\"Digit classification accuracy\")\nax.set_ylim((0, 1))\nax.legend(loc=\"upper left\")\n\nplt.show()"
 ]
 },
 {
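
Two details worth noting in the rewritten cell: ``MinMaxScaler`` keeps the inputs non-negative, which ``NMF`` requires, and ``mutual_info_classif`` replaces ``chi2`` as the univariate score since it captures general dependence on the target. A trimmed, runnable sketch of the new pipeline structure (a much smaller grid than the example, so it finishes quickly):

# Trimmed sketch of the restructured search: one n_components value, default C.
from sklearn.datasets import load_digits
from sklearn.decomposition import NMF, PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

X, y = load_digits(return_X_y=True)

pipe = Pipeline(
    [
        # MinMaxScaler keeps every feature in [0, 1]; NMF needs non-negative input.
        ("scaling", MinMaxScaler()),
        # the reduce_dim stage is populated by the param_grid below
        ("reduce_dim", "passthrough"),
        ("classify", LinearSVC(dual=False, max_iter=10000)),
    ]
)
param_grid = [
    {"reduce_dim": [PCA(iterated_power=7), NMF(max_iter=1_000)], "reduce_dim__n_components": [8]},
    {"reduce_dim": [SelectKBest(mutual_info_classif)], "reduce_dim__k": [8]},
]
grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=1)
grid.fit(X, y)
print(grid.best_params_)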
