
Commit 38fb3eb

committed: Pushing the docs to dev/ for branch: master, commit 69eb4d4678604147a8ed144b08b837a352fe7801
1 parent ffb47f0 commit 38fb3eb

File tree: 1,098 files changed (+3357 / -3374 lines)

Two binary files changed (-183 bytes and -180 bytes); binary content not shown.

dev/_downloads/plot_column_transformer_mixed_types.ipynb

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
+"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
 ]
 }
 ],

dev/_downloads/plot_column_transformer_mixed_types.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@
     'classifier__C': [0.1, 1.0, 10, 100],
 }

-grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
+grid_search = GridSearchCV(clf, param_grid, cv=10)
 grid_search.fit(X_train, y_train)

 print(("best logistic regression from grid search: %.3f"
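Every hunk in this commit makes the same mechanical change: the iid keyword is dropped from GridSearchCV, since that parameter was deprecated in scikit-learn and test scores are now simply averaged over the cv folds. A self-contained sketch of the updated call pattern follows; the pipeline and data are illustrative stand-ins, not taken from the example files.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Illustrative stand-in for the example's `clf` pipeline
X, y = load_iris(return_X_y=True)
clf = Pipeline([('scale', StandardScaler()),
                ('classifier', LogisticRegression(max_iter=1000))])

param_grid = {'classifier__C': [0.1, 1.0, 10, 100]}

# Note: no iid argument; recent scikit-learn releases reject it
grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search.fit(X, y)
print("best CV score: %.3f" % grid_search.best_score_)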

dev/_downloads/plot_compare_reduction.ipynb

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n # the reduce_dim stage is populated by the param_grid\n ('reduce_dim', 'passthrough'),\n ('classify', LinearSVC(dual=False, max_iter=10000))\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid, iid=False)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
+"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n # the reduce_dim stage is populated by the param_grid\n ('reduce_dim', 'passthrough'),\n ('classify', LinearSVC(dual=False, max_iter=10000))\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
 ]
 },
 {
@@ -51,7 +51,7 @@
 },
 "outputs": [],
 "source": [
-"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom joblib import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(___location=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC(dual=False, max_iter=10000))],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid,\n iid=False)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
+"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom joblib import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(___location=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC(dual=False, max_iter=10000))],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
 ]
 },
 {

dev/_downloads/plot_compare_reduction.py

Lines changed: 2 additions & 3 deletions
@@ -63,7 +63,7 @@
 ]
 reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

-grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid, iid=False)
+grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)

@@ -114,8 +114,7 @@
                        memory=memory)

 # This time, a cached pipeline will be used within the grid search
-grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid,
-                    iid=False)
+grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)
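The second hunk above keeps the joblib Memory caching from the original example: fitted transformers are stored in a temporary folder so identical PCA fits are reused across parameter combinations during the grid search. A self-contained sketch of that caching pattern, with a trimmed, illustrative parameter grid rather than the one used in the example:

from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Cache fitted transformers in a temporary folder so repeated PCA fits
# are reused instead of recomputed for every value of C.
cachedir = mkdtemp()
memory = Memory(___location=cachedir, verbose=0)

cached_pipe = Pipeline([('reduce_dim', PCA()),
                        ('classify', LinearSVC(dual=False, max_iter=10000))],
                       memory=memory)

param_grid = {'reduce_dim__n_components': [2, 4, 8],
              'classify__C': [1, 10]}

grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)  # no iid
X, y = load_digits(return_X_y=True)
grid.fit(X, y)

rmtree(cachedir)  # delete the temporary cache before exiting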

dev/_downloads/plot_digits_kde_sampling.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_digits\nfrom sklearn.neighbors import KernelDensity\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import GridSearchCV\n\n# load the data\ndigits = load_digits()\n\n# project the 64-dimensional data to a lower dimension\npca = PCA(n_components=15, whiten=False)\ndata = pca.fit_transform(digits.data)\n\n# use grid search cross-validation to optimize the bandwidth\nparams = {'bandwidth': np.logspace(-1, 1, 20)}\ngrid = GridSearchCV(KernelDensity(), params, cv=5, iid=False)\ngrid.fit(data)\n\nprint(\"best bandwidth: {0}\".format(grid.best_estimator_.bandwidth))\n\n# use the best estimator to compute the kernel density estimate\nkde = grid.best_estimator_\n\n# sample 44 new points from the data\nnew_data = kde.sample(44, random_state=0)\nnew_data = pca.inverse_transform(new_data)\n\n# turn data into a 4x11 grid\nnew_data = new_data.reshape((4, 11, -1))\nreal_data = digits.data[:44].reshape((4, 11, -1))\n\n# plot real digits and resampled digits\nfig, ax = plt.subplots(9, 11, subplot_kw=dict(xticks=[], yticks=[]))\nfor j in range(11):\n ax[4, j].set_visible(False)\n for i in range(4):\n im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n\nax[0, 5].set_title('Selection from the input data')\nax[5, 5].set_title('\"New\" digits drawn from the kernel density model')\n\nplt.show()"
+"import numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_digits\nfrom sklearn.neighbors import KernelDensity\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import GridSearchCV\n\n# load the data\ndigits = load_digits()\n\n# project the 64-dimensional data to a lower dimension\npca = PCA(n_components=15, whiten=False)\ndata = pca.fit_transform(digits.data)\n\n# use grid search cross-validation to optimize the bandwidth\nparams = {'bandwidth': np.logspace(-1, 1, 20)}\ngrid = GridSearchCV(KernelDensity(), params, cv=5)\ngrid.fit(data)\n\nprint(\"best bandwidth: {0}\".format(grid.best_estimator_.bandwidth))\n\n# use the best estimator to compute the kernel density estimate\nkde = grid.best_estimator_\n\n# sample 44 new points from the data\nnew_data = kde.sample(44, random_state=0)\nnew_data = pca.inverse_transform(new_data)\n\n# turn data into a 4x11 grid\nnew_data = new_data.reshape((4, 11, -1))\nreal_data = digits.data[:44].reshape((4, 11, -1))\n\n# plot real digits and resampled digits\nfig, ax = plt.subplots(9, 11, subplot_kw=dict(xticks=[], yticks=[]))\nfor j in range(11):\n ax[4, j].set_visible(False)\n for i in range(4):\n im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n\nax[0, 5].set_title('Selection from the input data')\nax[5, 5].set_title('\"New\" digits drawn from the kernel density model')\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_digits_kde_sampling.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@

 # use grid search cross-validation to optimize the bandwidth
 params = {'bandwidth': np.logspace(-1, 1, 20)}
-grid = GridSearchCV(KernelDensity(), params, cv=5, iid=False)
+grid = GridSearchCV(KernelDensity(), params, cv=5)
 grid.fit(data)

 print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
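Note that the kernel-density example fits GridSearchCV without target labels: KernelDensity exposes a score method that returns the total log-likelihood of held-out data, so the search simply maximizes it across folds. A minimal sketch of this unsupervised bandwidth search on synthetic data (the random 2-D sample is illustrative, not the digits data):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
data = rng.normal(size=(200, 2))  # illustrative 2-D sample

params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)  # no iid argument
grid.fit(data)  # unsupervised: no y is passed
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))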

dev/_downloads/plot_digits_pipe.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"print(__doc__)\n\n\n# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\n\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\nlogistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,\n max_iter=10000, tol=1e-5, random_state=0)\npca = PCA()\npipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\n\n# Parameters of pipelines can be set using \u2018__\u2019 separated parameter names:\nparam_grid = {\n 'pca__n_components': [5, 20, 30, 40, 50, 64],\n 'logistic__alpha': np.logspace(-4, 4, 5),\n}\nsearch = GridSearchCV(pipe, param_grid, iid=False, cv=5)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(pca.explained_variance_ratio_, linewidth=2)\nax0.set_ylabel('PCA explained variance')\n\nax0.axvline(search.best_estimator_.named_steps['pca'].n_components,\n linestyle=':', label='n_components chosen')\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\nresults = pd.DataFrame(search.cv_results_)\ncomponents_col = 'param_pca__n_components'\nbest_clfs = results.groupby(components_col).apply(\n lambda g: g.nlargest(1, 'mean_test_score'))\n\nbest_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',\n legend=False, ax=ax1)\nax1.set_ylabel('Classification accuracy (val)')\nax1.set_xlabel('n_components')\n\nplt.tight_layout()\nplt.show()"
+"print(__doc__)\n\n\n# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\n\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\nlogistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,\n max_iter=10000, tol=1e-5, random_state=0)\npca = PCA()\npipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\n\n# Parameters of pipelines can be set using \u2018__\u2019 separated parameter names:\nparam_grid = {\n 'pca__n_components': [5, 20, 30, 40, 50, 64],\n 'logistic__alpha': np.logspace(-4, 4, 5),\n}\nsearch = GridSearchCV(pipe, param_grid, cv=5)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(pca.explained_variance_ratio_, linewidth=2)\nax0.set_ylabel('PCA explained variance')\n\nax0.axvline(search.best_estimator_.named_steps['pca'].n_components,\n linestyle=':', label='n_components chosen')\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\nresults = pd.DataFrame(search.cv_results_)\ncomponents_col = 'param_pca__n_components'\nbest_clfs = results.groupby(components_col).apply(\n lambda g: g.nlargest(1, 'mean_test_score'))\n\nbest_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',\n legend=False, ax=ax1)\nax1.set_ylabel('Classification accuracy (val)')\nax1.set_xlabel('n_components')\n\nplt.tight_layout()\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_digits_pipe.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@
     'pca__n_components': [5, 20, 30, 40, 50, 64],
     'logistic__alpha': np.logspace(-4, 4, 5),
 }
-search = GridSearchCV(pipe, param_grid, iid=False, cv=5)
+search = GridSearchCV(pipe, param_grid, cv=5)
 search.fit(X_digits, y_digits)
 print("Best parameter (CV score=%0.3f):" % search.best_score_)
 print(search.best_params_)
