
Commit 24550a0

Pushing the docs to dev/ for branch: master, commit b3a639ffc2d518b8862c61e4170403a400368571
1 parent c4d5336 commit 24550a0

931 files changed: +3859 additions, -2699 deletions

Two binary files changed (2.87 KB and 2.41 KB); contents not shown.

dev/_downloads/plot_compare_reduction.ipynb

Lines changed: 45 additions & 2 deletions
@@ -15,7 +15,50 @@
 },
 {
 "source": [
-"\n# Selecting dimensionality reduction with Pipeline and GridSearchCV\n\n\nThis example constructs a pipeline that does dimensionality\nreduction followed by prediction with a support vector\nclassifier. It demonstrates the use of GridSearchCV and\nPipeline to optimize over different classes of estimators in a\nsingle CV run -- unsupervised PCA and NMF dimensionality\nreductions are compared to univariate feature selection during\nthe grid search.\n\n"
+"\n# Selecting dimensionality reduction with Pipeline and GridSearchCV\n\n\nThis example constructs a pipeline that does dimensionality\nreduction followed by prediction with a support vector\nclassifier. It demonstrates the use of ``GridSearchCV`` and\n``Pipeline`` to optimize over different classes of estimators in a\nsingle CV run -- unsupervised ``PCA`` and ``NMF`` dimensionality\nreductions are compared to univariate feature selection during\nthe grid search.\n\nAdditionally, ``Pipeline`` can be instantiated with the ``memory``\nargument to memoize the transformers within the pipeline, avoiding fitting\nthe same transformers over and over.\n\nNote that the use of ``memory`` to enable caching becomes interesting when the\nfitting of a transformer is costly.\n\n"
+],
+"cell_type": "markdown",
+"metadata": {}
+},
+{
+"source": [
+"Illustration of ``Pipeline`` and ``GridSearchCV``\n##############################################################################\n This section illustrates the use of a ``Pipeline`` with\n ``GridSearchCV``.\n\n"
+],
+"cell_type": "markdown",
+"metadata": {}
+},
+{
+"execution_count": null,
+"cell_type": "code",
+"source": [
+"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n ('reduce_dim', PCA()),\n ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')"
+],
+"outputs": [],
+"metadata": {
+"collapsed": false
+}
+},
+{
+"source": [
+"Caching transformers within a ``Pipeline``\n##############################################################################\n It is sometimes worthwhile storing the state of a specific transformer\n since it could be used again. Using a pipeline in ``GridSearchCV`` triggers\n such situations. Therefore, we use the argument ``memory`` to enable caching.\n\n .. warning::\n     Note that this example is, however, only an illustration since for this\n     specific case fitting PCA is not necessarily slower than loading the\n     cache. Hence, use the ``memory`` constructor parameter when the fitting\n     of a transformer is costly.\n\n"
+],
+"cell_type": "markdown",
+"metadata": {}
+},
+{
+"execution_count": null,
+"cell_type": "code",
+"source": [
+"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom sklearn.externals.joblib import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(cachedir=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC())],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
+],
+"outputs": [],
+"metadata": {
+"collapsed": false
+}
+},
+{
+"source": [
+"The ``PCA`` fitting is only computed at the evaluation of the first\nconfiguration of the ``C`` parameter of the ``LinearSVC`` classifier. The\nother configurations of ``C`` will trigger the loading of the cached ``PCA``\nestimator data, saving processing time. Therefore, caching the\npipeline using ``memory`` is highly beneficial when fitting\na transformer is costly.\n\n"
 ],
 "cell_type": "markdown",
 "metadata": {}
@@ -24,7 +67,7 @@
 "execution_count": null,
 "cell_type": "code",
 "source": [
-"# Authors: Robert McGibbon, Joel Nothman\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n ('reduce_dim', PCA()),\n ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\nplt.show()"
+"plt.show()"
 ],
 "outputs": [],
 "metadata": {

dev/_downloads/plot_compare_reduction.py

Lines changed: 61 additions & 6 deletions
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 =================================================================
@@ -7,13 +7,27 @@
 
 This example constructs a pipeline that does dimensionality
 reduction followed by prediction with a support vector
-classifier. It demonstrates the use of GridSearchCV and
-Pipeline to optimize over different classes of estimators in a
-single CV run -- unsupervised PCA and NMF dimensionality
+classifier. It demonstrates the use of ``GridSearchCV`` and
+``Pipeline`` to optimize over different classes of estimators in a
+single CV run -- unsupervised ``PCA`` and ``NMF`` dimensionality
 reductions are compared to univariate feature selection during
 the grid search.
+
+Additionally, ``Pipeline`` can be instantiated with the ``memory``
+argument to memoize the transformers within the pipeline, avoiding fitting
+the same transformers over and over.
+
+Note that the use of ``memory`` to enable caching becomes interesting when the
+fitting of a transformer is costly.
 """
-# Authors: Robert McGibbon, Joel Nothman
+
+###############################################################################
+# Illustration of ``Pipeline`` and ``GridSearchCV``
+###############################################################################
+# This section illustrates the use of a ``Pipeline`` with
+# ``GridSearchCV``.
+
+# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre
 
 from __future__ import print_function, division
 
@@ -49,7 +63,7 @@
 ]
 reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
 
-grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
+grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)
 
@@ -72,4 +86,45 @@
 plt.ylabel('Digit classification accuracy')
 plt.ylim((0, 1))
 plt.legend(loc='upper left')
+
+###############################################################################
+# Caching transformers within a ``Pipeline``
+###############################################################################
+# It is sometimes worthwhile storing the state of a specific transformer
+# since it could be used again. Using a pipeline in ``GridSearchCV`` triggers
+# such situations. Therefore, we use the argument ``memory`` to enable caching.
+#
+# .. warning::
+#     Note that this example is, however, only an illustration since for this
+#     specific case fitting PCA is not necessarily slower than loading the
+#     cache. Hence, use the ``memory`` constructor parameter when the fitting
+#     of a transformer is costly.
+
+from tempfile import mkdtemp
+from shutil import rmtree
+from sklearn.externals.joblib import Memory
+
+# Create a temporary folder to store the transformers of the pipeline
+cachedir = mkdtemp()
+memory = Memory(cachedir=cachedir, verbose=10)
+cached_pipe = Pipeline([('reduce_dim', PCA()),
+                        ('classify', LinearSVC())],
+                       memory=memory)
+
+# This time, a cached pipeline will be used within the grid search
+grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid)
+digits = load_digits()
+grid.fit(digits.data, digits.target)
+
+# Delete the temporary cache before exiting
+rmtree(cachedir)
+
+###############################################################################
+# The ``PCA`` fitting is only computed at the evaluation of the first
+# configuration of the ``C`` parameter of the ``LinearSVC`` classifier. The
+# other configurations of ``C`` will trigger the loading of the cached ``PCA``
+# estimator data, saving processing time. Therefore, caching the
+# pipeline using ``memory`` is highly beneficial when fitting
+# a transformer is costly.
+
 plt.show()
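
Note: the mechanism this example leans on is that ``param_grid`` may be a list of dicts, and a pipeline step name ('reduce_dim') can itself be a grid parameter, so whole estimators are swapped in and out during the search. A minimal, self-contained sketch of that mechanism (smaller grids than the example's, purely illustrative):

    from sklearn.datasets import load_digits
    from sklearn.decomposition import NMF, PCA
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])

    # Each dict is expanded independently, so PCA/NMF get ``n_components``
    # while SelectKBest gets ``k`` -- estimator-specific parameters never
    # collide across the two sub-grids.
    param_grid = [
        {'reduce_dim': [PCA(), NMF()], 'reduce_dim__n_components': [2, 8]},
        {'reduce_dim': [SelectKBest(chi2)], 'reduce_dim__k': [2, 8]},
    ]

    digits = load_digits()
    grid = GridSearchCV(pipe, cv=3, param_grid=param_grid)
    grid.fit(digits.data, digits.target)
    print(grid.best_params_)  # reports the winning reducer and its size

This also explains the reshape of ``mean_test_score`` in the plotting code above: candidates come out in ``param_grid`` iteration order, so the flat score array can be folded back into a (C, reducer, n_features) grid before taking the best score over ``C``.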

dev/_downloads/scikit-learn-docs.pdf

31.7 KB (binary file not shown)

Several other binary files changed by a few bytes each (-66, -66, -65, -65, -52 bytes); contents not shown.
