Skip to content

Commit 02b7346

Browse files
committed
Pushing the docs for revision for branch: master, commit 5b20d484add50aec64a1bda5c52ed2ceb7557f36
1 parent 9825160 commit 02b7346

File tree

909 files changed

+3717
-2968
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

909 files changed

+3717
-2968
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"nbformat_minor": 0,
3+
"nbformat": 4,
4+
"cells": [
5+
{
6+
"execution_count": null,
7+
"cell_type": "code",
8+
"source": [
9+
"%matplotlib inline"
10+
],
11+
"outputs": [],
12+
"metadata": {
13+
"collapsed": false
14+
}
15+
},
16+
{
17+
"source": [
18+
"\n# Selecting dimensionality reduction with Pipeline and GridSearchCV\n\n\nThis example constructs a pipeline that does dimensionality\nreduction followed by prediction with a support vector\nclassifier. It demonstrates the use of GridSearchCV and\nPipeline to optimize over different classes of estimators in a\nsingle CV run -- unsupervised PCA and NMF dimensionality\nreductions are compared to univariate feature selection during\nthe grid search.\n"
19+
],
20+
"cell_type": "markdown",
21+
"metadata": {}
22+
},
23+
{
24+
"execution_count": null,
25+
"cell_type": "code",
26+
"source": [
27+
"# Authors: Robert McGibbon, Joel Nothman\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n    ('reduce_dim', PCA()),\n    ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n    {\n        'reduce_dim': [PCA(iterated_power=7), NMF()],\n        'reduce_dim__n_components': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n    {\n        'reduce_dim': [SelectKBest(chi2)],\n        'reduce_dim__k': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n               (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\nplt.show()"
28+
],
29+
"outputs": [],
30+
"metadata": {
31+
"collapsed": false
32+
}
33+
}
34+
],
35+
"metadata": {
36+
"kernelspec": {
37+
"display_name": "Python 2",
38+
"name": "python2",
39+
"language": "python"
40+
},
41+
"language_info": {
42+
"mimetype": "text/x-python",
43+
"nbconvert_exporter": "python",
44+
"name": "python",
45+
"file_extension": ".py",
46+
"version": "2.7.12",
47+
"pygments_lexer": "ipython2",
48+
"codemirror_mode": {
49+
"version": 2,
50+
"name": "ipython"
51+
}
52+
}
53+
}
54+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
"""
4+
=================================================================
5+
Selecting dimensionality reduction with Pipeline and GridSearchCV
6+
=================================================================
7+
8+
This example constructs a pipeline that does dimensionality
9+
reduction followed by prediction with a support vector
10+
classifier. It demonstrates the use of GridSearchCV and
11+
Pipeline to optimize over different classes of estimators in a
12+
single CV run -- unsupervised PCA and NMF dimensionality
13+
reductions are compared to univariate feature selection during
14+
the grid search.
15+
"""
16+
# Authors: Robert McGibbon, Joel Nothman
17+
18+
from __future__ import print_function, division
19+
20+
import numpy as np
21+
import matplotlib.pyplot as plt
22+
from sklearn.datasets import load_digits
23+
from sklearn.model_selection import GridSearchCV
24+
from sklearn.pipeline import Pipeline
25+
from sklearn.svm import LinearSVC
26+
from sklearn.decomposition import PCA, NMF
27+
from sklearn.feature_selection import SelectKBest, chi2
28+
29+
print(__doc__)
30+
31+
pipe = Pipeline([
32+
('reduce_dim', PCA()),
33+
('classify', LinearSVC())
34+
])
35+
36+
N_FEATURES_OPTIONS = [2, 4, 8]
37+
C_OPTIONS = [1, 10, 100, 1000]
38+
param_grid = [
39+
{
40+
'reduce_dim': [PCA(iterated_power=7), NMF()],
41+
'reduce_dim__n_components': N_FEATURES_OPTIONS,
42+
'classify__C': C_OPTIONS
43+
},
44+
{
45+
'reduce_dim': [SelectKBest(chi2)],
46+
'reduce_dim__k': N_FEATURES_OPTIONS,
47+
'classify__C': C_OPTIONS
48+
},
49+
]
50+
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
51+
52+
grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
53+
digits = load_digits()
54+
grid.fit(digits.data, digits.target)
55+
56+
mean_scores = np.array(grid.results_['test_mean_score'])
57+
# scores are in the order of param_grid iteration, which is alphabetical
58+
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
59+
# select score for best C
60+
mean_scores = mean_scores.max(axis=0)
61+
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
62+
(len(reducer_labels) + 1) + .5)
63+
64+
plt.figure()
65+
COLORS = 'bgrcmyk'
66+
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
67+
plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
68+
69+
plt.title("Comparing feature reduction techniques")
70+
plt.xlabel('Reduced number of features')
71+
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
72+
plt.ylabel('Digit classification accuracy')
73+
plt.ylim((0, 1))
74+
plt.legend(loc='upper left')
75+
plt.show()
47 Bytes
47 Bytes
63 Bytes
63 Bytes
466 Bytes
466 Bytes
179 Bytes
179 Bytes

0 commit comments

Comments
 (0)