Skip to content

Commit 02b7346

Browse files
committed
Pushing the docs for revision for branch: master, commit 5b20d484add50aec64a1bda5c52ed2ceb7557f36
1 parent 9825160 commit 02b7346

File tree

909 files changed

+3717
-2968
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

909 files changed

+3717
-2968
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"nbformat_minor": 0,
3+
"nbformat": 4,
4+
"cells": [
5+
{
6+
"execution_count": null,
7+
"cell_type": "code",
8+
"source": [
9+
"%matplotlib inline"
10+
],
11+
"outputs": [],
12+
"metadata": {
13+
"collapsed": false
14+
}
15+
},
16+
{
17+
"source": [
18+
"\n# Selecting dimensionality reduction with Pipeline and GridSearchCV\n\n\nThis example constructs a pipeline that does dimensionality\nreduction followed by prediction with a support vector\nclassifier. It demonstrates the use of GridSearchCV and\nPipeline to optimize over different classes of estimators in a\nsingle CV run -- unsupervised PCA and NMF dimensionality\nreductions are compared to univariate feature selection during\nthe grid search.\n"
19+
],
20+
"cell_type": "markdown",
21+
"metadata": {}
22+
},
23+
{
24+
"execution_count": null,
25+
"cell_type": "code",
26+
"source": [
27+
"# Authors: Robert McGibbon, Joel Nothman\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n    ('reduce_dim', PCA()),\n    ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n    {\n        'reduce_dim': [PCA(iterated_power=7), NMF()],\n        'reduce_dim__n_components': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n    {\n        'reduce_dim': [SelectKBest(chi2)],\n        'reduce_dim__k': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n               (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\nplt.show()"
28+
],
29+
"outputs": [],
30+
"metadata": {
31+
"collapsed": false
32+
}
33+
}
34+
],
35+
"metadata": {
36+
"kernelspec": {
37+
"display_name": "Python 2",
38+
"name": "python2",
39+
"language": "python"
40+
},
41+
"language_info": {
42+
"mimetype": "text/x-python",
43+
"nbconvert_exporter": "python",
44+
"name": "python",
45+
"file_extension": ".py",
46+
"version": "2.7.12",
47+
"pygments_lexer": "ipython2",
48+
"codemirror_mode": {
49+
"version": 2,
50+
"name": "ipython"
51+
}
52+
}
53+
}
54+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
"""
4+
=================================================================
5+
Selecting dimensionality reduction with Pipeline and GridSearchCV
6+
=================================================================
7+
8+
This example constructs a pipeline that does dimensionality
9+
reduction followed by prediction with a support vector
10+
classifier. It demonstrates the use of GridSearchCV and
11+
Pipeline to optimize over different classes of estimators in a
12+
single CV run -- unsupervised PCA and NMF dimensionality
13+
reductions are compared to univariate feature selection during
14+
the grid search.
15+
"""
16+
# Authors: Robert McGibbon, Joel Nothman
17+
18+
from __future__ import print_function, division
19+
20+
import numpy as np
21+
import matplotlib.pyplot as plt
22+
from sklearn.datasets import load_digits
23+
from sklearn.model_selection import GridSearchCV
24+
from sklearn.pipeline import Pipeline
25+
from sklearn.svm import LinearSVC
26+
from sklearn.decomposition import PCA, NMF
27+
from sklearn.feature_selection import SelectKBest, chi2
28+
29+
print(__doc__)
30+
31+
pipe = Pipeline([
32+
('reduce_dim', PCA()),
33+
('classify', LinearSVC())
34+
])
35+
36+
N_FEATURES_OPTIONS = [2, 4, 8]
37+
C_OPTIONS = [1, 10, 100, 1000]
38+
param_grid = [
39+
{
40+
'reduce_dim': [PCA(iterated_power=7), NMF()],
41+
'reduce_dim__n_components': N_FEATURES_OPTIONS,
42+
'classify__C': C_OPTIONS
43+
},
44+
{
45+
'reduce_dim': [SelectKBest(chi2)],
46+
'reduce_dim__k': N_FEATURES_OPTIONS,
47+
'classify__C': C_OPTIONS
48+
},
49+
]
50+
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
51+
52+
grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
53+
digits = load_digits()
54+
grid.fit(digits.data, digits.target)
55+
56+
mean_scores = np.array(grid.results_['test_mean_score'])
57+
# scores are in the order of param_grid iteration, which is alphabetical
58+
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
59+
# select score for best C
60+
mean_scores = mean_scores.max(axis=0)
61+
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
62+
(len(reducer_labels) + 1) + .5)
63+
64+
plt.figure()
65+
COLORS = 'bgrcmyk'
66+
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
67+
plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
68+
69+
plt.title("Comparing feature reduction techniques")
70+
plt.xlabel('Reduced number of features')
71+
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
72+
plt.ylabel('Digit classification accuracy')
73+
plt.ylim((0, 1))
74+
plt.legend(loc='upper left')
75+
plt.show()
47 Bytes
47 Bytes
63 Bytes
63 Bytes
466 Bytes
466 Bytes
179 Bytes
179 Bytes

0 commit comments

Comments
 (0)