Skip to content

Commit 07c7e77

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 581752012c43e4838d920c33aad019e2cc24b40c
1 parent fc277a9 commit 07c7e77

File tree

1,056 files changed

+4178
-3357
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

1,056 files changed

+4178
-3357
lines changed
4.72 KB
Binary file not shown.
3.73 KB
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Balance model complexity and cross-validated score\n\n\nThis example balances model complexity and cross-validated score by\nfinding a decent accuracy within 1 standard deviation of the best accuracy\nscore while minimising the number of PCA components [1].\n\nThe figure shows the trade-off between cross-validated score and the number\nof PCA components. The balanced case is when n_components=6 and accuracy=0.80,\nwhich falls into the range within 1 standard deviation of the best accuracy\nscore.\n\n[1] Hastie, T., Tibshirani, R.,, Friedman, J. (2001). Model Assessment and\nSelection. The Elements of Statistical Learning (pp. 219-260). New York,\nNY, USA: Springer New York Inc..\n\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Wenhao Zhang <[email protected]>\n\nprint(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_digits\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\n\n\ndef lower_bound(cv_results):\n \"\"\"\n Calculate the lower bound within 1 standard deviation\n of the best `mean_test_scores`.\n\n Parameters\n ----------\n cv_results : dict of numpy(masked) ndarrays\n See attribute cv_results_ of `GridSearchCV`\n\n Returns\n -------\n float\n Lower bound within 1 standard deviation of the\n best `mean_test_score`.\n \"\"\"\n best_score_idx = np.argmax(cv_results['mean_test_score'])\n\n return (cv_results['mean_test_score'][best_score_idx]\n - cv_results['std_test_score'][best_score_idx])\n\n\ndef best_low_complexity(cv_results):\n \"\"\"\n Balance model complexity with cross-validated score.\n\n Parameters\n ----------\n cv_results : dict of numpy(masked) ndarrays\n See attribute cv_results_ of `GridSearchCV`.\n\n Return\n ------\n int\n Index of a model that has the fewest PCA components\n while has its test score within 1 standard deviation of the best\n `mean_test_score`.\n \"\"\"\n threshold = lower_bound(cv_results)\n candidate_idx = np.flatnonzero(cv_results['mean_test_score'] >= threshold)\n best_idx = candidate_idx[cv_results['param_reduce_dim__n_components']\n [candidate_idx].argmin()]\n return best_idx\n\n\npipe = Pipeline([\n ('reduce_dim', PCA(random_state=42)),\n ('classify', LinearSVC(random_state=42)),\n])\n\nparam_grid = {\n 'reduce_dim__n_components': [2, 4, 6, 8]\n}\n\ngrid = GridSearchCV(pipe, cv=10, n_jobs=1, param_grid=param_grid,\n scoring='accuracy', refit=best_low_complexity)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nn_components = grid.cv_results_['param_reduce_dim__n_components']\ntest_scores = 
grid.cv_results_['mean_test_score']\n\nplt.figure()\nplt.bar(n_components, test_scores, width=1.3, color='b')\n\nlower = lower_bound(grid.cv_results_)\nplt.axhline(np.max(test_scores), linestyle='--', color='y',\n label='Best score')\nplt.axhline(lower, linestyle='--', color='.5', label='Best score - 1 std')\n\nplt.title(\"Balance model complexity and cross-validated score\")\nplt.xlabel('Number of PCA components used')\nplt.ylabel('Digit classification accuracy')\nplt.xticks(n_components.tolist())\nplt.ylim((0, 1.0))\nplt.legend(loc='upper left')\n\nbest_index_ = grid.best_index_\n\nprint(\"The best_index_ is %d\" % best_index_)\nprint(\"The n_components selected is %d\" % n_components[best_index_])\nprint(\"The corresponding accuracy score is %.2f\"\n % grid.cv_results_['mean_test_score'][best_index_])\nplt.show()"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name": "Python 3",
36+
"language": "python",
37+
"name": "python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name": "ipython",
42+
"version": 3
43+
},
44+
"file_extension": ".py",
45+
"mimetype": "text/x-python",
46+
"name": "python",
47+
"nbconvert_exporter": "python",
48+
"pygments_lexer": "ipython3",
49+
"version": "3.6.8"
50+
}
51+
},
52+
"nbformat": 4,
53+
"nbformat_minor": 0
54+
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""
2+
==================================================
3+
Balance model complexity and cross-validated score
4+
==================================================
5+
6+
This example balances model complexity and cross-validated score by
7+
finding a decent accuracy within 1 standard deviation of the best accuracy
8+
score while minimising the number of PCA components [1].
9+
10+
The figure shows the trade-off between cross-validated score and the number
11+
of PCA components. The balanced case is when n_components=6 and accuracy=0.80,
12+
which falls into the range within 1 standard deviation of the best accuracy
13+
score.
14+
15+
[1] Hastie, T., Tibshirani, R., Friedman, J. (2001). Model Assessment and
16+
Selection. The Elements of Statistical Learning (pp. 219-260). New York,
17+
NY, USA: Springer New York Inc..
18+
"""
19+
# Author: Wenhao Zhang <[email protected]>
20+
21+
print(__doc__)
22+
23+
import numpy as np
24+
import matplotlib.pyplot as plt
25+
26+
from sklearn.datasets import load_digits
27+
from sklearn.decomposition import PCA
28+
from sklearn.model_selection import GridSearchCV
29+
from sklearn.pipeline import Pipeline
30+
from sklearn.svm import LinearSVC
31+
32+
33+
def lower_bound(cv_results):
    """Compute the score threshold one standard deviation below the best.

    Parameters
    ----------
    cv_results : dict of numpy(masked) ndarrays
        See attribute cv_results_ of `GridSearchCV`.

    Returns
    -------
    float
        The best ``mean_test_score`` minus that entry's own
        ``std_test_score`` — the lower bound of the acceptable range.
    """
    mean_scores = cv_results['mean_test_score']
    std_scores = cv_results['std_test_score']
    # Locate the top-scoring configuration, then step one of its own
    # standard deviations below it.
    best = np.argmax(mean_scores)
    return mean_scores[best] - std_scores[best]
53+
54+
55+
def best_low_complexity(cv_results):
    """Balance model complexity with cross-validated score.

    Selects the grid-search entry that uses the fewest PCA components
    among all entries whose ``mean_test_score`` clears the
    one-standard-deviation threshold computed by ``lower_bound``.

    Parameters
    ----------
    cv_results : dict of numpy(masked) ndarrays
        See attribute cv_results_ of `GridSearchCV`.

    Returns
    -------
    int
        Index of a model that has the fewest PCA components while its
        test score is within 1 standard deviation of the best
        ``mean_test_score``.
    """
    acceptable = lower_bound(cv_results)
    # Every configuration whose mean score clears the threshold.
    candidates = np.flatnonzero(cv_results['mean_test_score'] >= acceptable)
    # Among those candidates, keep the one with the fewest components.
    components = cv_results['param_reduce_dim__n_components'][candidates]
    return candidates[components.argmin()]
76+
77+
78+
# ---------------------------------------------------------------------------
# Run the grid search and visualise the complexity / accuracy trade-off.
# ---------------------------------------------------------------------------
digits = load_digits()

pipe = Pipeline([
    ('reduce_dim', PCA(random_state=42)),
    ('classify', LinearSVC(random_state=42)),
])

# Only the number of PCA components is searched over.
param_grid = {'reduce_dim__n_components': [2, 4, 6, 8]}

# ``refit`` accepts a callable, so the refitted estimator is the one chosen
# by ``best_low_complexity`` rather than simply the top scorer.
grid = GridSearchCV(pipe, cv=10, n_jobs=1, param_grid=param_grid,
                    scoring='accuracy', refit=best_low_complexity)
grid.fit(digits.data, digits.target)

n_components = grid.cv_results_['param_reduce_dim__n_components']
test_scores = grid.cv_results_['mean_test_score']

# Bar chart of accuracy per component count, plus two reference lines:
# the best score and the best score minus one standard deviation.
plt.figure()
plt.bar(n_components, test_scores, width=1.3, color='b')

lower = lower_bound(grid.cv_results_)
plt.axhline(np.max(test_scores), linestyle='--', color='y',
            label='Best score')
plt.axhline(lower, linestyle='--', color='.5', label='Best score - 1 std')

plt.title("Balance model complexity and cross-validated score")
plt.xlabel('Number of PCA components used')
plt.ylabel('Digit classification accuracy')
plt.xticks(n_components.tolist())
plt.ylim((0, 1.0))
plt.legend(loc='upper left')

best_index_ = grid.best_index_

# Report which configuration the custom refit rule selected.
print("The best_index_ is %d" % best_index_)
print("The n_components selected is %d" % n_components[best_index_])
print("The corresponding accuracy score is %.2f"
      % grid.cv_results_['mean_test_score'][best_index_])
plt.show()

dev/_downloads/scikit-learn-docs.pdf

18 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes

0 commit comments

Comments
 (0)