
Commit 8a2d4cf

Pushing the docs for revision for branch: master, commit 35bb1c6ce2aa15baa804e4e0ecce85feb499d003
1 parent ce826e8 commit 8a2d4cf

File tree

897 files changed: +2637 additions, -2637 deletions
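Every diff shown below applies the same upstream scikit-learn API change: GridSearchCV's search results moved from the results_ attribute, keyed by test_mean_score/test_std_score, to cv_results_, keyed by mean_test_score/std_test_score. A minimal sketch of the new access pattern (illustrative only, not part of this commit; the toy dataset and grid are assumptions):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {'C': [1, 10]}, cv=3)
search.fit(X, y)

# New spelling, as introduced by the rename:
print(search.cv_results_['mean_test_score'])
print(search.cv_results_['std_test_score'])
# Old spelling, replaced throughout this commit:
#     search.results_['test_mean_score'], search.results_['test_std_score']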


dev/_downloads/grid_search_digits.ipynb

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
     "execution_count": null,
     "cell_type": "code",
     "source": [
-      "from __future__ import print_function\n\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import classification_report\nfrom sklearn.svm import SVC\n\nprint(__doc__)\n\n# Loading the Digits dataset\ndigits = datasets.load_digits()\n\n# To apply an classifier on this data, we need to flatten the image, to\n# turn the data in a (samples, feature) matrix:\nn_samples = len(digits.images)\nX = digits.images.reshape((n_samples, -1))\ny = digits.target\n\n# Split the dataset in two equal parts\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=0.5, random_state=0)\n\n# Set the parameters by cross-validation\ntuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n                     'C': [1, 10, 100, 1000]},\n                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n\nscores = ['precision', 'recall']\n\nfor score in scores:\n    print(\"# Tuning hyper-parameters for %s\" % score)\n    print()\n\n    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,\n                       scoring='%s_macro' % score)\n    clf.fit(X_train, y_train)\n\n    print(\"Best parameters set found on development set:\")\n    print()\n    print(clf.best_params_)\n    print()\n    print(\"Grid scores on development set:\")\n    print()\n    means = clf.results_['test_mean_score']\n    stds = clf.results_['test_std_score']\n    for i in range(len(clf.results_['params'])):\n        print(\"%0.3f (+/-%0.03f) for %r\"\n              % (means[i], stds[i] * 2, clf.results_['params'][i]))\n    print()\n\n    print(\"Detailed classification report:\")\n    print()\n    print(\"The model is trained on the full development set.\")\n    print(\"The scores are computed on the full evaluation set.\")\n    print()\n    y_true, y_pred = y_test, clf.predict(X_test)\n    print(classification_report(y_true, y_pred))\n    print()\n\n# Note the problem is too easy: the hyperparameter plateau is too flat and the\n# output model is the same for precision and recall with ties in quality."
+      "from __future__ import print_function\n\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import classification_report\nfrom sklearn.svm import SVC\n\nprint(__doc__)\n\n# Loading the Digits dataset\ndigits = datasets.load_digits()\n\n# To apply an classifier on this data, we need to flatten the image, to\n# turn the data in a (samples, feature) matrix:\nn_samples = len(digits.images)\nX = digits.images.reshape((n_samples, -1))\ny = digits.target\n\n# Split the dataset in two equal parts\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=0.5, random_state=0)\n\n# Set the parameters by cross-validation\ntuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n                     'C': [1, 10, 100, 1000]},\n                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n\nscores = ['precision', 'recall']\n\nfor score in scores:\n    print(\"# Tuning hyper-parameters for %s\" % score)\n    print()\n\n    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,\n                       scoring='%s_macro' % score)\n    clf.fit(X_train, y_train)\n\n    print(\"Best parameters set found on development set:\")\n    print()\n    print(clf.best_params_)\n    print()\n    print(\"Grid scores on development set:\")\n    print()\n    means = clf.cv_results_['mean_test_score']\n    stds = clf.cv_results_['std_test_score']\n    for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n        print(\"%0.3f (+/-%0.03f) for %r\"\n              % (mean, std * 2, params))\n    print()\n\n    print(\"Detailed classification report:\")\n    print()\n    print(\"The model is trained on the full development set.\")\n    print(\"The scores are computed on the full evaluation set.\")\n    print()\n    y_true, y_pred = y_test, clf.predict(X_test)\n    print(classification_report(y_true, y_pred))\n    print()\n\n# Note the problem is too easy: the hyperparameter plateau is too flat and the\n# output model is the same for precision and recall with ties in quality."
     ],
     "outputs": [],
     "metadata": {

dev/_downloads/grid_search_digits.py

Lines changed: 4 additions & 4 deletions
@@ -60,11 +60,11 @@
     print()
     print("Grid scores on development set:")
     print()
-    means = clf.results_['test_mean_score']
-    stds = clf.results_['test_std_score']
-    for i in range(len(clf.results_['params'])):
+    means = clf.cv_results_['mean_test_score']
+    stds = clf.cv_results_['std_test_score']
+    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
         print("%0.3f (+/-%0.03f) for %r"
-              % (means[i], stds[i] * 2, clf.results_['params'][i]))
+              % (mean, std * 2, params))
     print()

     print("Detailed classification report:")

dev/_downloads/plot_compare_reduction.ipynb

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
     "execution_count": null,
     "cell_type": "code",
     "source": [
-      "# Authors: Robert McGibbon, Joel Nothman\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n    ('reduce_dim', PCA()),\n    ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n    {\n        'reduce_dim': [PCA(iterated_power=7), NMF()],\n        'reduce_dim__n_components': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n    {\n        'reduce_dim': [SelectKBest(chi2)],\n        'reduce_dim__k': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.results_['test_mean_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n               (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\nplt.show()"
+      "# Authors: Robert McGibbon, Joel Nothman\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n    ('reduce_dim', PCA()),\n    ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n    {\n        'reduce_dim': [PCA(iterated_power=7), NMF()],\n        'reduce_dim__n_components': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n    {\n        'reduce_dim': [SelectKBest(chi2)],\n        'reduce_dim__k': N_FEATURES_OPTIONS,\n        'classify__C': C_OPTIONS\n    },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n               (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\nplt.show()"
     ],
     "outputs": [],
     "metadata": {

dev/_downloads/plot_compare_reduction.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@
 digits = load_digits()
 grid.fit(digits.data, digits.target)

-mean_scores = np.array(grid.results_['test_mean_score'])
+mean_scores = np.array(grid.cv_results_['mean_test_score'])
 # scores are in the order of param_grid iteration, which is alphabetical
 mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
 # select score for best C
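For readers of this diff: mean_test_score is a flat array with one entry per parameter combination, which is what lets the example reshape it into a (C, reducer, n_features) cube before taking the max over C. A hedged sketch of how one might verify that ordering, assuming the fitted grid from the example above:

# Each entry of cv_results_['params'] is the dict of one candidate's
# settings, aligned index-for-index with mean_test_score.
for params, score in zip(grid.cv_results_['params'],
                         grid.cv_results_['mean_test_score']):
    print("%.3f" % score, params)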

dev/_downloads/plot_rbf_parameters.ipynb

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@
     "execution_count": null,
     "cell_type": "code",
     "source": [
-      "plt.figure(figsize=(8, 6))\nxx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))\nfor (k, (C, gamma, clf)) in enumerate(classifiers):\n    # evaluate decision function in a grid\n    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n    Z = Z.reshape(xx.shape)\n\n    # visualize decision function for these parameters\n    plt.subplot(len(C_2d_range), len(gamma_2d_range), k + 1)\n    plt.title(\"gamma=10^%d, C=10^%d\" % (np.log10(gamma), np.log10(C)),\n              size='medium')\n\n    # visualize parameter's effect on decision function\n    plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu)\n    plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r)\n    plt.xticks(())\n    plt.yticks(())\n    plt.axis('tight')\n\nscores = grid.results_['test_mean_score'].reshape(len(C_range),\n                                                  len(gamma_range))\n\n# Draw heatmap of the validation accuracy as a function of gamma and C\n#\n# The score are encoded as colors with the hot colormap which varies from dark\n# red to bright yellow. As the most interesting scores are all located in the\n# 0.92 to 0.97 range we use a custom normalizer to set the mid-point to 0.92 so\n# as to make it easier to visualize the small variations of score values in the\n# interesting range while not brutally collapsing all the low score values to\n# the same color.\n\nplt.figure(figsize=(8, 6))\nplt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)\nplt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,\n           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))\nplt.xlabel('gamma')\nplt.ylabel('C')\nplt.colorbar()\nplt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)\nplt.yticks(np.arange(len(C_range)), C_range)\nplt.title('Validation accuracy')\nplt.show()"
+      "plt.figure(figsize=(8, 6))\nxx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))\nfor (k, (C, gamma, clf)) in enumerate(classifiers):\n    # evaluate decision function in a grid\n    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n    Z = Z.reshape(xx.shape)\n\n    # visualize decision function for these parameters\n    plt.subplot(len(C_2d_range), len(gamma_2d_range), k + 1)\n    plt.title(\"gamma=10^%d, C=10^%d\" % (np.log10(gamma), np.log10(C)),\n              size='medium')\n\n    # visualize parameter's effect on decision function\n    plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu)\n    plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r)\n    plt.xticks(())\n    plt.yticks(())\n    plt.axis('tight')\n\nscores = grid.cv_results_['mean_test_score'].reshape(len(C_range),\n                                                     len(gamma_range))\n\n# Draw heatmap of the validation accuracy as a function of gamma and C\n#\n# The score are encoded as colors with the hot colormap which varies from dark\n# red to bright yellow. As the most interesting scores are all located in the\n# 0.92 to 0.97 range we use a custom normalizer to set the mid-point to 0.92 so\n# as to make it easier to visualize the small variations of score values in the\n# interesting range while not brutally collapsing all the low score values to\n# the same color.\n\nplt.figure(figsize=(8, 6))\nplt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)\nplt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,\n           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))\nplt.xlabel('gamma')\nplt.ylabel('C')\nplt.colorbar()\nplt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)\nplt.yticks(np.arange(len(C_range)), C_range)\nplt.title('Validation accuracy')\nplt.show()"
     ],
     "outputs": [],
     "metadata": {

dev/_downloads/plot_rbf_parameters.py

Lines changed: 2 additions & 2 deletions
@@ -171,8 +171,8 @@ def __call__(self, value, clip=None):
     plt.yticks(())
     plt.axis('tight')

-scores = grid.results_['test_mean_score'].reshape(len(C_range),
-                                                  len(gamma_range))
+scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),
+                                                     len(gamma_range))

 # Draw heatmap of the validation accuracy as a function of gamma and C
 #

dev/_downloads/plot_svm_scale_c.ipynb

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
     "execution_count": null,
     "cell_type": "code",
     "source": [
-      "print(__doc__)\n\n\n# Author: Andreas Mueller <[email protected]>\n#         Jaques Grobler <[email protected]>\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.svm import LinearSVC\nfrom sklearn.model_selection import ShuffleSplit\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.utils import check_random_state\nfrom sklearn import datasets\n\nrnd = check_random_state(1)\n\n# set up dataset\nn_samples = 100\nn_features = 300\n\n# l1 data (only 5 informative features)\nX_1, y_1 = datasets.make_classification(n_samples=n_samples,\n                                        n_features=n_features, n_informative=5,\n                                        random_state=1)\n\n# l2 data: non sparse, but less features\ny_2 = np.sign(.5 - rnd.rand(n_samples))\nX_2 = rnd.randn(n_samples, n_features / 5) + y_2[:, np.newaxis]\nX_2 += 5 * rnd.randn(n_samples, n_features / 5)\n\nclf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,\n                       tol=1e-3),\n             np.logspace(-2.3, -1.3, 10), X_1, y_1),\n            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True,\n                       tol=1e-4),\n             np.logspace(-4.5, -2, 10), X_2, y_2)]\n\ncolors = ['navy', 'cyan', 'darkorange']\nlw = 2\n\nfor fignum, (clf, cs, X, y) in enumerate(clf_sets):\n    # set up the plot for each regressor\n    plt.figure(fignum, figsize=(9, 10))\n\n    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):\n        param_grid = dict(C=cs)\n        # To get nice curve, we need a large number of iterations to\n        # reduce the variance\n        grid = GridSearchCV(clf, refit=False, param_grid=param_grid,\n                            cv=ShuffleSplit(train_size=train_size,\n                                            n_splits=250, random_state=1))\n        grid.fit(X, y)\n        scores = grid.results_['test_mean_score']\n\n        scales = [(1, 'No scaling'),\n                  ((n_samples * train_size), '1/n_samples'),\n                  ]\n\n        for subplotnum, (scaler, name) in enumerate(scales):\n            plt.subplot(2, 1, subplotnum + 1)\n            plt.xlabel('C')\n            plt.ylabel('CV Score')\n            grid_cs = cs * float(scaler)  # scale the C's\n            plt.semilogx(grid_cs, scores, label=\"fraction %.2f\" %\n                         train_size, color=colors[k], lw=lw)\n            plt.title('scaling=%s, penalty=%s, loss=%s' %\n                      (name, clf.penalty, clf.loss))\n\n    plt.legend(loc=\"best\")\nplt.show()"
+      "print(__doc__)\n\n\n# Author: Andreas Mueller <[email protected]>\n#         Jaques Grobler <[email protected]>\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.svm import LinearSVC\nfrom sklearn.model_selection import ShuffleSplit\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.utils import check_random_state\nfrom sklearn import datasets\n\nrnd = check_random_state(1)\n\n# set up dataset\nn_samples = 100\nn_features = 300\n\n# l1 data (only 5 informative features)\nX_1, y_1 = datasets.make_classification(n_samples=n_samples,\n                                        n_features=n_features, n_informative=5,\n                                        random_state=1)\n\n# l2 data: non sparse, but less features\ny_2 = np.sign(.5 - rnd.rand(n_samples))\nX_2 = rnd.randn(n_samples, n_features / 5) + y_2[:, np.newaxis]\nX_2 += 5 * rnd.randn(n_samples, n_features / 5)\n\nclf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,\n                       tol=1e-3),\n             np.logspace(-2.3, -1.3, 10), X_1, y_1),\n            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True,\n                       tol=1e-4),\n             np.logspace(-4.5, -2, 10), X_2, y_2)]\n\ncolors = ['navy', 'cyan', 'darkorange']\nlw = 2\n\nfor fignum, (clf, cs, X, y) in enumerate(clf_sets):\n    # set up the plot for each regressor\n    plt.figure(fignum, figsize=(9, 10))\n\n    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):\n        param_grid = dict(C=cs)\n        # To get nice curve, we need a large number of iterations to\n        # reduce the variance\n        grid = GridSearchCV(clf, refit=False, param_grid=param_grid,\n                            cv=ShuffleSplit(train_size=train_size,\n                                            n_splits=250, random_state=1))\n        grid.fit(X, y)\n        scores = grid.cv_results_['mean_test_score']\n\n        scales = [(1, 'No scaling'),\n                  ((n_samples * train_size), '1/n_samples'),\n                  ]\n\n        for subplotnum, (scaler, name) in enumerate(scales):\n            plt.subplot(2, 1, subplotnum + 1)\n            plt.xlabel('C')\n            plt.ylabel('CV Score')\n            grid_cs = cs * float(scaler)  # scale the C's\n            plt.semilogx(grid_cs, scores, label=\"fraction %.2f\" %\n                         train_size, color=colors[k], lw=lw)\n            plt.title('scaling=%s, penalty=%s, loss=%s' %\n                      (name, clf.penalty, clf.loss))\n\n    plt.legend(loc=\"best\")\nplt.show()"
     ],
     "outputs": [],
     "metadata": {

dev/_downloads/plot_svm_scale_c.py

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@
                             cv=ShuffleSplit(train_size=train_size,
                                             n_splits=250, random_state=1))
         grid.fit(X, y)
-        scores = grid.results_['test_mean_score']
+        scores = grid.cv_results_['mean_test_score']

         scales = [(1, 'No scaling'),
                   ((n_samples * train_size), '1/n_samples'),
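One detail worth noting in this last example: the search is constructed with refit=False, so no final model is refit on the whole dataset and the cv_results_ table is the search's only output. A self-contained sketch of that mode (the toy data and grid below are assumptions, standing in for the example's):

import numpy as np
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.svm import LinearSVC

rng = np.random.RandomState(1)
X = rng.randn(60, 5)
y = rng.randint(0, 2, 60)  # two random classes, toy data only

grid = GridSearchCV(LinearSVC(dual=False), {'C': [0.1, 1.0]}, refit=False,
                    cv=ShuffleSplit(train_size=0.5, n_splits=10,
                                    random_state=1))
grid.fit(X, y)
# With refit=False there is no best_estimator_; the CV table is the result.
print(grid.cv_results_['mean_test_score'])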
