
Commit a9392be

Pushing the docs to dev/ for branch: master, commit a8efe008767c9398ee375140e99d581559319bef
1 parent f8514d2 commit a9392be

File tree

1,104 files changed (+3,110 / -4,263 lines)

-339 Bytes (binary file not shown)
-323 Bytes (binary file not shown)
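The substantive change repeated across the example files in this commit is switching the digits examples from Bunch attribute access to the `return_X_y=True` shortcut of the dataset loaders. As a minimal sketch of the two equivalent forms (illustration only, not part of the diff itself):

from sklearn.datasets import load_digits

# Older pattern: load a Bunch object and read its attributes.
digits = load_digits()
X_old, y_old = digits.data, digits.target

# Pattern used throughout this commit: unpack data and target directly.
X, y = load_digits(return_X_y=True)

assert (X == X_old).all() and (y == y_old).all()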

dev/_downloads/plot_compare_reduction.ipynb

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
- removed:
"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n # the reduce_dim stage is populated by the param_grid\n ('reduce_dim', 'passthrough'),\n ('classify', LinearSVC(dual=False, max_iter=10000))\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
+ added:
"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n # the reduce_dim stage is populated by the param_grid\n ('reduce_dim', 'passthrough'),\n ('classify', LinearSVC(dual=False, max_iter=10000))\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)\nX, y = load_digits(return_X_y=True)\ngrid.fit(X, y)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
 ]
 },
 {
@@ -51,7 +51,7 @@
 },
 "outputs": [],
 "source": [
- removed:
"from joblib import Memory\nfrom shutil import rmtree\n\n# Create a temporary folder to store the transformers of the pipeline\n___location = 'cachedir'\nmemory = Memory(___location=___location, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC(dual=False, max_iter=10000))],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nmemory.clear(warn=False)\nrmtree(___location)"
+ added:
"from joblib import Memory\nfrom shutil import rmtree\n\n# Create a temporary folder to store the transformers of the pipeline\n___location = 'cachedir'\nmemory = Memory(___location=___location, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC(dual=False, max_iter=10000))],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\n\n\n# Delete the temporary cache before exiting\nmemory.clear(warn=False)\nrmtree(___location)"
 ]
 },
 {

dev/_downloads/plot_compare_reduction.py

Lines changed: 3 additions & 5 deletions
@@ -64,8 +64,8 @@
 reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

 grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)
-digits = load_digits()
-grid.fit(digits.data, digits.target)
+X, y = load_digits(return_X_y=True)
+grid.fit(X, y)

 mean_scores = np.array(grid.cv_results_['mean_test_score'])
 # scores are in the order of param_grid iteration, which is alphabetical
@@ -113,9 +113,7 @@
 memory=memory)

 # This time, a cached pipeline will be used within the grid search
-grid = GridSearchCV(cached_pipe, n_jobs=1, param_grid=param_grid)
-digits = load_digits()
-grid.fit(digits.data, digits.target)
+

 # Delete the temporary cache before exiting
 memory.clear(warn=False)
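The second hunk above drops the grid-search call from the cached-pipeline section of this example. For context, the point of passing memory= to Pipeline is that fitted transformers are memoized on disk, so identical (transformer, parameters, data) combinations are not refitted. A minimal, hypothetical sketch of that behaviour (not part of this commit) might look like:

from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

cachedir = mkdtemp()
memory = Memory(___location=cachedir, verbose=0)

# Transformer fits are cached on disk under `cachedir`.
cached_pipe = Pipeline([('reduce_dim', PCA(n_components=8)),
                        ('classify', LinearSVC(dual=False, max_iter=10000))],
                       memory=memory)

X, y = load_digits(return_X_y=True)
cached_pipe.fit(X, y)   # PCA is fitted and its output cached
cached_pipe.fit(X, y)   # the cached PCA fit is reused

# Clean up the cache directory when done.
memory.clear(warn=False)
rmtree(cachedir)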

dev/_downloads/plot_cv_digits.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
- removed:
"print(__doc__)\n\n\nimport numpy as np\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn import datasets, svm\n\ndigits = datasets.load_digits()\nX = digits.data\ny = digits.target\n\nsvc = svm.SVC(kernel='linear')\nC_s = np.logspace(-10, 0, 10)\n\nscores = list()\nscores_std = list()\nfor C in C_s:\n svc.C = C\n this_scores = cross_val_score(svc, X, y, n_jobs=1)\n scores.append(np.mean(this_scores))\n scores_std.append(np.std(this_scores))\n\n# Do the plotting\nimport matplotlib.pyplot as plt\nplt.figure()\nplt.semilogx(C_s, scores)\nplt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')\nplt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')\nlocs, labels = plt.yticks()\nplt.yticks(locs, list(map(lambda x: \"%g\" % x, locs)))\nplt.ylabel('CV score')\nplt.xlabel('Parameter C')\nplt.ylim(0, 1.1)\nplt.show()"
+ added:
"print(__doc__)\n\n\nimport numpy as np\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn import datasets, svm\n\nX, y = datasets.load_digits(return_X_y=True)\n\nsvc = svm.SVC(kernel='linear')\nC_s = np.logspace(-10, 0, 10)\n\nscores = list()\nscores_std = list()\nfor C in C_s:\n svc.C = C\n this_scores = cross_val_score(svc, X, y, n_jobs=1)\n scores.append(np.mean(this_scores))\n scores_std.append(np.std(this_scores))\n\n# Do the plotting\nimport matplotlib.pyplot as plt\nplt.figure()\nplt.semilogx(C_s, scores)\nplt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')\nplt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')\nlocs, labels = plt.yticks()\nplt.yticks(locs, list(map(lambda x: \"%g\" % x, locs)))\nplt.ylabel('CV score')\nplt.xlabel('Parameter C')\nplt.ylim(0, 1.1)\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_cv_digits.py

Lines changed: 1 addition & 3 deletions
@@ -15,9 +15,7 @@
 from sklearn.model_selection import cross_val_score
 from sklearn import datasets, svm

-digits = datasets.load_digits()
-X = digits.data
-y = digits.target
+X, y = datasets.load_digits(return_X_y=True)

 svc = svm.SVC(kernel='linear')
 C_s = np.logspace(-10, 0, 10)

dev/_downloads/plot_digits_classification_exercise.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
- removed:
"print(__doc__)\n\nfrom sklearn import datasets, neighbors, linear_model\n\ndigits = datasets.load_digits()\nX_digits = digits.data / digits.data.max()\ny_digits = digits.target\n\nn_samples = len(X_digits)\n\nX_train = X_digits[:int(.9 * n_samples)]\ny_train = y_digits[:int(.9 * n_samples)]\nX_test = X_digits[int(.9 * n_samples):]\ny_test = y_digits[int(.9 * n_samples):]\n\nknn = neighbors.KNeighborsClassifier()\nlogistic = linear_model.LogisticRegression(max_iter=1000)\n\nprint('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))\nprint('LogisticRegression score: %f'\n % logistic.fit(X_train, y_train).score(X_test, y_test))"
+ added:
"print(__doc__)\n\nfrom sklearn import datasets, neighbors, linear_model\n\nX_digits, y_digits = datasets.load_digits(return_X_y=True)\nX_digits = X_digits / X_digits.max()\n\nn_samples = len(X_digits)\n\nX_train = X_digits[:int(.9 * n_samples)]\ny_train = y_digits[:int(.9 * n_samples)]\nX_test = X_digits[int(.9 * n_samples):]\ny_test = y_digits[int(.9 * n_samples):]\n\nknn = neighbors.KNeighborsClassifier()\nlogistic = linear_model.LogisticRegression(max_iter=1000)\n\nprint('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))\nprint('LogisticRegression score: %f'\n % logistic.fit(X_train, y_train).score(X_test, y_test))"
 ]
 }
 ],

dev/_downloads/plot_digits_classification_exercise.py

Lines changed: 2 additions & 3 deletions
@@ -14,9 +14,8 @@

 from sklearn import datasets, neighbors, linear_model

-digits = datasets.load_digits()
-X_digits = digits.data / digits.data.max()
-y_digits = digits.target
+X_digits, y_digits = datasets.load_digits(return_X_y=True)
+X_digits = X_digits / X_digits.max()

 n_samples = len(X_digits)

dev/_downloads/plot_digits_linkage.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
- removed:
"# Authors: Gael Varoquaux\n# License: BSD 3 clause (C) INRIA 2014\n\nprint(__doc__)\nfrom time import time\n\nimport numpy as np\nfrom scipy import ndimage\nfrom matplotlib import pyplot as plt\n\nfrom sklearn import manifold, datasets\n\ndigits = datasets.load_digits(n_class=10)\nX = digits.data\ny = digits.target\nn_samples, n_features = X.shape\n\nnp.random.seed(0)\n\ndef nudge_images(X, y):\n # Having a larger dataset shows more clearly the behavior of the\n # methods, but we multiply the size of the dataset only by 2, as the\n # cost of the hierarchical clustering methods are strongly\n # super-linear in n_samples\n shift = lambda x: ndimage.shift(x.reshape((8, 8)),\n .3 * np.random.normal(size=2),\n mode='constant',\n ).ravel()\n X = np.concatenate([X, np.apply_along_axis(shift, 1, X)])\n Y = np.concatenate([y, y], axis=0)\n return X, Y\n\n\nX, y = nudge_images(X, y)\n\n\n#----------------------------------------------------------------------\n# Visualize the clustering\ndef plot_clustering(X_red, labels, title=None):\n x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)\n X_red = (X_red - x_min) / (x_max - x_min)\n\n plt.figure(figsize=(6, 4))\n for i in range(X_red.shape[0]):\n plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),\n color=plt.cm.nipy_spectral(labels[i] / 10.),\n fontdict={'weight': 'bold', 'size': 9})\n\n plt.xticks([])\n plt.yticks([])\n if title is not None:\n plt.title(title, size=17)\n plt.axis('off')\n plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n\n#----------------------------------------------------------------------\n# 2D embedding of the digits dataset\nprint(\"Computing embedding\")\nX_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)\nprint(\"Done.\")\n\nfrom sklearn.cluster import AgglomerativeClustering\n\nfor linkage in ('ward', 'average', 'complete', 'single'):\n clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)\n t0 = time()\n clustering.fit(X_red)\n print(\"%s :\\t%.2fs\" % (linkage, time() - t0))\n\n plot_clustering(X_red, clustering.labels_, \"%s linkage\" % linkage)\n\n\nplt.show()"
+ added:
"# Authors: Gael Varoquaux\n# License: BSD 3 clause (C) INRIA 2014\n\nprint(__doc__)\nfrom time import time\n\nimport numpy as np\nfrom scipy import ndimage\nfrom matplotlib import pyplot as plt\n\nfrom sklearn import manifold, datasets\n\nX, y = datasets.load_digits(return_X_y=True)\nn_samples, n_features = X.shape\n\nnp.random.seed(0)\n\ndef nudge_images(X, y):\n # Having a larger dataset shows more clearly the behavior of the\n # methods, but we multiply the size of the dataset only by 2, as the\n # cost of the hierarchical clustering methods are strongly\n # super-linear in n_samples\n shift = lambda x: ndimage.shift(x.reshape((8, 8)),\n .3 * np.random.normal(size=2),\n mode='constant',\n ).ravel()\n X = np.concatenate([X, np.apply_along_axis(shift, 1, X)])\n Y = np.concatenate([y, y], axis=0)\n return X, Y\n\n\nX, y = nudge_images(X, y)\n\n\n#----------------------------------------------------------------------\n# Visualize the clustering\ndef plot_clustering(X_red, labels, title=None):\n x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)\n X_red = (X_red - x_min) / (x_max - x_min)\n\n plt.figure(figsize=(6, 4))\n for i in range(X_red.shape[0]):\n plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),\n color=plt.cm.nipy_spectral(labels[i] / 10.),\n fontdict={'weight': 'bold', 'size': 9})\n\n plt.xticks([])\n plt.yticks([])\n if title is not None:\n plt.title(title, size=17)\n plt.axis('off')\n plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n\n#----------------------------------------------------------------------\n# 2D embedding of the digits dataset\nprint(\"Computing embedding\")\nX_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)\nprint(\"Done.\")\n\nfrom sklearn.cluster import AgglomerativeClustering\n\nfor linkage in ('ward', 'average', 'complete', 'single'):\n clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)\n t0 = time()\n clustering.fit(X_red)\n print(\"%s :\\t%.2fs\" % (linkage, time() - t0))\n\n plot_clustering(X_red, clustering.labels_, \"%s linkage\" % linkage)\n\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_digits_linkage.py

Lines changed: 1 addition & 3 deletions
@@ -30,9 +30,7 @@

 from sklearn import manifold, datasets

-digits = datasets.load_digits(n_class=10)
-X = digits.data
-y = digits.target
+X, y = datasets.load_digits(return_X_y=True)
 n_samples, n_features = X.shape

 np.random.seed(0)
