
Commit 38fb3eb

committed: Pushing the docs to dev/ for branch: master, commit 69eb4d4678604147a8ed144b08b837a352fe7801
1 parent ffb47f0 commit 38fb3eb

File tree: 1,098 files changed (+3357 / -3374 lines)

Two binary files changed (-183 bytes and -180 bytes); binary content not shown.

dev/_downloads/plot_column_transformer_mixed_types.ipynb

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
+"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
 ]
 }
 ],

dev/_downloads/plot_column_transformer_mixed_types.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@
     'classifier__C': [0.1, 1.0, 10, 100],
 }

-grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
+grid_search = GridSearchCV(clf, param_grid, cv=10)
 grid_search.fit(X_train, y_train)

 print(("best logistic regression from grid search: %.3f"
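Every hunk in this commit makes the same mechanical change: the iid keyword is dropped from GridSearchCV, since that parameter was deprecated in scikit-learn and test scores are now simply averaged over the cv folds. A self-contained sketch of the updated call pattern follows; the pipeline and data are illustrative stand-ins, not taken from the example files.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Illustrative stand-in for the example's `clf` pipeline
X, y = load_iris(return_X_y=True)
clf = Pipeline([('scale', StandardScaler()),
                ('classifier', LogisticRegression(max_iter=1000))])

param_grid = {'classifier__C': [0.1, 1.0, 10, 100]}

# Note: no iid argument; recent scikit-learn releases reject it
grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search.fit(X, y)
print("best CV score: %.3f" % grid_search.best_score_)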

dev/_downloads/plot_compare_reduction.ipynb

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n # the reduce_dim stage is populated by the param_grid\n ('reduce_dim', 'passthrough'),\n ('classify', LinearSVC(dual=False, max_iter=10000))\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid, iid=False)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
+"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n # the reduce_dim stage is populated by the param_grid\n ('reduce_dim', 'passthrough'),\n ('classify', LinearSVC(dual=False, max_iter=10000))\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
 ]
 },
 {
@@ -51,7 +51,7 @@
 },
 "outputs": [],
 "source": [
-"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom joblib import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(___location=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC(dual=False, max_iter=10000))],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid,\n iid=False)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
+"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom joblib import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(___location=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC(dual=False, max_iter=10000))],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
 ]
 },
 {

dev/_downloads/plot_compare_reduction.py

Lines changed: 2 additions & 3 deletions
@@ -63,7 +63,7 @@
 ]
 reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

-grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid, iid=False)
+grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)

@@ -114,8 +114,7 @@
                        memory=memory)

 # This time, a cached pipeline will be used within the grid search
-grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid,
-                    iid=False)
+grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)
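The second hunk above keeps the joblib Memory caching from the original example: fitted transformers are stored in a temporary folder so identical PCA fits are reused across parameter combinations during the grid search. A self-contained sketch of that caching pattern, with a trimmed, illustrative parameter grid rather than the one used in the example:

from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Cache fitted transformers in a temporary folder so repeated PCA fits
# are reused instead of recomputed for every value of C.
cachedir = mkdtemp()
memory = Memory(___location=cachedir, verbose=0)

cached_pipe = Pipeline([('reduce_dim', PCA()),
                        ('classify', LinearSVC(dual=False, max_iter=10000))],
                       memory=memory)

param_grid = {'reduce_dim__n_components': [2, 4, 8],
              'classify__C': [1, 10]}

grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)  # no iid
X, y = load_digits(return_X_y=True)
grid.fit(X, y)

rmtree(cachedir)  # delete the temporary cache before exiting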

dev/_downloads/plot_digits_kde_sampling.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_digits\nfrom sklearn.neighbors import KernelDensity\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import GridSearchCV\n\n# load the data\ndigits = load_digits()\n\n# project the 64-dimensional data to a lower dimension\npca = PCA(n_components=15, whiten=False)\ndata = pca.fit_transform(digits.data)\n\n# use grid search cross-validation to optimize the bandwidth\nparams = {'bandwidth': np.logspace(-1, 1, 20)}\ngrid = GridSearchCV(KernelDensity(), params, cv=5, iid=False)\ngrid.fit(data)\n\nprint(\"best bandwidth: {0}\".format(grid.best_estimator_.bandwidth))\n\n# use the best estimator to compute the kernel density estimate\nkde = grid.best_estimator_\n\n# sample 44 new points from the data\nnew_data = kde.sample(44, random_state=0)\nnew_data = pca.inverse_transform(new_data)\n\n# turn data into a 4x11 grid\nnew_data = new_data.reshape((4, 11, -1))\nreal_data = digits.data[:44].reshape((4, 11, -1))\n\n# plot real digits and resampled digits\nfig, ax = plt.subplots(9, 11, subplot_kw=dict(xticks=[], yticks=[]))\nfor j in range(11):\n ax[4, j].set_visible(False)\n for i in range(4):\n im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n\nax[0, 5].set_title('Selection from the input data')\nax[5, 5].set_title('\"New\" digits drawn from the kernel density model')\n\nplt.show()"
+"import numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_digits\nfrom sklearn.neighbors import KernelDensity\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import GridSearchCV\n\n# load the data\ndigits = load_digits()\n\n# project the 64-dimensional data to a lower dimension\npca = PCA(n_components=15, whiten=False)\ndata = pca.fit_transform(digits.data)\n\n# use grid search cross-validation to optimize the bandwidth\nparams = {'bandwidth': np.logspace(-1, 1, 20)}\ngrid = GridSearchCV(KernelDensity(), params, cv=5)\ngrid.fit(data)\n\nprint(\"best bandwidth: {0}\".format(grid.best_estimator_.bandwidth))\n\n# use the best estimator to compute the kernel density estimate\nkde = grid.best_estimator_\n\n# sample 44 new points from the data\nnew_data = kde.sample(44, random_state=0)\nnew_data = pca.inverse_transform(new_data)\n\n# turn data into a 4x11 grid\nnew_data = new_data.reshape((4, 11, -1))\nreal_data = digits.data[:44].reshape((4, 11, -1))\n\n# plot real digits and resampled digits\nfig, ax = plt.subplots(9, 11, subplot_kw=dict(xticks=[], yticks=[]))\nfor j in range(11):\n ax[4, j].set_visible(False)\n for i in range(4):\n im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)),\n cmap=plt.cm.binary, interpolation='nearest')\n im.set_clim(0, 16)\n\nax[0, 5].set_title('Selection from the input data')\nax[5, 5].set_title('\"New\" digits drawn from the kernel density model')\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_digits_kde_sampling.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@

 # use grid search cross-validation to optimize the bandwidth
 params = {'bandwidth': np.logspace(-1, 1, 20)}
-grid = GridSearchCV(KernelDensity(), params, cv=5, iid=False)
+grid = GridSearchCV(KernelDensity(), params, cv=5)
 grid.fit(data)

 print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
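Note that the kernel-density example fits GridSearchCV without target labels: KernelDensity exposes a score method that returns the total log-likelihood of held-out data, so the search simply maximizes it across folds. A minimal sketch of this unsupervised bandwidth search on synthetic data (the random 2-D sample is illustrative, not the digits data):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
data = rng.normal(size=(200, 2))  # illustrative 2-D sample

params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)  # no iid argument
grid.fit(data)  # unsupervised: no y is passed
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))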

dev/_downloads/plot_digits_pipe.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"print(__doc__)\n\n\n# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\n\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\nlogistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,\n max_iter=10000, tol=1e-5, random_state=0)\npca = PCA()\npipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\n\n# Parameters of pipelines can be set using \u2018__\u2019 separated parameter names:\nparam_grid = {\n 'pca__n_components': [5, 20, 30, 40, 50, 64],\n 'logistic__alpha': np.logspace(-4, 4, 5),\n}\nsearch = GridSearchCV(pipe, param_grid, iid=False, cv=5)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(pca.explained_variance_ratio_, linewidth=2)\nax0.set_ylabel('PCA explained variance')\n\nax0.axvline(search.best_estimator_.named_steps['pca'].n_components,\n linestyle=':', label='n_components chosen')\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\nresults = pd.DataFrame(search.cv_results_)\ncomponents_col = 'param_pca__n_components'\nbest_clfs = results.groupby(components_col).apply(\n lambda g: g.nlargest(1, 'mean_test_score'))\n\nbest_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',\n legend=False, ax=ax1)\nax1.set_ylabel('Classification accuracy (val)')\nax1.set_xlabel('n_components')\n\nplt.tight_layout()\nplt.show()"
+"print(__doc__)\n\n\n# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\n\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\nlogistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,\n max_iter=10000, tol=1e-5, random_state=0)\npca = PCA()\npipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\n\n# Parameters of pipelines can be set using \u2018__\u2019 separated parameter names:\nparam_grid = {\n 'pca__n_components': [5, 20, 30, 40, 50, 64],\n 'logistic__alpha': np.logspace(-4, 4, 5),\n}\nsearch = GridSearchCV(pipe, param_grid, cv=5)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(pca.explained_variance_ratio_, linewidth=2)\nax0.set_ylabel('PCA explained variance')\n\nax0.axvline(search.best_estimator_.named_steps['pca'].n_components,\n linestyle=':', label='n_components chosen')\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\nresults = pd.DataFrame(search.cv_results_)\ncomponents_col = 'param_pca__n_components'\nbest_clfs = results.groupby(components_col).apply(\n lambda g: g.nlargest(1, 'mean_test_score'))\n\nbest_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',\n legend=False, ax=ax1)\nax1.set_ylabel('Classification accuracy (val)')\nax1.set_xlabel('n_components')\n\nplt.tight_layout()\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_digits_pipe.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@
     'pca__n_components': [5, 20, 30, 40, 50, 64],
     'logistic__alpha': np.logspace(-4, 4, 5),
 }
-search = GridSearchCV(pipe, param_grid, iid=False, cv=5)
+search = GridSearchCV(pipe, param_grid, cv=5)
 search.fit(X_digits, y_digits)
 print("Best parameter (CV score=%0.3f):" % search.best_score_)
 print(search.best_params_)
