
Commit 102ce23

Pushing the docs to dev/ for branch: master, commit f158e2dfe2af1b23ae3f9d86c598013b2c155c3f
1 parent 1345693 commit 102ce23

1,089 files changed: +4,027 −3,382 lines changed

(two 3-byte binary files, not shown)

dev/_downloads/plot_compare_reduction.ipynb

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@
},
"outputs": [],
"source": [
- removed (line 36):
"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n ('reduce_dim', PCA()),\n ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
+ added (line 36):
"# Authors: Robert McGibbon, Joel Nothman, Guillaume Lemaitre\n\nfrom __future__ import print_function, division\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.decomposition import PCA, NMF\nfrom sklearn.feature_selection import SelectKBest, chi2\n\nprint(__doc__)\n\npipe = Pipeline([\n ('reduce_dim', PCA()),\n ('classify', LinearSVC())\n])\n\nN_FEATURES_OPTIONS = [2, 4, 8]\nC_OPTIONS = [1, 10, 100, 1000]\nparam_grid = [\n {\n 'reduce_dim': [PCA(iterated_power=7), NMF()],\n 'reduce_dim__n_components': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n {\n 'reduce_dim': [SelectKBest(chi2)],\n 'reduce_dim__k': N_FEATURES_OPTIONS,\n 'classify__C': C_OPTIONS\n },\n]\nreducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n\ngrid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nmean_scores = np.array(grid.cv_results_['mean_test_score'])\n# scores are in the order of param_grid iteration, which is alphabetical\nmean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))\n# select score for best C\nmean_scores = mean_scores.max(axis=0)\nbar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *\n (len(reducer_labels) + 1) + .5)\n\nplt.figure()\nCOLORS = 'bgrcmyk'\nfor i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):\n plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])\n\nplt.title(\"Comparing feature reduction techniques\")\nplt.xlabel('Reduced number of features')\nplt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)\nplt.ylabel('Digit classification accuracy')\nplt.ylim((0, 1))\nplt.legend(loc='upper left')\n\nplt.show()"
]
},
{
@@ -51,7 +51,7 @@
},
"outputs": [],
"source": [
- removed (line 54):
"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom sklearn.utils import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(cachedir=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC())],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
+ added (line 54):
"from tempfile import mkdtemp\nfrom shutil import rmtree\nfrom sklearn.utils import Memory\n\n# Create a temporary folder to store the transformers of the pipeline\ncachedir = mkdtemp()\nmemory = Memory(cachedir=cachedir, verbose=10)\ncached_pipe = Pipeline([('reduce_dim', PCA()),\n ('classify', LinearSVC())],\n memory=memory)\n\n# This time, a cached pipeline will be used within the grid search\ngrid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\n# Delete the temporary cache before exiting\nrmtree(cachedir)"
]
},
{
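
Both hunks in this notebook only change the cv argument of GridSearchCV from 3 to 5 folds. As a minimal sketch (not part of this commit, and pared down from the example: NMF and SelectKBest are dropped and the grid is smaller, which are assumptions made for brevity), the grid-search-over-a-pipeline pattern being updated looks roughly like this:

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Dimensionality reduction followed by a linear classifier
pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])

# Reduced parameter grid, for brevity
param_grid = {'reduce_dim__n_components': [2, 4, 8],
              'classify__C': [1, 10, 100]}

# cv is passed explicitly (5 folds), matching the updated example
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
digits = load_digits()
grid.fit(digits.data, digits.target)
print(grid.best_params_, grid.best_score_)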

dev/_downloads/plot_compare_reduction.py

Lines changed: 2 additions & 2 deletions
@@ -63,7 +63,7 @@
 ]
 reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

-grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
+grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)

@@ -114,7 +114,7 @@
                        memory=memory)

 # This time, a cached pipeline will be used within the grid search
-grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid)
+grid = GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)
 digits = load_digits()
 grid.fit(digits.data, digits.target)
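
The second hunk's context lines show the cached variant of the pipeline. A minimal sketch of that caching idea follows; it is not taken from the commit, it uses joblib.Memory directly, and the location keyword is an assumption about a reasonably recent joblib release (the example itself passes cachedir=, an older spelling):

from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

cachedir = mkdtemp()                           # throwaway directory for the cache
memory = Memory(location=cachedir, verbose=0)

# Fitted transformers are memoized, so a grid search over the classifier's
# parameters does not refit PCA for every candidate value of C.
cached_pipe = Pipeline([('reduce_dim', PCA()),
                        ('classify', LinearSVC())],
                       memory=memory)

# ... run the grid search with cached_pipe, as in the example ...

rmtree(cachedir)                               # remove the cache when done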

dev/_downloads/plot_gradient_boosting_oob.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
- removed (line 29):
"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\n\n\n# Generate data (adapted from G. Ridgeway's gbm example)\nn_samples = 1000\nrandom_state = np.random.RandomState(13)\nx1 = random_state.uniform(size=n_samples)\nx2 = random_state.uniform(size=n_samples)\nx3 = random_state.randint(0, 4, size=n_samples)\n\np = 1 / (1.0 + np.exp(-(np.sin(3 * x1) - 4 * x2 + x3)))\ny = random_state.binomial(1, p, size=n_samples)\n\nX = np.c_[x1, x2, x3]\n\nX = X.astype(np.float32)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,\n random_state=9)\n\n# Fit classifier with out-of-bag estimates\nparams = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,\n 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}\nclf = ensemble.GradientBoostingClassifier(**params)\n\nclf.fit(X_train, y_train)\nacc = clf.score(X_test, y_test)\nprint(\"Accuracy: {:.4f}\".format(acc))\n\nn_estimators = params['n_estimators']\nx = np.arange(n_estimators) + 1\n\n\ndef heldout_score(clf, X_test, y_test):\n \"\"\"compute deviance scores on ``X_test`` and ``y_test``. \"\"\"\n score = np.zeros((n_estimators,), dtype=np.float64)\n for i, y_pred in enumerate(clf.staged_decision_function(X_test)):\n score[i] = clf.loss_(y_test, y_pred)\n return score\n\n\ndef cv_estimate(n_splits=3):\n cv = KFold(n_splits=n_splits)\n cv_clf = ensemble.GradientBoostingClassifier(**params)\n val_scores = np.zeros((n_estimators,), dtype=np.float64)\n for train, test in cv.split(X_train, y_train):\n cv_clf.fit(X_train[train], y_train[train])\n val_scores += heldout_score(cv_clf, X_train[test], y_train[test])\n val_scores /= n_splits\n return val_scores\n\n\n# Estimate best n_estimator using cross-validation\ncv_score = cv_estimate(3)\n\n# Compute best n_estimator for test data\ntest_score = heldout_score(clf, X_test, y_test)\n\n# negative cumulative sum of oob improvements\ncumsum = -np.cumsum(clf.oob_improvement_)\n\n# min loss according to OOB\noob_best_iter = x[np.argmin(cumsum)]\n\n# min loss according to test (normalize such that first loss is 0)\ntest_score -= test_score[0]\ntest_best_iter = x[np.argmin(test_score)]\n\n# min loss according to cv (normalize such that first loss is 0)\ncv_score -= cv_score[0]\ncv_best_iter = x[np.argmin(cv_score)]\n\n# color brew for the three curves\noob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))\ntest_color = list(map(lambda x: x / 256.0, (127, 201, 127)))\ncv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))\n\n# plot curves and vertical lines for best iterations\nplt.plot(x, cumsum, label='OOB loss', color=oob_color)\nplt.plot(x, test_score, label='Test loss', color=test_color)\nplt.plot(x, cv_score, label='CV loss', color=cv_color)\nplt.axvline(x=oob_best_iter, color=oob_color)\nplt.axvline(x=test_best_iter, color=test_color)\nplt.axvline(x=cv_best_iter, color=cv_color)\n\n# add three vertical lines to xticks\nxticks = plt.xticks()\nxticks_pos = np.array(xticks[0].tolist() +\n [oob_best_iter, cv_best_iter, test_best_iter])\nxticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +\n ['OOB', 'CV', 'Test'])\nind = np.argsort(xticks_pos)\nxticks_pos = xticks_pos[ind]\nxticks_label = xticks_label[ind]\nplt.xticks(xticks_pos, xticks_label)\n\nplt.legend(loc='upper right')\nplt.ylabel('normalized loss')\nplt.xlabel('number of 
iterations')\n\nplt.show()"
+ added (line 29):
"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\n\n\n# Generate data (adapted from G. Ridgeway's gbm example)\nn_samples = 1000\nrandom_state = np.random.RandomState(13)\nx1 = random_state.uniform(size=n_samples)\nx2 = random_state.uniform(size=n_samples)\nx3 = random_state.randint(0, 4, size=n_samples)\n\np = 1 / (1.0 + np.exp(-(np.sin(3 * x1) - 4 * x2 + x3)))\ny = random_state.binomial(1, p, size=n_samples)\n\nX = np.c_[x1, x2, x3]\n\nX = X.astype(np.float32)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,\n random_state=9)\n\n# Fit classifier with out-of-bag estimates\nparams = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,\n 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}\nclf = ensemble.GradientBoostingClassifier(**params)\n\nclf.fit(X_train, y_train)\nacc = clf.score(X_test, y_test)\nprint(\"Accuracy: {:.4f}\".format(acc))\n\nn_estimators = params['n_estimators']\nx = np.arange(n_estimators) + 1\n\n\ndef heldout_score(clf, X_test, y_test):\n \"\"\"compute deviance scores on ``X_test`` and ``y_test``. \"\"\"\n score = np.zeros((n_estimators,), dtype=np.float64)\n for i, y_pred in enumerate(clf.staged_decision_function(X_test)):\n score[i] = clf.loss_(y_test, y_pred)\n return score\n\n\ndef cv_estimate(n_splits=None):\n cv = KFold(n_splits=n_splits)\n cv_clf = ensemble.GradientBoostingClassifier(**params)\n val_scores = np.zeros((n_estimators,), dtype=np.float64)\n for train, test in cv.split(X_train, y_train):\n cv_clf.fit(X_train[train], y_train[train])\n val_scores += heldout_score(cv_clf, X_train[test], y_train[test])\n val_scores /= n_splits\n return val_scores\n\n\n# Estimate best n_estimator using cross-validation\ncv_score = cv_estimate(3)\n\n# Compute best n_estimator for test data\ntest_score = heldout_score(clf, X_test, y_test)\n\n# negative cumulative sum of oob improvements\ncumsum = -np.cumsum(clf.oob_improvement_)\n\n# min loss according to OOB\noob_best_iter = x[np.argmin(cumsum)]\n\n# min loss according to test (normalize such that first loss is 0)\ntest_score -= test_score[0]\ntest_best_iter = x[np.argmin(test_score)]\n\n# min loss according to cv (normalize such that first loss is 0)\ncv_score -= cv_score[0]\ncv_best_iter = x[np.argmin(cv_score)]\n\n# color brew for the three curves\noob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))\ntest_color = list(map(lambda x: x / 256.0, (127, 201, 127)))\ncv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))\n\n# plot curves and vertical lines for best iterations\nplt.plot(x, cumsum, label='OOB loss', color=oob_color)\nplt.plot(x, test_score, label='Test loss', color=test_color)\nplt.plot(x, cv_score, label='CV loss', color=cv_color)\nplt.axvline(x=oob_best_iter, color=oob_color)\nplt.axvline(x=test_best_iter, color=test_color)\nplt.axvline(x=cv_best_iter, color=cv_color)\n\n# add three vertical lines to xticks\nxticks = plt.xticks()\nxticks_pos = np.array(xticks[0].tolist() +\n [oob_best_iter, cv_best_iter, test_best_iter])\nxticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +\n ['OOB', 'CV', 'Test'])\nind = np.argsort(xticks_pos)\nxticks_pos = xticks_pos[ind]\nxticks_label = xticks_label[ind]\nplt.xticks(xticks_pos, xticks_label)\n\nplt.legend(loc='upper right')\nplt.ylabel('normalized loss')\nplt.xlabel('number of 
iterations')\n\nplt.show()"
]
}
],
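
The source above estimates the best number of boosting iterations by tracking held-out loss per stage through staged_decision_function and the estimator's internal loss_ attribute. A minimal sketch of the same idea using only public API follows; it is not part of this commit, and make_classification plus log_loss over staged_predict_proba are substitutions chosen to keep it self-contained:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05,
                                 random_state=0)
clf.fit(X_train, y_train)

# One held-out log-loss value per boosting iteration
heldout = np.array([log_loss(y_test, proba)
                    for proba in clf.staged_predict_proba(X_test)])
print("best iteration:", heldout.argmin() + 1)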

dev/_downloads/plot_gradient_boosting_oob.py

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ def heldout_score(clf, X_test, y_test):
     return score


-def cv_estimate(n_splits=3):
+def cv_estimate(n_splits=None):
     cv = KFold(n_splits=n_splits)
     cv_clf = ensemble.GradientBoostingClassifier(**params)
     val_scores = np.zeros((n_estimators,), dtype=np.float64)
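
This hunk only changes the default of cv_estimate's n_splits parameter; the example still calls cv_estimate(3), so the fold count is passed explicitly at the call site. A minimal, self-contained sketch of that explicit-KFold pattern (not from the commit; make_classification and the small estimator settings are assumptions):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=300, random_state=0)

cv = KFold(n_splits=5)                         # fold count passed explicitly
scores = []
for train, test in cv.split(X, y):
    clf = GradientBoostingClassifier(n_estimators=50, random_state=0)
    clf.fit(X[train], y[train])
    scores.append(clf.score(X[test], y[test]))

print("mean CV accuracy:", np.mean(scores))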

dev/_downloads/scikit-learn-docs.pdf

-455 KB
Binary file not shown.

dev/_images/digicosme.png

11.1 KB

dev/_images/iris.png

0 Bytes

0 commit comments
