Skip to content

Commit 07c7e77

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 581752012c43e4838d920c33aad019e2cc24b40c
1 parent fc277a9 commit 07c7e77

File tree

1,056 files changed

+4178
-3357
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

1,056 files changed

+4178
-3357
lines changed
4.72 KB
Binary file not shown.
3.73 KB
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Balance model complexity and cross-validated score\n\n\nThis example balances model complexity and cross-validated score by\nfinding a decent accuracy within 1 standard deviation of the best accuracy\nscore while minimising the number of PCA components [1].\n\nThe figure shows the trade-off between cross-validated score and the number\nof PCA components. The balanced case is when n_components=6 and accuracy=0.80,\nwhich falls into the range within 1 standard deviation of the best accuracy\nscore.\n\n[1] Hastie, T., Tibshirani, R.,, Friedman, J. (2001). Model Assessment and\nSelection. The Elements of Statistical Learning (pp. 219-260). New York,\nNY, USA: Springer New York Inc..\n\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Wenhao Zhang <[email protected]>\n\nprint(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_digits\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\n\n\ndef lower_bound(cv_results):\n \"\"\"\n Calculate the lower bound within 1 standard deviation\n of the best `mean_test_scores`.\n\n Parameters\n ----------\n cv_results : dict of numpy(masked) ndarrays\n See attribute cv_results_ of `GridSearchCV`\n\n Returns\n -------\n float\n Lower bound within 1 standard deviation of the\n best `mean_test_score`.\n \"\"\"\n best_score_idx = np.argmax(cv_results['mean_test_score'])\n\n return (cv_results['mean_test_score'][best_score_idx]\n - cv_results['std_test_score'][best_score_idx])\n\n\ndef best_low_complexity(cv_results):\n \"\"\"\n Balance model complexity with cross-validated score.\n\n Parameters\n ----------\n cv_results : dict of numpy(masked) ndarrays\n See attribute cv_results_ of `GridSearchCV`.\n\n Return\n ------\n int\n Index of a model that has the fewest PCA components\n while has its test score within 1 standard deviation of the best\n `mean_test_score`.\n \"\"\"\n threshold = lower_bound(cv_results)\n candidate_idx = np.flatnonzero(cv_results['mean_test_score'] >= threshold)\n best_idx = candidate_idx[cv_results['param_reduce_dim__n_components']\n [candidate_idx].argmin()]\n return best_idx\n\n\npipe = Pipeline([\n ('reduce_dim', PCA(random_state=42)),\n ('classify', LinearSVC(random_state=42)),\n])\n\nparam_grid = {\n 'reduce_dim__n_components': [2, 4, 6, 8]\n}\n\ngrid = GridSearchCV(pipe, cv=10, n_jobs=1, param_grid=param_grid,\n scoring='accuracy', refit=best_low_complexity)\ndigits = load_digits()\ngrid.fit(digits.data, digits.target)\n\nn_components = grid.cv_results_['param_reduce_dim__n_components']\ntest_scores = 
grid.cv_results_['mean_test_score']\n\nplt.figure()\nplt.bar(n_components, test_scores, width=1.3, color='b')\n\nlower = lower_bound(grid.cv_results_)\nplt.axhline(np.max(test_scores), linestyle='--', color='y',\n label='Best score')\nplt.axhline(lower, linestyle='--', color='.5', label='Best score - 1 std')\n\nplt.title(\"Balance model complexity and cross-validated score\")\nplt.xlabel('Number of PCA components used')\nplt.ylabel('Digit classification accuracy')\nplt.xticks(n_components.tolist())\nplt.ylim((0, 1.0))\nplt.legend(loc='upper left')\n\nbest_index_ = grid.best_index_\n\nprint(\"The best_index_ is %d\" % best_index_)\nprint(\"The n_components selected is %d\" % n_components[best_index_])\nprint(\"The corresponding accuracy score is %.2f\"\n % grid.cv_results_['mean_test_score'][best_index_])\nplt.show()"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name": "Python 3",
36+
"language": "python",
37+
"name": "python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name": "ipython",
42+
"version": 3
43+
},
44+
"file_extension": ".py",
45+
"mimetype": "text/x-python",
46+
"name": "python",
47+
"nbconvert_exporter": "python",
48+
"pygments_lexer": "ipython3",
49+
"version": "3.6.8"
50+
}
51+
},
52+
"nbformat": 4,
53+
"nbformat_minor": 0
54+
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""
2+
==================================================
3+
Balance model complexity and cross-validated score
4+
==================================================
5+
6+
This example balances model complexity and cross-validated score by
7+
finding a decent accuracy within 1 standard deviation of the best accuracy
8+
score while minimising the number of PCA components [1].
9+
10+
The figure shows the trade-off between cross-validated score and the number
11+
of PCA components. The balanced case is when n_components=6 and accuracy=0.80,
12+
which falls into the range within 1 standard deviation of the best accuracy
13+
score.
14+
15+
[1] Hastie, T., Tibshirani, R., Friedman, J. (2001). Model Assessment and
16+
Selection. The Elements of Statistical Learning (pp. 219-260). New York,
17+
NY, USA: Springer New York Inc..
18+
"""
19+
# Author: Wenhao Zhang <[email protected]>
20+
21+
print(__doc__)
22+
23+
import numpy as np
24+
import matplotlib.pyplot as plt
25+
26+
from sklearn.datasets import load_digits
27+
from sklearn.decomposition import PCA
28+
from sklearn.model_selection import GridSearchCV
29+
from sklearn.pipeline import Pipeline
30+
from sklearn.svm import LinearSVC
31+
32+
33+
def lower_bound(cv_results):
    """Compute the score threshold one standard deviation below the best.

    Parameters
    ----------
    cv_results : dict of numpy(masked) ndarrays
        See attribute cv_results_ of `GridSearchCV`.

    Returns
    -------
    float
        The best ``mean_test_score`` minus that entry's own
        ``std_test_score`` — the lower bound of the acceptable range.
    """
    mean_scores = cv_results['mean_test_score']
    std_scores = cv_results['std_test_score']
    # Locate the top-scoring configuration, then step one of its own
    # standard deviations below it.
    best = np.argmax(mean_scores)
    return mean_scores[best] - std_scores[best]
53+
54+
55+
def best_low_complexity(cv_results):
    """Balance model complexity with cross-validated score.

    Selects the grid-search entry that uses the fewest PCA components
    among all entries whose ``mean_test_score`` clears the
    one-standard-deviation threshold computed by ``lower_bound``.

    Parameters
    ----------
    cv_results : dict of numpy(masked) ndarrays
        See attribute cv_results_ of `GridSearchCV`.

    Returns
    -------
    int
        Index of a model that has the fewest PCA components while its
        test score is within 1 standard deviation of the best
        ``mean_test_score``.
    """
    acceptable = lower_bound(cv_results)
    # Every configuration whose mean score clears the threshold.
    candidates = np.flatnonzero(cv_results['mean_test_score'] >= acceptable)
    # Among those candidates, keep the one with the fewest components.
    components = cv_results['param_reduce_dim__n_components'][candidates]
    return candidates[components.argmin()]
76+
77+
78+
# ---------------------------------------------------------------------------
# Run the grid search and visualise the complexity / accuracy trade-off.
# ---------------------------------------------------------------------------
digits = load_digits()

pipe = Pipeline([
    ('reduce_dim', PCA(random_state=42)),
    ('classify', LinearSVC(random_state=42)),
])

# Only the number of PCA components is searched over.
param_grid = {'reduce_dim__n_components': [2, 4, 6, 8]}

# ``refit`` accepts a callable, so the refitted estimator is the one chosen
# by ``best_low_complexity`` rather than simply the top scorer.
grid = GridSearchCV(pipe, cv=10, n_jobs=1, param_grid=param_grid,
                    scoring='accuracy', refit=best_low_complexity)
grid.fit(digits.data, digits.target)

n_components = grid.cv_results_['param_reduce_dim__n_components']
test_scores = grid.cv_results_['mean_test_score']

# Bar chart of accuracy per component count, plus two reference lines:
# the best score and the best score minus one standard deviation.
plt.figure()
plt.bar(n_components, test_scores, width=1.3, color='b')

lower = lower_bound(grid.cv_results_)
plt.axhline(np.max(test_scores), linestyle='--', color='y',
            label='Best score')
plt.axhline(lower, linestyle='--', color='.5', label='Best score - 1 std')

plt.title("Balance model complexity and cross-validated score")
plt.xlabel('Number of PCA components used')
plt.ylabel('Digit classification accuracy')
plt.xticks(n_components.tolist())
plt.ylim((0, 1.0))
plt.legend(loc='upper left')

best_index_ = grid.best_index_

# Report which configuration the custom refit rule selected.
print("The best_index_ is %d" % best_index_)
print("The n_components selected is %d" % n_components[best_index_])
print("The corresponding accuracy score is %.2f"
      % grid.cv_results_['mean_test_score'][best_index_])
plt.show()

dev/_downloads/scikit-learn-docs.pdf

18 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes

0 commit comments

Comments
 (0)