Skip to content

Commit 3d2243c

Browse files
committed
Pushing the docs to dev/ for branch: main, commit f5eb00ce903b11af74c1afc1f3115a5e840c668a
1 parent f253718 commit 3d2243c

File tree

1,225 files changed

+4644
-4491
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,225 files changed

+4644
-4491
lines changed
Binary file not shown.

dev/_downloads/27d42183163dfa32c3c487b21701b537/plot_cv_diabetes.ipynb

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,49 @@
1818
"\n# Cross-validation on diabetes Dataset Exercise\n\nA tutorial exercise which uses cross-validation with linear models.\n\nThis exercise is used in the `cv_estimators_tut` part of the\n`model_selection_tut` section of the `stat_learn_tut_index`.\n"
1919
]
2020
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"## Load dataset and apply GridSearchCV\n\n"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {
32+
"collapsed": false
33+
},
34+
"outputs": [],
35+
"source": [
36+
"import matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import GridSearchCV\n\nX, y = datasets.load_diabetes(return_X_y=True)\nX = X[:150]\ny = y[:150]\n\nlasso = Lasso(random_state=0, max_iter=10000)\nalphas = np.logspace(-4, -0.5, 30)\n\ntuned_parameters = [{\"alpha\": alphas}]\nn_folds = 5\n\nclf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)\nclf.fit(X, y)\nscores = clf.cv_results_[\"mean_test_score\"]\nscores_std = clf.cv_results_[\"std_test_score\"]"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"## Plot error lines showing +/- std. errors of the scores\n\n"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": null,
49+
"metadata": {
50+
"collapsed": false
51+
},
52+
"outputs": [],
53+
"source": [
54+
"plt.figure().set_size_inches(8, 6)\nplt.semilogx(alphas, scores)\n\nstd_error = scores_std / np.sqrt(n_folds)\n\nplt.semilogx(alphas, scores + std_error, \"b--\")\nplt.semilogx(alphas, scores - std_error, \"b--\")\n\n# alpha=0.2 controls the translucency of the fill color\nplt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)\n\nplt.ylabel(\"CV score +/- std error\")\nplt.xlabel(\"alpha\")\nplt.axhline(np.max(scores), linestyle=\"--\", color=\".5\")\nplt.xlim([alphas[0], alphas[-1]])"
55+
]
56+
},
57+
{
58+
"cell_type": "markdown",
59+
"metadata": {},
60+
"source": [
61+
"## Bonus: how much can you trust the selection of alpha?\n\n"
62+
]
63+
},
2164
{
2265
"cell_type": "code",
2366
"execution_count": null,
@@ -26,7 +69,7 @@
2669
},
2770
"outputs": [],
2871
"source": [
29-
"import numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import GridSearchCV\n\nX, y = datasets.load_diabetes(return_X_y=True)\nX = X[:150]\ny = y[:150]\n\nlasso = Lasso(random_state=0, max_iter=10000)\nalphas = np.logspace(-4, -0.5, 30)\n\ntuned_parameters = [{\"alpha\": alphas}]\nn_folds = 5\n\nclf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)\nclf.fit(X, y)\nscores = clf.cv_results_[\"mean_test_score\"]\nscores_std = clf.cv_results_[\"std_test_score\"]\nplt.figure().set_size_inches(8, 6)\nplt.semilogx(alphas, scores)\n\n# plot error lines showing +/- std. errors of the scores\nstd_error = scores_std / np.sqrt(n_folds)\n\nplt.semilogx(alphas, scores + std_error, \"b--\")\nplt.semilogx(alphas, scores - std_error, \"b--\")\n\n# alpha=0.2 controls the translucency of the fill color\nplt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)\n\nplt.ylabel(\"CV score +/- std error\")\nplt.xlabel(\"alpha\")\nplt.axhline(np.max(scores), linestyle=\"--\", color=\".5\")\nplt.xlim([alphas[0], alphas[-1]])\n\n# #############################################################################\n# Bonus: how much can you trust the selection of alpha?\n\n# To answer this question we use the LassoCV object that sets its alpha\n# parameter automatically from the data by internal cross-validation (i.e. it\n# performs cross-validation on the training data it receives).\n# We use external cross-validation to see how much the automatically obtained\n# alphas differ across different cross-validation folds.\nlasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000)\nk_fold = KFold(3)\n\nprint(\"Answer to the bonus question:\", \"how much can you trust the selection of alpha?\")\nprint()\nprint(\"Alpha parameters maximising the generalization score on different\")\nprint(\"subsets of the data:\")\nfor k, (train, test) in enumerate(k_fold.split(X, y)):\n lasso_cv.fit(X[train], y[train])\n print(\n \"[fold {0}] alpha: {1:.5f}, score: {2:.5f}\".format(\n k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])\n )\n )\nprint()\nprint(\"Answer: Not very much since we obtained different alphas for different\")\nprint(\"subsets of the data and moreover, the scores for these alphas differ\")\nprint(\"quite substantially.\")\n\nplt.show()"
72+
"# To answer this question we use the LassoCV object that sets its alpha\n# parameter automatically from the data by internal cross-validation (i.e. it\n# performs cross-validation on the training data it receives).\n# We use external cross-validation to see how much the automatically obtained\n# alphas differ across different cross-validation folds.\n\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.model_selection import KFold\n\nlasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000)\nk_fold = KFold(3)\n\nprint(\"Answer to the bonus question:\", \"how much can you trust the selection of alpha?\")\nprint()\nprint(\"Alpha parameters maximising the generalization score on different\")\nprint(\"subsets of the data:\")\nfor k, (train, test) in enumerate(k_fold.split(X, y)):\n lasso_cv.fit(X[train], y[train])\n print(\n \"[fold {0}] alpha: {1:.5f}, score: {2:.5f}\".format(\n k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])\n )\n )\nprint()\nprint(\"Answer: Not very much since we obtained different alphas for different\")\nprint(\"subsets of the data and moreover, the scores for these alphas differ\")\nprint(\"quite substantially.\")\n\nplt.show()"
3073
]
3174
}
3275
],

dev/_downloads/428a26d23bc55d1c898a0e4361695ad0/plot_cv_diabetes.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@
1010
1111
"""
1212

13-
import numpy as np
13+
# %%
14+
# Load dataset and apply GridSearchCV
15+
# -----------------------------------
1416
import matplotlib.pyplot as plt
17+
import numpy as np
1518

1619
from sklearn import datasets
17-
from sklearn.linear_model import LassoCV
1820
from sklearn.linear_model import Lasso
19-
from sklearn.model_selection import KFold
2021
from sklearn.model_selection import GridSearchCV
2122

2223
X, y = datasets.load_diabetes(return_X_y=True)
@@ -33,10 +34,14 @@
3334
clf.fit(X, y)
3435
scores = clf.cv_results_["mean_test_score"]
3536
scores_std = clf.cv_results_["std_test_score"]
37+
38+
# %%
39+
# Plot error lines showing +/- std. errors of the scores
40+
# ------------------------------------------------------
41+
3642
plt.figure().set_size_inches(8, 6)
3743
plt.semilogx(alphas, scores)
3844

39-
# plot error lines showing +/- std. errors of the scores
4045
std_error = scores_std / np.sqrt(n_folds)
4146

4247
plt.semilogx(alphas, scores + std_error, "b--")
@@ -50,14 +55,19 @@
5055
plt.axhline(np.max(scores), linestyle="--", color=".5")
5156
plt.xlim([alphas[0], alphas[-1]])
5257

53-
# #############################################################################
58+
# %%
5459
# Bonus: how much can you trust the selection of alpha?
60+
# -----------------------------------------------------
5561

5662
# To answer this question we use the LassoCV object that sets its alpha
5763
# parameter automatically from the data by internal cross-validation (i.e. it
5864
# performs cross-validation on the training data it receives).
5965
# We use external cross-validation to see how much the automatically obtained
6066
# alphas differ across different cross-validation folds.
67+
68+
from sklearn.linear_model import LassoCV
69+
from sklearn.model_selection import KFold
70+
6171
lasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000)
6272
k_fold = KFold(3)
6373

Binary file not shown.

dev/_downloads/scikit-learn-docs.zip

-8.16 KB
Binary file not shown.
-208 Bytes
-189 Bytes
-99 Bytes
-16 Bytes
-71 Bytes

0 commit comments

Comments
 (0)