"""
=============================================================
Receiver Operating Characteristic (ROC) with cross validation
=============================================================

This example presents how to estimate and visualize the variance of the Receiver
Operating Characteristic (ROC) metric using cross-validation.

ROC curves typically feature true positive rate (TPR) on the Y axis, and false
positive rate (FPR) on the X axis. This means that the top left corner of the
plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
better. The "steepness" of ROC curves is also important, since it is ideal to
maximize the TPR while minimizing the FPR.

This example shows the ROC response of different datasets, created from K-fold
cross-validation. Taking all of these curves, it is possible to calculate the
mean AUC, and see the variance of the curve when the training set is split into
different subsets. This roughly shows how the classifier output is affected by
changes in the training data, and how different the splits generated by K-fold
cross-validation are from one another.

.. note::

    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
    complement of the present example explaining the averaging strategies to
    generalize the metrics for multiclass classifiers.

"""
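
# %%
# As a minimal sketch of the definitions above, we can trace a toy ROC curve by
# hand. The four labels and scores below are purely illustrative values: each
# threshold swept over the scores yields one (FPR, TPR) point, and the area
# under those points is the AUC.
from sklearn.metrics import auc, roc_curve

y_true = [0, 0, 1, 1]  # two negative and two positive samples
y_score = [0.1, 0.4, 0.35, 0.8]  # hypothetical classifier scores

fpr, tpr, thresholds = roc_curve(y_true, y_score)
print(f"FPR: {fpr}, TPR: {tpr}, AUC: {auc(fpr, tpr):.2f}")  # AUC = 0.75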

# %%
# Load and prepare data
# =====================
#
# We import the :ref:`iris_dataset` which contains 3 classes, each one
# corresponding to a type of iris plant. One class is linearly separable from
# the other 2; the latter are **not** linearly separable from each other.
#
# In the following we binarize the dataset by dropping the "virginica" class
# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
# regarded as the positive class and "setosa" as the negative class
# (`class_id=0`).

import numpy as np

from sklearn.datasets import load_iris

iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

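# %%
# A quick sanity check of the binarized labels (this printout is purely
# illustrative): only "setosa" (negative class) and "versicolor" (positive
# class) remain, with 50 samples each.
for name, count in zip(target_names[:2], np.bincount(y)):
    print(name, count)
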
# %%
# We also add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)
56 | 50 |
|
57 | 51 | # %%
|
58 | 52 | # Classification and ROC analysis
|
59 | 53 | # -------------------------------
#
# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
# plot the ROC curves fold-wise. Notice that the baseline to define the chance
# level (dashed ROC curve) is a classifier that would always predict the most
# frequent class; the closing cell below illustrates this baseline.

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
# Common grid of 100 evenly spaced FPR values; each fold's TPR curve is
# interpolated onto this grid so that the curves can be averaged point-wise.
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name=f"ROC fold {fold}",
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Interpolate this fold's ROC onto the common FPR grid and pin the first
    # point to 0 so that the fold curves can be averaged below.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()
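
# %%
# To illustrate the chance-level baseline mentioned above, the minimal sketch
# below scores a classifier that always predicts the most frequent class. Its
# constant probability estimates trace the diagonal ROC curve, i.e. an AUC of
# 0.5.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

# The dummy classifier ignores the features and always predicts the majority
# class, so its scores carry no ranking information.
dummy = DummyClassifier(strategy="most_frequent").fit(X, y)
chance_auc = roc_auc_score(y, dummy.predict_proba(X)[:, 1])
print(f"Most-frequent baseline AUC: {chance_auc:.1f}")  # 0.5 by construction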