
Commit c2fef4c

Pushing the docs to dev/ for branch: main, commit ff9344f3d8d11d38fa3a2497199113e5bac9537c

1 parent 8cc0bdf

File tree: 1,249 files changed (+5107, -4612 lines)


dev/_downloads/010337852815f8103ac6cca38a812b3c/plot_roc_crossval.py

Lines changed: 47 additions & 34 deletions
@@ -3,77 +3,88 @@
 Receiver Operating Characteristic (ROC) with cross validation
 =============================================================
 
-Example of Receiver Operating Characteristic (ROC) metric to evaluate
-classifier output quality using cross-validation.
+This example presents how to estimate and visualize the variance of the Receiver
+Operating Characteristic (ROC) metric using cross-validation.
 
-ROC curves typically feature true positive rate on the Y axis, and false
-positive rate on the X axis. This means that the top left corner of the plot is
-the "ideal" point - a false positive rate of zero, and a true positive rate of
-one. This is not very realistic, but it does mean that a larger area under the
-curve (AUC) is usually better.
-
-The "steepness" of ROC curves is also important, since it is ideal to maximize
-the true positive rate while minimizing the false positive rate.
+ROC curves typically feature true positive rate (TPR) on the Y axis, and false
+positive rate (FPR) on the X axis. This means that the top left corner of the
+plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
+realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
+better. The "steepness" of ROC curves is also important, since it is ideal to
+maximize the TPR while minimizing the FPR.
 
 This example shows the ROC response of different datasets, created from K-fold
 cross-validation. Taking all of these curves, it is possible to calculate the
-mean area under curve, and see the variance of the curve when the
+mean AUC, and see the variance of the curve when the
 training set is split into different subsets. This roughly shows how the
-classifier output is affected by changes in the training data, and how
-different the splits generated by K-fold cross-validation are from one another.
+classifier output is affected by changes in the training data, and how different
+the splits generated by K-fold cross-validation are from one another.
 
 .. note::
 
-    See also :func:`sklearn.metrics.roc_auc_score`,
-    :func:`sklearn.model_selection.cross_val_score`,
-    :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`,
-
+    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
+    complement of the present example explaining the averaging strategies to
+    generalize the metrics for multiclass classifiers.
 """
 
 # %%
-# Data IO and generation
-# ----------------------
-import numpy as np
+# Load and prepare data
+# =====================
+#
+# We import the :ref:`iris_dataset` which contains 3 classes, each one
+# corresponding to a type of iris plant. One class is linearly separable from
+# the other 2; the latter are **not** linearly separable from each other.
+#
+# In the following we binarize the dataset by dropping the "virginica" class
+# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
+# regarded as the positive class and "setosa" as the negative class
+# (`class_id=0`).
 
-from sklearn import datasets
+import numpy as np
+from sklearn.datasets import load_iris
 
-# Import some data to play with
-iris = datasets.load_iris()
-X = iris.data
-y = iris.target
+iris = load_iris()
+target_names = iris.target_names
+X, y = iris.data, iris.target
 X, y = X[y != 2], y[y != 2]
 n_samples, n_features = X.shape
 
-# Add noisy features
+# %%
+# We also add noisy features to make the problem harder.
 random_state = np.random.RandomState(0)
-X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
+X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)
 
 # %%
 # Classification and ROC analysis
 # -------------------------------
+#
+# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
+# plot the ROC curves fold-wise. Notice that the baseline to define the chance
+# level (dashed ROC curve) is a classifier that would always predict the most
+# frequent class.
+
 import matplotlib.pyplot as plt
 
 from sklearn import svm
 from sklearn.metrics import auc
 from sklearn.metrics import RocCurveDisplay
 from sklearn.model_selection import StratifiedKFold
 
-# Run classifier with cross-validation and plot ROC curves
 cv = StratifiedKFold(n_splits=6)
 classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)
 
 tprs = []
 aucs = []
 mean_fpr = np.linspace(0, 1, 100)
 
-fig, ax = plt.subplots()
-for i, (train, test) in enumerate(cv.split(X, y)):
+fig, ax = plt.subplots(figsize=(6, 6))
+for fold, (train, test) in enumerate(cv.split(X, y)):
     classifier.fit(X[train], y[train])
     viz = RocCurveDisplay.from_estimator(
         classifier,
         X[test],
         y[test],
-        name="ROC fold {}".format(i),
+        name=f"ROC fold {fold}",
         alpha=0.3,
         lw=1,
         ax=ax,
@@ -82,8 +93,7 @@
     interp_tpr[0] = 0.0
     tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
-
-ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
+ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
 
 mean_tpr = np.mean(tprs, axis=0)
 mean_tpr[-1] = 1.0
@@ -113,7 +123,10 @@
 ax.set(
     xlim=[-0.05, 1.05],
     ylim=[-0.05, 1.05],
-    title="Receiver operating characteristic example",
+    xlabel="False Positive Rate",
+    ylabel="True Positive Rate",
+    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
 )
+ax.axis("square")
 ax.legend(loc="lower right")
 plt.show()
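
The rewritten example labels its new dashed baseline as "a classifier that would always predict the most frequent class". As a quick sanity check of that claim (a minimal sketch, not part of this commit), scikit-learn's DummyClassifier can be scored on the same binarized iris data:

# Sketch only: verify that a most-frequent-class predictor has ROC AUC = 0.5.
from sklearn.datasets import load_iris
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

iris = load_iris()
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]  # same binarization as in the example

dummy = DummyClassifier(strategy="most_frequent").fit(X, y)
# Constant scores rank all samples equally, so the ROC curve is the diagonal.
print(roc_auc_score(y, dummy.predict_proba(X)[:, 1]))  # 0.5

Because the dummy's scores are constant, the result is 0.5 regardless of class balance, which matches the "chance level (AUC = 0.5)" label used in the plot.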

dev/_downloads/055e8313e28f2f3b5fd508054dfe5fe0/plot_roc_crossval.ipynb

Lines changed: 23 additions & 5 deletions
@@ -15,14 +15,14 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nExample of Receiver Operating Characteristic (ROC) metric to evaluate\nclassifier output quality using cross-validation.\n\nROC curves typically feature true positive rate on the Y axis, and false\npositive rate on the X axis. This means that the top left corner of the plot is\nthe \"ideal\" point - a false positive rate of zero, and a true positive rate of\none. This is not very realistic, but it does mean that a larger area under the\ncurve (AUC) is usually better.\n\nThe \"steepness\" of ROC curves is also important, since it is ideal to maximize\nthe true positive rate while minimizing the false positive rate.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean area under curve, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how\ndifferent the splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See also :func:`sklearn.metrics.roc_auc_score`,\n    :func:`sklearn.model_selection.cross_val_score`,\n    `sphx_glr_auto_examples_model_selection_plot_roc.py`,</p></div>\n"
+        "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nThis example presents how to estimate and visualize the variance of the Receiver\nOperating Characteristic (ROC) metric using cross-validation.\n\nROC curves typically feature true positive rate (TPR) on the Y axis, and false\npositive rate (FPR) on the X axis. This means that the top left corner of the\nplot is the \"ideal\" point - a FPR of zero, and a TPR of one. This is not very\nrealistic, but it does mean that a larger Area Under the Curve (AUC) is usually\nbetter. The \"steepness\" of ROC curves is also important, since it is ideal to\nmaximize the TPR while minimizing the FPR.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean AUC, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how different\nthe splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See `sphx_glr_auto_examples_model_selection_plot_roc.py` for a\n    complement of the present example explaining the averaging strategies to\n    generalize the metrics for multiclass classifiers.</p></div>\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Data IO and generation\n\n"
+        "## Load and prepare data\n\nWe import the `iris_dataset` which contains 3 classes, each one\ncorresponding to a type of iris plant. One class is linearly separable from\nthe other 2; the latter are **not** linearly separable from each other.\n\nIn the following we binarize the dataset by dropping the \"virginica\" class\n(`class_id=2`). This means that the \"versicolor\" class (`class_id=1`) is\nregarded as the positive class and \"setosa\" as the negative class\n(`class_id=0`).\n\n"
       ]
     },
     {
@@ -33,14 +33,14 @@
       },
       "outputs": [],
       "source": [
-        "import numpy as np\n\nfrom sklearn import datasets\n\n# Import some data to play with\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape\n\n# Add noisy features\nrandom_state = np.random.RandomState(0)\nX = np.c_[X, random_state.randn(n_samples, 200 * n_features)]"
+        "import numpy as np\nfrom sklearn.datasets import load_iris\n\niris = load_iris()\ntarget_names = iris.target_names\nX, y = iris.data, iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Classification and ROC analysis\n\n"
+        "We also add noisy features to make the problem harder.\n\n"
       ]
     },
     {
@@ -51,7 +51,25 @@
       },
       "outputs": [],
       "source": [
-        "import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\n# Run classifier with cross-validation and plot ROC curves\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots()\nfor i, (train, test) in enumerate(cv.split(X, y)):\n    classifier.fit(X[train], y[train])\n    viz = RocCurveDisplay.from_estimator(\n        classifier,\n        X[test],\n        y[test],\n        name=\"ROC fold {}\".format(i),\n        alpha=0.3,\n        lw=1,\n        ax=ax,\n    )\n    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n    interp_tpr[0] = 0.0\n    tprs.append(interp_tpr)\n    aucs.append(viz.roc_auc)\n\nax.plot([0, 1], [0, 1], linestyle=\"--\", lw=2, color=\"r\", label=\"Chance\", alpha=0.8)\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n    mean_fpr,\n    mean_tpr,\n    color=\"b\",\n    label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n    lw=2,\n    alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n    mean_fpr,\n    tprs_lower,\n    tprs_upper,\n    color=\"grey\",\n    alpha=0.2,\n    label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n    xlim=[-0.05, 1.05],\n    ylim=[-0.05, 1.05],\n    title=\"Receiver operating characteristic example\",\n)\nax.legend(loc=\"lower right\")\nplt.show()"
+        "random_state = np.random.RandomState(0)\nX = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Classification and ROC analysis\n\nHere we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and\nplot the ROC curves fold-wise. Notice that the baseline to define the chance\nlevel (dashed ROC curve) is a classifier that would always predict the most\nfrequent class.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots(figsize=(6, 6))\nfor fold, (train, test) in enumerate(cv.split(X, y)):\n    classifier.fit(X[train], y[train])\n    viz = RocCurveDisplay.from_estimator(\n        classifier,\n        X[test],\n        y[test],\n        name=f\"ROC fold {fold}\",\n        alpha=0.3,\n        lw=1,\n        ax=ax,\n    )\n    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n    interp_tpr[0] = 0.0\n    tprs.append(interp_tpr)\n    aucs.append(viz.roc_auc)\nax.plot([0, 1], [0, 1], \"k--\", label=\"chance level (AUC = 0.5)\")\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n    mean_fpr,\n    mean_tpr,\n    color=\"b\",\n    label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n    lw=2,\n    alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n    mean_fpr,\n    tprs_lower,\n    tprs_upper,\n    color=\"grey\",\n    alpha=0.2,\n    label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n    xlim=[-0.05, 1.05],\n    ylim=[-0.05, 1.05],\n    xlabel=\"False Positive Rate\",\n    ylabel=\"True Positive Rate\",\n    title=f\"Mean ROC curve with variability\\n(Positive label '{target_names[1]}')\",\n)\nax.axis(\"square\")\nax.legend(loc=\"lower right\")\nplt.show()"
       ]
     }
 ],
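
The plotting cell above depends on one subtle step: each fold's ROC curve is computed at its own thresholds, so the fold-wise TPR values are interpolated onto a shared FPR grid before the mean and standard deviation are taken. A standalone sketch of just that step (the two fold curves below are made-up toy numbers, not output from this commit):

import numpy as np

mean_fpr = np.linspace(0, 1, 100)  # shared grid, as in the example

# Hypothetical fold-wise (fpr, tpr) curves of unequal length.
fold_curves = [
    (np.array([0.0, 0.2, 0.5, 1.0]), np.array([0.0, 0.6, 0.9, 1.0])),
    (np.array([0.0, 0.1, 0.4, 0.8, 1.0]), np.array([0.0, 0.5, 0.8, 0.95, 1.0])),
]

tprs = []
for fpr, tpr in fold_curves:
    interp_tpr = np.interp(mean_fpr, fpr, tpr)  # resample onto the shared grid
    interp_tpr[0] = 0.0  # pin each curve to the origin
    tprs.append(interp_tpr)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the averaged curve to (1, 1)
std_tpr = np.std(tprs, axis=0)  # drives the grey "$\pm$ 1 std. dev." band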

dev/_downloads/2e4791a177381a6102b21e44083615c8/plot_poisson_regression_non_normal_loss.ipynb

Lines changed: 2 additions & 2 deletions
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "from sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import FunctionTransformer, OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler, KBinsDiscretizer\nfrom sklearn.compose import ColumnTransformer\n\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(np.log, validate=False), StandardScaler()\n)\n\nlinear_model_preprocessor = ColumnTransformer(\n    [\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\"binned_numeric\", KBinsDiscretizer(n_bins=10), [\"VehAge\", \"DrivAge\"]),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n    ],\n    remainder=\"drop\",\n)"
+        "from sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import FunctionTransformer, OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler, KBinsDiscretizer\nfrom sklearn.compose import ColumnTransformer\n\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(np.log, validate=False), StandardScaler()\n)\n\nlinear_model_preprocessor = ColumnTransformer(\n    [\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n    ],\n    remainder=\"drop\",\n)"
       ]
     },
     {
@@ -170,7 +170,7 @@
       },
       "outputs": [],
       "source": [
-        "from sklearn.linear_model import PoissonRegressor\n\nn_samples = df_train.shape[0]\n\npoisson_glm = Pipeline(\n    [\n        (\"preprocessor\", linear_model_preprocessor),\n        (\"regressor\", PoissonRegressor(alpha=1e-12, max_iter=300)),\n    ]\n)\npoisson_glm.fit(\n    df_train, df_train[\"Frequency\"], regressor__sample_weight=df_train[\"Exposure\"]\n)\n\nprint(\"PoissonRegressor evaluation:\")\nscore_estimator(poisson_glm, df_test)"
+        "from sklearn.linear_model import PoissonRegressor\n\nn_samples = df_train.shape[0]\n\npoisson_glm = Pipeline(\n    [\n        (\"preprocessor\", linear_model_preprocessor),\n        (\"regressor\", PoissonRegressor(alpha=1e-12, solver=\"newton-cholesky\")),\n    ]\n)\npoisson_glm.fit(\n    df_train, df_train[\"Frequency\"], regressor__sample_weight=df_train[\"Exposure\"]\n)\n\nprint(\"PoissonRegressor evaluation:\")\nscore_estimator(poisson_glm, df_test)"
       ]
     },
     {
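
Two estimator configurations change in this file: KBinsDiscretizer gains subsample=int(2e5) with random_state=0, and PoissonRegressor swaps max_iter=300 for solver="newton-cholesky". A minimal sketch of the updated pieces in isolation, on synthetic data (encode="ordinal" is chosen here only to keep the toy output dense; the example itself keeps the default one-hot encoding):

import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(1000, 2))      # two toy numeric features
y = rng.poisson(lam=np.exp(0.1 * X[:, 0]))  # non-negative counts

# subsample caps the rows used to fit the bin edges, bounding the cost of the
# quantile computation on large datasets; random_state makes it reproducible.
binner = KBinsDiscretizer(
    n_bins=10, encode="ordinal", subsample=int(2e5), random_state=0
)
X_binned = binner.fit_transform(X)

# "newton-cholesky" is a full-Newton solver, typically faster than the default
# LBFGS when n_samples is much larger than n_features.
glm = PoissonRegressor(alpha=1e-12, solver="newton-cholesky")
glm.fit(X_binned, y)
print(glm.score(X_binned, y))  # D^2, a deviance-based analogue of R^2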
