Commit 8cc0bdf

Pushing the docs to dev/ for branch: main, commit 2335a8e831b44b28f30cedb355bab6ac4803ef8b
1 parent 413e29c commit 8cc0bdf

1,261 files changed (+4529 / -4609 lines)


dev/_downloads/010337852815f8103ac6cca38a812b3c/plot_roc_crossval.py

Lines changed: 34 additions & 47 deletions
@@ -3,88 +3,77 @@
 Receiver Operating Characteristic (ROC) with cross validation
 =============================================================
 
-This example presents how to estimate and visualize the variance of the Receiver
-Operating Characteristic (ROC) metric using cross-validation.
+Example of Receiver Operating Characteristic (ROC) metric to evaluate
+classifier output quality using cross-validation.
 
-ROC curves typically feature true positive rate (TPR) on the Y axis, and false
-positive rate (FPR) on the X axis. This means that the top left corner of the
-plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
-realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
-better. The "steepness" of ROC curves is also important, since it is ideal to
-maximize the TPR while minimizing the FPR.
+ROC curves typically feature true positive rate on the Y axis, and false
+positive rate on the X axis. This means that the top left corner of the plot is
+the "ideal" point - a false positive rate of zero, and a true positive rate of
+one. This is not very realistic, but it does mean that a larger area under the
+curve (AUC) is usually better.
+
+The "steepness" of ROC curves is also important, since it is ideal to maximize
+the true positive rate while minimizing the false positive rate.
 
 This example shows the ROC response of different datasets, created from K-fold
 cross-validation. Taking all of these curves, it is possible to calculate the
-mean AUC, and see the variance of the curve when the
+mean area under curve, and see the variance of the curve when the
 training set is split into different subsets. This roughly shows how the
-classifier output is affected by changes in the training data, and how different
-the splits generated by K-fold cross-validation are from one another.
+classifier output is affected by changes in the training data, and how
+different the splits generated by K-fold cross-validation are from one another.
 
 .. note::
 
-    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
-    complement of the present example explaining the averaging strategies to
-    generalize the metrics for multiclass classifiers.
+    See also :func:`sklearn.metrics.roc_auc_score`,
+    :func:`sklearn.model_selection.cross_val_score`,
+    :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`,
+
 """
 
 # %%
-# Load and prepare data
-# =====================
-#
-# We import the :ref:`iris_dataset` which contains 3 classes, each one
-# corresponding to a type of iris plant. One class is linearly separable from
-# the other 2; the latter are **not** linearly separable from each other.
-#
-# In the following we binarize the dataset by dropping the "virginica" class
-# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
-# regarded as the positive class and "setosa" as the negative class
-# (`class_id=0`).
-
+# Data IO and generation
+# ----------------------
 import numpy as np
-from sklearn.datasets import load_iris
 
-iris = load_iris()
-target_names = iris.target_names
-X, y = iris.data, iris.target
+from sklearn import datasets
+
+# Import some data to play with
+iris = datasets.load_iris()
+X = iris.data
+y = iris.target
 X, y = X[y != 2], y[y != 2]
 n_samples, n_features = X.shape
 
-# %%
-# We also add noisy features to make the problem harder.
+# Add noisy features
 random_state = np.random.RandomState(0)
-X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)
+X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
 
 # %%
 # Classification and ROC analysis
 # -------------------------------
-#
-# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
-# plot the ROC curves fold-wise. Notice that the baseline to define the chance
-# level (dashed ROC curve) is a classifier that would always predict the most
-# frequent class.
-
 import matplotlib.pyplot as plt
 
 from sklearn import svm
 from sklearn.metrics import auc
 from sklearn.metrics import RocCurveDisplay
 from sklearn.model_selection import StratifiedKFold
 
+# Run classifier with cross-validation and plot ROC curves
 cv = StratifiedKFold(n_splits=6)
 classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)
 
 tprs = []
 aucs = []
 mean_fpr = np.linspace(0, 1, 100)
 
-fig, ax = plt.subplots(figsize=(6, 6))
-for fold, (train, test) in enumerate(cv.split(X, y)):
+fig, ax = plt.subplots()
+for i, (train, test) in enumerate(cv.split(X, y)):
     classifier.fit(X[train], y[train])
     viz = RocCurveDisplay.from_estimator(
         classifier,
         X[test],
         y[test],
-        name=f"ROC fold {fold}",
+        name="ROC fold {}".format(i),
         alpha=0.3,
         lw=1,
         ax=ax,
@@ -93,7 +82,8 @@
     interp_tpr[0] = 0.0
     tprs.append(interp_tpr)
     aucs.append(viz.roc_auc)
-ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
+
+ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
 
 mean_tpr = np.mean(tprs, axis=0)
 mean_tpr[-1] = 1.0
@@ -123,10 +113,7 @@
 ax.set(
     xlim=[-0.05, 1.05],
     ylim=[-0.05, 1.05],
-    xlabel="False Positive Rate",
-    ylabel="True Positive Rate",
-    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
+    title="Receiver operating characteristic example",
 )
-ax.axis("square")
 ax.legend(loc="lower right")
 plt.show()
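The mean-curve logic is the same on both sides of this diff: per-fold TPRs are interpolated onto a shared FPR grid, averaged, and summarized as a mean AUC with its standard deviation. Below is a minimal, self-contained sketch of just that aggregation step. It is not the example file itself: it swaps the binarized iris data for a synthetic `make_classification` problem (an assumption for brevity) and computes the per-fold curves with `roc_curve` instead of `RocCurveDisplay`, so plotting can be left out.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# Synthetic binary problem standing in for the binarized iris data.
X, y = make_classification(n_samples=200, n_features=20, random_state=0)

mean_fpr = np.linspace(0, 1, 100)  # shared FPR grid for every fold
tprs, aucs = [], []
for train, test in StratifiedKFold(n_splits=6).split(X, y):
    clf = SVC(kernel="linear", probability=True, random_state=0)
    clf.fit(X[train], y[train])
    fpr, tpr, _ = roc_curve(y[test], clf.predict_proba(X[test])[:, 1])
    interp_tpr = np.interp(mean_fpr, fpr, tpr)  # resample fold curve onto the grid
    interp_tpr[0] = 0.0  # anchor each curve at (0, 0)
    tprs.append(interp_tpr)
    aucs.append(auc(fpr, tpr))

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # anchor the mean curve at (1, 1)
print("Mean AUC = %0.2f +/- %0.2f" % (auc(mean_fpr, mean_tpr), np.std(aucs)))

The spread of `aucs`, like the one-standard-deviation band the example fills between `tprs_lower` and `tprs_upper`, is what shows how sensitive the classifier is to the particular K-fold split.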

dev/_downloads/055e8313e28f2f3b5fd508054dfe5fe0/plot_roc_crossval.ipynb

Lines changed: 5 additions & 23 deletions
@@ -15,14 +15,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nThis example presents how to estimate and visualize the variance of the Receiver\nOperating Characteristic (ROC) metric using cross-validation.\n\nROC curves typically feature true positive rate (TPR) on the Y axis, and false\npositive rate (FPR) on the X axis. This means that the top left corner of the\nplot is the \"ideal\" point - a FPR of zero, and a TPR of one. This is not very\nrealistic, but it does mean that a larger Area Under the Curve (AUC) is usually\nbetter. The \"steepness\" of ROC curves is also important, since it is ideal to\nmaximize the TPR while minimizing the FPR.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean AUC, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how different\nthe splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See `sphx_glr_auto_examples_model_selection_plot_roc.py` for a\n    complement of the present example explaining the averaging strategies to\n    generalize the metrics for multiclass classifiers.</p></div>\n"
+    "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nExample of Receiver Operating Characteristic (ROC) metric to evaluate\nclassifier output quality using cross-validation.\n\nROC curves typically feature true positive rate on the Y axis, and false\npositive rate on the X axis. This means that the top left corner of the plot is\nthe \"ideal\" point - a false positive rate of zero, and a true positive rate of\none. This is not very realistic, but it does mean that a larger area under the\ncurve (AUC) is usually better.\n\nThe \"steepness\" of ROC curves is also important, since it is ideal to maximize\nthe true positive rate while minimizing the false positive rate.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean area under curve, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how\ndifferent the splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See also :func:`sklearn.metrics.roc_auc_score`,\n    :func:`sklearn.model_selection.cross_val_score`,\n    `sphx_glr_auto_examples_model_selection_plot_roc.py`,</p></div>\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load and prepare data\n\nWe import the `iris_dataset` which contains 3 classes, each one\ncorresponding to a type of iris plant. One class is linearly separable from\nthe other 2; the latter are **not** linearly separable from each other.\n\nIn the following we binarize the dataset by dropping the \"virginica\" class\n(`class_id=2`). This means that the \"versicolor\" class (`class_id=1`) is\nregarded as the positive class and \"setosa\" as the negative class\n(`class_id=0`).\n\n"
+    "## Data IO and generation\n\n"
    ]
   },
   {
@@ -33,14 +33,14 @@
    },
    "outputs": [],
    "source": [
-    "import numpy as np\nfrom sklearn.datasets import load_iris\n\niris = load_iris()\ntarget_names = iris.target_names\nX, y = iris.data, iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape"
+    "import numpy as np\n\nfrom sklearn import datasets\n\n# Import some data to play with\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape\n\n# Add noisy features\nrandom_state = np.random.RandomState(0)\nX = np.c_[X, random_state.randn(n_samples, 200 * n_features)]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We also add noisy features to make the problem harder.\n\n"
+    "## Classification and ROC analysis\n\n"
    ]
   },
   {
@@ -51,25 +51,7 @@
    },
    "outputs": [],
    "source": [
-    "random_state = np.random.RandomState(0)\nX = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Classification and ROC analysis\n\nHere we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and\nplot the ROC curves fold-wise. Notice that the baseline to define the chance\nlevel (dashed ROC curve) is a classifier that would always predict the most\nfrequent class.\n\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots(figsize=(6, 6))\nfor fold, (train, test) in enumerate(cv.split(X, y)):\n    classifier.fit(X[train], y[train])\n    viz = RocCurveDisplay.from_estimator(\n        classifier,\n        X[test],\n        y[test],\n        name=f\"ROC fold {fold}\",\n        alpha=0.3,\n        lw=1,\n        ax=ax,\n    )\n    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n    interp_tpr[0] = 0.0\n    tprs.append(interp_tpr)\n    aucs.append(viz.roc_auc)\nax.plot([0, 1], [0, 1], \"k--\", label=\"chance level (AUC = 0.5)\")\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n    mean_fpr,\n    mean_tpr,\n    color=\"b\",\n    label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n    lw=2,\n    alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n    mean_fpr,\n    tprs_lower,\n    tprs_upper,\n    color=\"grey\",\n    alpha=0.2,\n    label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n    xlim=[-0.05, 1.05],\n    ylim=[-0.05, 1.05],\n    xlabel=\"False Positive Rate\",\n    ylabel=\"True Positive Rate\",\n    title=f\"Mean ROC curve with variability\\n(Positive label '{target_names[1]}')\",\n)\nax.axis(\"square\")\nax.legend(loc=\"lower right\")\nplt.show()"
+    "import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\n# Run classifier with cross-validation and plot ROC curves\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots()\nfor i, (train, test) in enumerate(cv.split(X, y)):\n    classifier.fit(X[train], y[train])\n    viz = RocCurveDisplay.from_estimator(\n        classifier,\n        X[test],\n        y[test],\n        name=\"ROC fold {}\".format(i),\n        alpha=0.3,\n        lw=1,\n        ax=ax,\n    )\n    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n    interp_tpr[0] = 0.0\n    tprs.append(interp_tpr)\n    aucs.append(viz.roc_auc)\n\nax.plot([0, 1], [0, 1], linestyle=\"--\", lw=2, color=\"r\", label=\"Chance\", alpha=0.8)\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n    mean_fpr,\n    mean_tpr,\n    color=\"b\",\n    label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n    lw=2,\n    alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n    mean_fpr,\n    tprs_lower,\n    tprs_upper,\n    color=\"grey\",\n    alpha=0.2,\n    label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n    xlim=[-0.05, 1.05],\n    ylim=[-0.05, 1.05],\n    title=\"Receiver operating characteristic example\",\n)\nax.legend(loc=\"lower right\")\nplt.show()"
    ]
   }
  ],
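The notebook diff tracks the script diff cell for cell because the .ipynb is generated from the .py: sphinx-gallery turns the module docstring and each `# %%` comment block into markdown cells, and the code between them into code cells. The following is a rough, hypothetical miniature of that mapping, written only to illustrate why the two files move in lockstep; it is not sphinx-gallery's actual converter, which handles many more cases.

import re

def split_cells(source):
    # Toy version of the .py -> .ipynb cell split: after each "# %%"
    # marker, leading "#" comment lines become a markdown cell and the
    # remaining lines become a code cell.
    cells = []
    for block in re.split(r"^# %%$", source, flags=re.MULTILINE):
        lines = block.strip().splitlines()
        head = []
        while lines and lines[0].startswith("#"):
            head.append(lines.pop(0).lstrip("# "))
        if head:
            cells.append({"cell_type": "markdown", "source": "\n".join(head)})
        if lines:
            cells.append({"cell_type": "code", "source": "\n".join(lines)})
    return cells

script = '''\
# %%
# Data IO and generation
# ----------------------
import numpy as np

# %%
# Classification and ROC analysis
# -------------------------------
print(np.linspace(0, 1, 100).mean())
'''

for cell in split_cells(script):
    print(cell["cell_type"], "|", cell["source"].splitlines()[0])

Editing only the .py and regenerating keeps the pair consistent, which is why this commit touches both files with matching hunks.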

dev/_downloads/scikit-learn-docs.zip

21.9 KB
Binary file not shown.

0 commit comments