
Commit c2fef4c

Pushing the docs to dev/ for branch: main, commit ff9344f3d8d11d38fa3a2497199113e5bac9537c

1 parent 8cc0bdf

File tree: 1,249 files changed (+5107, -4612 lines)


dev/_downloads/010337852815f8103ac6cca38a812b3c/plot_roc_crossval.py

Lines changed: 47 additions & 34 deletions
@@ -3,77 +3,88 @@
 Receiver Operating Characteristic (ROC) with cross validation
 =============================================================
 
-Example of Receiver Operating Characteristic (ROC) metric to evaluate
-classifier output quality using cross-validation.
+This example presents how to estimate and visualize the variance of the Receiver
+Operating Characteristic (ROC) metric using cross-validation.
 
-ROC curves typically feature true positive rate on the Y axis, and false
-positive rate on the X axis. This means that the top left corner of the plot is
-the "ideal" point - a false positive rate of zero, and a true positive rate of
-one. This is not very realistic, but it does mean that a larger area under the
-curve (AUC) is usually better.
-
-The "steepness" of ROC curves is also important, since it is ideal to maximize
-the true positive rate while minimizing the false positive rate.
+ROC curves typically feature true positive rate (TPR) on the Y axis, and false
+positive rate (FPR) on the X axis. This means that the top left corner of the
+plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
+realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
+better. The "steepness" of ROC curves is also important, since it is ideal to
+maximize the TPR while minimizing the FPR.
 
 This example shows the ROC response of different datasets, created from K-fold
 cross-validation. Taking all of these curves, it is possible to calculate the
-mean area under curve, and see the variance of the curve when the
+mean AUC, and see the variance of the curve when the
 training set is split into different subsets. This roughly shows how the
-classifier output is affected by changes in the training data, and how
-different the splits generated by K-fold cross-validation are from one another.
+classifier output is affected by changes in the training data, and how different
+the splits generated by K-fold cross-validation are from one another.
 
 .. note::
 
-    See also :func:`sklearn.metrics.roc_auc_score`,
-    :func:`sklearn.model_selection.cross_val_score`,
-    :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`,
-
+    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
+    complement of the present example explaining the averaging strategies to
+    generalize the metrics for multiclass classifiers.
 """
 
 # %%
-# Data IO and generation
-# ----------------------
-import numpy as np
+# Load and prepare data
+# =====================
+#
+# We import the :ref:`iris_dataset` which contains 3 classes, each one
+# corresponding to a type of iris plant. One class is linearly separable from
+# the other 2; the latter are **not** linearly separable from each other.
+#
+# In the following we binarize the dataset by dropping the "virginica" class
+# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
+# regarded as the positive class and "setosa" as the negative class
+# (`class_id=0`).
 
-from sklearn import datasets
+import numpy as np
+from sklearn.datasets import load_iris
 
-# Import some data to play with
-iris = datasets.load_iris()
-X = iris.data
-y = iris.target
+iris = load_iris()
+target_names = iris.target_names
+X, y = iris.data, iris.target
 X, y = X[y != 2], y[y != 2]
 n_samples, n_features = X.shape
 
-# Add noisy features
+# %%
+# We also add noisy features to make the problem harder.
 random_state = np.random.RandomState(0)
-X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
+X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)
 
 # %%
 # Classification and ROC analysis
 # -------------------------------
+#
+# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
+# plot the ROC curves fold-wise. Notice that the baseline to define the chance
+# level (dashed ROC curve) is a classifier that would always predict the most
+# frequent class.
+
 import matplotlib.pyplot as plt
 
 from sklearn import svm
 from sklearn.metrics import auc
 from sklearn.metrics import RocCurveDisplay
 from sklearn.model_selection import StratifiedKFold
 
-# Run classifier with cross-validation and plot ROC curves
 cv = StratifiedKFold(n_splits=6)
 classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)
 
 tprs = []
 aucs = []
 mean_fpr = np.linspace(0, 1, 100)
 
-fig, ax = plt.subplots()
-for i, (train, test) in enumerate(cv.split(X, y)):
+fig, ax = plt.subplots(figsize=(6, 6))
+for fold, (train, test) in enumerate(cv.split(X, y)):
     classifier.fit(X[train], y[train])
     viz = RocCurveDisplay.from_estimator(
         classifier,
         X[test],
         y[test],
-        name="ROC fold {}".format(i),
+        name=f"ROC fold {fold}",
         alpha=0.3,
         lw=1,
         ax=ax,
@@ -82,8 +93,7 @@
     interp_tpr[0] = 0.0
     tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
-
-ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
+ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
 
 mean_tpr = np.mean(tprs, axis=0)
 mean_tpr[-1] = 1.0
@@ -113,7 +123,10 @@
 ax.set(
     xlim=[-0.05, 1.05],
     ylim=[-0.05, 1.05],
-    title="Receiver operating characteristic example",
+    xlabel="False Positive Rate",
+    ylabel="True Positive Rate",
+    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
 )
+ax.axis("square")
 ax.legend(loc="lower right")
 plt.show()
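
The rewritten example labels its new dashed baseline as "a classifier that would always predict the most frequent class". As a quick sanity check of that claim (a minimal sketch, not part of this commit), scikit-learn's DummyClassifier can be scored on the same binarized iris data:

# Sketch only: verify that a most-frequent-class predictor has ROC AUC = 0.5.
from sklearn.datasets import load_iris
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

iris = load_iris()
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]  # same binarization as in the example

dummy = DummyClassifier(strategy="most_frequent").fit(X, y)
# Constant scores rank all samples equally, so the ROC curve is the diagonal.
print(roc_auc_score(y, dummy.predict_proba(X)[:, 1]))  # 0.5

Because the dummy's scores are constant, the result is 0.5 regardless of class balance, which matches the "chance level (AUC = 0.5)" label used in the plot.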

dev/_downloads/055e8313e28f2f3b5fd508054dfe5fe0/plot_roc_crossval.ipynb

Lines changed: 23 additions & 5 deletions
@@ -15,14 +15,14 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nExample of Receiver Operating Characteristic (ROC) metric to evaluate\nclassifier output quality using cross-validation.\n\nROC curves typically feature true positive rate on the Y axis, and false\npositive rate on the X axis. This means that the top left corner of the plot is\nthe \"ideal\" point - a false positive rate of zero, and a true positive rate of\none. This is not very realistic, but it does mean that a larger area under the\ncurve (AUC) is usually better.\n\nThe \"steepness\" of ROC curves is also important, since it is ideal to maximize\nthe true positive rate while minimizing the false positive rate.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean area under curve, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how\ndifferent the splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See also :func:`sklearn.metrics.roc_auc_score`,\n    :func:`sklearn.model_selection.cross_val_score`,\n    `sphx_glr_auto_examples_model_selection_plot_roc.py`,</p></div>\n"
+        "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nThis example presents how to estimate and visualize the variance of the Receiver\nOperating Characteristic (ROC) metric using cross-validation.\n\nROC curves typically feature true positive rate (TPR) on the Y axis, and false\npositive rate (FPR) on the X axis. This means that the top left corner of the\nplot is the \"ideal\" point - a FPR of zero, and a TPR of one. This is not very\nrealistic, but it does mean that a larger Area Under the Curve (AUC) is usually\nbetter. The \"steepness\" of ROC curves is also important, since it is ideal to\nmaximize the TPR while minimizing the FPR.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean AUC, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how different\nthe splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See `sphx_glr_auto_examples_model_selection_plot_roc.py` for a\n    complement of the present example explaining the averaging strategies to\n    generalize the metrics for multiclass classifiers.</p></div>\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Data IO and generation\n\n"
+        "## Load and prepare data\n\nWe import the `iris_dataset` which contains 3 classes, each one\ncorresponding to a type of iris plant. One class is linearly separable from\nthe other 2; the latter are **not** linearly separable from each other.\n\nIn the following we binarize the dataset by dropping the \"virginica\" class\n(`class_id=2`). This means that the \"versicolor\" class (`class_id=1`) is\nregarded as the positive class and \"setosa\" as the negative class\n(`class_id=0`).\n\n"
       ]
     },
     {
@@ -33,14 +33,14 @@
       },
       "outputs": [],
       "source": [
-        "import numpy as np\n\nfrom sklearn import datasets\n\n# Import some data to play with\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape\n\n# Add noisy features\nrandom_state = np.random.RandomState(0)\nX = np.c_[X, random_state.randn(n_samples, 200 * n_features)]"
+        "import numpy as np\nfrom sklearn.datasets import load_iris\n\niris = load_iris()\ntarget_names = iris.target_names\nX, y = iris.data, iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Classification and ROC analysis\n\n"
+        "We also add noisy features to make the problem harder.\n\n"
       ]
     },
     {
@@ -51,7 +51,25 @@
       },
       "outputs": [],
       "source": [
-        "import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\n# Run classifier with cross-validation and plot ROC curves\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots()\nfor i, (train, test) in enumerate(cv.split(X, y)):\n    classifier.fit(X[train], y[train])\n    viz = RocCurveDisplay.from_estimator(\n        classifier,\n        X[test],\n        y[test],\n        name=\"ROC fold {}\".format(i),\n        alpha=0.3,\n        lw=1,\n        ax=ax,\n    )\n    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n    interp_tpr[0] = 0.0\n    tprs.append(interp_tpr)\n    aucs.append(viz.roc_auc)\n\nax.plot([0, 1], [0, 1], linestyle=\"--\", lw=2, color=\"r\", label=\"Chance\", alpha=0.8)\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n    mean_fpr,\n    mean_tpr,\n    color=\"b\",\n    label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n    lw=2,\n    alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n    mean_fpr,\n    tprs_lower,\n    tprs_upper,\n    color=\"grey\",\n    alpha=0.2,\n    label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n    xlim=[-0.05, 1.05],\n    ylim=[-0.05, 1.05],\n    title=\"Receiver operating characteristic example\",\n)\nax.legend(loc=\"lower right\")\nplt.show()"
+        "random_state = np.random.RandomState(0)\nX = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Classification and ROC analysis\n\nHere we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and\nplot the ROC curves fold-wise. Notice that the baseline to define the chance\nlevel (dashed ROC curve) is a classifier that would always predict the most\nfrequent class.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots(figsize=(6, 6))\nfor fold, (train, test) in enumerate(cv.split(X, y)):\n    classifier.fit(X[train], y[train])\n    viz = RocCurveDisplay.from_estimator(\n        classifier,\n        X[test],\n        y[test],\n        name=f\"ROC fold {fold}\",\n        alpha=0.3,\n        lw=1,\n        ax=ax,\n    )\n    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n    interp_tpr[0] = 0.0\n    tprs.append(interp_tpr)\n    aucs.append(viz.roc_auc)\nax.plot([0, 1], [0, 1], \"k--\", label=\"chance level (AUC = 0.5)\")\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n    mean_fpr,\n    mean_tpr,\n    color=\"b\",\n    label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n    lw=2,\n    alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n    mean_fpr,\n    tprs_lower,\n    tprs_upper,\n    color=\"grey\",\n    alpha=0.2,\n    label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n    xlim=[-0.05, 1.05],\n    ylim=[-0.05, 1.05],\n    xlabel=\"False Positive Rate\",\n    ylabel=\"True Positive Rate\",\n    title=f\"Mean ROC curve with variability\\n(Positive label '{target_names[1]}')\",\n)\nax.axis(\"square\")\nax.legend(loc=\"lower right\")\nplt.show()"
       ]
     }
 ],
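
The plotting cell above depends on one subtle step: each fold's ROC curve is computed at its own thresholds, so the fold-wise TPR values are interpolated onto a shared FPR grid before the mean and standard deviation are taken. A standalone sketch of just that step (the two fold curves below are made-up toy numbers, not output from this commit):

import numpy as np

mean_fpr = np.linspace(0, 1, 100)  # shared grid, as in the example

# Hypothetical fold-wise (fpr, tpr) curves of unequal length.
fold_curves = [
    (np.array([0.0, 0.2, 0.5, 1.0]), np.array([0.0, 0.6, 0.9, 1.0])),
    (np.array([0.0, 0.1, 0.4, 0.8, 1.0]), np.array([0.0, 0.5, 0.8, 0.95, 1.0])),
]

tprs = []
for fpr, tpr in fold_curves:
    interp_tpr = np.interp(mean_fpr, fpr, tpr)  # resample onto the shared grid
    interp_tpr[0] = 0.0  # pin each curve to the origin
    tprs.append(interp_tpr)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the averaged curve to (1, 1)
std_tpr = np.std(tprs, axis=0)  # drives the grey "$\pm$ 1 std. dev." band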

dev/_downloads/2e4791a177381a6102b21e44083615c8/plot_poisson_regression_non_normal_loss.ipynb

Lines changed: 2 additions & 2 deletions
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "from sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import FunctionTransformer, OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler, KBinsDiscretizer\nfrom sklearn.compose import ColumnTransformer\n\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(np.log, validate=False), StandardScaler()\n)\n\nlinear_model_preprocessor = ColumnTransformer(\n    [\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\"binned_numeric\", KBinsDiscretizer(n_bins=10), [\"VehAge\", \"DrivAge\"]),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n    ],\n    remainder=\"drop\",\n)"
+        "from sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import FunctionTransformer, OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler, KBinsDiscretizer\nfrom sklearn.compose import ColumnTransformer\n\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(np.log, validate=False), StandardScaler()\n)\n\nlinear_model_preprocessor = ColumnTransformer(\n    [\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n    ],\n    remainder=\"drop\",\n)"
       ]
     },
     {
@@ -170,7 +170,7 @@
       },
       "outputs": [],
       "source": [
-        "from sklearn.linear_model import PoissonRegressor\n\nn_samples = df_train.shape[0]\n\npoisson_glm = Pipeline(\n    [\n        (\"preprocessor\", linear_model_preprocessor),\n        (\"regressor\", PoissonRegressor(alpha=1e-12, max_iter=300)),\n    ]\n)\npoisson_glm.fit(\n    df_train, df_train[\"Frequency\"], regressor__sample_weight=df_train[\"Exposure\"]\n)\n\nprint(\"PoissonRegressor evaluation:\")\nscore_estimator(poisson_glm, df_test)"
+        "from sklearn.linear_model import PoissonRegressor\n\nn_samples = df_train.shape[0]\n\npoisson_glm = Pipeline(\n    [\n        (\"preprocessor\", linear_model_preprocessor),\n        (\"regressor\", PoissonRegressor(alpha=1e-12, solver=\"newton-cholesky\")),\n    ]\n)\npoisson_glm.fit(\n    df_train, df_train[\"Frequency\"], regressor__sample_weight=df_train[\"Exposure\"]\n)\n\nprint(\"PoissonRegressor evaluation:\")\nscore_estimator(poisson_glm, df_test)"
       ]
     },
     {
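
Two estimator configurations change in this file: KBinsDiscretizer gains subsample=int(2e5) with random_state=0, and PoissonRegressor swaps max_iter=300 for solver="newton-cholesky". A minimal sketch of the updated pieces in isolation, on synthetic data (encode="ordinal" is chosen here only to keep the toy output dense; the example itself keeps the default one-hot encoding):

import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(1000, 2))      # two toy numeric features
y = rng.poisson(lam=np.exp(0.1 * X[:, 0]))  # non-negative counts

# subsample caps the rows used to fit the bin edges, bounding the cost of the
# quantile computation on large datasets; random_state makes it reproducible.
binner = KBinsDiscretizer(
    n_bins=10, encode="ordinal", subsample=int(2e5), random_state=0
)
X_binned = binner.fit_transform(X)

# "newton-cholesky" is a full-Newton solver, typically faster than the default
# LBFGS when n_samples is much larger than n_features.
glm = PoissonRegressor(alpha=1e-12, solver="newton-cholesky")
glm.fit(X_binned, y)
print(glm.score(X_binned, y))  # D^2, a deviance-based analogue of R^2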
