Skip to content

Commit 5681c77

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 3b5f4602138e043c5602699ca954438e33d49711
1 parent fc6f001 commit 5681c77

File tree

1,236 files changed

+6068
-4634
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,236 files changed

+6068
-4634
lines changed
Binary file not shown.
Binary file not shown.
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""
2+
==========================================
3+
Evaluation of outlier detection estimators
4+
==========================================
5+
6+
This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor`
7+
(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on
8+
classical anomaly detection datasets. The algorithm performance
9+
is assessed in an outlier detection context:
10+
11+
1. The algorithms are trained on the whole dataset which is assumed to
12+
contain outliers.
13+
14+
2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed
15+
on the same dataset using the knowledge of the labels.
16+
17+
"""
18+
19+
# Author: Pharuj Rajborirug <[email protected]>
20+
# License: BSD 3 clause
21+
22+
print(__doc__)
23+
24+
# %%
25+
# Define a data preprocessing function
26+
# ----------------------------------
27+
#
28+
# The example uses real-world datasets available in
29+
# :class:`sklearn.datasets` and the sample size of some datasets is reduced
30+
# to speed up computation. After the data preprocessing, the datasets' targets
31+
# will have two classes, 0 representing inliers and 1 representing outliers.
32+
# The `preprocess_dataset` function returns data and target.
33+
34+
import numpy as np
35+
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
36+
from sklearn.preprocessing import LabelBinarizer
37+
import pandas as pd
38+
39+
rng = np.random.RandomState(42)
40+
41+
42+
def preprocess_dataset(dataset_name):
    """Load a benchmark dataset and turn its target into inlier/outlier labels.

    Parameters
    ----------
    dataset_name : str
        One of ``"http"``, ``"smtp"``, ``"SA"``, ``"SF"``, ``"forestcover"``,
        ``"glass"``, ``"wdbc"`` or ``"cardiotocography"``.

    Returns
    -------
    X : ndarray
        The data returned by the matching ``sklearn.datasets`` fetcher, after
        the optional subsampling and one-hot encoding done below.
    y : pandas.Series
        Categorical target where 0 marks inliers and 1 marks outliers.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported names.

    Notes
    -----
    Random draws come from the module-level ``rng``, so the result depends
    on how many draws were made before this call.
    """
    # loading and vectorization
    print(f"Loading {dataset_name} data")
    if dataset_name in ("http", "smtp", "SA", "SF"):
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)
        X = dataset.data
        y = dataset.target
        lb = LabelBinarizer()

        if dataset_name == "SF":
            X, y = _subsample(X, y)  # reduce the sample size
            # column 1 is categorical: replace it by its one-hot encoding
            x1 = lb.fit_transform(X[:, 1].astype(str))
            X = np.c_[X[:, :1], x1, X[:, 2:]]
        elif dataset_name == "SA":
            X, y = _subsample(X, y)  # reduce the sample size
            # columns 1 to 3 are categorical: replace them by one-hot encodings
            x1 = lb.fit_transform(X[:, 1].astype(str))
            x2 = lb.fit_transform(X[:, 2].astype(str))
            x3 = lb.fit_transform(X[:, 3].astype(str))
            X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b"normal.").astype(int)
    elif dataset_name == "forestcover":
        dataset = fetch_covtype()
        X, y = _subsample(dataset.data, dataset.target)  # reduce the sample size

        # inliers are those with attribute 2
        # outliers are those with attribute 4
        s = (y == 2) | (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
    elif dataset_name in ("glass", "wdbc", "cardiotocography"):
        dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)
        X = dataset.data
        y = dataset.target

        if dataset_name == "glass":
            s = y == "tableware"
            y = s.astype(int)
        elif dataset_name == "wdbc":
            s = y == "2"
            y = s.astype(int)
            X_mal, y_mal = X[s], y[s]
            X_ben, y_ben = X[~s], y[~s]

            # downsampled to 39 points (9.8% outliers)
            idx = rng.choice(y_mal.shape[0], 39, replace=False)
            X = np.concatenate((X_ben, X_mal[idx]), axis=0)
            y = np.concatenate((y_ben, y_mal[idx]), axis=0)
        else:  # cardiotocography
            s = y == "3"
            y = s.astype(int)
    else:
        # Without this guard an unknown name would only fail further down
        # with an unrelated UnboundLocalError on `y`.
        raise ValueError(f"Unknown dataset_name: {dataset_name!r}")
    # 0 represents inliers, and 1 represents outliers
    y = pd.Series(y, dtype="category")
    return (X, y)


def _subsample(X, y, fraction=0.1):
    """Return the same random ``fraction`` of rows of ``X`` and ``y``."""
    idx = rng.choice(X.shape[0], int(X.shape[0] * fraction), replace=False)
    return X[idx], y[idx]
107+
108+
109+
# %%
110+
# Define an outlier prediction function
111+
# -------------------------------------
112+
# There is no particular reason to choose algorithms
113+
# :class:`~sklearn.neighbors.LocalOutlierFactor` and
114+
# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that
115+
# different algorithms perform well on different datasets. The following
116+
# `compute_prediction` function returns average outlier score of X.
117+
118+
119+
from sklearn.neighbors import LocalOutlierFactor
120+
from sklearn.ensemble import IsolationForest
121+
122+
123+
def compute_prediction(X, model_name):
    """Fit an outlier detector on ``X`` and return its scores for ``X``.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to the estimator; it is used both for fitting
        and for scoring.
    model_name : {"LOF", "IForest"}
        Estimator to use: ``LocalOutlierFactor`` or ``IsolationForest``.

    Returns
    -------
    y_pred : ndarray
        ``negative_outlier_factor_`` of the fitted LOF, or
        ``decision_function(X)`` of the fitted isolation forest.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    print(f"Computing {model_name} prediction...")
    if model_name == "LOF":
        clf = LocalOutlierFactor(n_neighbors=20, contamination="auto")
        clf.fit(X)
        return clf.negative_outlier_factor_
    if model_name == "IForest":
        clf = IsolationForest(random_state=rng, contamination="auto")
        return clf.fit(X).decision_function(X)
    # Previously an unknown name reached `return y_pred` with y_pred unbound.
    raise ValueError(
        f"Unknown model_name: {model_name!r}; expected 'LOF' or 'IForest'"
    )
134+
135+
136+
# %%
137+
# Plot and interpret results
138+
# --------------------------
139+
#
140+
# The algorithm performance relates to how good the true positive rate (TPR)
141+
# is at low values of the false positive rate (FPR). The best algorithms
142+
# have the curve on the top-left of the plot and the area under curve (AUC)
143+
# close to 1. The diagonal dotted line represents a random classification
144+
# of outliers and inliers.
145+
146+
147+
import math
148+
import matplotlib.pyplot as plt
149+
from sklearn.metrics import RocCurveDisplay
150+
151+
datasets_name = [
    "http",
    "smtp",
    "SA",
    "SF",
    "forestcover",
    "glass",
    "wdbc",
    "cardiotocography",
]

models_name = [
    "LOF",
    "IForest",
]

# plotting parameters
cols = 2
linewidth = 1
# Label 0 (inliers) is the positive class: per the scikit-learn docs, both
# `negative_outlier_factor_` and `decision_function` are higher for more
# normal samples, so a high score means "inlier".
pos_label = 0
rows = math.ceil(len(datasets_name) / cols)

# squeeze=False keeps `axs` two-dimensional even with a single row or column,
# so the layout parameters above can be changed freely.
fig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3), squeeze=False)
flat_axes = axs.ravel()  # row-major: same order as (i // cols, i % cols)

for ax, dataset_name in zip(flat_axes, datasets_name):
    (X, y) = preprocess_dataset(dataset_name=dataset_name)

    for model_name in models_name:
        y_pred = compute_prediction(X, model_name=model_name)
        RocCurveDisplay.from_predictions(
            y,
            y_pred,
            pos_label=pos_label,
            name=model_name,
            linewidth=linewidth,
            ax=ax,
        )
    # chance-level diagonal
    ax.plot([0, 1], [0, 1], linewidth=linewidth, linestyle=":")
    ax.set_title(dataset_name)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")

# hide grid cells left over when the number of datasets does not fill the grid
for unused_ax in flat_axes[len(datasets_name):]:
    unused_ax.set_axis_off()

plt.tight_layout(pad=2.0)  # spacing between subplots
plt.show()
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Evaluation of outlier detection estimators\n\nThis example benchmarks outlier detection algorithms, `local_outlier_factor`\n(LOF) and `isolation_forest` (IForest), using ROC curves on\nclassical anomaly detection datasets. The algorithm performance\nis assessed in an outlier detection context:\n\n1. The algorithms are trained on the whole dataset which is assumed to\ncontain outliers.\n\n2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed\non the same dataset using the knowledge of the labels.\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Pharuj Rajborirug <[email protected]>\n# License: BSD 3 clause\n\nprint(__doc__)"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"## Define a data preprocessing function\n\nThe example uses real-world datasets available in\n:class:`sklearn.datasets` and the sample size of some datasets is reduced\nto speed up computation. After the data preprocessing, the datasets' targets\nwill have two classes, 0 representing inliers and 1 representing outliers.\nThe `preprocess_dataset` function returns data and target.\n\n"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {
43+
"collapsed": false
44+
},
45+
"outputs": [],
46+
"source": [
47+
"import numpy as np\nfrom sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml\nfrom sklearn.preprocessing import LabelBinarizer\nimport pandas as pd\n\nrng = np.random.RandomState(42)\n\n\ndef preprocess_dataset(dataset_name):\n\n # loading and vectorization\n print(f\"Loading {dataset_name} data\")\n if dataset_name in [\"http\", \"smtp\", \"SA\", \"SF\"]:\n dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)\n X = dataset.data\n y = dataset.target\n lb = LabelBinarizer()\n\n if dataset_name == \"SF\":\n idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)\n X = X[idx] # reduce the sample size\n y = y[idx]\n x1 = lb.fit_transform(X[:, 1].astype(str))\n X = np.c_[X[:, :1], x1, X[:, 2:]]\n elif dataset_name == \"SA\":\n idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)\n X = X[idx] # reduce the sample size\n y = y[idx]\n x1 = lb.fit_transform(X[:, 1].astype(str))\n x2 = lb.fit_transform(X[:, 2].astype(str))\n x3 = lb.fit_transform(X[:, 3].astype(str))\n X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]\n y = (y != b\"normal.\").astype(int)\n if dataset_name == \"forestcover\":\n dataset = fetch_covtype()\n X = dataset.data\n y = dataset.target\n idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)\n X = X[idx] # reduce the sample size\n y = y[idx]\n\n # inliers are those with attribute 2\n # outliers are those with attribute 4\n s = (y == 2) + (y == 4)\n X = X[s, :]\n y = y[s]\n y = (y != 2).astype(int)\n if dataset_name in [\"glass\", \"wdbc\", \"cardiotocography\"]:\n dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)\n X = dataset.data\n y = dataset.target\n\n if dataset_name == \"glass\":\n s = y == \"tableware\"\n y = s.astype(int)\n if dataset_name == \"wdbc\":\n s = y == \"2\"\n y = s.astype(int)\n X_mal, y_mal = X[s], y[s]\n X_ben, y_ben = X[~s], y[~s]\n\n # downsampled to 39 points (9.8% outliers)\n idx = rng.choice(y_mal.shape[0], 39, replace=False)\n X_mal2 
= X_mal[idx]\n y_mal2 = y_mal[idx]\n X = np.concatenate((X_ben, X_mal2), axis=0)\n y = np.concatenate((y_ben, y_mal2), axis=0)\n if dataset_name == \"cardiotocography\":\n s = y == \"3\"\n y = s.astype(int)\n # 0 represents inliers, and 1 represents outliers\n y = pd.Series(y, dtype=\"category\")\n return (X, y)"
48+
]
49+
},
50+
{
51+
"cell_type": "markdown",
52+
"metadata": {},
53+
"source": [
54+
"## Define an outlier prediction function\nThere is no particular reason to choose algorithms\n:class:`~sklearn.neighbors.LocalOutlierFactor` and\n:class:`~sklearn.ensemble.IsolationForest`. The goal is to show that\ndifferent algorithm performs well on different datasets. The following\n`compute_prediction` function returns average outlier score of X.\n\n"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": null,
60+
"metadata": {
61+
"collapsed": false
62+
},
63+
"outputs": [],
64+
"source": [
65+
"from sklearn.neighbors import LocalOutlierFactor\nfrom sklearn.ensemble import IsolationForest\n\n\ndef compute_prediction(X, model_name):\n\n print(f\"Computing {model_name} prediction...\")\n if model_name == \"LOF\":\n clf = LocalOutlierFactor(n_neighbors=20, contamination=\"auto\")\n clf.fit(X)\n y_pred = clf.negative_outlier_factor_\n if model_name == \"IForest\":\n clf = IsolationForest(random_state=rng, contamination=\"auto\")\n y_pred = clf.fit(X).decision_function(X)\n return y_pred"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"metadata": {},
71+
"source": [
72+
"## Plot and interpret results\n\nThe algorithm performance relates to how good the true positive rate (TPR)\nis at low value of the false positive rate (FPR). The best algorithms\nhave the curve on the top-left of the plot and the area under curve (AUC)\nclose to 1. The diagonal dashed line represents a random classification\nof outliers and inliers.\n\n"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": null,
78+
"metadata": {
79+
"collapsed": false
80+
},
81+
"outputs": [],
82+
"source": [
83+
"import math\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import RocCurveDisplay\n\ndatasets_name = [\n \"http\",\n \"smtp\",\n \"SA\",\n \"SF\",\n \"forestcover\",\n \"glass\",\n \"wdbc\",\n \"cardiotocography\",\n]\n\nmodels_name = [\n \"LOF\",\n \"IForest\",\n]\n\n# plotting parameters\ncols = 2\nlinewidth = 1\npos_label = 0 # mean 0 belongs to positive class\nrows = math.ceil(len(datasets_name) / cols)\n\nfig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3))\n\nfor i, dataset_name in enumerate(datasets_name):\n (X, y) = preprocess_dataset(dataset_name=dataset_name)\n\n for model_name in models_name:\n y_pred = compute_prediction(X, model_name=model_name)\n display = RocCurveDisplay.from_predictions(\n y,\n y_pred,\n pos_label=pos_label,\n name=model_name,\n linewidth=linewidth,\n ax=axs[i // cols, i % cols],\n )\n axs[i // cols, i % cols].plot([0, 1], [0, 1], linewidth=linewidth, linestyle=\":\")\n axs[i // cols, i % cols].set_title(dataset_name)\n axs[i // cols, i % cols].set_xlabel(\"False Positive Rate\")\n axs[i // cols, i % cols].set_ylabel(\"True Positive Rate\")\nplt.tight_layout(pad=2.0) # spacing between subplots\nplt.show()"
84+
]
85+
}
86+
],
87+
"metadata": {
88+
"kernelspec": {
89+
"display_name": "Python 3",
90+
"language": "python",
91+
"name": "python3"
92+
},
93+
"language_info": {
94+
"codemirror_mode": {
95+
"name": "ipython",
96+
"version": 3
97+
},
98+
"file_extension": ".py",
99+
"mimetype": "text/x-python",
100+
"name": "python",
101+
"nbconvert_exporter": "python",
102+
"pygments_lexer": "ipython3",
103+
"version": "3.9.12"
104+
}
105+
},
106+
"nbformat": 4,
107+
"nbformat_minor": 0
108+
}

dev/_downloads/scikit-learn-docs.zip

156 KB
Binary file not shown.
-185 Bytes
80 Bytes
2 Bytes
-69 Bytes
128 Bytes

0 commit comments

Comments
 (0)