
Commit 1dcd45e

Pushing the docs to dev/ for branch: master, commit 8e599c68fb06967313d0e76a053062158cd90312
1 parent 3bd450b commit 1dcd45e

File tree

1,017 files changed

+4078
-3299
lines changed

Binary file (5.56 KB) not shown.
Binary file (4.62 KB) not shown.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Comparing anomaly detection algorithms for outlier detection on toy datasets\n\n\nThis example shows characteristics of different anomaly detection algorithms\non 2D datasets. Datasets contain one or two modes (regions of high density)\nto illustrate the ability of algorithms to cope with multimodal data.\n\nFor each dataset, 15% of samples are generated as random uniform noise. This\nproportion is the value given to the nu parameter of the OneClassSVM and the\ncontamination parameter of the other outlier detection algorithms.\nDecision boundaries between inliers and outliers are displayed in black.\n\nLocal Outlier Factor (LOF) does not show a decision boundary in black as it\nhas no predict method that can be applied to new data.\n\nWhile these examples give some intuition about the algorithms, this\nintuition might not apply to very high-dimensional data.\n\nFinally, note that the parameters of the models have been hand-picked here,\nbut in practice they need to be adjusted. In the absence of labelled data,\nthe problem is completely unsupervised, so model selection can be a challenge.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Alexandre Gramfort <[email protected]>\n#         Albert Thomas <[email protected]>\n# License: BSD 3 clause\n\nimport time\n\nimport numpy as np\nimport matplotlib\nimport matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.datasets import make_moons, make_blobs\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.ensemble import IsolationForest\nfrom sklearn.neighbors import LocalOutlierFactor\n\nprint(__doc__)\n\nmatplotlib.rcParams['contour.negative_linestyle'] = 'solid'\n\n# Example settings\nn_samples = 300\noutliers_fraction = 0.15\nn_outliers = int(outliers_fraction * n_samples)\nn_inliers = n_samples - n_outliers\n\n# define outlier/anomaly detection methods to be compared\nanomaly_algorithms = [\n    (\"Robust covariance\", EllipticEnvelope(contamination=outliers_fraction)),\n    (\"One-Class SVM\", svm.OneClassSVM(nu=outliers_fraction, kernel=\"rbf\",\n                                      gamma=0.1)),\n    (\"Isolation Forest\", IsolationForest(contamination=outliers_fraction,\n                                         random_state=42)),\n    (\"Local Outlier Factor\", LocalOutlierFactor(\n        n_neighbors=35, contamination=outliers_fraction))]\n\n# Define datasets\nblobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)\ndatasets = [\n    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,\n               **blobs_params)[0],\n    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],\n               **blobs_params)[0],\n    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -\n          np.array([0.5, 0.25])),\n    14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)]\n\n# Compare given classifiers under given settings\nxx, yy = np.meshgrid(np.linspace(-7, 7, 150),\n                     np.linspace(-7, 7, 150))\n\nplt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5))\nplt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,\n                    hspace=.01)\n\nplot_num = 1\nrng = np.random.RandomState(42)\n\nfor i_dataset, X in enumerate(datasets):\n    # Add outliers\n    X = np.concatenate([X, rng.uniform(low=-6, high=6,\n                       size=(n_outliers, 2))], axis=0)\n\n    for name, algorithm in anomaly_algorithms:\n        t0 = time.time()\n        algorithm.fit(X)\n        t1 = time.time()\n        plt.subplot(len(datasets), len(anomaly_algorithms), plot_num)\n        if i_dataset == 0:\n            plt.title(name, size=18)\n\n        # fit the data and tag outliers\n        if name == \"Local Outlier Factor\":\n            y_pred = algorithm.fit_predict(X)\n        else:\n            y_pred = algorithm.fit(X).predict(X)\n\n        # plot the level lines and the points\n        if name != \"Local Outlier Factor\":  # LOF does not implement predict\n            Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()])\n            Z = Z.reshape(xx.shape)\n            plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')\n\n        colors = np.array(['#377eb8', '#ff7f00'])\n        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred + 1) // 2])\n\n        plt.xlim(-7, 7)\n        plt.ylim(-7, 7)\n        plt.xticks(())\n        plt.yticks(())\n        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),\n                 transform=plt.gca().transAxes, size=15,\n                 horizontalalignment='right')\n        plot_num += 1\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
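
The markdown cell above notes that the same 15% noise proportion is passed as nu to the OneClassSVM and as contamination to the other detectors. A minimal sketch of that correspondence (the blob-plus-uniform data, sample sizes, and gamma value below are illustrative assumptions, not part of this commit):

    import numpy as np
    from sklearn.svm import OneClassSVM
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(0)
    X = np.concatenate([rng.randn(255, 2),                  # inlier cloud
                        rng.uniform(-6, 6, size=(45, 2))])  # ~15% uniform noise

    for est in (OneClassSVM(nu=0.15, gamma=0.1),
                IsolationForest(contamination=0.15, random_state=42)):
        y_pred = est.fit(X).predict(X)  # +1 = inlier, -1 = outlier
        print(type(est).__name__, (y_pred == -1).mean())

Both printed fractions should land near 0.15, with the caveat that nu is only an upper bound on the fraction of training errors for the one-class SVM, whereas contamination sets the decision threshold directly.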
Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
"""
============================================================================
Comparing anomaly detection algorithms for outlier detection on toy datasets
============================================================================

This example shows characteristics of different anomaly detection algorithms
on 2D datasets. Datasets contain one or two modes (regions of high density)
to illustrate the ability of algorithms to cope with multimodal data.

For each dataset, 15% of samples are generated as random uniform noise. This
proportion is the value given to the nu parameter of the OneClassSVM and the
contamination parameter of the other outlier detection algorithms.
Decision boundaries between inliers and outliers are displayed in black.

Local Outlier Factor (LOF) does not show a decision boundary in black as it
has no predict method that can be applied to new data.

While these examples give some intuition about the algorithms, this
intuition might not apply to very high-dimensional data.

Finally, note that the parameters of the models have been hand-picked here,
but in practice they need to be adjusted. In the absence of labelled data,
the problem is completely unsupervised, so model selection can be a challenge.
"""

# Author: Alexandre Gramfort <[email protected]>
#         Albert Thomas <[email protected]>
# License: BSD 3 clause

import time

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
               **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
          np.array([0.5, 0.25])),
    14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)]

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 150),
                     np.linspace(-7, 7, 150))

plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

plot_num = 1
rng = np.random.RandomState(42)

for i_dataset, X in enumerate(datasets):
    # Add outliers
    X = np.concatenate([X, rng.uniform(low=-6, high=6,
                       size=(n_outliers, 2))], axis=0)

    for name, algorithm in anomaly_algorithms:
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        plt.subplot(len(datasets), len(anomaly_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        # fit the data and tag outliers
        if name == "Local Outlier Factor":
            y_pred = algorithm.fit_predict(X)
        else:
            y_pred = algorithm.fit(X).predict(X)

        # plot the level lines and the points
        if name != "Local Outlier Factor":  # LOF does not implement predict
            Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

        colors = np.array(['#377eb8', '#ff7f00'])
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred + 1) // 2])

        plt.xlim(-7, 7)
        plt.ylim(-7, 7)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()
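
As the docstring explains, LOF is special-cased above because its default mode has no predict method for new data. For scoring genuinely unseen points, a minimal sketch assuming a scikit-learn version (0.20 or later) where LocalOutlierFactor exposes the novelty parameter; the training cloud and test points are illustrative assumptions:

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(42)
    X_train = rng.randn(200, 2)             # illustrative inlier cloud
    X_new = np.array([[0., 0.], [5., 5.]])  # a central point and a far-out one

    # With novelty=True, LOF is fit on training data and exposes predict()
    # and decision_function() for unseen samples; fit_predict() is instead
    # reserved for the default (novelty=False) outlier-detection mode.
    lof = LocalOutlierFactor(n_neighbors=35, novelty=True, contamination=0.15)
    lof.fit(X_train)
    print(lof.predict(X_new))               # +1 = inlier, -1 = outlier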

dev/_downloads/scikit-learn-docs.pdf

Binary file (590 KB) not shown.

Other binary files changed (not shown): -496 Bytes, -220 Bytes, 671 Bytes

0 commit comments