
Commit d2acd9b

Pushing the docs to dev/ for branch: master, commit 788a458bba353c2cf3cfa5a15d6f68315149ef9e
1 parent 8061ec2 commit d2acd9b

933 files changed: 4385 additions, 3020 deletions

Two binary files changed (3.79 KB and 2.72 KB); binary contents not shown.

dev/_downloads/plot_lof.ipynb

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+{
+  "nbformat_minor": 0,
+  "nbformat": 4,
+  "cells": [
+    {
+      "execution_count": null,
+      "cell_type": "code",
+      "source": [
+        "%matplotlib inline"
+      ],
+      "outputs": [],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "source": [
+        "\n=================================================\nAnomaly detection with Local Outlier Factor (LOF)\n=================================================\n\nThis example presents the Local Outlier Factor (LOF) estimator. The LOF\nalgorithm is an unsupervised outlier detection method which computes the local\ndensity deviation of a given data point with respect to its neighbors.\nIt considers as outliers the samples that have a substantially lower density\nthan their neighbors.\n\nThe number of neighbors considered (parameter n_neighbors) is typically\nchosen 1) greater than the minimum number of objects a cluster has to contain,\nso that other objects can be local outliers relative to this cluster, and 2)\nsmaller than the maximum number of close-by objects that can potentially be\nlocal outliers.\nIn practice, such information is generally not available, and taking\nn_neighbors=20 appears to work well in general.\n\n"
+      ],
+      "cell_type": "markdown",
+      "metadata": {}
+    },
+    {
+      "execution_count": null,
+      "cell_type": "code",
+      "source": [
+        "import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.neighbors import LocalOutlierFactor\nprint(__doc__)\n\nnp.random.seed(42)\n\n# Generate train data\nX = 0.3 * np.random.randn(100, 2)\n# Generate some abnormal novel observations\nX_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))\nX = np.r_[X + 2, X - 2, X_outliers]\n\n# fit the model\nclf = LocalOutlierFactor(n_neighbors=20)\ny_pred = clf.fit_predict(X)\ny_pred_outliers = y_pred[200:]\n\n# plot the level sets of the decision function\nxx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))\nZ = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])\nZ = Z.reshape(xx.shape)\n\nplt.title(\"Local Outlier Factor (LOF)\")\nplt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)\n\na = plt.scatter(X[:200, 0], X[:200, 1], c='white')\nb = plt.scatter(X[200:, 0], X[200:, 1], c='red')\nplt.axis('tight')\nplt.xlim((-5, 5))\nplt.ylim((-5, 5))\nplt.legend([a, b],\n           [\"normal observations\",\n            \"abnormal observations\"],\n           loc=\"upper left\")\nplt.show()"
+      ],
+      "outputs": [],
+      "metadata": {
+        "collapsed": false
+      }
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2",
+      "language": "python"
+    },
+    "language_info": {
+      "mimetype": "text/x-python",
+      "nbconvert_exporter": "python",
+      "name": "python",
+      "file_extension": ".py",
+      "version": "2.7.12",
+      "pygments_lexer": "ipython2",
+      "codemirror_mode": {
+        "version": 2,
+        "name": "ipython"
+      }
+    }
+  }
+}
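
The generated notebook is plain nbformat-4 JSON, so its structure can be sanity-checked programmatically. A small sketch (assuming a local copy of the built docs at this relative path; the check itself is ours, not part of the commit):

    import json

    # Load the generated notebook and confirm the cell layout shown above
    with open("dev/_downloads/plot_lof.ipynb") as f:
        nb = json.load(f)

    assert nb["nbformat"] == 4
    print([cell["cell_type"] for cell in nb["cells"]])  # ['code', 'markdown', 'code']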

dev/_downloads/plot_lof.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+"""
+=================================================
+Anomaly detection with Local Outlier Factor (LOF)
+=================================================
+
+This example presents the Local Outlier Factor (LOF) estimator. The LOF
+algorithm is an unsupervised outlier detection method which computes the local
+density deviation of a given data point with respect to its neighbors.
+It considers as outliers the samples that have a substantially lower density
+than their neighbors.
+
+The number of neighbors considered (parameter n_neighbors) is typically
+chosen 1) greater than the minimum number of objects a cluster has to contain,
+so that other objects can be local outliers relative to this cluster, and 2)
+smaller than the maximum number of close-by objects that can potentially be
+local outliers.
+In practice, such information is generally not available, and taking
+n_neighbors=20 appears to work well in general.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.neighbors import LocalOutlierFactor
+print(__doc__)
+
+np.random.seed(42)
+
+# Generate train data
+X = 0.3 * np.random.randn(100, 2)
+# Generate some abnormal novel observations
+X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
+X = np.r_[X + 2, X - 2, X_outliers]
+
+# fit the model
+clf = LocalOutlierFactor(n_neighbors=20)
+y_pred = clf.fit_predict(X)
+y_pred_outliers = y_pred[200:]
+
+# plot the level sets of the decision function
+xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
+Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
+Z = Z.reshape(xx.shape)
+
+plt.title("Local Outlier Factor (LOF)")
+plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
+
+a = plt.scatter(X[:200, 0], X[:200, 1], c='white')
+b = plt.scatter(X[200:, 0], X[200:, 1], c='red')
+plt.axis('tight')
+plt.xlim((-5, 5))
+plt.ylim((-5, 5))
+plt.legend([a, b],
+           ["normal observations",
+            "abnormal observations"],
+           loc="upper left")
+plt.show()
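
For readers who want to try the new estimator outside the gallery, here is a minimal sketch of the public API the example exercises; the toy data and parameter values below are illustrative, not taken from the commit. Note that the example's contour plot calls clf._decision_function, which is private in this release; the public way to rank training points is the negative_outlier_factor_ attribute used here:

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(0)
    # 20 clustered inliers plus 5 uniform points (illustrative toy data)
    X = np.r_[0.3 * rng.randn(20, 2) + 2,
              rng.uniform(low=-4, high=4, size=(5, 2))]

    # contamination=0.2 matches the 5 of 25 outliers planted above
    clf = LocalOutlierFactor(n_neighbors=5, contamination=0.2)
    y_pred = clf.fit_predict(X)            # +1 for inliers, -1 for outliers
    scores = clf.negative_outlier_factor_  # more negative = more abnormal
    print(y_pred[-5:])                     # the uniform points should mostly be -1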

dev/_downloads/plot_outlier_detection.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
     },
     {
       "source": [
-        "\n==========================================\nOutlier detection with several methods.\n==========================================\n\nWhen the amount of contamination is known, this example illustrates three\ndifferent ways of performing `outlier_detection`:\n\n- based on a robust estimator of covariance, which is assuming that the\n  data are Gaussian distributed and performs better than the One-Class SVM\n  in that case.\n\n- using the One-Class SVM and its ability to capture the shape of the\n  data set, hence performing better when the data is strongly\n  non-Gaussian, i.e. with two well-separated clusters;\n\n- using the Isolation Forest algorithm, which is based on random forests and\n  hence more adapted to large-dimensional settings, even if it performs\n  quite well in the examples below.\n\nThe ground truth about inliers and outliers is given by the points' colors\nwhile the orange-filled area indicates which points are reported as inliers\nby each method.\n\nHere, we assume that we know the fraction of outliers in the datasets.\nThus rather than using the 'predict' method of the objects, we set the\nthreshold on the decision_function to separate out the corresponding\nfraction.\n\n"
+        "\n==========================================\nOutlier detection with several methods.\n==========================================\n\nWhen the amount of contamination is known, this example illustrates four\ndifferent ways of performing `outlier_detection`:\n\n- based on a robust estimator of covariance, which is assuming that the\n  data are Gaussian distributed and performs better than the One-Class SVM\n  in that case.\n\n- using the One-Class SVM and its ability to capture the shape of the\n  data set, hence performing better when the data is strongly\n  non-Gaussian, i.e. with two well-separated clusters;\n\n- using the Isolation Forest algorithm, which is based on random forests and\n  hence more adapted to large-dimensional settings, even if it performs\n  quite well in the examples below.\n\n- using the Local Outlier Factor to measure the local deviation of a given\n  data point with respect to its neighbors by comparing their local density.\n\nThe ground truth about inliers and outliers is given by the points' colors\nwhile the orange-filled area indicates which points are reported as inliers\nby each method.\n\nHere, we assume that we know the fraction of outliers in the datasets.\nThus rather than using the 'predict' method of the objects, we set the\nthreshold on the decision_function to separate out the corresponding\nfraction.\n\n"
       ],
       "cell_type": "markdown",
       "metadata": {}
@@ -24,7 +24,7 @@
       "execution_count": null,
       "cell_type": "code",
       "source": [
-        "print(__doc__)\n\nimport numpy as np\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport matplotlib.font_manager\n\nfrom sklearn import svm\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.ensemble import IsolationForest\n\nrng = np.random.RandomState(42)\n\n# Example settings\nn_samples = 200\noutliers_fraction = 0.25\nclusters_separation = [0, 1, 2]\n\n# define two outlier detection tools to be compared\nclassifiers = {\n    \"One-Class SVM\": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,\n                                     kernel=\"rbf\", gamma=0.1),\n    \"Robust covariance\": EllipticEnvelope(contamination=outliers_fraction),\n    \"Isolation Forest\": IsolationForest(max_samples=n_samples,\n                                        contamination=outliers_fraction,\n                                        random_state=rng)}\n\n# Compare given classifiers under given settings\nxx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))\nn_inliers = int((1. - outliers_fraction) * n_samples)\nn_outliers = int(outliers_fraction * n_samples)\nground_truth = np.ones(n_samples, dtype=int)\nground_truth[-n_outliers:] = -1\n\n# Fit the problem with varying cluster separation\nfor i, offset in enumerate(clusters_separation):\n    np.random.seed(42)\n    # Data generation\n    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset\n    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset\n    X = np.r_[X1, X2]\n    # Add outliers\n    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]\n\n    # Fit the model\n    plt.figure(figsize=(10.8, 3.6))\n    for i, (clf_name, clf) in enumerate(classifiers.items()):\n        # fit the data and tag outliers\n        clf.fit(X)\n        scores_pred = clf.decision_function(X)\n        threshold = stats.scoreatpercentile(scores_pred,\n                                            100 * outliers_fraction)\n        y_pred = clf.predict(X)\n        n_errors = (y_pred != ground_truth).sum()\n        # plot the levels lines and the points\n        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n        Z = Z.reshape(xx.shape)\n        subplot = plt.subplot(1, 3, i + 1)\n        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),\n                         cmap=plt.cm.Blues_r)\n        a = subplot.contour(xx, yy, Z, levels=[threshold],\n                            linewidths=2, colors='red')\n        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],\n                         colors='orange')\n        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')\n        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')\n        subplot.axis('tight')\n        subplot.legend(\n            [a.collections[0], b, c],\n            ['learned decision function', 'true inliers', 'true outliers'],\n            prop=matplotlib.font_manager.FontProperties(size=11),\n            loc='lower right')\n        subplot.set_title(\"%d. %s (errors: %d)\" % (i + 1, clf_name, n_errors))\n        subplot.set_xlim((-7, 7))\n        subplot.set_ylim((-7, 7))\n    plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)\n\nplt.show()"
+        "import numpy as np\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport matplotlib.font_manager\n\nfrom sklearn import svm\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.ensemble import IsolationForest\nfrom sklearn.neighbors import LocalOutlierFactor\n\nprint(__doc__)\n\nrng = np.random.RandomState(42)\n\n# Example settings\nn_samples = 200\noutliers_fraction = 0.25\nclusters_separation = [0, 1, 2]\n\n# define two outlier detection tools to be compared\nclassifiers = {\n    \"One-Class SVM\": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,\n                                     kernel=\"rbf\", gamma=0.1),\n    \"Robust covariance\": EllipticEnvelope(contamination=outliers_fraction),\n    \"Isolation Forest\": IsolationForest(max_samples=n_samples,\n                                        contamination=outliers_fraction,\n                                        random_state=rng),\n    \"Local Outlier Factor\": LocalOutlierFactor(\n        n_neighbors=35,\n        contamination=outliers_fraction)}\n\n# Compare given classifiers under given settings\nxx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))\nn_inliers = int((1. - outliers_fraction) * n_samples)\nn_outliers = int(outliers_fraction * n_samples)\nground_truth = np.ones(n_samples, dtype=int)\nground_truth[-n_outliers:] = -1\n\n# Fit the problem with varying cluster separation\nfor i, offset in enumerate(clusters_separation):\n    np.random.seed(42)\n    # Data generation\n    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset\n    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset\n    X = np.r_[X1, X2]\n    # Add outliers\n    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]\n\n    # Fit the model\n    plt.figure(figsize=(9, 7))\n    for i, (clf_name, clf) in enumerate(classifiers.items()):\n        # fit the data and tag outliers\n        if clf_name == \"Local Outlier Factor\":\n            y_pred = clf.fit_predict(X)\n            scores_pred = clf.negative_outlier_factor_\n        else:\n            clf.fit(X)\n            scores_pred = clf.decision_function(X)\n            y_pred = clf.predict(X)\n        threshold = stats.scoreatpercentile(scores_pred,\n                                            100 * outliers_fraction)\n        n_errors = (y_pred != ground_truth).sum()\n        # plot the levels lines and the points\n        if clf_name == \"Local Outlier Factor\":\n            # decision_function is private for LOF\n            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])\n        else:\n            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n        Z = Z.reshape(xx.shape)\n        subplot = plt.subplot(2, 2, i + 1)\n        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),\n                         cmap=plt.cm.Blues_r)\n        a = subplot.contour(xx, yy, Z, levels=[threshold],\n                            linewidths=2, colors='red')\n        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],\n                         colors='orange')\n        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')\n        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')\n        subplot.axis('tight')\n        subplot.legend(\n            [a.collections[0], b, c],\n            ['learned decision function', 'true inliers', 'true outliers'],\n            prop=matplotlib.font_manager.FontProperties(size=10),\n            loc='lower right')\n        subplot.set_xlabel(\"%d. %s (errors: %d)\" % (i + 1, clf_name, n_errors))\n        subplot.set_xlim((-7, 7))\n        subplot.set_ylim((-7, 7))\n    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)\n    plt.suptitle(\"Outlier detection\")\n\nplt.show()"
       ],
       "outputs": [],
       "metadata": {

dev/_downloads/plot_outlier_detection.py

Lines changed: 29 additions & 12 deletions
@@ -18,6 +18,9 @@
   hence more adapted to large-dimensional settings, even if it performs
   quite well in the examples below.
 
+- using the Local Outlier Factor to measure the local deviation of a given
+  data point with respect to its neighbors by comparing their local density.
+
 The ground truth about inliers and outliers is given by the points' colors
 while the orange-filled area indicates which points are reported as inliers
 by each method.
@@ -27,7 +30,6 @@
 threshold on the decision_function to separate out the corresponding
 fraction.
 """
-print(__doc__)
 
 import numpy as np
 from scipy import stats
@@ -37,6 +39,9 @@
 from sklearn import svm
 from sklearn.covariance import EllipticEnvelope
 from sklearn.ensemble import IsolationForest
+from sklearn.neighbors import LocalOutlierFactor
+
+print(__doc__)
 
 rng = np.random.RandomState(42)
 
@@ -52,10 +57,13 @@
     "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
     "Isolation Forest": IsolationForest(max_samples=n_samples,
                                         contamination=outliers_fraction,
-                                        random_state=rng)}
+                                        random_state=rng),
+    "Local Outlier Factor": LocalOutlierFactor(
+        n_neighbors=35,
+        contamination=outliers_fraction)}
 
 # Compare given classifiers under given settings
-xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
+xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
 n_inliers = int((1. - outliers_fraction) * n_samples)
 n_outliers = int(outliers_fraction * n_samples)
 ground_truth = np.ones(n_samples, dtype=int)
@@ -72,19 +80,27 @@
     X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
 
     # Fit the model
-    plt.figure(figsize=(10.8, 3.6))
+    plt.figure(figsize=(9, 7))
     for i, (clf_name, clf) in enumerate(classifiers.items()):
         # fit the data and tag outliers
-        clf.fit(X)
-        scores_pred = clf.decision_function(X)
+        if clf_name == "Local Outlier Factor":
+            y_pred = clf.fit_predict(X)
+            scores_pred = clf.negative_outlier_factor_
+        else:
+            clf.fit(X)
+            scores_pred = clf.decision_function(X)
+            y_pred = clf.predict(X)
         threshold = stats.scoreatpercentile(scores_pred,
                                             100 * outliers_fraction)
-        y_pred = clf.predict(X)
         n_errors = (y_pred != ground_truth).sum()
         # plot the levels lines and the points
-        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        if clf_name == "Local Outlier Factor":
+            # decision_function is private for LOF
+            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
         Z = Z.reshape(xx.shape)
-        subplot = plt.subplot(1, 3, i + 1)
+        subplot = plt.subplot(2, 2, i + 1)
         subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                          cmap=plt.cm.Blues_r)
         a = subplot.contour(xx, yy, Z, levels=[threshold],
@@ -97,11 +113,12 @@
         subplot.legend(
             [a.collections[0], b, c],
             ['learned decision function', 'true inliers', 'true outliers'],
-            prop=matplotlib.font_manager.FontProperties(size=11),
+            prop=matplotlib.font_manager.FontProperties(size=10),
             loc='lower right')
-        subplot.set_title("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
+        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
         subplot.set_xlim((-7, 7))
         subplot.set_ylim((-7, 7))
-    plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)
+    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
+    plt.suptitle("Outlier detection")
 
 plt.show()
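
The LOF branches added above exist because, in this release, LocalOutlierFactor exposes no public decision_function and scores its own training set through fit_predict and the negative_outlier_factor_ attribute. A sketch of that dispatch pulled out into a helper (the helper name is ours, not the example's):

    from scipy import stats

    def fit_and_score(clf_name, clf, X, outliers_fraction):
        # LOF is fit and scored on its own training data via fit_predict;
        # the other estimators use the usual fit / decision_function / predict.
        if clf_name == "Local Outlier Factor":
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)
        threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
        return y_pred, scores_pred, threshold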
