
Commit d2acd9b

Pushing the docs to dev/ for branch: master, commit 788a458bba353c2cf3cfa5a15d6f68315149ef9e
1 parent 8061ec2 commit d2acd9b

933 files changed: 4385 additions, 3020 deletions

Two binary files changed (3.79 KB and 2.72 KB); binary contents not shown.

dev/_downloads/plot_lof.ipynb

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+{
+  "nbformat_minor": 0,
+  "nbformat": 4,
+  "cells": [
+    {
+      "execution_count": null,
+      "cell_type": "code",
+      "source": [
+        "%matplotlib inline"
+      ],
+      "outputs": [],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "source": [
+        "\n=================================================\nAnomaly detection with Local Outlier Factor (LOF)\n=================================================\n\nThis example presents the Local Outlier Factor (LOF) estimator. The LOF\nalgorithm is an unsupervised outlier detection method which computes the local\ndensity deviation of a given data point with respect to its neighbors.\nIt considers as outliers the samples that have a substantially lower density\nthan their neighbors.\n\nThe number of neighbors considered (parameter n_neighbors) is typically\nchosen 1) greater than the minimum number of objects a cluster has to contain,\nso that other objects can be local outliers relative to this cluster, and 2)\nsmaller than the maximum number of close-by objects that can potentially be\nlocal outliers.\nIn practice, such information is generally not available, and taking\nn_neighbors=20 appears to work well in general.\n\n"
+      ],
+      "cell_type": "markdown",
+      "metadata": {}
+    },
+    {
+      "execution_count": null,
+      "cell_type": "code",
+      "source": [
+        "import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.neighbors import LocalOutlierFactor\nprint(__doc__)\n\nnp.random.seed(42)\n\n# Generate train data\nX = 0.3 * np.random.randn(100, 2)\n# Generate some abnormal novel observations\nX_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))\nX = np.r_[X + 2, X - 2, X_outliers]\n\n# fit the model\nclf = LocalOutlierFactor(n_neighbors=20)\ny_pred = clf.fit_predict(X)\ny_pred_outliers = y_pred[200:]\n\n# plot the level sets of the decision function\nxx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))\nZ = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])\nZ = Z.reshape(xx.shape)\n\nplt.title(\"Local Outlier Factor (LOF)\")\nplt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)\n\na = plt.scatter(X[:200, 0], X[:200, 1], c='white')\nb = plt.scatter(X[200:, 0], X[200:, 1], c='red')\nplt.axis('tight')\nplt.xlim((-5, 5))\nplt.ylim((-5, 5))\nplt.legend([a, b],\n           [\"normal observations\",\n            \"abnormal observations\"],\n           loc=\"upper left\")\nplt.show()"
+      ],
+      "outputs": [],
+      "metadata": {
+        "collapsed": false
+      }
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2",
+      "language": "python"
+    },
+    "language_info": {
+      "mimetype": "text/x-python",
+      "nbconvert_exporter": "python",
+      "name": "python",
+      "file_extension": ".py",
+      "version": "2.7.12",
+      "pygments_lexer": "ipython2",
+      "codemirror_mode": {
+        "version": 2,
+        "name": "ipython"
+      }
+    }
+  }
+}
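
The generated notebook is plain nbformat-4 JSON, so its structure can be sanity-checked programmatically. A small sketch (assuming a local copy of the built docs at this relative path; the check itself is ours, not part of the commit):

    import json

    # Load the generated notebook and confirm the cell layout shown above
    with open("dev/_downloads/plot_lof.ipynb") as f:
        nb = json.load(f)

    assert nb["nbformat"] == 4
    print([cell["cell_type"] for cell in nb["cells"]])  # ['code', 'markdown', 'code']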

dev/_downloads/plot_lof.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+"""
+=================================================
+Anomaly detection with Local Outlier Factor (LOF)
+=================================================
+
+This example presents the Local Outlier Factor (LOF) estimator. The LOF
+algorithm is an unsupervised outlier detection method which computes the local
+density deviation of a given data point with respect to its neighbors.
+It considers as outliers the samples that have a substantially lower density
+than their neighbors.
+
+The number of neighbors considered (parameter n_neighbors) is typically
+chosen 1) greater than the minimum number of objects a cluster has to contain,
+so that other objects can be local outliers relative to this cluster, and 2)
+smaller than the maximum number of close-by objects that can potentially be
+local outliers.
+In practice, such information is generally not available, and taking
+n_neighbors=20 appears to work well in general.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.neighbors import LocalOutlierFactor
+print(__doc__)
+
+np.random.seed(42)
+
+# Generate train data
+X = 0.3 * np.random.randn(100, 2)
+# Generate some abnormal novel observations
+X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
+X = np.r_[X + 2, X - 2, X_outliers]
+
+# fit the model
+clf = LocalOutlierFactor(n_neighbors=20)
+y_pred = clf.fit_predict(X)
+y_pred_outliers = y_pred[200:]
+
+# plot the level sets of the decision function
+xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
+Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
+Z = Z.reshape(xx.shape)
+
+plt.title("Local Outlier Factor (LOF)")
+plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
+
+a = plt.scatter(X[:200, 0], X[:200, 1], c='white')
+b = plt.scatter(X[200:, 0], X[200:, 1], c='red')
+plt.axis('tight')
+plt.xlim((-5, 5))
+plt.ylim((-5, 5))
+plt.legend([a, b],
+           ["normal observations",
+            "abnormal observations"],
+           loc="upper left")
+plt.show()
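
For readers who want to try the new estimator outside the gallery, here is a minimal sketch of the public API the example exercises; the toy data and parameter values below are illustrative, not taken from the commit. Note that the example's contour plot calls clf._decision_function, which is private in this release; the public way to rank training points is the negative_outlier_factor_ attribute used here:

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(0)
    # 20 clustered inliers plus 5 uniform points (illustrative toy data)
    X = np.r_[0.3 * rng.randn(20, 2) + 2,
              rng.uniform(low=-4, high=4, size=(5, 2))]

    # contamination=0.2 matches the 5 of 25 outliers planted above
    clf = LocalOutlierFactor(n_neighbors=5, contamination=0.2)
    y_pred = clf.fit_predict(X)            # +1 for inliers, -1 for outliers
    scores = clf.negative_outlier_factor_  # more negative = more abnormal
    print(y_pred[-5:])                     # the uniform points should mostly be -1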

dev/_downloads/plot_outlier_detection.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
     },
     {
       "source": [
-        "\n==========================================\nOutlier detection with several methods.\n==========================================\n\nWhen the amount of contamination is known, this example illustrates three\ndifferent ways of performing `outlier_detection`:\n\n- based on a robust estimator of covariance, which is assuming that the\n  data are Gaussian distributed and performs better than the One-Class SVM\n  in that case.\n\n- using the One-Class SVM and its ability to capture the shape of the\n  data set, hence performing better when the data is strongly\n  non-Gaussian, i.e. with two well-separated clusters;\n\n- using the Isolation Forest algorithm, which is based on random forests and\n  hence more adapted to large-dimensional settings, even if it performs\n  quite well in the examples below.\n\nThe ground truth about inliers and outliers is given by the points' colors\nwhile the orange-filled area indicates which points are reported as inliers\nby each method.\n\nHere, we assume that we know the fraction of outliers in the datasets.\nThus rather than using the 'predict' method of the objects, we set the\nthreshold on the decision_function to separate out the corresponding\nfraction.\n\n"
+        "\n==========================================\nOutlier detection with several methods.\n==========================================\n\nWhen the amount of contamination is known, this example illustrates four\ndifferent ways of performing `outlier_detection`:\n\n- based on a robust estimator of covariance, which is assuming that the\n  data are Gaussian distributed and performs better than the One-Class SVM\n  in that case.\n\n- using the One-Class SVM and its ability to capture the shape of the\n  data set, hence performing better when the data is strongly\n  non-Gaussian, i.e. with two well-separated clusters;\n\n- using the Isolation Forest algorithm, which is based on random forests and\n  hence more adapted to large-dimensional settings, even if it performs\n  quite well in the examples below.\n\n- using the Local Outlier Factor to measure the local deviation of a given\n  data point with respect to its neighbors by comparing their local density.\n\nThe ground truth about inliers and outliers is given by the points' colors\nwhile the orange-filled area indicates which points are reported as inliers\nby each method.\n\nHere, we assume that we know the fraction of outliers in the datasets.\nThus rather than using the 'predict' method of the objects, we set the\nthreshold on the decision_function to separate out the corresponding\nfraction.\n\n"
       ],
       "cell_type": "markdown",
       "metadata": {}
@@ -24,7 +24,7 @@
       "execution_count": null,
       "cell_type": "code",
       "source": [
-        "print(__doc__)\n\nimport numpy as np\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport matplotlib.font_manager\n\nfrom sklearn import svm\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.ensemble import IsolationForest\n\nrng = np.random.RandomState(42)\n\n# Example settings\nn_samples = 200\noutliers_fraction = 0.25\nclusters_separation = [0, 1, 2]\n\n# define two outlier detection tools to be compared\nclassifiers = {\n    \"One-Class SVM\": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,\n                                     kernel=\"rbf\", gamma=0.1),\n    \"Robust covariance\": EllipticEnvelope(contamination=outliers_fraction),\n    \"Isolation Forest\": IsolationForest(max_samples=n_samples,\n                                        contamination=outliers_fraction,\n                                        random_state=rng)}\n\n# Compare given classifiers under given settings\nxx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))\nn_inliers = int((1. - outliers_fraction) * n_samples)\nn_outliers = int(outliers_fraction * n_samples)\nground_truth = np.ones(n_samples, dtype=int)\nground_truth[-n_outliers:] = -1\n\n# Fit the problem with varying cluster separation\nfor i, offset in enumerate(clusters_separation):\n    np.random.seed(42)\n    # Data generation\n    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset\n    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset\n    X = np.r_[X1, X2]\n    # Add outliers\n    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]\n\n    # Fit the model\n    plt.figure(figsize=(10.8, 3.6))\n    for i, (clf_name, clf) in enumerate(classifiers.items()):\n        # fit the data and tag outliers\n        clf.fit(X)\n        scores_pred = clf.decision_function(X)\n        threshold = stats.scoreatpercentile(scores_pred,\n                                            100 * outliers_fraction)\n        y_pred = clf.predict(X)\n        n_errors = (y_pred != ground_truth).sum()\n        # plot the levels lines and the points\n        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n        Z = Z.reshape(xx.shape)\n        subplot = plt.subplot(1, 3, i + 1)\n        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),\n                         cmap=plt.cm.Blues_r)\n        a = subplot.contour(xx, yy, Z, levels=[threshold],\n                            linewidths=2, colors='red')\n        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],\n                         colors='orange')\n        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')\n        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')\n        subplot.axis('tight')\n        subplot.legend(\n            [a.collections[0], b, c],\n            ['learned decision function', 'true inliers', 'true outliers'],\n            prop=matplotlib.font_manager.FontProperties(size=11),\n            loc='lower right')\n        subplot.set_title(\"%d. %s (errors: %d)\" % (i + 1, clf_name, n_errors))\n        subplot.set_xlim((-7, 7))\n        subplot.set_ylim((-7, 7))\n    plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)\n\nplt.show()"
+        "import numpy as np\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport matplotlib.font_manager\n\nfrom sklearn import svm\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.ensemble import IsolationForest\nfrom sklearn.neighbors import LocalOutlierFactor\n\nprint(__doc__)\n\nrng = np.random.RandomState(42)\n\n# Example settings\nn_samples = 200\noutliers_fraction = 0.25\nclusters_separation = [0, 1, 2]\n\n# define two outlier detection tools to be compared\nclassifiers = {\n    \"One-Class SVM\": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,\n                                     kernel=\"rbf\", gamma=0.1),\n    \"Robust covariance\": EllipticEnvelope(contamination=outliers_fraction),\n    \"Isolation Forest\": IsolationForest(max_samples=n_samples,\n                                        contamination=outliers_fraction,\n                                        random_state=rng),\n    \"Local Outlier Factor\": LocalOutlierFactor(\n        n_neighbors=35,\n        contamination=outliers_fraction)}\n\n# Compare given classifiers under given settings\nxx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))\nn_inliers = int((1. - outliers_fraction) * n_samples)\nn_outliers = int(outliers_fraction * n_samples)\nground_truth = np.ones(n_samples, dtype=int)\nground_truth[-n_outliers:] = -1\n\n# Fit the problem with varying cluster separation\nfor i, offset in enumerate(clusters_separation):\n    np.random.seed(42)\n    # Data generation\n    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset\n    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset\n    X = np.r_[X1, X2]\n    # Add outliers\n    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]\n\n    # Fit the model\n    plt.figure(figsize=(9, 7))\n    for i, (clf_name, clf) in enumerate(classifiers.items()):\n        # fit the data and tag outliers\n        if clf_name == \"Local Outlier Factor\":\n            y_pred = clf.fit_predict(X)\n            scores_pred = clf.negative_outlier_factor_\n        else:\n            clf.fit(X)\n            scores_pred = clf.decision_function(X)\n            y_pred = clf.predict(X)\n        threshold = stats.scoreatpercentile(scores_pred,\n                                            100 * outliers_fraction)\n        n_errors = (y_pred != ground_truth).sum()\n        # plot the levels lines and the points\n        if clf_name == \"Local Outlier Factor\":\n            # decision_function is private for LOF\n            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])\n        else:\n            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n        Z = Z.reshape(xx.shape)\n        subplot = plt.subplot(2, 2, i + 1)\n        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),\n                         cmap=plt.cm.Blues_r)\n        a = subplot.contour(xx, yy, Z, levels=[threshold],\n                            linewidths=2, colors='red')\n        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],\n                         colors='orange')\n        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')\n        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')\n        subplot.axis('tight')\n        subplot.legend(\n            [a.collections[0], b, c],\n            ['learned decision function', 'true inliers', 'true outliers'],\n            prop=matplotlib.font_manager.FontProperties(size=10),\n            loc='lower right')\n        subplot.set_xlabel(\"%d. %s (errors: %d)\" % (i + 1, clf_name, n_errors))\n        subplot.set_xlim((-7, 7))\n        subplot.set_ylim((-7, 7))\n    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)\n    plt.suptitle(\"Outlier detection\")\n\nplt.show()"
       ],
       "outputs": [],
       "metadata": {

dev/_downloads/plot_outlier_detection.py

Lines changed: 29 additions & 12 deletions
@@ -18,6 +18,9 @@
   hence more adapted to large-dimensional settings, even if it performs
   quite well in the examples below.
 
+- using the Local Outlier Factor to measure the local deviation of a given
+  data point with respect to its neighbors by comparing their local density.
+
 The ground truth about inliers and outliers is given by the points' colors
 while the orange-filled area indicates which points are reported as inliers
 by each method.
@@ -27,7 +30,6 @@
 threshold on the decision_function to separate out the corresponding
 fraction.
 """
-print(__doc__)
 
 import numpy as np
 from scipy import stats
@@ -37,6 +39,9 @@
 from sklearn import svm
 from sklearn.covariance import EllipticEnvelope
 from sklearn.ensemble import IsolationForest
+from sklearn.neighbors import LocalOutlierFactor
+
+print(__doc__)
 
 rng = np.random.RandomState(42)
 
@@ -52,10 +57,13 @@
     "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
     "Isolation Forest": IsolationForest(max_samples=n_samples,
                                         contamination=outliers_fraction,
-                                        random_state=rng)}
+                                        random_state=rng),
+    "Local Outlier Factor": LocalOutlierFactor(
+        n_neighbors=35,
+        contamination=outliers_fraction)}
 
 # Compare given classifiers under given settings
-xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
+xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
 n_inliers = int((1. - outliers_fraction) * n_samples)
 n_outliers = int(outliers_fraction * n_samples)
 ground_truth = np.ones(n_samples, dtype=int)
@@ -72,19 +80,27 @@
     X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
 
     # Fit the model
-    plt.figure(figsize=(10.8, 3.6))
+    plt.figure(figsize=(9, 7))
     for i, (clf_name, clf) in enumerate(classifiers.items()):
         # fit the data and tag outliers
-        clf.fit(X)
-        scores_pred = clf.decision_function(X)
+        if clf_name == "Local Outlier Factor":
+            y_pred = clf.fit_predict(X)
+            scores_pred = clf.negative_outlier_factor_
+        else:
+            clf.fit(X)
+            scores_pred = clf.decision_function(X)
+            y_pred = clf.predict(X)
         threshold = stats.scoreatpercentile(scores_pred,
                                             100 * outliers_fraction)
-        y_pred = clf.predict(X)
         n_errors = (y_pred != ground_truth).sum()
         # plot the levels lines and the points
-        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        if clf_name == "Local Outlier Factor":
+            # decision_function is private for LOF
+            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
         Z = Z.reshape(xx.shape)
-        subplot = plt.subplot(1, 3, i + 1)
+        subplot = plt.subplot(2, 2, i + 1)
         subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                          cmap=plt.cm.Blues_r)
         a = subplot.contour(xx, yy, Z, levels=[threshold],
@@ -97,11 +113,12 @@
         subplot.legend(
             [a.collections[0], b, c],
             ['learned decision function', 'true inliers', 'true outliers'],
-            prop=matplotlib.font_manager.FontProperties(size=11),
+            prop=matplotlib.font_manager.FontProperties(size=10),
             loc='lower right')
-        subplot.set_title("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
+        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
         subplot.set_xlim((-7, 7))
         subplot.set_ylim((-7, 7))
-    plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)
+    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
+    plt.suptitle("Outlier detection")
 
 plt.show()
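
The LOF branches added above exist because, in this release, LocalOutlierFactor exposes no public decision_function and scores its own training set through fit_predict and the negative_outlier_factor_ attribute. A sketch of that dispatch pulled out into a helper (the helper name is ours, not the example's):

    from scipy import stats

    def fit_and_score(clf_name, clf, X, outliers_fraction):
        # LOF is fit and scored on its own training data via fit_predict;
        # the other estimators use the usual fit / decision_function / predict.
        if clf_name == "Local Outlier Factor":
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)
        threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
        return y_pred, scores_pred, threshold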
