scikit-learn
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
6.4 KB b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
6.4 KB
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
8.2 KB b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
8.2 KB
diff --git a/‎dev/_downloads/b7e32fe54d613dce0d3c376377af061d/plot_outlier_detection_bench.py
Lines changed: 193 additions & 0 deletions b/‎dev/_downloads/b7e32fe54d613dce0d3c376377af061d/plot_outlier_detection_bench.py
Lines changed: 193 additions & 0 deletions
diff --git a/‎dev/_downloads/eacb6a63c887dafcff02b3cee64854ef/plot_outlier_detection_bench.ipynb
Lines changed: 108 additions & 0 deletions b/‎dev/_downloads/eacb6a63c887dafcff02b3cee64854ef/plot_outlier_detection_bench.ipynb
Lines changed: 108 additions & 0 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
156 KB b/‎dev/_downloads/scikit-learn-docs.zip
156 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-185 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-185 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
80 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
80 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
2 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
2 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-69 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-69 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_anomaly_comparison_001.png
128 Bytes b/‎dev/_images/sphx_glr_plot_anomaly_comparison_001.png
128 Bytes
@@ -0,0 +1,193 @@
+"""
+==========================================
+Evaluation of outlier detection estimators
+==========================================
+
+This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor`
+(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on
+classical anomaly detection datasets. The algorithm performance
+is assessed in an outlier detection context:
+
+1. The algorithms are trained on the whole dataset which is assumed to
+   contain outliers.
+
+2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed
+   on the same dataset using the knowledge of the labels.
+
+"""
+
+# Author: Pharuj Rajborirug <[email protected]>
+# License: BSD 3 clause
+
+print(__doc__)
+
+# %%
+# Define a data preprocessing function
+# ------------------------------------
+#
+# The example uses real-world datasets available in
+# :mod:`sklearn.datasets` and the sample size of some datasets is reduced
+# to speed up computation. After the data preprocessing, the datasets' targets
+# will have two classes, 0 representing inliers and 1 representing outliers.
+# The `preprocess_dataset` function returns data and target.
+
+import numpy as np
+from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
+from sklearn.preprocessing import LabelBinarizer
+import pandas as pd
+
+rng = np.random.RandomState(42)
+
+
+def preprocess_dataset(dataset_name):
+
+    # loading and vectorization
+    print(f"Loading {dataset_name} data")
+    if dataset_name in ["http", "smtp", "SA", "SF"]:
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)
+        X = dataset.data
+        y = dataset.target
+        lb = LabelBinarizer()
+
+        if dataset_name == "SF":
+            idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)
+            X = X[idx]  # reduce the sample size
+            y = y[idx]
+            x1 = lb.fit_transform(X[:, 1].astype(str))
+            X = np.c_[X[:, :1], x1, X[:, 2:]]
+        elif dataset_name == "SA":
+            idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)
+            X = X[idx]  # reduce the sample size
+            y = y[idx]
+            x1 = lb.fit_transform(X[:, 1].astype(str))
+            x2 = lb.fit_transform(X[:, 2].astype(str))
+            x3 = lb.fit_transform(X[:, 3].astype(str))
+            X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
+        y = (y != b"normal.").astype(int)
+    if dataset_name == "forestcover":
+        dataset = fetch_covtype()
+        X = dataset.data
+        y = dataset.target
+        idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)
+        X = X[idx]  # reduce the sample size
+        y = y[idx]
+
+        # inliers are the samples with target label 2
+        # outliers are the samples with target label 4
+        s = (y == 2) + (y == 4)
+        X = X[s, :]
+        y = y[s]
+        y = (y != 2).astype(int)
+    if dataset_name in ["glass", "wdbc", "cardiotocography"]:
+        dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)
+        X = dataset.data
+        y = dataset.target
+
+        if dataset_name == "glass":
+            s = y == "tableware"
+            y = s.astype(int)
+        if dataset_name == "wdbc":
+            s = y == "2"
+            y = s.astype(int)
+            X_mal, y_mal = X[s], y[s]
+            X_ben, y_ben = X[~s], y[~s]
+
+            # downsample the outlier class to 39 points (9.8% outliers)
+            idx = rng.choice(y_mal.shape[0], 39, replace=False)
+            X_mal2 = X_mal[idx]
+            y_mal2 = y_mal[idx]
+            X = np.concatenate((X_ben, X_mal2), axis=0)
+            y = np.concatenate((y_ben, y_mal2), axis=0)
+        if dataset_name == "cardiotocography":
+            s = y == "3"
+            y = s.astype(int)
+    # 0 represents inliers, and 1 represents outliers
+    y = pd.Series(y, dtype="category")
+    return (X, y)
+
+
+# %%
+# Define an outlier prediction function
+# -------------------------------------
+# There is no particular reason to choose algorithms
+# :class:`~sklearn.neighbors.LocalOutlierFactor` and
+# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that
+# different algorithms perform well on different datasets. The following
+# `compute_prediction` function returns the average outlier score of X.
+
+
+from sklearn.neighbors import LocalOutlierFactor
+from sklearn.ensemble import IsolationForest
+
+
+def compute_prediction(X, model_name):
+
+    print(f"Computing {model_name} prediction...")
+    if model_name == "LOF":
+        clf = LocalOutlierFactor(n_neighbors=20, contamination="auto")
+        clf.fit(X)
+        y_pred = clf.negative_outlier_factor_
+    if model_name == "IForest":
+        clf = IsolationForest(random_state=rng, contamination="auto")
+        y_pred = clf.fit(X).decision_function(X)
+    return y_pred
+
+
+# %%
+# Plot and interpret results
+# --------------------------
+#
+# The algorithm performance relates to how good the true positive rate (TPR)
+# is at low values of the false positive rate (FPR). The best algorithms
+# have the curve on the top-left of the plot and the area under curve (AUC)
+# close to 1. The diagonal dotted line represents a random classification
+# of outliers and inliers.
+
+
+import math
+import matplotlib.pyplot as plt
+from sklearn.metrics import RocCurveDisplay
+
+datasets_name = [
+    "http",
+    "smtp",
+    "SA",
+    "SF",
+    "forestcover",
+    "glass",
+    "wdbc",
+    "cardiotocography",
+]
+
+models_name = [
+    "LOF",
+    "IForest",
+]
+
+# plotting parameters
+cols = 2
+linewidth = 1
+pos_label = 0  # label 0 (inliers) is used as the positive class
+rows = math.ceil(len(datasets_name) / cols)
+
+fig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3))
+
+for i, dataset_name in enumerate(datasets_name):
+    (X, y) = preprocess_dataset(dataset_name=dataset_name)
+
+    for model_name in models_name:
+        y_pred = compute_prediction(X, model_name=model_name)
+        display = RocCurveDisplay.from_predictions(
+            y,
+            y_pred,
+            pos_label=pos_label,
+            name=model_name,
+            linewidth=linewidth,
+            ax=axs[i // cols, i % cols],
+        )
+    axs[i // cols, i % cols].plot([0, 1], [0, 1], linewidth=linewidth, linestyle=":")
+    axs[i // cols, i % cols].set_title(dataset_name)
+    axs[i // cols, i % cols].set_xlabel("False Positive Rate")
+    axs[i // cols, i % cols].set_ylabel("True Positive Rate")
+plt.tight_layout(pad=2.0)  # spacing between subplots
+plt.show()
@@ -0,0 +1,108 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Evaluation of outlier detection estimators\n\nThis example benchmarks outlier detection algorithms, `local_outlier_factor`\n(LOF) and `isolation_forest` (IForest), using ROC curves on\nclassical anomaly detection datasets. The algorithm performance\nis assessed in an outlier detection context:\n\n1. The algorithms are trained on the whole dataset which is assumed to\ncontain outliers.\n\n2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed\non the same dataset using the knowledge of the labels.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Author: Pharuj Rajborirug <[email protected]>\n# License: BSD 3 clause\n\nprint(__doc__)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Define a data preprocessing function\n\nThe example uses real-world datasets available in\n:class:`sklearn.datasets` and the sample size of some datasets is reduced\nto speed up computation. After the data preprocessing, the datasets' targets\nwill have two classes, 0 representing inliers and 1 representing outliers.\nThe `preprocess_dataset` function returns data and target.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\nfrom sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml\nfrom sklearn.preprocessing import LabelBinarizer\nimport pandas as pd\n\nrng = np.random.RandomState(42)\n\n\ndef preprocess_dataset(dataset_name):\n\n    # loading and vectorization\n    print(f\"Loading {dataset_name} data\")\n    if dataset_name in [\"http\", \"smtp\", \"SA\", \"SF\"]:\n        dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)\n        X = dataset.data\n        y = dataset.target\n        lb = LabelBinarizer()\n\n        if dataset_name == \"SF\":\n            idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)\n            X = X[idx]  # reduce the sample size\n            y = y[idx]\n            x1 = lb.fit_transform(X[:, 1].astype(str))\n            X = np.c_[X[:, :1], x1, X[:, 2:]]\n        elif dataset_name == \"SA\":\n            idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)\n            X = X[idx]  # reduce the sample size\n            y = y[idx]\n            x1 = lb.fit_transform(X[:, 1].astype(str))\n            x2 = lb.fit_transform(X[:, 2].astype(str))\n            x3 = lb.fit_transform(X[:, 3].astype(str))\n            X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]\n        y = (y != b\"normal.\").astype(int)\n    if dataset_name == \"forestcover\":\n        dataset = fetch_covtype()\n        X = dataset.data\n        y = dataset.target\n        idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)\n        X = X[idx]  # reduce the sample size\n        y = y[idx]\n\n        # inliers are those with attribute 2\n        # outliers are those with attribute 4\n        s = (y == 2) + (y == 4)\n        X = X[s, :]\n        y = y[s]\n        y = (y != 2).astype(int)\n    if dataset_name in [\"glass\", \"wdbc\", \"cardiotocography\"]:\n        dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)\n        X = dataset.data\n        y = 
dataset.target\n\n        if dataset_name == \"glass\":\n            s = y == \"tableware\"\n            y = s.astype(int)\n        if dataset_name == \"wdbc\":\n            s = y == \"2\"\n            y = s.astype(int)\n            X_mal, y_mal = X[s], y[s]\n            X_ben, y_ben = X[~s], y[~s]\n\n            # downsampled to 39 points (9.8% outliers)\n            idx = rng.choice(y_mal.shape[0], 39, replace=False)\n            X_mal2 = X_mal[idx]\n            y_mal2 = y_mal[idx]\n            X = np.concatenate((X_ben, X_mal2), axis=0)\n            y = np.concatenate((y_ben, y_mal2), axis=0)\n        if dataset_name == \"cardiotocography\":\n            s = y == \"3\"\n            y = s.astype(int)\n    # 0 represents inliers, and 1 represents outliers\n    y = pd.Series(y, dtype=\"category\")\n    return (X, y)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Define an outlier prediction function\nThere is no particular reason to choose algorithms\n:class:`~sklearn.neighbors.LocalOutlierFactor` and\n:class:`~sklearn.ensemble.IsolationForest`. The goal is to show that\ndifferent algorithms perform well on different datasets. The following\n`compute_prediction` function returns the average outlier score of X.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.neighbors import LocalOutlierFactor\nfrom sklearn.ensemble import IsolationForest\n\n\ndef compute_prediction(X, model_name):\n\n    print(f\"Computing {model_name} prediction...\")\n    if model_name == \"LOF\":\n        clf = LocalOutlierFactor(n_neighbors=20, contamination=\"auto\")\n        clf.fit(X)\n        y_pred = clf.negative_outlier_factor_\n    if model_name == \"IForest\":\n        clf = IsolationForest(random_state=rng, contamination=\"auto\")\n        y_pred = clf.fit(X).decision_function(X)\n    return y_pred"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plot and interpret results\n\nThe algorithm performance relates to how good the true positive rate (TPR)\nis at low values of the false positive rate (FPR). The best algorithms\nhave the curve on the top-left of the plot and the area under curve (AUC)\nclose to 1. The diagonal dotted line represents a random classification\nof outliers and inliers.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import math\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import RocCurveDisplay\n\ndatasets_name = [\n    \"http\",\n    \"smtp\",\n    \"SA\",\n    \"SF\",\n    \"forestcover\",\n    \"glass\",\n    \"wdbc\",\n    \"cardiotocography\",\n]\n\nmodels_name = [\n    \"LOF\",\n    \"IForest\",\n]\n\n# plotting parameters\ncols = 2\nlinewidth = 1\npos_label = 0  # mean 0 belongs to positive class\nrows = math.ceil(len(datasets_name) / cols)\n\nfig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3))\n\nfor i, dataset_name in enumerate(datasets_name):\n    (X, y) = preprocess_dataset(dataset_name=dataset_name)\n\n    for model_name in models_name:\n        y_pred = compute_prediction(X, model_name=model_name)\n        display = RocCurveDisplay.from_predictions(\n            y,\n            y_pred,\n            pos_label=pos_label,\n            name=model_name,\n            linewidth=linewidth,\n            ax=axs[i // cols, i % cols],\n        )\n    axs[i // cols, i % cols].plot([0, 1], [0, 1], linewidth=linewidth, linestyle=\":\")\n    axs[i // cols, i % cols].set_title(dataset_name)\n    axs[i // cols, i % cols].set_xlabel(\"False Positive Rate\")\n    axs[i // cols, i % cols].set_ylabel(\"True Positive Rate\")\nplt.tight_layout(pad=2.0)  # spacing between subplots\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.12"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}