
Commit 0412ce5

Pushing the docs to dev/ for branch: main, commit a3a16046e2d5100657da6049d03fe93f96264cdc
1 parent 639b25e commit 0412ce5

File tree: 1,309 files changed (+5997 / -6002 lines)


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 6bd6b226cfe6702d2bb6a77ccc4183ee
+config: d17036dce23f3b64de69fbfad3c2a4bc
 tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/609eccf9ab7d476daf68967ce1fce0b7/plot_outlier_detection_wine.py

Lines changed: 64 additions & 84 deletions
@@ -21,66 +21,64 @@
 estimation of the data structure, but yet accurate to some extent.
 The One-Class SVM does not assume any parametric form of the data distribution
 and can therefore model the complex shape of the data much better.
-
-First example
--------------
-The first example illustrates how the Minimum Covariance Determinant
-robust estimator can help concentrate on a relevant cluster when outlying
-points exist. Here the empirical covariance estimation is skewed by points
-outside of the main cluster. Of course, some screening tools would have pointed
-out the presence of two clusters (Support Vector Machines, Gaussian Mixture
-Models, univariate outlier detection, ...). But had it been a high-dimensional
-example, none of these could be applied that easily.
-
 """
 
 # Author: Virgile Fritsch <[email protected]>
 # License: BSD 3 clause
 
-import matplotlib.font_manager
-import matplotlib.pyplot as plt
-import numpy as np
-
+# %%
+# First example
+# -------------
+#
+# The first example illustrates how the Minimum Covariance Determinant
+# robust estimator can help concentrate on a relevant cluster when outlying
+# points exist. Here the empirical covariance estimation is skewed by points
+# outside of the main cluster. Of course, some screening tools would have pointed
+# out the presence of two clusters (Support Vector Machines, Gaussian Mixture
+# Models, univariate outlier detection, ...). But had it been a high-dimensional
+# example, none of these could be applied that easily.
 from sklearn.covariance import EllipticEnvelope
-from sklearn.datasets import load_wine
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.svm import OneClassSVM
 
-# Define "classifiers" to be used
-classifiers = {
+estimators = {
     "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
     "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(
         contamination=0.25
     ),
     "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
 }
-colors = ["m", "g", "b"]
-legend1 = {}
-legend2 = {}
 
-# Get data
-X1 = load_wine()["data"][:, [1, 2]]  # two clusters
+# %%
+import matplotlib.lines as mlines
+import matplotlib.pyplot as plt
 
+from sklearn.datasets import load_wine
+
+X = load_wine()["data"][:, [1, 2]]  # two clusters
+
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(1)
-    clf.fit(X1)
-    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
-    Z1 = Z1.reshape(xx1.shape)
-    legend1[clf_name] = plt.contour(
-        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend1_values_list = list(legend1.values())
-legend1_keys_list = list(legend1.keys())
 
-# Plot the results (= shape of the data points cloud)
-plt.figure(1)  # two clusters
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X1[:, 0], X1[:, 1], color="black")
+ax.scatter(X[:, 0], X[:, 1], color="black")
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate(
+ax.annotate(
     "outlying points",
     xy=(4, 2),
     xycoords="data",
@@ -89,26 +87,17 @@
     bbox=bbox_args,
     arrowprops=arrow_args,
 )
-plt.xlim((xx1.min(), xx1.max()))
-plt.ylim((yy1.min(), yy1.max()))
-plt.legend(
-    (
-        legend1_values_list[0].collections[0],
-        legend1_values_list[1].collections[0],
-        legend1_values_list[2].collections[0],
-    ),
-    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+ax.legend(handles=legend_lines, loc="upper center")
+_ = ax.set(
+    xlabel="ash",
+    ylabel="malic_acid",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("ash")
-plt.xlabel("malic_acid")
-
-plt.show()
 
 # %%
 # Second example
 # --------------
+#
 # The second example shows the ability of the Minimum Covariance Determinant
 # robust estimator of covariance to concentrate on the main mode of the data
 # distribution: the ___location seems to be well estimated, although the
@@ -117,41 +106,32 @@
 # capture the real data structure, but the difficulty is to adjust its kernel
 # bandwidth parameter so as to obtain a good compromise between the shape of
 # the data scatter matrix and the risk of over-fitting the data.
+X = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
 
-# Get data
-X2 = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
-
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(2)
-    clf.fit(X2)
-    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
-    Z2 = Z2.reshape(xx2.shape)
-    legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
    )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend2_values_list = list(legend2.values())
-legend2_keys_list = list(legend2.keys())
-
-# Plot the results (= shape of the data points cloud)
-plt.figure(2)  # "banana" shape
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X2[:, 0], X2[:, 1], color="black")
-plt.xlim((xx2.min(), xx2.max()))
-plt.ylim((yy2.min(), yy2.max()))
-plt.legend(
-    (
-        legend2_values_list[0].collections[0],
-        legend2_values_list[1].collections[0],
-        legend2_values_list[2].collections[0],
-    ),
-    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+
+ax.scatter(X[:, 0], X[:, 1], color="black")
+ax.legend(handles=legend_lines, loc="upper center")
+ax.set(
+    xlabel="flavanoids",
+    ylabel="color_intensity",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("color_intensity")
-plt.xlabel("flavanoids")
 
 plt.show()
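As a side note for readers of this diff: the refactor above replaces the hand-built np.meshgrid / plt.contour frontier with sklearn.inspection.DecisionBoundaryDisplay. Below is a minimal, self-contained sketch of that pattern. It is not part of the commit, it uses synthetic data in place of the Wine features, and it assumes a scikit-learn version recent enough to ship DecisionBoundaryDisplay (introduced around 1.1).

# Minimal sketch (not from the commit): plot an outlier-detection frontier
# with DecisionBoundaryDisplay instead of a hand-built meshgrid + contour.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.covariance import EllipticEnvelope
from sklearn.inspection import DecisionBoundaryDisplay

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 2))  # synthetic 2D data standing in for the wine features

estimator = EllipticEnvelope(contamination=0.25).fit(X)

fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    estimator,
    X,
    response_method="decision_function",
    plot_method="contour",
    levels=[0],  # draw the frontier where decision_function crosses zero
    colors="tab:blue",
    ax=ax,
)
ax.scatter(X[:, 0], X[:, 1], color="black", s=10)
plt.show()

The display computes the grid and evaluates decision_function internally, so the example no longer needs numpy or per-axis plot limits.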

dev/_downloads/dd28338257df6d2a7e6b9ff5f2743272/plot_outlier_detection_wine.ipynb

Lines changed: 33 additions & 4 deletions
@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Outlier detection on a real data set\n\nThis example illustrates the need for robust covariance estimation\non a real data set. It is useful both for outlier detection and for\na better understanding of the data structure.\n\nWe selected two sets of two variables from the Wine data set\nas an illustration of what kind of analysis can be done with several\noutlier detection tools. For the purpose of visualization, we are working\nwith two-dimensional examples, but one should be aware that things are\nnot so trivial in high-dimension, as it will be pointed out.\n\nIn both examples below, the main result is that the empirical covariance\nestimate, as a non-robust one, is highly influenced by the heterogeneous\nstructure of the observations. Although the robust covariance estimate is\nable to focus on the main mode of the data distribution, it sticks to the\nassumption that the data should be Gaussian distributed, yielding some biased\nestimation of the data structure, but yet accurate to some extent.\nThe One-Class SVM does not assume any parametric form of the data distribution\nand can therefore model the complex shape of the data much better.\n\n## First example\nThe first example illustrates how the Minimum Covariance Determinant\nrobust estimator can help concentrate on a relevant cluster when outlying\npoints exist. Here the empirical covariance estimation is skewed by points\noutside of the main cluster. Of course, some screening tools would have pointed\nout the presence of two clusters (Support Vector Machines, Gaussian Mixture\nModels, univariate outlier detection, ...). But had it been a high-dimensional\nexample, none of these could be applied that easily.\n"
+"\n# Outlier detection on a real data set\n\nThis example illustrates the need for robust covariance estimation\non a real data set. It is useful both for outlier detection and for\na better understanding of the data structure.\n\nWe selected two sets of two variables from the Wine data set\nas an illustration of what kind of analysis can be done with several\noutlier detection tools. For the purpose of visualization, we are working\nwith two-dimensional examples, but one should be aware that things are\nnot so trivial in high-dimension, as it will be pointed out.\n\nIn both examples below, the main result is that the empirical covariance\nestimate, as a non-robust one, is highly influenced by the heterogeneous\nstructure of the observations. Although the robust covariance estimate is\nable to focus on the main mode of the data distribution, it sticks to the\nassumption that the data should be Gaussian distributed, yielding some biased\nestimation of the data structure, but yet accurate to some extent.\nThe One-Class SVM does not assume any parametric form of the data distribution\nand can therefore model the complex shape of the data much better.\n"
 ]
 },
 {
@@ -15,14 +15,14 @@
 },
 "outputs": [],
 "source": [
-"# Author: Virgile Fritsch <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.font_manager\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.datasets import load_wine\nfrom sklearn.svm import OneClassSVM\n\n# Define \"classifiers\" to be used\nclassifiers = {\n    \"Empirical Covariance\": EllipticEnvelope(support_fraction=1.0, contamination=0.25),\n    \"Robust Covariance (Minimum Covariance Determinant)\": EllipticEnvelope(\n        contamination=0.25\n    ),\n    \"OCSVM\": OneClassSVM(nu=0.25, gamma=0.35),\n}\ncolors = [\"m\", \"g\", \"b\"]\nlegend1 = {}\nlegend2 = {}\n\n# Get data\nX1 = load_wine()[\"data\"][:, [1, 2]]  # two clusters\n\n# Learn a frontier for outlier detection with several classifiers\nxx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))\nfor i, (clf_name, clf) in enumerate(classifiers.items()):\n    plt.figure(1)\n    clf.fit(X1)\n    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])\n    Z1 = Z1.reshape(xx1.shape)\n    legend1[clf_name] = plt.contour(\n        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]\n    )\n\nlegend1_values_list = list(legend1.values())\nlegend1_keys_list = list(legend1.keys())\n\n# Plot the results (= shape of the data points cloud)\nplt.figure(1)  # two clusters\nplt.title(\"Outlier detection on a real data set (wine recognition)\")\nplt.scatter(X1[:, 0], X1[:, 1], color=\"black\")\nbbox_args = dict(boxstyle=\"round\", fc=\"0.8\")\narrow_args = dict(arrowstyle=\"->\")\nplt.annotate(\n    \"outlying points\",\n    xy=(4, 2),\n    xycoords=\"data\",\n    textcoords=\"data\",\n    xytext=(3, 1.25),\n    bbox=bbox_args,\n    arrowprops=arrow_args,\n)\nplt.xlim((xx1.min(), xx1.max()))\nplt.ylim((yy1.min(), yy1.max()))\nplt.legend(\n    (\n        legend1_values_list[0].collections[0],\n        legend1_values_list[1].collections[0],\n        legend1_values_list[2].collections[0],\n    ),\n    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),\n    loc=\"upper center\",\n    prop=matplotlib.font_manager.FontProperties(size=11),\n)\nplt.ylabel(\"ash\")\nplt.xlabel(\"malic_acid\")\n\nplt.show()"
+"# Author: Virgile Fritsch <[email protected]>\n# License: BSD 3 clause"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Second example\nThe second example shows the ability of the Minimum Covariance Determinant\nrobust estimator of covariance to concentrate on the main mode of the data\ndistribution: the ___location seems to be well estimated, although the\ncovariance is hard to estimate due to the banana-shaped distribution. Anyway,\nwe can get rid of some outlying observations. The One-Class SVM is able to\ncapture the real data structure, but the difficulty is to adjust its kernel\nbandwidth parameter so as to obtain a good compromise between the shape of\nthe data scatter matrix and the risk of over-fitting the data.\n\n"
+"## First example\n\nThe first example illustrates how the Minimum Covariance Determinant\nrobust estimator can help concentrate on a relevant cluster when outlying\npoints exist. Here the empirical covariance estimation is skewed by points\noutside of the main cluster. Of course, some screening tools would have pointed\nout the presence of two clusters (Support Vector Machines, Gaussian Mixture\nModels, univariate outlier detection, ...). But had it been a high-dimensional\nexample, none of these could be applied that easily.\n\n"
 ]
 },
 {
@@ -33,7 +33,36 @@
 },
 "outputs": [],
 "source": [
-"# Get data\nX2 = load_wine()[\"data\"][:, [6, 9]]  # \"banana\"-shaped\n\n# Learn a frontier for outlier detection with several classifiers\nxx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))\nfor i, (clf_name, clf) in enumerate(classifiers.items()):\n    plt.figure(2)\n    clf.fit(X2)\n    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])\n    Z2 = Z2.reshape(xx2.shape)\n    legend2[clf_name] = plt.contour(\n        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]\n    )\n\nlegend2_values_list = list(legend2.values())\nlegend2_keys_list = list(legend2.keys())\n\n# Plot the results (= shape of the data points cloud)\nplt.figure(2)  # \"banana\" shape\nplt.title(\"Outlier detection on a real data set (wine recognition)\")\nplt.scatter(X2[:, 0], X2[:, 1], color=\"black\")\nplt.xlim((xx2.min(), xx2.max()))\nplt.ylim((yy2.min(), yy2.max()))\nplt.legend(\n    (\n        legend2_values_list[0].collections[0],\n        legend2_values_list[1].collections[0],\n        legend2_values_list[2].collections[0],\n    ),\n    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),\n    loc=\"upper center\",\n    prop=matplotlib.font_manager.FontProperties(size=11),\n)\nplt.ylabel(\"color_intensity\")\nplt.xlabel(\"flavanoids\")\n\nplt.show()"
+"from sklearn.covariance import EllipticEnvelope\nfrom sklearn.inspection import DecisionBoundaryDisplay\nfrom sklearn.svm import OneClassSVM\n\nestimators = {\n    \"Empirical Covariance\": EllipticEnvelope(support_fraction=1.0, contamination=0.25),\n    \"Robust Covariance (Minimum Covariance Determinant)\": EllipticEnvelope(\n        contamination=0.25\n    ),\n    \"OCSVM\": OneClassSVM(nu=0.25, gamma=0.35),\n}"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.lines as mlines\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_wine\n\nX = load_wine()[\"data\"][:, [1, 2]]  # two clusters\n\nfig, ax = plt.subplots()\ncolors = [\"tab:blue\", \"tab:orange\", \"tab:red\"]\n# Learn a frontier for outlier detection with several classifiers\nlegend_lines = []\nfor color, (name, estimator) in zip(colors, estimators.items()):\n    estimator.fit(X)\n    DecisionBoundaryDisplay.from_estimator(\n        estimator,\n        X,\n        response_method=\"decision_function\",\n        plot_method=\"contour\",\n        levels=[0],\n        colors=color,\n        ax=ax,\n    )\n    legend_lines.append(mlines.Line2D([], [], color=color, label=name))\n\n\nax.scatter(X[:, 0], X[:, 1], color=\"black\")\nbbox_args = dict(boxstyle=\"round\", fc=\"0.8\")\narrow_args = dict(arrowstyle=\"->\")\nax.annotate(\n    \"outlying points\",\n    xy=(4, 2),\n    xycoords=\"data\",\n    textcoords=\"data\",\n    xytext=(3, 1.25),\n    bbox=bbox_args,\n    arrowprops=arrow_args,\n)\nax.legend(handles=legend_lines, loc=\"upper center\")\n_ = ax.set(\n    xlabel=\"ash\",\n    ylabel=\"malic_acid\",\n    title=\"Outlier detection on a real data set (wine recognition)\",\n)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Second example\n\nThe second example shows the ability of the Minimum Covariance Determinant\nrobust estimator of covariance to concentrate on the main mode of the data\ndistribution: the ___location seems to be well estimated, although the\ncovariance is hard to estimate due to the banana-shaped distribution. Anyway,\nwe can get rid of some outlying observations. The One-Class SVM is able to\ncapture the real data structure, but the difficulty is to adjust its kernel\nbandwidth parameter so as to obtain a good compromise between the shape of\nthe data scatter matrix and the risk of over-fitting the data.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X = load_wine()[\"data\"][:, [6, 9]]  # \"banana\"-shaped\n\nfig, ax = plt.subplots()\ncolors = [\"tab:blue\", \"tab:orange\", \"tab:red\"]\n# Learn a frontier for outlier detection with several classifiers\nlegend_lines = []\nfor color, (name, estimator) in zip(colors, estimators.items()):\n    estimator.fit(X)\n    DecisionBoundaryDisplay.from_estimator(\n        estimator,\n        X,\n        response_method=\"decision_function\",\n        plot_method=\"contour\",\n        levels=[0],\n        colors=color,\n        ax=ax,\n    )\n    legend_lines.append(mlines.Line2D([], [], color=color, label=name))\n\n\nax.scatter(X[:, 0], X[:, 1], color=\"black\")\nax.legend(handles=legend_lines, loc=\"upper center\")\nax.set(\n    xlabel=\"flavanoids\",\n    ylabel=\"color_intensity\",\n    title=\"Outlier detection on a real data set (wine recognition)\",\n)\n\nplt.show()"
 ]
 }
 ],
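One detail worth noting in the rewritten notebook cells above: the legend is now built from proxy Line2D handles, whereas the old cells indexed into each contour set's .collections to obtain legend handles. A minimal sketch of that proxy-artist pattern, independent of the commit (the colors and labels here are placeholders):

# Minimal sketch (not from the commit): build a legend from proxy artists
# instead of the plotted contour artists themselves.
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
legend_lines = []
for color, name in zip(["tab:blue", "tab:orange"], ["model A", "model B"]):
    # the per-model frontier would be drawn here; only the legend handle is built below
    legend_lines.append(mlines.Line2D([], [], color=color, label=name))

ax.legend(handles=legend_lines, loc="upper center")
plt.show()

The empty Line2D objects never appear in the axes; they exist only to carry a color and label into ax.legend(), which keeps the legend code independent of how each frontier is drawn.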

dev/_downloads/scikit-learn-docs.zip

-19.4 KB (binary file not shown)
