
Commit 0412ce5

Pushing the docs to dev/ for branch: main, commit a3a16046e2d5100657da6049d03fe93f96264cdc
1 parent 639b25e commit 0412ce5

File tree: 1,309 files changed (+5997 / -6002 lines)


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 6bd6b226cfe6702d2bb6a77ccc4183ee
+config: d17036dce23f3b64de69fbfad3c2a4bc
 tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/609eccf9ab7d476daf68967ce1fce0b7/plot_outlier_detection_wine.py

Lines changed: 64 additions & 84 deletions
@@ -21,66 +21,64 @@
 estimation of the data structure, but yet accurate to some extent.
 The One-Class SVM does not assume any parametric form of the data distribution
 and can therefore model the complex shape of the data much better.
-
-First example
--------------
-The first example illustrates how the Minimum Covariance Determinant
-robust estimator can help concentrate on a relevant cluster when outlying
-points exist. Here the empirical covariance estimation is skewed by points
-outside of the main cluster. Of course, some screening tools would have pointed
-out the presence of two clusters (Support Vector Machines, Gaussian Mixture
-Models, univariate outlier detection, ...). But had it been a high-dimensional
-example, none of these could be applied that easily.
-
 """
 
 # Author: Virgile Fritsch <[email protected]>
 # License: BSD 3 clause
 
-import matplotlib.font_manager
-import matplotlib.pyplot as plt
-import numpy as np
-
+# %%
+# First example
+# -------------
+#
+# The first example illustrates how the Minimum Covariance Determinant
+# robust estimator can help concentrate on a relevant cluster when outlying
+# points exist. Here the empirical covariance estimation is skewed by points
+# outside of the main cluster. Of course, some screening tools would have pointed
+# out the presence of two clusters (Support Vector Machines, Gaussian Mixture
+# Models, univariate outlier detection, ...). But had it been a high-dimensional
+# example, none of these could be applied that easily.
 from sklearn.covariance import EllipticEnvelope
-from sklearn.datasets import load_wine
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.svm import OneClassSVM
 
-# Define "classifiers" to be used
-classifiers = {
+estimators = {
     "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
     "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(
         contamination=0.25
     ),
     "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
 }
-colors = ["m", "g", "b"]
-legend1 = {}
-legend2 = {}
 
-# Get data
-X1 = load_wine()["data"][:, [1, 2]]  # two clusters
+# %%
+import matplotlib.lines as mlines
+import matplotlib.pyplot as plt
 
+from sklearn.datasets import load_wine
+
+X = load_wine()["data"][:, [1, 2]]  # two clusters
+
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(1)
-    clf.fit(X1)
-    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
-    Z1 = Z1.reshape(xx1.shape)
-    legend1[clf_name] = plt.contour(
-        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend1_values_list = list(legend1.values())
-legend1_keys_list = list(legend1.keys())
 
-# Plot the results (= shape of the data points cloud)
-plt.figure(1)  # two clusters
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X1[:, 0], X1[:, 1], color="black")
+ax.scatter(X[:, 0], X[:, 1], color="black")
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate(
+ax.annotate(
     "outlying points",
     xy=(4, 2),
     xycoords="data",
@@ -89,26 +87,17 @@
     bbox=bbox_args,
     arrowprops=arrow_args,
 )
-plt.xlim((xx1.min(), xx1.max()))
-plt.ylim((yy1.min(), yy1.max()))
-plt.legend(
-    (
-        legend1_values_list[0].collections[0],
-        legend1_values_list[1].collections[0],
-        legend1_values_list[2].collections[0],
-    ),
-    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+ax.legend(handles=legend_lines, loc="upper center")
+_ = ax.set(
+    xlabel="ash",
+    ylabel="malic_acid",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("ash")
-plt.xlabel("malic_acid")
-
-plt.show()
 
 # %%
 # Second example
 # --------------
+#
 # The second example shows the ability of the Minimum Covariance Determinant
 # robust estimator of covariance to concentrate on the main mode of the data
 # distribution: the ___location seems to be well estimated, although the
@@ -117,41 +106,32 @@
 # capture the real data structure, but the difficulty is to adjust its kernel
 # bandwidth parameter so as to obtain a good compromise between the shape of
 # the data scatter matrix and the risk of over-fitting the data.
+X = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
 
-# Get data
-X2 = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
-
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(2)
-    clf.fit(X2)
-    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
-    Z2 = Z2.reshape(xx2.shape)
-    legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
    )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend2_values_list = list(legend2.values())
-legend2_keys_list = list(legend2.keys())
-
-# Plot the results (= shape of the data points cloud)
-plt.figure(2)  # "banana" shape
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X2[:, 0], X2[:, 1], color="black")
-plt.xlim((xx2.min(), xx2.max()))
-plt.ylim((yy2.min(), yy2.max()))
-plt.legend(
-    (
-        legend2_values_list[0].collections[0],
-        legend2_values_list[1].collections[0],
-        legend2_values_list[2].collections[0],
-    ),
-    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+
+ax.scatter(X[:, 0], X[:, 1], color="black")
+ax.legend(handles=legend_lines, loc="upper center")
+ax.set(
+    xlabel="flavanoids",
+    ylabel="color_intensity",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("color_intensity")
-plt.xlabel("flavanoids")
 
 plt.show()
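As a side note for readers of this diff: the refactor above replaces the hand-built np.meshgrid / plt.contour frontier with sklearn.inspection.DecisionBoundaryDisplay. Below is a minimal, self-contained sketch of that pattern. It is not part of the commit, it uses synthetic data in place of the Wine features, and it assumes a scikit-learn version recent enough to ship DecisionBoundaryDisplay (introduced around 1.1).

# Minimal sketch (not from the commit): plot an outlier-detection frontier
# with DecisionBoundaryDisplay instead of a hand-built meshgrid + contour.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.covariance import EllipticEnvelope
from sklearn.inspection import DecisionBoundaryDisplay

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 2))  # synthetic 2D data standing in for the wine features

estimator = EllipticEnvelope(contamination=0.25).fit(X)

fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    estimator,
    X,
    response_method="decision_function",
    plot_method="contour",
    levels=[0],  # draw the frontier where decision_function crosses zero
    colors="tab:blue",
    ax=ax,
)
ax.scatter(X[:, 0], X[:, 1], color="black", s=10)
plt.show()

The display computes the grid and evaluates decision_function internally, so the example no longer needs numpy or per-axis plot limits.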

dev/_downloads/dd28338257df6d2a7e6b9ff5f2743272/plot_outlier_detection_wine.ipynb

Lines changed: 33 additions & 4 deletions
@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Outlier detection on a real data set\n\nThis example illustrates the need for robust covariance estimation\non a real data set. It is useful both for outlier detection and for\na better understanding of the data structure.\n\nWe selected two sets of two variables from the Wine data set\nas an illustration of what kind of analysis can be done with several\noutlier detection tools. For the purpose of visualization, we are working\nwith two-dimensional examples, but one should be aware that things are\nnot so trivial in high-dimension, as it will be pointed out.\n\nIn both examples below, the main result is that the empirical covariance\nestimate, as a non-robust one, is highly influenced by the heterogeneous\nstructure of the observations. Although the robust covariance estimate is\nable to focus on the main mode of the data distribution, it sticks to the\nassumption that the data should be Gaussian distributed, yielding some biased\nestimation of the data structure, but yet accurate to some extent.\nThe One-Class SVM does not assume any parametric form of the data distribution\nand can therefore model the complex shape of the data much better.\n\n## First example\nThe first example illustrates how the Minimum Covariance Determinant\nrobust estimator can help concentrate on a relevant cluster when outlying\npoints exist. Here the empirical covariance estimation is skewed by points\noutside of the main cluster. Of course, some screening tools would have pointed\nout the presence of two clusters (Support Vector Machines, Gaussian Mixture\nModels, univariate outlier detection, ...). But had it been a high-dimensional\nexample, none of these could be applied that easily.\n"
+"\n# Outlier detection on a real data set\n\nThis example illustrates the need for robust covariance estimation\non a real data set. It is useful both for outlier detection and for\na better understanding of the data structure.\n\nWe selected two sets of two variables from the Wine data set\nas an illustration of what kind of analysis can be done with several\noutlier detection tools. For the purpose of visualization, we are working\nwith two-dimensional examples, but one should be aware that things are\nnot so trivial in high-dimension, as it will be pointed out.\n\nIn both examples below, the main result is that the empirical covariance\nestimate, as a non-robust one, is highly influenced by the heterogeneous\nstructure of the observations. Although the robust covariance estimate is\nable to focus on the main mode of the data distribution, it sticks to the\nassumption that the data should be Gaussian distributed, yielding some biased\nestimation of the data structure, but yet accurate to some extent.\nThe One-Class SVM does not assume any parametric form of the data distribution\nand can therefore model the complex shape of the data much better.\n"
 ]
 },
 {
@@ -15,14 +15,14 @@
 },
 "outputs": [],
 "source": [
-"# Author: Virgile Fritsch <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.font_manager\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.datasets import load_wine\nfrom sklearn.svm import OneClassSVM\n\n# Define \"classifiers\" to be used\nclassifiers = {\n    \"Empirical Covariance\": EllipticEnvelope(support_fraction=1.0, contamination=0.25),\n    \"Robust Covariance (Minimum Covariance Determinant)\": EllipticEnvelope(\n        contamination=0.25\n    ),\n    \"OCSVM\": OneClassSVM(nu=0.25, gamma=0.35),\n}\ncolors = [\"m\", \"g\", \"b\"]\nlegend1 = {}\nlegend2 = {}\n\n# Get data\nX1 = load_wine()[\"data\"][:, [1, 2]]  # two clusters\n\n# Learn a frontier for outlier detection with several classifiers\nxx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))\nfor i, (clf_name, clf) in enumerate(classifiers.items()):\n    plt.figure(1)\n    clf.fit(X1)\n    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])\n    Z1 = Z1.reshape(xx1.shape)\n    legend1[clf_name] = plt.contour(\n        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]\n    )\n\nlegend1_values_list = list(legend1.values())\nlegend1_keys_list = list(legend1.keys())\n\n# Plot the results (= shape of the data points cloud)\nplt.figure(1)  # two clusters\nplt.title(\"Outlier detection on a real data set (wine recognition)\")\nplt.scatter(X1[:, 0], X1[:, 1], color=\"black\")\nbbox_args = dict(boxstyle=\"round\", fc=\"0.8\")\narrow_args = dict(arrowstyle=\"->\")\nplt.annotate(\n    \"outlying points\",\n    xy=(4, 2),\n    xycoords=\"data\",\n    textcoords=\"data\",\n    xytext=(3, 1.25),\n    bbox=bbox_args,\n    arrowprops=arrow_args,\n)\nplt.xlim((xx1.min(), xx1.max()))\nplt.ylim((yy1.min(), yy1.max()))\nplt.legend(\n    (\n        legend1_values_list[0].collections[0],\n        legend1_values_list[1].collections[0],\n        legend1_values_list[2].collections[0],\n    ),\n    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),\n    loc=\"upper center\",\n    prop=matplotlib.font_manager.FontProperties(size=11),\n)\nplt.ylabel(\"ash\")\nplt.xlabel(\"malic_acid\")\n\nplt.show()"
+"# Author: Virgile Fritsch <[email protected]>\n# License: BSD 3 clause"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Second example\nThe second example shows the ability of the Minimum Covariance Determinant\nrobust estimator of covariance to concentrate on the main mode of the data\ndistribution: the ___location seems to be well estimated, although the\ncovariance is hard to estimate due to the banana-shaped distribution. Anyway,\nwe can get rid of some outlying observations. The One-Class SVM is able to\ncapture the real data structure, but the difficulty is to adjust its kernel\nbandwidth parameter so as to obtain a good compromise between the shape of\nthe data scatter matrix and the risk of over-fitting the data.\n\n"
+"## First example\n\nThe first example illustrates how the Minimum Covariance Determinant\nrobust estimator can help concentrate on a relevant cluster when outlying\npoints exist. Here the empirical covariance estimation is skewed by points\noutside of the main cluster. Of course, some screening tools would have pointed\nout the presence of two clusters (Support Vector Machines, Gaussian Mixture\nModels, univariate outlier detection, ...). But had it been a high-dimensional\nexample, none of these could be applied that easily.\n\n"
 ]
 },
 {
@@ -33,7 +33,36 @@
 },
 "outputs": [],
 "source": [
-"# Get data\nX2 = load_wine()[\"data\"][:, [6, 9]]  # \"banana\"-shaped\n\n# Learn a frontier for outlier detection with several classifiers\nxx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))\nfor i, (clf_name, clf) in enumerate(classifiers.items()):\n    plt.figure(2)\n    clf.fit(X2)\n    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])\n    Z2 = Z2.reshape(xx2.shape)\n    legend2[clf_name] = plt.contour(\n        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]\n    )\n\nlegend2_values_list = list(legend2.values())\nlegend2_keys_list = list(legend2.keys())\n\n# Plot the results (= shape of the data points cloud)\nplt.figure(2)  # \"banana\" shape\nplt.title(\"Outlier detection on a real data set (wine recognition)\")\nplt.scatter(X2[:, 0], X2[:, 1], color=\"black\")\nplt.xlim((xx2.min(), xx2.max()))\nplt.ylim((yy2.min(), yy2.max()))\nplt.legend(\n    (\n        legend2_values_list[0].collections[0],\n        legend2_values_list[1].collections[0],\n        legend2_values_list[2].collections[0],\n    ),\n    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),\n    loc=\"upper center\",\n    prop=matplotlib.font_manager.FontProperties(size=11),\n)\nplt.ylabel(\"color_intensity\")\nplt.xlabel(\"flavanoids\")\n\nplt.show()"
+"from sklearn.covariance import EllipticEnvelope\nfrom sklearn.inspection import DecisionBoundaryDisplay\nfrom sklearn.svm import OneClassSVM\n\nestimators = {\n    \"Empirical Covariance\": EllipticEnvelope(support_fraction=1.0, contamination=0.25),\n    \"Robust Covariance (Minimum Covariance Determinant)\": EllipticEnvelope(\n        contamination=0.25\n    ),\n    \"OCSVM\": OneClassSVM(nu=0.25, gamma=0.35),\n}"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.lines as mlines\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_wine\n\nX = load_wine()[\"data\"][:, [1, 2]]  # two clusters\n\nfig, ax = plt.subplots()\ncolors = [\"tab:blue\", \"tab:orange\", \"tab:red\"]\n# Learn a frontier for outlier detection with several classifiers\nlegend_lines = []\nfor color, (name, estimator) in zip(colors, estimators.items()):\n    estimator.fit(X)\n    DecisionBoundaryDisplay.from_estimator(\n        estimator,\n        X,\n        response_method=\"decision_function\",\n        plot_method=\"contour\",\n        levels=[0],\n        colors=color,\n        ax=ax,\n    )\n    legend_lines.append(mlines.Line2D([], [], color=color, label=name))\n\n\nax.scatter(X[:, 0], X[:, 1], color=\"black\")\nbbox_args = dict(boxstyle=\"round\", fc=\"0.8\")\narrow_args = dict(arrowstyle=\"->\")\nax.annotate(\n    \"outlying points\",\n    xy=(4, 2),\n    xycoords=\"data\",\n    textcoords=\"data\",\n    xytext=(3, 1.25),\n    bbox=bbox_args,\n    arrowprops=arrow_args,\n)\nax.legend(handles=legend_lines, loc=\"upper center\")\n_ = ax.set(\n    xlabel=\"ash\",\n    ylabel=\"malic_acid\",\n    title=\"Outlier detection on a real data set (wine recognition)\",\n)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Second example\n\nThe second example shows the ability of the Minimum Covariance Determinant\nrobust estimator of covariance to concentrate on the main mode of the data\ndistribution: the ___location seems to be well estimated, although the\ncovariance is hard to estimate due to the banana-shaped distribution. Anyway,\nwe can get rid of some outlying observations. The One-Class SVM is able to\ncapture the real data structure, but the difficulty is to adjust its kernel\nbandwidth parameter so as to obtain a good compromise between the shape of\nthe data scatter matrix and the risk of over-fitting the data.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X = load_wine()[\"data\"][:, [6, 9]]  # \"banana\"-shaped\n\nfig, ax = plt.subplots()\ncolors = [\"tab:blue\", \"tab:orange\", \"tab:red\"]\n# Learn a frontier for outlier detection with several classifiers\nlegend_lines = []\nfor color, (name, estimator) in zip(colors, estimators.items()):\n    estimator.fit(X)\n    DecisionBoundaryDisplay.from_estimator(\n        estimator,\n        X,\n        response_method=\"decision_function\",\n        plot_method=\"contour\",\n        levels=[0],\n        colors=color,\n        ax=ax,\n    )\n    legend_lines.append(mlines.Line2D([], [], color=color, label=name))\n\n\nax.scatter(X[:, 0], X[:, 1], color=\"black\")\nax.legend(handles=legend_lines, loc=\"upper center\")\nax.set(\n    xlabel=\"flavanoids\",\n    ylabel=\"color_intensity\",\n    title=\"Outlier detection on a real data set (wine recognition)\",\n)\n\nplt.show()"
 ]
 }
 ],
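One detail worth noting in the rewritten notebook cells above: the legend is now built from proxy Line2D handles, whereas the old cells indexed into each contour set's .collections to obtain legend handles. A minimal sketch of that proxy-artist pattern, independent of the commit (the colors and labels here are placeholders):

# Minimal sketch (not from the commit): build a legend from proxy artists
# instead of the plotted contour artists themselves.
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
legend_lines = []
for color, name in zip(["tab:blue", "tab:orange"], ["model A", "model B"]):
    # the per-model frontier would be drawn here; only the legend handle is built below
    legend_lines.append(mlines.Line2D([], [], color=color, label=name))

ax.legend(handles=legend_lines, loc="upper center")
plt.show()

The empty Line2D objects never appear in the axes; they exist only to carry a color and label into ax.legend(), which keeps the legend code independent of how each frontier is drawn.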

dev/_downloads/scikit-learn-docs.zip

-19.4 KB (binary file not shown)
