Commit b6e3095

Pushing the docs to dev/ for branch: master, commit 3a724107bb9e31f223c18f79a521b1616bcb393a

1 parent 77746d4, commit b6e3095

1,206 files changed: +4,008 / -3,952 lines changed
Binary file not shown.

dev/_downloads/7b1e6710831c989a5f70f428313e74ba/plot_outlier_detection_housing.ipynb
Lines changed: 0 additions & 54 deletions. This file was deleted.

Binary file not shown.

dev/_downloads/474b3de05b07f7738d37a02442a9544e/plot_outlier_detection_housing.py renamed to dev/_downloads/d6978a6e4faab50efb15ced9457fb58d/plot_outlier_detection_wine.py

Lines changed: 53 additions & 46 deletions
@@ -24,26 +24,13 @@
 
 First example
 -------------
-The first example illustrates how robust covariance estimation can help
-concentrating on a relevant cluster when another one exists. Here, many
-observations are confounded into one and break down the empirical covariance
-estimation.
-Of course, some screening tools would have pointed out the presence of two
-clusters (Support Vector Machines, Gaussian Mixture Models, univariate
-outlier detection, ...). But had it been a high-dimensional example, none
-of these could be applied that easily.
-
-Second example
---------------
-The second example shows the ability of the Minimum Covariance Determinant
-robust estimator of covariance to concentrate on the main mode of the data
-distribution: the ___location seems to be well estimated, although the covariance
-is hard to estimate due to the banana-shaped distribution. Anyway, we can
-get rid of some outlying observations.
-The One-Class SVM is able to capture the real data structure, but the
-difficulty is to adjust its kernel bandwidth parameter so as to obtain
-a good compromise between the shape of the data scatter matrix and the
-risk of over-fitting the data.
+The first example illustrates how the Minimum Covariance Determinant
+robust estimator can help concentrate on a relevant cluster when outlying
+points exist. Here the empirical covariance estimation is skewed by points
+outside of the main cluster. Of course, some screening tools would have pointed
+out the presence of two clusters (Support Vector Machines, Gaussian Mixture
+Models, univariate outlier detection, ...). But had it been a high-dimensional
+example, none of these could be applied that easily.
 
 """
 print(__doc__)
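
Side note on the hunk above: the new docstring claims the Minimum Covariance Determinant (MCD) fit ignores the outlying points that skew the empirical covariance. A minimal sketch of that contrast on the same two wine features (illustrative only, not part of this commit; the random_state and the 20-point cutoff are arbitrary choices):

import numpy as np
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.datasets import load_wine

# Same two features as the renamed example: malic_acid and ash.
X = load_wine()['data'][:, [1, 2]]

emp = EmpiricalCovariance().fit(X)
mcd = MinCovDet(random_state=0).fit(X)

# Rank points by squared Mahalanobis distance under each fit. Outlying
# points inflate the empirical covariance, so the two estimators can
# disagree about which observations look most extreme.
top_emp = set(np.argsort(emp.mahalanobis(X))[-20:])
top_mcd = set(np.argsort(mcd.mahalanobis(X))[-20:])
print("overlap among the 20 most extreme points:", len(top_emp & top_mcd))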
@@ -56,68 +43,88 @@
 from sklearn.svm import OneClassSVM
 import matplotlib.pyplot as plt
 import matplotlib.font_manager
-from sklearn.datasets import load_boston
-
-# Get data
-X1 = load_boston()['data'][:, [8, 10]]  # two clusters
-X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped
+from sklearn.datasets import load_wine
 
 # Define "classifiers" to be used
 classifiers = {
     "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
-                                             contamination=0.261),
+                                             contamination=0.25),
     "Robust Covariance (Minimum Covariance Determinant)":
-    EllipticEnvelope(contamination=0.261),
-    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
+    EllipticEnvelope(contamination=0.25),
+    "OCSVM": OneClassSVM(nu=0.25, gamma=0.35)}
 colors = ['m', 'g', 'b']
 legend1 = {}
 legend2 = {}
 
+# Get data
+X1 = load_wine()['data'][:, [1, 2]]  # two clusters
+
 # Learn a frontier for outlier detection with several classifiers
-xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
-xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
+xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
 for i, (clf_name, clf) in enumerate(classifiers.items()):
     plt.figure(1)
     clf.fit(X1)
     Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
     Z1 = Z1.reshape(xx1.shape)
     legend1[clf_name] = plt.contour(
         xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i])
-    plt.figure(2)
-    clf.fit(X2)
-    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
-    Z2 = Z2.reshape(xx2.shape)
-    legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i])
 
 legend1_values_list = list(legend1.values())
 legend1_keys_list = list(legend1.keys())
 
 # Plot the results (= shape of the data points cloud)
 plt.figure(1)  # two clusters
-plt.title("Outlier detection on a real data set (boston housing)")
+plt.title("Outlier detection on a real data set (wine recognition)")
 plt.scatter(X1[:, 0], X1[:, 1], color='black')
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate("several confounded points", xy=(24, 19),
+plt.annotate("outlying points", xy=(4, 2),
              xycoords="data", textcoords="data",
-             xytext=(13, 10), bbox=bbox_args, arrowprops=arrow_args)
+             xytext=(3, 1.25), bbox=bbox_args, arrowprops=arrow_args)
 plt.xlim((xx1.min(), xx1.max()))
 plt.ylim((yy1.min(), yy1.max()))
 plt.legend((legend1_values_list[0].collections[0],
             legend1_values_list[1].collections[0],
             legend1_values_list[2].collections[0]),
            (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
            loc="upper center",
-           prop=matplotlib.font_manager.FontProperties(size=12))
-plt.ylabel("accessibility to radial highways")
-plt.xlabel("pupil-teacher ratio by town")
+           prop=matplotlib.font_manager.FontProperties(size=11))
+plt.ylabel("ash")
+plt.xlabel("malic_acid")
+
+plt.show()
+
+##############################################################################
+# Second example
+# --------------
+# The second example shows the ability of the Minimum Covariance Determinant
+# robust estimator of covariance to concentrate on the main mode of the data
+# distribution: the ___location seems to be well estimated, although the
+# covariance is hard to estimate due to the banana-shaped distribution. Anyway,
+# we can get rid of some outlying observations. The One-Class SVM is able to
+# capture the real data structure, but the difficulty is to adjust its kernel
+# bandwidth parameter so as to obtain a good compromise between the shape of
+# the data scatter matrix and the risk of over-fitting the data.
+
+# Get data
+X2 = load_wine()['data'][:, [6, 9]]  # "banana"-shaped
+
+# Learn a frontier for outlier detection with several classifiers
+xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
+for i, (clf_name, clf) in enumerate(classifiers.items()):
+    plt.figure(2)
+    clf.fit(X2)
+    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
+    Z2 = Z2.reshape(xx2.shape)
+    legend2[clf_name] = plt.contour(
+        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i])
 
 legend2_values_list = list(legend2.values())
 legend2_keys_list = list(legend2.keys())
 
+# Plot the results (= shape of the data points cloud)
 plt.figure(2)  # "banana" shape
-plt.title("Outlier detection on a real data set (boston housing)")
+plt.title("Outlier detection on a real data set (wine recognition)")
 plt.scatter(X2[:, 0], X2[:, 1], color='black')
 plt.xlim((xx2.min(), xx2.max()))
 plt.ylim((yy2.min(), yy2.max()))
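
Note on the comment block added above: the difficulty of choosing the One-Class SVM kernel bandwidth is presumably why this commit also moves gamma from 0.05 to 0.35. A rough way to probe that sensitivity (an illustrative sketch, not part of the commit; the gamma grid is arbitrary):

from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X2 = load_wine()['data'][:, [6, 9]]  # flavanoids, color_intensity

# Small gamma gives a loose, nearly elliptical frontier; large gamma
# hugs the training points and risks over-fitting, as the comments warn.
for gamma in (0.05, 0.35, 2.0):
    clf = OneClassSVM(nu=0.25, gamma=gamma).fit(X2)
    frac = (clf.predict(X2) == -1).mean()
    print("gamma=%.2f: %.0f%% of points outside the frontier"
          % (gamma, 100 * frac))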
@@ -126,8 +133,8 @@
             legend2_values_list[2].collections[0]),
            (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
            loc="upper center",
-           prop=matplotlib.font_manager.FontProperties(size=12))
-plt.ylabel("% lower status of the population")
-plt.xlabel("average number of rooms per dwelling")
+           prop=matplotlib.font_manager.FontProperties(size=11))
+plt.ylabel("color_intensity")
+plt.xlabel("flavanoids")
 
 plt.show()
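
One more note on the parameter change running through this diff: contamination moves from 0.261 to 0.25, and nu matches it. For EllipticEnvelope, contamination is the assumed share of outliers in the data, which sets where the decision threshold falls. A quick check (illustrative only, not part of the commit):

from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import load_wine

X1 = load_wine()['data'][:, [1, 2]]

# With contamination=0.25 the fitted envelope labels roughly a quarter
# of the training samples as outliers (predict == -1).
clf = EllipticEnvelope(contamination=0.25).fit(X1)
print("flagged fraction:", (clf.predict(X1) == -1).mean())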
