 24 |  24 |
 25 |  25 | First example
 26 |  26 | -------------
 27 |     | -The first example illustrates how robust covariance estimation can help
 28 |     | -concentrating on a relevant cluster when another one exists. Here, many
 29 |     | -observations are confounded into one and break down the empirical covariance
 30 |     | -estimation.
 31 |     | -Of course, some screening tools would have pointed out the presence of two
 32 |     | -clusters (Support Vector Machines, Gaussian Mixture Models, univariate
 33 |     | -outlier detection, ...). But had it been a high-dimensional example, none
 34 |     | -of these could be applied that easily.
 35 |     | -
 36 |     | -Second example
 37 |     | ---------------
 38 |     | -The second example shows the ability of the Minimum Covariance Determinant
 39 |     | -robust estimator of covariance to concentrate on the main mode of the data
 40 |     | -distribution: the ___location seems to be well estimated, although the covariance
 41 |     | -is hard to estimate due to the banana-shaped distribution. Anyway, we can
 42 |     | -get rid of some outlying observations.
 43 |     | -The One-Class SVM is able to capture the real data structure, but the
 44 |     | -difficulty is to adjust its kernel bandwidth parameter so as to obtain
 45 |     | -a good compromise between the shape of the data scatter matrix and the
 46 |     | -risk of over-fitting the data.
    |  27 | +The first example illustrates how the Minimum Covariance Determinant
    |  28 | +robust estimator can help concentrate on a relevant cluster when outlying
    |  29 | +points exist. Here the empirical covariance estimation is skewed by points
    |  30 | +outside of the main cluster. Of course, some screening tools would have pointed
    |  31 | +out the presence of two clusters (Support Vector Machines, Gaussian Mixture
    |  32 | +Models, univariate outlier detection, ...). But had it been a high-dimensional
    |  33 | +example, none of these could be applied that easily.
 47 |  34 |
 48 |  35 | """
 49 |  36 | print(__doc__)
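The claim in the rewritten paragraph is easy to check numerically. Below is a minimal sketch (not part of this commit; the synthetic blobs and variable names are my own) contrasting the empirical mean with the MCD-based ___location when an outlying cluster is present:

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(42)
# main cluster around (0, 0), plus a smaller outlying blob around (6, 6)
X = np.vstack([rng.randn(200, 2), rng.randn(50, 2) + 6])

# the empirical mean is dragged toward the outlying blob ...
print("empirical mean:", X.mean(axis=0))

# ... while the MCD-based ___location stays on the main cluster
mcd = EllipticEnvelope(contamination=0.25).fit(X)
print("robust ___location:", mcd.location_)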
 56 |  43 | from sklearn.svm import OneClassSVM
 57 |  44 | import matplotlib.pyplot as plt
 58 |  45 | import matplotlib.font_manager
 59 |     | -from sklearn.datasets import load_boston
 60 |     | -
 61 |     | -# Get data
 62 |     | -X1 = load_boston()['data'][:, [8, 10]]  # two clusters
 63 |     | -X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped
    |  46 | +from sklearn.datasets import load_wine
 64 |  47 |
 65 |  48 | # Define "classifiers" to be used
 66 |  49 | classifiers = {
 67 |  50 |     "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
 68 |     | -                                             contamination=0.261),
    |  51 | +                                             contamination=0.25),
 69 |  52 |     "Robust Covariance (Minimum Covariance Determinant)":
 70 |     | -    EllipticEnvelope(contamination=0.261),
 71 |     | -    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
    |  53 | +    EllipticEnvelope(contamination=0.25),
    |  54 | +    "OCSVM": OneClassSVM(nu=0.25, gamma=0.35)}
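Both contamination (for EllipticEnvelope) and nu (for OneClassSVM) set, roughly, the fraction of training points that will fall outside the fitted frontier, which is why the commit changes all three in lockstep. A quick sanity check, as a sketch (on the training data the flagged fraction tracks the contamination setting by construction):

import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import load_wine

X = load_wine()['data'][:, [1, 2]]
clf = EllipticEnvelope(contamination=0.25).fit(X)
# predict() returns +1 for inliers and -1 for outliers
print(np.mean(clf.predict(X) == -1))  # ~0.25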
 72 |  55 | colors = ['m', 'g', 'b']
 73 |  56 | legend1 = {}
 74 |  57 | legend2 = {}
 75 |  58 |
    |  59 | +# Get data
    |  60 | +X1 = load_wine()['data'][:, [1, 2]]  # two clusters
    |  61 | +
 76 |  62 | # Learn a frontier for outlier detection with several classifiers
 77 |     | -xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
 78 |     | -xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
    |  63 | +xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
 79 |  64 | for i, (clf_name, clf) in enumerate(classifiers.items()):
 80 |  65 |     plt.figure(1)
 81 |  66 |     clf.fit(X1)
 82 |  67 |     Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
 83 |  68 |     Z1 = Z1.reshape(xx1.shape)
 84 |  69 |     legend1[clf_name] = plt.contour(
 85 |  70 |         xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i])
 86 |     | -    plt.figure(2)
 87 |     | -    clf.fit(X2)
 88 |     | -    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
 89 |     | -    Z2 = Z2.reshape(xx2.shape)
 90 |     | -    legend2[clf_name] = plt.contour(
 91 |     | -        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i])
 92 |  71 |
 93 |  72 | legend1_values_list = list(legend1.values())
 94 |  73 | legend1_keys_list = list(legend1.keys())
 95 |  74 |
 96 |  75 | # Plot the results (= shape of the data points cloud)
 97 |  76 | plt.figure(1)  # two clusters
 98 |     | -plt.title("Outlier detection on a real data set (boston housing)")
    |  77 | +plt.title("Outlier detection on a real data set (wine recognition)")
 99 |  78 | plt.scatter(X1[:, 0], X1[:, 1], color='black')
100 |  79 | bbox_args = dict(boxstyle="round", fc="0.8")
101 |  80 | arrow_args = dict(arrowstyle="->")
102 |     | -plt.annotate("several confounded points", xy=(24, 19),
    |  81 | +plt.annotate("outlying points", xy=(4, 2),
103 |  82 |              xycoords="data", textcoords="data",
104 |     | -             xytext=(13, 10), bbox=bbox_args, arrowprops=arrow_args)
    |  83 | +             xytext=(3, 1.25), bbox=bbox_args, arrowprops=arrow_args)
105 |  84 | plt.xlim((xx1.min(), xx1.max()))
106 |  85 | plt.ylim((yy1.min(), yy1.max()))
107 |  86 | plt.legend((legend1_values_list[0].collections[0],
108 |  87 |             legend1_values_list[1].collections[0],
109 |  88 |             legend1_values_list[2].collections[0]),
110 |  89 |            (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
111 |  90 |            loc="upper center",
112 |     | -           prop=matplotlib.font_manager.FontProperties(size=12))
113 |     | -plt.ylabel("accessibility to radial highways")
114 |     | -plt.xlabel("pupil-teacher ratio by town")
    |  91 | +           prop=matplotlib.font_manager.FontProperties(size=11))
    |  92 | +plt.ylabel("ash")
    |  93 | +plt.xlabel("malic_acid")
    |  94 | +
    |  95 | +plt.show()
    |  96 | +
    |  97 | +##############################################################################
    |  98 | +# Second example
    |  99 | +# --------------
    | 100 | +# The second example shows the ability of the Minimum Covariance Determinant
    | 101 | +# robust estimator of covariance to concentrate on the main mode of the data
    | 102 | +# distribution: the ___location seems to be well estimated, although the
    | 103 | +# covariance is hard to estimate due to the banana-shaped distribution. Anyway,
    | 104 | +# we can get rid of some outlying observations. The One-Class SVM is able to
    | 105 | +# capture the real data structure, but the difficulty is to adjust its kernel
    | 106 | +# bandwidth parameter so as to obtain a good compromise between the shape of
    | 107 | +# the data scatter matrix and the risk of over-fitting the data.
    | 108 | +
    | 109 | +# Get data
    | 110 | +X2 = load_wine()['data'][:, [6, 9]]  # "banana"-shaped
    | 111 | +
    | 112 | +# Learn a frontier for outlier detection with several classifiers
    | 113 | +xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
    | 114 | +for i, (clf_name, clf) in enumerate(classifiers.items()):
    | 115 | +    plt.figure(2)
    | 116 | +    clf.fit(X2)
    | 117 | +    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
    | 118 | +    Z2 = Z2.reshape(xx2.shape)
    | 119 | +    legend2[clf_name] = plt.contour(
    | 120 | +        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i])
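The comment block above singles out the RBF bandwidth (gamma) as the delicate part of the One-Class SVM fit. A minimal sketch of how one might visualize that trade-off; the gamma grid below is an arbitrary choice of mine, not part of this commit:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X2 = load_wine()['data'][:, [6, 9]]
xx, yy = np.meshgrid(np.linspace(-1, 5.5, 200), np.linspace(-2.5, 19, 200))

# small gamma -> smooth, near-elliptical frontier; large gamma -> a wiggly
# frontier that hugs (over-fits) the training points
for gamma, color in [(0.05, 'b'), (0.35, 'g'), (5.0, 'r')]:
    clf = OneClassSVM(nu=0.25, gamma=gamma).fit(X2)
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors=color)
plt.scatter(X2[:, 0], X2[:, 1], color='black')
plt.show()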
115 | 121 |
116 | 122 | legend2_values_list = list(legend2.values())
117 | 123 | legend2_keys_list = list(legend2.keys())
118 | 124 |
    | 125 | +# Plot the results (= shape of the data points cloud)
119 | 126 | plt.figure(2)  # "banana" shape
120 |     | -plt.title("Outlier detection on a real data set (boston housing)")
    | 127 | +plt.title("Outlier detection on a real data set (wine recognition)")
121 | 128 | plt.scatter(X2[:, 0], X2[:, 1], color='black')
122 | 129 | plt.xlim((xx2.min(), xx2.max()))
123 | 130 | plt.ylim((yy2.min(), yy2.max()))
124 | 131 | plt.legend((legend2_values_list[0].collections[0],
125 | 132 |             legend2_values_list[1].collections[0],
126 | 133 |             legend2_values_list[2].collections[0]),
127 | 134 |            (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
128 | 135 |            loc="upper center",
129 |     | -           prop=matplotlib.font_manager.FontProperties(size=12))
130 |     | -plt.ylabel("% lower status of the population")
131 |     | -plt.xlabel("average number of rooms per dwelling")
    | 136 | +           prop=matplotlib.font_manager.FontProperties(size=11))
    | 137 | +plt.ylabel("color_intensity")
    | 138 | +plt.xlabel("flavanoids")
132 | 139 |
133 | 140 | plt.show()
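As a closing cross-check, the wine columns picked in this commit line up with the new axis labels; a one-off sketch to confirm the indices:

from sklearn.datasets import load_wine

names = load_wine()['feature_names']
print(names[1], names[2])  # malic_acid, ash -> axes of the first figure
print(names[6], names[9])  # flavanoids, color_intensity -> second figure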