A demo of K-Means clustering on the handwritten digits data
===========================================================

- In this example we compare the various initialization strategies for
- K-means in terms of runtime and quality of the results.
+ In this example we compare the various initialization strategies for K-means in
+ terms of runtime and quality of the results.

- As the ground truth is known here, we also apply different cluster
- quality metrics to judge the goodness of fit of the cluster labels to the
- ground truth.
+ As the ground truth is known here, we also apply different cluster quality
+ metrics to judge the goodness of fit of the cluster labels to the ground truth.

Cluster quality metrics evaluated (see :ref:`clustering_evaluation` for
definitions and discussions of the metrics):

===========  ========================================================
Shorthand    full name
===========  ========================================================
homo         homogeneity score
compl        completeness score
v-meas       V measure
ARI          adjusted Rand index
AMI          adjusted mutual information
silhouette   silhouette coefficient
===========  ========================================================
-
"""
print(__doc__)

- from time import time
+ # %%
+ # Load the dataset
+ # ----------------
+ #
+ # We will start by loading the `digits` dataset. This dataset contains
+ # handwritten digits from 0 to 9. In the context of clustering, one would like
+ # to group images such that the handwritten digits on the images are the same.
+
import numpy as np
- import matplotlib.pyplot as plt
+ from sklearn.datasets import load_digits

+ data, labels = load_digits(return_X_y=True)
+ (n_samples, n_features), n_digits = data.shape, np.unique(labels).size
+
+ print(
+     f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
+ )
+
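+ # With the copy of the digits dataset bundled in scikit-learn, this should
+ # report 10 digit classes, 1797 samples and 64 features (8x8 pixel images).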
+ # %%
+ # Define our evaluation benchmark
+ # -------------------------------
+ #
+ # We will first define our evaluation benchmark. During this benchmark, we
+ # intend to compare different initialization methods for KMeans. Our benchmark
+ # will:
+ #
+ # * create a pipeline which will scale the data using a
+ #   :class:`~sklearn.preprocessing.StandardScaler`;
+ # * train and time the pipeline fitting;
+ # * measure the performance of the clustering obtained via different metrics.
+ from time import time
from sklearn import metrics
+ from sklearn.pipeline import make_pipeline
+ from sklearn.preprocessing import StandardScaler
+
+
+ def bench_k_means(kmeans, name, data, labels):
+     """Benchmark to evaluate the KMeans initialization methods.
+
+     Parameters
+     ----------
+     kmeans : KMeans instance
+         A :class:`~sklearn.cluster.KMeans` instance with the initialization
+         already set.
+     name : str
+         Name given to the strategy. It will be used to show the results in a
+         table.
+     data : ndarray of shape (n_samples, n_features)
+         The data to cluster.
+     labels : ndarray of shape (n_samples,)
+         The labels used to compute the clustering metrics which require some
+         supervision.
+     """
+     t0 = time()
+     estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
+     fit_time = time() - t0
+     results = [name, fit_time, estimator[-1].inertia_]
+
+     # Define the metrics which require only the true labels and estimator
+     # labels
+     clustering_metrics = [
+         metrics.homogeneity_score,
+         metrics.completeness_score,
+         metrics.v_measure_score,
+         metrics.adjusted_rand_score,
+         metrics.adjusted_mutual_info_score,
+     ]
+     results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
+
+     # The silhouette score requires the full dataset
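+     # (here we pass sample_size=300, so the score is estimated on a random
+     # subsample of the data, which keeps the benchmark fast)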
+     results += [
+         metrics.silhouette_score(data, estimator[-1].labels_,
+                                  metric="euclidean", sample_size=300)
+     ]
+
+     # Show the results
+     formatter_result = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}"
+                         "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}")
+     print(formatter_result.format(*results))
+
+
+ # %%
+ # Run the benchmark
+ # -----------------
+ #
+ # We will compare three approaches:
+ #
+ # * an initialization using `k-means++`. This method is stochastic and we will
+ #   run the initialization 4 times;
+ # * a random initialization. This method is stochastic as well and we will run
+ #   the initialization 4 times;
+ # * an initialization based on a :class:`~sklearn.decomposition.PCA`
+ #   projection. Indeed, we will use the components of the
+ #   :class:`~sklearn.decomposition.PCA` to initialize KMeans. This method is
+ #   deterministic and a single initialization suffices.
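+ #
+ # Note that :attr:`~sklearn.decomposition.PCA.components_` has shape
+ # ``(n_digits, n_features)``, i.e. one vector per requested component, so it
+ # can be passed directly as the array of initial cluster centers that KMeans
+ # expects.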
from sklearn.cluster import KMeans
- from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
- from sklearn.preprocessing import scale
-
- np.random.seed(42)
-
- X_digits, y_digits = load_digits(return_X_y=True)
- data = scale(X_digits)
-
- n_samples, n_features = data.shape
- n_digits = len(np.unique(y_digits))
- labels = y_digits
-
- sample_size = 300
-
- print("n_digits: %d, \t n_samples %d, \t n_features %d"
-       % (n_digits, n_samples, n_features))
-

print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

+ kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4,
+                 random_state=0)
+ bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)
+
+ kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
+ bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)
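+
+ # Fixing `random_state` above makes the two stochastic initializations
+ # reproducible from run to run.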

- def bench_k_means(estimator, name, data):
-     t0 = time()
-     estimator.fit(data)
-     print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
-           % (name, (time() - t0), estimator.inertia_,
-              metrics.homogeneity_score(labels, estimator.labels_),
-              metrics.completeness_score(labels, estimator.labels_),
-              metrics.v_measure_score(labels, estimator.labels_),
-              metrics.adjusted_rand_score(labels, estimator.labels_),
-              metrics.adjusted_mutual_info_score(labels, estimator.labels_),
-              metrics.silhouette_score(data, estimator.labels_,
-                                       metric='euclidean',
-                                       sample_size=sample_size)))
-
- bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
-               name="k-means++", data=data)
-
- bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
-               name="random", data=data)
-
- # in this case the seeding of the centers is deterministic, hence we run the
- # kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
- bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
-               name="PCA-based",
-               data=data)
+ kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
+ bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)
+

print(82 * '_')

- # #############################################################################
+ # %%
# Visualize the results on PCA-reduced data
+ # -----------------------------------------
+ #
+ # :class:`~sklearn.decomposition.PCA` allows us to project the data from the
+ # original 64-dimensional space into a lower-dimensional space. Subsequently,
+ # we can use :class:`~sklearn.decomposition.PCA` to project into a
+ # 2-dimensional space and plot the data and the clusters in this new space.
+ import matplotlib.pyplot as plt
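+ # Note that below KMeans is fitted on the 2D projection itself, so the cluster
+ # labels it finds can differ slightly from a clustering of the full
+ # 64-dimensional data.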

reduced_data = PCA(n_components=2).fit_transform(data)
- kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
+ kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
@@ -106,19 +167,17 @@ def bench_k_means(estimator, name, data):
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
- plt.imshow(Z, interpolation='nearest',
+ plt.imshow(Z, interpolation="nearest",
            extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-            cmap=plt.cm.Paired,
-            aspect='auto', origin='lower')
+            cmap=plt.cm.Paired, aspect="auto", origin="lower")

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
- plt.scatter(centroids[:, 0], centroids[:, 1],
-             marker='x', s=169, linewidths=3,
-             color='w', zorder=10)
- plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
-           'Centroids are marked with white cross')
+ plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
+             color="w", zorder=10)
+ plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n"
+           "Centroids are marked with a white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())