Commit 493ca8b

Pushing the docs to dev/ for branch: master, commit e31b21d01fba9070096b74a041ea6cd818e4f77b
1 parent af7ef66 commit 493ca8b

1,221 files changed: 4,363 additions, 3,994 deletions
dev/_downloads/5a87b25ba023ee709595b8d02049f021/plot_kmeans_digits.py

Lines changed: 119 additions & 60 deletions

@@ -3,12 +3,11 @@
 A demo of K-Means clustering on the handwritten digits data
 ===========================================================
 
-In this example we compare the various initialization strategies for
-K-means in terms of runtime and quality of the results.
+In this example we compare the various initialization strategies for K-means in
+terms of runtime and quality of the results.
 
-As the ground truth is known here, we also apply different cluster
-quality metrics to judge the goodness of fit of the cluster labels to the
-ground truth.
+As the ground truth is known here, we also apply different cluster quality
+metrics to judge the goodness of fit of the cluster labels to the ground truth.
 
 Cluster quality metrics evaluated (see :ref:`clustering_evaluation` for
 definitions and discussions of the metrics):
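
Of the cluster quality metrics listed in this docstring, the silhouette
coefficient is the only one that does not use the ground-truth labels. For
reference, its standard per-sample definition is

    s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}

where a(i) is the mean distance from sample i to the other members of its own
cluster and b(i) is the mean distance from i to the members of the nearest
other cluster.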
@@ -23,72 +22,134 @@
 AMI         adjusted mutual information
 silhouette  silhouette coefficient
 =========== ========================================================
-
 """
 print(__doc__)
 
-from time import time
+# %%
+# Load the dataset
+# ----------------
+#
+# We will start by loading the `digits` dataset. This dataset contains
+# handwritten digits from 0 to 9. In the context of clustering, one would like
+# to group images such that the handwritten digits in the images are the same.
+
 import numpy as np
-import matplotlib.pyplot as plt
+from sklearn.datasets import load_digits
 
+data, labels = load_digits(return_X_y=True)
+(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
+
+print(
+    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
+)
+
+# %%
+# Define our evaluation benchmark
+# -------------------------------
+#
+# We will first define our evaluation benchmark. During it, we intend to
+# compare different initialization methods for KMeans. Our benchmark will:
+#
+# * create a pipeline which will scale the data using a
+#   :class:`~sklearn.preprocessing.StandardScaler`;
+# * train and time the pipeline fitting;
+# * measure the performance of the clustering obtained via different metrics.
+from time import time
 from sklearn import metrics
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+
+def bench_k_means(kmeans, name, data, labels):
+    """Benchmark to evaluate the KMeans initialization methods.
+
+    Parameters
+    ----------
+    kmeans : KMeans instance
+        A :class:`~sklearn.cluster.KMeans` instance with the initialization
+        already set.
+    name : str
+        Name given to the strategy. It will be used to show the results in a
+        table.
+    data : ndarray of shape (n_samples, n_features)
+        The data to cluster.
+    labels : ndarray of shape (n_samples,)
+        The labels used to compute the clustering metrics which require some
+        supervision.
+    """
+    t0 = time()
+    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
+    fit_time = time() - t0
+    results = [name, fit_time, estimator[-1].inertia_]
+
+    # Define the metrics which require only the true labels and estimator
+    # labels
+    clustering_metrics = [
+        metrics.homogeneity_score,
+        metrics.completeness_score,
+        metrics.v_measure_score,
+        metrics.adjusted_rand_score,
+        metrics.adjusted_mutual_info_score,
+    ]
+    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
+
+    # The silhouette score requires the full dataset
+    results += [
+        metrics.silhouette_score(data, estimator[-1].labels_,
+                                 metric="euclidean", sample_size=300,)
+    ]
+
+    # Show the results
+    formatter_result = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}"
+                        "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}")
+    print(formatter_result.format(*results))
+
+
+# %%
+# Run the benchmark
+# -----------------
+#
+# We will compare three approaches:
+#
+# * an initialization using `kmeans++`. This method is stochastic and we will
+#   run the initialization 4 times;
+# * a random initialization. This method is stochastic as well and we will run
+#   the initialization 4 times;
+# * an initialization based on a :class:`~sklearn.decomposition.PCA`
+#   projection. Indeed, we will use the components of the
+#   :class:`~sklearn.decomposition.PCA` to initialize KMeans. This method is
+#   deterministic and a single initialization suffices.
 from sklearn.cluster import KMeans
-from sklearn.datasets import load_digits
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import scale
-
-np.random.seed(42)
-
-X_digits, y_digits = load_digits(return_X_y=True)
-data = scale(X_digits)
-
-n_samples, n_features = data.shape
-n_digits = len(np.unique(y_digits))
-labels = y_digits
-
-sample_size = 300
-
-print("n_digits: %d, \t n_samples %d, \t n_features %d"
-      % (n_digits, n_samples, n_features))
-
 
 print(82 * '_')
 print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
 
+kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4,
+                random_state=0)
+bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)
+
+kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
+bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)
 
-def bench_k_means(estimator, name, data):
-    t0 = time()
-    estimator.fit(data)
-    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
-          % (name, (time() - t0), estimator.inertia_,
-             metrics.homogeneity_score(labels, estimator.labels_),
-             metrics.completeness_score(labels, estimator.labels_),
-             metrics.v_measure_score(labels, estimator.labels_),
-             metrics.adjusted_rand_score(labels, estimator.labels_),
-             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
-             metrics.silhouette_score(data, estimator.labels_,
-                                      metric='euclidean',
-                                      sample_size=sample_size)))
-
-bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
-              name="k-means++", data=data)
-
-bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
-              name="random", data=data)
-
-# in this case the seeding of the centers is deterministic, hence we run the
-# kmeans algorithm only once with n_init=1
 pca = PCA(n_components=n_digits).fit(data)
-bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
-              name="PCA-based",
-              data=data)
+kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
+bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)
+
 print(82 * '_')
 
-# #############################################################################
+# %%
 # Visualize the results on PCA-reduced data
+# -----------------------------------------
+#
+# :class:`~sklearn.decomposition.PCA` allows projecting the data from the
+# original 64-dimensional space into a lower dimensional space. Subsequently,
+# we can use :class:`~sklearn.decomposition.PCA` to project into a
+# 2-dimensional space and plot the data and the clusters in this new space.
+import matplotlib.pyplot as plt
 
 reduced_data = PCA(n_components=2).fit_transform(data)
-kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
+kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
 kmeans.fit(reduced_data)
 
 # Step size of the mesh. Decrease to increase the quality of the VQ.
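
The PCA-based strategy in this hunk works because `PCA(n_components=n_digits)`
exposes a `components_` array of shape `(n_digits, n_features)`, i.e. exactly
one candidate center per cluster, so the seeding is deterministic and
`n_init=1` is enough. A minimal standalone sketch (not part of this commit)
that checks the shapes:

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_digits
    from sklearn.decomposition import PCA

    data, labels = load_digits(return_X_y=True)
    n_digits = np.unique(labels).size       # 10 clusters, one per digit

    pca = PCA(n_components=n_digits).fit(data)
    print(pca.components_.shape)            # (10, 64): one row per centroid

    # Each principal component is used verbatim as an initial centroid, so a
    # single run (n_init=1) reproduces the same clustering every time.
    kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
    kmeans.fit(data)
    print(kmeans.inertia_)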
@@ -106,19 +167,17 @@ def bench_k_means(estimator, name, data):
 Z = Z.reshape(xx.shape)
 plt.figure(1)
 plt.clf()
-plt.imshow(Z, interpolation='nearest',
+plt.imshow(Z, interpolation="nearest",
            extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-           cmap=plt.cm.Paired,
-           aspect='auto', origin='lower')
+           cmap=plt.cm.Paired, aspect="auto", origin="lower")
 
 plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
 # Plot the centroids as a white X
 centroids = kmeans.cluster_centers_
-plt.scatter(centroids[:, 0], centroids[:, 1],
-            marker='x', s=169, linewidths=3,
-            color='w', zorder=10)
-plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
-          'Centroids are marked with white cross')
+plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
+            color="w", zorder=10)
+plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n"
+          "Centroids are marked with a white cross")
 plt.xlim(x_min, x_max)
 plt.ylim(y_min, y_max)
 plt.xticks(())
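
The `xx`, `yy`, `x_min`, `x_max` and `Z` referenced in this hunk come from the
mesh-grid boundary-plotting code that precedes it in the file (not shown in
this diff). A minimal sketch of that pattern, assuming `kmeans` has already
been fitted on `reduced_data` as above:

    import numpy as np

    h = 0.02  # mesh step size; decrease it for a finer vector-quantization map
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Label every mesh point with its nearest centroid, then reshape back to
    # the grid so plt.imshow can draw the cluster regions as an image.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)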
