A demo of K-Means clustering on the handwritten digits data
===========================================================

- In this example we compare the various initialization strategies for
- K-means in terms of runtime and quality of the results.
+ In this example we compare the various initialization strategies for K-means in
+ terms of runtime and quality of the results.

- As the ground truth is known here, we also apply different cluster
- quality metrics to judge the goodness of fit of the cluster labels to the
- ground truth.
+ As the ground truth is known here, we also apply different cluster quality
+ metrics to judge the goodness of fit of the cluster labels to the ground truth.

Cluster quality metrics evaluated (see :ref:`clustering_evaluation` for
definitions and discussions of the metrics):

===========  ========================================================
Shorthand    full name
===========  ========================================================
homo         homogeneity score
compl        completeness score
v-meas       V measure
ARI          adjusted Rand index
AMI          adjusted mutual information
silhouette   silhouette coefficient
===========  ========================================================
-
"""
print(__doc__)

- from time import time
+ # %%
+ # Load the dataset
+ # ----------------
+ #
+ # We will start by loading the `digits` dataset. This dataset contains
+ # handwritten digits from 0 to 9. In the context of clustering, one would like
+ # to group images such that the handwritten digits on the images are the same.
+
import numpy as np
- import matplotlib.pyplot as plt
+ from sklearn.datasets import load_digits

+ data, labels = load_digits(return_X_y=True)
+ (n_samples, n_features), n_digits = data.shape, np.unique(labels).size
+
+ print(
+     f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
+ )
+
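+ # With the copy of the digits dataset bundled in scikit-learn, this should
+ # report 10 digit classes, 1797 samples and 64 features (8x8 pixel images).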
+ # %%
+ # Define our evaluation benchmark
+ # -------------------------------
+ #
+ # We will first define our evaluation benchmark. During this benchmark, we
+ # intend to compare different initialization methods for KMeans. Our benchmark
+ # will:
+ #
+ # * create a pipeline which will scale the data using a
+ #   :class:`~sklearn.preprocessing.StandardScaler`;
+ # * train and time the pipeline fitting;
+ # * measure the performance of the clustering obtained via different metrics.
+ from time import time
from sklearn import metrics
+ from sklearn.pipeline import make_pipeline
+ from sklearn.preprocessing import StandardScaler
+
+
+ def bench_k_means(kmeans, name, data, labels):
+     """Benchmark to evaluate the KMeans initialization methods.
+
+     Parameters
+     ----------
+     kmeans : KMeans instance
+         A :class:`~sklearn.cluster.KMeans` instance with the initialization
+         already set.
+     name : str
+         Name given to the strategy. It will be used to show the results in a
+         table.
+     data : ndarray of shape (n_samples, n_features)
+         The data to cluster.
+     labels : ndarray of shape (n_samples,)
+         The labels used to compute the clustering metrics which require some
+         supervision.
+     """
+     t0 = time()
+     estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
+     fit_time = time() - t0
+     results = [name, fit_time, estimator[-1].inertia_]
+
+     # Define the metrics which require only the true labels and estimator
+     # labels
+     clustering_metrics = [
+         metrics.homogeneity_score,
+         metrics.completeness_score,
+         metrics.v_measure_score,
+         metrics.adjusted_rand_score,
+         metrics.adjusted_mutual_info_score,
+     ]
+     results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
+
+     # The silhouette score requires the full dataset
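+     # (here we pass sample_size=300, so the score is estimated on a random
+     # subsample of the data, which keeps the benchmark fast)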
+     results += [
+         metrics.silhouette_score(data, estimator[-1].labels_,
+                                  metric="euclidean", sample_size=300)
+     ]
+
+     # Show the results
+     formatter_result = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}"
+                         "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}")
+     print(formatter_result.format(*results))
+
+
+ # %%
+ # Run the benchmark
+ # -----------------
+ #
+ # We will compare three approaches:
+ #
+ # * an initialization using `k-means++`. This method is stochastic and we will
+ #   run the initialization 4 times;
+ # * a random initialization. This method is stochastic as well and we will run
+ #   the initialization 4 times;
+ # * an initialization based on a :class:`~sklearn.decomposition.PCA`
+ #   projection. Indeed, we will use the components of the
+ #   :class:`~sklearn.decomposition.PCA` to initialize KMeans. This method is
+ #   deterministic and a single initialization suffices.
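+ #
+ # Note that :attr:`~sklearn.decomposition.PCA.components_` has shape
+ # ``(n_digits, n_features)``, i.e. one vector per requested component, so it
+ # can be passed directly as the array of initial cluster centers that KMeans
+ # expects.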
from sklearn.cluster import KMeans
- from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
- from sklearn.preprocessing import scale
-
- np.random.seed(42)
-
- X_digits, y_digits = load_digits(return_X_y=True)
- data = scale(X_digits)
-
- n_samples, n_features = data.shape
- n_digits = len(np.unique(y_digits))
- labels = y_digits
-
- sample_size = 300
-
- print("n_digits: %d, \t n_samples %d, \t n_features %d"
-       % (n_digits, n_samples, n_features))
-

print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

+ kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4,
+                 random_state=0)
+ bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)
+
+ kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
+ bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)
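+
+ # Fixing `random_state` above makes the two stochastic initializations
+ # reproducible from run to run.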

- def bench_k_means(estimator, name, data):
-     t0 = time()
-     estimator.fit(data)
-     print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
-           % (name, (time() - t0), estimator.inertia_,
-              metrics.homogeneity_score(labels, estimator.labels_),
-              metrics.completeness_score(labels, estimator.labels_),
-              metrics.v_measure_score(labels, estimator.labels_),
-              metrics.adjusted_rand_score(labels, estimator.labels_),
-              metrics.adjusted_mutual_info_score(labels, estimator.labels_),
-              metrics.silhouette_score(data, estimator.labels_,
-                                       metric='euclidean',
-                                       sample_size=sample_size)))
-
- bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
-               name="k-means++", data=data)
-
- bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
-               name="random", data=data)
-
- # in this case the seeding of the centers is deterministic, hence we run the
- # kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
- bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
-               name="PCA-based",
-               data=data)
+ kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
+ bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)
+

print(82 * '_')

- # #############################################################################
+ # %%
# Visualize the results on PCA-reduced data
+ # -----------------------------------------
+ #
+ # :class:`~sklearn.decomposition.PCA` allows us to project the data from the
+ # original 64-dimensional space into a lower-dimensional space. Subsequently,
+ # we can use :class:`~sklearn.decomposition.PCA` to project into a
+ # 2-dimensional space and plot the data and the clusters in this new space.
+ import matplotlib.pyplot as plt
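+ # Note that below KMeans is fitted on the 2D projection itself, so the cluster
+ # labels it finds can differ slightly from a clustering of the full
+ # 64-dimensional data.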

reduced_data = PCA(n_components=2).fit_transform(data)
- kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
+ kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
@@ -106,19 +167,17 @@ def bench_k_means(estimator, name, data):
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
- plt.imshow(Z, interpolation='nearest',
+ plt.imshow(Z, interpolation="nearest",
            extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-            cmap=plt.cm.Paired,
-            aspect='auto', origin='lower')
+            cmap=plt.cm.Paired, aspect="auto", origin="lower")

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
- plt.scatter(centroids[:, 0], centroids[:, 1],
-             marker='x', s=169, linewidths=3,
-             color='w', zorder=10)
- plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
-           'Centroids are marked with white cross')
+ plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
+             color="w", zorder=10)
+ plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n"
+           "Centroids are marked with a white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())