scikit-learn
diff --git a/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
-3 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
-3 Bytes
diff --git a/‎dev/_downloads/18eb95af29bd5554020a8428b3ceac54/plot_cluster_iris.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/18eb95af29bd5554020a8428b3ceac54/plot_cluster_iris.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
-10 Bytes b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
-10 Bytes
diff --git a/‎dev/_downloads/751db3d5e6b909ff00972495eaae53df/plot_document_clustering.ipynb
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/751db3d5e6b909ff00972495eaae53df/plot_document_clustering.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/82ec115874a062f9e8fa17efc63384c0/plot_color_quantization.ipynb
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/82ec115874a062f9e8fa17efc63384c0/plot_color_quantization.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/a315e003c9ce53b89d5fa110538885fd/plot_cluster_iris.py
Lines changed: 4 additions & 5 deletions b/‎dev/_downloads/a315e003c9ce53b89d5fa110538885fd/plot_cluster_iris.py
Lines changed: 4 additions & 5 deletions
diff --git a/‎dev/_downloads/ba68199eea858ec04949b2c6c65147e0/plot_document_clustering.py
Lines changed: 3 additions & 2 deletions b/‎dev/_downloads/ba68199eea858ec04949b2c6c65147e0/plot_document_clustering.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎dev/_downloads/d0e47fc5f3661efb101abfd4d9461afe/plot_color_quantization.py
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/d0e47fc5f3661efb101abfd4d9461afe/plot_color_quantization.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
797 Bytes b/‎dev/_downloads/scikit-learn-docs.zip
797 Bytes
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 63371d7052f9cfa472a1470b98355ba6
+config: e5e861ce384f70b2e75a75e7791ab430
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -4,7 +4,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What the effect of a bad initialization is\n  on the classification process: By setting n_init to only 1\n  (default is 10), the amount of times that the algorithm will\n  be run with different centroid seeds is reduced.\n\n- bottom left: What using eight clusters would deliver.\n\n- bottom right: The ground truth.\n"
+        "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What using three clusters would deliver.\n\n- bottom left: What the effect of a bad initialization is\n  on the classification process: By setting n_init to only 1\n  (default is 10), the amount of times that the algorithm will\n  be run with different centroid seeds is reduced.\n\n- bottom right: The ground truth.\n"
       ]
     },
     {
@@ -15,7 +15,7 @@
       },
       "outputs": [],
       "source": [
-        "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d  # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n    (\"k_means_iris_8\", KMeans(n_clusters=8)),\n    (\"k_means_iris_3\", KMeans(n_clusters=3)),\n    (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n    ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n    est.fit(X)\n    labels = est.labels_\n\n    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n    ax.xaxis.set_ticklabels([])\n    ax.yaxis.set_ticklabels([])\n    ax.zaxis.set_ticklabels([])\n    ax.set_xlabel(\"Petal width\")\n    ax.set_ylabel(\"Sepal length\")\n    ax.set_zlabel(\"Petal length\")\n    ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n    ax.text3D(\n        X[y == label, 3].mean(),\n        X[y == label, 0].mean(),\n        X[y == label, 2].mean() + 2,\n        name,\n        horizontalalignment=\"center\",\n        bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n    )\n# Reorder the labels to have colors matching the cluster results\ny = np.choose(y, [1, 2, 0]).astype(float)\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, 
edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
+        "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d  # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n    (\"k_means_iris_8\", KMeans(n_clusters=8)),\n    (\"k_means_iris_3\", KMeans(n_clusters=3)),\n    (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n    ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n    est.fit(X)\n    labels = est.labels_\n\n    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n    ax.xaxis.set_ticklabels([])\n    ax.yaxis.set_ticklabels([])\n    ax.zaxis.set_ticklabels([])\n    ax.set_xlabel(\"Petal width\")\n    ax.set_ylabel(\"Sepal length\")\n    ax.set_zlabel(\"Petal length\")\n    ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n    ax.text3D(\n        X[y == label, 3].mean(),\n        X[y == label, 0].mean(),\n        X[y == label, 2].mean() + 2,\n        name,\n        horizontalalignment=\"center\",\n        bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n    )\n\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal 
length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
       ]
     }
   ],
 
@@ -40,7 +40,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Quantifying the quality of clustering results\n\nIn this section we define a function to score different clustering pipelines\nusing several metrics.\n\nClustering algorithms are fundamentally unsupervised learning methods.\nHowever, since we happen to have class labels for this specific dataset, it is\npossible to use evaluation metrics that leverage this \"supervised\" ground\ntruth information to quantify the quality of the resulting clusters. Examples\nof such metrics are the following:\n\n- homogeneity, which quantifies how much clusters contain only members of a\n  single class;\n\n- completeness, which quantifies how much members of a given class are\n  assigned to the same clusters;\n\n- V-measure, the harmonic mean of completeness and homogeneity;\n\n- Rand-Index, which measures how frequently pairs of data points are grouped\n  consistently according to the result of the clustering algorithm and the\n  ground truth class assignment;\n\n- Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster\n  assignment have an ARI of 0.0 in expectation.\n\nIf the ground truth labels are not known, evaluation can only be performed\nusing the model results itself. In that case, the Silhouette Coefficient comes\nin handy.\n\nFor more reference, see `clustering_evaluation`.\n\n"
+        "## Quantifying the quality of clustering results\n\nIn this section we define a function to score different clustering pipelines\nusing several metrics.\n\nClustering algorithms are fundamentally unsupervised learning methods.\nHowever, since we happen to have class labels for this specific dataset, it is\npossible to use evaluation metrics that leverage this \"supervised\" ground\ntruth information to quantify the quality of the resulting clusters. Examples\nof such metrics are the following:\n\n- homogeneity, which quantifies how much clusters contain only members of a\n  single class;\n\n- completeness, which quantifies how much members of a given class are\n  assigned to the same clusters;\n\n- V-measure, the harmonic mean of completeness and homogeneity;\n\n- Rand-Index, which measures how frequently pairs of data points are grouped\n  consistently according to the result of the clustering algorithm and the\n  ground truth class assignment;\n\n- Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster\n  assignment have an ARI of 0.0 in expectation.\n\nIf the ground truth labels are not known, evaluation can only be performed\nusing the model results itself. In that case, the Silhouette Coefficient comes in\nhandy. See `sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`\nfor an example on how to do it.\n\nFor more reference, see `clustering_evaluation`.\n\n"
       ]
     },
     {
 
@@ -15,7 +15,7 @@
       },
       "outputs": [],
       "source": [
-        "# Authors: Robert Layton <[email protected]>\n#          Olivier Grisel <[email protected]>\n#          Mathieu Blondel <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom time import time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import load_sample_image\nfrom sklearn.metrics import pairwise_distances_argmin\nfrom sklearn.utils import shuffle\n\nn_colors = 64\n\n# Load the Summer Palace photo\nchina = load_sample_image(\"china.jpg\")\n\n# Convert to floats instead of the default 8 bits integer coding. Dividing by\n# 255 is important so that plt.imshow behaves works well on float data (need to\n# be in the range [0-1])\nchina = np.array(china, dtype=np.float64) / 255\n\n# Load Image and transform to a 2D numpy array.\nw, h, d = original_shape = tuple(china.shape)\nassert d == 3\nimage_array = np.reshape(china, (w * h, d))\n\nprint(\"Fitting model on a small sub-sample of the data\")\nt0 = time()\nimage_array_sample = shuffle(image_array, random_state=0, n_samples=1_000)\nkmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n# Get labels for all points\nprint(\"Predicting color indices on the full image (k-means)\")\nt0 = time()\nlabels = kmeans.predict(image_array)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ncodebook_random = shuffle(image_array, random_state=0, n_samples=n_colors)\nprint(\"Predicting color indices on the full image (random)\")\nt0 = time()\nlabels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ndef recreate_image(codebook, labels, w, h):\n    \"\"\"Recreate the (compressed) image from the code book & labels\"\"\"\n    return codebook[labels].reshape(w, h, -1)\n\n\n# Display all results, alongside original image\nplt.figure(1)\nplt.clf()\nplt.axis(\"off\")\nplt.title(\"Original image (96,615 
colors)\")\nplt.imshow(china)\n\nplt.figure(2)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, K-Means)\")\nplt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))\n\nplt.figure(3)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, Random)\")\nplt.imshow(recreate_image(codebook_random, labels_random, w, h))\nplt.show()"
+        "# Authors: Robert Layton <[email protected]>\n#          Olivier Grisel <[email protected]>\n#          Mathieu Blondel <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom time import time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import load_sample_image\nfrom sklearn.metrics import pairwise_distances_argmin\nfrom sklearn.utils import shuffle\n\nn_colors = 64\n\n# Load the Summer Palace photo\nchina = load_sample_image(\"china.jpg\")\n\n# Convert to floats instead of the default 8 bits integer coding. Dividing by\n# 255 is important so that plt.imshow works well on float data (need to\n# be in the range [0-1])\nchina = np.array(china, dtype=np.float64) / 255\n\n# Load Image and transform to a 2D numpy array.\nw, h, d = original_shape = tuple(china.shape)\nassert d == 3\nimage_array = np.reshape(china, (w * h, d))\n\nprint(\"Fitting model on a small sub-sample of the data\")\nt0 = time()\nimage_array_sample = shuffle(image_array, random_state=0, n_samples=1_000)\nkmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n# Get labels for all points\nprint(\"Predicting color indices on the full image (k-means)\")\nt0 = time()\nlabels = kmeans.predict(image_array)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ncodebook_random = shuffle(image_array, random_state=0, n_samples=n_colors)\nprint(\"Predicting color indices on the full image (random)\")\nt0 = time()\nlabels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ndef recreate_image(codebook, labels, w, h):\n    \"\"\"Recreate the (compressed) image from the code book & labels\"\"\"\n    return codebook[labels].reshape(w, h, -1)\n\n\n# Display all results, alongside original image\nplt.figure(1)\nplt.clf()\nplt.axis(\"off\")\nplt.title(\"Original image (96,615 
colors)\")\nplt.imshow(china)\n\nplt.figure(2)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, K-Means)\")\nplt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))\n\nplt.figure(3)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, Random)\")\nplt.imshow(recreate_image(codebook_random, labels_random, w, h))\nplt.show()"
       ]
     }
   ],
 
@@ -7,13 +7,13 @@
 
 - top left: What a K-means algorithm would yield using 8 clusters.
 
-- top right: What the effect of a bad initialization is
+- top right: What using three clusters would deliver.
+
+- bottom left: What the effect of a bad initialization is
   on the classification process: By setting n_init to only 1
  (default is 10), the number of times that the algorithm will
   be run with different centroid seeds is reduced.
 
-- bottom left: What using eight clusters would deliver.
-
 - bottom right: The ground truth.
 
 """
@@ -73,8 +73,7 @@
         horizontalalignment="center",
         bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"),
     )
-# Reorder the labels to have colors matching the cluster results
-y = np.choose(y, [1, 2, 0]).astype(float)
+
 ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k")
 
 ax.xaxis.set_ticklabels([])
 
@@ -99,8 +99,9 @@
 #   assignments have an ARI of 0.0 in expectation.
 #
 # If the ground truth labels are not known, evaluation can only be performed
-# using the model results itself. In that case, the Silhouette Coefficient comes
-# in handy.
+# using the model results themselves. In that case, the Silhouette Coefficient comes in
+# handy. See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`
+# for an example of how to do it.
 #
 # For more reference, see :ref:`clustering_evaluation`.
 
 
@@ -41,7 +41,7 @@
 china = load_sample_image("china.jpg")
 
 # Convert to floats instead of the default 8-bit integer coding. Dividing by
-# 255 is important so that plt.imshow behaves works well on float data (need to
+# 255 is important so that plt.imshow works well on float data (need to
 # be in the range [0-1])
 china = np.array(china, dtype=np.float64) / 255
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`"cell_type": "markdown",`
`5`	`5`	`"metadata": {},`
`6`	`6`	`"source": [`
`7`		`- "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What the effect of a bad initialization is\n on the classification process: By setting n_init to only 1\n (default is 10), the amount of times that the algorithm will\n be run with different centroid seeds is reduced.\n\n- bottom left: What using eight clusters would deliver.\n\n- bottom right: The ground truth.\n"`
	`7`	`+ "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What using three clusters would deliver.\n\n- bottom left: What the effect of a bad initialization is\n on the classification process: By setting n_init to only 1\n (default is 10), the amount of times that the algorithm will\n be run with different centroid seeds is reduced.\n\n- bottom right: The ground truth.\n"`
`8`	`8`	`]`
`9`	`9`	`},`
`10`	`10`	`{`
`@@ -15,7 +15,7 @@`
`15`	`15`	`},`
`16`	`16`	`"outputs": [],`
`17`	`17`	`"source": [`
`18`		- "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n (\"k_means_iris_8\", KMeans(n_clusters=8)),\n (\"k_means_iris_3\", KMeans(n_clusters=3)),\n (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n est.fit(X)\n labels = est.labels_\n\n ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n ax.xaxis.set_ticklabels([])\n ax.yaxis.set_ticklabels([])\n ax.zaxis.set_ticklabels([])\n ax.set_xlabel(\"Petal width\")\n ax.set_ylabel(\"Sepal length\")\n ax.set_zlabel(\"Petal length\")\n ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 3].mean(),\n X[y == label, 0].mean(),\n X[y == label, 2].mean() + 2,\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n )\n# Reorder the labels to have colors matching the cluster results\ny = np.choose(y, [1, 2, 0]).astype(float)\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal 
length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
	`18`	+ "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n (\"k_means_iris_8\", KMeans(n_clusters=8)),\n (\"k_means_iris_3\", KMeans(n_clusters=3)),\n (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n est.fit(X)\n labels = est.labels_\n\n ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n ax.xaxis.set_ticklabels([])\n ax.yaxis.set_ticklabels([])\n ax.zaxis.set_ticklabels([])\n ax.set_xlabel(\"Petal width\")\n ax.set_ylabel(\"Sepal length\")\n ax.set_zlabel(\"Petal length\")\n ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 3].mean(),\n X[y == label, 0].mean(),\n X[y == label, 2].mean() + 2,\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n )\n\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, 
hspace=0.25)\nplt.show()"
`19`	`19`	`]`
`20`	`20`	`}`
`21`	`21`	`],`
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@`
`40`	`40`	`"cell_type": "markdown",`
`41`	`41`	`"metadata": {},`
`42`	`42`	`"source": [`
`43`		- "## Quantifying the quality of clustering results\n\nIn this section we define a function to score different clustering pipelines\nusing several metrics.\n\nClustering algorithms are fundamentally unsupervised learning methods.\nHowever, since we happen to have class labels for this specific dataset, it is\npossible to use evaluation metrics that leverage this \"supervised\" ground\ntruth information to quantify the quality of the resulting clusters. Examples\nof such metrics are the following:\n\n- homogeneity, which quantifies how much clusters contain only members of a\n single class;\n\n- completeness, which quantifies how much members of a given class are\n assigned to the same clusters;\n\n- V-measure, the harmonic mean of completeness and homogeneity;\n\n- Rand-Index, which measures how frequently pairs of data points are grouped\n consistently according to the result of the clustering algorithm and the\n ground truth class assignment;\n\n- Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster\n assignment have an ARI of 0.0 in expectation.\n\nIf the ground truth labels are not known, evaluation can only be performed\nusing the model results itself. In that case, the Silhouette Coefficient comes\nin handy.\n\nFor more reference, see `clustering_evaluation`.\n\n"
	`43`	+ "## Quantifying the quality of clustering results\n\nIn this section we define a function to score different clustering pipelines\nusing several metrics.\n\nClustering algorithms are fundamentally unsupervised learning methods.\nHowever, since we happen to have class labels for this specific dataset, it is\npossible to use evaluation metrics that leverage this \"supervised\" ground\ntruth information to quantify the quality of the resulting clusters. Examples\nof such metrics are the following:\n\n- homogeneity, which quantifies how much clusters contain only members of a\n single class;\n\n- completeness, which quantifies how much members of a given class are\n assigned to the same clusters;\n\n- V-measure, the harmonic mean of completeness and homogeneity;\n\n- Rand-Index, which measures how frequently pairs of data points are grouped\n consistently according to the result of the clustering algorithm and the\n ground truth class assignment;\n\n- Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster\n assignment have an ARI of 0.0 in expectation.\n\nIf the ground truth labels are not known, evaluation can only be performed\nusing the model results itself. In that case, the Silhouette Coefficient comes in\nhandy. See `sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`\nfor an example on how to do it.\n\nFor more reference, see `clustering_evaluation`.\n\n"
`44`	`44`	`]`
`45`	`45`	`},`
`46`	`46`	`{`
Original file line number	Diff line number	Diff line change
`@@ -99,8 +99,9 @@`
`99`	`99`	`# assignment have an ARI of 0.0 in expectation.`
`100`	`100`	`#`
`101`	`101`	`# If the ground truth labels are not known, evaluation can only be performed`
`102`		`-# using the model results itself. In that case, the Silhouette Coefficient comes`
`103`		`-# in handy.`
	`102`	`+# using the model results itself. In that case, the Silhouette Coefficient comes in`
	`103`	+# handy. See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`
	`104`	`+# for an example on how to do it.`
`104`	`105`	`#`
`105`	`106`	# For more reference, see :ref:`clustering_evaluation`.
`106`	`107`