Commit 332ede5 (1 parent: af59052)

Pushing the docs to dev/ for branch: main, commit cbfb6aba29e6d35fc0c3fb215b5ca114ff7207f5

File tree: 1,372 files changed, +5531 / -4858 lines


dev/_downloads/5d2d581a4569eb0718dbdb8abf7cbbdf/plot_kmeans_assumptions.py

Lines changed: 148 additions & 39 deletions
@@ -3,67 +3,176 @@
 Demonstration of k-means assumptions
 ====================================

-This example is meant to illustrate situations where k-means will produce
-unintuitive and possibly unexpected clusters. In the first three plots, the
-input data does not conform to some implicit assumption that k-means makes and
-undesirable clusters are produced as a result. In the last plot, k-means
-returns intuitive clusters despite unevenly sized blobs.
+This example is meant to illustrate situations where k-means produces
+unintuitive and possibly undesirable clusters.

 """

 # Author: Phil Roth <[email protected]>
+# Arturo Amor <[email protected]>
 # License: BSD 3 clause

-import numpy as np
-import matplotlib.pyplot as plt
+# %%
+# Data generation
+# ---------------
+#
+# The function :func:`~sklearn.datasets.make_blobs` generates isotropic
+# (spherical) gaussian blobs. To obtain anisotropic (elliptical) gaussian blobs
+# one has to define a linear `transformation`.

-from sklearn.cluster import KMeans
+import numpy as np
 from sklearn.datasets import make_blobs

-plt.figure(figsize=(12, 12))
-
 n_samples = 1500
 random_state = 170
+transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
+
 X, y = make_blobs(n_samples=n_samples, random_state=random_state)
+X_aniso = np.dot(X, transformation)  # Anisotropic blobs
+X_varied, y_varied = make_blobs(
+    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
+)  # Unequal variance
+X_filtered = np.vstack(
+    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])
+)  # Unevenly sized blobs
+y_filtered = [0] * 500 + [1] * 100 + [2] * 10

-# Incorrect number of clusters
-y_pred = KMeans(n_clusters=2, n_init="auto", random_state=random_state).fit_predict(X)
+# %%
+# We can visualize the resulting data:

-plt.subplot(221)
-plt.scatter(X[:, 0], X[:, 1], c=y_pred)
-plt.title("Incorrect Number of Blobs")
+import matplotlib.pyplot as plt

-# Anisotropicly distributed data
-transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
-X_aniso = np.dot(X, transformation)
-y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict(
-    X_aniso
-)
+fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

-plt.subplot(222)
-plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
-plt.title("Anisotropicly Distributed Blobs")
+axs[0, 0].scatter(X[:, 0], X[:, 1], c=y)
+axs[0, 0].set_title("Mixture of Gaussian Blobs")

-# Different variance
-X_varied, y_varied = make_blobs(
-    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
-)
-y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict(
-    X_varied
-)
+axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)
+axs[0, 1].set_title("Anisotropically Distributed Blobs")
+
+axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
+axs[1, 0].set_title("Unequal Variance")
+
+axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
+axs[1, 1].set_title("Unevenly Sized Blobs")
+
+plt.suptitle("Ground truth clusters").set_y(0.95)
+plt.show()
+
+# %%
+# Fit models and plot results
+# ---------------------------
+#
+# The previously generated data is now used to show how
+# :class:`~sklearn.cluster.KMeans` behaves in the following scenarios:
+#
+# - Non-optimal number of clusters: in a real setting there is no uniquely
+#   defined **true** number of clusters. An appropriate number of clusters has
+#   to be decided from data-based criteria and knowledge of the intended goal.
+# - Anisotropically distributed blobs: k-means consists of minimizing samples'
+#   euclidean distances to the centroid of the cluster they are assigned to. As
+#   a consequence, k-means is more appropriate for clusters that are isotropic
+#   and normally distributed (i.e. spherical gaussians).
+# - Unequal variance: k-means is equivalent to taking the maximum likelihood
+#   estimator for a "mixture" of k gaussian distributions with the same
+#   variances but with possibly different means.
+# - Unevenly sized blobs: there is no theoretical result about k-means that
+#   states that it requires similar cluster sizes to perform well, yet
+#   minimizing euclidean distances does mean that the more sparse and
+#   high-dimensional the problem is, the greater the need to run the algorithm
+#   with different centroid seeds to ensure a global minimal inertia.

-plt.subplot(223)
-plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
-plt.title("Unequal Variance")
+from sklearn.cluster import KMeans
+
+common_params = {
+    "n_init": "auto",
+    "random_state": random_state,
+}
+
+fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
+
+y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X)
+axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)
+axs[0, 0].set_title("Non-optimal Number of Clusters")
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso)
+axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+axs[0, 1].set_title("Anisotropically Distributed Blobs")
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied)
+axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+axs[1, 0].set_title("Unequal Variance")
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered)
+axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
+axs[1, 1].set_title("Unevenly Sized Blobs")
+
+plt.suptitle("Unexpected KMeans clusters").set_y(0.95)
+plt.show()
+
+# %%
+# Possible solutions
+# ------------------
+#
+# For an example on how to find a correct number of blobs, see
+# :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.
+# In this case it suffices to set `n_clusters=3`.
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X)
+plt.scatter(X[:, 0], X[:, 1], c=y_pred)
+plt.title("Optimal Number of Clusters")
+plt.show()
+
+# %%
+# To deal with unevenly sized blobs one can increase the number of random
+# initializations. In this case we set `n_init=10` to avoid finding a
+# sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`.

-# Unevenly sized blobs
-X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
 y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(
     X_filtered
 )
-
-plt.subplot(224)
 plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
-plt.title("Unevenly Sized Blobs")
+plt.title("Unevenly Sized Blobs \nwith several initializations")
+plt.show()
+
+# %%
+# As anisotropic and unequal variances are real limitations of the k-means
+# algorithm, here we propose instead the use of
+# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian
+# clusters but does not impose any constraints on their variances. Notice that
+# one still has to find the correct number of blobs (see
+# :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`).
+#
+# For an example on how other clustering methods deal with anisotropic or
+# unequal variance blobs, see the example
+# :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`.

+from sklearn.mixture import GaussianMixture
+
+fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
+
+y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso)
+ax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+ax1.set_title("Anisotropically Distributed Blobs")
+
+y_pred = GaussianMixture(n_components=3).fit_predict(X_varied)
+ax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+ax2.set_title("Unequal Variance")
+
+plt.suptitle("Gaussian mixture clusters").set_y(0.95)
 plt.show()
+
+# %%
+# Final remarks
+# -------------
+#
+# In high-dimensional spaces, Euclidean distances tend to become inflated
+# (not shown in this example). Running a dimensionality reduction algorithm
+# prior to k-means clustering can alleviate this problem and speed up the
+# computations (see the example
+# :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`).
+#
+# In the case where clusters are known to be isotropic, have similar variance
+# and are not too sparse, the k-means algorithm is quite effective and is one of
+# the fastest clustering algorithms available. This advantage is lost if one has
+# to restart it several times to avoid convergence to a local minimum.
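
The "Possible solutions" section above sets `n_clusters=3` directly and defers the selection procedure to the silhouette example. A minimal sketch of that selection step, assuming the same `make_blobs` data as in the example; the candidate range and the `scores` dict are illustrative and not part of this commit:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=1500, random_state=170)

# Score each candidate k; silhouette_score lies in [-1, 1] and is larger for
# dense, well-separated clusters.
scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=170).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
print(f"silhouette scores: {scores}, best k: {best_k}")  # should favor k=3 here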

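The final remarks in the script above recommend running a dimensionality reduction step before k-means in high-dimensional settings. A hedged sketch of that pipeline, assuming the 64-dimensional digits dataset and a 90% explained-variance cutoff, both illustrative choices rather than part of this commit:

from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

X_digits, _ = load_digits(return_X_y=True)  # 64 features per sample

# Keep enough principal components to explain ~90% of the variance, then
# cluster in the reduced space, where euclidean distances are cheaper to
# compute and less prone to the inflation mentioned in the final remarks.
pipeline = make_pipeline(PCA(n_components=0.9), KMeans(n_clusters=10, n_init=10))
labels = pipeline.fit_predict(X_digits)
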
dev/_downloads/b05e6cdf6d51481f37bf29b0bb92995e/plot_kmeans_assumptions.ipynb

Lines changed: 117 additions & 2 deletions
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Demonstration of k-means assumptions\n\nThis example is meant to illustrate situations where k-means will produce\nunintuitive and possibly unexpected clusters. In the first three plots, the\ninput data does not conform to some implicit assumption that k-means makes and\nundesirable clusters are produced as a result. In the last plot, k-means\nreturns intuitive clusters despite unevenly sized blobs.\n"
+"\n# Demonstration of k-means assumptions\n\nThis example is meant to illustrate situations where k-means produces\nunintuitive and possibly undesirable clusters.\n"
 ]
 },
 {
@@ -26,7 +26,122 @@
 },
 "outputs": [],
 "source": [
-"# Author: Phil Roth <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import make_blobs\n\nplt.figure(figsize=(12, 12))\n\nn_samples = 1500\nrandom_state = 170\nX, y = make_blobs(n_samples=n_samples, random_state=random_state)\n\n# Incorrect number of clusters\ny_pred = KMeans(n_clusters=2, n_init=\"auto\", random_state=random_state).fit_predict(X)\n\nplt.subplot(221)\nplt.scatter(X[:, 0], X[:, 1], c=y_pred)\nplt.title(\"Incorrect Number of Blobs\")\n\n# Anisotropicly distributed data\ntransformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]\nX_aniso = np.dot(X, transformation)\ny_pred = KMeans(n_clusters=3, n_init=\"auto\", random_state=random_state).fit_predict(\n    X_aniso\n)\n\nplt.subplot(222)\nplt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)\nplt.title(\"Anisotropicly Distributed Blobs\")\n\n# Different variance\nX_varied, y_varied = make_blobs(\n    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state\n)\ny_pred = KMeans(n_clusters=3, n_init=\"auto\", random_state=random_state).fit_predict(\n    X_varied\n)\n\nplt.subplot(223)\nplt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)\nplt.title(\"Unequal Variance\")\n\n# Unevenly sized blobs\nX_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))\ny_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(\n    X_filtered\n)\n\nplt.subplot(224)\nplt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)\nplt.title(\"Unevenly Sized Blobs\")\n\nplt.show()"
+"# Author: Phil Roth <[email protected]>\n# Arturo Amor <[email protected]>\n# License: BSD 3 clause"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data generation\n\nThe function :func:`~sklearn.datasets.make_blobs` generates isotropic\n(spherical) gaussian blobs. To obtain anisotropic (elliptical) gaussian blobs\none has to define a linear `transformation`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.datasets import make_blobs\n\nn_samples = 1500\nrandom_state = 170\ntransformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]\n\nX, y = make_blobs(n_samples=n_samples, random_state=random_state)\nX_aniso = np.dot(X, transformation)  # Anisotropic blobs\nX_varied, y_varied = make_blobs(\n    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state\n)  # Unequal variance\nX_filtered = np.vstack(\n    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])\n)  # Unevenly sized blobs\ny_filtered = [0] * 500 + [1] * 100 + [2] * 10"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We can visualize the resulting data:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nfig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))\n\naxs[0, 0].scatter(X[:, 0], X[:, 1], c=y)\naxs[0, 0].set_title(\"Mixture of Gaussian Blobs\")\n\naxs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)\naxs[0, 1].set_title(\"Anisotropically Distributed Blobs\")\n\naxs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)\naxs[1, 0].set_title(\"Unequal Variance\")\n\naxs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)\naxs[1, 1].set_title(\"Unevenly Sized Blobs\")\n\nplt.suptitle(\"Ground truth clusters\").set_y(0.95)\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Fit models and plot results\n\nThe previously generated data is now used to show how\n:class:`~sklearn.cluster.KMeans` behaves in the following scenarios:\n\n- Non-optimal number of clusters: in a real setting there is no uniquely\n  defined **true** number of clusters. An appropriate number of clusters has\n  to be decided from data-based criteria and knowledge of the intended goal.\n- Anisotropically distributed blobs: k-means consists of minimizing samples'\n  euclidean distances to the centroid of the cluster they are assigned to. As\n  a consequence, k-means is more appropriate for clusters that are isotropic\n  and normally distributed (i.e. spherical gaussians).\n- Unequal variance: k-means is equivalent to taking the maximum likelihood\n  estimator for a \"mixture\" of k gaussian distributions with the same\n  variances but with possibly different means.\n- Unevenly sized blobs: there is no theoretical result about k-means that\n  states that it requires similar cluster sizes to perform well, yet\n  minimizing euclidean distances does mean that the more sparse and\n  high-dimensional the problem is, the greater the need to run the algorithm\n  with different centroid seeds to ensure a global minimal inertia.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.cluster import KMeans\n\ncommon_params = {\n    \"n_init\": \"auto\",\n    \"random_state\": random_state,\n}\n\nfig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))\n\ny_pred = KMeans(n_clusters=2, **common_params).fit_predict(X)\naxs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)\naxs[0, 0].set_title(\"Non-optimal Number of Clusters\")\n\ny_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso)\naxs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)\naxs[0, 1].set_title(\"Anisotropically Distributed Blobs\")\n\ny_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied)\naxs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)\naxs[1, 0].set_title(\"Unequal Variance\")\n\ny_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered)\naxs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)\naxs[1, 1].set_title(\"Unevenly Sized Blobs\")\n\nplt.suptitle(\"Unexpected KMeans clusters\").set_y(0.95)\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Possible solutions\n\nFor an example on how to find a correct number of blobs, see\n`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.\nIn this case it suffices to set `n_clusters=3`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X)\nplt.scatter(X[:, 0], X[:, 1], c=y_pred)\nplt.title(\"Optimal Number of Clusters\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"To deal with unevenly sized blobs one can increase the number of random\ninitializations. In this case we set `n_init=10` to avoid finding a\nsub-optimal local minimum. For more details see `kmeans_sparse_high_dim`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(\n    X_filtered\n)\nplt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)\nplt.title(\"Unevenly Sized Blobs \\nwith several initializations\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"As anisotropic and unequal variances are real limitations of the k-means\nalgorithm, here we propose instead the use of\n:class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian\nclusters but does not impose any constraints on their variances. Notice that\none still has to find the correct number of blobs (see\n`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`).\n\nFor an example on how other clustering methods deal with anisotropic or\nunequal variance blobs, see the example\n`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.mixture import GaussianMixture\n\nfig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))\n\ny_pred = GaussianMixture(n_components=3).fit_predict(X_aniso)\nax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)\nax1.set_title(\"Anisotropically Distributed Blobs\")\n\ny_pred = GaussianMixture(n_components=3).fit_predict(X_varied)\nax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)\nax2.set_title(\"Unequal Variance\")\n\nplt.suptitle(\"Gaussian mixture clusters\").set_y(0.95)\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Final remarks\n\nIn high-dimensional spaces, Euclidean distances tend to become inflated\n(not shown in this example). Running a dimensionality reduction algorithm\nprior to k-means clustering can alleviate this problem and speed up the\ncomputations (see the example\n`sphx_glr_auto_examples_text_plot_document_clustering.py`).\n\nIn the case where clusters are known to be isotropic, have similar variance\nand are not too sparse, the k-means algorithm is quite effective and is one of\nthe fastest clustering algorithms available. This advantage is lost if one has\nto restart it several times to avoid convergence to a local minimum.\n\n"
 ]
 }
 ],

