Commit 332ede5 (1 parent: af59052)

Pushing the docs to dev/ for branch: main, commit cbfb6aba29e6d35fc0c3fb215b5ca114ff7207f5

File tree: 1,372 files changed, +5531 / -4858 lines


dev/_downloads/5d2d581a4569eb0718dbdb8abf7cbbdf/plot_kmeans_assumptions.py

Lines changed: 148 additions & 39 deletions
@@ -3,67 +3,176 @@
 Demonstration of k-means assumptions
 ====================================

-This example is meant to illustrate situations where k-means will produce
-unintuitive and possibly unexpected clusters. In the first three plots, the
-input data does not conform to some implicit assumption that k-means makes and
-undesirable clusters are produced as a result. In the last plot, k-means
-returns intuitive clusters despite unevenly sized blobs.
+This example is meant to illustrate situations where k-means produces
+unintuitive and possibly undesirable clusters.

 """

 # Author: Phil Roth <[email protected]>
+# Arturo Amor <[email protected]>
 # License: BSD 3 clause

-import numpy as np
-import matplotlib.pyplot as plt
+# %%
+# Data generation
+# ---------------
+#
+# The function :func:`~sklearn.datasets.make_blobs` generates isotropic
+# (spherical) gaussian blobs. To obtain anisotropic (elliptical) gaussian blobs
+# one has to define a linear `transformation`.

-from sklearn.cluster import KMeans
+import numpy as np
 from sklearn.datasets import make_blobs

-plt.figure(figsize=(12, 12))
-
 n_samples = 1500
 random_state = 170
+transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
+
 X, y = make_blobs(n_samples=n_samples, random_state=random_state)
+X_aniso = np.dot(X, transformation)  # Anisotropic blobs
+X_varied, y_varied = make_blobs(
+    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
+)  # Unequal variance
+X_filtered = np.vstack(
+    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])
+)  # Unevenly sized blobs
+y_filtered = [0] * 500 + [1] * 100 + [2] * 10

-# Incorrect number of clusters
-y_pred = KMeans(n_clusters=2, n_init="auto", random_state=random_state).fit_predict(X)
+# %%
+# We can visualize the resulting data:

-plt.subplot(221)
-plt.scatter(X[:, 0], X[:, 1], c=y_pred)
-plt.title("Incorrect Number of Blobs")
+import matplotlib.pyplot as plt

-# Anisotropicly distributed data
-transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
-X_aniso = np.dot(X, transformation)
-y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict(
-    X_aniso
-)
+fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

-plt.subplot(222)
-plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
-plt.title("Anisotropicly Distributed Blobs")
+axs[0, 0].scatter(X[:, 0], X[:, 1], c=y)
+axs[0, 0].set_title("Mixture of Gaussian Blobs")

-# Different variance
-X_varied, y_varied = make_blobs(
-    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
-)
-y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict(
-    X_varied
-)
+axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)
+axs[0, 1].set_title("Anisotropically Distributed Blobs")
+
+axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
+axs[1, 0].set_title("Unequal Variance")
+
+axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
+axs[1, 1].set_title("Unevenly Sized Blobs")
+
+plt.suptitle("Ground truth clusters").set_y(0.95)
+plt.show()
+
+# %%
+# Fit models and plot results
+# ---------------------------
+#
+# The previously generated data is now used to show how
+# :class:`~sklearn.cluster.KMeans` behaves in the following scenarios:
+#
+# - Non-optimal number of clusters: in a real setting there is no uniquely
+#   defined **true** number of clusters. An appropriate number of clusters has
+#   to be decided from data-based criteria and knowledge of the intended goal.
+# - Anisotropically distributed blobs: k-means consists of minimizing samples'
+#   euclidean distances to the centroid of the cluster they are assigned to. As
+#   a consequence, k-means is more appropriate for clusters that are isotropic
+#   and normally distributed (i.e. spherical gaussians).
+# - Unequal variance: k-means is equivalent to taking the maximum likelihood
+#   estimator for a "mixture" of k gaussian distributions with the same
+#   variances but with possibly different means.
+# - Unevenly sized blobs: there is no theoretical result about k-means that
+#   states that it requires similar cluster sizes to perform well, yet
+#   minimizing euclidean distances does mean that the more sparse and
+#   high-dimensional the problem is, the greater the need to run the algorithm
+#   with different centroid seeds to ensure a global minimal inertia.

-plt.subplot(223)
-plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
-plt.title("Unequal Variance")
+from sklearn.cluster import KMeans
+
+common_params = {
+    "n_init": "auto",
+    "random_state": random_state,
+}
+
+fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
+
+y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X)
+axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)
+axs[0, 0].set_title("Non-optimal Number of Clusters")
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso)
+axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+axs[0, 1].set_title("Anisotropically Distributed Blobs")
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied)
+axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+axs[1, 0].set_title("Unequal Variance")
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered)
+axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
+axs[1, 1].set_title("Unevenly Sized Blobs")
+
+plt.suptitle("Unexpected KMeans clusters").set_y(0.95)
+plt.show()
+
+# %%
+# Possible solutions
+# ------------------
+#
+# For an example on how to find a correct number of blobs, see
+# :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.
+# In this case it suffices to set `n_clusters=3`.
+
+y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X)
+plt.scatter(X[:, 0], X[:, 1], c=y_pred)
+plt.title("Optimal Number of Clusters")
+plt.show()
+
+# %%
+# To deal with unevenly sized blobs one can increase the number of random
+# initializations. In this case we set `n_init=10` to avoid finding a
+# sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`.

-# Unevenly sized blobs
-X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
 y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(
     X_filtered
 )
-
-plt.subplot(224)
 plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
-plt.title("Unevenly Sized Blobs")
+plt.title("Unevenly Sized Blobs \nwith several initializations")
+plt.show()
+
+# %%
+# As anisotropic and unequal variances are real limitations of the k-means
+# algorithm, here we propose instead the use of
+# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian
+# clusters but does not impose any constraints on their variances. Notice that
+# one still has to find the correct number of blobs (see
+# :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`).
+#
+# For an example on how other clustering methods deal with anisotropic or
+# unequal variance blobs, see the example
+# :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`.

+from sklearn.mixture import GaussianMixture
+
+fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
+
+y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso)
+ax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+ax1.set_title("Anisotropically Distributed Blobs")
+
+y_pred = GaussianMixture(n_components=3).fit_predict(X_varied)
+ax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+ax2.set_title("Unequal Variance")
+
+plt.suptitle("Gaussian mixture clusters").set_y(0.95)
 plt.show()
+
+# %%
+# Final remarks
+# -------------
+#
+# In high-dimensional spaces, Euclidean distances tend to become inflated
+# (not shown in this example). Running a dimensionality reduction algorithm
+# prior to k-means clustering can alleviate this problem and speed up the
+# computations (see the example
+# :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`).
+#
+# In the case where clusters are known to be isotropic, have similar variance
+# and are not too sparse, the k-means algorithm is quite effective and is one of
+# the fastest clustering algorithms available. This advantage is lost if one has
+# to restart it several times to avoid convergence to a local minimum.
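
The "Possible solutions" section above sets `n_clusters=3` directly and defers the selection procedure to the silhouette example. A minimal sketch of that selection step, assuming the same `make_blobs` data as in the example; the candidate range and the `scores` dict are illustrative and not part of this commit:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=1500, random_state=170)

# Score each candidate k; silhouette_score lies in [-1, 1] and is larger for
# dense, well-separated clusters.
scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=170).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
print(f"silhouette scores: {scores}, best k: {best_k}")  # should favor k=3 here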

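The final remarks in the script above recommend running a dimensionality reduction step before k-means in high-dimensional settings. A hedged sketch of that pipeline, assuming the 64-dimensional digits dataset and a 90% explained-variance cutoff, both illustrative choices rather than part of this commit:

from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

X_digits, _ = load_digits(return_X_y=True)  # 64 features per sample

# Keep enough principal components to explain ~90% of the variance, then
# cluster in the reduced space, where euclidean distances are cheaper to
# compute and less prone to the inflation mentioned in the final remarks.
pipeline = make_pipeline(PCA(n_components=0.9), KMeans(n_clusters=10, n_init=10))
labels = pipeline.fit_predict(X_digits)
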
dev/_downloads/b05e6cdf6d51481f37bf29b0bb92995e/plot_kmeans_assumptions.ipynb

Lines changed: 117 additions & 2 deletions
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Demonstration of k-means assumptions\n\nThis example is meant to illustrate situations where k-means will produce\nunintuitive and possibly unexpected clusters. In the first three plots, the\ninput data does not conform to some implicit assumption that k-means makes and\nundesirable clusters are produced as a result. In the last plot, k-means\nreturns intuitive clusters despite unevenly sized blobs.\n"
+"\n# Demonstration of k-means assumptions\n\nThis example is meant to illustrate situations where k-means produces\nunintuitive and possibly undesirable clusters.\n"
 ]
 },
 {
@@ -26,7 +26,122 @@
 },
 "outputs": [],
 "source": [
-"# Author: Phil Roth <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import make_blobs\n\nplt.figure(figsize=(12, 12))\n\nn_samples = 1500\nrandom_state = 170\nX, y = make_blobs(n_samples=n_samples, random_state=random_state)\n\n# Incorrect number of clusters\ny_pred = KMeans(n_clusters=2, n_init=\"auto\", random_state=random_state).fit_predict(X)\n\nplt.subplot(221)\nplt.scatter(X[:, 0], X[:, 1], c=y_pred)\nplt.title(\"Incorrect Number of Blobs\")\n\n# Anisotropicly distributed data\ntransformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]\nX_aniso = np.dot(X, transformation)\ny_pred = KMeans(n_clusters=3, n_init=\"auto\", random_state=random_state).fit_predict(\n    X_aniso\n)\n\nplt.subplot(222)\nplt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)\nplt.title(\"Anisotropicly Distributed Blobs\")\n\n# Different variance\nX_varied, y_varied = make_blobs(\n    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state\n)\ny_pred = KMeans(n_clusters=3, n_init=\"auto\", random_state=random_state).fit_predict(\n    X_varied\n)\n\nplt.subplot(223)\nplt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)\nplt.title(\"Unequal Variance\")\n\n# Unevenly sized blobs\nX_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))\ny_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(\n    X_filtered\n)\n\nplt.subplot(224)\nplt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)\nplt.title(\"Unevenly Sized Blobs\")\n\nplt.show()"
+"# Author: Phil Roth <[email protected]>\n# Arturo Amor <[email protected]>\n# License: BSD 3 clause"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data generation\n\nThe function :func:`~sklearn.datasets.make_blobs` generates isotropic\n(spherical) gaussian blobs. To obtain anisotropic (elliptical) gaussian blobs\none has to define a linear `transformation`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.datasets import make_blobs\n\nn_samples = 1500\nrandom_state = 170\ntransformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]\n\nX, y = make_blobs(n_samples=n_samples, random_state=random_state)\nX_aniso = np.dot(X, transformation)  # Anisotropic blobs\nX_varied, y_varied = make_blobs(\n    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state\n)  # Unequal variance\nX_filtered = np.vstack(\n    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])\n)  # Unevenly sized blobs\ny_filtered = [0] * 500 + [1] * 100 + [2] * 10"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We can visualize the resulting data:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nfig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))\n\naxs[0, 0].scatter(X[:, 0], X[:, 1], c=y)\naxs[0, 0].set_title(\"Mixture of Gaussian Blobs\")\n\naxs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)\naxs[0, 1].set_title(\"Anisotropically Distributed Blobs\")\n\naxs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)\naxs[1, 0].set_title(\"Unequal Variance\")\n\naxs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)\naxs[1, 1].set_title(\"Unevenly Sized Blobs\")\n\nplt.suptitle(\"Ground truth clusters\").set_y(0.95)\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Fit models and plot results\n\nThe previously generated data is now used to show how\n:class:`~sklearn.cluster.KMeans` behaves in the following scenarios:\n\n- Non-optimal number of clusters: in a real setting there is no uniquely\n  defined **true** number of clusters. An appropriate number of clusters has\n  to be decided from data-based criteria and knowledge of the intended goal.\n- Anisotropically distributed blobs: k-means consists of minimizing samples'\n  euclidean distances to the centroid of the cluster they are assigned to. As\n  a consequence, k-means is more appropriate for clusters that are isotropic\n  and normally distributed (i.e. spherical gaussians).\n- Unequal variance: k-means is equivalent to taking the maximum likelihood\n  estimator for a \"mixture\" of k gaussian distributions with the same\n  variances but with possibly different means.\n- Unevenly sized blobs: there is no theoretical result about k-means that\n  states that it requires similar cluster sizes to perform well, yet\n  minimizing euclidean distances does mean that the more sparse and\n  high-dimensional the problem is, the greater the need to run the algorithm\n  with different centroid seeds to ensure a global minimal inertia.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.cluster import KMeans\n\ncommon_params = {\n    \"n_init\": \"auto\",\n    \"random_state\": random_state,\n}\n\nfig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))\n\ny_pred = KMeans(n_clusters=2, **common_params).fit_predict(X)\naxs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)\naxs[0, 0].set_title(\"Non-optimal Number of Clusters\")\n\ny_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso)\naxs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)\naxs[0, 1].set_title(\"Anisotropically Distributed Blobs\")\n\ny_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied)\naxs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)\naxs[1, 0].set_title(\"Unequal Variance\")\n\ny_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered)\naxs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)\naxs[1, 1].set_title(\"Unevenly Sized Blobs\")\n\nplt.suptitle(\"Unexpected KMeans clusters\").set_y(0.95)\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Possible solutions\n\nFor an example on how to find a correct number of blobs, see\n`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.\nIn this case it suffices to set `n_clusters=3`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X)\nplt.scatter(X[:, 0], X[:, 1], c=y_pred)\nplt.title(\"Optimal Number of Clusters\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"To deal with unevenly sized blobs one can increase the number of random\ninitializations. In this case we set `n_init=10` to avoid finding a\nsub-optimal local minimum. For more details see `kmeans_sparse_high_dim`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(\n    X_filtered\n)\nplt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)\nplt.title(\"Unevenly Sized Blobs \\nwith several initializations\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"As anisotropic and unequal variances are real limitations of the k-means\nalgorithm, here we propose instead the use of\n:class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian\nclusters but does not impose any constraints on their variances. Notice that\none still has to find the correct number of blobs (see\n`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`).\n\nFor an example on how other clustering methods deal with anisotropic or\nunequal variance blobs, see the example\n`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.mixture import GaussianMixture\n\nfig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))\n\ny_pred = GaussianMixture(n_components=3).fit_predict(X_aniso)\nax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)\nax1.set_title(\"Anisotropically Distributed Blobs\")\n\ny_pred = GaussianMixture(n_components=3).fit_predict(X_varied)\nax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)\nax2.set_title(\"Unequal Variance\")\n\nplt.suptitle(\"Gaussian mixture clusters\").set_y(0.95)\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Final remarks\n\nIn high-dimensional spaces, Euclidean distances tend to become inflated\n(not shown in this example). Running a dimensionality reduction algorithm\nprior to k-means clustering can alleviate this problem and speed up the\ncomputations (see the example\n`sphx_glr_auto_examples_text_plot_document_clustering.py`).\n\nIn the case where clusters are known to be isotropic, have similar variance\nand are not too sparse, the k-means algorithm is quite effective and is one of\nthe fastest clustering algorithms available. This advantage is lost if one has\nto restart it several times to avoid convergence to a local minimum.\n\n"
 ]
 }
 ],

