
Commit 1ba0335

Pushing the docs to dev/ for branch: main, commit d7c84b239c18a4627e6af7c369a504cb7609bfcd
1 parent 0b0192d commit 1ba0335

1,232 files changed: +5787 lines added, -4744 lines removed

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# GMM Initialization Methods\n\nExamples of the different methods of initialization in Gaussian Mixture Models.\n\nSee `gmm` for more information on the estimator.\n\nHere we generate some sample data with four easy-to-identify clusters. The\npurpose of this example is to show the four different methods for the\ninitialization parameter *init_params*.\n\nThe four initializations are *kmeans* (default), *random*, *random_from_data* and\n*k-means++*.\n\nOrange diamonds represent the initialization centers produced by each\n*init_params* method. The rest of the data is represented as crosses, and the\ncolouring represents the final classification after the GMM has converged.\n\nThe numbers in the top right of each subplot give the number of iterations\ntaken for the GaussianMixture to converge and the relative time taken for the\ninitialization part of the algorithm to run. Methods with shorter\ninitialization times tend to need more iterations to converge.\n\nThe initialization time is the ratio of the time taken for that method to the\ntime taken for the default *kmeans* method. As you can see, all three\nalternative methods take less time to initialize than *kmeans*.\n\nIn this example, the model takes more iterations to converge when initialized\nwith *random_from_data* or *random*. Here *k-means++* achieves both a low\ninitialization time and a low number of GaussianMixture iterations to converge.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Gordon Walsh <[email protected]>\n# Data generation code from Jake Vanderplas <[email protected]>\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn.mixture import GaussianMixture\nfrom sklearn.utils.extmath import row_norms\nfrom sklearn.datasets import make_blobs\nfrom timeit import default_timer as timer\n\nprint(__doc__)\n\n# Generate some data\n\nX, y_true = make_blobs(n_samples=4000, centers=4, cluster_std=0.60, random_state=0)\nX = X[:, ::-1]\n\nn_samples = 4000\nn_components = 4\nx_squared_norms = row_norms(X, squared=True)\n\n\ndef get_initial_means(X, init_params, r):\n    # Run a GaussianMixture with max_iter=0 to output the initialization means\n    gmm = GaussianMixture(\n        n_components=4, init_params=init_params, tol=1e-9, max_iter=0, random_state=r\n    ).fit(X)\n    return gmm.means_\n\n\nmethods = [\"kmeans\", \"random_from_data\", \"k-means++\", \"random\"]\ncolors = [\"navy\", \"turquoise\", \"cornflowerblue\", \"darkorange\"]\ntimes_init = {}\nrelative_times = {}\n\nplt.figure(figsize=(4 * len(methods) // 2, 6))\nplt.subplots_adjust(\n    bottom=0.1, top=0.9, hspace=0.15, wspace=0.05, left=0.05, right=0.95\n)\n\nfor n, method in enumerate(methods):\n    r = np.random.RandomState(seed=1234)\n    plt.subplot(2, len(methods) // 2, n + 1)\n\n    start = timer()\n    ini = get_initial_means(X, method, r)\n    end = timer()\n    init_time = end - start\n\n    gmm = GaussianMixture(\n        n_components=4, means_init=ini, tol=1e-9, max_iter=2000, random_state=r\n    ).fit(X)\n\n    times_init[method] = init_time\n    for i, color in enumerate(colors):\n        data = X[gmm.predict(X) == i]\n        plt.scatter(data[:, 0], data[:, 1], color=color, marker=\"x\")\n\n    plt.scatter(\n        ini[:, 0], ini[:, 1], s=75, marker=\"D\", c=\"orange\", lw=1.5, edgecolors=\"black\"\n    )\n    relative_times[method] = times_init[method] / times_init[methods[0]]\n\n    plt.xticks(())\n    plt.yticks(())\n    plt.title(method, loc=\"left\", fontsize=12)\n    plt.title(\n        \"Iter %i | Init Time %.2fx\" % (gmm.n_iter_, relative_times[method]),\n        loc=\"right\",\n        fontsize=10,\n    )\nplt.suptitle(\"GMM iterations and relative time taken to initialize\")\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
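
The markdown cell above walks through four choices for the initialization method. As a quick illustration of how a user would pick one of them directly, here is a minimal sketch (not part of this commit; it assumes a scikit-learn version in which GaussianMixture's init_params accepts all four options):

# Minimal usage sketch (illustrative, not from this commit).
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=4000, centers=4, cluster_std=0.60, random_state=0)

# init_params is one of "kmeans" (default), "k-means++", "random",
# "random_from_data".
gmm = GaussianMixture(n_components=4, init_params="k-means++", random_state=0).fit(X)
print(gmm.n_iter_)  # EM iterations to convergence under this initialization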
Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
"""
==========================
GMM Initialization Methods
==========================

Examples of the different methods of initialization in Gaussian Mixture Models.

See :ref:`gmm` for more information on the estimator.

Here we generate some sample data with four easy-to-identify clusters. The
purpose of this example is to show the four different methods for the
initialization parameter *init_params*.

The four initializations are *kmeans* (default), *random*, *random_from_data* and
*k-means++*.

Orange diamonds represent the initialization centers produced by each
*init_params* method. The rest of the data is represented as crosses, and the
colouring represents the final classification after the GMM has converged.

The numbers in the top right of each subplot give the number of iterations
taken for the GaussianMixture to converge and the relative time taken for the
initialization part of the algorithm to run. Methods with shorter
initialization times tend to need more iterations to converge.

The initialization time is the ratio of the time taken for that method to the
time taken for the default *kmeans* method. As you can see, all three
alternative methods take less time to initialize than *kmeans*.

In this example, the model takes more iterations to converge when initialized
with *random_from_data* or *random*. Here *k-means++* achieves both a low
initialization time and a low number of GaussianMixture iterations to converge.
"""

# Author: Gordon Walsh <[email protected]>
# Data generation code from Jake Vanderplas <[email protected]>

import matplotlib.pyplot as plt
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.utils.extmath import row_norms
from sklearn.datasets import make_blobs
from timeit import default_timer as timer

print(__doc__)

# Generate some data

X, y_true = make_blobs(n_samples=4000, centers=4, cluster_std=0.60, random_state=0)
X = X[:, ::-1]

n_samples = 4000
n_components = 4
x_squared_norms = row_norms(X, squared=True)


def get_initial_means(X, init_params, r):
    # Run a GaussianMixture with max_iter=0 to output the initialization means
    gmm = GaussianMixture(
        n_components=4, init_params=init_params, tol=1e-9, max_iter=0, random_state=r
    ).fit(X)
    return gmm.means_


methods = ["kmeans", "random_from_data", "k-means++", "random"]
colors = ["navy", "turquoise", "cornflowerblue", "darkorange"]
times_init = {}
relative_times = {}

plt.figure(figsize=(4 * len(methods) // 2, 6))
plt.subplots_adjust(
    bottom=0.1, top=0.9, hspace=0.15, wspace=0.05, left=0.05, right=0.95
)

for n, method in enumerate(methods):
    r = np.random.RandomState(seed=1234)
    plt.subplot(2, len(methods) // 2, n + 1)

    start = timer()
    ini = get_initial_means(X, method, r)
    end = timer()
    init_time = end - start

    gmm = GaussianMixture(
        n_components=4, means_init=ini, tol=1e-9, max_iter=2000, random_state=r
    ).fit(X)

    times_init[method] = init_time
    for i, color in enumerate(colors):
        data = X[gmm.predict(X) == i]
        plt.scatter(data[:, 0], data[:, 1], color=color, marker="x")

    plt.scatter(
        ini[:, 0], ini[:, 1], s=75, marker="D", c="orange", lw=1.5, edgecolors="black"
    )
    relative_times[method] = times_init[method] / times_init[methods[0]]

    plt.xticks(())
    plt.yticks(())
    plt.title(method, loc="left", fontsize=12)
    plt.title(
        "Iter %i | Init Time %.2fx" % (gmm.n_iter_, relative_times[method]),
        loc="right",
        fontsize=10,
    )
plt.suptitle("GMM iterations and relative time taken to initialize")
plt.show()
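
The get_initial_means helper above relies on a detail worth spelling out: with max_iter=0, fit stops immediately after the initialization step, so means_ holds the initial centers rather than converged ones (this is exactly how the example times each method). A minimal sketch of that trick in isolation (illustrative, not from this commit; expect a convergence warning, since EM never runs):

# Minimal sketch of the max_iter=0 trick used by get_initial_means above.
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)
for method in ["kmeans", "random_from_data", "k-means++", "random"]:
    gmm = GaussianMixture(
        n_components=4, init_params=method, max_iter=0, random_state=1234
    ).fit(X)
    print(method, gmm.means_.round(2))  # initial centers, not fitted ones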

dev/_downloads/scikit-learn-docs.zip: 167 KB (binary file not shown)
