Skip to content

Commit 6d3e92e

Browse files
committed
Pushing the docs to dev/ for branch: main, commit f9321be77981ed82138da1d506f920af48c831a6
1 parent 941d927 commit 6d3e92e

File tree

1,223 files changed

+4704
-4444
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,223 files changed

+4704
-4444
lines changed
Binary file not shown.

dev/_downloads/1f948ff6f5face5a362672c4e36dd01e/plot_mini_batch_kmeans.ipynb

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,85 @@
1818
"\n# Comparison of the K-Means and MiniBatchKMeans clustering algorithms\n\nWe want to compare the performance of the MiniBatchKMeans and KMeans:\nthe MiniBatchKMeans is faster, but gives slightly different results (see\n`mini_batch_kmeans`).\n\nWe will cluster a set of data, first with KMeans and then with\nMiniBatchKMeans, and plot the results.\nWe will also plot the points that are labelled differently between the two\nalgorithms.\n"
1919
]
2020
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"## Generate the data\n\nWe start by generating the blobs of data to be clustered.\n\n"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {
32+
"collapsed": false
33+
},
34+
"outputs": [],
35+
"source": [
36+
"import numpy as np\nfrom sklearn.datasets import make_blobs\n\nnp.random.seed(0)\n\nbatch_size = 45\ncenters = [[1, 1], [-1, -1], [1, -1]]\nn_clusters = len(centers)\nX, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"## Compute clustering with KMeans\n\n"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": null,
49+
"metadata": {
50+
"collapsed": false
51+
},
52+
"outputs": [],
53+
"source": [
54+
"import time\nfrom sklearn.cluster import KMeans\n\nk_means = KMeans(init=\"k-means++\", n_clusters=3, n_init=10)\nt0 = time.time()\nk_means.fit(X)\nt_batch = time.time() - t0"
55+
]
56+
},
57+
{
58+
"cell_type": "markdown",
59+
"metadata": {},
60+
"source": [
61+
"## Compute clustering with MiniBatchKMeans\n\n"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": null,
67+
"metadata": {
68+
"collapsed": false
69+
},
70+
"outputs": [],
71+
"source": [
72+
"from sklearn.cluster import MiniBatchKMeans\n\nmbk = MiniBatchKMeans(\n init=\"k-means++\",\n n_clusters=3,\n batch_size=batch_size,\n n_init=10,\n max_no_improvement=10,\n verbose=0,\n)\nt0 = time.time()\nmbk.fit(X)\nt_mini_batch = time.time() - t0"
73+
]
74+
},
75+
{
76+
"cell_type": "markdown",
77+
"metadata": {},
78+
"source": [
79+
"## Establishing parity between clusters\n\nWe want to have the same color for the same cluster from both the\nMiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per\nclosest one.\n\n"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"metadata": {
86+
"collapsed": false
87+
},
88+
"outputs": [],
89+
"source": [
90+
"from sklearn.metrics.pairwise import pairwise_distances_argmin\n\nk_means_cluster_centers = k_means.cluster_centers_\norder = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)\nmbk_means_cluster_centers = mbk.cluster_centers_[order]\n\nk_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)\nmbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)"
91+
]
92+
},
93+
{
94+
"cell_type": "markdown",
95+
"metadata": {},
96+
"source": [
97+
"## Plotting the results\n\n"
98+
]
99+
},
21100
{
22101
"cell_type": "code",
23102
"execution_count": null,
@@ -26,7 +105,7 @@
26105
},
27106
"outputs": [],
28107
"source": [
29-
"import time\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.cluster import MiniBatchKMeans, KMeans\nfrom sklearn.metrics.pairwise import pairwise_distances_argmin\nfrom sklearn.datasets import make_blobs\n\n# #############################################################################\n# Generate sample data\nnp.random.seed(0)\n\nbatch_size = 45\ncenters = [[1, 1], [-1, -1], [1, -1]]\nn_clusters = len(centers)\nX, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)\n\n# #############################################################################\n# Compute clustering with Means\n\nk_means = KMeans(init=\"k-means++\", n_clusters=3, n_init=10)\nt0 = time.time()\nk_means.fit(X)\nt_batch = time.time() - t0\n\n# #############################################################################\n# Compute clustering with MiniBatchKMeans\n\nmbk = MiniBatchKMeans(\n init=\"k-means++\",\n n_clusters=3,\n batch_size=batch_size,\n n_init=10,\n max_no_improvement=10,\n verbose=0,\n)\nt0 = time.time()\nmbk.fit(X)\nt_mini_batch = time.time() - t0\n\n# #############################################################################\n# Plot result\n\nfig = plt.figure(figsize=(8, 3))\nfig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\ncolors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n\n# We want to have the same colors for the same cluster from the\n# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per\n# closest one.\nk_means_cluster_centers = k_means.cluster_centers_\norder = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)\nmbk_means_cluster_centers = mbk.cluster_centers_[order]\n\nk_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)\nmbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)\n\n# KMeans\nax = fig.add_subplot(1, 3, 1)\nfor k, col in zip(range(n_clusters), colors):\n my_members = k_means_labels == k\n cluster_center = k_means_cluster_centers[k]\n ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n ax.plot(\n cluster_center[0],\n cluster_center[1],\n \"o\",\n markerfacecolor=col,\n markeredgecolor=\"k\",\n markersize=6,\n )\nax.set_title(\"KMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_batch, k_means.inertia_))\n\n# MiniBatchKMeans\nax = fig.add_subplot(1, 3, 2)\nfor k, col in zip(range(n_clusters), colors):\n my_members = mbk_means_labels == k\n cluster_center = mbk_means_cluster_centers[k]\n ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n ax.plot(\n cluster_center[0],\n cluster_center[1],\n \"o\",\n markerfacecolor=col,\n markeredgecolor=\"k\",\n markersize=6,\n )\nax.set_title(\"MiniBatchKMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_mini_batch, mbk.inertia_))\n\n# Initialise the different array to all False\ndifferent = mbk_means_labels == 4\nax = fig.add_subplot(1, 3, 3)\n\nfor k in range(n_clusters):\n different += (k_means_labels == k) != (mbk_means_labels == k)\n\nidentic = np.logical_not(different)\nax.plot(X[identic, 0], X[identic, 1], \"w\", markerfacecolor=\"#bbbbbb\", marker=\".\")\nax.plot(X[different, 0], X[different, 1], \"w\", markerfacecolor=\"m\", marker=\".\")\nax.set_title(\"Difference\")\nax.set_xticks(())\nax.set_yticks(())\n\nplt.show()"
108+
"import matplotlib.pyplot as plt\n\nfig = plt.figure(figsize=(8, 3))\nfig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\ncolors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n\n# KMeans\nax = fig.add_subplot(1, 3, 1)\nfor k, col in zip(range(n_clusters), colors):\n my_members = k_means_labels == k\n cluster_center = k_means_cluster_centers[k]\n ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n ax.plot(\n cluster_center[0],\n cluster_center[1],\n \"o\",\n markerfacecolor=col,\n markeredgecolor=\"k\",\n markersize=6,\n )\nax.set_title(\"KMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_batch, k_means.inertia_))\n\n# MiniBatchKMeans\nax = fig.add_subplot(1, 3, 2)\nfor k, col in zip(range(n_clusters), colors):\n my_members = mbk_means_labels == k\n cluster_center = mbk_means_cluster_centers[k]\n ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n ax.plot(\n cluster_center[0],\n cluster_center[1],\n \"o\",\n markerfacecolor=col,\n markeredgecolor=\"k\",\n markersize=6,\n )\nax.set_title(\"MiniBatchKMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_mini_batch, mbk.inertia_))\n\n# Initialize the different array to all False\ndifferent = mbk_means_labels == 4\nax = fig.add_subplot(1, 3, 3)\n\nfor k in range(n_clusters):\n different += (k_means_labels == k) != (mbk_means_labels == k)\n\nidentic = np.logical_not(different)\nax.plot(X[identic, 0], X[identic, 1], \"w\", markerfacecolor=\"#bbbbbb\", marker=\".\")\nax.plot(X[different, 0], X[different, 1], \"w\", markerfacecolor=\"m\", marker=\".\")\nax.set_title(\"Difference\")\nax.set_xticks(())\nax.set_yticks(())\n\nplt.show()"
30109
]
31110
}
32111
],

dev/_downloads/3735f7086bbd0007cd42d2c1f2b96f47/plot_mini_batch_kmeans.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,34 +14,39 @@
1414
1515
"""
1616

17-
import time
17+
# %%
18+
# Generate the data
19+
# -----------------
20+
#
21+
# We start by generating the blobs of data to be clustered.
1822

1923
import numpy as np
20-
import matplotlib.pyplot as plt
21-
22-
from sklearn.cluster import MiniBatchKMeans, KMeans
23-
from sklearn.metrics.pairwise import pairwise_distances_argmin
2424
from sklearn.datasets import make_blobs
2525

26-
# #############################################################################
27-
# Generate sample data
2826
np.random.seed(0)
2927

3028
batch_size = 45
3129
centers = [[1, 1], [-1, -1], [1, -1]]
3230
n_clusters = len(centers)
3331
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)
3432

35-
# #############################################################################
36-
# Compute clustering with Means
33+
# %%
34+
# Compute clustering with KMeans
35+
# ------------------------------
36+
37+
import time
38+
from sklearn.cluster import KMeans
3739

3840
k_means = KMeans(init="k-means++", n_clusters=3, n_init=10)
3941
t0 = time.time()
4042
k_means.fit(X)
4143
t_batch = time.time() - t0
4244

43-
# #############################################################################
45+
# %%
4446
# Compute clustering with MiniBatchKMeans
47+
# ---------------------------------------
48+
49+
from sklearn.cluster import MiniBatchKMeans
4550

4651
mbk = MiniBatchKMeans(
4752
init="k-means++",
@@ -55,23 +60,33 @@
5560
mbk.fit(X)
5661
t_mini_batch = time.time() - t0
5762

58-
# #############################################################################
59-
# Plot result
60-
61-
fig = plt.figure(figsize=(8, 3))
62-
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
63-
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
64-
65-
# We want to have the same colors for the same cluster from the
63+
# %%
64+
# Establishing parity between clusters
65+
# ------------------------------------
66+
#
67+
# We want to have the same color for the same cluster from both the
6668
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
6769
# closest one.
70+
71+
from sklearn.metrics.pairwise import pairwise_distances_argmin
72+
6873
k_means_cluster_centers = k_means.cluster_centers_
6974
order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
7075
mbk_means_cluster_centers = mbk.cluster_centers_[order]
7176

7277
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
7378
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)
7479

80+
# %%
81+
# Plotting the results
82+
# --------------------
83+
84+
import matplotlib.pyplot as plt
85+
86+
fig = plt.figure(figsize=(8, 3))
87+
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
88+
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
89+
7590
# KMeans
7691
ax = fig.add_subplot(1, 3, 1)
7792
for k, col in zip(range(n_clusters), colors):
@@ -110,7 +125,7 @@
110125
ax.set_yticks(())
111126
plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_))
112127

113-
# Initialise the different array to all False
128+
# Initialize the different array to all False
114129
different = mbk_means_labels == 4
115130
ax = fig.add_subplot(1, 3, 3)
116131

Binary file not shown.

dev/_downloads/scikit-learn-docs.zip

-1.38 KB
Binary file not shown.
-146 Bytes
-298 Bytes
21 Bytes
254 Bytes
-52 Bytes

0 commit comments

Comments
 (0)