scikit-learn
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
7 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
7 Bytes
diff --git a/‎dev/_downloads/1f948ff6f5face5a362672c4e36dd01e/plot_mini_batch_kmeans.ipynb
Lines changed: 80 additions & 1 deletion b/‎dev/_downloads/1f948ff6f5face5a362672c4e36dd01e/plot_mini_batch_kmeans.ipynb
Lines changed: 80 additions & 1 deletion
diff --git a/‎dev/_downloads/3735f7086bbd0007cd42d2c1f2b96f47/plot_mini_batch_kmeans.py
Lines changed: 34 additions & 19 deletions b/‎dev/_downloads/3735f7086bbd0007cd42d2c1f2b96f47/plot_mini_batch_kmeans.py
Lines changed: 34 additions & 19 deletions
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
1.04 KB b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
1.04 KB
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
-1.38 KB b/‎dev/_downloads/scikit-learn-docs.zip
-1.38 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-146 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-146 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-298 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-298 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
21 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
21 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
254 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
254 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-52 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-52 Bytes
@@ -18,6 +18,85 @@
         "\n# Comparison of the K-Means and MiniBatchKMeans clustering algorithms\n\nWe want to compare the performance of the MiniBatchKMeans and KMeans:\nthe MiniBatchKMeans is faster, but gives slightly different results (see\n`mini_batch_kmeans`).\n\nWe will cluster a set of data, first with KMeans and then with\nMiniBatchKMeans, and plot the results.\nWe will also plot the points that are labelled differently between the two\nalgorithms.\n"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Generate the data\n\nWe start by generating the blobs of data to be clustered.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\nfrom sklearn.datasets import make_blobs\n\nnp.random.seed(0)\n\nbatch_size = 45\ncenters = [[1, 1], [-1, -1], [1, -1]]\nn_clusters = len(centers)\nX, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Compute clustering with KMeans\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import time\nfrom sklearn.cluster import KMeans\n\nk_means = KMeans(init=\"k-means++\", n_clusters=3, n_init=10)\nt0 = time.time()\nk_means.fit(X)\nt_batch = time.time() - t0"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Compute clustering with MiniBatchKMeans\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.cluster import MiniBatchKMeans\n\nmbk = MiniBatchKMeans(\n    init=\"k-means++\",\n    n_clusters=3,\n    batch_size=batch_size,\n    n_init=10,\n    max_no_improvement=10,\n    verbose=0,\n)\nt0 = time.time()\nmbk.fit(X)\nt_mini_batch = time.time() - t0"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Establishing parity between clusters\n\nWe want to have the same color for the same cluster from both the\nMiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per\nclosest one.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.metrics.pairwise import pairwise_distances_argmin\n\nk_means_cluster_centers = k_means.cluster_centers_\norder = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)\nmbk_means_cluster_centers = mbk.cluster_centers_[order]\n\nk_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)\nmbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plotting the results\n\n"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -26,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "import time\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.cluster import MiniBatchKMeans, KMeans\nfrom sklearn.metrics.pairwise import pairwise_distances_argmin\nfrom sklearn.datasets import make_blobs\n\n# #############################################################################\n# Generate sample data\nnp.random.seed(0)\n\nbatch_size = 45\ncenters = [[1, 1], [-1, -1], [1, -1]]\nn_clusters = len(centers)\nX, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)\n\n# #############################################################################\n# Compute clustering with Means\n\nk_means = KMeans(init=\"k-means++\", n_clusters=3, n_init=10)\nt0 = time.time()\nk_means.fit(X)\nt_batch = time.time() - t0\n\n# #############################################################################\n# Compute clustering with MiniBatchKMeans\n\nmbk = MiniBatchKMeans(\n    init=\"k-means++\",\n    n_clusters=3,\n    batch_size=batch_size,\n    n_init=10,\n    max_no_improvement=10,\n    verbose=0,\n)\nt0 = time.time()\nmbk.fit(X)\nt_mini_batch = time.time() - t0\n\n# #############################################################################\n# Plot result\n\nfig = plt.figure(figsize=(8, 3))\nfig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\ncolors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n\n# We want to have the same colors for the same cluster from the\n# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per\n# closest one.\nk_means_cluster_centers = k_means.cluster_centers_\norder = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)\nmbk_means_cluster_centers = mbk.cluster_centers_[order]\n\nk_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)\nmbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)\n\n# KMeans\nax = fig.add_subplot(1, 3, 1)\nfor k, col in zip(range(n_clusters), colors):\n    my_members = k_means_labels == k\n    cluster_center = k_means_cluster_centers[k]\n    ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n    ax.plot(\n        cluster_center[0],\n        cluster_center[1],\n        \"o\",\n        markerfacecolor=col,\n        markeredgecolor=\"k\",\n        markersize=6,\n    )\nax.set_title(\"KMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_batch, k_means.inertia_))\n\n# MiniBatchKMeans\nax = fig.add_subplot(1, 3, 2)\nfor k, col in zip(range(n_clusters), colors):\n    my_members = mbk_means_labels == k\n    cluster_center = mbk_means_cluster_centers[k]\n    ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n    ax.plot(\n        cluster_center[0],\n        cluster_center[1],\n        \"o\",\n        markerfacecolor=col,\n        markeredgecolor=\"k\",\n        markersize=6,\n    )\nax.set_title(\"MiniBatchKMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_mini_batch, mbk.inertia_))\n\n# Initialise the different array to all False\ndifferent = mbk_means_labels == 4\nax = fig.add_subplot(1, 3, 3)\n\nfor k in range(n_clusters):\n    different += (k_means_labels == k) != (mbk_means_labels == k)\n\nidentic = np.logical_not(different)\nax.plot(X[identic, 0], X[identic, 1], \"w\", markerfacecolor=\"#bbbbbb\", marker=\".\")\nax.plot(X[different, 0], X[different, 1], \"w\", markerfacecolor=\"m\", marker=\".\")\nax.set_title(\"Difference\")\nax.set_xticks(())\nax.set_yticks(())\n\nplt.show()"
+        "import matplotlib.pyplot as plt\n\nfig = plt.figure(figsize=(8, 3))\nfig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\ncolors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n\n# KMeans\nax = fig.add_subplot(1, 3, 1)\nfor k, col in zip(range(n_clusters), colors):\n    my_members = k_means_labels == k\n    cluster_center = k_means_cluster_centers[k]\n    ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n    ax.plot(\n        cluster_center[0],\n        cluster_center[1],\n        \"o\",\n        markerfacecolor=col,\n        markeredgecolor=\"k\",\n        markersize=6,\n    )\nax.set_title(\"KMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_batch, k_means.inertia_))\n\n# MiniBatchKMeans\nax = fig.add_subplot(1, 3, 2)\nfor k, col in zip(range(n_clusters), colors):\n    my_members = mbk_means_labels == k\n    cluster_center = mbk_means_cluster_centers[k]\n    ax.plot(X[my_members, 0], X[my_members, 1], \"w\", markerfacecolor=col, marker=\".\")\n    ax.plot(\n        cluster_center[0],\n        cluster_center[1],\n        \"o\",\n        markerfacecolor=col,\n        markeredgecolor=\"k\",\n        markersize=6,\n    )\nax.set_title(\"MiniBatchKMeans\")\nax.set_xticks(())\nax.set_yticks(())\nplt.text(-3.5, 1.8, \"train time: %.2fs\\ninertia: %f\" % (t_mini_batch, mbk.inertia_))\n\n# Initialize the different array to all False\ndifferent = mbk_means_labels == 4\nax = fig.add_subplot(1, 3, 3)\n\nfor k in range(n_clusters):\n    different += (k_means_labels == k) != (mbk_means_labels == k)\n\nidentic = np.logical_not(different)\nax.plot(X[identic, 0], X[identic, 1], \"w\", markerfacecolor=\"#bbbbbb\", marker=\".\")\nax.plot(X[different, 0], X[different, 1], \"w\", markerfacecolor=\"m\", marker=\".\")\nax.set_title(\"Difference\")\nax.set_xticks(())\nax.set_yticks(())\n\nplt.show()"
       ]
     }
   ],
 
@@ -14,34 +14,39 @@
 
 """
 
-import time
+# %%
+# Generate the data
+# -----------------
+#
+# We start by generating the blobs of data to be clustered.
 
 import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn.cluster import MiniBatchKMeans, KMeans
-from sklearn.metrics.pairwise import pairwise_distances_argmin
 from sklearn.datasets import make_blobs
 
-# #############################################################################
-# Generate sample data
 np.random.seed(0)
 
 batch_size = 45
 centers = [[1, 1], [-1, -1], [1, -1]]
 n_clusters = len(centers)
 X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)
 
-# #############################################################################
-# Compute clustering with Means
+# %%
+# Compute clustering with KMeans
+# ------------------------------
+
+import time
+from sklearn.cluster import KMeans
 
 k_means = KMeans(init="k-means++", n_clusters=3, n_init=10)
 t0 = time.time()
 k_means.fit(X)
 t_batch = time.time() - t0
 
-# #############################################################################
+# %%
 # Compute clustering with MiniBatchKMeans
+# ---------------------------------------
+
+from sklearn.cluster import MiniBatchKMeans
 
 mbk = MiniBatchKMeans(
     init="k-means++",
@@ -55,23 +60,33 @@
 mbk.fit(X)
 t_mini_batch = time.time() - t0
 
-# #############################################################################
-# Plot result
-
-fig = plt.figure(figsize=(8, 3))
-fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
-colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
-
-# We want to have the same colors for the same cluster from the
+# %%
+# Establishing parity between clusters
+# ------------------------------------
+#
+# We want to have the same color for the same cluster from both the
 # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
 # closest one.
+
+from sklearn.metrics.pairwise import pairwise_distances_argmin
+
 k_means_cluster_centers = k_means.cluster_centers_
 order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
 mbk_means_cluster_centers = mbk.cluster_centers_[order]
 
 k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
 mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)
 
+# %%
+# Plotting the results
+# --------------------
+
+import matplotlib.pyplot as plt
+
+fig = plt.figure(figsize=(8, 3))
+fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
+colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
+
 # KMeans
 ax = fig.add_subplot(1, 3, 1)
 for k, col in zip(range(n_clusters), colors):
@@ -110,7 +125,7 @@
 ax.set_yticks(())
 plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_))
 
-# Initialise the different array to all False
+# Initialize the different array to all False
 different = mbk_means_labels == 4
 ax = fig.add_subplot(1, 3, 3)