scikit-learn
diff --git a/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
2.65 KB b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
2.65 KB
diff --git a/‎dev/_downloads/6b00e458f3e282f1cc421f077b2fcad1/plot_spectral_biclustering.ipynb
Lines changed: 124 additions & 2 deletions b/‎dev/_downloads/6b00e458f3e282f1cc421f077b2fcad1/plot_spectral_biclustering.ipynb
Lines changed: 124 additions & 2 deletions
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
4.41 KB b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
4.41 KB
diff --git a/‎dev/_downloads/ac19db97f4bbd077ccffef2736ed5f3d/plot_spectral_biclustering.py
Lines changed: 88 additions & 31 deletions b/‎dev/_downloads/ac19db97f4bbd077ccffef2736ed5f3d/plot_spectral_biclustering.py
Lines changed: 88 additions & 31 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
25.1 KB b/‎dev/_downloads/scikit-learn-docs.zip
25.1 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
182 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
182 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
243 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
243 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
321 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
321 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
216 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
216 Bytes
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 837e3b264c6b0087245d25d256a33f76
+config: 863036b5c08a0041ad9992072466972c
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -4,7 +4,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# A demo of the Spectral Biclustering algorithm\n\nThis example demonstrates how to generate a checkerboard dataset and\nbicluster it using the Spectral Biclustering algorithm.\n\nThe data is generated with the ``make_checkerboard`` function, then\nshuffled and passed to the Spectral Biclustering algorithm. The rows\nand columns of the shuffled matrix are rearranged to show the\nbiclusters found by the algorithm.\n\nThe outer product of the row and column label vectors shows a\nrepresentation of the checkerboard structure.\n"
+        "\n# A demo of the Spectral Biclustering algorithm\n\nThis example demonstrates how to generate a checkerboard dataset and bicluster\nit using the :class:`~sklearn.cluster.SpectralBiclustering` algorithm. The\nspectral biclustering algorithm is specifically designed to cluster data by\nsimultaneously considering both the rows (samples) and columns (features) of a\nmatrix. It aims to identify patterns not only between samples but also within\nsubsets of samples, allowing for the detection of localized structure within the\ndata. This makes spectral biclustering particularly well-suited for datasets\nwhere the order or arrangement of features is fixed, such as in images, time\nseries, or genomes.\n\nThe data is generated, then shuffled and passed to the spectral biclustering\nalgorithm. The rows and columns of the shuffled matrix are then rearranged to\nplot the biclusters found.\n"
       ]
     },
     {
@@ -15,7 +15,129 @@
       },
       "outputs": [],
       "source": [
-        "# Author: Kemal Eren <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\nfrom sklearn.datasets import make_checkerboard\nfrom sklearn.cluster import SpectralBiclustering\nfrom sklearn.metrics import consensus_score\n\n\nn_clusters = (4, 3)\ndata, rows, columns = make_checkerboard(\n    shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=0\n)\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Original dataset\")\n\n# shuffle clusters\nrng = np.random.RandomState(0)\nrow_idx = rng.permutation(data.shape[0])\ncol_idx = rng.permutation(data.shape[1])\ndata = data[row_idx][:, col_idx]\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Shuffled dataset\")\n\nmodel = SpectralBiclustering(n_clusters=n_clusters, method=\"log\", random_state=0)\nmodel.fit(data)\nscore = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))\n\nprint(\"consensus score: {:.1f}\".format(score))\n\nfit_data = data[np.argsort(model.row_labels_)]\nfit_data = fit_data[:, np.argsort(model.column_labels_)]\n\nplt.matshow(fit_data, cmap=plt.cm.Blues)\nplt.title(\"After biclustering; rearranged to show biclusters\")\n\nplt.matshow(\n    np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1),\n    cmap=plt.cm.Blues,\n)\nplt.title(\"Checkerboard structure of rearranged data\")\n\nplt.show()"
+        "# Author: Kemal Eren <[email protected]>\n# License: BSD 3 clause"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Generate sample data\nWe generate the sample data using the\n:func:`~sklearn.datasets.make_checkerboard` function. Each pixel within\n`shape=(300, 300)` represents with its color a value from a uniform\ndistribution. The noise is added from a normal distribution, where the value\nchosen for `noise` is the standard deviation.\n\nAs you can see, the data is distributed over 12 cluster cells and is\nrelatively well distinguishable.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.datasets import make_checkerboard\nfrom matplotlib import pyplot as plt\n\nn_clusters = (4, 3)\ndata, rows, columns = make_checkerboard(\n    shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=42\n)\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Original dataset\")\n_ = plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We shuffle the data and the goal is to reconstruct it afterwards using\n:class:`~sklearn.cluster.SpectralBiclustering`.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n\n# Creating lists of shuffled row and column indices\nrng = np.random.RandomState(0)\nrow_idx_shuffled = rng.permutation(data.shape[0])\ncol_idx_shuffled = rng.permutation(data.shape[1])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We redefine the shuffled data and plot it. We observe that we lost the\nstructure of the original data matrix.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "data = data[row_idx_shuffled][:, col_idx_shuffled]\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Shuffled dataset\")\n_ = plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Fitting `SpectralBiclustering`\nWe fit the model and compare the obtained clusters with the ground truth. Note\nthat when creating the model we specify the same number of clusters that we\nused to create the dataset (`n_clusters = (4, 3)`), which contributes to\nobtaining a good result.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.cluster import SpectralBiclustering\nfrom sklearn.metrics import consensus_score\n\nmodel = SpectralBiclustering(n_clusters=n_clusters, method=\"log\", random_state=0)\nmodel.fit(data)\n\n# Compute the similarity of two sets of biclusters\nscore = consensus_score(\n    model.biclusters_, (rows[:, row_idx_shuffled], columns[:, col_idx_shuffled])\n)\nprint(f\"consensus score: {score:.1f}\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The score is between 0 and 1, where 1 corresponds to a perfect matching. It\nshows the quality of the biclustering.\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plotting results\nNow, we rearrange the data based on the row and column labels assigned by the\n:class:`~sklearn.cluster.SpectralBiclustering` model in ascending order and\nplot again. The `row_labels_` range from 0 to 3, while the `column_labels_`\nrange from 0 to 2, representing a total of 4 clusters per row and 3 clusters\nper column.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Reordering first the rows and then the columns.\nreordered_rows = data[np.argsort(model.row_labels_)]\nreordered_data = reordered_rows[:, np.argsort(model.column_labels_)]\n\nplt.matshow(reordered_data, cmap=plt.cm.Blues)\nplt.title(\"After biclustering; rearranged to show biclusters\")\n_ = plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "As a last step, we want to demonstrate the relationships between the row\nand column labels assigned by the model. Therefore, we create a grid with\n:func:`numpy.outer`, which takes the sorted `row_labels_` and `column_labels_`\nand adds 1 to each to ensure that the labels start from 1 instead of 0 for\nbetter visualization.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "plt.matshow(\n    np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1),\n    cmap=plt.cm.Blues,\n)\nplt.title(\"Checkerboard structure of rearranged data\")\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The outer product of the row and column label vectors shows a representation\nof the checkerboard structure, where different combinations of row and column\nlabels are represented by different shades of blue.\n\n"
       ]
     }
   ],
 
@@ -3,63 +3,120 @@
 A demo of the Spectral Biclustering algorithm
 =============================================
 
-This example demonstrates how to generate a checkerboard dataset and
-bicluster it using the Spectral Biclustering algorithm.
-
-The data is generated with the ``make_checkerboard`` function, then
-shuffled and passed to the Spectral Biclustering algorithm. The rows
-and columns of the shuffled matrix are rearranged to show the
-biclusters found by the algorithm.
-
-The outer product of the row and column label vectors shows a
-representation of the checkerboard structure.
-
+This example demonstrates how to generate a checkerboard dataset and bicluster
+it using the :class:`~sklearn.cluster.SpectralBiclustering` algorithm. The
+spectral biclustering algorithm is specifically designed to cluster data by
+simultaneously considering both the rows (samples) and columns (features) of a
+matrix. It aims to identify patterns not only between samples but also within
+subsets of samples, allowing for the detection of localized structure within the
+data. This makes spectral biclustering particularly well-suited for datasets
+where the order or arrangement of features is fixed, such as in images, time
+series, or genomes.
+
+The data is generated, then shuffled and passed to the spectral biclustering
+algorithm. The rows and columns of the shuffled matrix are then rearranged to
+plot the biclusters found.
 """
 
 # Author: Kemal Eren <[email protected]>
 # License: BSD 3 clause
 
-import numpy as np
-from matplotlib import pyplot as plt
-
+# %%
+# Generate sample data
+# --------------------
+# We generate the sample data using the
+# :func:`~sklearn.datasets.make_checkerboard` function. Each pixel within
+# `shape=(300, 300)` represents with its color a value from a uniform
+# distribution. The noise is added from a normal distribution, where the value
+# chosen for `noise` is the standard deviation.
+#
+# As you can see, the data is distributed over 12 cluster cells and is
+# relatively well distinguishable.
 from sklearn.datasets import make_checkerboard
-from sklearn.cluster import SpectralBiclustering
-from sklearn.metrics import consensus_score
-
+from matplotlib import pyplot as plt
 
 n_clusters = (4, 3)
 data, rows, columns = make_checkerboard(
-    shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=0
+    shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=42
 )
 
 plt.matshow(data, cmap=plt.cm.Blues)
 plt.title("Original dataset")
+_ = plt.show()
 
-# shuffle clusters
+# %%
+# We shuffle the data and the goal is to reconstruct it afterwards using
+# :class:`~sklearn.cluster.SpectralBiclustering`.
+import numpy as np
+
+# Creating lists of shuffled row and column indices
 rng = np.random.RandomState(0)
-row_idx = rng.permutation(data.shape[0])
-col_idx = rng.permutation(data.shape[1])
-data = data[row_idx][:, col_idx]
+row_idx_shuffled = rng.permutation(data.shape[0])
+col_idx_shuffled = rng.permutation(data.shape[1])
+
+# %%
+# We redefine the shuffled data and plot it. We observe that we lost the
+# structure of the original data matrix.
+data = data[row_idx_shuffled][:, col_idx_shuffled]
 
 plt.matshow(data, cmap=plt.cm.Blues)
 plt.title("Shuffled dataset")
+_ = plt.show()
+
+# %%
+# Fitting `SpectralBiclustering`
+# ------------------------------
+# We fit the model and compare the obtained clusters with the ground truth. Note
+# that when creating the model we specify the same number of clusters that we
+# used to create the dataset (`n_clusters = (4, 3)`), which contributes to
+# obtaining a good result.
+from sklearn.cluster import SpectralBiclustering
+from sklearn.metrics import consensus_score
 
 model = SpectralBiclustering(n_clusters=n_clusters, method="log", random_state=0)
 model.fit(data)
-score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))
-
-print("consensus score: {:.1f}".format(score))
 
-fit_data = data[np.argsort(model.row_labels_)]
-fit_data = fit_data[:, np.argsort(model.column_labels_)]
-
-plt.matshow(fit_data, cmap=plt.cm.Blues)
+# Compute the similarity of two sets of biclusters
+score = consensus_score(
+    model.biclusters_, (rows[:, row_idx_shuffled], columns[:, col_idx_shuffled])
+)
+print(f"consensus score: {score:.1f}")
+
+# %%
+# The score is between 0 and 1, where 1 corresponds to a perfect matching. It
+# shows the quality of the biclustering.
+
+# %%
+# Plotting results
+# ----------------
+# Now, we rearrange the data based on the row and column labels assigned by the
+# :class:`~sklearn.cluster.SpectralBiclustering` model in ascending order and
+# plot again. The `row_labels_` range from 0 to 3, while the `column_labels_`
+# range from 0 to 2, representing a total of 4 clusters per row and 3 clusters
+# per column.
+
+# Reordering first the rows and then the columns.
+reordered_rows = data[np.argsort(model.row_labels_)]
+reordered_data = reordered_rows[:, np.argsort(model.column_labels_)]
+
+plt.matshow(reordered_data, cmap=plt.cm.Blues)
 plt.title("After biclustering; rearranged to show biclusters")
-
+_ = plt.show()
+
+# %%
+# As a last step, we want to demonstrate the relationships between the row
+# and column labels assigned by the model. Therefore, we create a grid with
+# :func:`numpy.outer`, which takes the sorted `row_labels_` and `column_labels_`
+# and adds 1 to each to ensure that the labels start from 1 instead of 0 for
+# better visualization.
 plt.matshow(
     np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1),
     cmap=plt.cm.Blues,
 )
 plt.title("Checkerboard structure of rearranged data")
-
 plt.show()
+
+# %%
+# The outer product of the row and column label vectors shows a representation
+# of the checkerboard structure, where different combinations of row and column
+# labels are represented by different shades of blue.