Skip to content

Commit 03d4a1e

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 056864d9c558131d2706b46f6ddf084671b428b6
1 parent ac9450a commit 03d4a1e

File tree

1,328 files changed

+7239
-7195
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,328 files changed

+7239
-7195
lines changed

dev/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 63371d7052f9cfa472a1470b98355ba6
3+
config: e5e861ce384f70b2e75a75e7791ab430
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file not shown.

dev/_downloads/18eb95af29bd5554020a8428b3ceac54/plot_cluster_iris.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What the effect of a bad initialization is\n on the classification process: By setting n_init to only 1\n (default is 10), the amount of times that the algorithm will\n be run with different centroid seeds is reduced.\n\n- bottom left: What using eight clusters would deliver.\n\n- bottom right: The ground truth.\n"
7+
"\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What using three clusters would deliver.\n\n- bottom left: What the effect of a bad initialization is\n on the classification process: By setting n_init to only 1\n (default is 10), the amount of times that the algorithm will\n be run with different centroid seeds is reduced.\n\n- bottom right: The ground truth.\n"
88
]
99
},
1010
{
@@ -15,7 +15,7 @@
1515
},
1616
"outputs": [],
1717
"source": [
18-
"# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n (\"k_means_iris_8\", KMeans(n_clusters=8)),\n (\"k_means_iris_3\", KMeans(n_clusters=3)),\n (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n est.fit(X)\n labels = est.labels_\n\n ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n ax.xaxis.set_ticklabels([])\n ax.yaxis.set_ticklabels([])\n ax.zaxis.set_ticklabels([])\n ax.set_xlabel(\"Petal width\")\n ax.set_ylabel(\"Sepal length\")\n ax.set_zlabel(\"Petal length\")\n ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 3].mean(),\n X[y == label, 0].mean(),\n X[y == label, 2].mean() + 2,\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n )\n# Reorder the labels to have colors matching the cluster results\ny = np.choose(y, [1, 2, 0]).astype(float)\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal 
length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
18+
"# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n (\"k_means_iris_8\", KMeans(n_clusters=8)),\n (\"k_means_iris_3\", KMeans(n_clusters=3)),\n (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n est.fit(X)\n labels = est.labels_\n\n ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n ax.xaxis.set_ticklabels([])\n ax.yaxis.set_ticklabels([])\n ax.zaxis.set_ticklabels([])\n ax.set_xlabel(\"Petal width\")\n ax.set_ylabel(\"Sepal length\")\n ax.set_zlabel(\"Petal length\")\n ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 3].mean(),\n X[y == label, 0].mean(),\n X[y == label, 2].mean() + 2,\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n )\n\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
1919
]
2020
}
2121
],
Binary file not shown.

dev/_downloads/751db3d5e6b909ff00972495eaae53df/plot_document_clustering.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
"cell_type": "markdown",
4141
"metadata": {},
4242
"source": [
43-
"## Quantifying the quality of clustering results\n\nIn this section we define a function to score different clustering pipelines\nusing several metrics.\n\nClustering algorithms are fundamentally unsupervised learning methods.\nHowever, since we happen to have class labels for this specific dataset, it is\npossible to use evaluation metrics that leverage this \"supervised\" ground\ntruth information to quantify the quality of the resulting clusters. Examples\nof such metrics are the following:\n\n- homogeneity, which quantifies how much clusters contain only members of a\n single class;\n\n- completeness, which quantifies how much members of a given class are\n assigned to the same clusters;\n\n- V-measure, the harmonic mean of completeness and homogeneity;\n\n- Rand-Index, which measures how frequently pairs of data points are grouped\n consistently according to the result of the clustering algorithm and the\n ground truth class assignment;\n\n- Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster\n assignment have an ARI of 0.0 in expectation.\n\nIf the ground truth labels are not known, evaluation can only be performed\nusing the model results itself. In that case, the Silhouette Coefficient comes\nin handy.\n\nFor more reference, see `clustering_evaluation`.\n\n"
43+
"## Quantifying the quality of clustering results\n\nIn this section we define a function to score different clustering pipelines\nusing several metrics.\n\nClustering algorithms are fundamentally unsupervised learning methods.\nHowever, since we happen to have class labels for this specific dataset, it is\npossible to use evaluation metrics that leverage this \"supervised\" ground\ntruth information to quantify the quality of the resulting clusters. Examples\nof such metrics are the following:\n\n- homogeneity, which quantifies how much clusters contain only members of a\n single class;\n\n- completeness, which quantifies how much members of a given class are\n assigned to the same clusters;\n\n- V-measure, the harmonic mean of completeness and homogeneity;\n\n- Rand-Index, which measures how frequently pairs of data points are grouped\n consistently according to the result of the clustering algorithm and the\n ground truth class assignment;\n\n- Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster\n assignment have an ARI of 0.0 in expectation.\n\nIf the ground truth labels are not known, evaluation can only be performed\nusing the model results itself. In that case, the Silhouette Coefficient comes in\nhandy. See `sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`\nfor an example on how to do it.\n\nFor more reference, see `clustering_evaluation`.\n\n"
4444
]
4545
},
4646
{

dev/_downloads/82ec115874a062f9e8fa17efc63384c0/plot_color_quantization.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
},
1616
"outputs": [],
1717
"source": [
18-
"# Authors: Robert Layton <[email protected]>\n# Olivier Grisel <[email protected]>\n# Mathieu Blondel <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom time import time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import load_sample_image\nfrom sklearn.metrics import pairwise_distances_argmin\nfrom sklearn.utils import shuffle\n\nn_colors = 64\n\n# Load the Summer Palace photo\nchina = load_sample_image(\"china.jpg\")\n\n# Convert to floats instead of the default 8 bits integer coding. Dividing by\n# 255 is important so that plt.imshow behaves works well on float data (need to\n# be in the range [0-1])\nchina = np.array(china, dtype=np.float64) / 255\n\n# Load Image and transform to a 2D numpy array.\nw, h, d = original_shape = tuple(china.shape)\nassert d == 3\nimage_array = np.reshape(china, (w * h, d))\n\nprint(\"Fitting model on a small sub-sample of the data\")\nt0 = time()\nimage_array_sample = shuffle(image_array, random_state=0, n_samples=1_000)\nkmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n# Get labels for all points\nprint(\"Predicting color indices on the full image (k-means)\")\nt0 = time()\nlabels = kmeans.predict(image_array)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ncodebook_random = shuffle(image_array, random_state=0, n_samples=n_colors)\nprint(\"Predicting color indices on the full image (random)\")\nt0 = time()\nlabels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ndef recreate_image(codebook, labels, w, h):\n \"\"\"Recreate the (compressed) image from the code book & labels\"\"\"\n return codebook[labels].reshape(w, h, -1)\n\n\n# Display all results, alongside original image\nplt.figure(1)\nplt.clf()\nplt.axis(\"off\")\nplt.title(\"Original image (96,615 
colors)\")\nplt.imshow(china)\n\nplt.figure(2)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, K-Means)\")\nplt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))\n\nplt.figure(3)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, Random)\")\nplt.imshow(recreate_image(codebook_random, labels_random, w, h))\nplt.show()"
18+
"# Authors: Robert Layton <[email protected]>\n# Olivier Grisel <[email protected]>\n# Mathieu Blondel <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom time import time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import load_sample_image\nfrom sklearn.metrics import pairwise_distances_argmin\nfrom sklearn.utils import shuffle\n\nn_colors = 64\n\n# Load the Summer Palace photo\nchina = load_sample_image(\"china.jpg\")\n\n# Convert to floats instead of the default 8 bits integer coding. Dividing by\n# 255 is important so that plt.imshow works well on float data (need to\n# be in the range [0-1])\nchina = np.array(china, dtype=np.float64) / 255\n\n# Load Image and transform to a 2D numpy array.\nw, h, d = original_shape = tuple(china.shape)\nassert d == 3\nimage_array = np.reshape(china, (w * h, d))\n\nprint(\"Fitting model on a small sub-sample of the data\")\nt0 = time()\nimage_array_sample = shuffle(image_array, random_state=0, n_samples=1_000)\nkmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n# Get labels for all points\nprint(\"Predicting color indices on the full image (k-means)\")\nt0 = time()\nlabels = kmeans.predict(image_array)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ncodebook_random = shuffle(image_array, random_state=0, n_samples=n_colors)\nprint(\"Predicting color indices on the full image (random)\")\nt0 = time()\nlabels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)\nprint(f\"done in {time() - t0:0.3f}s.\")\n\n\ndef recreate_image(codebook, labels, w, h):\n \"\"\"Recreate the (compressed) image from the code book & labels\"\"\"\n return codebook[labels].reshape(w, h, -1)\n\n\n# Display all results, alongside original image\nplt.figure(1)\nplt.clf()\nplt.axis(\"off\")\nplt.title(\"Original image (96,615 
colors)\")\nplt.imshow(china)\n\nplt.figure(2)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, K-Means)\")\nplt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))\n\nplt.figure(3)\nplt.clf()\nplt.axis(\"off\")\nplt.title(f\"Quantized image ({n_colors} colors, Random)\")\nplt.imshow(recreate_image(codebook_random, labels_random, w, h))\nplt.show()"
1919
]
2020
}
2121
],

dev/_downloads/a315e003c9ce53b89d5fa110538885fd/plot_cluster_iris.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
88
- top left: What a K-means algorithm would yield using 8 clusters.
99
10-
- top right: What the effect of a bad initialization is
10+
- top right: What using three clusters would deliver.
11+
12+
- bottom left: What the effect of a bad initialization is
1113
on the classification process: By setting n_init to only 1
1214
(default is 10), the number of times that the algorithm will
1315
be run with different centroid seeds is reduced.
1416
15-
- bottom left: What using eight clusters would deliver.
16-
1717
- bottom right: The ground truth.
1818
1919
"""
@@ -73,8 +73,7 @@
7373
horizontalalignment="center",
7474
bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"),
7575
)
76-
# Reorder the labels to have colors matching the cluster results
77-
y = np.choose(y, [1, 2, 0]).astype(float)
76+
7877
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k")
7978

8079
ax.xaxis.set_ticklabels([])

dev/_downloads/ba68199eea858ec04949b2c6c65147e0/plot_document_clustering.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,9 @@
9999
# assignments have an ARI of 0.0 in expectation.
100100
#
101101
# If the ground truth labels are not known, evaluation can only be performed
102-
# using the model results itself. In that case, the Silhouette Coefficient comes
103-
# in handy.
102+
# using the model results itself. In that case, the Silhouette Coefficient comes in
103+
# handy. See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`
104+
# for an example of how to do it.
104105
#
105106
# For more reference, see :ref:`clustering_evaluation`.
106107

dev/_downloads/d0e47fc5f3661efb101abfd4d9461afe/plot_color_quantization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
china = load_sample_image("china.jpg")
4242

4343
# Convert to floats instead of the default 8 bits integer coding. Dividing by
44-
# 255 is important so that plt.imshow behaves works well on float data (need to
44+
# 255 is important so that plt.imshow works well on float data (need to
4545
# be in the range [0-1])
4646
china = np.array(china, dtype=np.float64) / 255
4747

dev/_downloads/scikit-learn-docs.zip

797 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)