
Commit 4758431

Pushing the docs to dev/ for branch: master, commit 9548e8f4cdf11c5dfded57ee6661e7bcff0c374b
1 parent 8f35b11 commit 4758431

File tree

1,128 files changed (+5,183 / −3,871 lines)

Two changed binary files (5.17 KB and 4.17 KB) are not shown.

dev/_downloads/plot_cluster_comparison.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

The notebook's single code cell stores the entire example script as one JSON-escaped string, so this 1-addition/1-deletion diff replaces the string wholesale. Unescaped, the change is the same as in dev/_downloads/plot_cluster_comparison.py below: OPTICS defaults ('min_samples': 20, 'xi': 0.05, 'min_cluster_size': 0.1) are added to default_base, per-dataset overrides are added for noisy_circles, varied, and aniso, a cluster.OPTICS estimator is created alongside DBSCAN, and ('OPTICS', optics) is inserted into clustering_algorithms between DBSCAN and Birch.

dev/_downloads/plot_cluster_comparison.py

Lines changed: 14 additions & 4 deletions
@@ -74,14 +74,20 @@
                 'damping': .9,
                 'preference': -200,
                 'n_neighbors': 10,
-                'n_clusters': 3}
+                'n_clusters': 3,
+                'min_samples': 20,
+                'xi': 0.05,
+                'min_cluster_size': 0.1}
 
 datasets = [
     (noisy_circles, {'damping': .77, 'preference': -240,
-                     'quantile': .2, 'n_clusters': 2}),
+                     'quantile': .2, 'n_clusters': 2,
+                     'min_samples': 20, 'xi': 0.25}),
     (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
-    (varied, {'eps': .18, 'n_neighbors': 2}),
-    (aniso, {'eps': .15, 'n_neighbors': 2}),
+    (varied, {'eps': .18, 'n_neighbors': 2,
+              'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
+    (aniso, {'eps': .15, 'n_neighbors': 2,
+             'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
     (blobs, {}),
     (no_structure, {})]

@@ -116,6 +122,9 @@
         n_clusters=params['n_clusters'], eigen_solver='arpack',
         affinity="nearest_neighbors")
     dbscan = cluster.DBSCAN(eps=params['eps'])
+    optics = cluster.OPTICS(min_samples=params['min_samples'],
+                            xi=params['xi'],
+                            min_cluster_size=params['min_cluster_size'])
     affinity_propagation = cluster.AffinityPropagation(
         damping=params['damping'], preference=params['preference'])
     average_linkage = cluster.AgglomerativeClustering(

@@ -133,6 +142,7 @@
         ('Ward', ward),
         ('AgglomerativeClustering', average_linkage),
         ('DBSCAN', dbscan),
+        ('OPTICS', optics),
         ('Birch', birch),
         ('GaussianMixture', gmm)
     )
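
For readers trying the new estimator on its own, here is a minimal sketch of what the three added parameters control. The toy data and parameter values are illustrative, not taken from the commit; only the OPTICS API itself is from scikit-learn:

import numpy as np
from sklearn.cluster import OPTICS
from sklearn.datasets import make_blobs

# Toy data: three blobs with different spreads (illustrative only).
X, _ = make_blobs(n_samples=300, cluster_std=[0.5, 1.0, 2.0], random_state=0)

optics = OPTICS(
    min_samples=20,        # neighborhood size for a point to count as a core point
    xi=0.05,               # minimum steepness on the reachability plot marking a cluster boundary
    min_cluster_size=0.1)  # smallest allowed cluster, as a fraction of the dataset
optics.fit(X)

# As with DBSCAN, points labeled -1 are treated as noise.
print(np.unique(optics.labels_))

As the per-dataset overrides in the diff suggest, these values typically need tuning: min_samples sets how much local density counts as "core", while xi trades off how aggressively the reachability plot is split into clusters.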

dev/_downloads/plot_optics.ipynb

Lines changed: 54 additions & 0 deletions
This is a new file; all 54 lines are additions. The notebook contains three cells: a %matplotlib inline code cell, a markdown cell describing the example, and a single code cell holding the full script.

Markdown cell:

Demo of OPTICS clustering algorithm

Finds core samples of high density and expands clusters from them. This example uses data that is generated so that the clusters have different densities. :class:`sklearn.cluster.OPTICS` is first used with its Xi cluster detection method, and then specific thresholds are set on the reachability, which corresponds to :class:`sklearn.cluster.DBSCAN`. The different clusters found by OPTICS's Xi method can be recovered with different choices of thresholds in DBSCAN.
Code cell (unescaped from the notebook JSON):

# Authors: Shane Grigsby <[email protected]>
#          Adrin Jalali <[email protected]>
# License: BSD 3 clause


from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data

np.random.seed(0)
n_points_per_cluster = 250

C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=2)

space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])

# Reachability plot
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = space[labels == klass]
    Rk = reachability[labels == klass]
    ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3)
ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')

# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = X[clust.labels_ == klass]
    ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k+', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')

# DBSCAN at 0.5
colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c']
for klass, color in zip(range(0, 6), colors):
    Xk = X[labels_050 == klass]
    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker='.')
ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')

# DBSCAN at 2.
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_200 == klass]
    ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')

plt.tight_layout()
plt.show()
The cells are followed by standard notebook metadata: kernelspec (display_name "Python 3", language "python", name "python3"), language_info (IPython codemirror mode version 3, file_extension ".py", mimetype "text/x-python", nbconvert_exporter "python", pygments_lexer "ipython3", version "3.6.8"), nbformat 4, nbformat_minor 0.
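
The markdown cell's claim that thresholding the reachability "corresponds to" DBSCAN can be sanity-checked outside the example. A minimal sketch, assuming scikit-learn >= 0.21 (where OPTICS and cluster_optics_dbscan were introduced); the data and eps value are illustrative, and agreement is expected to be close rather than bit-for-bit identical:

import numpy as np
from sklearn.cluster import DBSCAN, OPTICS, cluster_optics_dbscan

np.random.seed(0)
# Two clusters of very different density, as in the example.
X = np.vstack(([-5, -2] + .8 * np.random.randn(250, 2),
               [4, -1] + .1 * np.random.randn(250, 2)))

clust = OPTICS(min_samples=50).fit(X)

# DBSCAN-style labels extracted from the precomputed reachability at eps=0.5 ...
labels_extracted = cluster_optics_dbscan(reachability=clust.reachability_,
                                         core_distances=clust.core_distances_,
                                         ordering=clust.ordering_, eps=0.5)
# ... versus actually running DBSCAN with matching settings.
labels_dbscan = DBSCAN(eps=0.5, min_samples=50).fit(X).labels_

# Fraction of points with identical labels (label numbering usually,
# but not always, lines up between the two methods).
print(np.mean(labels_extracted == labels_dbscan))

Extracting from a single OPTICS fit is the cheaper route when several eps cuts are needed, which is exactly what the example does with eps=0.5 and eps=2.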
