
Commit 26b7c90

Pushing the docs to dev/ for branch: master, commit 14295f9ceb2f1a9ea281ae19c06f3169be8a674f
1 parent abe9a52 commit 26b7c90

1,213 files changed: +3711 -3712 lines changed


dev/_downloads/31ed7d76091fdf7cbba173b644810790/plot_spectral_coclustering.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"print(__doc__)\n\n# Author: Kemal Eren <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\nfrom sklearn.datasets import make_biclusters\nfrom sklearn.datasets import samples_generator as sg\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.metrics import consensus_score\n\ndata, rows, columns = make_biclusters(\n shape=(300, 300), n_clusters=5, noise=5,\n shuffle=False, random_state=0)\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Original dataset\")\n\ndata, row_idx, col_idx = sg._shuffle(data, random_state=0)\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Shuffled dataset\")\n\nmodel = SpectralCoclustering(n_clusters=5, random_state=0)\nmodel.fit(data)\nscore = consensus_score(model.biclusters_,\n (rows[:, row_idx], columns[:, col_idx]))\n\nprint(\"consensus score: {:.3f}\".format(score))\n\nfit_data = data[np.argsort(model.row_labels_)]\nfit_data = fit_data[:, np.argsort(model.column_labels_)]\n\nplt.matshow(fit_data, cmap=plt.cm.Blues)\nplt.title(\"After biclustering; rearranged to show biclusters\")\n\nplt.show()"
+"print(__doc__)\n\n# Author: Kemal Eren <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\nfrom sklearn.datasets import make_biclusters\nfrom sklearn.datasets import samples_generator as sg\nfrom sklearn.cluster import SpectralCoclustering\nfrom sklearn.metrics import consensus_score\n\ndata, rows, columns = make_biclusters(\n shape=(300, 300), n_clusters=5, noise=5,\n shuffle=False, random_state=0)\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Original dataset\")\n\ndata, row_idx, col_idx = sg._shuffle(data, random_state=0)\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Shuffled dataset\")\n\nmodel = SpectralCoclustering(n_clusters=5, random_state=0)\nmodel.fit(data)\nscore = consensus_score(model.biclusters_,\n (rows[:, row_idx], columns[:, col_idx]))\n\nprint(\"consensus score: {:.3f}\".format(score))\n\nfit_data = data[np.argsort(model.row_labels_)]\nfit_data = fit_data[:, np.argsort(model.column_labels_)]\n\nplt.matshow(fit_data, cmap=plt.cm.Blues)\nplt.title(\"After biclustering; rearranged to show biclusters\")\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/511b08a16690d8c6fcc389c528d35567/plot_spectral_biclustering.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"print(__doc__)\n\n# Author: Kemal Eren <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\nfrom sklearn.datasets import make_checkerboard\nfrom sklearn.datasets import samples_generator as sg\nfrom sklearn.cluster.bicluster import SpectralBiclustering\nfrom sklearn.metrics import consensus_score\n\nn_clusters = (4, 3)\ndata, rows, columns = make_checkerboard(\n shape=(300, 300), n_clusters=n_clusters, noise=10,\n shuffle=False, random_state=0)\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Original dataset\")\n\ndata, row_idx, col_idx = sg._shuffle(data, random_state=0)\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Shuffled dataset\")\n\nmodel = SpectralBiclustering(n_clusters=n_clusters, method='log',\n random_state=0)\nmodel.fit(data)\nscore = consensus_score(model.biclusters_,\n (rows[:, row_idx], columns[:, col_idx]))\n\nprint(\"consensus score: {:.1f}\".format(score))\n\nfit_data = data[np.argsort(model.row_labels_)]\nfit_data = fit_data[:, np.argsort(model.column_labels_)]\n\nplt.matshow(fit_data, cmap=plt.cm.Blues)\nplt.title(\"After biclustering; rearranged to show biclusters\")\n\nplt.matshow(np.outer(np.sort(model.row_labels_) + 1,\n np.sort(model.column_labels_) + 1),\n cmap=plt.cm.Blues)\nplt.title(\"Checkerboard structure of rearranged data\")\n\nplt.show()"
+"print(__doc__)\n\n# Author: Kemal Eren <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\nfrom sklearn.datasets import make_checkerboard\nfrom sklearn.datasets import samples_generator as sg\nfrom sklearn.cluster import SpectralBiclustering\nfrom sklearn.metrics import consensus_score\n\nn_clusters = (4, 3)\ndata, rows, columns = make_checkerboard(\n shape=(300, 300), n_clusters=n_clusters, noise=10,\n shuffle=False, random_state=0)\n\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Original dataset\")\n\ndata, row_idx, col_idx = sg._shuffle(data, random_state=0)\nplt.matshow(data, cmap=plt.cm.Blues)\nplt.title(\"Shuffled dataset\")\n\nmodel = SpectralBiclustering(n_clusters=n_clusters, method='log',\n random_state=0)\nmodel.fit(data)\nscore = consensus_score(model.biclusters_,\n (rows[:, row_idx], columns[:, col_idx]))\n\nprint(\"consensus score: {:.1f}\".format(score))\n\nfit_data = data[np.argsort(model.row_labels_)]\nfit_data = fit_data[:, np.argsort(model.column_labels_)]\n\nplt.matshow(fit_data, cmap=plt.cm.Blues)\nplt.title(\"After biclustering; rearranged to show biclusters\")\n\nplt.matshow(np.outer(np.sort(model.row_labels_) + 1,\n np.sort(model.column_labels_) + 1),\n cmap=plt.cm.Blues)\nplt.title(\"Checkerboard structure of rearranged data\")\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/6049764a39743097e7a04b7ebae6375d/plot_spectral_coclustering.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 
 from sklearn.datasets import make_biclusters
 from sklearn.datasets import samples_generator as sg
-from sklearn.cluster.bicluster import SpectralCoclustering
+from sklearn.cluster import SpectralCoclustering
 from sklearn.metrics import consensus_score
 
 data, rows, columns = make_biclusters(
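
This one-line import move is the substance of the whole commit, repeated across each regenerated biclustering example: the estimators now come from sklearn.cluster rather than the deprecated sklearn.cluster.bicluster module. A minimal compatibility sketch for code that must run against both layouts; the try/except shim below is an illustration, not part of this commit:

# Prefer the new import path reflected in this commit; fall back to the
# old module location for older scikit-learn releases.
# Hypothetical shim, not part of the regenerated examples.
try:
    from sklearn.cluster import SpectralCoclustering, SpectralBiclustering
except ImportError:
    from sklearn.cluster.bicluster import (SpectralCoclustering,
                                           SpectralBiclustering)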

dev/_downloads/99d96fddf9163d237911171f1552b649/plot_spectral_biclustering.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 
 from sklearn.datasets import make_checkerboard
 from sklearn.datasets import samples_generator as sg
-from sklearn.cluster.bicluster import SpectralBiclustering
+from sklearn.cluster import SpectralBiclustering
 from sklearn.metrics import consensus_score
 
 n_clusters = (4, 3)

dev/_downloads/a3203aeb5144eed10e8503e17c6b24de/plot_bicluster_newsgroups.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
 
 import numpy as np
 
-from sklearn.cluster.bicluster import SpectralCoclustering
+from sklearn.cluster import SpectralCoclustering
 from sklearn.cluster import MiniBatchKMeans
 from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer

dev/_downloads/ec072ecf8a5ca08e2e4f2b414cce312f/plot_bicluster_newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"from collections import defaultdict\nimport operator\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_normalizer(tokens):\n \"\"\" Map all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n def build_tokenizer(self):\n tokenize = super().build_tokenizer()\n return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(d.items(), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
+"from collections import defaultdict\nimport operator\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_normalizer(tokens):\n \"\"\" Map all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n def build_tokenizer(self):\n tokenize = super().build_tokenizer()\n return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(d.items(), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
 ]
 }
 ],
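
For quick reference, this is the core workflow the regenerated examples walk through, condensed against the new import path. A minimal sketch distilled from the plot_spectral_coclustering.py diff above; the examples' private sg._shuffle helper is replaced here with explicit NumPy permutations, so the exact shuffle differs from the published example:

import numpy as np

from sklearn.cluster import SpectralCoclustering  # new path from this commit
from sklearn.datasets import make_biclusters
from sklearn.metrics import consensus_score

# Plant 5 biclusters in a 300x300 matrix, then shuffle rows and columns.
data, rows, columns = make_biclusters(shape=(300, 300), n_clusters=5,
                                      noise=5, shuffle=False, random_state=0)
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

# Recover the planted structure and score it against the ground truth;
# perfect recovery gives a consensus score of 1.0.
model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.3f}".format(score))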

dev/_downloads/scikit-learn-docs.pdf

6.82 KB (binary file, diff not shown)

dev/_images/iris.png

0 Bytes
