Commit 7e26409

Pushing the docs to dev/ for branch: master, commit b46e6382daf2b9518a54066106e88197555ca18d
1 parent 9c8cd76 commit 7e26409

File tree

1,001 files changed (+3,069 additions, -3,015 deletions)


dev/_downloads/plot_bicluster_newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nimport re\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_aware_tokenizer(doc):\n \"\"\" Tokenizer that maps all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n token_pattern = re.compile(u'(?u)\\\\b\\\\w\\\\w+\\\\b')\n tokens = token_pattern.findall(doc)\n tokens = [\"#NUMBER\" if token[0] in \"0123456789_\" else token\n for token in tokens]\n return tokens\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = TfidfVectorizer(stop_words='english', min_df=5,\n tokenizer=number_aware_tokenizer)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
+
"from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_normalizer(tokens):\n \"\"\" Map all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n def build_tokenizer(self):\n tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()\n return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
 ]
 }
 ],

dev/_downloads/plot_bicluster_newsgroups.py

Lines changed: 10 additions & 10 deletions
@@ -26,7 +26,6 @@
 
 from collections import defaultdict
 import operator
-import re
 from time import time
 
 import numpy as np
@@ -41,18 +40,20 @@
 print(__doc__)
 
 
-def number_aware_tokenizer(doc):
-    """ Tokenizer that maps all numeric tokens to a placeholder.
+def number_normalizer(tokens):
+    """ Map all numeric tokens to a placeholder.
 
     For many applications, tokens that begin with a number are not directly
     useful, but the fact that such a token exists can be relevant. By applying
     this form of dimensionality reduction, some methods may perform better.
     """
-    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
-    tokens = token_pattern.findall(doc)
-    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
-              for token in tokens]
-    return tokens
+    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)
+
+
+class NumberNormalizingVectorizer(TfidfVectorizer):
+    def build_tokenizer(self):
+        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
+        return lambda doc: list(number_normalizer(tokenize(doc)))
 
 
 # exclude 'comp.os.ms-windows.misc'
@@ -67,8 +68,7 @@ def number_aware_tokenizer(doc):
 newsgroups = fetch_20newsgroups(categories=categories)
 y_true = newsgroups.target
 
-vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
-                             tokenizer=number_aware_tokenizer)
+vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
 cocluster = SpectralCoclustering(n_clusters=len(categories),
                                  svd_method='arpack', random_state=0)
 kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
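
The new approach keeps TfidfVectorizer's own preprocessing and default token pattern and only post-processes the token stream, so options such as stop_words and min_df continue to apply while any token whose first character is a digit collapses into the single "#NUMBER" feature. A minimal, self-contained sketch of the idea follows; the toy documents and the Python 3 style super() call are illustrative assumptions and not part of this commit:

from sklearn.feature_extraction.text import TfidfVectorizer


def number_normalizer(tokens):
    """Map any token whose first character is a digit to "#NUMBER"."""
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        # Reuse the stock tokenizer and post-process its output instead of
        # re-implementing the token regex by hand.
        tokenize = super().build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))


# Toy documents (illustrative only, not taken from the commit):
docs = ["Posted on 1993-04-12 about the 486DX chip",
        "Re: baseball scores from 1992"]
vectorizer = NumberNormalizingVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)
# Every numeric-looking token collapses into one '#NUMBER' feature.
print(sorted(vectorizer.vocabulary_))

Overriding build_tokenizer (rather than passing a tokenizer= callable, as the old code did) is what lets the subclass keep the vectorizer's default tokenization and lowercasing while still normalizing numbers.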

dev/_downloads/scikit-learn-docs.pdf

4.22 KB
Binary file not shown.
