
Commit c6ddaa7

Pushing the docs to dev/ for branch: master, commit 3de7da329fb35b23683a33517191682c8092d38e
1 parent ed05521 commit c6ddaa7

934 files changed (+3063 / -3118 lines changed)

Two binary files changed (-1.17 KB and -1.14 KB); binary contents not shown.

dev/_downloads/plot_bicluster_newsgroups.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
18-
"\n# Biclustering documents with the Spectral Co-clustering algorithm\n\n\nThis example demonstrates the Spectral Co-clustering algorithm on the\ntwenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is\nexcluded because it contains many posts containing nothing but data.\n\nThe TF-IDF vectorized posts form a word frequency matrix, which is\nthen biclustered using Dhillon's Spectral Co-Clustering algorithm. The\nresulting document-word biclusters indicate subsets words used more\noften in those subsets documents.\n\nFor a few of the best biclusters, its most common document categories\nand its ten most important words get printed. The best biclusters are\ndetermined by their normalized cut. The best words are determined by\ncomparing their sums inside and outside the bicluster.\n\nFor comparison, the documents are also clustered using\nMiniBatchKMeans. The document clusters derived from the biclusters\nachieve a better V-measure than clusters found by MiniBatchKMeans.\n\nOutput::\n\n Vectorizing...\n Coclustering...\n Done in 9.53s. V-measure: 0.4455\n MiniBatchKMeans...\n Done in 12.00s. V-measure: 0.3309\n\n Best biclusters:\n ----------------\n bicluster 0 : 1951 documents, 4373 words\n categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med\n words : gun, guns, geb, banks, firearms, drugs, gordon, clinton,\n cdt, amendment\n\n bicluster 1 : 1165 documents, 3304 words\n categories : 29% talk.politics.mideast, 26% soc.religion.christian,\n 25% alt.atheism\n words : god, jesus, christians, atheists, kent, sin, morality,\n belief, resurrection, marriage\n\n bicluster 2 : 2219 documents, 2830 words\n categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,\n 16% comp.graphics\n words : voltage, dsp, board, receiver, circuit, shipping, packages,\n stereo, compression, package\n\n bicluster 3 : 1860 documents, 2745 words\n categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale\n words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,\n bikes\n\n bicluster 4 : 12 documents, 155 words\n categories : 100% rec.sport.hockey\n words : scorer, unassisted, reichel, semak, sweeney, kovalenko,\n ricci, audette, momesso, nedved\n\n\n"
18+
"\n# Biclustering documents with the Spectral Co-clustering algorithm\n\n\nThis example demonstrates the Spectral Co-clustering algorithm on the\ntwenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is\nexcluded because it contains many posts containing nothing but data.\n\nThe TF-IDF vectorized posts form a word frequency matrix, which is\nthen biclustered using Dhillon's Spectral Co-Clustering algorithm. The\nresulting document-word biclusters indicate subsets words used more\noften in those subsets documents.\n\nFor a few of the best biclusters, its most common document categories\nand its ten most important words get printed. The best biclusters are\ndetermined by their normalized cut. The best words are determined by\ncomparing their sums inside and outside the bicluster.\n\nFor comparison, the documents are also clustered using\nMiniBatchKMeans. The document clusters derived from the biclusters\nachieve a better V-measure than clusters found by MiniBatchKMeans.\n\n\n"
]
},
{
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
29-
"from __future__ import print_function\n\nprint(__doc__)\n\nfrom collections import defaultdict\nimport operator\nimport re\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\n\ndef number_aware_tokenizer(doc):\n \"\"\" Tokenizer that maps all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n token_pattern = re.compile(u'(?u)\\\\b\\\\w\\\\w+\\\\b')\n tokens = token_pattern.findall(doc)\n tokens = [\"#NUMBER\" if token[0] in \"0123456789_\" else token\n for token in tokens]\n return tokens\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = TfidfVectorizer(stop_words='english', min_df=5,\n tokenizer=number_aware_tokenizer)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
29+
"from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nimport re\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_aware_tokenizer(doc):\n \"\"\" Tokenizer that maps all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n token_pattern = re.compile(u'(?u)\\\\b\\\\w\\\\w+\\\\b')\n tokens = token_pattern.findall(doc)\n tokens = [\"#NUMBER\" if token[0] in \"0123456789_\" else token\n for token in tokens]\n return tokens\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = TfidfVectorizer(stop_words='english', min_df=5,\n tokenizer=number_aware_tokenizer)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
]
}
],
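The notebook above compares the document clusters derived from Spectral Co-clustering against MiniBatchKMeans using V-measure. For orientation, here is a minimal sketch of that same comparison on a small synthetic matrix rather than the 20 newsgroups data; it is not part of this commit, and the make_biclusters setup with four planted biclusters is chosen purely for illustration.

import numpy as np

from sklearn.cluster import MiniBatchKMeans, SpectralCoclustering  # sklearn.cluster.bicluster in the version these docs target
from sklearn.datasets import make_biclusters
from sklearn.metrics import v_measure_score

# Small synthetic matrix with 4 planted biclusters; `rows` encodes the true
# row-cluster membership, standing in for the newsgroup labels.
data, rows, _ = make_biclusters(shape=(200, 100), n_clusters=4, noise=5,
                                shuffle=False, random_state=0)
y_true = np.argmax(rows, axis=0)  # true cluster index of each row

cocluster = SpectralCoclustering(n_clusters=4, random_state=0)
cocluster.fit(data)
y_cocluster = cocluster.row_labels_  # row (document) labels read off the biclusters

kmeans = MiniBatchKMeans(n_clusters=4, random_state=0)
y_kmeans = kmeans.fit_predict(data)

print("Coclustering V-measure:    {:.4f}".format(v_measure_score(y_cocluster, y_true)))
print("MiniBatchKMeans V-measure: {:.4f}".format(v_measure_score(y_kmeans, y_true)))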

dev/_downloads/plot_bicluster_newsgroups.py

Lines changed: 3 additions & 39 deletions
@@ -21,48 +21,9 @@
 MiniBatchKMeans. The document clusters derived from the biclusters
 achieve a better V-measure than clusters found by MiniBatchKMeans.
 
-Output::
-
-Vectorizing...
-Coclustering...
-Done in 9.53s. V-measure: 0.4455
-MiniBatchKMeans...
-Done in 12.00s. V-measure: 0.3309
-
-Best biclusters:
-----------------
-bicluster 0 : 1951 documents, 4373 words
-categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
-words : gun, guns, geb, banks, firearms, drugs, gordon, clinton,
-cdt, amendment
-
-bicluster 1 : 1165 documents, 3304 words
-categories : 29% talk.politics.mideast, 26% soc.religion.christian,
-25% alt.atheism
-words : god, jesus, christians, atheists, kent, sin, morality,
-belief, resurrection, marriage
-
-bicluster 2 : 2219 documents, 2830 words
-categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,
-16% comp.graphics
-words : voltage, dsp, board, receiver, circuit, shipping, packages,
-stereo, compression, package
-
-bicluster 3 : 1860 documents, 2745 words
-categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
-words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,
-bikes
-
-bicluster 4 : 12 documents, 155 words
-categories : 100% rec.sport.hockey
-words : scorer, unassisted, reichel, semak, sweeney, kovalenko,
-ricci, audette, momesso, nedved
-
 """
 from __future__ import print_function
 
-print(__doc__)
-
 from collections import defaultdict
 import operator
 import re
@@ -77,6 +38,8 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.cluster import v_measure_score
 
+print(__doc__)
+
 
 def number_aware_tokenizer(doc):
 """ Tokenizer that maps all numeric tokens to a placeholder.
@@ -91,6 +54,7 @@ def number_aware_tokenizer(doc):
 for token in tokens]
 return tokens
 
+
 # exclude 'comp.os.ms-windows.misc'
 categories = ['alt.atheism', 'comp.graphics',
 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
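For context on the number_aware_tokenizer visible in the hunk above: it plugs into TfidfVectorizer so that every token starting with a digit collapses to a single '#NUMBER' placeholder. Below is a self-contained sketch of that pattern; the two toy documents are invented for illustration and are not from the example.

import re

from sklearn.feature_extraction.text import TfidfVectorizer


def number_aware_tokenizer(doc):
    """Tokenize and map tokens that start with a digit (or '_') to '#NUMBER'."""
    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
    tokens = token_pattern.findall(doc)
    return ["#NUMBER" if token[0] in "0123456789_" else token
            for token in tokens]


docs = ["Posted on 1993-04-22 from host 128.2.1.1",       # toy data, not from the example
        "The 1994 Honda showed 55000 miles on the clock"]
vectorizer = TfidfVectorizer(stop_words='english',
                             tokenizer=number_aware_tokenizer)
X = vectorizer.fit_transform(docs)
# All numeric tokens share one '#NUMBER' column instead of one column each.
print(X.shape)
print(sorted(vectorizer.vocabulary_))

Mapping rare numeric strings onto one shared token keeps the vocabulary small without discarding the signal that a number was present, which is the motivation stated in the example's docstring.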

dev/_downloads/plot_feature_selection_pipeline.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
29-
"print(__doc__)\n\nfrom sklearn import svm\nfrom sklearn.datasets import samples_generator\nfrom sklearn.feature_selection import SelectKBest, f_regression\nfrom sklearn.pipeline import make_pipeline\n\n# import some data to play with\nX, y = samples_generator.make_classification(\n n_features=20, n_informative=3, n_redundant=0, n_classes=4,\n n_clusters_per_class=2)\n\n# ANOVA SVM-C\n# 1) anova filter, take 3 best ranked features\nanova_filter = SelectKBest(f_regression, k=3)\n# 2) svm\nclf = svm.SVC(kernel='linear')\n\nanova_svm = make_pipeline(anova_filter, clf)\nanova_svm.fit(X, y)\nanova_svm.predict(X)"
29+
"from sklearn import svm\nfrom sklearn.datasets import samples_generator\nfrom sklearn.feature_selection import SelectKBest, f_regression\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import classification_report\n\nprint(__doc__)\n\n# import some data to play with\nX, y = samples_generator.make_classification(\n n_features=20, n_informative=3, n_redundant=0, n_classes=4,\n n_clusters_per_class=2)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n\n# ANOVA SVM-C\n# 1) anova filter, take 3 best ranked features\nanova_filter = SelectKBest(f_regression, k=3)\n# 2) svm\nclf = svm.SVC(kernel='linear')\n\nanova_svm = make_pipeline(anova_filter, clf)\nanova_svm.fit(X_train, y_train)\ny_pred = anova_svm.predict(X_test)\nprint(classification_report(y_test, y_pred))"
]
}
],
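The pipeline in this notebook first runs a univariate anova filter (SelectKBest) that keeps only the 3 highest-scoring of the 20 input features, then fits the SVM on those. A minimal sketch of that selection step in isolation follows; the synthetic data and parameter values are illustrative, not from the commit.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression

X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                           n_redundant=0, n_classes=4, n_clusters_per_class=2,
                           random_state=0)

# Score each feature independently against y and keep the k best.
anova_filter = SelectKBest(f_regression, k=3)
X_reduced = anova_filter.fit_transform(X, y)

print(X.shape, "->", X_reduced.shape)                  # (200, 20) -> (200, 3)
print("kept columns:", anova_filter.get_support(indices=True))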

dev/_downloads/plot_feature_selection_pipeline.py

Lines changed: 9 additions & 4 deletions
@@ -6,24 +6,29 @@
 Simple usage of Pipeline that runs successively a univariate
 feature selection with anova and then a C-SVM of the selected features.
 """
-print(__doc__)
-
 from sklearn import svm
 from sklearn.datasets import samples_generator
 from sklearn.feature_selection import SelectKBest, f_regression
 from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+
+print(__doc__)
 
 # import some data to play with
 X, y = samples_generator.make_classification(
 n_features=20, n_informative=3, n_redundant=0, n_classes=4,
 n_clusters_per_class=2)
 
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
 # ANOVA SVM-C
 # 1) anova filter, take 3 best ranked features
 anova_filter = SelectKBest(f_regression, k=3)
 # 2) svm
 clf = svm.SVC(kernel='linear')
 
 anova_svm = make_pipeline(anova_filter, clf)
-anova_svm.fit(X, y)
-anova_svm.predict(X)
+anova_svm.fit(X_train, y_train)
+y_pred = anova_svm.predict(X_test)
+print(classification_report(y_test, y_pred))
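The change above replaces fitting and predicting on the same data with a held-out split and a classification report, so the example now measures precision and recall on data the pipeline has not seen. A standalone, hedged sketch of that evaluation pattern is below; it calls sklearn.datasets.make_classification directly, and the sample size and random_state are illustrative rather than taken from the commit.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Synthetic 4-class problem with 20 features, only 3 of them informative.
X, y = make_classification(n_samples=400, n_features=20, n_informative=3,
                           n_redundant=0, n_classes=4, n_clusters_per_class=2,
                           random_state=0)

# Hold out a test set so the report measures generalization, not memorization.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Univariate (anova) feature selection followed by a linear SVM, as in the example.
anova_svm = make_pipeline(SelectKBest(f_regression, k=3), SVC(kernel='linear'))
anova_svm.fit(X_train, y_train)
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))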

dev/_downloads/scikit-learn-docs.pdf

7.77 KB
Binary file not shown.
