
Commit c6ddaa7

Pushing the docs to dev/ for branch: master, commit 3de7da329fb35b23683a33517191682c8092d38e
1 parent ed05521 commit c6ddaa7

934 files changed (+3063 / -3118 lines changed)

Two binary files changed (-1.17 KB and -1.14 KB); binary contents not shown.

dev/_downloads/plot_bicluster_newsgroups.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
18-
"\n# Biclustering documents with the Spectral Co-clustering algorithm\n\n\nThis example demonstrates the Spectral Co-clustering algorithm on the\ntwenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is\nexcluded because it contains many posts containing nothing but data.\n\nThe TF-IDF vectorized posts form a word frequency matrix, which is\nthen biclustered using Dhillon's Spectral Co-Clustering algorithm. The\nresulting document-word biclusters indicate subsets words used more\noften in those subsets documents.\n\nFor a few of the best biclusters, its most common document categories\nand its ten most important words get printed. The best biclusters are\ndetermined by their normalized cut. The best words are determined by\ncomparing their sums inside and outside the bicluster.\n\nFor comparison, the documents are also clustered using\nMiniBatchKMeans. The document clusters derived from the biclusters\nachieve a better V-measure than clusters found by MiniBatchKMeans.\n\nOutput::\n\n Vectorizing...\n Coclustering...\n Done in 9.53s. V-measure: 0.4455\n MiniBatchKMeans...\n Done in 12.00s. V-measure: 0.3309\n\n Best biclusters:\n ----------------\n bicluster 0 : 1951 documents, 4373 words\n categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med\n words : gun, guns, geb, banks, firearms, drugs, gordon, clinton,\n cdt, amendment\n\n bicluster 1 : 1165 documents, 3304 words\n categories : 29% talk.politics.mideast, 26% soc.religion.christian,\n 25% alt.atheism\n words : god, jesus, christians, atheists, kent, sin, morality,\n belief, resurrection, marriage\n\n bicluster 2 : 2219 documents, 2830 words\n categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,\n 16% comp.graphics\n words : voltage, dsp, board, receiver, circuit, shipping, packages,\n stereo, compression, package\n\n bicluster 3 : 1860 documents, 2745 words\n categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale\n words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,\n bikes\n\n bicluster 4 : 12 documents, 155 words\n categories : 100% rec.sport.hockey\n words : scorer, unassisted, reichel, semak, sweeney, kovalenko,\n ricci, audette, momesso, nedved\n\n\n"
18+
"\n# Biclustering documents with the Spectral Co-clustering algorithm\n\n\nThis example demonstrates the Spectral Co-clustering algorithm on the\ntwenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is\nexcluded because it contains many posts containing nothing but data.\n\nThe TF-IDF vectorized posts form a word frequency matrix, which is\nthen biclustered using Dhillon's Spectral Co-Clustering algorithm. The\nresulting document-word biclusters indicate subsets words used more\noften in those subsets documents.\n\nFor a few of the best biclusters, its most common document categories\nand its ten most important words get printed. The best biclusters are\ndetermined by their normalized cut. The best words are determined by\ncomparing their sums inside and outside the bicluster.\n\nFor comparison, the documents are also clustered using\nMiniBatchKMeans. The document clusters derived from the biclusters\nachieve a better V-measure than clusters found by MiniBatchKMeans.\n\n\n"
]
},
{
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
29-
"from __future__ import print_function\n\nprint(__doc__)\n\nfrom collections import defaultdict\nimport operator\nimport re\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\n\ndef number_aware_tokenizer(doc):\n \"\"\" Tokenizer that maps all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n token_pattern = re.compile(u'(?u)\\\\b\\\\w\\\\w+\\\\b')\n tokens = token_pattern.findall(doc)\n tokens = [\"#NUMBER\" if token[0] in \"0123456789_\" else token\n for token in tokens]\n return tokens\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = TfidfVectorizer(stop_words='english', min_df=5,\n tokenizer=number_aware_tokenizer)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
29+
"from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nimport re\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_aware_tokenizer(doc):\n \"\"\" Tokenizer that maps all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n token_pattern = re.compile(u'(?u)\\\\b\\\\w\\\\w+\\\\b')\n tokens = token_pattern.findall(doc)\n tokens = [\"#NUMBER\" if token[0] in \"0123456789_\" else token\n for token in tokens]\n return tokens\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = TfidfVectorizer(stop_words='english', min_df=5,\n tokenizer=number_aware_tokenizer)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
]
}
],
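The notebook above compares the document clusters derived from Spectral Co-clustering against MiniBatchKMeans using V-measure. For orientation, here is a minimal sketch of that same comparison on a small synthetic matrix rather than the 20 newsgroups data; it is not part of this commit, and the make_biclusters setup with four planted biclusters is chosen purely for illustration.

import numpy as np

from sklearn.cluster import MiniBatchKMeans, SpectralCoclustering  # sklearn.cluster.bicluster in the version these docs target
from sklearn.datasets import make_biclusters
from sklearn.metrics import v_measure_score

# Small synthetic matrix with 4 planted biclusters; `rows` encodes the true
# row-cluster membership, standing in for the newsgroup labels.
data, rows, _ = make_biclusters(shape=(200, 100), n_clusters=4, noise=5,
                                shuffle=False, random_state=0)
y_true = np.argmax(rows, axis=0)  # true cluster index of each row

cocluster = SpectralCoclustering(n_clusters=4, random_state=0)
cocluster.fit(data)
y_cocluster = cocluster.row_labels_  # row (document) labels read off the biclusters

kmeans = MiniBatchKMeans(n_clusters=4, random_state=0)
y_kmeans = kmeans.fit_predict(data)

print("Coclustering V-measure:    {:.4f}".format(v_measure_score(y_cocluster, y_true)))
print("MiniBatchKMeans V-measure: {:.4f}".format(v_measure_score(y_kmeans, y_true)))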

dev/_downloads/plot_bicluster_newsgroups.py

Lines changed: 3 additions & 39 deletions
@@ -21,48 +21,9 @@
 MiniBatchKMeans. The document clusters derived from the biclusters
 achieve a better V-measure than clusters found by MiniBatchKMeans.
 
-Output::
-
-Vectorizing...
-Coclustering...
-Done in 9.53s. V-measure: 0.4455
-MiniBatchKMeans...
-Done in 12.00s. V-measure: 0.3309
-
-Best biclusters:
-----------------
-bicluster 0 : 1951 documents, 4373 words
-categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
-words : gun, guns, geb, banks, firearms, drugs, gordon, clinton,
-cdt, amendment
-
-bicluster 1 : 1165 documents, 3304 words
-categories : 29% talk.politics.mideast, 26% soc.religion.christian,
-25% alt.atheism
-words : god, jesus, christians, atheists, kent, sin, morality,
-belief, resurrection, marriage
-
-bicluster 2 : 2219 documents, 2830 words
-categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,
-16% comp.graphics
-words : voltage, dsp, board, receiver, circuit, shipping, packages,
-stereo, compression, package
-
-bicluster 3 : 1860 documents, 2745 words
-categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
-words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,
-bikes
-
-bicluster 4 : 12 documents, 155 words
-categories : 100% rec.sport.hockey
-words : scorer, unassisted, reichel, semak, sweeney, kovalenko,
-ricci, audette, momesso, nedved
-
 """
 from __future__ import print_function
 
-print(__doc__)
-
 from collections import defaultdict
 import operator
 import re
@@ -77,6 +38,8 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.cluster import v_measure_score
 
+print(__doc__)
+
 
 def number_aware_tokenizer(doc):
 """ Tokenizer that maps all numeric tokens to a placeholder.
@@ -91,6 +54,7 @@ def number_aware_tokenizer(doc):
 for token in tokens]
 return tokens
 
+
 # exclude 'comp.os.ms-windows.misc'
 categories = ['alt.atheism', 'comp.graphics',
 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
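For context on the number_aware_tokenizer visible in the hunk above: it plugs into TfidfVectorizer so that every token starting with a digit collapses to a single '#NUMBER' placeholder. Below is a self-contained sketch of that pattern; the two toy documents are invented for illustration and are not from the example.

import re

from sklearn.feature_extraction.text import TfidfVectorizer


def number_aware_tokenizer(doc):
    """Tokenize and map tokens that start with a digit (or '_') to '#NUMBER'."""
    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
    tokens = token_pattern.findall(doc)
    return ["#NUMBER" if token[0] in "0123456789_" else token
            for token in tokens]


docs = ["Posted on 1993-04-22 from host 128.2.1.1",       # toy data, not from the example
        "The 1994 Honda showed 55000 miles on the clock"]
vectorizer = TfidfVectorizer(stop_words='english',
                             tokenizer=number_aware_tokenizer)
X = vectorizer.fit_transform(docs)
# All numeric tokens share one '#NUMBER' column instead of one column each.
print(X.shape)
print(sorted(vectorizer.vocabulary_))

Mapping rare numeric strings onto one shared token keeps the vocabulary small without discarding the signal that a number was present, which is the motivation stated in the example's docstring.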

dev/_downloads/plot_feature_selection_pipeline.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
29-
"print(__doc__)\n\nfrom sklearn import svm\nfrom sklearn.datasets import samples_generator\nfrom sklearn.feature_selection import SelectKBest, f_regression\nfrom sklearn.pipeline import make_pipeline\n\n# import some data to play with\nX, y = samples_generator.make_classification(\n n_features=20, n_informative=3, n_redundant=0, n_classes=4,\n n_clusters_per_class=2)\n\n# ANOVA SVM-C\n# 1) anova filter, take 3 best ranked features\nanova_filter = SelectKBest(f_regression, k=3)\n# 2) svm\nclf = svm.SVC(kernel='linear')\n\nanova_svm = make_pipeline(anova_filter, clf)\nanova_svm.fit(X, y)\nanova_svm.predict(X)"
29+
"from sklearn import svm\nfrom sklearn.datasets import samples_generator\nfrom sklearn.feature_selection import SelectKBest, f_regression\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import classification_report\n\nprint(__doc__)\n\n# import some data to play with\nX, y = samples_generator.make_classification(\n n_features=20, n_informative=3, n_redundant=0, n_classes=4,\n n_clusters_per_class=2)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n\n# ANOVA SVM-C\n# 1) anova filter, take 3 best ranked features\nanova_filter = SelectKBest(f_regression, k=3)\n# 2) svm\nclf = svm.SVC(kernel='linear')\n\nanova_svm = make_pipeline(anova_filter, clf)\nanova_svm.fit(X_train, y_train)\ny_pred = anova_svm.predict(X_test)\nprint(classification_report(y_test, y_pred))"
]
}
],
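The pipeline in this notebook first runs a univariate anova filter (SelectKBest) that keeps only the 3 highest-scoring of the 20 input features, then fits the SVM on those. A minimal sketch of that selection step in isolation follows; the synthetic data and parameter values are illustrative, not from the commit.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression

X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                           n_redundant=0, n_classes=4, n_clusters_per_class=2,
                           random_state=0)

# Score each feature independently against y and keep the k best.
anova_filter = SelectKBest(f_regression, k=3)
X_reduced = anova_filter.fit_transform(X, y)

print(X.shape, "->", X_reduced.shape)                  # (200, 20) -> (200, 3)
print("kept columns:", anova_filter.get_support(indices=True))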

dev/_downloads/plot_feature_selection_pipeline.py

Lines changed: 9 additions & 4 deletions
@@ -6,24 +6,29 @@
 Simple usage of Pipeline that runs successively a univariate
 feature selection with anova and then a C-SVM of the selected features.
 """
-print(__doc__)
-
 from sklearn import svm
 from sklearn.datasets import samples_generator
 from sklearn.feature_selection import SelectKBest, f_regression
 from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+
+print(__doc__)
 
 # import some data to play with
 X, y = samples_generator.make_classification(
 n_features=20, n_informative=3, n_redundant=0, n_classes=4,
 n_clusters_per_class=2)
 
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
 # ANOVA SVM-C
 # 1) anova filter, take 3 best ranked features
 anova_filter = SelectKBest(f_regression, k=3)
 # 2) svm
 clf = svm.SVC(kernel='linear')
 
 anova_svm = make_pipeline(anova_filter, clf)
-anova_svm.fit(X, y)
-anova_svm.predict(X)
+anova_svm.fit(X_train, y_train)
+y_pred = anova_svm.predict(X_test)
+print(classification_report(y_test, y_pred))
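The change above replaces fitting and predicting on the same data with a held-out split and a classification report, so the example now measures precision and recall on data the pipeline has not seen. A standalone, hedged sketch of that evaluation pattern is below; it calls sklearn.datasets.make_classification directly, and the sample size and random_state are illustrative rather than taken from the commit.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Synthetic 4-class problem with 20 features, only 3 of them informative.
X, y = make_classification(n_samples=400, n_features=20, n_informative=3,
                           n_redundant=0, n_classes=4, n_clusters_per_class=2,
                           random_state=0)

# Hold out a test set so the report measures generalization, not memorization.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Univariate (anova) feature selection followed by a linear SVM, as in the example.
anova_svm = make_pipeline(SelectKBest(f_regression, k=3), SVC(kernel='linear'))
anova_svm.fit(X_train, y_train)
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))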

dev/_downloads/scikit-learn-docs.pdf

7.77 KB
Binary file not shown.
