
Commit 99d714e

add 0.18 stable website
1 parent 118eed9 commit 99d714e

3,501 files changed: 446,515 lines added, 0 deleted


0.18/.buildinfo

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: ee0995393e1e51351d5bec758afe160b
tags: 645f666f9bcd5a90fca523b33c5a78b7
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
"nbformat_minor": 0,
"cells": [
{
"source": [
"%matplotlib inline"
],
"metadata": {
"collapsed": false
},
"execution_count": null,
"outputs": [],
"cell_type": "code"
},
{
"source": [
"\n# Biclustering documents with the Spectral Co-clustering algorithm\n\n\nThis example demonstrates the Spectral Co-clustering algorithm on the\ntwenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is\nexcluded because it contains many posts containing nothing but data.\n\nThe TF-IDF vectorized posts form a word frequency matrix, which is\nthen biclustered using Dhillon's Spectral Co-Clustering algorithm. The\nresulting document-word biclusters indicate subsets words used more\noften in those subsets documents.\n\nFor a few of the best biclusters, its most common document categories\nand its ten most important words get printed. The best biclusters are\ndetermined by their normalized cut. The best words are determined by\ncomparing their sums inside and outside the bicluster.\n\nFor comparison, the documents are also clustered using\nMiniBatchKMeans. The document clusters derived from the biclusters\nachieve a better V-measure than clusters found by MiniBatchKMeans.\n\nOutput::\n\n Vectorizing...\n Coclustering...\n Done in 9.53s. V-measure: 0.4455\n MiniBatchKMeans...\n Done in 12.00s. V-measure: 0.3309\n\n Best biclusters:\n ----------------\n bicluster 0 : 1951 documents, 4373 words\n categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med\n words : gun, guns, geb, banks, firearms, drugs, gordon, clinton, cdt, amendment\n\n bicluster 1 : 1165 documents, 3304 words\n categories : 29% talk.politics.mideast, 26% soc.religion.christian, 25% alt.atheism\n words : god, jesus, christians, atheists, kent, sin, morality, belief, resurrection, marriage\n\n bicluster 2 : 2219 documents, 2830 words\n categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware, 16% comp.graphics\n words : voltage, dsp, board, receiver, circuit, shipping, packages, stereo, compression, package\n\n bicluster 3 : 1860 documents, 2745 words\n categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale\n words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw, bikes\n\n bicluster 4 : 12 documents, 155 words\n categories : 100% rec.sport.hockey\n words : scorer, unassisted, reichel, semak, sweeney, kovalenko, ricci, audette, momesso, nedved\n\n"
],
"metadata": {},
"cell_type": "markdown"
},
{
"source": [
"from __future__ import print_function\n\nprint(__doc__)\n\nfrom collections import defaultdict\nimport operator\nimport re\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\n\ndef number_aware_tokenizer(doc):\n \"\"\" Tokenizer that maps all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n token_pattern = re.compile(u'(?u)\\\\b\\\\w\\\\w+\\\\b')\n tokens = token_pattern.findall(doc)\n tokens = [\"#NUMBER\" if token[0] in \"0123456789_\" else token\n for token in tokens]\n return tokens\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = TfidfVectorizer(stop_words='english', min_df=5,\n tokenizer=number_aware_tokenizer)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis], cols].sum() but\n # much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
],
"metadata": {
"collapsed": false
},
"execution_count": null,
"outputs": [],
"cell_type": "code"
}
],
"metadata": {
"kernelspec": {
"language": "python",
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"pygments_lexer": "ipython3",
"version": "3.5.2",
"name": "python",
"mimetype": "text/x-python",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4
}
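Both versions of this example (the notebook above and the Python script below) score the two
clusterings with v_measure_score. As a quick, self-contained reminder of how that metric behaves,
a minimal sketch using only the function already imported by the example; the score is invariant
to label permutation and drops to zero when a clustering mixes the true classes::

    from sklearn.metrics.cluster import v_measure_score

    labels_true = [0, 0, 1, 1]
    print(v_measure_score(labels_true, [1, 1, 0, 0]))  # 1.0: same partition, labels swapped
    print(v_measure_score(labels_true, [0, 1, 0, 1]))  # 0.0: every predicted cluster mixes both classes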
Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
"""
================================================================
Biclustering documents with the Spectral Co-clustering algorithm
================================================================

This example demonstrates the Spectral Co-clustering algorithm on the
twenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is
excluded because it contains many posts containing nothing but data.

The TF-IDF vectorized posts form a word frequency matrix, which is
then biclustered using Dhillon's Spectral Co-Clustering algorithm. The
resulting document-word biclusters indicate subsets of words used more
often in those subsets of documents.

For a few of the best biclusters, their most common document categories
and their ten most important words are printed. The best biclusters are
determined by their normalized cut. The best words are determined by
comparing their sums inside and outside the bicluster.

For comparison, the documents are also clustered using
MiniBatchKMeans. The document clusters derived from the biclusters
achieve a better V-measure than clusters found by MiniBatchKMeans.

Output::

    Vectorizing...
    Coclustering...
    Done in 9.53s. V-measure: 0.4455
    MiniBatchKMeans...
    Done in 12.00s. V-measure: 0.3309

    Best biclusters:
    ----------------
    bicluster 0 : 1951 documents, 4373 words
    categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
    words : gun, guns, geb, banks, firearms, drugs, gordon, clinton, cdt, amendment

    bicluster 1 : 1165 documents, 3304 words
    categories : 29% talk.politics.mideast, 26% soc.religion.christian, 25% alt.atheism
    words : god, jesus, christians, atheists, kent, sin, morality, belief, resurrection, marriage

    bicluster 2 : 2219 documents, 2830 words
    categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware, 16% comp.graphics
    words : voltage, dsp, board, receiver, circuit, shipping, packages, stereo, compression, package

    bicluster 3 : 1860 documents, 2745 words
    categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
    words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw, bikes

    bicluster 4 : 12 documents, 155 words
    categories : 100% rec.sport.hockey
    words : scorer, unassisted, reichel, semak, sweeney, kovalenko, ricci, audette, momesso, nedved

"""
from __future__ import print_function

print(__doc__)

from collections import defaultdict
import operator
import re
from time import time

import numpy as np

from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals.six import iteritems
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import v_measure_score


def number_aware_tokenizer(doc):
    """ Tokenizer that maps all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant. By applying
    this form of dimensionality reduction, some methods may perform better.
    """
    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
    tokens = token_pattern.findall(doc)
    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
              for token in tokens]
    return tokens

# exclude 'comp.os.ms-windows.misc'
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_kmeans, y_true)))

feature_names = vectorizer.get_feature_names()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)


def bicluster_ncut(i):
    rows, cols = cocluster.get_indices(i)
    if not (np.any(rows) and np.any(cols)):
        import sys
        return sys.float_info.max
    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
    # Note: the following is identical to X[rows[:, np.newaxis], cols].sum() but
    # much faster in scipy <= 0.16
    weight = X[rows][:, cols].sum()
    cut = (X[row_complement][:, cols].sum() +
           X[rows][:, col_complement].sum())
    return cut / weight


def most_common(d):
    """Items of a defaultdict(int) with the highest values.

    Like Counter.most_common in Python >=2.7.
    """
    return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)


bicluster_ncuts = list(bicluster_ncut(i)
                       for i in range(len(newsgroups.target_names)))
best_idx = np.argsort(bicluster_ncuts)[:5]

print()
print("Best biclusters:")
print("----------------")
for idx, cluster in enumerate(best_idx):
    n_rows, n_cols = cocluster.get_shape(cluster)
    cluster_docs, cluster_words = cocluster.get_indices(cluster)
    if not len(cluster_docs) or not len(cluster_words):
        continue

    # categories
    counter = defaultdict(int)
    for i in cluster_docs:
        counter[document_names[i]] += 1
    cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name)
                           for name, c in most_common(counter)[:3])

    # words
    out_of_cluster_docs = cocluster.row_labels_ != cluster
    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
    word_col = X[:, cluster_words]
    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
                           word_col[out_of_cluster_docs, :].sum(axis=0))
    word_scores = word_scores.ravel()
    important_words = list(feature_names[cluster_words[i]]
                           for i in word_scores.argsort()[:-11:-1])

    print("bicluster {} : {} documents, {} words".format(
        idx, n_rows, n_cols))
    print("categories : {}".format(cat_string))
    print("words : {}\n".format(', '.join(important_words)))
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
"nbformat_minor": 0,
"cells": [
{
"source": [
"%matplotlib inline"
],
"metadata": {
"collapsed": false
},
"execution_count": null,
"outputs": [],
"cell_type": "code"
},
{
"source": [
"\n# Digits Classification Exercise\n\n\nA tutorial exercise regarding the use of classification techniques on\nthe Digits dataset.\n\nThis exercise is used in the :ref:`clf_tut` part of the\n:ref:`supervised_learning_tut` section of the\n:ref:`stat_learn_tut_index`.\n"
],
"metadata": {},
"cell_type": "markdown"
},
{
"source": [
"print(__doc__)\n\nfrom sklearn import datasets, neighbors, linear_model\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\n\nn_samples = len(X_digits)\n\nX_train = X_digits[:.9 * n_samples]\ny_train = y_digits[:.9 * n_samples]\nX_test = X_digits[.9 * n_samples:]\ny_test = y_digits[.9 * n_samples:]\n\nknn = neighbors.KNeighborsClassifier()\nlogistic = linear_model.LogisticRegression()\n\nprint('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))\nprint('LogisticRegression score: %f'\n % logistic.fit(X_train, y_train).score(X_test, y_test))"
],
"metadata": {
"collapsed": false
},
"execution_count": null,
"outputs": [],
"cell_type": "code"
}
],
"metadata": {
"kernelspec": {
"language": "python",
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"pygments_lexer": "ipython3",
"version": "3.5.2",
"name": "python",
"mimetype": "text/x-python",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4
}
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
"""
================================
Digits Classification Exercise
================================

A tutorial exercise regarding the use of classification techniques on
the Digits dataset.

This exercise is used in the :ref:`clf_tut` part of the
:ref:`supervised_learning_tut` section of the
:ref:`stat_learn_tut_index`.
"""
print(__doc__)

from sklearn import datasets, neighbors, linear_model

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

n_samples = len(X_digits)

X_train = X_digits[:int(.9 * n_samples)]
y_train = y_digits[:int(.9 * n_samples)]
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]

knn = neighbors.KNeighborsClassifier()
logistic = linear_model.LogisticRegression()

print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression score: %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))
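The exercise above builds its 90/10 split with explicit slice indices. The same split can be
written with train_test_split from sklearn.model_selection (available from 0.18 on); a small
sketch, with the caveat that train_test_split shuffles by default, so the held-out digits
differ from the ordering-based split above (a shuffle=False option exists in later releases)::

    from sklearn import datasets, neighbors, linear_model
    from sklearn.model_selection import train_test_split

    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.1, random_state=0)

    knn = neighbors.KNeighborsClassifier().fit(X_train, y_train)
    logistic = linear_model.LogisticRegression().fit(X_train, y_train)
    print('KNN score: %f' % knn.score(X_test, y_test))
    print('LogisticRegression score: %f' % logistic.score(X_test, y_test))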
