
Commit be4187c (1 parent: 2646c1c)

Pushing the docs to dev/ for branch: master, commit b189254bd9cd4192bb887c326ca5e1c5588a5dea

File tree: 1,083 files changed (+7438 / -11364 lines)
Two binary files changed (-29.6 KB and -120 bytes); binary content not shown.

dev/_downloads/document_classification_20newsgroups.ipynb

Lines changed: 1 addition & 37 deletions
Large diffs are not rendered by default.

dev/_downloads/document_classification_20newsgroups.py

Lines changed: 3 additions & 3 deletions
@@ -100,7 +100,7 @@ def is_interactive():
 print()
 
 
-###############################################################################
+# #############################################################################
 # Load some categories from the training set
 if opts.all_categories:
     categories = None
@@ -152,7 +152,7 @@ def size_mb(docs):
 print("Extracting features from the training data using a sparse vectorizer")
 t0 = time()
 if opts.use_hashing:
-    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
+    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                    n_features=opts.n_features)
     X_train = vectorizer.transform(data_train.data)
 else:
@@ -201,7 +201,7 @@ def trim(s):
     return s if len(s) <= 80 else s[:77] + "..."
 
 
-###############################################################################
+# #############################################################################
 # Benchmark classifiers
 def benchmark(clf):
     print('_' * 80)
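
The functional change in this file is the swap from the deprecated non_negative=True flag to alternate_sign=False on HashingVectorizer (non_negative was deprecated in scikit-learn 0.19 in favour of alternate_sign); the other two hunks only rewrite the section banners as "# ..."-prefixed comments. A minimal sketch of the updated call, using throwaway documents and an illustrative n_features value rather than anything from the commit:

from sklearn.feature_extraction.text import HashingVectorizer

# Toy documents for illustration only; the example itself uses 20 newsgroups.
docs = ["the quick brown fox jumped over the lazy dog",
        "hashed sparse features keep memory usage bounded"]

# alternate_sign=False keeps every hashed feature value non-negative,
# which is what the removed non_negative=True flag used to guarantee.
vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                               n_features=2 ** 16)
X = vectorizer.transform(docs)
print(X.shape)                      # (2, 65536), a scipy.sparse CSR matrix
print(bool((X.data >= 0).all()))    # True: no alternating signs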

dev/_downloads/document_clustering.ipynb

Lines changed: 1 addition & 37 deletions
@@ -26,43 +26,7 @@
},
"outputs": [],
"source": [
"# Author: Peter Prettenhofer <[email protected]>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--lsa\",\n dest=\"n_components\", type=\"int\",\n help=\"Preprocess documents with latent semantic analysis.\")\nop.add_option(\"--no-minibatch\",\n action=\"store_false\", dest=\"minibatch\", default=True,\n help=\"Use ordinary k-means algorithm (in batch mode).\")\nop.add_option(\"--no-idf\",\n action=\"store_false\", dest=\"use_idf\", default=True,\n help=\"Disable Inverse Document Frequency feature weighting.\")\nop.add_option(\"--use-hashing\",\n action=\"store_true\", default=False,\n help=\"Use a hashing feature vectorizer\")\nop.add_option(\"--n-features\", type=int, default=10000,\n help=\"Maximum number of features (dimensions)\"\n \" to extract from text.\")\nop.add_option(\"--verbose\",\n action=\"store_true\", dest=\"verbose\", default=False,\n help=\"Print progress reports inside k-means algorithm.\")\n\nprint(__doc__)\nop.print_help()\n\n\ndef is_interactive():\n return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load some categories from the training set\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"categories = [\n 'alt.atheism',\n 'talk.religion.misc',\n 'comp.graphics',\n 'sci.space',\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(subset='all', categories=categories,\n shuffle=True, random_state=42)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()\n\nlabels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n if opts.use_idf:\n # Perform an IDF normalization on the output of HashingVectorizer\n hasher = HashingVectorizer(n_features=opts.n_features,\n stop_words='english', non_negative=True,\n norm=None, binary=False)\n vectorizer = make_pipeline(hasher, TfidfTransformer())\n else:\n vectorizer = HashingVectorizer(n_features=opts.n_features,\n stop_words='english',\n non_negative=False, norm='l2',\n binary=False)\nelse:\n vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n min_df=2, stop_words='english',\n use_idf=opts.use_idf)\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - t0))\nprint(\"n_samples: %d, n_features: %d\" % X.shape)\nprint()\n\nif opts.n_components:\n print(\"Performing dimensionality reduction using LSA\")\n t0 = time()\n # Vectorizer results are normalized, which makes KMeans behave as\n # spherical k-means for better results. Since LSA/SVD results are\n # not normalized, we have to redo the normalization.\n svd = TruncatedSVD(opts.n_components)\n normalizer = Normalizer(copy=False)\n lsa = make_pipeline(svd, normalizer)\n\n X = lsa.fit_transform(X)\n\n print(\"done in %fs\" % (time() - t0))\n\n explained_variance = svd.explained_variance_ratio_.sum()\n print(\"Explained variance of the SVD step: {}%\".format(\n int(explained_variance * 100)))\n\n print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Do the actual clustering\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"if opts.minibatch:\n km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n init_size=1000, batch_size=1000, verbose=opts.verbose)\nelse:\n km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n verbose=opts.verbose)\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()\n\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\"\n % metrics.adjusted_rand_score(labels, km.labels_))\nprint(\"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n\nprint()\n\n\nif not opts.use_hashing:\n print(\"Top terms per cluster:\")\n\n if opts.n_components:\n original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n order_centroids = original_space_centroids.argsort()[:, ::-1]\n else:\n order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n terms = vectorizer.get_feature_names()\n for i in range(true_k):\n print(\"Cluster %d:\" % i, end='')\n for ind in order_centroids[i, :10]:\n print(' %s' % terms[ind], end='')\n print()"
"# Author: Peter Prettenhofer <[email protected]>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--lsa\",\n dest=\"n_components\", type=\"int\",\n help=\"Preprocess documents with latent semantic analysis.\")\nop.add_option(\"--no-minibatch\",\n action=\"store_false\", dest=\"minibatch\", default=True,\n help=\"Use ordinary k-means algorithm (in batch mode).\")\nop.add_option(\"--no-idf\",\n action=\"store_false\", dest=\"use_idf\", default=True,\n help=\"Disable Inverse Document Frequency feature weighting.\")\nop.add_option(\"--use-hashing\",\n action=\"store_true\", default=False,\n help=\"Use a hashing feature vectorizer\")\nop.add_option(\"--n-features\", type=int, default=10000,\n help=\"Maximum number of features (dimensions)\"\n \" to extract from text.\")\nop.add_option(\"--verbose\",\n action=\"store_true\", dest=\"verbose\", default=False,\n help=\"Print progress reports inside k-means algorithm.\")\n\nprint(__doc__)\nop.print_help()\n\n\ndef is_interactive():\n return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n 'comp.graphics',\n 'sci.space',\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(subset='all', categories=categories,\n shuffle=True, random_state=42)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()\n\nlabels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n if opts.use_idf:\n # Perform an IDF normalization on the output of HashingVectorizer\n hasher = HashingVectorizer(n_features=opts.n_features,\n stop_words='english', alternate_sign=False,\n norm=None, binary=False)\n vectorizer = make_pipeline(hasher, TfidfTransformer())\n else:\n vectorizer = HashingVectorizer(n_features=opts.n_features,\n stop_words='english',\n alternate_sign=False, norm='l2',\n binary=False)\nelse:\n vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n min_df=2, stop_words='english',\n use_idf=opts.use_idf)\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - t0))\nprint(\"n_samples: %d, n_features: %d\" % 
X.shape)\nprint()\n\nif opts.n_components:\n print(\"Performing dimensionality reduction using LSA\")\n t0 = time()\n # Vectorizer results are normalized, which makes KMeans behave as\n # spherical k-means for better results. Since LSA/SVD results are\n # not normalized, we have to redo the normalization.\n svd = TruncatedSVD(opts.n_components)\n normalizer = Normalizer(copy=False)\n lsa = make_pipeline(svd, normalizer)\n\n X = lsa.fit_transform(X)\n\n print(\"done in %fs\" % (time() - t0))\n\n explained_variance = svd.explained_variance_ratio_.sum()\n print(\"Explained variance of the SVD step: {}%\".format(\n int(explained_variance * 100)))\n\n print()\n\n\n# #############################################################################\n# Do the actual clustering\n\nif opts.minibatch:\n km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n init_size=1000, batch_size=1000, verbose=opts.verbose)\nelse:\n km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n verbose=opts.verbose)\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()\n\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\"\n % metrics.adjusted_rand_score(labels, km.labels_))\nprint(\"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n\nprint()\n\n\nif not opts.use_hashing:\n print(\"Top terms per cluster:\")\n\n if opts.n_components:\n original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n order_centroids = original_space_centroids.argsort()[:, ::-1]\n else:\n order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n terms = vectorizer.get_feature_names()\n for i in range(true_k):\n print(\"Cluster %d:\" % i, end='')\n for ind in order_centroids[i, :10]:\n print(' %s' % terms[ind], end='')\n print()"
]
}
],
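
In this notebook diff, the 37 deletions are the separate markdown and code cells listed above; the 1 addition is the long source string just shown, a single merged code cell in which the former section headings come back as "# ####..." banner comments and HashingVectorizer is called with alternate_sign=False instead of non_negative. The merge presumably happens because sphinx-gallery no longer treats the prefixed banners as text-block separators when regenerating the notebook.

As a rough, hypothetical sketch of what that merged cell does on its default TF-IDF path (no --use-hashing), condensed rather than the notebook's exact code, assuming a scikit-learn version with TruncatedSVD-based LSA:

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn import metrics

# Same four categories the example loads.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)
labels = dataset.target
true_k = np.unique(labels).shape[0]

# TF-IDF features, then LSA: truncated SVD followed by re-normalization,
# since k-means on l2-normalized vectors behaves like spherical k-means.
# n_components=100 is an illustrative choice; the script reads it from --lsa.
X = TfidfVectorizer(max_df=0.5, min_df=2,
                    stop_words='english').fit_transform(dataset.data)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X)

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X_lsa)

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))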

dev/_downloads/document_clustering.py

Lines changed: 4 additions & 4 deletions
@@ -114,7 +114,7 @@ def is_interactive():
     sys.exit(1)
 
 
-###############################################################################
+# #############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -144,13 +144,13 @@ def is_interactive():
     if opts.use_idf:
         # Perform an IDF normalization on the output of HashingVectorizer
         hasher = HashingVectorizer(n_features=opts.n_features,
-                                   stop_words='english', non_negative=True,
+                                   stop_words='english', alternate_sign=False,
                                    norm=None, binary=False)
         vectorizer = make_pipeline(hasher, TfidfTransformer())
     else:
         vectorizer = HashingVectorizer(n_features=opts.n_features,
                                        stop_words='english',
-                                       non_negative=False, norm='l2',
+                                       alternate_sign=False, norm='l2',
                                        binary=False)
 else:
     vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
@@ -183,7 +183,7 @@ def is_interactive():
     print()
 
 
-###############################################################################
+# #############################################################################
 # Do the actual clustering
 
 if opts.minibatch:
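
For the hashing-plus-IDF branch touched above, the updated code keeps the same structure: a HashingVectorizer producing non-negative, unnormalized counts (alternate_sign=False, norm=None) piped into a TfidfTransformer. A small standalone sketch of that pipeline, with toy documents and an illustrative n_features value in place of the script's command-line options:

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs and mats"]

# Non-negative hashed term counts, no normalization at this stage ...
hasher = HashingVectorizer(n_features=2 ** 18, stop_words='english',
                           alternate_sign=False, norm=None, binary=False)
# ... then IDF weighting (and l2 normalization) on top of the hashed counts.
vectorizer = make_pipeline(hasher, TfidfTransformer())

X = vectorizer.fit_transform(docs)
print(X.shape)   # (3, 262144) sparse matrix of TF-IDF-weighted hashed features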
