
Commit f7e7627

Pushing the docs to dev/ for branch: master, commit 3efefab4c8d1c3efa335249569b3b3ecb03636aa
1 parent: 7d837de

File tree

967 files changed (+2971, -2967 lines)

Two binary files changed (11 Bytes and 10 Bytes); binary files not shown.

dev/_downloads/plot_topics_extraction_with_nmf_lda.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Olivier Grisel <[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\nfrom time import time\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_components = 10\nn_top_words = 20\n\n\ndef print_top_words(model, feature_names, n_top_words):\n    for topic_idx, topic in enumerate(model.components_):\n        message = \"Topic #%d: \" % topic_idx\n        message += \" \".join([feature_names[i]\n                             for i in topic.argsort()[:-n_top_words - 1:-1]])\n        print(message)\n    print()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndataset = fetch_20newsgroups(shuffle=True, random_state=1,\n                             remove=('headers', 'footers', 'quotes'))\ndata_samples = dataset.data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n                                   max_features=n_features,\n                                   stop_words='english')\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,\n                                max_features=n_features,\n                                stop_words='english')\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\nprint()\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n      \"n_samples=%d and n_features=%d...\"\n      % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1,\n          alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (Frobenius norm):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n      \"tf-idf features, n_samples=%d and n_features=%d...\"\n      % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler',\n          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (generalized Kullback-Leibler divergence):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\nprint(\"Fitting LDA models with tf features, \"\n      \"n_samples=%d and n_features=%d...\"\n      % (n_samples, n_features))\nlda = LatentDirichletAllocation(n_components=n_components, max_iter=5,\n                                learning_method='online',\n                                learning_offset=50.,\n                                random_state=0)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in LDA model:\")\ntf_feature_names = tf_vectorizer.get_feature_names()\nprint_top_words(lda, tf_feature_names, n_top_words)"
+"# Author: Olivier Grisel <[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\nfrom time import time\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_components = 10\nn_top_words = 20\n\n\ndef print_top_words(model, feature_names, n_top_words):\n    for topic_idx, topic in enumerate(model.components_):\n        message = \"Topic #%d: \" % topic_idx\n        message += \" \".join([feature_names[i]\n                             for i in topic.argsort()[:-n_top_words - 1:-1]])\n        print(message)\n    print()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndataset = fetch_20newsgroups(shuffle=True, random_state=1,\n                             remove=('headers', 'footers', 'quotes'))\ndata_samples = dataset.data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n                                   max_features=n_features,\n                                   stop_words='english')\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,\n                                max_features=n_features,\n                                stop_words='english')\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\nprint()\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n      \"n_samples=%d and n_features=%d...\"\n      % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1,\n          alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (Frobenius norm):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n      \"tf-idf features, n_samples=%d and n_features=%d...\"\n      % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1,\n          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,\n          l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (generalized Kullback-Leibler divergence):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\nprint(\"Fitting LDA models with tf features, \"\n      \"n_samples=%d and n_features=%d...\"\n      % (n_samples, n_features))\nlda = LatentDirichletAllocation(n_components=n_components, max_iter=5,\n                                learning_method='online',\n                                learning_offset=50.,\n                                random_state=0)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in LDA model:\")\ntf_feature_names = tf_vectorizer.get_feature_names()\nprint_top_words(lda, tf_feature_names, n_top_words)"
 ]
 }
 ],
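
Aside: the print_top_words helper in the cell above relies on a compact but non-obvious slicing idiom, topic.argsort()[:-n_top_words - 1:-1]. A minimal standalone sketch of what that slice does, using hypothetical toy weights rather than anything from the script:

import numpy as np

# argsort() returns indices in ascending order of weight, so the
# reversed slice [:-n-1:-1] walks backwards from the end and yields
# the indices of the n largest weights in descending order.
topic = np.array([0.1, 0.7, 0.3, 0.9, 0.2])
n_top_words = 3
top_indices = topic.argsort()[:-n_top_words - 1:-1]
print(top_indices)         # [3 1 2]
print(topic[top_indices])  # [0.9 0.7 0.3]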

dev/_downloads/plot_topics_extraction_with_nmf_lda.py

Lines changed: 3 additions & 2 deletions
@@ -98,8 +98,9 @@ def print_top_words(model, feature_names, n_top_words):
       "tf-idf features, n_samples=%d and n_features=%d..."
       % (n_samples, n_features))
 t0 = time()
-nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler',
-          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)
+nmf = NMF(n_components=n_components, random_state=1,
+          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
+          l1_ratio=.5).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))

 print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")

dev/_downloads/scikit-learn-docs.pdf

Binary file not shown (1 KB changed).

0 commit comments
