"# Author: Olivier Grisel <
[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <
[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\nfrom time import time\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_components = 10\nn_top_words = 20\n\n\ndef print_top_words(model, feature_names, n_top_words):\n for topic_idx, topic in enumerate(model.components_):\n message = \"Topic #%d: \" % topic_idx\n message += \" \".join([feature_names[i]\n for i in topic.argsort()[:-n_top_words - 1:-1]])\n print(message)\n print()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndataset = fetch_20newsgroups(shuffle=True, random_state=1,\n remove=('headers', 'footers', 'quotes'))\ndata_samples = dataset.data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n max_features=n_features,\n stop_words='english')\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,\n max_features=n_features,\n stop_words='english')\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\nprint()\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n \"n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1,\n alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (Frobenius norm):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n \"tf-idf features, n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler',\n solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (generalized Kullback-Leibler divergence):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\nprint(\"Fitting LDA models with tf features, \"\n \"n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nlda = LatentDirichletAllocation(n_components=n_components, max_iter=5,\n learning_method='online',\n learning_offset=50.,\n random_state=0)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in LDA model:\")\ntf_feature_names = tf_vectorizer.get_feature_names()\nprint_top_words(lda, tf_feature_names, n_top_words)"
0 commit comments