"# Author: Olivier Grisel <
[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <

from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies; common English stop words, words occurring in
# only one document, and words occurring in at least 95% of the documents
# are removed.

print("Loading dataset...")
t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model with the Frobenius norm.
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)
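
# Not part of the original example: a minimal sketch of how the fitted NMF
# model can also be used to describe individual posts. `transform` returns
# the document-topic matrix W of the factorization X ~ W @ H, so the argmax
# along each row gives the dominant topic of the corresponding document.
nmf_doc_topic = nmf.transform(tfidf)
print("Document-topic matrix shape:", nmf_doc_topic.shape)
print("Dominant topic of the first post: Topic %d" % (nmf_doc_topic[0].argmax() + 1))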

# Fit the NMF model with the generalized Kullback-Leibler divergence.
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

# Fit the MiniBatchNMF model with the Frobenius norm.
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (Frobenius norm)",
)

# Fit the MiniBatchNMF model with the generalized Kullback-Leibler divergence.
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)

# Fit the LDA model on raw term counts.
print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")
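
# Not part of the original example: a short sketch of how the fitted LDA model
# can be queried beyond plotting. `transform` returns the per-document topic
# distribution (each row sums to 1) and `perplexity` gives a rough measure of
# model fit on the given documents (lower is better).
doc_topic_dist = lda.transform(tf)
print("Topic distribution of the first post:", doc_topic_dist[0].round(3))
print("Perplexity on the training documents: %.1f" % lda.perplexity(tf))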