"# Author: Olivier Grisel <
[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <

from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies; common English stop words, words occurring in
# only one document, and words occurring in at least 95% of the documents
# are removed.

print("Loading dataset...")
t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model with the Frobenius norm.
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)
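
# Not part of the original example: a minimal sketch of how the fitted NMF
# model can also be used to describe individual posts. `transform` returns
# the document-topic matrix W of the factorization X ~ W @ H, so the argmax
# along each row gives the dominant topic of the corresponding document.
nmf_doc_topic = nmf.transform(tfidf)
print("Document-topic matrix shape:", nmf_doc_topic.shape)
print("Dominant topic of the first post: Topic %d" % (nmf_doc_topic[0].argmax() + 1))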

# Fit the NMF model with the generalized Kullback-Leibler divergence.
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

# Fit the MiniBatchNMF model with the Frobenius norm.
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (Frobenius norm)",
)

# Fit the MiniBatchNMF model with the generalized Kullback-Leibler divergence.
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)

# Fit the LDA model on raw term counts.
print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")
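
# Not part of the original example: a short sketch of how the fitted LDA model
# can be queried beyond plotting. `transform` returns the per-document topic
# distribution (each row sums to 1) and `perplexity` gives a rough measure of
# model fit on the given documents (lower is better).
doc_topic_dist = lda.transform(tf)
print("Topic distribution of the first post:", doc_topic_dist[0].round(3))
print("Perplexity on the training documents: %.1f" % lda.perplexity(tf))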