
Commit a9a59af

Pushing the docs to dev/ for branch: main, commit 69132ebbd39f070590ca01813340b5b12c0d02ab
1 parent dbfdb10 commit a9a59af

1,243 files changed: +5362, -4276 lines

dev/_downloads/2b2bebba7f9fb4d03b9c12d63c8b44ad/plot_topics_extraction_with_nmf_lda.py

Lines changed: 72 additions & 3 deletions
@@ -30,13 +30,15 @@
 import matplotlib.pyplot as plt

 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from sklearn.decomposition import NMF, LatentDirichletAllocation
+from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
 from sklearn.datasets import fetch_20newsgroups

 n_samples = 2000
 n_features = 1000
 n_components = 10
 n_top_words = 20
+batch_size = 128
+init = "nndsvda"


 def plot_top_words(model, feature_names, n_top_words, title):
@@ -101,7 +103,15 @@ def plot_top_words(model, feature_names, n_top_words, title):
     "n_samples=%d and n_features=%d..." % (n_samples, n_features)
 )
 t0 = time()
-nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf)
+nmf = NMF(
+    n_components=n_components,
+    random_state=1,
+    init=init,
+    beta_loss="frobenius",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=1,
+).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))


@@ -121,10 +131,12 @@ def plot_top_words(model, feature_names, n_top_words, title):
 nmf = NMF(
     n_components=n_components,
     random_state=1,
+    init=init,
     beta_loss="kullback-leibler",
     solver="mu",
     max_iter=1000,
-    alpha=0.1,
+    alpha_W=0.00005,
+    alpha_H=0.00005,
     l1_ratio=0.5,
 ).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
@@ -137,6 +149,63 @@ def plot_top_words(model, feature_names, n_top_words, title):
     "Topics in NMF model (generalized Kullback-Leibler divergence)",
 )

+# Fit the MiniBatchNMF model
+print(
+    "\n" * 2,
+    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
+    "features, n_samples=%d and n_features=%d, batch_size=%d..."
+    % (n_samples, n_features, batch_size),
+)
+t0 = time()
+mbnmf = MiniBatchNMF(
+    n_components=n_components,
+    random_state=1,
+    batch_size=batch_size,
+    init=init,
+    beta_loss="frobenius",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=0.5,
+).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+
+tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+plot_top_words(
+    mbnmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in MiniBatchNMF model (Frobenius norm)",
+)
+
+# Fit the MiniBatchNMF model
+print(
+    "\n" * 2,
+    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
+    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
+    "batch_size=%d..." % (n_samples, n_features, batch_size),
+)
+t0 = time()
+mbnmf = MiniBatchNMF(
+    n_components=n_components,
+    random_state=1,
+    batch_size=batch_size,
+    init=init,
+    beta_loss="kullback-leibler",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=0.5,
+).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+plot_top_words(
+    mbnmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
+)
+
 print(
     "\n" * 2,
     "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."

dev/_downloads/b26574ccf9c31e12ab2afd8d683f3279/plot_topics_extraction_with_nmf_lda.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"# Author: Olivier Grisel <[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <[email protected]>\n# License: BSD 3 clause\n\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_components = 10\nn_top_words = 20\n\n\ndef plot_top_words(model, feature_names, n_top_words, title):\n fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)\n axes = axes.flatten()\n for topic_idx, topic in enumerate(model.components_):\n top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]\n top_features = [feature_names[i] for i in top_features_ind]\n weights = topic[top_features_ind]\n\n ax = axes[topic_idx]\n ax.barh(top_features, weights, height=0.7)\n ax.set_title(f\"Topic {topic_idx +1}\", fontdict={\"fontsize\": 30})\n ax.invert_yaxis()\n ax.tick_params(axis=\"both\", which=\"major\", labelsize=20)\n for i in \"top right left\".split():\n ax.spines[i].set_visible(False)\n fig.suptitle(title, fontsize=40)\n\n plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)\n plt.show()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndata, _ = fetch_20newsgroups(\n shuffle=True,\n random_state=1,\n remove=(\"headers\", \"footers\", \"quotes\"),\n return_X_y=True,\n)\ndata_samples = data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(\n max_df=0.95, min_df=2, max_features=n_features, stop_words=\"english\"\n)\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(\n max_df=0.95, min_df=2, max_features=n_features, stop_words=\"english\"\n)\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\nprint()\n\n# Fit the NMF model\nprint(\n \"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n \"n_samples=%d and n_features=%d...\" % (n_samples, n_features)\n)\nt0 = time()\nnmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n\ntfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\nplot_top_words(\n nmf, tfidf_feature_names, n_top_words, \"Topics in NMF model (Frobenius norm)\"\n)\n\n# Fit the NMF model\nprint(\n \"\\n\" * 2,\n \"Fitting the NMF model (generalized Kullback-Leibler \"\n \"divergence) with tf-idf features, n_samples=%d and n_features=%d...\"\n % (n_samples, n_features),\n)\nt0 = time()\nnmf = NMF(\n n_components=n_components,\n random_state=1,\n beta_loss=\"kullback-leibler\",\n solver=\"mu\",\n max_iter=1000,\n alpha=0.1,\n l1_ratio=0.5,\n).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\ntfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\nplot_top_words(\n nmf,\n tfidf_feature_names,\n n_top_words,\n \"Topics in NMF model (generalized Kullback-Leibler 
divergence)\",\n)\n\nprint(\n \"\\n\" * 2,\n \"Fitting LDA models with tf features, n_samples=%d and n_features=%d...\"\n % (n_samples, n_features),\n)\nlda = LatentDirichletAllocation(\n n_components=n_components,\n max_iter=5,\n learning_method=\"online\",\n learning_offset=50.0,\n random_state=0,\n)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\ntf_feature_names = tf_vectorizer.get_feature_names_out()\nplot_top_words(lda, tf_feature_names, n_top_words, \"Topics in LDA model\")"
+
"# Author: Olivier Grisel <[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <[email protected]>\n# License: BSD 3 clause\n\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_components = 10\nn_top_words = 20\nbatch_size = 128\ninit = \"nndsvda\"\n\n\ndef plot_top_words(model, feature_names, n_top_words, title):\n fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)\n axes = axes.flatten()\n for topic_idx, topic in enumerate(model.components_):\n top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]\n top_features = [feature_names[i] for i in top_features_ind]\n weights = topic[top_features_ind]\n\n ax = axes[topic_idx]\n ax.barh(top_features, weights, height=0.7)\n ax.set_title(f\"Topic {topic_idx +1}\", fontdict={\"fontsize\": 30})\n ax.invert_yaxis()\n ax.tick_params(axis=\"both\", which=\"major\", labelsize=20)\n for i in \"top right left\".split():\n ax.spines[i].set_visible(False)\n fig.suptitle(title, fontsize=40)\n\n plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)\n plt.show()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndata, _ = fetch_20newsgroups(\n shuffle=True,\n random_state=1,\n remove=(\"headers\", \"footers\", \"quotes\"),\n return_X_y=True,\n)\ndata_samples = data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(\n max_df=0.95, min_df=2, max_features=n_features, stop_words=\"english\"\n)\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(\n max_df=0.95, min_df=2, max_features=n_features, stop_words=\"english\"\n)\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\nprint()\n\n# Fit the NMF model\nprint(\n \"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n \"n_samples=%d and n_features=%d...\" % (n_samples, n_features)\n)\nt0 = time()\nnmf = NMF(\n n_components=n_components,\n random_state=1,\n init=init,\n beta_loss=\"frobenius\",\n alpha_W=0.00005,\n alpha_H=0.00005,\n l1_ratio=1,\n).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n\ntfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\nplot_top_words(\n nmf, tfidf_feature_names, n_top_words, \"Topics in NMF model (Frobenius norm)\"\n)\n\n# Fit the NMF model\nprint(\n \"\\n\" * 2,\n \"Fitting the NMF model (generalized Kullback-Leibler \"\n \"divergence) with tf-idf features, n_samples=%d and n_features=%d...\"\n % (n_samples, n_features),\n)\nt0 = time()\nnmf = NMF(\n n_components=n_components,\n random_state=1,\n init=init,\n beta_loss=\"kullback-leibler\",\n solver=\"mu\",\n max_iter=1000,\n alpha_W=0.00005,\n alpha_H=0.00005,\n l1_ratio=0.5,\n).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\ntfidf_feature_names = 
tfidf_vectorizer.get_feature_names_out()\nplot_top_words(\n nmf,\n tfidf_feature_names,\n n_top_words,\n \"Topics in NMF model (generalized Kullback-Leibler divergence)\",\n)\n\n# Fit the MiniBatchNMF model\nprint(\n \"\\n\" * 2,\n \"Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf \"\n \"features, n_samples=%d and n_features=%d, batch_size=%d...\"\n % (n_samples, n_features, batch_size),\n)\nt0 = time()\nmbnmf = MiniBatchNMF(\n n_components=n_components,\n random_state=1,\n batch_size=batch_size,\n init=init,\n beta_loss=\"frobenius\",\n alpha_W=0.00005,\n alpha_H=0.00005,\n l1_ratio=0.5,\n).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n\ntfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\nplot_top_words(\n mbnmf,\n tfidf_feature_names,\n n_top_words,\n \"Topics in MiniBatchNMF model (Frobenius norm)\",\n)\n\n# Fit the MiniBatchNMF model\nprint(\n \"\\n\" * 2,\n \"Fitting the MiniBatchNMF model (generalized Kullback-Leibler \"\n \"divergence) with tf-idf features, n_samples=%d and n_features=%d, \"\n \"batch_size=%d...\" % (n_samples, n_features, batch_size),\n)\nt0 = time()\nmbnmf = MiniBatchNMF(\n n_components=n_components,\n random_state=1,\n batch_size=batch_size,\n init=init,\n beta_loss=\"kullback-leibler\",\n alpha_W=0.00005,\n alpha_H=0.00005,\n l1_ratio=0.5,\n).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\ntfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\nplot_top_words(\n mbnmf,\n tfidf_feature_names,\n n_top_words,\n \"Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)\",\n)\n\nprint(\n \"\\n\" * 2,\n \"Fitting LDA models with tf features, n_samples=%d and n_features=%d...\"\n % (n_samples, n_features),\n)\nlda = LatentDirichletAllocation(\n n_components=n_components,\n max_iter=5,\n learning_method=\"online\",\n learning_offset=50.0,\n random_state=0,\n)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\ntf_feature_names = tf_vectorizer.get_feature_names_out()\nplot_top_words(lda, tf_feature_names, n_top_words, \"Topics in LDA model\")"
 ]
 }
 ],

dev/_downloads/scikit-learn-docs.zip (499 KB): Binary file not shown.

0 commit comments