Commit f85884a

Pushing the docs to dev/ for branch: master, commit ae4f7104d65cecef11c26208bb15266b43ddc0a2
1 parent 81884de commit f85884a

971 files changed: 3668 additions & 2945 deletions

2.78 KB: Binary file not shown.
1.79 KB: Binary file not shown.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "nbformat_minor": 0,
  "nbformat": 4,
  "cells": [
    {
      "execution_count": null,
      "cell_type": "code",
      "source": [
        "%matplotlib inline"
      ],
      "outputs": [],
      "metadata": {
        "collapsed": false
      }
    },
    {
      "source": [
        "\n# Beta-divergence loss functions\n\n\nA plot that compares the various Beta-divergence loss functions supported by\nthe Multiplicative-Update ('mu') solver in :class:`sklearn.decomposition.NMF`.\n\n"
      ],
      "cell_type": "markdown",
      "metadata": {}
    },
    {
      "execution_count": null,
      "cell_type": "code",
      "source": [
        "import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition.nmf import _beta_divergence\n\nprint(__doc__)\n\nx = np.linspace(0.001, 4, 1000)\ny = np.zeros(x.shape)\n\ncolors = 'mbgyr'\nfor j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):\n    for i, xi in enumerate(x):\n        y[i] = _beta_divergence(1, xi, 1, beta)\n    name = \"beta = %1.1f\" % beta\n    plt.plot(x, y, label=name, color=colors[j])\n\nplt.xlabel(\"x\")\nplt.title(\"beta-divergence(1, x)\")\nplt.legend(loc=0)\nplt.axis([0, 4, 0, 3])\nplt.show()"
      ],
      "outputs": [],
      "metadata": {
        "collapsed": false
      }
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 2",
      "name": "python2",
      "language": "python"
    },
    "language_info": {
      "mimetype": "text/x-python",
      "nbconvert_exporter": "python",
      "name": "python",
      "file_extension": ".py",
      "version": "2.7.12",
      "pygments_lexer": "ipython2",
      "codemirror_mode": {
        "version": 2,
        "name": "ipython"
      }
    }
  }
}
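
For reference, the quantity plotted by the new example, _beta_divergence(1, x, 1, beta), is the element-wise beta-divergence d_beta(1 | x). The following is a sketch of the standard definition from the NMF literature, written from memory rather than taken from this commit, but consistent with the special cases the example names:

\[
d_\beta(x \mid y) =
\begin{cases}
\dfrac{1}{\beta(\beta - 1)}\left(x^{\beta} + (\beta - 1)\,y^{\beta} - \beta\,x\,y^{\beta - 1}\right), & \beta \notin \{0, 1\},\\[1ex]
x \log\dfrac{x}{y} - x + y, & \beta = 1 \ \text{(generalized Kullback-Leibler)},\\[1ex]
\dfrac{x}{y} - \log\dfrac{x}{y} - 1, & \beta = 0 \ \text{(Itakura-Saito)}.
\end{cases}
\]

Setting beta = 2 in the general case recovers half the squared Euclidean distance, i.e. the Frobenius objective.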
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
"""
==============================
Beta-divergence loss functions
==============================

A plot that compares the various Beta-divergence loss functions supported by
the Multiplicative-Update ('mu') solver in :class:`sklearn.decomposition.NMF`.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition.nmf import _beta_divergence

print(__doc__)

x = np.linspace(0.001, 4, 1000)
y = np.zeros(x.shape)

colors = 'mbgyr'
for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
    for i, xi in enumerate(x):
        y[i] = _beta_divergence(1, xi, 1, beta)
    name = "beta = %1.1f" % beta
    plt.plot(x, y, label=name, color=colors[j])

plt.xlabel("x")
plt.title("beta-divergence(1, x)")
plt.legend(loc=0)
plt.axis([0, 4, 0, 3])
plt.show()
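
The same beta values drive NMF itself through the beta_loss parameter of the 'mu' solver. Below is a minimal sketch of that usage; the toy data and parameter values are illustrative assumptions, not part of this commit:

import numpy as np
from sklearn.decomposition import NMF

# Toy strictly positive matrix (hypothetical); positivity matters because
# beta_loss <= 0 (Itakura-Saito) does not allow zeros in the data.
rng = np.random.RandomState(0)
X = np.abs(rng.randn(20, 10)) + 1e-6

# beta_loss accepts a float (e.g. 0.5) or the named special cases:
# 'itakura-saito' (beta=0), 'kullback-leibler' (beta=1), 'frobenius' (beta=2).
for beta_loss in ('itakura-saito', 'kullback-leibler', 'frobenius'):
    model = NMF(n_components=3, solver='mu', beta_loss=beta_loss,
                init='random', max_iter=500, random_state=0)
    model.fit(X)
    print(beta_loss, model.reconstruction_err_)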

dev/_downloads/scikit-learn-docs.pdf

178 KB: Binary file not shown.

dev/_downloads/topics_extraction_with_nmf_lda.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
},
{
"source": [
- (removed):
"\n# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation\n\n\nThis is an example of applying :class:`sklearn.decomposition.NMF`\nand :class:`sklearn.decomposition.LatentDirichletAllocation` on a corpus of documents and\nextract additive models of the topic structure of the corpus.\nThe output is a list of topics, each represented as a list of terms\n(weights are not shown).\n\nThe default parameters (n_samples / n_features / n_topics) should make\nthe example runnable in a couple of tens of seconds. You can try to\nincrease the dimensions of the problem, but be aware that the time\ncomplexity is polynomial in NMF. In LDA, the time complexity is\nproportional to (n_samples * iterations).\n\n"
+ (added):
"\n# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation\n\n\nThis is an example of applying :class:`sklearn.decomposition.NMF`\nand :class:`sklearn.decomposition.LatentDirichletAllocation` on a corpus of documents and\nextract additive models of the topic structure of the corpus.\nThe output is a list of topics, each represented as a list of terms\n(weights are not shown).\n\nNon-negative Matrix Factorization is applied with two different objective\nfunctions: the Frobenius norm, and the generalized Kullback-Leibler divergence.\nThe latter is equivalent to Probabilistic Latent Semantic Indexing.\n\nThe default parameters (n_samples / n_features / n_topics) should make\nthe example runnable in a couple of tens of seconds. You can try to\nincrease the dimensions of the problem, but be aware that the time\ncomplexity is polynomial in NMF. In LDA, the time complexity is\nproportional to (n_samples * iterations).\n\n"
],
"cell_type": "markdown",
"metadata": {}
@@ -24,7 +24,7 @@
"execution_count": null,
"cell_type": "code",
"source": [
- (removed):
"# Author: Olivier Grisel <[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\nfrom time import time\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_topics = 10\nn_top_words = 20\n\n\ndef print_top_words(model, feature_names, n_top_words):\n for topic_idx, topic in enumerate(model.components_):\n print(\"Topic #%d:\" % topic_idx)\n print(\" \".join([feature_names[i]\n for i in topic.argsort()[:-n_top_words - 1:-1]]))\n print()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndataset = fetch_20newsgroups(shuffle=True, random_state=1,\n remove=('headers', 'footers', 'quotes'))\ndata_samples = dataset.data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n max_features=n_features,\n stop_words='english')\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,\n max_features=n_features,\n stop_words='english')\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Fit the NMF model\nprint(\"Fitting the NMF model with tf-idf features, \"\n \"n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_topics, random_state=1,\n alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model:\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\nprint(\"Fitting LDA models with tf features, \"\n \"n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nlda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,\n learning_method='online',\n learning_offset=50.,\n random_state=0)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in LDA model:\")\ntf_feature_names = tf_vectorizer.get_feature_names()\nprint_top_words(lda, tf_feature_names, n_top_words)"
+ (added):
"# Author: Olivier Grisel <[email protected]>\n# Lars Buitinck\n# Chyi-Kwei Yau <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\nfrom time import time\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.decomposition import NMF, LatentDirichletAllocation\nfrom sklearn.datasets import fetch_20newsgroups\n\nn_samples = 2000\nn_features = 1000\nn_topics = 10\nn_top_words = 20\n\n\ndef print_top_words(model, feature_names, n_top_words):\n for topic_idx, topic in enumerate(model.components_):\n message = \"Topic #%d: \" % topic_idx\n message += \" \".join([feature_names[i]\n for i in topic.argsort()[:-n_top_words - 1:-1]])\n print(message)\n print()\n\n\n# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics\n# to filter out useless terms early on: the posts are stripped of headers,\n# footers and quoted replies, and common English words, words occurring in\n# only one document or in at least 95% of the documents are removed.\n\nprint(\"Loading dataset...\")\nt0 = time()\ndataset = fetch_20newsgroups(shuffle=True, random_state=1,\n remove=('headers', 'footers', 'quotes'))\ndata_samples = dataset.data[:n_samples]\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf-idf features for NMF.\nprint(\"Extracting tf-idf features for NMF...\")\ntfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n max_features=n_features,\n stop_words='english')\nt0 = time()\ntfidf = tfidf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\n# Use tf (raw term count) features for LDA.\nprint(\"Extracting tf features for LDA...\")\ntf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,\n max_features=n_features,\n stop_words='english')\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint(\"done in %0.3fs.\" % (time() - t0))\nprint()\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n \"n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_topics, random_state=1,\n alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (Frobenius norm):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\n# Fit the NMF model\nprint(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n \"tf-idf features, n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nt0 = time()\nnmf = NMF(n_components=n_topics, random_state=1, beta_loss='kullback-leibler',\n solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in NMF model (generalized Kullback-Leibler divergence):\")\ntfidf_feature_names = tfidf_vectorizer.get_feature_names()\nprint_top_words(nmf, tfidf_feature_names, n_top_words)\n\nprint(\"Fitting LDA models with tf features, \"\n \"n_samples=%d and n_features=%d...\"\n % (n_samples, n_features))\nlda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,\n learning_method='online',\n learning_offset=50.,\n random_state=0)\nt0 = time()\nlda.fit(tf)\nprint(\"done in %0.3fs.\" % (time() - t0))\n\nprint(\"\\nTopics in LDA model:\")\ntf_feature_names = tf_vectorizer.get_feature_names()\nprint_top_words(lda, tf_feature_names, n_top_words)"
],
"outputs": [],
"metadata": {

dev/_downloads/topics_extraction_with_nmf_lda.py

Lines changed: 24 additions & 5 deletions
@@ -9,6 +9,10 @@
 The output is a list of topics, each represented as a list of terms
 (weights are not shown).

+Non-negative Matrix Factorization is applied with two different objective
+functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
+The latter is equivalent to Probabilistic Latent Semantic Indexing.
+
 The default parameters (n_samples / n_features / n_topics) should make
 the example runnable in a couple of tens of seconds. You can try to
 increase the dimensions of the problem, but be aware that the time
@@ -36,9 +40,10 @@

 def print_top_words(model, feature_names, n_top_words):
     for topic_idx, topic in enumerate(model.components_):
-        print("Topic #%d:" % topic_idx)
-        print(" ".join([feature_names[i]
-                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
+        message = "Topic #%d: " % topic_idx
+        message += " ".join([feature_names[i]
+                             for i in topic.argsort()[:-n_top_words - 1:-1]])
+        print(message)
     print()


@@ -71,17 +76,31 @@ def print_top_words(model, feature_names, n_top_words):
 t0 = time()
 tf = tf_vectorizer.fit_transform(data_samples)
 print("done in %0.3fs." % (time() - t0))
+print()

 # Fit the NMF model
-print("Fitting the NMF model with tf-idf features, "
+print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
       "n_samples=%d and n_features=%d..."
       % (n_samples, n_features))
 t0 = time()
 nmf = NMF(n_components=n_topics, random_state=1,
           alpha=.1, l1_ratio=.5).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))

-print("\nTopics in NMF model:")
+print("\nTopics in NMF model (Frobenius norm):")
+tfidf_feature_names = tfidf_vectorizer.get_feature_names()
+print_top_words(nmf, tfidf_feature_names, n_top_words)
+
+# Fit the NMF model
+print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
+      "tf-idf features, n_samples=%d and n_features=%d..."
+      % (n_samples, n_features))
+t0 = time()
+nmf = NMF(n_components=n_topics, random_state=1, beta_loss='kullback-leibler',
+          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
 tfidf_feature_names = tfidf_vectorizer.get_feature_names()
 print_top_words(nmf, tfidf_feature_names, n_top_words)
