
Commit cf81163

Commit message: update to master again
1 parent: 406134e

1,738 files changed (+8,166 / -6,806 lines)

Some content is hidden: large commits have some content, including file names, hidden by default.

dev/_downloads/plot_cv_predict.py

Lines changed: 2 additions & 2 deletions
@@ -20,9 +20,9 @@
 # is a prediction obtained by cross validated:
 predicted = cross_val_predict(lr, boston.data, y, cv=10)
 
-fig,ax = plt.subplots()
+fig, ax = plt.subplots()
 ax.scatter(y, predicted)
 ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
 ax.set_xlabel('Measured')
 ax.set_ylabel('Predicted')
-fig.show()
+plt.show()
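For context, the changed lines sit in a short script; a minimal sketch of how the full example fits together as of this commit (assuming the era's sklearn.cross_validation module, which other files in this changeset also import). The fix itself is sound: plt.show() blocks and works through pyplot's figure manager, whereas fig.show() requires an interactive backend and may return immediately.

from sklearn import datasets, linear_model
from sklearn.cross_validation import cross_val_predict
import matplotlib.pyplot as plt

lr = linear_model.LinearRegression()
boston = datasets.load_boston()
y = boston.target

# For each sample, cross_val_predict returns the prediction made by the
# model trained on the fold that held that sample out.
predicted = cross_val_predict(lr, boston.data, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()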
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+"""
+=========================================================
+Using FunctionTransformer to select columns
+=========================================================
+
+Shows how to use a function transformer in a pipeline. If you know your
+dataset's first principal component is irrelevant for a classification task,
+you can use the FunctionTransformer to select all but the first column of the
+PCA transformed data.
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.cross_validation import train_test_split
+from sklearn.decomposition import PCA
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer
+
+
+def _generate_vector(shift=0.5, noise=15):
+    return np.arange(1000) + (np.random.rand(1000) - shift) * noise
+
+
+def generate_dataset():
+    """
+    This dataset is two lines with a slope ~ 1, where one has
+    a y offset of ~100
+    """
+    return np.vstack((
+        np.vstack((
+            _generate_vector(),
+            _generate_vector() + 100,
+        )).T,
+        np.vstack((
+            _generate_vector(),
+            _generate_vector(),
+        )).T,
+    )), np.hstack((np.zeros(1000), np.ones(1000)))
+
+
+def all_but_first_column(X):
+    return X[:, 1:]
+
+
+def drop_first_component(X, y):
+    """
+    Create a pipeline with PCA and the column selector and use it to
+    transform the dataset.
+    """
+    pipeline = make_pipeline(
+        PCA(), FunctionTransformer(all_but_first_column),
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    pipeline.fit(X_train, y_train)
+    return pipeline.transform(X_test), y_test
+
+
+if __name__ == '__main__':
+    X, y = generate_dataset()
+    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
+    plt.show()
+    X_transformed, y_transformed = drop_first_component(*generate_dataset())
+    plt.scatter(
+        X_transformed[:, 0],
+        np.zeros(len(X_transformed)),
+        c=y_transformed,
+        s=50,
+    )
+    plt.show()
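FunctionTransformer wraps a plain callable as a stateless transformer, so the column selector above can be sanity-checked on its own. A minimal sketch (the toy array below is illustrative, not part of the commit):

import numpy as np
from sklearn.preprocessing import FunctionTransformer

def all_but_first_column(X):
    return X[:, 1:]

X = np.arange(12).reshape(4, 3)
selector = FunctionTransformer(all_but_first_column)
print(selector.fit_transform(X))  # drops column 0, keeping columns 1 and 2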

dev/_downloads/plot_image_denoising.py

Lines changed: 0 additions & 1 deletion
@@ -46,7 +46,6 @@
 
 ###############################################################################
 # Load Lena image and extract patches
-
 lena = lena() / 256.0
 
 # downsample for higher speed
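The code around this hunk builds a dictionary from image patches. A minimal sketch of the patch-extraction step the example relies on, with random data standing in for the Lena image (the (7, 7) patch size matches the example; the rest is illustrative):

import numpy as np
from sklearn.feature_extraction.image import extract_patches_2d

image = np.random.rand(64, 64)  # stand-in for the downsampled grayscale image
patches = extract_patches_2d(image, (7, 7), max_patches=100, random_state=0)
print(patches.shape)  # (100, 7, 7): one 7x7 window per extracted patch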
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+"""
+====================================
+Demonstration of k-means assumptions
+====================================
+
+This example is meant to illustrate situations where k-means will produce
+unintuitive and possibly unexpected clusters. In the first three plots, the
+input data does not conform to some implicit assumption that k-means makes and
+undesirable clusters are produced as a result. In the last plot, k-means
+returns intuitive clusters despite unevenly sized blobs.
+"""
+print(__doc__)
+
+# Author: Phil Roth <[email protected]>
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
+plt.figure(figsize=(12, 12))
+
+n_samples = 1500
+random_state = 170
+X, y = make_blobs(n_samples=n_samples, random_state=random_state)
+
+# Incorrect number of clusters
+y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)
+
+plt.subplot(221)
+plt.scatter(X[:, 0], X[:, 1], c=y_pred)
+plt.title("Incorrect Number of Blobs")
+
+# Anisotropically distributed data
+transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
+X_aniso = np.dot(X, transformation)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_aniso)
+
+plt.subplot(222)
+plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+plt.title("Anisotropically Distributed Blobs")
+
+# Different variance
+X_varied, y_varied = make_blobs(n_samples=n_samples,
+                                cluster_std=[1.0, 2.5, 0.5],
+                                random_state=random_state)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
+
+plt.subplot(223)
+plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+plt.title("Unequal Variance")
+
+# Unevenly sized blobs
+X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)
+
+plt.subplot(224)
+plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
+plt.title("Unevenly Sized Blobs")
+
+plt.show()
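The first panel fails only because k is mis-specified; a quick sketch (not part of the commit) confirming that the same blobs are recovered once n_clusters matches the three generated clusters:

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1500, random_state=170)
y_pred = KMeans(n_clusters=3, random_state=170).fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("Correct Number of Blobs")  # illustrative title, not from the example
plt.show()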
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""
+=========================================================================================
+Topics extraction with Non-Negative Matrix Factorization and Latent Dirichlet Allocation
+=========================================================================================
+
+This is an example of applying Non-Negative Matrix Factorization
+and Latent Dirichlet Allocation on a corpus of documents to
+extract additive models of the topic structure of the corpus.
+The output is a list of topics, each represented as a list of terms
+(weights are not shown).
+
+The default parameters (n_samples / n_features / n_topics) should make
+the example runnable in a couple of tens of seconds. You can try to
+increase the dimensions of the problem, but be aware that the time
+complexity is polynomial in NMF. In LDA, the time complexity is
+proportional to (n_samples * iterations).
+"""
+
+# Author: Olivier Grisel <[email protected]>
+#         Lars Buitinck <[email protected]>
+#         Chyi-Kwei Yau <[email protected]>
+# License: BSD 3 clause
+
+from __future__ import print_function
+from time import time
+
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.decomposition import NMF, LatentDirichletAllocation
+from sklearn.datasets import fetch_20newsgroups
+
+n_samples = 2000
+n_features = 1000
+n_topics = 10
+n_top_words = 20
+
+
+def print_top_words(model, feature_names, n_top_words):
+    for topic_idx, topic in enumerate(model.components_):
+        print("Topic #%d:" % topic_idx)
+        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
+    print()
+
+
+# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
+# to filter out useless terms early on: the posts are stripped of headers,
+# footers and quoted replies, and common English words, words occurring in
+# only one document or in at least 95% of the documents are removed.
+
+t0 = time()
+print("Loading dataset and extracting features...")
+dataset = fetch_20newsgroups(shuffle=True, random_state=1,
+                             remove=('headers', 'footers', 'quotes'))
+data_samples = dataset.data[:n_samples]
+
+# use tf-idf features for the NMF model
+tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
+                                   stop_words='english')
+tfidf = tfidf_vectorizer.fit_transform(data_samples)
+
+# use tf (raw term count) features for the LDA model
+tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
+                                stop_words='english')
+tf = tf_vectorizer.fit_transform(data_samples)
+print("done in %0.3fs." % (time() - t0))
+
+# Fit the NMF model
+print("Fitting the NMF model with tf-idf features, n_samples=%d and n_features=%d..."
+      % (n_samples, n_features))
+nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+print("\nTopics in NMF model:")
+tfidf_feature_names = tfidf_vectorizer.get_feature_names()
+print_top_words(nmf, tfidf_feature_names, n_top_words)
+
+print("\nFitting LDA models with tf features, n_samples=%d and n_features=%d..."
+      % (n_samples, n_features))
+
+lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
+                                learning_method='online', learning_offset=50.,
+                                random_state=0)
+lda.fit(tf)
+print("done in %0.3fs." % (time() - t0))
+
+print("\nTopics in LDA model:")
+tf_feature_names = tf_vectorizer.get_feature_names()
+print_top_words(lda, tf_feature_names, n_top_words)
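Once fitted, either model can score unseen text through its matching vectorizer. A hedged sketch reusing the lda and tf_vectorizer objects from the script above (the sample document is made up):

new_doc = ["graphics cards render 3d images faster than software"]
doc_topic = lda.transform(tf_vectorizer.transform(new_doc))
print(doc_topic.shape)     # (1, n_topics): per-topic weights for the document
print(doc_topic.argmax())  # index of the document's dominant topic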

dev/_images/lda_model_graph.png (binary image, 18.6 KB)

0 commit comments
