
Commit cf81163

Commit message: update to master again
1 parent: 406134e

1,738 files changed (+8,166 / -6,806 lines)

Some content is hidden: large commits have some content, including file names, hidden by default.

dev/_downloads/plot_cv_predict.py

Lines changed: 2 additions & 2 deletions
@@ -20,9 +20,9 @@
 # is a prediction obtained by cross validated:
 predicted = cross_val_predict(lr, boston.data, y, cv=10)
 
-fig,ax = plt.subplots()
+fig, ax = plt.subplots()
 ax.scatter(y, predicted)
 ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
 ax.set_xlabel('Measured')
 ax.set_ylabel('Predicted')
-fig.show()
+plt.show()
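For context, the changed lines sit in a short script; a minimal sketch of how the full example fits together as of this commit (assuming the era's sklearn.cross_validation module, which other files in this changeset also import). The fix itself is sound: plt.show() blocks and works through pyplot's figure manager, whereas fig.show() requires an interactive backend and may return immediately.

from sklearn import datasets, linear_model
from sklearn.cross_validation import cross_val_predict
import matplotlib.pyplot as plt

lr = linear_model.LinearRegression()
boston = datasets.load_boston()
y = boston.target

# For each sample, cross_val_predict returns the prediction made by the
# model trained on the fold that held that sample out.
predicted = cross_val_predict(lr, boston.data, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()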
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+"""
+=========================================================
+Using FunctionTransformer to select columns
+=========================================================
+
+Shows how to use a function transformer in a pipeline. If you know your
+dataset's first principal component is irrelevant for a classification task,
+you can use the FunctionTransformer to select all but the first column of the
+PCA transformed data.
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.cross_validation import train_test_split
+from sklearn.decomposition import PCA
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer
+
+
+def _generate_vector(shift=0.5, noise=15):
+    return np.arange(1000) + (np.random.rand(1000) - shift) * noise
+
+
+def generate_dataset():
+    """
+    This dataset is two lines with a slope ~ 1, where one has
+    a y offset of ~100
+    """
+    return np.vstack((
+        np.vstack((
+            _generate_vector(),
+            _generate_vector() + 100,
+        )).T,
+        np.vstack((
+            _generate_vector(),
+            _generate_vector(),
+        )).T,
+    )), np.hstack((np.zeros(1000), np.ones(1000)))
+
+
+def all_but_first_column(X):
+    return X[:, 1:]
+
+
+def drop_first_component(X, y):
+    """
+    Create a pipeline with PCA and the column selector and use it to
+    transform the dataset.
+    """
+    pipeline = make_pipeline(
+        PCA(), FunctionTransformer(all_but_first_column),
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    pipeline.fit(X_train, y_train)
+    return pipeline.transform(X_test), y_test
+
+
+if __name__ == '__main__':
+    X, y = generate_dataset()
+    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
+    plt.show()
+    X_transformed, y_transformed = drop_first_component(*generate_dataset())
+    plt.scatter(
+        X_transformed[:, 0],
+        np.zeros(len(X_transformed)),
+        c=y_transformed,
+        s=50,
+    )
+    plt.show()
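FunctionTransformer wraps a plain callable as a stateless transformer, so the column selector above can be sanity-checked on its own. A minimal sketch (the toy array below is illustrative, not part of the commit):

import numpy as np
from sklearn.preprocessing import FunctionTransformer

def all_but_first_column(X):
    return X[:, 1:]

X = np.arange(12).reshape(4, 3)
selector = FunctionTransformer(all_but_first_column)
print(selector.fit_transform(X))  # drops column 0, keeping columns 1 and 2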

dev/_downloads/plot_image_denoising.py

Lines changed: 0 additions & 1 deletion
@@ -46,7 +46,6 @@
 
 ###############################################################################
 # Load Lena image and extract patches
-
 lena = lena() / 256.0
 
 # downsample for higher speed
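The code around this hunk builds a dictionary from image patches. A minimal sketch of the patch-extraction step the example relies on, with random data standing in for the Lena image (the (7, 7) patch size matches the example; the rest is illustrative):

import numpy as np
from sklearn.feature_extraction.image import extract_patches_2d

image = np.random.rand(64, 64)  # stand-in for the downsampled grayscale image
patches = extract_patches_2d(image, (7, 7), max_patches=100, random_state=0)
print(patches.shape)  # (100, 7, 7): one 7x7 window per extracted patch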
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+"""
+====================================
+Demonstration of k-means assumptions
+====================================
+
+This example is meant to illustrate situations where k-means will produce
+unintuitive and possibly unexpected clusters. In the first three plots, the
+input data does not conform to some implicit assumption that k-means makes and
+undesirable clusters are produced as a result. In the last plot, k-means
+returns intuitive clusters despite unevenly sized blobs.
+"""
+print(__doc__)
+
+# Author: Phil Roth <[email protected]>
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
+plt.figure(figsize=(12, 12))
+
+n_samples = 1500
+random_state = 170
+X, y = make_blobs(n_samples=n_samples, random_state=random_state)
+
+# Incorrect number of clusters
+y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)
+
+plt.subplot(221)
+plt.scatter(X[:, 0], X[:, 1], c=y_pred)
+plt.title("Incorrect Number of Blobs")
+
+# Anisotropically distributed data
+transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
+X_aniso = np.dot(X, transformation)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_aniso)
+
+plt.subplot(222)
+plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+plt.title("Anisotropically Distributed Blobs")
+
+# Different variance
+X_varied, y_varied = make_blobs(n_samples=n_samples,
+                                cluster_std=[1.0, 2.5, 0.5],
+                                random_state=random_state)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
+
+plt.subplot(223)
+plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+plt.title("Unequal Variance")
+
+# Unevenly sized blobs
+X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)
+
+plt.subplot(224)
+plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
+plt.title("Unevenly Sized Blobs")
+
+plt.show()
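The first panel fails only because k is mis-specified; a quick sketch (not part of the commit) confirming that the same blobs are recovered once n_clusters matches the three generated clusters:

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1500, random_state=170)
y_pred = KMeans(n_clusters=3, random_state=170).fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("Correct Number of Blobs")  # illustrative title, not from the example
plt.show()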
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""
+=========================================================================================
+Topics extraction with Non-Negative Matrix Factorization and Latent Dirichlet Allocation
+=========================================================================================
+
+This is an example of applying Non-Negative Matrix Factorization
+and Latent Dirichlet Allocation on a corpus of documents to
+extract additive models of the topic structure of the corpus.
+The output is a list of topics, each represented as a list of terms
+(weights are not shown).
+
+The default parameters (n_samples / n_features / n_topics) should make
+the example runnable in a couple of tens of seconds. You can try to
+increase the dimensions of the problem, but be aware that the time
+complexity is polynomial in NMF. In LDA, the time complexity is
+proportional to (n_samples * iterations).
+"""
+
+# Author: Olivier Grisel <[email protected]>
+#         Lars Buitinck <[email protected]>
+#         Chyi-Kwei Yau <[email protected]>
+# License: BSD 3 clause
+
+from __future__ import print_function
+from time import time
+
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.decomposition import NMF, LatentDirichletAllocation
+from sklearn.datasets import fetch_20newsgroups
+
+n_samples = 2000
+n_features = 1000
+n_topics = 10
+n_top_words = 20
+
+
+def print_top_words(model, feature_names, n_top_words):
+    for topic_idx, topic in enumerate(model.components_):
+        print("Topic #%d:" % topic_idx)
+        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
+    print()
+
+
+# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
+# to filter out useless terms early on: the posts are stripped of headers,
+# footers and quoted replies, and common English words, words occurring in
+# only one document or in at least 95% of the documents are removed.
+
+t0 = time()
+print("Loading dataset and extracting features...")
+dataset = fetch_20newsgroups(shuffle=True, random_state=1,
+                             remove=('headers', 'footers', 'quotes'))
+data_samples = dataset.data[:n_samples]
+
+# use tf-idf features for the NMF model
+tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
+                                   stop_words='english')
+tfidf = tfidf_vectorizer.fit_transform(data_samples)
+
+# use tf (raw term count) features for the LDA model
+tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
+                                stop_words='english')
+tf = tf_vectorizer.fit_transform(data_samples)
+print("done in %0.3fs." % (time() - t0))
+
+# Fit the NMF model
+print("Fitting the NMF model with tf-idf features, n_samples=%d and n_features=%d..."
+      % (n_samples, n_features))
+nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+print("\nTopics in NMF model:")
+tfidf_feature_names = tfidf_vectorizer.get_feature_names()
+print_top_words(nmf, tfidf_feature_names, n_top_words)
+
+print("\nFitting LDA models with tf features, n_samples=%d and n_features=%d..."
+      % (n_samples, n_features))
+
+lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
+                                learning_method='online', learning_offset=50.,
+                                random_state=0)
+lda.fit(tf)
+print("done in %0.3fs." % (time() - t0))
+
+print("\nTopics in LDA model:")
+tf_feature_names = tf_vectorizer.get_feature_names()
+print_top_words(lda, tf_feature_names, n_top_words)
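Once fitted, either model can score unseen text through its matching vectorizer. A hedged sketch reusing the lda and tf_vectorizer objects from the script above (the sample document is made up):

new_doc = ["graphics cards render 3d images faster than software"]
doc_topic = lda.transform(tf_vectorizer.transform(new_doc))
print(doc_topic.shape)     # (1, n_topics): per-topic weights for the document
print(doc_topic.argmax())  # index of the document's dominant topic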

dev/_images/lda_model_graph.png (binary image, 18.6 KB)

0 commit comments
