
Commit d49d1aa (1 parent: 486da37)

Pushing the docs to dev/ for branch: master, commit d31b67f23d3d785b17261ada293710380683bc82

1,119 files changed: +7028 / -3485 lines

Two binary files changed (13.6 KB and 10.8 KB); contents not shown.

dev/_downloads/plot_lle_digits.ipynb

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

dev/_downloads/plot_lle_digits.py

Lines changed: 31 additions & 15 deletions
@@ -15,6 +15,11 @@
 this example, which is not the default setting. It ensures global stability
 of the embedding, i.e., the embedding does not depend on random
 initialization.
+
+Linear Discriminant Analysis, from the :mod:`sklearn.discriminant_analysis`
+module, and Neighborhood Components Analysis, from the :mod:`sklearn.neighbors`
+module, are supervised dimensionality reduction method, i.e. they make use of
+the provided labels, contrary to other methods.
 """
 
 # Authors: Fabian Pedregosa <[email protected]>
@@ -30,7 +35,7 @@
 import matplotlib.pyplot as plt
 from matplotlib import offsetbox
 from sklearn import (manifold, datasets, decomposition, ensemble,
-                     discriminant_analysis, random_projection)
+                     discriminant_analysis, random_projection, neighbors)
 
 digits = datasets.load_digits(n_class=6)
 X = digits.data
@@ -39,7 +44,7 @@
 n_neighbors = 30
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Scale and visualize the embedding vectors
 def plot_embedding(X, title=None):
     x_min, x_max = np.min(X, 0), np.max(X, 0)
@@ -70,7 +75,7 @@ def plot_embedding(X, title=None):
         plt.title(title)
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Plot images of the digits
 n_img_per_row = 20
 img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
@@ -86,7 +91,7 @@ def plot_embedding(X, title=None):
 plt.title('A selection from the 64-dimensional digits dataset')
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Random 2D projection using a random unitary matrix
 print("Computing random projection")
 rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
@@ -104,7 +109,7 @@ def plot_embedding(X, title=None):
                "Principal Components projection of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Projection on to the first 2 linear discriminant components
 
 print("Computing Linear Discriminant Analysis projection")
@@ -117,9 +122,9 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Isomap projection of the digits dataset
-print("Computing Isomap embedding")
+print("Computing Isomap projection")
 t0 = time()
 X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X)
 print("Done.")
@@ -128,7 +133,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Locally linear embedding of the digits dataset
 print("Computing LLE embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -141,7 +146,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Modified Locally linear embedding of the digits dataset
 print("Computing modified LLE embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -154,7 +159,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # HLLE embedding of the digits dataset
 print("Computing Hessian LLE embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -167,7 +172,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # LTSA embedding of the digits dataset
 print("Computing LTSA embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -179,7 +184,7 @@ def plot_embedding(X, title=None):
                "Local Tangent Space Alignment of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # MDS embedding of the digits dataset
 print("Computing MDS embedding")
 clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
@@ -190,7 +195,7 @@ def plot_embedding(X, title=None):
                "MDS embedding of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Random Trees embedding of the digits dataset
 print("Computing Totally Random Trees embedding")
 hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
@@ -204,7 +209,7 @@ def plot_embedding(X, title=None):
                "Random forest embedding of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Spectral embedding of the digits dataset
 print("Computing Spectral embedding")
 embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
@@ -216,7 +221,7 @@ def plot_embedding(X, title=None):
                "Spectral embedding of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # t-SNE embedding of the digits dataset
 print("Computing t-SNE embedding")
 tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
@@ -227,4 +232,15 @@ def plot_embedding(X, title=None):
                "t-SNE embedding of the digits (time %.2fs)" %
                (time() - t0))
 
+# ----------------------------------------------------------------------
+# NCA projection of the digits dataset
+print("Computing NCA projection")
+nca = neighbors.NeighborhoodComponentsAnalysis(n_components=2, random_state=0)
+t0 = time()
+X_nca = nca.fit_transform(X, y)
+
+plot_embedding(X_nca,
+               "NCA embedding of the digits (time %.2fs)" %
+               (time() - t0))
+
 plt.show()
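
The docstring added in this diff draws a line between supervised and unsupervised dimensionality reduction: LDA and NCA consume the class labels, while the other reducers in the example only see the features. A minimal sketch of that API difference, using the same digits data as the example (the variable names here are illustrative, not part of the commit):

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.neighbors import NeighborhoodComponentsAnalysis

X, y = load_digits(n_class=6, return_X_y=True)

# Unsupervised: the projection is computed from X alone.
X_pca = PCA(n_components=2).fit_transform(X)

# Supervised: the labels are passed to fit_transform and shape the projection.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=0)
X_nca = nca.fit_transform(X, y)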
Lines changed: 54 additions & 0 deletions (new file)
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Comparing Nearest Neighbors with and without Neighborhood Components Analysis\n\n\nAn example comparing nearest neighbors classification with and without\nNeighborhood Components Analysis.\n\nIt will plot the class decision boundaries given by a Nearest Neighbors\nclassifier when using the Euclidean distance on the original features, versus\nusing the Euclidean distance after the transformation learned by Neighborhood\nComponents Analysis. The latter aims to find a linear transformation that\nmaximises the (stochastic) nearest neighbor classification accuracy on the\ntraining set.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.neighbors import (KNeighborsClassifier,\n                               NeighborhoodComponentsAnalysis)\nfrom sklearn.pipeline import Pipeline\n\n\nprint(__doc__)\n\nn_neighbors = 1\n\ndataset = datasets.load_iris()\nX, y = dataset.data, dataset.target\n\n# we only take two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = X[:, [0, 2]]\n\nX_train, X_test, y_train, y_test = \\\n    train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)\n\nh = .01  # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n\nnames = ['KNN', 'NCA, KNN']\n\nclassifiers = [Pipeline([('scaler', StandardScaler()),\n                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))\n                         ]),\n               Pipeline([('scaler', StandardScaler()),\n                         ('nca', NeighborhoodComponentsAnalysis()),\n                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))\n                         ])\n               ]\n\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n                     np.arange(y_min, y_max, h))\n\nfor name, clf in zip(names, classifiers):\n\n    clf.fit(X_train, y_train)\n    score = clf.score(X_test, y_test)\n\n    # Plot the decision boundary. For that, we will assign a color to each\n    # point in the mesh [x_min, x_max]x[y_min, y_max].\n    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n    # Put the result into a color plot\n    Z = Z.reshape(xx.shape)\n    plt.figure()\n    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8)\n\n    # Plot also the training and testing points\n    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)\n    plt.xlim(xx.min(), xx.max())\n    plt.ylim(yy.min(), yy.max())\n    plt.title(\"{} (k = {})\".format(name, n_neighbors))\n    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,\n             ha='center', va='center', transform=plt.gca().transAxes)\n\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.8"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
Lines changed: 88 additions & 0 deletions (new file)
@@ -0,0 +1,88 @@
+"""
+=============================================================================
+Comparing Nearest Neighbors with and without Neighborhood Components Analysis
+=============================================================================
+
+An example comparing nearest neighbors classification with and without
+Neighborhood Components Analysis.
+
+It will plot the class decision boundaries given by a Nearest Neighbors
+classifier when using the Euclidean distance on the original features, versus
+using the Euclidean distance after the transformation learned by Neighborhood
+Components Analysis. The latter aims to find a linear transformation that
+maximises the (stochastic) nearest neighbor classification accuracy on the
+training set.
+"""
+
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.colors import ListedColormap
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import (KNeighborsClassifier,
+                               NeighborhoodComponentsAnalysis)
+from sklearn.pipeline import Pipeline
+
+
+print(__doc__)
+
+n_neighbors = 1
+
+dataset = datasets.load_iris()
+X, y = dataset.data, dataset.target
+
+# we only take two features. We could avoid this ugly
+# slicing by using a two-dim dataset
+X = X[:, [0, 2]]
+
+X_train, X_test, y_train, y_test = \
+    train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)
+
+h = .01  # step size in the mesh
+
+# Create color maps
+cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
+cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
+
+names = ['KNN', 'NCA, KNN']
+
+classifiers = [Pipeline([('scaler', StandardScaler()),
+                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
+                         ]),
+               Pipeline([('scaler', StandardScaler()),
+                         ('nca', NeighborhoodComponentsAnalysis()),
+                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
+                         ])
+               ]
+
+x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
+y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                     np.arange(y_min, y_max, h))
+
+for name, clf in zip(names, classifiers):
+
+    clf.fit(X_train, y_train)
+    score = clf.score(X_test, y_test)
+
+    # Plot the decision boundary. For that, we will assign a color to each
+    # point in the mesh [x_min, x_max]x[y_min, y_max].
+    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
+
+    # Put the result into a color plot
+    Z = Z.reshape(xx.shape)
+    plt.figure()
+    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8)
+
+    # Plot also the training and testing points
+    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
+    plt.xlim(xx.min(), xx.max())
+    plt.ylim(yy.min(), yy.max())
+    plt.title("{} (k = {})".format(name, n_neighbors))
+    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,
+             ha='center', va='center', transform=plt.gca().transAxes)
+
+plt.show()
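
The docstring of this new example says NCA learns a linear transformation that maximises stochastic nearest-neighbor accuracy on the training set. A small hedged sketch of how one might inspect that learned transformation after fitting; it assumes NeighborhoodComponentsAnalysis exposes the fitted matrix as components_ with shape (n_components, n_features), and that transform() applies it as a plain matrix product (worth verifying against the released API):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import NeighborhoodComponentsAnalysis

X, y = load_iris(return_X_y=True)
nca = NeighborhoodComponentsAnalysis(random_state=42).fit(X, y)

# The learned linear map; with no n_components given it stays square
# (n_features, n_features) for the 4-feature iris data.
print(nca.components_.shape)

# transform() should reduce to a matrix product with the learned map.
print(np.allclose(nca.transform(X), X @ nca.components_.T))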
Lines changed: 54 additions & 0 deletions (new file)
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Dimensionality Reduction with Neighborhood Components Analysis\n\n\nSample usage of Neighborhood Components Analysis for dimensionality reduction.\n\nThis example compares different (linear) dimensionality reduction methods\napplied on the Digits data set. The data set contains images of digits from\n0 to 9 with approximately 180 samples of each class. Each image is of\ndimension 8x8 = 64, and is reduced to a two-dimensional data point.\n\nPrincipal Component Analysis (PCA) applied to this data identifies the\ncombination of attributes (principal components, or directions in the\nfeature space) that account for the most variance in the data. Here we\nplot the different samples on the 2 first principal components.\n\nLinear Discriminant Analysis (LDA) tries to identify attributes that\naccount for the most variance *between classes*. In particular,\nLDA, in contrast to PCA, is a supervised method, using known class labels.\n\nNeighborhood Components Analysis (NCA) tries to find a feature space such\nthat a stochastic nearest neighbor algorithm will give the best accuracy.\nLike LDA, it is a supervised method.\n\nOne can see that NCA enforces a clustering of the data that is visually\nmeaningful despite the large reduction in dimension.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import PCA\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import (KNeighborsClassifier,\n                               NeighborhoodComponentsAnalysis)\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nprint(__doc__)\n\nn_neighbors = 3\nrandom_state = 0\n\n# Load Digits dataset\ndigits = datasets.load_digits()\nX, y = digits.data, digits.target\n\n# Split into train/test\nX_train, X_test, y_train, y_test = \\\n    train_test_split(X, y, test_size=0.5, stratify=y,\n                     random_state=random_state)\n\ndim = len(X[0])\nn_classes = len(np.unique(y))\n\n# Reduce dimension to 2 with PCA\npca = make_pipeline(StandardScaler(),\n                    PCA(n_components=2, random_state=random_state))\n\n# Reduce dimension to 2 with LinearDiscriminantAnalysis\nlda = make_pipeline(StandardScaler(),\n                    LinearDiscriminantAnalysis(n_components=2))\n\n# Reduce dimension to 2 with NeighborhoodComponentAnalysis\nnca = make_pipeline(StandardScaler(),\n                    NeighborhoodComponentsAnalysis(n_components=2,\n                                                   random_state=random_state))\n\n# Use a nearest neighbor classifier to evaluate the methods\nknn = KNeighborsClassifier(n_neighbors=n_neighbors)\n\n# Make a list of the methods to be compared\ndim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]\n\n# plt.figure()\nfor i, (name, model) in enumerate(dim_reduction_methods):\n    plt.figure()\n    # plt.subplot(1, 3, i + 1, aspect=1)\n\n    # Fit the method's model\n    model.fit(X_train, y_train)\n\n    # Fit a nearest neighbor classifier on the embedded training set\n    knn.fit(model.transform(X_train), y_train)\n\n    # Compute the nearest neighbor accuracy on the embedded test set\n    acc_knn = knn.score(model.transform(X_test), y_test)\n\n    # Embed the data set in 2 dimensions using the fitted model\n    X_embedded = model.transform(X)\n\n    # Plot the projected points and show the evaluation score\n    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')\n    plt.title(\"{}, KNN (k={})\\nTest accuracy = {:.2f}\".format(name,\n                                                              n_neighbors,\n                                                              acc_knn))\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.8"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
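
The markdown cell in this notebook contrasts PCA, LDA and NCA as two-dimensional reducers on the 10-class digits data. One constraint worth keeping in mind when adapting the example: LDA can produce at most n_classes - 1 discriminant axes, so two components are comfortably available here, while PCA and NCA are limited only by the number of features. A short sketch of that limit (variable names are illustrative, not part of the commit):

from sklearn.datasets import load_digits
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_digits(return_X_y=True)  # 10 classes, 64 features

# At most n_classes - 1 = 9 discriminant components are available.
lda = LinearDiscriminantAnalysis(n_components=2).fit(X, y)
print(lda.transform(X).shape)  # (1797, 2)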
