
Commit d49d1aa (1 parent: 486da37)

Pushing the docs to dev/ for branch: master, commit d31b67f23d3d785b17261ada293710380683bc82

1,119 files changed: +7028 / -3485 lines

Two binary files changed (13.6 KB and 10.8 KB); contents not shown.

dev/_downloads/plot_lle_digits.ipynb

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

dev/_downloads/plot_lle_digits.py

Lines changed: 31 additions & 15 deletions
@@ -15,6 +15,11 @@
 this example, which is not the default setting. It ensures global stability
 of the embedding, i.e., the embedding does not depend on random
 initialization.
+
+Linear Discriminant Analysis, from the :mod:`sklearn.discriminant_analysis`
+module, and Neighborhood Components Analysis, from the :mod:`sklearn.neighbors`
+module, are supervised dimensionality reduction method, i.e. they make use of
+the provided labels, contrary to other methods.
 """
 
 # Authors: Fabian Pedregosa <[email protected]>
@@ -30,7 +35,7 @@
 import matplotlib.pyplot as plt
 from matplotlib import offsetbox
 from sklearn import (manifold, datasets, decomposition, ensemble,
-                     discriminant_analysis, random_projection)
+                     discriminant_analysis, random_projection, neighbors)
 
 digits = datasets.load_digits(n_class=6)
 X = digits.data
@@ -39,7 +44,7 @@
 n_neighbors = 30
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Scale and visualize the embedding vectors
 def plot_embedding(X, title=None):
     x_min, x_max = np.min(X, 0), np.max(X, 0)
@@ -70,7 +75,7 @@ def plot_embedding(X, title=None):
         plt.title(title)
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Plot images of the digits
 n_img_per_row = 20
 img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
@@ -86,7 +91,7 @@ def plot_embedding(X, title=None):
 plt.title('A selection from the 64-dimensional digits dataset')
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Random 2D projection using a random unitary matrix
 print("Computing random projection")
 rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
@@ -104,7 +109,7 @@ def plot_embedding(X, title=None):
                "Principal Components projection of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Projection on to the first 2 linear discriminant components
 
 print("Computing Linear Discriminant Analysis projection")
@@ -117,9 +122,9 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Isomap projection of the digits dataset
-print("Computing Isomap embedding")
+print("Computing Isomap projection")
 t0 = time()
 X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X)
 print("Done.")
@@ -128,7 +133,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Locally linear embedding of the digits dataset
 print("Computing LLE embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -141,7 +146,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Modified Locally linear embedding of the digits dataset
 print("Computing modified LLE embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -154,7 +159,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # HLLE embedding of the digits dataset
 print("Computing Hessian LLE embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -167,7 +172,7 @@ def plot_embedding(X, title=None):
                (time() - t0))
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # LTSA embedding of the digits dataset
 print("Computing LTSA embedding")
 clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
@@ -179,7 +184,7 @@ def plot_embedding(X, title=None):
                "Local Tangent Space Alignment of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # MDS embedding of the digits dataset
 print("Computing MDS embedding")
 clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
@@ -190,7 +195,7 @@ def plot_embedding(X, title=None):
                "MDS embedding of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Random Trees embedding of the digits dataset
 print("Computing Totally Random Trees embedding")
 hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
@@ -204,7 +209,7 @@ def plot_embedding(X, title=None):
                "Random forest embedding of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Spectral embedding of the digits dataset
 print("Computing Spectral embedding")
 embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
@@ -216,7 +221,7 @@ def plot_embedding(X, title=None):
                "Spectral embedding of the digits (time %.2fs)" %
                (time() - t0))
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # t-SNE embedding of the digits dataset
 print("Computing t-SNE embedding")
 tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
@@ -227,4 +232,15 @@ def plot_embedding(X, title=None):
                "t-SNE embedding of the digits (time %.2fs)" %
                (time() - t0))
 
+# ----------------------------------------------------------------------
+# NCA projection of the digits dataset
+print("Computing NCA projection")
+nca = neighbors.NeighborhoodComponentsAnalysis(n_components=2, random_state=0)
+t0 = time()
+X_nca = nca.fit_transform(X, y)
+
+plot_embedding(X_nca,
+               "NCA embedding of the digits (time %.2fs)" %
+               (time() - t0))
+
 plt.show()
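
The docstring added in this diff draws a line between supervised and unsupervised dimensionality reduction: LDA and NCA consume the class labels, while the other reducers in the example only see the features. A minimal sketch of that API difference, using the same digits data as the example (the variable names here are illustrative, not part of the commit):

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.neighbors import NeighborhoodComponentsAnalysis

X, y = load_digits(n_class=6, return_X_y=True)

# Unsupervised: the projection is computed from X alone.
X_pca = PCA(n_components=2).fit_transform(X)

# Supervised: the labels are passed to fit_transform and shape the projection.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=0)
X_nca = nca.fit_transform(X, y)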
Lines changed: 54 additions & 0 deletions (new file)
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Comparing Nearest Neighbors with and without Neighborhood Components Analysis\n\n\nAn example comparing nearest neighbors classification with and without\nNeighborhood Components Analysis.\n\nIt will plot the class decision boundaries given by a Nearest Neighbors\nclassifier when using the Euclidean distance on the original features, versus\nusing the Euclidean distance after the transformation learned by Neighborhood\nComponents Analysis. The latter aims to find a linear transformation that\nmaximises the (stochastic) nearest neighbor classification accuracy on the\ntraining set.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.neighbors import (KNeighborsClassifier,\n                               NeighborhoodComponentsAnalysis)\nfrom sklearn.pipeline import Pipeline\n\n\nprint(__doc__)\n\nn_neighbors = 1\n\ndataset = datasets.load_iris()\nX, y = dataset.data, dataset.target\n\n# we only take two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = X[:, [0, 2]]\n\nX_train, X_test, y_train, y_test = \\\n    train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)\n\nh = .01  # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n\nnames = ['KNN', 'NCA, KNN']\n\nclassifiers = [Pipeline([('scaler', StandardScaler()),\n                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))\n                         ]),\n               Pipeline([('scaler', StandardScaler()),\n                         ('nca', NeighborhoodComponentsAnalysis()),\n                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))\n                         ])\n               ]\n\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n                     np.arange(y_min, y_max, h))\n\nfor name, clf in zip(names, classifiers):\n\n    clf.fit(X_train, y_train)\n    score = clf.score(X_test, y_test)\n\n    # Plot the decision boundary. For that, we will assign a color to each\n    # point in the mesh [x_min, x_max]x[y_min, y_max].\n    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n    # Put the result into a color plot\n    Z = Z.reshape(xx.shape)\n    plt.figure()\n    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8)\n\n    # Plot also the training and testing points\n    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)\n    plt.xlim(xx.min(), xx.max())\n    plt.ylim(yy.min(), yy.max())\n    plt.title(\"{} (k = {})\".format(name, n_neighbors))\n    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,\n             ha='center', va='center', transform=plt.gca().transAxes)\n\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.8"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
Lines changed: 88 additions & 0 deletions (new file)
@@ -0,0 +1,88 @@
+"""
+=============================================================================
+Comparing Nearest Neighbors with and without Neighborhood Components Analysis
+=============================================================================
+
+An example comparing nearest neighbors classification with and without
+Neighborhood Components Analysis.
+
+It will plot the class decision boundaries given by a Nearest Neighbors
+classifier when using the Euclidean distance on the original features, versus
+using the Euclidean distance after the transformation learned by Neighborhood
+Components Analysis. The latter aims to find a linear transformation that
+maximises the (stochastic) nearest neighbor classification accuracy on the
+training set.
+"""
+
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.colors import ListedColormap
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import (KNeighborsClassifier,
+                               NeighborhoodComponentsAnalysis)
+from sklearn.pipeline import Pipeline
+
+
+print(__doc__)
+
+n_neighbors = 1
+
+dataset = datasets.load_iris()
+X, y = dataset.data, dataset.target
+
+# we only take two features. We could avoid this ugly
+# slicing by using a two-dim dataset
+X = X[:, [0, 2]]
+
+X_train, X_test, y_train, y_test = \
+    train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)
+
+h = .01  # step size in the mesh
+
+# Create color maps
+cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
+cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
+
+names = ['KNN', 'NCA, KNN']
+
+classifiers = [Pipeline([('scaler', StandardScaler()),
+                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
+                         ]),
+               Pipeline([('scaler', StandardScaler()),
+                         ('nca', NeighborhoodComponentsAnalysis()),
+                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
+                         ])
+               ]
+
+x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
+y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                     np.arange(y_min, y_max, h))
+
+for name, clf in zip(names, classifiers):
+
+    clf.fit(X_train, y_train)
+    score = clf.score(X_test, y_test)
+
+    # Plot the decision boundary. For that, we will assign a color to each
+    # point in the mesh [x_min, x_max]x[y_min, y_max].
+    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
+
+    # Put the result into a color plot
+    Z = Z.reshape(xx.shape)
+    plt.figure()
+    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8)
+
+    # Plot also the training and testing points
+    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
+    plt.xlim(xx.min(), xx.max())
+    plt.ylim(yy.min(), yy.max())
+    plt.title("{} (k = {})".format(name, n_neighbors))
+    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,
+             ha='center', va='center', transform=plt.gca().transAxes)
+
+plt.show()
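
The docstring of this new example says NCA learns a linear transformation that maximises stochastic nearest-neighbor accuracy on the training set. A small hedged sketch of how one might inspect that learned transformation after fitting; it assumes NeighborhoodComponentsAnalysis exposes the fitted matrix as components_ with shape (n_components, n_features), and that transform() applies it as a plain matrix product (worth verifying against the released API):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import NeighborhoodComponentsAnalysis

X, y = load_iris(return_X_y=True)
nca = NeighborhoodComponentsAnalysis(random_state=42).fit(X, y)

# The learned linear map; with no n_components given it stays square
# (n_features, n_features) for the 4-feature iris data.
print(nca.components_.shape)

# transform() should reduce to a matrix product with the learned map.
print(np.allclose(nca.transform(X), X @ nca.components_.T))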
Lines changed: 54 additions & 0 deletions (new file)
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Dimensionality Reduction with Neighborhood Components Analysis\n\n\nSample usage of Neighborhood Components Analysis for dimensionality reduction.\n\nThis example compares different (linear) dimensionality reduction methods\napplied on the Digits data set. The data set contains images of digits from\n0 to 9 with approximately 180 samples of each class. Each image is of\ndimension 8x8 = 64, and is reduced to a two-dimensional data point.\n\nPrincipal Component Analysis (PCA) applied to this data identifies the\ncombination of attributes (principal components, or directions in the\nfeature space) that account for the most variance in the data. Here we\nplot the different samples on the 2 first principal components.\n\nLinear Discriminant Analysis (LDA) tries to identify attributes that\naccount for the most variance *between classes*. In particular,\nLDA, in contrast to PCA, is a supervised method, using known class labels.\n\nNeighborhood Components Analysis (NCA) tries to find a feature space such\nthat a stochastic nearest neighbor algorithm will give the best accuracy.\nLike LDA, it is a supervised method.\n\nOne can see that NCA enforces a clustering of the data that is visually\nmeaningful despite the large reduction in dimension.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import PCA\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import (KNeighborsClassifier,\n                               NeighborhoodComponentsAnalysis)\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nprint(__doc__)\n\nn_neighbors = 3\nrandom_state = 0\n\n# Load Digits dataset\ndigits = datasets.load_digits()\nX, y = digits.data, digits.target\n\n# Split into train/test\nX_train, X_test, y_train, y_test = \\\n    train_test_split(X, y, test_size=0.5, stratify=y,\n                     random_state=random_state)\n\ndim = len(X[0])\nn_classes = len(np.unique(y))\n\n# Reduce dimension to 2 with PCA\npca = make_pipeline(StandardScaler(),\n                    PCA(n_components=2, random_state=random_state))\n\n# Reduce dimension to 2 with LinearDiscriminantAnalysis\nlda = make_pipeline(StandardScaler(),\n                    LinearDiscriminantAnalysis(n_components=2))\n\n# Reduce dimension to 2 with NeighborhoodComponentAnalysis\nnca = make_pipeline(StandardScaler(),\n                    NeighborhoodComponentsAnalysis(n_components=2,\n                                                   random_state=random_state))\n\n# Use a nearest neighbor classifier to evaluate the methods\nknn = KNeighborsClassifier(n_neighbors=n_neighbors)\n\n# Make a list of the methods to be compared\ndim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]\n\n# plt.figure()\nfor i, (name, model) in enumerate(dim_reduction_methods):\n    plt.figure()\n    # plt.subplot(1, 3, i + 1, aspect=1)\n\n    # Fit the method's model\n    model.fit(X_train, y_train)\n\n    # Fit a nearest neighbor classifier on the embedded training set\n    knn.fit(model.transform(X_train), y_train)\n\n    # Compute the nearest neighbor accuracy on the embedded test set\n    acc_knn = knn.score(model.transform(X_test), y_test)\n\n    # Embed the data set in 2 dimensions using the fitted model\n    X_embedded = model.transform(X)\n\n    # Plot the projected points and show the evaluation score\n    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')\n    plt.title(\"{}, KNN (k={})\\nTest accuracy = {:.2f}\".format(name,\n                                                              n_neighbors,\n                                                              acc_knn))\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.8"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
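
The markdown cell in this notebook contrasts PCA, LDA and NCA as two-dimensional reducers on the 10-class digits data. One constraint worth keeping in mind when adapting the example: LDA can produce at most n_classes - 1 discriminant axes, so two components are comfortably available here, while PCA and NCA are limited only by the number of features. A short sketch of that limit (variable names are illustrative, not part of the commit):

from sklearn.datasets import load_digits
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_digits(return_X_y=True)  # 10 classes, 64 features

# At most n_classes - 1 = 9 discriminant components are available.
lda = LinearDiscriminantAnalysis(n_components=2).fit(X, y)
print(lda.transform(X).shape)  # (1797, 2)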
