Commit c71f8c3

Pushing the docs to dev/ for branch: master, commit c24e749d2e112237b32ea60a8e7023447f67f6b5
1 parent c649ff0 commit c71f8c3


2,407 files changed: +13551 additions, -6539 deletions


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: c3012f96f030707b87aed004ce706ee9
+config: ebb9ea68698aea15799d10fa0216399e
 tags: 645f666f9bcd5a90fca523b33c5a78b7
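
As the comment in the file says, Sphinx stores a hash of the build configuration and performs a full rebuild when the stored hash is missing or no longer matches; the changed `config` value above simply reflects a new configuration. In spirit, the mechanism looks like this minimal sketch (illustrative names only, not Sphinx's actual implementation):

import hashlib

def config_hash(config: dict) -> str:
    # Hash a canonical string form of the configuration values.
    return hashlib.md5(repr(sorted(config.items())).encode()).hexdigest()

def needs_full_rebuild(stored_hash, current_config: dict) -> bool:
    # Rebuild everything when no hash was stored or the configuration changed.
    return stored_hash is None or stored_hash != config_hash(current_config)
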
Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,186 @@
"""
=======================================================
Scalable learning with polynomial kernel approximation
=======================================================

This example illustrates the use of :class:`PolynomialCountSketch` to
efficiently generate polynomial kernel feature-space approximations.
This is used to train linear classifiers that approximate the accuracy
of kernelized ones.

.. currentmodule:: sklearn.kernel_approximation

We use the Covtype dataset [2], trying to reproduce the experiments of the
original Tensor Sketch paper [1], i.e. the algorithm implemented by
:class:`PolynomialCountSketch`.

First, we compute the accuracy of a linear classifier on the original
features. Then, we train linear classifiers on different numbers of
features (`n_components`) generated by :class:`PolynomialCountSketch`,
approximating the accuracy of a kernelized classifier in a scalable manner.
"""
print(__doc__)

# Author: Daniel Lopez-Sanchez <[email protected]>
# License: BSD 3 clause
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.pipeline import Pipeline, make_pipeline
import time

# %%
# Load the Covtype dataset, which contains 581,012 samples
# with 54 features each, distributed among 7 classes. The goal of this dataset
# is to predict forest cover type from cartographic variables only
# (no remotely sensed data). After loading, we transform it into a binary
# classification problem to match the version of the dataset on the
# LIBSVM webpage [2], which was the one used in [1].

X, y = fetch_covtype(return_X_y=True)

y[y != 2] = 0
y[y == 2] = 1  # We will try to separate class 2 from the other 6 classes.

# %%
# Here we select 5,000 samples for training and 10,000 for testing.
# To actually reproduce the results in the original Tensor Sketch paper,
# select 100,000 for training.

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5_000,
                                                    test_size=10_000,
                                                    random_state=42)

# %%
# Now scale features to the range [0, 1] to match the format of the dataset on
# the LIBSVM webpage, and then normalize to unit length as done in the
# original Tensor Sketch paper [1].

mm = make_pipeline(MinMaxScaler(), Normalizer())
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)


# %%
# As a baseline, train a linear SVM on the original features and print the
# accuracy. We also measure and store accuracies and training times to
# plot them later.

results = {}

lsvm = LinearSVC()
start = time.time()
lsvm.fit(X_train, y_train)
lsvm_time = time.time() - start
lsvm_score = 100 * lsvm.score(X_test, y_test)

results["LSVM"] = {"time": lsvm_time, "score": lsvm_score}
print(f"Linear SVM score on raw features: {lsvm_score:.2f}%")

# %%
# Then we train linear SVMs on the features generated by
# :class:`PolynomialCountSketch` with different values for `n_components`,
# showing that these kernel feature approximations improve the accuracy
# of linear classification. In typical application scenarios, `n_components`
# should be larger than the number of features in the input representation
# in order to achieve an improvement with respect to linear classification.
# As a rule of thumb, the best trade-off between evaluation score and run time
# cost is typically achieved at around `n_components` = 10 * `n_features`,
# though this might depend on the specific dataset being handled. Note that,
# since the original samples have 54 features, the explicit feature map of the
# polynomial kernel of degree four would have approximately 8.5 million
# features (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can
# condense most of the discriminative information of that feature space into a
# much more compact representation. We repeat the experiment `n_runs` times to
# compensate for the stochastic nature of :class:`PolynomialCountSketch`.

n_runs = 3
for n_components in [250, 500, 1000, 2000]:

    ps_lsvm_time = 0
    ps_lsvm_score = 0
    for _ in range(n_runs):

        pipeline = Pipeline(steps=[("kernel_approximator",
                                    PolynomialCountSketch(
                                        n_components=n_components,
                                        degree=4)),
                                   ("linear_classifier", LinearSVC())])

        start = time.time()
        pipeline.fit(X_train, y_train)
        ps_lsvm_time += time.time() - start
        ps_lsvm_score += 100 * pipeline.score(X_test, y_test)

    ps_lsvm_time /= n_runs
    ps_lsvm_score /= n_runs

    results[f"LSVM + PS({n_components})"] = {
        "time": ps_lsvm_time, "score": ps_lsvm_score
    }
    print(f"Linear SVM score on {n_components} PolynomialCountSketch " +
          f"features: {ps_lsvm_score:.2f}%")

# %%
# Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
# is approximating the performance of the kernel. This, of course, may take
# some time, as the SVC class scales relatively poorly. This is the
# reason why kernel approximators are so useful:

from sklearn.svm import SVC

ksvm = SVC(C=500., kernel="poly", degree=4, coef0=0, gamma=1.)

start = time.time()
ksvm.fit(X_train, y_train)
ksvm_time = time.time() - start
ksvm_score = 100 * ksvm.score(X_test, y_test)

results["KSVM"] = {"time": ksvm_time, "score": ksvm_score}
print(f"Kernel-SVM score on raw features: {ksvm_score:.2f}%")

# %%
# Finally, plot the results of the different methods against their training
# times. As we can see, the kernelized SVM achieves a higher accuracy,
# but its training time is much larger and, most importantly, will grow
# much faster if the number of training samples increases.

N_COMPONENTS = [250, 500, 1000, 2000]

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter([results["LSVM"]["time"], ], [results["LSVM"]["score"], ],
           label="Linear SVM", c="green", marker="^")

ax.scatter([results["LSVM + PS(250)"]["time"], ],
           [results["LSVM + PS(250)"]["score"], ],
           label="Linear SVM + PolynomialCountSketch", c="blue")
for n_components in N_COMPONENTS:
    ax.scatter([results[f"LSVM + PS({n_components})"]["time"], ],
               [results[f"LSVM + PS({n_components})"]["score"], ],
               c="blue")
    ax.annotate(f"n_comp.={n_components}",
                (results[f"LSVM + PS({n_components})"]["time"],
                 results[f"LSVM + PS({n_components})"]["score"]),
                xytext=(-30, 10), textcoords="offset pixels")

ax.scatter([results["KSVM"]["time"], ], [results["KSVM"]["score"], ],
           label="Kernel SVM", c="red", marker="x")

ax.set_xlabel("Training time (s)")
ax.set_ylabel("Accuracy (%)")
ax.legend()
plt.show()

# %%
# References
# ==========
#
# [1] Pham, Ninh and Rasmus Pagh. "Fast and scalable polynomial kernels via
# explicit feature maps." KDD '13 (2013).
# https://doi.org/10.1145/2487575.2487591
#
# [2] LIBSVM binary datasets repository
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
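
As a side note on the example above: the quality of the approximation can be sanity-checked directly, since inner products of the sketched features should approach the exact polynomial kernel values as `n_components` grows. The snippet below is a minimal standalone sketch, not part of the committed example; the array names and sizes are illustrative.

import numpy as np
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.preprocessing import Normalizer

rng = np.random.RandomState(0)
# Unit-length rows with 54 features, mirroring the preprocessed Covtype data.
X_demo = Normalizer().fit_transform(rng.rand(20, 54))

# Exact degree-4 polynomial kernel with the same gamma/coef0 as the example.
exact = polynomial_kernel(X_demo, degree=4, gamma=1., coef0=0)

# Sketched feature map: inner products of Z approximate the kernel values.
pcs = PolynomialCountSketch(degree=4, gamma=1., coef0=0,
                            n_components=5000, random_state=0)
Z = pcs.fit_transform(X_demo)
approx = Z @ Z.T

print(np.mean(np.abs(exact - approx)))  # shrinks as n_components grows
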
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Principal Component Regression vs Partial Least Squares Regression\n\n\nThis example compares `Principal Component Regression\n<https://en.wikipedia.org/wiki/Principal_component_regression>`_ (PCR) and\n`Partial Least Squares Regression\n<https://en.wikipedia.org/wiki/Partial_least_squares_regression>`_ (PLS) on a\ntoy dataset. Our goal is to illustrate how PLS can outperform PCR when the\ntarget is strongly correlated with some directions in the data that have a\nlow variance.\n\nPCR is a regressor composed of two steps: first,\n:class:`~sklearn.decomposition.PCA` is applied to the training data, possibly\nperforming dimensionality reduction; then, a regressor (e.g. a linear\nregressor) is trained on the transformed samples. In\n:class:`~sklearn.decomposition.PCA`, the transformation is purely\nunsupervised, meaning that no information about the targets is used. As a\nresult, PCR may perform poorly in some datasets where the target is strongly\ncorrelated with *directions* that have low variance. Indeed, the\ndimensionality reduction of PCA projects the data into a lower dimensional\nspace where the variance of the projected data is greedily maximized along\neach axis. Despite them having the most predictive power on the target, the\ndirections with a lower variance will be dropped, and the final regressor\nwill not be able to leverage them.\n\nPLS is both a transformer and a regressor, and it is quite similar to PCR: it\nalso applies a dimensionality reduction to the samples before applying a\nlinear regressor to the transformed data. The main difference with PCR is\nthat the PLS transformation is supervised. Therefore, as we will see in this\nexample, it does not suffer from the issue we just mentioned.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The data\n--------\n\nWe start by creating a simple dataset with two features. Before we even dive\ninto PCR and PLS, we fit a PCA estimator to display the two principal\ncomponents of this dataset, i.e. the two directions that explain the most\nvariance in the data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition import PCA\n\nrng = np.random.RandomState(0)\nn_samples = 500\ncov = [[3, 3],\n       [3, 4]]\nX = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)\npca = PCA(n_components=2).fit(X)\n\n\nplt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')\nfor i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):\n    comp = comp * var  # scale component by its variance explanation power\n    plt.plot([0, comp[0]], [0, comp[1]], label=f\"Component {i}\", linewidth=5,\n             color=f\"C{i + 2}\")\nplt.gca().set(aspect='equal',\n              title=\"2-dimensional dataset with principal components\",\n              xlabel='first feature', ylabel='second feature')\nplt.legend()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "For the purpose of this example, we now define the target `y` such that it is\nstrongly correlated with a direction that has a small variance. To this end,\nwe will project `X` onto the second component, and add some noise to it.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2\n\nfig, axes = plt.subplots(1, 2, figsize=(10, 3))\n\naxes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)\naxes[0].set(xlabel='Projected data onto first PCA component', ylabel='y')\naxes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)\naxes[1].set(xlabel='Projected data onto second PCA component', ylabel='y')\nplt.tight_layout()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Projection on one component and predictive power\n------------------------------------------------\n\nWe now create two regressors: PCR and PLS, and for our illustration purposes\nwe set the number of components to 1. Before feeding the data to the PCA step\nof PCR, we first standardize it, as recommended by good practice. The PLS\nestimator has built-in scaling capabilities.\n\nFor both models, we plot the projected data onto the first component against\nthe target. In both cases, this projected data is what the regressors will\nuse as training data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.cross_decomposition import PLSRegression\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)\n\npcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())\npcr.fit(X_train, y_train)\npca = pcr.named_steps['pca']  # retrieve the PCA step of the pipeline\n\npls = PLSRegression(n_components=1)\npls.fit(X_train, y_train)\n\nfig, axes = plt.subplots(1, 2, figsize=(10, 3))\naxes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth')\naxes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3,\n                label='predictions')\naxes[0].set(xlabel='Projected data onto first PCA component',\n            ylabel='y', title='PCR / PCA')\naxes[0].legend()\naxes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth')\naxes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3,\n                label='predictions')\naxes[1].set(xlabel='Projected data onto first PLS component',\n            ylabel='y', title='PLS')\naxes[1].legend()\nplt.tight_layout()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As expected, the unsupervised PCA transformation of PCR has dropped the\nsecond component, i.e. the direction with the lowest variance, despite\nit being the most predictive direction. This is because PCA is a completely\nunsupervised transformation, and results in the projected data having a low\npredictive power on the target.\n\nOn the other hand, the PLS regressor manages to capture the effect of the\ndirection with the lowest variance, thanks to its use of target information\nduring the transformation: it can recognize that this direction is actually\nthe most predictive. We note that the first PLS component is negatively\ncorrelated with the target, which comes from the fact that the signs of\neigenvectors are arbitrary.\n\nWe also print the R-squared scores of both estimators, which further confirms\nthat PLS is a better alternative than PCR in this case. A negative R-squared\nindicates that PCR performs worse than a regressor that would simply predict\nthe mean of the target.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(f\"PCR r-squared {pcr.score(X_test, y_test):.3f}\")\nprint(f\"PLS r-squared {pls.score(X_test, y_test):.3f}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As a final remark, we note that PCR with 2 components performs as well as\nPLS: this is because in this case, PCR was able to leverage the second\ncomponent which has the most predictive power on the target.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())\npca_2.fit(X_train, y_train)\nprint(f\"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}\")"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
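
A compact way to state the difference the notebook describes (standard textbook formulation, not taken from the notebook itself): the first PCA direction solves

    w_pca = argmax_{||w|| = 1} Var(X w)

while the first PLS direction solves

    w_pls = argmax_{||w|| = 1} Cov(X w, y)^2

so PCA never looks at y, whereas PLS trades raw variance for covariance with the target, which is why it keeps the low-variance but highly predictive direction in this example.
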

dev/_downloads/0b39f715b5e32f01df3d212b6d822b82/plot_calibration.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@
 clf_sigmoid.fit(X_train, y_train, sample_weight=sw_train)
 prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1]
 
-print("Brier scores: (the smaller the better)")
+print("Brier score losses: (the smaller the better)")
 
 clf_score = brier_score_loss(y_test, prob_pos_clf, sample_weight=sw_test)
 print("No calibration: %1.3f" % clf_score)

0 commit comments
