Skip to content

Commit 11347d1

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 5a23a850fd061df0c51b8b7917f8589133917ffe
1 parent b57efdb commit 11347d1

File tree

1,216 files changed

+4583
-4517
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,216 files changed

+4583
-4517
lines changed
Binary file not shown.
Binary file not shown.

dev/_downloads/c557c992950d3a2cf0cc4280c9dbf39b/plot_scalable_poly_kernels.py

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,51 +24,64 @@
2424
# Author: Daniel Lopez-Sanchez <[email protected]>
2525
# License: BSD 3 clause
2626

27-
import matplotlib.pyplot as plt
28-
from sklearn.datasets import fetch_covtype
29-
from sklearn.model_selection import train_test_split
30-
from sklearn.preprocessing import MinMaxScaler, Normalizer
31-
from sklearn.svm import LinearSVC
32-
from sklearn.kernel_approximation import PolynomialCountSketch
33-
from sklearn.pipeline import Pipeline, make_pipeline
34-
import time
35-
3627
# %%
28+
# Preparing the data
29+
# ------------------
30+
#
3731
# Load the Covtype dataset, which contains 581,012 samples
3832
# with 54 features each, distributed among 7 classes. The goal of this dataset
3933
# is to predict forest cover type from cartographic variables only
4034
# (no remotely sensed data). After loading, we transform it into a binary
4135
# classification problem to match the version of the dataset in the
4236
# LIBSVM webpage [2], which was the one used in [1].
4337

38+
from sklearn.datasets import fetch_covtype
39+
4440
X, y = fetch_covtype(return_X_y=True)
4541

4642
y[y != 2] = 0
4743
y[y == 2] = 1 # We will try to separate class 2 from the other 6 classes.
4844

4945
# %%
46+
# Partitioning the data
47+
# ---------------------
48+
#
5049
# Here we select 5,000 samples for training and 10,000 for testing.
5150
# To actually reproduce the results in the original Tensor Sketch paper,
5251
# select 100,000 for training.
5352

53+
from sklearn.model_selection import train_test_split
54+
5455
X_train, X_test, y_train, y_test = train_test_split(
5556
X, y, train_size=5_000, test_size=10_000, random_state=42
5657
)
5758

5859
# %%
60+
# Feature normalization
61+
# ---------------------
62+
#
5963
# Now scale features to the range [0, 1] to match the format of the dataset in
6064
# the LIBSVM webpage, and then normalize to unit length as done in the
6165
# original Tensor Sketch paper [1].
6266

67+
from sklearn.preprocessing import MinMaxScaler, Normalizer
68+
from sklearn.pipeline import make_pipeline
69+
6370
mm = make_pipeline(MinMaxScaler(), Normalizer())
6471
X_train = mm.fit_transform(X_train)
6572
X_test = mm.transform(X_test)
6673

6774
# %%
75+
# Establishing a baseline model
76+
# -----------------------------
77+
#
6878
# As a baseline, train a linear SVM on the original features and print the
6979
# accuracy. We also measure and store accuracies and training times to
7080
# plot them later.
7181

82+
import time
83+
from sklearn.svm import LinearSVC
84+
7285
results = {}
7386

7487
lsvm = LinearSVC()
@@ -81,6 +94,9 @@
8194
print(f"Linear SVM score on raw features: {lsvm_score:.2f}%")
8295

8396
# %%
97+
# Establishing the kernel approximation model
98+
# -------------------------------------------
99+
#
84100
# Then we train linear SVMs on the features generated by
85101
# :class:`PolynomialCountSketch` with different values for `n_components`,
86102
# showing that these kernel feature approximations improve the accuracy
@@ -98,6 +114,8 @@
98114
# (`n_runs` = 1) in this example, in practice one should repeat the experiment several
99115
# times to compensate for the stochastic nature of :class:`PolynomialCountSketch`.
100116

117+
from sklearn.kernel_approximation import PolynomialCountSketch
118+
101119
n_runs = 1
102120
N_COMPONENTS = [250, 500, 1000, 2000]
103121

@@ -107,14 +125,9 @@
107125
ps_lsvm_score = 0
108126
for _ in range(n_runs):
109127

110-
pipeline = Pipeline(
111-
steps=[
112-
(
113-
"kernel_approximator",
114-
PolynomialCountSketch(n_components=n_components, degree=4),
115-
),
116-
("linear_classifier", LinearSVC()),
117-
]
128+
pipeline = make_pipeline(
129+
PolynomialCountSketch(n_components=n_components, degree=4),
130+
LinearSVC(),
118131
)
119132

120133
start = time.time()
@@ -135,6 +148,9 @@
135148
)
136149

137150
# %%
151+
# Establishing the kernelized SVM model
152+
# -------------------------------------
153+
#
138154
# Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
139155
# is approximating the performance of the kernel. This, of course, may take
140156
# some time, as the SVC class has relatively poor scalability. This is the
@@ -153,11 +169,16 @@
153169
print(f"Kernel-SVM score on raw features: {ksvm_score:.2f}%")
154170

155171
# %%
172+
# Comparing the results
173+
# ---------------------
174+
#
156175
# Finally, plot the results of the different methods against their training
157176
# times. As we can see, the kernelized SVM achieves a higher accuracy,
158177
# but its training time is much larger and, most importantly, will grow
159178
# much faster if the number of training samples increases.
160179

180+
import matplotlib.pyplot as plt
181+
161182
fig, ax = plt.subplots(figsize=(7, 7))
162183
ax.scatter(
163184
[

dev/_downloads/cb5b0b55b4ddb01e9ad80e6e28417c64/plot_scalable_poly_kernels.ipynb

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# Author: Daniel Lopez-Sanchez <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_covtype\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler, Normalizer\nfrom sklearn.svm import LinearSVC\nfrom sklearn.kernel_approximation import PolynomialCountSketch\nfrom sklearn.pipeline import Pipeline, make_pipeline\nimport time"
29+
"# Author: Daniel Lopez-Sanchez <[email protected]>\n# License: BSD 3 clause"
3030
]
3131
},
3232
{
3333
"cell_type": "markdown",
3434
"metadata": {},
3535
"source": [
36-
"Load the Covtype dataset, which contains 581,012 samples\nwith 54 features each, distributed among 6 classes. The goal of this dataset\nis to predict forest cover type from cartographic variables only\n(no remotely sensed data). After loading, we transform it into a binary\nclassification problem to match the version of the dataset in the\nLIBSVM webpage [2], which was the one used in [1].\n\n"
36+
"## Preparing the data\n\nLoad the Covtype dataset, which contains 581,012 samples\nwith 54 features each, distributed among 7 classes. The goal of this dataset\nis to predict forest cover type from cartographic variables only\n(no remotely sensed data). After loading, we transform it into a binary\nclassification problem to match the version of the dataset in the\nLIBSVM webpage [2], which was the one used in [1].\n\n"
3737
]
3838
},
3939
{
@@ -44,14 +44,14 @@
4444
},
4545
"outputs": [],
4646
"source": [
47-
"X, y = fetch_covtype(return_X_y=True)\n\ny[y != 2] = 0\ny[y == 2] = 1 # We will try to separate class 2 from the other 6 classes."
47+
"from sklearn.datasets import fetch_covtype\n\nX, y = fetch_covtype(return_X_y=True)\n\ny[y != 2] = 0\ny[y == 2] = 1 # We will try to separate class 2 from the other 6 classes."
4848
]
4949
},
5050
{
5151
"cell_type": "markdown",
5252
"metadata": {},
5353
"source": [
54-
"Here we select 5,000 samples for training and 10,000 for testing.\nTo actually reproduce the results in the original Tensor Sketch paper,\nselect 100,000 for training.\n\n"
54+
"## Partitioning the data\n\nHere we select 5,000 samples for training and 10,000 for testing.\nTo actually reproduce the results in the original Tensor Sketch paper,\nselect 100,000 for training.\n\n"
5555
]
5656
},
5757
{
@@ -62,14 +62,14 @@
6262
},
6363
"outputs": [],
6464
"source": [
65-
"X_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=5_000, test_size=10_000, random_state=42\n)"
65+
"from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=5_000, test_size=10_000, random_state=42\n)"
6666
]
6767
},
6868
{
6969
"cell_type": "markdown",
7070
"metadata": {},
7171
"source": [
72-
"Now scale features to the range [0, 1] to match the format of the dataset in\nthe LIBSVM webpage, and then normalize to unit length as done in the\noriginal Tensor Sketch paper [1].\n\n"
72+
"## Feature normalization\n\nNow scale features to the range [0, 1] to match the format of the dataset in\nthe LIBSVM webpage, and then normalize to unit length as done in the\noriginal Tensor Sketch paper [1].\n\n"
7373
]
7474
},
7575
{
@@ -80,14 +80,14 @@
8080
},
8181
"outputs": [],
8282
"source": [
83-
"mm = make_pipeline(MinMaxScaler(), Normalizer())\nX_train = mm.fit_transform(X_train)\nX_test = mm.transform(X_test)"
83+
"from sklearn.preprocessing import MinMaxScaler, Normalizer\nfrom sklearn.pipeline import make_pipeline\n\nmm = make_pipeline(MinMaxScaler(), Normalizer())\nX_train = mm.fit_transform(X_train)\nX_test = mm.transform(X_test)"
8484
]
8585
},
8686
{
8787
"cell_type": "markdown",
8888
"metadata": {},
8989
"source": [
90-
"As a baseline, train a linear SVM on the original features and print the\naccuracy. We also measure and store accuracies and training times to\nplot them later.\n\n"
90+
"## Establishing a baseline model\n\nAs a baseline, train a linear SVM on the original features and print the\naccuracy. We also measure and store accuracies and training times to\nplot them later.\n\n"
9191
]
9292
},
9393
{
@@ -98,14 +98,14 @@
9898
},
9999
"outputs": [],
100100
"source": [
101-
"results = {}\n\nlsvm = LinearSVC()\nstart = time.time()\nlsvm.fit(X_train, y_train)\nlsvm_time = time.time() - start\nlsvm_score = 100 * lsvm.score(X_test, y_test)\n\nresults[\"LSVM\"] = {\"time\": lsvm_time, \"score\": lsvm_score}\nprint(f\"Linear SVM score on raw features: {lsvm_score:.2f}%\")"
101+
"import time\nfrom sklearn.svm import LinearSVC\n\nresults = {}\n\nlsvm = LinearSVC()\nstart = time.time()\nlsvm.fit(X_train, y_train)\nlsvm_time = time.time() - start\nlsvm_score = 100 * lsvm.score(X_test, y_test)\n\nresults[\"LSVM\"] = {\"time\": lsvm_time, \"score\": lsvm_score}\nprint(f\"Linear SVM score on raw features: {lsvm_score:.2f}%\")"
102102
]
103103
},
104104
{
105105
"cell_type": "markdown",
106106
"metadata": {},
107107
"source": [
108-
"Then we train linear SVMs on the features generated by\n:class:`PolynomialCountSketch` with different values for `n_components`,\nshowing that these kernel feature approximations improve the accuracy\nof linear classification. In typical application scenarios, `n_components`\nshould be larger than the number of features in the input representation\nin order to achieve an improvement with respect to linear classification.\nAs a rule of thumb, the optimum of evaluation score / run time cost is\ntypically achieved at around `n_components` = 10 * `n_features`, though this\nmight depend on the specific dataset being handled. Note that, since the\noriginal samples have 54 features, the explicit feature map of the\npolynomial kernel of degree four would have approximately 8.5 million\nfeatures (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can\ncondense most of the discriminative information of that feature space into a\nmuch more compact representation. While we run the experiment only a single time\n(`n_runs` = 1) in this example, in practice one should repeat the experiment several\ntimes to compensate for the stochastic nature of :class:`PolynomialCountSketch`.\n\n"
108+
"## Establishing the kernel approximation model\n\nThen we train linear SVMs on the features generated by\n:class:`PolynomialCountSketch` with different values for `n_components`,\nshowing that these kernel feature approximations improve the accuracy\nof linear classification. In typical application scenarios, `n_components`\nshould be larger than the number of features in the input representation\nin order to achieve an improvement with respect to linear classification.\nAs a rule of thumb, the optimum of evaluation score / run time cost is\ntypically achieved at around `n_components` = 10 * `n_features`, though this\nmight depend on the specific dataset being handled. Note that, since the\noriginal samples have 54 features, the explicit feature map of the\npolynomial kernel of degree four would have approximately 8.5 million\nfeatures (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can\ncondense most of the discriminative information of that feature space into a\nmuch more compact representation. While we run the experiment only a single time\n(`n_runs` = 1) in this example, in practice one should repeat the experiment several\ntimes to compensate for the stochastic nature of :class:`PolynomialCountSketch`.\n\n"
109109
]
110110
},
111111
{
@@ -116,14 +116,14 @@
116116
},
117117
"outputs": [],
118118
"source": [
119-
"n_runs = 1\nN_COMPONENTS = [250, 500, 1000, 2000]\n\nfor n_components in N_COMPONENTS:\n\n ps_lsvm_time = 0\n ps_lsvm_score = 0\n for _ in range(n_runs):\n\n pipeline = Pipeline(\n steps=[\n (\n \"kernel_approximator\",\n PolynomialCountSketch(n_components=n_components, degree=4),\n ),\n (\"linear_classifier\", LinearSVC()),\n ]\n )\n\n start = time.time()\n pipeline.fit(X_train, y_train)\n ps_lsvm_time += time.time() - start\n ps_lsvm_score += 100 * pipeline.score(X_test, y_test)\n\n ps_lsvm_time /= n_runs\n ps_lsvm_score /= n_runs\n\n results[f\"LSVM + PS({n_components})\"] = {\n \"time\": ps_lsvm_time,\n \"score\": ps_lsvm_score,\n }\n print(\n f\"Linear SVM score on {n_components} PolynomialCountSketch \"\n + f\"features: {ps_lsvm_score:.2f}%\"\n )"
119+
"from sklearn.kernel_approximation import PolynomialCountSketch\n\nn_runs = 1\nN_COMPONENTS = [250, 500, 1000, 2000]\n\nfor n_components in N_COMPONENTS:\n\n ps_lsvm_time = 0\n ps_lsvm_score = 0\n for _ in range(n_runs):\n\n pipeline = make_pipeline(\n PolynomialCountSketch(n_components=n_components, degree=4),\n LinearSVC(),\n )\n\n start = time.time()\n pipeline.fit(X_train, y_train)\n ps_lsvm_time += time.time() - start\n ps_lsvm_score += 100 * pipeline.score(X_test, y_test)\n\n ps_lsvm_time /= n_runs\n ps_lsvm_score /= n_runs\n\n results[f\"LSVM + PS({n_components})\"] = {\n \"time\": ps_lsvm_time,\n \"score\": ps_lsvm_score,\n }\n print(\n f\"Linear SVM score on {n_components} PolynomialCountSketch \"\n + f\"features: {ps_lsvm_score:.2f}%\"\n )"
120120
]
121121
},
122122
{
123123
"cell_type": "markdown",
124124
"metadata": {},
125125
"source": [
126-
"Train a kernelized SVM to see how well :class:`PolynomialCountSketch`\nis approximating the performance of the kernel. This, of course, may take\nsome time, as the SVC class has a relatively poor scalability. This is the\nreason why kernel approximators are so useful:\n\n"
126+
"## Establishing the kernelized SVM model\n\nTrain a kernelized SVM to see how well :class:`PolynomialCountSketch`\nis approximating the performance of the kernel. This, of course, may take\nsome time, as the SVC class has relatively poor scalability. This is the\nreason why kernel approximators are so useful:\n\n"
127127
]
128128
},
129129
{
@@ -141,7 +141,7 @@
141141
"cell_type": "markdown",
142142
"metadata": {},
143143
"source": [
144-
"Finally, plot the results of the different methods against their training\ntimes. As we can see, the kernelized SVM achieves a higher accuracy,\nbut its training time is much larger and, most importantly, will grow\nmuch faster if the number of training samples increases.\n\n"
144+
"## Comparing the results\n\nFinally, plot the results of the different methods against their training\ntimes. As we can see, the kernelized SVM achieves a higher accuracy,\nbut its training time is much larger and, most importantly, will grow\nmuch faster if the number of training samples increases.\n\n"
145145
]
146146
},
147147
{
@@ -152,14 +152,14 @@
152152
},
153153
"outputs": [],
154154
"source": [
155-
"fig, ax = plt.subplots(figsize=(7, 7))\nax.scatter(\n [\n results[\"LSVM\"][\"time\"],\n ],\n [\n results[\"LSVM\"][\"score\"],\n ],\n label=\"Linear SVM\",\n c=\"green\",\n marker=\"^\",\n)\n\nax.scatter(\n [\n results[\"LSVM + PS(250)\"][\"time\"],\n ],\n [\n results[\"LSVM + PS(250)\"][\"score\"],\n ],\n label=\"Linear SVM + PolynomialCountSketch\",\n c=\"blue\",\n)\n\nfor n_components in N_COMPONENTS:\n ax.scatter(\n [\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n ],\n [\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ],\n c=\"blue\",\n )\n ax.annotate(\n f\"n_comp.={n_components}\",\n (\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ),\n xytext=(-30, 10),\n textcoords=\"offset pixels\",\n )\n\nax.scatter(\n [\n results[\"KSVM\"][\"time\"],\n ],\n [\n results[\"KSVM\"][\"score\"],\n ],\n label=\"Kernel SVM\",\n c=\"red\",\n marker=\"x\",\n)\n\nax.set_xlabel(\"Training time (s)\")\nax.set_ylabel(\"Accuracy (%)\")\nax.legend()\nplt.show()"
155+
"import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots(figsize=(7, 7))\nax.scatter(\n [\n results[\"LSVM\"][\"time\"],\n ],\n [\n results[\"LSVM\"][\"score\"],\n ],\n label=\"Linear SVM\",\n c=\"green\",\n marker=\"^\",\n)\n\nax.scatter(\n [\n results[\"LSVM + PS(250)\"][\"time\"],\n ],\n [\n results[\"LSVM + PS(250)\"][\"score\"],\n ],\n label=\"Linear SVM + PolynomialCountSketch\",\n c=\"blue\",\n)\n\nfor n_components in N_COMPONENTS:\n ax.scatter(\n [\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n ],\n [\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ],\n c=\"blue\",\n )\n ax.annotate(\n f\"n_comp.={n_components}\",\n (\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ),\n xytext=(-30, 10),\n textcoords=\"offset pixels\",\n )\n\nax.scatter(\n [\n results[\"KSVM\"][\"time\"],\n ],\n [\n results[\"KSVM\"][\"score\"],\n ],\n label=\"Kernel SVM\",\n c=\"red\",\n marker=\"x\",\n)\n\nax.set_xlabel(\"Training time (s)\")\nax.set_ylabel(\"Accuracy (%)\")\nax.legend()\nplt.show()"
156156
]
157157
},
158158
{
159159
"cell_type": "markdown",
160160
"metadata": {},
161161
"source": [
162-
"## References\n\n[1] Pham, Ninh and Rasmus Pagh. \"Fast and scalable polynomial kernels via\nexplicit feature maps.\" KDD '13 (2013).\nhttps://doi.org/10.1145/2487575.2487591\n\n[2] LIBSVM binary datasets repository\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html\n\n"
162+
"### References\n\n[1] Pham, Ninh and Rasmus Pagh. \"Fast and scalable polynomial kernels via\nexplicit feature maps.\" KDD '13 (2013).\nhttps://doi.org/10.1145/2487575.2487591\n\n[2] LIBSVM binary datasets repository\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html\n\n"
163163
]
164164
}
165165
],

dev/_downloads/scikit-learn-docs.zip

2.6 KB
Binary file not shown.
-99 Bytes
-228 Bytes
398 Bytes
7 Bytes
-162 Bytes

0 commit comments

Comments
 (0)