scikit-learn
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
287 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
287 Bytes
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
87 Bytes b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
87 Bytes
diff --git a/‎dev/_downloads/c557c992950d3a2cf0cc4280c9dbf39b/plot_scalable_poly_kernels.py
Lines changed: 38 additions & 17 deletions b/‎dev/_downloads/c557c992950d3a2cf0cc4280c9dbf39b/plot_scalable_poly_kernels.py
Lines changed: 38 additions & 17 deletions
diff --git a/‎dev/_downloads/cb5b0b55b4ddb01e9ad80e6e28417c64/plot_scalable_poly_kernels.ipynb
Lines changed: 15 additions & 15 deletions b/‎dev/_downloads/cb5b0b55b4ddb01e9ad80e6e28417c64/plot_scalable_poly_kernels.ipynb
Lines changed: 15 additions & 15 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
2.6 KB b/‎dev/_downloads/scikit-learn-docs.zip
2.6 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
-99 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
-99 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-228 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-228 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_anomaly_comparison_001.png
398 Bytes b/‎dev/_images/sphx_glr_plot_anomaly_comparison_001.png
398 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_anomaly_comparison_thumb.png
7 Bytes b/‎dev/_images/sphx_glr_plot_anomaly_comparison_thumb.png
7 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_caching_nearest_neighbors_001.png
-162 Bytes b/‎dev/_images/sphx_glr_plot_caching_nearest_neighbors_001.png
-162 Bytes
@@ -24,51 +24,64 @@
 # Author: Daniel Lopez-Sanchez <[email protected]>
 # License: BSD 3 clause
 
-import matplotlib.pyplot as plt
-from sklearn.datasets import fetch_covtype
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler, Normalizer
-from sklearn.svm import LinearSVC
-from sklearn.kernel_approximation import PolynomialCountSketch
-from sklearn.pipeline import Pipeline, make_pipeline
-import time
-
 # %%
+# Preparing the data
+# ------------------
+#
 # Load the Covtype dataset, which contains 581,012 samples
 # with 54 features each, distributed among 7 classes. The goal of this dataset
 # is to predict forest cover type from cartographic variables only
 # (no remotely sensed data). After loading, we transform it into a binary
 # classification problem to match the version of the dataset in the
 # LIBSVM webpage [2], which was the one used in [1].
 
+from sklearn.datasets import fetch_covtype
+
 X, y = fetch_covtype(return_X_y=True)
 
 y[y != 2] = 0
 y[y == 2] = 1  # We will try to separate class 2 from the other 6 classes.
 
 # %%
+# Partitioning the data
+# ---------------------
+#
 # Here we select 5,000 samples for training and 10,000 for testing.
 # To actually reproduce the results in the original Tensor Sketch paper,
 # select 100,000 for training.
 
+from sklearn.model_selection import train_test_split
+
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, train_size=5_000, test_size=10_000, random_state=42
 )
 
 # %%
+# Feature normalization
+# ---------------------
+#
 # Now scale features to the range [0, 1] to match the format of the dataset in
 # the LIBSVM webpage, and then normalize to unit length as done in the
 # original Tensor Sketch paper [1].
 
+from sklearn.preprocessing import MinMaxScaler, Normalizer
+from sklearn.pipeline import make_pipeline
+
 mm = make_pipeline(MinMaxScaler(), Normalizer())
 X_train = mm.fit_transform(X_train)
 X_test = mm.transform(X_test)
 
 # %%
+# Establishing a baseline model
+# -----------------------------
+#
 # As a baseline, train a linear SVM on the original features and print the
 # accuracy. We also measure and store accuracies and training times to
 # plot them later.
 
+import time
+from sklearn.svm import LinearSVC
+
 results = {}
 
 lsvm = LinearSVC()
@@ -81,6 +94,9 @@
 print(f"Linear SVM score on raw features: {lsvm_score:.2f}%")
 
 # %%
+# Establishing the kernel approximation model
+# -------------------------------------------
+#
 # Then we train linear SVMs on the features generated by
 # :class:`PolynomialCountSketch` with different values for `n_components`,
 # showing that these kernel feature approximations improve the accuracy
@@ -98,6 +114,8 @@
 # (`n_runs` = 1) in this example, in practice one should repeat the experiment several
 # times to compensate for the stochastic nature of :class:`PolynomialCountSketch`.
 
+from sklearn.kernel_approximation import PolynomialCountSketch
+
 n_runs = 1
 N_COMPONENTS = [250, 500, 1000, 2000]
 
@@ -107,14 +125,9 @@
     ps_lsvm_score = 0
     for _ in range(n_runs):
 
-        pipeline = Pipeline(
-            steps=[
-                (
-                    "kernel_approximator",
-                    PolynomialCountSketch(n_components=n_components, degree=4),
-                ),
-                ("linear_classifier", LinearSVC()),
-            ]
+        pipeline = make_pipeline(
+            PolynomialCountSketch(n_components=n_components, degree=4),
+            LinearSVC(),
         )
 
         start = time.time()
@@ -135,6 +148,9 @@
     )
 
 # %%
+# Establishing the kernelized SVM model
+# -------------------------------------
+#
 # Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
 # is approximating the performance of the kernel. This, of course, may take
 # some time, as the SVC class scales relatively poorly. This is the
@@ -153,11 +169,16 @@
 print(f"Kernel-SVM score on raw features: {ksvm_score:.2f}%")
 
 # %%
+# Comparing the results
+# ---------------------
+#
 # Finally, plot the results of the different methods against their training
 # times. As we can see, the kernelized SVM achieves a higher accuracy,
 # but its training time is much longer and, most importantly, will grow
 # much faster if the number of training samples increases.
 
+import matplotlib.pyplot as plt
+
 fig, ax = plt.subplots(figsize=(7, 7))
 ax.scatter(
     [
 
@@ -26,14 +26,14 @@
       },
       "outputs": [],
       "source": [
-        "# Author: Daniel Lopez-Sanchez <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_covtype\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler, Normalizer\nfrom sklearn.svm import LinearSVC\nfrom sklearn.kernel_approximation import PolynomialCountSketch\nfrom sklearn.pipeline import Pipeline, make_pipeline\nimport time"
+        "# Author: Daniel Lopez-Sanchez <[email protected]>\n# License: BSD 3 clause"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Load the Covtype dataset, which contains 581,012 samples\nwith 54 features each, distributed among 6 classes. The goal of this dataset\nis to predict forest cover type from cartographic variables only\n(no remotely sensed data). After loading, we transform it into a binary\nclassification problem to match the version of the dataset in the\nLIBSVM webpage [2], which was the one used in [1].\n\n"
+        "## Preparing the data\n\nLoad the Covtype dataset, which contains 581,012 samples\nwith 54 features each, distributed among 7 classes. The goal of this dataset\nis to predict forest cover type from cartographic variables only\n(no remotely sensed data). After loading, we transform it into a binary\nclassification problem to match the version of the dataset in the\nLIBSVM webpage [2], which was the one used in [1].\n\n"
       ]
     },
     {
@@ -44,14 +44,14 @@
       },
       "outputs": [],
       "source": [
-        "X, y = fetch_covtype(return_X_y=True)\n\ny[y != 2] = 0\ny[y == 2] = 1  # We will try to separate class 2 from the other 6 classes."
+        "from sklearn.datasets import fetch_covtype\n\nX, y = fetch_covtype(return_X_y=True)\n\ny[y != 2] = 0\ny[y == 2] = 1  # We will try to separate class 2 from the other 6 classes."
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Here we select 5,000 samples for training and 10,000 for testing.\nTo actually reproduce the results in the original Tensor Sketch paper,\nselect 100,000 for training.\n\n"
+        "## Partitioning the data\n\nHere we select 5,000 samples for training and 10,000 for testing.\nTo actually reproduce the results in the original Tensor Sketch paper,\nselect 100,000 for training.\n\n"
       ]
     },
     {
@@ -62,14 +62,14 @@
       },
       "outputs": [],
       "source": [
-        "X_train, X_test, y_train, y_test = train_test_split(\n    X, y, train_size=5_000, test_size=10_000, random_state=42\n)"
+        "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, train_size=5_000, test_size=10_000, random_state=42\n)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Now scale features to the range [0, 1] to match the format of the dataset in\nthe LIBSVM webpage, and then normalize to unit length as done in the\noriginal Tensor Sketch paper [1].\n\n"
+        "## Feature normalization\n\nNow scale features to the range [0, 1] to match the format of the dataset in\nthe LIBSVM webpage, and then normalize to unit length as done in the\noriginal Tensor Sketch paper [1].\n\n"
       ]
     },
     {
@@ -80,14 +80,14 @@
       },
       "outputs": [],
       "source": [
-        "mm = make_pipeline(MinMaxScaler(), Normalizer())\nX_train = mm.fit_transform(X_train)\nX_test = mm.transform(X_test)"
+        "from sklearn.preprocessing import MinMaxScaler, Normalizer\nfrom sklearn.pipeline import make_pipeline\n\nmm = make_pipeline(MinMaxScaler(), Normalizer())\nX_train = mm.fit_transform(X_train)\nX_test = mm.transform(X_test)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "As a baseline, train a linear SVM on the original features and print the\naccuracy. We also measure and store accuracies and training times to\nplot them later.\n\n"
+        "## Establishing a baseline model\n\nAs a baseline, train a linear SVM on the original features and print the\naccuracy. We also measure and store accuracies and training times to\nplot them later.\n\n"
       ]
     },
     {
@@ -98,14 +98,14 @@
       },
       "outputs": [],
       "source": [
-        "results = {}\n\nlsvm = LinearSVC()\nstart = time.time()\nlsvm.fit(X_train, y_train)\nlsvm_time = time.time() - start\nlsvm_score = 100 * lsvm.score(X_test, y_test)\n\nresults[\"LSVM\"] = {\"time\": lsvm_time, \"score\": lsvm_score}\nprint(f\"Linear SVM score on raw features: {lsvm_score:.2f}%\")"
+        "import time\nfrom sklearn.svm import LinearSVC\n\nresults = {}\n\nlsvm = LinearSVC()\nstart = time.time()\nlsvm.fit(X_train, y_train)\nlsvm_time = time.time() - start\nlsvm_score = 100 * lsvm.score(X_test, y_test)\n\nresults[\"LSVM\"] = {\"time\": lsvm_time, \"score\": lsvm_score}\nprint(f\"Linear SVM score on raw features: {lsvm_score:.2f}%\")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Then we train linear SVMs on the features generated by\n:class:`PolynomialCountSketch` with different values for `n_components`,\nshowing that these kernel feature approximations improve the accuracy\nof linear classification. In typical application scenarios, `n_components`\nshould be larger than the number of features in the input representation\nin order to achieve an improvement with respect to linear classification.\nAs a rule of thumb, the optimum of evaluation score / run time cost is\ntypically achieved at around `n_components` = 10 * `n_features`, though this\nmight depend on the specific dataset being handled. Note that, since the\noriginal samples have 54 features, the explicit feature map of the\npolynomial kernel of degree four would have approximately 8.5 million\nfeatures (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can\ncondense most of the discriminative information of that feature space into a\nmuch more compact representation. While we run the experiment only a single time\n(`n_runs` = 1) in this example, in practice one should repeat the experiment several\ntimes to compensate for the stochastic nature of :class:`PolynomialCountSketch`.\n\n"
+        "## Establishing the kernel approximation model\n\nThen we train linear SVMs on the features generated by\n:class:`PolynomialCountSketch` with different values for `n_components`,\nshowing that these kernel feature approximations improve the accuracy\nof linear classification. In typical application scenarios, `n_components`\nshould be larger than the number of features in the input representation\nin order to achieve an improvement with respect to linear classification.\nAs a rule of thumb, the optimum of evaluation score / run time cost is\ntypically achieved at around `n_components` = 10 * `n_features`, though this\nmight depend on the specific dataset being handled. Note that, since the\noriginal samples have 54 features, the explicit feature map of the\npolynomial kernel of degree four would have approximately 8.5 million\nfeatures (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can\ncondense most of the discriminative information of that feature space into a\nmuch more compact representation. While we run the experiment only a single time\n(`n_runs` = 1) in this example, in practice one should repeat the experiment several\ntimes to compensate for the stochastic nature of :class:`PolynomialCountSketch`.\n\n"
       ]
     },
     {
@@ -116,14 +116,14 @@
       },
       "outputs": [],
       "source": [
-        "n_runs = 1\nN_COMPONENTS = [250, 500, 1000, 2000]\n\nfor n_components in N_COMPONENTS:\n\n    ps_lsvm_time = 0\n    ps_lsvm_score = 0\n    for _ in range(n_runs):\n\n        pipeline = Pipeline(\n            steps=[\n                (\n                    \"kernel_approximator\",\n                    PolynomialCountSketch(n_components=n_components, degree=4),\n                ),\n                (\"linear_classifier\", LinearSVC()),\n            ]\n        )\n\n        start = time.time()\n        pipeline.fit(X_train, y_train)\n        ps_lsvm_time += time.time() - start\n        ps_lsvm_score += 100 * pipeline.score(X_test, y_test)\n\n    ps_lsvm_time /= n_runs\n    ps_lsvm_score /= n_runs\n\n    results[f\"LSVM + PS({n_components})\"] = {\n        \"time\": ps_lsvm_time,\n        \"score\": ps_lsvm_score,\n    }\n    print(\n        f\"Linear SVM score on {n_components} PolynomialCountSketch \"\n        + f\"features: {ps_lsvm_score:.2f}%\"\n    )"
+        "from sklearn.kernel_approximation import PolynomialCountSketch\n\nn_runs = 1\nN_COMPONENTS = [250, 500, 1000, 2000]\n\nfor n_components in N_COMPONENTS:\n\n    ps_lsvm_time = 0\n    ps_lsvm_score = 0\n    for _ in range(n_runs):\n\n        pipeline = make_pipeline(\n            PolynomialCountSketch(n_components=n_components, degree=4),\n            LinearSVC(),\n        )\n\n        start = time.time()\n        pipeline.fit(X_train, y_train)\n        ps_lsvm_time += time.time() - start\n        ps_lsvm_score += 100 * pipeline.score(X_test, y_test)\n\n    ps_lsvm_time /= n_runs\n    ps_lsvm_score /= n_runs\n\n    results[f\"LSVM + PS({n_components})\"] = {\n        \"time\": ps_lsvm_time,\n        \"score\": ps_lsvm_score,\n    }\n    print(\n        f\"Linear SVM score on {n_components} PolynomialCountSketch \"\n        + f\"features: {ps_lsvm_score:.2f}%\"\n    )"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Train a kernelized SVM to see how well :class:`PolynomialCountSketch`\nis approximating the performance of the kernel. This, of course, may take\nsome time, as the SVC class has a relatively poor scalability. This is the\nreason why kernel approximators are so useful:\n\n"
+        "## Establishing the kernelized SVM model\n\nTrain a kernelized SVM to see how well :class:`PolynomialCountSketch`\nis approximating the performance of the kernel. This, of course, may take\nsome time, as the SVC class has a relatively poor scalability. This is the\nreason why kernel approximators are so useful:\n\n"
       ]
     },
     {
@@ -141,7 +141,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Finally, plot the results of the different methods against their training\ntimes. As we can see, the kernelized SVM achieves a higher accuracy,\nbut its training time is much larger and, most importantly, will grow\nmuch faster if the number of training samples increases.\n\n"
+        "## Comparing the results\n\nFinally, plot the results of the different methods against their training\ntimes. As we can see, the kernelized SVM achieves a higher accuracy,\nbut its training time is much larger and, most importantly, will grow\nmuch faster if the number of training samples increases.\n\n"
       ]
     },
     {
@@ -152,14 +152,14 @@
       },
       "outputs": [],
       "source": [
-        "fig, ax = plt.subplots(figsize=(7, 7))\nax.scatter(\n    [\n        results[\"LSVM\"][\"time\"],\n    ],\n    [\n        results[\"LSVM\"][\"score\"],\n    ],\n    label=\"Linear SVM\",\n    c=\"green\",\n    marker=\"^\",\n)\n\nax.scatter(\n    [\n        results[\"LSVM + PS(250)\"][\"time\"],\n    ],\n    [\n        results[\"LSVM + PS(250)\"][\"score\"],\n    ],\n    label=\"Linear SVM + PolynomialCountSketch\",\n    c=\"blue\",\n)\n\nfor n_components in N_COMPONENTS:\n    ax.scatter(\n        [\n            results[f\"LSVM + PS({n_components})\"][\"time\"],\n        ],\n        [\n            results[f\"LSVM + PS({n_components})\"][\"score\"],\n        ],\n        c=\"blue\",\n    )\n    ax.annotate(\n        f\"n_comp.={n_components}\",\n        (\n            results[f\"LSVM + PS({n_components})\"][\"time\"],\n            results[f\"LSVM + PS({n_components})\"][\"score\"],\n        ),\n        xytext=(-30, 10),\n        textcoords=\"offset pixels\",\n    )\n\nax.scatter(\n    [\n        results[\"KSVM\"][\"time\"],\n    ],\n    [\n        results[\"KSVM\"][\"score\"],\n    ],\n    label=\"Kernel SVM\",\n    c=\"red\",\n    marker=\"x\",\n)\n\nax.set_xlabel(\"Training time (s)\")\nax.set_ylabel(\"Accuracy (%)\")\nax.legend()\nplt.show()"
+        "import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots(figsize=(7, 7))\nax.scatter(\n    [\n        results[\"LSVM\"][\"time\"],\n    ],\n    [\n        results[\"LSVM\"][\"score\"],\n    ],\n    label=\"Linear SVM\",\n    c=\"green\",\n    marker=\"^\",\n)\n\nax.scatter(\n    [\n        results[\"LSVM + PS(250)\"][\"time\"],\n    ],\n    [\n        results[\"LSVM + PS(250)\"][\"score\"],\n    ],\n    label=\"Linear SVM + PolynomialCountSketch\",\n    c=\"blue\",\n)\n\nfor n_components in N_COMPONENTS:\n    ax.scatter(\n        [\n            results[f\"LSVM + PS({n_components})\"][\"time\"],\n        ],\n        [\n            results[f\"LSVM + PS({n_components})\"][\"score\"],\n        ],\n        c=\"blue\",\n    )\n    ax.annotate(\n        f\"n_comp.={n_components}\",\n        (\n            results[f\"LSVM + PS({n_components})\"][\"time\"],\n            results[f\"LSVM + PS({n_components})\"][\"score\"],\n        ),\n        xytext=(-30, 10),\n        textcoords=\"offset pixels\",\n    )\n\nax.scatter(\n    [\n        results[\"KSVM\"][\"time\"],\n    ],\n    [\n        results[\"KSVM\"][\"score\"],\n    ],\n    label=\"Kernel SVM\",\n    c=\"red\",\n    marker=\"x\",\n)\n\nax.set_xlabel(\"Training time (s)\")\nax.set_ylabel(\"Accuracy (%)\")\nax.legend()\nplt.show()"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## References\n\n[1] Pham, Ninh and Rasmus Pagh. \"Fast and scalable polynomial kernels via\nexplicit feature maps.\" KDD '13 (2013).\nhttps://doi.org/10.1145/2487575.2487591\n\n[2] LIBSVM binary datasets repository\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html\n\n"
+        "### References\n\n[1] Pham, Ninh and Rasmus Pagh. \"Fast and scalable polynomial kernels via\nexplicit feature maps.\" KDD '13 (2013).\nhttps://doi.org/10.1145/2487575.2487591\n\n[2] LIBSVM binary datasets repository\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html\n\n"
       ]
     }
   ],
Original file line number	Diff line number	Diff line change
`@@ -26,14 +26,14 @@`
`26`	`26`	`},`
`27`	`27`	`"outputs": [],`
`28`	`28`	`"source": [`
`29`		`- "# Author: Daniel Lopez-Sanchez <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_covtype\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler, Normalizer\nfrom sklearn.svm import LinearSVC\nfrom sklearn.kernel_approximation import PolynomialCountSketch\nfrom sklearn.pipeline import Pipeline, make_pipeline\nimport time"`
	`29`	`+ "# Author: Daniel Lopez-Sanchez <[email protected]>\n# License: BSD 3 clause"`
`30`	`30`	`]`
`31`	`31`	`},`
`32`	`32`	`{`
`33`	`33`	`"cell_type": "markdown",`
`34`	`34`	`"metadata": {},`
`35`	`35`	`"source": [`
`36`		`- "Load the Covtype dataset, which contains 581,012 samples\nwith 54 features each, distributed among 6 classes. The goal of this dataset\nis to predict forest cover type from cartographic variables only\n(no remotely sensed data). After loading, we transform it into a binary\nclassification problem to match the version of the dataset in the\nLIBSVM webpage [2], which was the one used in [1].\n\n"`
	`36`	`+ "## Preparing the data\n\nLoad the Covtype dataset, which contains 581,012 samples\nwith 54 features each, distributed among 6 classes. The goal of this dataset\nis to predict forest cover type from cartographic variables only\n(no remotely sensed data). After loading, we transform it into a binary\nclassification problem to match the version of the dataset in the\nLIBSVM webpage [2], which was the one used in [1].\n\n"`
`37`	`37`	`]`
`38`	`38`	`},`
`39`	`39`	`{`
`@@ -44,14 +44,14 @@`
`44`	`44`	`},`
`45`	`45`	`"outputs": [],`
`46`	`46`	`"source": [`
`47`		`- "X, y = fetch_covtype(return_X_y=True)\n\ny[y != 2] = 0\ny[y == 2] = 1 # We will try to separate class 2 from the other 6 classes."`
	`47`	`+ "from sklearn.datasets import fetch_covtype\n\nX, y = fetch_covtype(return_X_y=True)\n\ny[y != 2] = 0\ny[y == 2] = 1 # We will try to separate class 2 from the other 6 classes."`
`48`	`48`	`]`
`49`	`49`	`},`
`50`	`50`	`{`
`51`	`51`	`"cell_type": "markdown",`
`52`	`52`	`"metadata": {},`
`53`	`53`	`"source": [`
`54`		`- "Here we select 5,000 samples for training and 10,000 for testing.\nTo actually reproduce the results in the original Tensor Sketch paper,\nselect 100,000 for training.\n\n"`
	`54`	`+ "## Partitioning the data\n\nHere we select 5,000 samples for training and 10,000 for testing.\nTo actually reproduce the results in the original Tensor Sketch paper,\nselect 100,000 for training.\n\n"`
`55`	`55`	`]`
`56`	`56`	`},`
`57`	`57`	`{`
`@@ -62,14 +62,14 @@`
`62`	`62`	`},`
`63`	`63`	`"outputs": [],`
`64`	`64`	`"source": [`
`65`		`- "X_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=5_000, test_size=10_000, random_state=42\n)"`
	`65`	`+ "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=5_000, test_size=10_000, random_state=42\n)"`
`66`	`66`	`]`
`67`	`67`	`},`
`68`	`68`	`{`
`69`	`69`	`"cell_type": "markdown",`
`70`	`70`	`"metadata": {},`
`71`	`71`	`"source": [`
`72`		`- "Now scale features to the range [0, 1] to match the format of the dataset in\nthe LIBSVM webpage, and then normalize to unit length as done in the\noriginal Tensor Sketch paper [1].\n\n"`
	`72`	`+ "## Feature normalization\n\nNow scale features to the range [0, 1] to match the format of the dataset in\nthe LIBSVM webpage, and then normalize to unit length as done in the\noriginal Tensor Sketch paper [1].\n\n"`
`73`	`73`	`]`
`74`	`74`	`},`
`75`	`75`	`{`
`@@ -80,14 +80,14 @@`
`80`	`80`	`},`
`81`	`81`	`"outputs": [],`
`82`	`82`	`"source": [`
`83`		`- "mm = make_pipeline(MinMaxScaler(), Normalizer())\nX_train = mm.fit_transform(X_train)\nX_test = mm.transform(X_test)"`
	`83`	`+ "from sklearn.preprocessing import MinMaxScaler, Normalizer\nfrom sklearn.pipeline import make_pipeline\n\nmm = make_pipeline(MinMaxScaler(), Normalizer())\nX_train = mm.fit_transform(X_train)\nX_test = mm.transform(X_test)"`
`84`	`84`	`]`
`85`	`85`	`},`
`86`	`86`	`{`
`87`	`87`	`"cell_type": "markdown",`
`88`	`88`	`"metadata": {},`
`89`	`89`	`"source": [`
`90`		`- "As a baseline, train a linear SVM on the original features and print the\naccuracy. We also measure and store accuracies and training times to\nplot them later.\n\n"`
	`90`	`+ "## Establishing a baseline model\n\nAs a baseline, train a linear SVM on the original features and print the\naccuracy. We also measure and store accuracies and training times to\nplot them later.\n\n"`
`91`	`91`	`]`
`92`	`92`	`},`
`93`	`93`	`{`
`@@ -98,14 +98,14 @@`
`98`	`98`	`},`
`99`	`99`	`"outputs": [],`
`100`	`100`	`"source": [`
`101`		`- "results = {}\n\nlsvm = LinearSVC()\nstart = time.time()\nlsvm.fit(X_train, y_train)\nlsvm_time = time.time() - start\nlsvm_score = 100 * lsvm.score(X_test, y_test)\n\nresults[\"LSVM\"] = {\"time\": lsvm_time, \"score\": lsvm_score}\nprint(f\"Linear SVM score on raw features: {lsvm_score:.2f}%\")"`
	`101`	`+ "import time\nfrom sklearn.svm import LinearSVC\n\nresults = {}\n\nlsvm = LinearSVC()\nstart = time.time()\nlsvm.fit(X_train, y_train)\nlsvm_time = time.time() - start\nlsvm_score = 100 * lsvm.score(X_test, y_test)\n\nresults[\"LSVM\"] = {\"time\": lsvm_time, \"score\": lsvm_score}\nprint(f\"Linear SVM score on raw features: {lsvm_score:.2f}%\")"`
`102`	`102`	`]`
`103`	`103`	`},`
`104`	`104`	`{`
`105`	`105`	`"cell_type": "markdown",`
`106`	`106`	`"metadata": {},`
`107`	`107`	`"source": [`
`108`		- "Then we train linear SVMs on the features generated by\n:class:`PolynomialCountSketch` with different values for `n_components`,\nshowing that these kernel feature approximations improve the accuracy\nof linear classification. In typical application scenarios, `n_components`\nshould be larger than the number of features in the input representation\nin order to achieve an improvement with respect to linear classification.\nAs a rule of thumb, the optimum of evaluation score / run time cost is\ntypically achieved at around `n_components` = 10 * `n_features`, though this\nmight depend on the specific dataset being handled. Note that, since the\noriginal samples have 54 features, the explicit feature map of the\npolynomial kernel of degree four would have approximately 8.5 million\nfeatures (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can\ncondense most of the discriminative information of that feature space into a\nmuch more compact representation. While we run the experiment only a single time\n(`n_runs` = 1) in this example, in practice one should repeat the experiment several\ntimes to compensate for the stochastic nature of :class:`PolynomialCountSketch`.\n\n"
	`108`	+ "## Establishing the kernel approximation model\n\nThen we train linear SVMs on the features generated by\n:class:`PolynomialCountSketch` with different values for `n_components`,\nshowing that these kernel feature approximations improve the accuracy\nof linear classification. In typical application scenarios, `n_components`\nshould be larger than the number of features in the input representation\nin order to achieve an improvement with respect to linear classification.\nAs a rule of thumb, the optimum of evaluation score / run time cost is\ntypically achieved at around `n_components` = 10 * `n_features`, though this\nmight depend on the specific dataset being handled. Note that, since the\noriginal samples have 54 features, the explicit feature map of the\npolynomial kernel of degree four would have approximately 8.5 million\nfeatures (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can\ncondense most of the discriminative information of that feature space into a\nmuch more compact representation. While we run the experiment only a single time\n(`n_runs` = 1) in this example, in practice one should repeat the experiment several\ntimes to compensate for the stochastic nature of :class:`PolynomialCountSketch`.\n\n"
`109`	`109`	`]`
`110`	`110`	`},`
`111`	`111`	`{`
`@@ -116,14 +116,14 @@`
`116`	`116`	`},`
`117`	`117`	`"outputs": [],`
`118`	`118`	`"source": [`
`119`		- "n_runs = 1\nN_COMPONENTS = [250, 500, 1000, 2000]\n\nfor n_components in N_COMPONENTS:\n\n ps_lsvm_time = 0\n ps_lsvm_score = 0\n for _ in range(n_runs):\n\n pipeline = Pipeline(\n steps=[\n (\n \"kernel_approximator\",\n PolynomialCountSketch(n_components=n_components, degree=4),\n ),\n (\"linear_classifier\", LinearSVC()),\n ]\n )\n\n start = time.time()\n pipeline.fit(X_train, y_train)\n ps_lsvm_time += time.time() - start\n ps_lsvm_score += 100 * pipeline.score(X_test, y_test)\n\n ps_lsvm_time /= n_runs\n ps_lsvm_score /= n_runs\n\n results[f\"LSVM + PS({n_components})\"] = {\n \"time\": ps_lsvm_time,\n \"score\": ps_lsvm_score,\n }\n print(\n f\"Linear SVM score on {n_components} PolynomialCountSketch \"\n + f\"features: {ps_lsvm_score:.2f}%\"\n )"
	`119`	+ "from sklearn.kernel_approximation import PolynomialCountSketch\n\nn_runs = 1\nN_COMPONENTS = [250, 500, 1000, 2000]\n\nfor n_components in N_COMPONENTS:\n\n ps_lsvm_time = 0\n ps_lsvm_score = 0\n for _ in range(n_runs):\n\n pipeline = make_pipeline(\n PolynomialCountSketch(n_components=n_components, degree=4),\n LinearSVC(),\n )\n\n start = time.time()\n pipeline.fit(X_train, y_train)\n ps_lsvm_time += time.time() - start\n ps_lsvm_score += 100 * pipeline.score(X_test, y_test)\n\n ps_lsvm_time /= n_runs\n ps_lsvm_score /= n_runs\n\n results[f\"LSVM + PS({n_components})\"] = {\n \"time\": ps_lsvm_time,\n \"score\": ps_lsvm_score,\n }\n print(\n f\"Linear SVM score on {n_components} PolynomialCountSketch \"\n + f\"features: {ps_lsvm_score:.2f}%\"\n )"
`120`	`120`	`]`
`121`	`121`	`},`
`122`	`122`	`{`
`123`	`123`	`"cell_type": "markdown",`
`124`	`124`	`"metadata": {},`
`125`	`125`	`"source": [`
`126`		- "Train a kernelized SVM to see how well :class:`PolynomialCountSketch`\nis approximating the performance of the kernel. This, of course, may take\nsome time, as the SVC class has a relatively poor scalability. This is the\nreason why kernel approximators are so useful:\n\n"
	`126`	+ "## Establishing the kernelized SVM model\n\nTrain a kernelized SVM to see how well :class:`PolynomialCountSketch`\nis approximating the performance of the kernel. This, of course, may take\nsome time, as the SVC class has a relatively poor scalability. This is the\nreason why kernel approximators are so useful:\n\n"
`127`	`127`	`]`
`128`	`128`	`},`
`129`	`129`	`{`
`@@ -141,7 +141,7 @@`
`141`	`141`	`"cell_type": "markdown",`
`142`	`142`	`"metadata": {},`
`143`	`143`	`"source": [`
`144`		`- "Finally, plot the results of the different methods against their training\ntimes. As we can see, the kernelized SVM achieves a higher accuracy,\nbut its training time is much larger and, most importantly, will grow\nmuch faster if the number of training samples increases.\n\n"`
	`144`	`+ "## Comparing the results\n\nFinally, plot the results of the different methods against their training\ntimes. As we can see, the kernelized SVM achieves a higher accuracy,\nbut its training time is much larger and, most importantly, will grow\nmuch faster if the number of training samples increases.\n\n"`
`145`	`145`	`]`
`146`	`146`	`},`
`147`	`147`	`{`
`@@ -152,14 +152,14 @@`
`152`	`152`	`},`
`153`	`153`	`"outputs": [],`
`154`	`154`	`"source": [`
`155`		- "fig, ax = plt.subplots(figsize=(7, 7))\nax.scatter(\n [\n results[\"LSVM\"][\"time\"],\n ],\n [\n results[\"LSVM\"][\"score\"],\n ],\n label=\"Linear SVM\",\n c=\"green\",\n marker=\"^\",\n)\n\nax.scatter(\n [\n results[\"LSVM + PS(250)\"][\"time\"],\n ],\n [\n results[\"LSVM + PS(250)\"][\"score\"],\n ],\n label=\"Linear SVM + PolynomialCountSketch\",\n c=\"blue\",\n)\n\nfor n_components in N_COMPONENTS:\n ax.scatter(\n [\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n ],\n [\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ],\n c=\"blue\",\n )\n ax.annotate(\n f\"n_comp.={n_components}\",\n (\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ),\n xytext=(-30, 10),\n textcoords=\"offset pixels\",\n )\n\nax.scatter(\n [\n results[\"KSVM\"][\"time\"],\n ],\n [\n results[\"KSVM\"][\"score\"],\n ],\n label=\"Kernel SVM\",\n c=\"red\",\n marker=\"x\",\n)\n\nax.set_xlabel(\"Training time (s)\")\nax.set_ylabel(\"Accuracy (%)\")\nax.legend()\nplt.show()"
	`155`	+ "import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots(figsize=(7, 7))\nax.scatter(\n [\n results[\"LSVM\"][\"time\"],\n ],\n [\n results[\"LSVM\"][\"score\"],\n ],\n label=\"Linear SVM\",\n c=\"green\",\n marker=\"^\",\n)\n\nax.scatter(\n [\n results[\"LSVM + PS(250)\"][\"time\"],\n ],\n [\n results[\"LSVM + PS(250)\"][\"score\"],\n ],\n label=\"Linear SVM + PolynomialCountSketch\",\n c=\"blue\",\n)\n\nfor n_components in N_COMPONENTS:\n ax.scatter(\n [\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n ],\n [\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ],\n c=\"blue\",\n )\n ax.annotate(\n f\"n_comp.={n_components}\",\n (\n results[f\"LSVM + PS({n_components})\"][\"time\"],\n results[f\"LSVM + PS({n_components})\"][\"score\"],\n ),\n xytext=(-30, 10),\n textcoords=\"offset pixels\",\n )\n\nax.scatter(\n [\n results[\"KSVM\"][\"time\"],\n ],\n [\n results[\"KSVM\"][\"score\"],\n ],\n label=\"Kernel SVM\",\n c=\"red\",\n marker=\"x\",\n)\n\nax.set_xlabel(\"Training time (s)\")\nax.set_ylabel(\"Accuracy (%)\")\nax.legend()\nplt.show()"
`156`	`156`	`]`
`157`	`157`	`},`
`158`	`158`	`{`
`159`	`159`	`"cell_type": "markdown",`
`160`	`160`	`"metadata": {},`
`161`	`161`	`"source": [`
`162`		`- "## References\n\n[1] Pham, Ninh and Rasmus Pagh. \"Fast and scalable polynomial kernels via\nexplicit feature maps.\" KDD '13 (2013).\nhttps://doi.org/10.1145/2487575.2487591\n\n[2] LIBSVM binary datasets repository\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html\n\n"`
	`162`	`+ "### References\n\n[1] Pham, Ninh and Rasmus Pagh. \"Fast and scalable polynomial kernels via\nexplicit feature maps.\" KDD '13 (2013).\nhttps://doi.org/10.1145/2487575.2487591\n\n[2] LIBSVM binary datasets repository\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html\n\n"`
`163`	`163`	`]`
`164`	`164`	`}`
`165`	`165`	`],`