Commit c71f8c3

Pushing the docs to dev/ for branch: master, commit c24e749d2e112237b32ea60a8e7023447f67f6b5
1 parent c649ff0 commit c71f8c3


2,407 files changed: +13551 additions, -6539 deletions


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: c3012f96f030707b87aed004ce706ee9
+config: ebb9ea68698aea15799d10fa0216399e
 tags: 645f666f9bcd5a90fca523b33c5a78b7
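
As the comment in the file says, Sphinx stores a hash of the build configuration and performs a full rebuild when the stored hash is missing or no longer matches; the changed `config` value above simply reflects a new configuration. In spirit, the mechanism looks like this minimal sketch (illustrative names only, not Sphinx's actual implementation):

import hashlib

def config_hash(config: dict) -> str:
    # Hash a canonical string form of the configuration values.
    return hashlib.md5(repr(sorted(config.items())).encode()).hexdigest()

def needs_full_rebuild(stored_hash, current_config: dict) -> bool:
    # Rebuild everything when no hash was stored or the configuration changed.
    return stored_hash is None or stored_hash != config_hash(current_config)
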
Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,186 @@
"""
=======================================================
Scalable learning with polynomial kernel approximation
=======================================================

This example illustrates the use of :class:`PolynomialCountSketch` to
efficiently generate polynomial kernel feature-space approximations.
This is used to train linear classifiers that approximate the accuracy
of kernelized ones.

.. currentmodule:: sklearn.kernel_approximation

We use the Covtype dataset [2], trying to reproduce the experiments of the
original Tensor Sketch paper [1], i.e. the algorithm implemented by
:class:`PolynomialCountSketch`.

First, we compute the accuracy of a linear classifier on the original
features. Then, we train linear classifiers on different numbers of
features (`n_components`) generated by :class:`PolynomialCountSketch`,
approximating the accuracy of a kernelized classifier in a scalable manner.
"""
print(__doc__)

# Author: Daniel Lopez-Sanchez <[email protected]>
# License: BSD 3 clause
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.pipeline import Pipeline, make_pipeline
import time

# %%
# Load the Covtype dataset, which contains 581,012 samples
# with 54 features each, distributed among 7 classes. The goal of this dataset
# is to predict forest cover type from cartographic variables only
# (no remotely sensed data). After loading, we transform it into a binary
# classification problem to match the version of the dataset on the
# LIBSVM webpage [2], which was the one used in [1].

X, y = fetch_covtype(return_X_y=True)

y[y != 2] = 0
y[y == 2] = 1  # We will try to separate class 2 from the other 6 classes.

# %%
# Here we select 5,000 samples for training and 10,000 for testing.
# To actually reproduce the results in the original Tensor Sketch paper,
# select 100,000 for training.

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5_000,
                                                    test_size=10_000,
                                                    random_state=42)

# %%
# Now scale features to the range [0, 1] to match the format of the dataset on
# the LIBSVM webpage, and then normalize to unit length as done in the
# original Tensor Sketch paper [1].

mm = make_pipeline(MinMaxScaler(), Normalizer())
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)


# %%
# As a baseline, train a linear SVM on the original features and print the
# accuracy. We also measure and store accuracies and training times to
# plot them later.

results = {}

lsvm = LinearSVC()
start = time.time()
lsvm.fit(X_train, y_train)
lsvm_time = time.time() - start
lsvm_score = 100 * lsvm.score(X_test, y_test)

results["LSVM"] = {"time": lsvm_time, "score": lsvm_score}
print(f"Linear SVM score on raw features: {lsvm_score:.2f}%")

# %%
# Then we train linear SVMs on the features generated by
# :class:`PolynomialCountSketch` with different values for `n_components`,
# showing that these kernel feature approximations improve the accuracy
# of linear classification. In typical application scenarios, `n_components`
# should be larger than the number of features in the input representation
# in order to achieve an improvement with respect to linear classification.
# As a rule of thumb, the best trade-off between evaluation score and run time
# cost is typically achieved at around `n_components` = 10 * `n_features`,
# though this might depend on the specific dataset being handled. Note that,
# since the original samples have 54 features, the explicit feature map of the
# polynomial kernel of degree four would have approximately 8.5 million
# features (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can
# condense most of the discriminative information of that feature space into a
# much more compact representation. We repeat the experiment `n_runs` times to
# compensate for the stochastic nature of :class:`PolynomialCountSketch`.

n_runs = 3
for n_components in [250, 500, 1000, 2000]:

    ps_lsvm_time = 0
    ps_lsvm_score = 0
    for _ in range(n_runs):

        pipeline = Pipeline(steps=[("kernel_approximator",
                                    PolynomialCountSketch(
                                        n_components=n_components,
                                        degree=4)),
                                   ("linear_classifier", LinearSVC())])

        start = time.time()
        pipeline.fit(X_train, y_train)
        ps_lsvm_time += time.time() - start
        ps_lsvm_score += 100 * pipeline.score(X_test, y_test)

    ps_lsvm_time /= n_runs
    ps_lsvm_score /= n_runs

    results[f"LSVM + PS({n_components})"] = {
        "time": ps_lsvm_time, "score": ps_lsvm_score
    }
    print(f"Linear SVM score on {n_components} PolynomialCountSketch " +
          f"features: {ps_lsvm_score:.2f}%")

# %%
# Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
# is approximating the performance of the kernel. This, of course, may take
# some time, as the SVC class scales relatively poorly. This is the
# reason why kernel approximators are so useful:

from sklearn.svm import SVC

ksvm = SVC(C=500., kernel="poly", degree=4, coef0=0, gamma=1.)

start = time.time()
ksvm.fit(X_train, y_train)
ksvm_time = time.time() - start
ksvm_score = 100 * ksvm.score(X_test, y_test)

results["KSVM"] = {"time": ksvm_time, "score": ksvm_score}
print(f"Kernel-SVM score on raw features: {ksvm_score:.2f}%")

# %%
# Finally, plot the results of the different methods against their training
# times. As we can see, the kernelized SVM achieves a higher accuracy,
# but its training time is much larger and, most importantly, will grow
# much faster if the number of training samples increases.

N_COMPONENTS = [250, 500, 1000, 2000]

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter([results["LSVM"]["time"], ], [results["LSVM"]["score"], ],
           label="Linear SVM", c="green", marker="^")

ax.scatter([results["LSVM + PS(250)"]["time"], ],
           [results["LSVM + PS(250)"]["score"], ],
           label="Linear SVM + PolynomialCountSketch", c="blue")
for n_components in N_COMPONENTS:
    ax.scatter([results[f"LSVM + PS({n_components})"]["time"], ],
               [results[f"LSVM + PS({n_components})"]["score"], ],
               c="blue")
    ax.annotate(f"n_comp.={n_components}",
                (results[f"LSVM + PS({n_components})"]["time"],
                 results[f"LSVM + PS({n_components})"]["score"]),
                xytext=(-30, 10), textcoords="offset pixels")

ax.scatter([results["KSVM"]["time"], ], [results["KSVM"]["score"], ],
           label="Kernel SVM", c="red", marker="x")

ax.set_xlabel("Training time (s)")
ax.set_ylabel("Accuracy (%)")
ax.legend()
plt.show()

# %%
# References
# ==========
#
# [1] Pham, Ninh and Rasmus Pagh. "Fast and scalable polynomial kernels via
# explicit feature maps." KDD '13 (2013).
# https://doi.org/10.1145/2487575.2487591
#
# [2] LIBSVM binary datasets repository
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
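
As a side note on the example above: the quality of the approximation can be sanity-checked directly, since inner products of the sketched features should approach the exact polynomial kernel values as `n_components` grows. The snippet below is a minimal standalone sketch, not part of the committed example; the array names and sizes are illustrative.

import numpy as np
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.preprocessing import Normalizer

rng = np.random.RandomState(0)
# Unit-length rows with 54 features, mirroring the preprocessed Covtype data.
X_demo = Normalizer().fit_transform(rng.rand(20, 54))

# Exact degree-4 polynomial kernel with the same gamma/coef0 as the example.
exact = polynomial_kernel(X_demo, degree=4, gamma=1., coef0=0)

# Sketched feature map: inner products of Z approximate the kernel values.
pcs = PolynomialCountSketch(degree=4, gamma=1., coef0=0,
                            n_components=5000, random_state=0)
Z = pcs.fit_transform(X_demo)
approx = Z @ Z.T

print(np.mean(np.abs(exact - approx)))  # shrinks as n_components grows
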
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Principal Component Regression vs Partial Least Squares Regression\n\n\nThis example compares `Principal Component Regression\n<https://en.wikipedia.org/wiki/Principal_component_regression>`_ (PCR) and\n`Partial Least Squares Regression\n<https://en.wikipedia.org/wiki/Partial_least_squares_regression>`_ (PLS) on a\ntoy dataset. Our goal is to illustrate how PLS can outperform PCR when the\ntarget is strongly correlated with some directions in the data that have a\nlow variance.\n\nPCR is a regressor composed of two steps: first,\n:class:`~sklearn.decomposition.PCA` is applied to the training data, possibly\nperforming dimensionality reduction; then, a regressor (e.g. a linear\nregressor) is trained on the transformed samples. In\n:class:`~sklearn.decomposition.PCA`, the transformation is purely\nunsupervised, meaning that no information about the targets is used. As a\nresult, PCR may perform poorly in some datasets where the target is strongly\ncorrelated with *directions* that have low variance. Indeed, the\ndimensionality reduction of PCA projects the data into a lower dimensional\nspace where the variance of the projected data is greedily maximized along\neach axis. Despite them having the most predictive power on the target, the\ndirections with a lower variance will be dropped, and the final regressor\nwill not be able to leverage them.\n\nPLS is both a transformer and a regressor, and it is quite similar to PCR: it\nalso applies a dimensionality reduction to the samples before applying a\nlinear regressor to the transformed data. The main difference with PCR is\nthat the PLS transformation is supervised. Therefore, as we will see in this\nexample, it does not suffer from the issue we just mentioned.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The data\n--------\n\nWe start by creating a simple dataset with two features. Before we even dive\ninto PCR and PLS, we fit a PCA estimator to display the two principal\ncomponents of this dataset, i.e. the two directions that explain the most\nvariance in the data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition import PCA\n\nrng = np.random.RandomState(0)\nn_samples = 500\ncov = [[3, 3],\n       [3, 4]]\nX = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)\npca = PCA(n_components=2).fit(X)\n\n\nplt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')\nfor i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):\n    comp = comp * var  # scale component by its variance explanation power\n    plt.plot([0, comp[0]], [0, comp[1]], label=f\"Component {i}\", linewidth=5,\n             color=f\"C{i + 2}\")\nplt.gca().set(aspect='equal',\n              title=\"2-dimensional dataset with principal components\",\n              xlabel='first feature', ylabel='second feature')\nplt.legend()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "For the purpose of this example, we now define the target `y` such that it is\nstrongly correlated with a direction that has a small variance. To this end,\nwe will project `X` onto the second component, and add some noise to it.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2\n\nfig, axes = plt.subplots(1, 2, figsize=(10, 3))\n\naxes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)\naxes[0].set(xlabel='Projected data onto first PCA component', ylabel='y')\naxes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)\naxes[1].set(xlabel='Projected data onto second PCA component', ylabel='y')\nplt.tight_layout()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Projection on one component and predictive power\n------------------------------------------------\n\nWe now create two regressors: PCR and PLS, and for our illustration purposes\nwe set the number of components to 1. Before feeding the data to the PCA step\nof PCR, we first standardize it, as recommended by good practice. The PLS\nestimator has built-in scaling capabilities.\n\nFor both models, we plot the projected data onto the first component against\nthe target. In both cases, this projected data is what the regressors will\nuse as training data.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.cross_decomposition import PLSRegression\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)\n\npcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())\npcr.fit(X_train, y_train)\npca = pcr.named_steps['pca']  # retrieve the PCA step of the pipeline\n\npls = PLSRegression(n_components=1)\npls.fit(X_train, y_train)\n\nfig, axes = plt.subplots(1, 2, figsize=(10, 3))\naxes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth')\naxes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3,\n                label='predictions')\naxes[0].set(xlabel='Projected data onto first PCA component',\n            ylabel='y', title='PCR / PCA')\naxes[0].legend()\naxes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth')\naxes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3,\n                label='predictions')\naxes[1].set(xlabel='Projected data onto first PLS component',\n            ylabel='y', title='PLS')\naxes[1].legend()\nplt.tight_layout()\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As expected, the unsupervised PCA transformation of PCR has dropped the\nsecond component, i.e. the direction with the lowest variance, despite\nit being the most predictive direction. This is because PCA is a completely\nunsupervised transformation, and results in the projected data having a low\npredictive power on the target.\n\nOn the other hand, the PLS regressor manages to capture the effect of the\ndirection with the lowest variance, thanks to its use of target information\nduring the transformation: it can recognize that this direction is actually\nthe most predictive. We note that the first PLS component is negatively\ncorrelated with the target, which comes from the fact that the signs of\neigenvectors are arbitrary.\n\nWe also print the R-squared scores of both estimators, which further confirms\nthat PLS is a better alternative than PCR in this case. A negative R-squared\nindicates that PCR performs worse than a regressor that would simply predict\nthe mean of the target.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(f\"PCR r-squared {pcr.score(X_test, y_test):.3f}\")\nprint(f\"PLS r-squared {pls.score(X_test, y_test):.3f}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As a final remark, we note that PCR with 2 components performs as well as\nPLS: this is because in this case, PCR was able to leverage the second\ncomponent which has the most predictive power on the target.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())\npca_2.fit(X_train, y_train)\nprint(f\"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}\")"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
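
A compact way to state the difference the notebook describes (standard textbook formulation, not taken from the notebook itself): the first PCA direction solves

    w_pca = argmax_{||w|| = 1} Var(X w)

while the first PLS direction solves

    w_pls = argmax_{||w|| = 1} Cov(X w, y)^2

so PCA never looks at y, whereas PLS trades raw variance for covariance with the target, which is why it keeps the low-variance but highly predictive direction in this example.
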

dev/_downloads/0b39f715b5e32f01df3d212b6d822b82/plot_calibration.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@
 clf_sigmoid.fit(X_train, y_train, sample_weight=sw_train)
 prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1]
 
-print("Brier scores: (the smaller the better)")
+print("Brier score losses: (the smaller the better)")
 
 clf_score = brier_score_loss(y_test, prob_pos_clf, sample_weight=sw_test)
 print("No calibration: %1.3f" % clf_score)

0 commit comments
