
 This example illustrates both methods on an artificial dataset, which
 consists of a sinusoidal target function and strong noise added to every fifth
-datapoint. The first figure compares the learned model of KRR and SVR when both
-complexity/regularization and bandwidth of the RBF kernel are optimized using
-grid-search. The learned functions are very similar; however, fitting KRR is
-approx. seven times faster than fitting SVR (both with grid-search). However,
-prediction of 100000 target values is more than tree times faster with SVR
-since it has learned a sparse model using only approx. 1/3 of the 100 training
-datapoints as support vectors.
-
-The next figure compares the time for fitting and prediction of KRR and SVR for
-different sizes of the training set. Fitting KRR is faster than SVR for medium-
-sized training sets (less than 1000 samples); however, for larger training sets
-SVR scales better. With regard to prediction time, SVR is faster than
-KRR for all sizes of the training set because of the learned sparse
-solution. Note that the degree of sparsity and thus the prediction time depends
-on the parameters epsilon and C of the SVR.
+datapoint.

 """
-
+# %%
 # Authors: Jan Hendrik Metzen <[email protected]>
 # License: BSD 3 clause

-import time
-
+# %%
+# Generate sample data
+# --------------------
 import numpy as np

-from sklearn.svm import SVR
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import learning_curve
-from sklearn.kernel_ridge import KernelRidge
-import matplotlib.pyplot as plt
-
 rng = np.random.RandomState(42)

-# #############################################################################
-# Generate sample data
 X = 5 * rng.rand(10000, 1)
 y = np.sin(X).ravel()

...

 X_plot = np.linspace(0, 5, 100000)[:, None]

-# #############################################################################
-# Fit regression model
+# %%
+# Construct the kernel-based regression models
+# --------------------------------------------
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import SVR
+from sklearn.kernel_ridge import KernelRidge
+
 train_size = 100
+
 svr = GridSearchCV(
     SVR(kernel="rbf", gamma=0.1),
     param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)},
...
     param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
 )

+# %%
+# Compare times of SVR and Kernel Ridge Regression
+# ------------------------------------------------
+
+import time
+
 t0 = time.time()
 svr.fit(X[:train_size], y[:train_size])
 svr_fit = time.time() - t0
+print(f"Best SVR with params: {svr.best_params_} and R2 score: {svr.best_score_:.3f}")
 print("SVR complexity and bandwidth selected and model fitted in %.3f s" % svr_fit)

 t0 = time.time()
 kr.fit(X[:train_size], y[:train_size])
 kr_fit = time.time() - t0
+print(f"Best KRR with params: {kr.best_params_} and R2 score: {kr.best_score_:.3f}")
 print("KRR complexity and bandwidth selected and model fitted in %.3f s" % kr_fit)

 sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
...
 kr_predict = time.time() - t0
 print("KRR prediction for %d inputs in %.3f s" % (X_plot.shape[0], kr_predict))

-
-# #############################################################################
+# %%
 # Look at the results
+# -------------------
+
+import matplotlib.pyplot as plt
+
 sv_ind = svr.best_estimator_.support_
 plt.scatter(
     X[sv_ind],
...
 plt.xlabel("data")
 plt.ylabel("target")
 plt.title("SVR versus Kernel Ridge")
-plt.legend()
+_ = plt.legend()
+
+# %%
+# The previous figure compares the learned model of KRR and SVR when both
+# complexity/regularization and bandwidth of the RBF kernel are optimized using
+# grid-search. The learned functions are very similar; however, fitting KRR is
+# approximately 3-4 times faster than fitting SVR (both with grid-search).
+#
+# Prediction of 100000 target values could in theory be approximately three
+# times faster with SVR since it has learned a sparse model using only
+# approximately 1/3 of the training datapoints as support vectors. However, in
+# practice this is not necessarily the case: because of implementation details
+# in the way the kernel function is computed for each model, the KRR model can
+# be as fast or even faster despite computing more arithmetic operations.
+
+# %%
+# Visualize training and prediction times
+# ---------------------------------------

-# Visualize training and prediction time
 plt.figure()

-# Generate sample data
-X = 5 * rng.rand(10000, 1)
-y = np.sin(X).ravel()
-y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
 sizes = np.logspace(1, 3.8, 7).astype(int)
 for name, estimator in {
     "KRR": KernelRidge(kernel="rbf", alpha=0.01, gamma=10),
...
 plt.xlabel("Train size")
 plt.ylabel("Time (seconds)")
 plt.title("Execution Time")
-plt.legend(loc="best")
+_ = plt.legend(loc="best")
+
+# %%
+# This figure compares the time for fitting and prediction of KRR and SVR for
+# different sizes of the training set. Fitting KRR is faster than SVR for
+# medium-sized training sets (less than a few thousand samples); however, for
+# larger training sets SVR scales better. With regard to prediction time, SVR
+# should be faster than KRR for all sizes of the training set because of the
+# learned sparse solution; however, this is not necessarily the case in
+# practice because of implementation details. Note that the degree of sparsity
+# and thus the prediction time depends on the parameters epsilon and C of the
+# SVR.
+
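As a rough illustration of that last point, here is a minimal sketch (not part of the diff above; the toy data and the fixed C/gamma values are arbitrary choices for the illustration) that counts the support vectors retained by SVR for a few values of epsilon. A wider epsilon-tube leaves more points inside the tube, so fewer support vectors are kept and prediction gets cheaper; the exact counts depend on the data and on the other parameters.

import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(42)
X_demo = 5 * rng.rand(100, 1)
y_demo = np.sin(X_demo).ravel() + 0.1 * rng.randn(100)

for eps in (0.01, 0.1, 0.5):
    # Fit an RBF SVR with fixed C and gamma, then count the retained support vectors.
    svr_eps = SVR(kernel="rbf", C=10.0, gamma=0.1, epsilon=eps)
    n_sv = svr_eps.fit(X_demo, y_demo).support_.shape[0]
    print(f"epsilon={eps}: {n_sv} support vectors out of {X_demo.shape[0]} samples")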
+# %%
+# Visualize the learning curves
+# -----------------------------
+
+from sklearn.model_selection import learning_curve

-# Visualize learning curves
 plt.figure()

 svr = SVR(kernel="rbf", C=1e1, gamma=0.1)
...
     cv=10,
 )

-plt.plot(train_sizes, -test_scores_svr.mean(1), "o-", color="r", label="SVR")
-plt.plot(train_sizes, -test_scores_kr.mean(1), "o-", color="g", label="KRR")
+plt.plot(train_sizes, -test_scores_kr.mean(1), "o--", color="g", label="KRR")
+plt.plot(train_sizes, -test_scores_svr.mean(1), "o--", color="r", label="SVR")
 plt.xlabel("Train size")
 plt.ylabel("Mean Squared Error")
 plt.title("Learning curves")