Commit aa1d86e

Pushing the docs for revision for branch: master, commit 540c7c66d2d93d4e4b5d49b3309ac0050b86e5eb

1 parent: 714cf1f
File tree: 914 files changed (+4616 / −3490 lines)


dev/_downloads/plot_huber_vs_ridge.py (65 additions & 0 deletions)

@@ -0,0 +1,65 @@
"""
=======================================================
HuberRegressor vs Ridge on dataset with strong outliers
=======================================================

Fit Ridge and HuberRegressor on a dataset with outliers.

The example shows that the predictions in ridge are strongly influenced
by the outliers present in the dataset. The Huber regressor is less
influenced by the outliers since the model uses the linear loss for these.
As the parameter epsilon is increased for the Huber regressor, the decision
function approaches that of the ridge.
"""

# Authors: Manoj Kumar [email protected]
# License: BSD 3 clause

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_regression
from sklearn.linear_model import HuberRegressor, Ridge

# Generate toy data.
rng = np.random.RandomState(0)
X, y = make_regression(n_samples=20, n_features=1, random_state=0, noise=4.0,
                       bias=100.0)

# Add four strong outliers to the dataset.
X_outliers = rng.normal(0, 0.5, size=(4, 1))
y_outliers = rng.normal(0, 2.0, size=4)
X_outliers[:2, :] += X.max() + X.mean() / 4.
X_outliers[2:, :] += X.min() - X.mean() / 4.
y_outliers[:2] += y.min() - y.mean() / 4.
y_outliers[2:] += y.max() + y.mean() / 4.
X = np.vstack((X, X_outliers))
y = np.concatenate((y, y_outliers))
plt.plot(X, y, 'b.')

# Fit the huber regressor over a series of epsilon values.
colors = ['r-', 'b-', 'y-', 'm-']

x = np.linspace(X.min(), X.max(), 7)
epsilon_values = [1.35, 1.5, 1.75, 1.9]
for k, epsilon in enumerate(epsilon_values):
    huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100,
                           epsilon=epsilon)
    huber.fit(X, y)
    coef_ = huber.coef_ * x + huber.intercept_
    plt.plot(x, coef_, colors[k], label="huber loss, %s" % epsilon)

# Fit a ridge regressor to compare it to huber regressor.
ridge = Ridge(fit_intercept=True, alpha=0.0, random_state=0, normalize=True)
ridge.fit(X, y)
coef_ridge = ridge.coef_
coef_ = ridge.coef_ * x + ridge.intercept_
plt.plot(x, coef_, 'g-', label="ridge regression")

plt.title("Comparison of HuberRegressor vs Ridge")
plt.xlabel("X")
plt.ylabel("y")
plt.legend(loc=0)
plt.show()
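Why the Huber fit resists the outliers comes down to the shape of its loss: quadratic for residuals within epsilon, linear beyond it, so a far-away point pulls on the coefficients only linearly rather than quadratically as in ridge. A minimal sketch of that loss as a standalone illustrative function (the actual HuberRegressor additionally scales residuals by a jointly estimated sigma, omitted here):

import numpy as np

def huber_loss(residual, epsilon=1.35):
    # Quadratic near zero, linear past epsilon: large residuals grow
    # only linearly, so outliers influence the fit far less than under
    # the squared loss that Ridge minimizes.
    abs_r = np.abs(residual)
    return np.where(abs_r <= epsilon,
                    0.5 * abs_r ** 2,
                    epsilon * abs_r - 0.5 * epsilon ** 2)

As epsilon grows, more residuals fall in the quadratic region and the fit approaches the ridge line, which is what the plot above shows. After fitting, the samples handled with the linear loss are exposed in the boolean mask huber.outliers_.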

dev/_downloads/plot_robust_fit.py (16 additions & 8 deletions)
@@ -22,14 +22,20 @@
 - RANSAC is good for strong outliers in the y direction
 
 - TheilSen is good for small outliers, both in direction X and y, but has
-  a break point above which it performs worst than OLS.
+  a break point above which it performs worse than OLS.
+
+- The scores of HuberRegressor may not be compared directly to both TheilSen
+  and RANSAC because it does not attempt to completely filter the outliers
+  but lessen their effect.
 
 """
 
 from matplotlib import pyplot as plt
 import numpy as np
 
-from sklearn import linear_model, metrics
+from sklearn.linear_model import (
+    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
+from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
 
@@ -56,12 +62,14 @@
 X_errors_large = X.copy()
 X_errors_large[::3] = 10
 
-estimators = [('OLS', linear_model.LinearRegression()),
-              ('Theil-Sen', linear_model.TheilSenRegressor(random_state=42)),
-              ('RANSAC', linear_model.RANSACRegressor(random_state=42)), ]
-colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'}
-linestyle = {'OLS': '-', 'Theil-Sen': '-.', 'RANSAC': '--'}
+estimators = [('OLS', LinearRegression()),
+              ('Theil-Sen', TheilSenRegressor(random_state=42)),
+              ('RANSAC', RANSACRegressor(random_state=42)),
+              ('HuberRegressor', HuberRegressor())]
+colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen', 'HuberRegressor': 'black'}
+linestyle = {'OLS': '-', 'Theil-Sen': '-.', 'RANSAC': '--', 'HuberRegressor': '--'}
 lw = 3
+
 x_plot = np.linspace(X.min(), X.max())
 for title, this_X, this_y in [
         ('Modeling Errors Only', X, y),
 
@@ -75,7 +83,7 @@
     for name, estimator in estimators:
         model = make_pipeline(PolynomialFeatures(3), estimator)
         model.fit(this_X, this_y)
-        mse = metrics.mean_squared_error(model.predict(X_test), y_test)
+        mse = mean_squared_error(model.predict(X_test), y_test)
         y_plot = model.predict(x_plot[:, np.newaxis])
         plt.plot(x_plot, y_plot, color=colors[name], linestyle=linestyle[name],
                  linewidth=lw, label='%s: error = %.3f' % (name, mse))
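The docstring caveat added above (HuberRegressor's score should not be compared directly to TheilSen and RANSAC) follows from how each estimator treats flagged samples: RANSAC refits on its consensus set and discards the rest, while Huber keeps every sample and merely down-weights the flagged ones. A minimal sketch on made-up data, using only the documented inlier_mask_ and outliers_ attributes:

import numpy as np
from sklearn.linear_model import HuberRegressor, RANSACRegressor

rng = np.random.RandomState(42)
X = rng.normal(size=(50, 1))
y = 3 * X.ravel() + rng.normal(scale=0.5, size=50)
y[::10] += 20.0  # strong outliers in the y direction

ransac = RANSACRegressor(random_state=42).fit(X, y)
huber = HuberRegressor().fit(X, y)

# RANSAC's final coefficients come from the consensus set only;
# samples outside inlier_mask_ contribute nothing to the fit.
print("RANSAC discarded %d samples" % (~ransac.inlier_mask_).sum())
# HuberRegressor fits on all samples but applies the linear loss to the
# flagged ones, lessening rather than removing their effect.
print("Huber flagged %d samples" % huber.outliers_.sum())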
