Commit aa1d86e

Pushing the docs for revision for branch: master, commit 540c7c66d2d93d4e4b5d49b3309ac0050b86e5eb

1 parent: 714cf1f
File tree: 914 files changed (+4616 / −3490 lines)


dev/_downloads/plot_huber_vs_ridge.py (65 additions & 0 deletions)

@@ -0,0 +1,65 @@
"""
=======================================================
HuberRegressor vs Ridge on dataset with strong outliers
=======================================================

Fit Ridge and HuberRegressor on a dataset with outliers.

The example shows that the predictions in ridge are strongly influenced
by the outliers present in the dataset. The Huber regressor is less
influenced by the outliers since the model uses the linear loss for these.
As the parameter epsilon is increased for the Huber regressor, the decision
function approaches that of the ridge.
"""

# Authors: Manoj Kumar [email protected]
# License: BSD 3 clause

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_regression
from sklearn.linear_model import HuberRegressor, Ridge

# Generate toy data.
rng = np.random.RandomState(0)
X, y = make_regression(n_samples=20, n_features=1, random_state=0, noise=4.0,
                       bias=100.0)

# Add four strong outliers to the dataset.
X_outliers = rng.normal(0, 0.5, size=(4, 1))
y_outliers = rng.normal(0, 2.0, size=4)
X_outliers[:2, :] += X.max() + X.mean() / 4.
X_outliers[2:, :] += X.min() - X.mean() / 4.
y_outliers[:2] += y.min() - y.mean() / 4.
y_outliers[2:] += y.max() + y.mean() / 4.
X = np.vstack((X, X_outliers))
y = np.concatenate((y, y_outliers))
plt.plot(X, y, 'b.')

# Fit the huber regressor over a series of epsilon values.
colors = ['r-', 'b-', 'y-', 'm-']

x = np.linspace(X.min(), X.max(), 7)
epsilon_values = [1.35, 1.5, 1.75, 1.9]
for k, epsilon in enumerate(epsilon_values):
    huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100,
                           epsilon=epsilon)
    huber.fit(X, y)
    coef_ = huber.coef_ * x + huber.intercept_
    plt.plot(x, coef_, colors[k], label="huber loss, %s" % epsilon)

# Fit a ridge regressor to compare it to huber regressor.
ridge = Ridge(fit_intercept=True, alpha=0.0, random_state=0, normalize=True)
ridge.fit(X, y)
coef_ridge = ridge.coef_
coef_ = ridge.coef_ * x + ridge.intercept_
plt.plot(x, coef_, 'g-', label="ridge regression")

plt.title("Comparison of HuberRegressor vs Ridge")
plt.xlabel("X")
plt.ylabel("y")
plt.legend(loc=0)
plt.show()
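Why the Huber fit resists the outliers comes down to the shape of its loss: quadratic for residuals within epsilon, linear beyond it, so a far-away point pulls on the coefficients only linearly rather than quadratically as in ridge. A minimal sketch of that loss as a standalone illustrative function (the actual HuberRegressor additionally scales residuals by a jointly estimated sigma, omitted here):

import numpy as np

def huber_loss(residual, epsilon=1.35):
    # Quadratic near zero, linear past epsilon: large residuals grow
    # only linearly, so outliers influence the fit far less than under
    # the squared loss that Ridge minimizes.
    abs_r = np.abs(residual)
    return np.where(abs_r <= epsilon,
                    0.5 * abs_r ** 2,
                    epsilon * abs_r - 0.5 * epsilon ** 2)

As epsilon grows, more residuals fall in the quadratic region and the fit approaches the ridge line, which is what the plot above shows. After fitting, the samples handled with the linear loss are exposed in the boolean mask huber.outliers_.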

dev/_downloads/plot_robust_fit.py (16 additions & 8 deletions)
@@ -22,14 +22,20 @@
 - RANSAC is good for strong outliers in the y direction
 
 - TheilSen is good for small outliers, both in direction X and y, but has
-  a break point above which it performs worst than OLS.
+  a break point above which it performs worse than OLS.
+
+- The scores of HuberRegressor may not be compared directly to both TheilSen
+  and RANSAC because it does not attempt to completely filter the outliers
+  but lessen their effect.
 
 """
 
 from matplotlib import pyplot as plt
 import numpy as np
 
-from sklearn import linear_model, metrics
+from sklearn.linear_model import (
+    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
+from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
 
@@ -56,12 +62,14 @@
 X_errors_large = X.copy()
 X_errors_large[::3] = 10
 
-estimators = [('OLS', linear_model.LinearRegression()),
-              ('Theil-Sen', linear_model.TheilSenRegressor(random_state=42)),
-              ('RANSAC', linear_model.RANSACRegressor(random_state=42)), ]
-colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'}
-linestyle = {'OLS': '-', 'Theil-Sen': '-.', 'RANSAC': '--'}
+estimators = [('OLS', LinearRegression()),
+              ('Theil-Sen', TheilSenRegressor(random_state=42)),
+              ('RANSAC', RANSACRegressor(random_state=42)),
+              ('HuberRegressor', HuberRegressor())]
+colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen', 'HuberRegressor': 'black'}
+linestyle = {'OLS': '-', 'Theil-Sen': '-.', 'RANSAC': '--', 'HuberRegressor': '--'}
 lw = 3
+
 x_plot = np.linspace(X.min(), X.max())
 for title, this_X, this_y in [
         ('Modeling Errors Only', X, y),
 
@@ -75,7 +83,7 @@
     for name, estimator in estimators:
         model = make_pipeline(PolynomialFeatures(3), estimator)
         model.fit(this_X, this_y)
-        mse = metrics.mean_squared_error(model.predict(X_test), y_test)
+        mse = mean_squared_error(model.predict(X_test), y_test)
         y_plot = model.predict(x_plot[:, np.newaxis])
         plt.plot(x_plot, y_plot, color=colors[name], linestyle=linestyle[name],
                  linewidth=lw, label='%s: error = %.3f' % (name, mse))
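The docstring caveat added above (HuberRegressor's score should not be compared directly to TheilSen and RANSAC) follows from how each estimator treats flagged samples: RANSAC refits on its consensus set and discards the rest, while Huber keeps every sample and merely down-weights the flagged ones. A minimal sketch on made-up data, using only the documented inlier_mask_ and outliers_ attributes:

import numpy as np
from sklearn.linear_model import HuberRegressor, RANSACRegressor

rng = np.random.RandomState(42)
X = rng.normal(size=(50, 1))
y = 3 * X.ravel() + rng.normal(scale=0.5, size=50)
y[::10] += 20.0  # strong outliers in the y direction

ransac = RANSACRegressor(random_state=42).fit(X, y)
huber = HuberRegressor().fit(X, y)

# RANSAC's final coefficients come from the consensus set only;
# samples outside inlier_mask_ contribute nothing to the fit.
print("RANSAC discarded %d samples" % (~ransac.inlier_mask_).sum())
# HuberRegressor fits on all samples but applies the linear loss to the
# flagged ones, lessening rather than removing their effect.
print("Huber flagged %d samples" % huber.outliers_.sum())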
