Commit 952d3fb

Pushing the docs to dev/ for branch: master, commit 78a213b42c7437d806a6289372ad5411d2ee37ad
1 parent 85f0ed0 commit 952d3fb

File tree

1,207 files changed (+4015 / -3956 lines)


dev/_downloads/0d59ba71a84b25ededa8e1298aed7cf2/plot_transformed_target.ipynb

Lines changed: 16 additions & 16 deletions
Large diffs are not rendered by default.

dev/_downloads/b6ea44ec57126546a80079c1eb59ee65/plot_transformed_target.py

Lines changed: 78 additions & 60 deletions
@@ -6,35 +6,30 @@
 Effect of transforming the targets in regression model
 ======================================================

-In this example, we give an overview of the
-:class:`sklearn.compose.TransformedTargetRegressor`. Two examples
-illustrate the benefit of transforming the targets before learning a linear
+In this example, we give an overview of
+:class:`~sklearn.compose.TransformedTargetRegressor`. We use two examples
+to illustrate the benefit of transforming the targets before learning a linear
 regression model. The first example uses synthetic data while the second
-example is based on the Boston housing data set.
-
+example is based on the Ames housing data set.
 """

 # Author: Guillaume Lemaitre <[email protected]>
 # License: BSD 3 clause

-
 import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
 from distutils.version import LooseVersion

-print(__doc__)
-
-###############################################################################
-# Synthetic example
-###############################################################################
-
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import RidgeCV
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.metrics import median_absolute_error, r2_score

+###############################################################################
+# Synthetic example
+##############################################################################

 # `normed` is being deprecated in favor of `density` in histograms
 if LooseVersion(matplotlib.__version__) >= '2.1':
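
An aside for orientation while reading the hunks below (not part of the commit): the example centers on TransformedTargetRegressor, which applies `func` to the targets before fitting the wrapped regressor and maps predictions back through `inverse_func`. A minimal sketch on made-up toy data, assuming a plain LinearRegression as the inner model:

    import numpy as np
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.linear_model import LinearRegression

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 10, size=(50, 1))
    y = np.expm1(0.3 * X.ravel())  # exponential target, poorly fit by a linear model

    # func transforms y before fitting; inverse_func maps predictions back,
    # so predict() returns values on the original y scale
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.log1p,
                                       inverse_func=np.expm1)
    model.fit(X, y)
    print(model.predict(X[:3]))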
@@ -43,21 +38,24 @@
     density_param = {'normed': True}

 ###############################################################################
-# A synthetic random regression problem is generated. The targets ``y`` are
-# modified by: (i) translating all targets such that all entries are
-# non-negative and (ii) applying an exponential function to obtain non-linear
-# targets which cannot be fitted using a simple linear model.
+# A synthetic random regression dataset is generated. The targets ``y`` are
+# modified by:
+#
+# 1. translating all targets such that all entries are
+#    non-negative (by adding the absolute value of the lowest ``y``) and
+# 2. applying an exponential function to obtain non-linear
+#    targets which cannot be fitted using a simple linear model.
 #
 # Therefore, a logarithmic (`np.log1p`) and an exponential function
 # (`np.expm1`) will be used to transform the targets before training a linear
 # regression model and using it for prediction.

 X, y = make_regression(n_samples=10000, noise=100, random_state=0)
-y = np.exp((y + abs(y.min())) / 200)
+y = np.expm1((y + abs(y.min())) / 200)
 y_trans = np.log1p(y)

 ###############################################################################
-# The following illustrate the probability density functions of the target
+# Below we plot the probability density functions of the target
 # before and after applying the logarithmic functions.

 f, (ax0, ax1) = plt.subplots(1, 2)
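
A note on the `np.exp` -> `np.expm1` change in this hunk: `np.log1p(x) = log(1 + x)` and `np.expm1(x) = exp(x) - 1` are exact inverses, so generating the targets with `expm1` makes the later `log1p` transform linearize them exactly. A small illustrative sketch of the round trip:

    import numpy as np

    y = np.array([0.0, 1e-8, 5.0, 2000.0])
    y_trans = np.log1p(y)          # forward transform applied before fitting
    y_back = np.expm1(y_trans)     # inverse applied to predictions
    print(np.allclose(y, y_back))  # True: the round trip recovers y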
@@ -73,24 +71,24 @@
 ax1.set_xlabel('Target')
 ax1.set_title('Transformed target distribution')

-f.suptitle("Synthetic data", y=0.035)
+f.suptitle("Synthetic data", y=0.06, x=0.53)
 f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

 ###############################################################################
 # At first, a linear model will be applied on the original targets. Due to the
-# non-linearity, the model trained will not be precise during the
+# non-linearity, the model trained will not be precise during
 # prediction. Subsequently, a logarithmic function is used to linearize the
 # targets, allowing better prediction even with a similar linear model as
 # reported by the median absolute error (MAE).

 f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)
-
+# Use linear model
 regr = RidgeCV()
 regr.fit(X_train, y_train)
 y_pred = regr.predict(X_test)
-
+# Plot results
 ax0.scatter(y_test, y_pred)
 ax0.plot([0, 2000], [0, 2000], '--k')
 ax0.set_ylabel('Target predicted')
@@ -100,7 +98,7 @@
     r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
 ax0.set_xlim([0, 2000])
 ax0.set_ylim([0, 2000])
-
+# Transform targets and use same linear model
 regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                         func=np.log1p,
                                         inverse_func=np.expm1)
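
The two panels built above reduce to one comparison: the same RidgeCV fit on raw targets versus wrapped in TransformedTargetRegressor. A condensed, plot-free sketch of that comparison (illustrative, not code from the commit):

    import numpy as np
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.datasets import make_regression
    from sklearn.linear_model import RidgeCV
    from sklearn.metrics import median_absolute_error
    from sklearn.model_selection import train_test_split

    X, y = make_regression(n_samples=10000, noise=100, random_state=0)
    # same non-linear target construction as the example
    y = np.expm1((y + abs(y.min())) / 200)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for name, est in [
            ('raw target', RidgeCV()),
            ('log1p target', TransformedTargetRegressor(
                regressor=RidgeCV(), func=np.log1p, inverse_func=np.expm1))]:
        est.fit(X_train, y_train)
        print('%s: MAE=%.2f'
              % (name, median_absolute_error(y_test, est.predict(X_test))))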
@@ -125,83 +123,103 @@
 ###############################################################################

 ###############################################################################
-# In a similar manner, the boston housing data set is used to show the impact
+# In a similar manner, the Ames housing data set is used to show the impact
 # of transforming the targets before learning a model. In this example, the
-# targets to be predicted corresponds to the weighted distances to the five
-# Boston employment centers.
+# target to be predicted is the selling price of each house.

-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_openml
 from sklearn.preprocessing import QuantileTransformer, quantile_transform

-dataset = load_boston()
-target = np.array(dataset.feature_names) == "DIS"
-X = dataset.data[:, np.logical_not(target)]
-y = dataset.data[:, target].squeeze()
-y_trans = quantile_transform(dataset.data[:, target],
-                             n_quantiles=300,
+ames = fetch_openml(name="house_prices", as_frame=True)
+# Keep only numeric columns
+X = ames.data.select_dtypes(np.number)
+# Remove columns with NaN or Inf values
+X = X.drop(columns=['LotFrontage', 'GarageYrBlt', 'MasVnrArea'])
+y = ames.target
+y_trans = quantile_transform(y.to_frame(),
+                             n_quantiles=900,
                              output_distribution='normal',
                              copy=True).squeeze()

 ###############################################################################
-# A :class:`sklearn.preprocessing.QuantileTransformer` is used such that the
-# targets follows a normal distribution before applying a
-# :class:`sklearn.linear_model.RidgeCV` model.
+# A :class:`~sklearn.preprocessing.QuantileTransformer` is used to normalize
+# the target distribution before applying a
+# :class:`~sklearn.linear_model.RidgeCV` model.

 f, (ax0, ax1) = plt.subplots(1, 2)

 ax0.hist(y, bins=100, **density_param)
 ax0.set_ylabel('Probability')
 ax0.set_xlabel('Target')
-ax0.set_title('Target distribution')
+ax0.text(s='Target distribution', x=1.2e5, y=9.8e-6, fontsize=12)
+ax0.ticklabel_format(axis="both", style="sci", scilimits=(0, 0))

 ax1.hist(y_trans, bins=100, **density_param)
 ax1.set_ylabel('Probability')
 ax1.set_xlabel('Target')
-ax1.set_title('Transformed target distribution')
+ax1.text(s='Transformed target distribution', x=-6.8, y=0.479, fontsize=12)

-f.suptitle("Boston housing data: distance to employment centers", y=0.035)
+f.suptitle("Ames housing data: selling price", y=0.04)
 f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

 ###############################################################################
 # The effect of the transformer is weaker than on the synthetic data. However,
-# the transform induces a decrease of the MAE.
+# the transformation results in an increase in :math:`R^2` and large decrease
+# of the MAE. The residual plot (predicted target - true target vs predicted
+# target) without target transformation takes on a curved, 'reverse smile'
+# shape due to residual values that vary depending on the value of predicted
+# target. With target transformation, the shape is more linear indicating
+# better model fit.

-f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)
+f, (ax0, ax1) = plt.subplots(2, 2, sharey='row', figsize=(6.5, 8))

 regr = RidgeCV()
 regr.fit(X_train, y_train)
 y_pred = regr.predict(X_test)

-ax0.scatter(y_test, y_pred)
-ax0.plot([0, 10], [0, 10], '--k')
-ax0.set_ylabel('Target predicted')
-ax0.set_xlabel('True Target')
-ax0.set_title('Ridge regression \n without target transformation')
-ax0.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (
+ax0[0].scatter(y_pred, y_test, s=8)
+ax0[0].plot([0, 7e5], [0, 7e5], '--k')
+ax0[0].set_ylabel('True target')
+ax0[0].set_xlabel('Predicted target')
+ax0[0].text(s='Ridge regression \n without target transformation', x=-5e4,
+            y=8e5, fontsize=12, multialignment='center')
+ax0[0].text(3e4, 64e4, r'$R^2$=%.2f, MAE=%.2f' % (
     r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
-ax0.set_xlim([0, 10])
-ax0.set_ylim([0, 10])
+ax0[0].set_xlim([0, 7e5])
+ax0[0].set_ylim([0, 7e5])
+ax0[0].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
+
+ax1[0].scatter(y_pred, (y_pred - y_test), s=8)
+ax1[0].set_ylabel('Residual')
+ax1[0].set_xlabel('Predicted target')
+ax1[0].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))

 regr_trans = TransformedTargetRegressor(
     regressor=RidgeCV(),
-    transformer=QuantileTransformer(n_quantiles=300,
+    transformer=QuantileTransformer(n_quantiles=900,
                                     output_distribution='normal'))
 regr_trans.fit(X_train, y_train)
 y_pred = regr_trans.predict(X_test)

-ax1.scatter(y_test, y_pred)
-ax1.plot([0, 10], [0, 10], '--k')
-ax1.set_ylabel('Target predicted')
-ax1.set_xlabel('True Target')
-ax1.set_title('Ridge regression \n with target transformation')
-ax1.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (
+ax0[1].scatter(y_pred, y_test, s=8)
+ax0[1].plot([0, 7e5], [0, 7e5], '--k')
+ax0[1].set_ylabel('True target')
+ax0[1].set_xlabel('Predicted target')
+ax0[1].text(s='Ridge regression \n with target transformation', x=-5e4,
+            y=8e5, fontsize=12, multialignment='center')
+ax0[1].text(3e4, 64e4, r'$R^2$=%.2f, MAE=%.2f' % (
     r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
-ax1.set_xlim([0, 10])
-ax1.set_ylim([0, 10])
+ax0[1].set_xlim([0, 7e5])
+ax0[1].set_ylim([0, 7e5])
+ax0[1].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))

-f.suptitle("Boston housing data: distance to employment centers", y=0.035)
-f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])
+ax1[1].scatter(y_pred, (y_pred - y_test), s=8)
+ax1[1].set_ylabel('Residual')
+ax1[1].set_xlabel('Predicted target')
+ax1[1].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
+
+f.suptitle("Ames housing data: selling price", y=0.035)

 plt.show()
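
The Ames half of the diff shows the second way to drive TransformedTargetRegressor: passing a transformer object instead of a function pair. A self-contained sketch of that pattern on made-up skewed data (illustrative only; n_quantiles=100 is an assumption for the small sample, not the commit's 900):

    import numpy as np
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.linear_model import RidgeCV
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(1)
    X = rng.normal(size=(1000, 5))
    y = np.exp(X[:, 0] + 0.1 * rng.normal(size=1000))  # heavily skewed target

    # The transformer learns the target quantiles on fit and maps them to a
    # normal distribution; predictions are inverse-transformed automatically.
    regr = TransformedTargetRegressor(
        regressor=RidgeCV(),
        transformer=QuantileTransformer(n_quantiles=100,
                                        output_distribution='normal'))
    regr.fit(X, y)
    print(regr.score(X, y))  # R^2 reported on the original target scale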

dev/_downloads/scikit-learn-docs.pdf

8.92 KB (binary file not shown)

dev/_images/iris.png

0 Bytes
