Feature transformations with ensembles of trees
===============================================

-Transform your features into a higher dimensional, sparse space. Then train a
-linear model on these features.
+Transform your features into a higher dimensional, sparse space. Then
+train a linear model on these features.

-First fit an ensemble of trees (totally random trees, a random forest, or
-gradient boosted trees) on the training set. Then each leaf of each tree in the
-ensemble is assigned a fixed arbitrary feature index in a new feature space.
-These leaf indices are then encoded in a one-hot fashion.
+First fit an ensemble of trees (totally random trees, a random
+forest, or gradient boosted trees) on the training set. Then each leaf
+of each tree in the ensemble is assigned a fixed arbitrary feature
+index in a new feature space. These leaf indices are then encoded in a
+one-hot fashion.

-Each sample goes through the decisions of each tree of the ensemble and ends up
-in one leaf per tree. The sample is encoded by setting feature values for these
-leaves to 1 and the other feature values to 0.
+Each sample goes through the decisions of each tree of the ensemble
+and ends up in one leaf per tree. The sample is encoded by setting
+feature values for these leaves to 1 and the other feature values to 0.

The resulting transformer has then learned a supervised, sparse,
high-dimensional categorical embedding of the data.
+
"""

# Author: Tim Head <[email protected]>
#
# License: BSD 3 clause

-print(__doc__)
-
-from sklearn import set_config
-set_config(display='diagram')
+import numpy as np
+np.random.seed(10)

-# %%
-# First, we will create a large dataset and split it into three sets:
-#
-# - a set to train the ensemble methods which are later used to as a feature
-#   engineering transformer;
-# - a set to train the linear model;
-# - a set to test the linear model.
-#
-# It is important to split the data in such way to avoid overfitting by leaking
-# data.
+import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
-from sklearn.model_selection import train_test_split
-
-X, y = make_classification(n_samples=80000, random_state=10)
-
-X_full_train, X_test, y_full_train, y_test = train_test_split(
-    X, y, test_size=0.5, random_state=10)
-X_train_ensemble, X_train_linear, y_train_ensemble, y_train_linear = \
-    train_test_split(X_full_train, y_full_train, test_size=0.5,
-                     random_state=10)
-
-# %%
-# For each of the ensemble methods, we will use 10 estimators and a maximum
-# depth of 3 levels.
-
-n_estimators = 10
-max_depth = 3
-
-# %%
-# First, we will start by training the random forest and gradient boosting on
-# the separated training set
-
-from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
-
-random_forest = RandomForestClassifier(
-    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
-random_forest.fit(X_train_ensemble, y_train_ensemble)
-
-gradient_boosting = GradientBoostingClassifier(
-    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
-_ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)
-
-# %%
-# The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
-# and thus does not required to be trained independently.
-
-from sklearn.ensemble import RandomTreesEmbedding
-
-random_tree_embedding = RandomTreesEmbedding(
-    n_estimators=n_estimators, max_depth=max_depth, random_state=0)
-
-# %%
-# Now, we will create three pipelines that will use the above embedding as
-# a preprocessing stage.
-#
-# The random trees embedding can be directly pipelined with the logistic
-# regression because it is a standard scikit-learn transformer.
-
from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import make_pipeline
-
-rt_model = make_pipeline(
-    random_tree_embedding, LogisticRegression(max_iter=1000))
-rt_model.fit(X_train_linear, y_train_linear)
-
-# %%
-# Then, we can pipeline random forest or gradient boosting with a logistic
-# regression. However, the feature transformation will happen by calling the
-# method `apply`. The pipeline in scikit-learn expects a call to `transform`.
-# Therefore, we wrapped the call to `apply` within a `FunctionTransformer`.
-
-from sklearn.preprocessing import FunctionTransformer
+from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
+                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_curve
+from sklearn.pipeline import make_pipeline

-
-def rf_apply(X, model):
-    return model.apply(X)
-
-
-rf_leaves_yielder = FunctionTransformer(
-    rf_apply, kw_args={"model": random_forest})
-
-rf_model = make_pipeline(
-    rf_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
-    LogisticRegression(max_iter=1000))
-rf_model.fit(X_train_linear, y_train_linear)
-
-
-# %%
-def gbdt_apply(X, model):
-    return model.apply(X)[:, :, 0]
-
-
-gbdt_leaves_yielder = FunctionTransformer(
-    gbdt_apply, kw_args={"model": gradient_boosting})
-
-gbdt_model = make_pipeline(
-    gbdt_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
-    LogisticRegression(max_iter=1000))
-gbdt_model.fit(X_train_linear, y_train_linear)
-
-# %%
-# We can finally show the different ROC curves for all the models.
-
-import matplotlib.pyplot as plt
-from sklearn.metrics import plot_roc_curve
-
-fig, ax = plt.subplots()
-
-models = [
-    ("RT embedding -> LR", rt_model),
-    ("RF", random_forest),
-    ("RF embedding -> LR", rf_model),
-    ("GBDT", gradient_boosting),
-    ("GBDT embedding -> LR", gbdt_model),
-]
-
-model_displays = {}
-for name, pipeline in models:
-    model_displays[name] = plot_roc_curve(
-        pipeline, X_test, y_test, ax=ax, name=name)
-_ = ax.set_title('ROC curve')
-
-# %%
-fig, ax = plt.subplots()
-for name, pipeline in models:
-    model_displays[name].plot(ax=ax)
-
-ax.set_xlim(0, 0.2)
-ax.set_ylim(0.8, 1)
-_ = ax.set_title('ROC curve (zoomed in at top left)')
+n_estimator = 10
+X, y = make_classification(n_samples=80000)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
+
+# It is important to train the ensemble of trees on a different subset
+# of the training data than the linear regression model to avoid
+# overfitting, in particular if the total number of leaves is
+# similar to the number of training samples
+X_train, X_train_lr, y_train, y_train_lr = train_test_split(
+    X_train, y_train, test_size=0.5)
+
+# Unsupervised transformation based on totally random trees
+rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
+                          random_state=0)
+
+rt_lm = LogisticRegression(max_iter=1000)
+pipeline = make_pipeline(rt, rt_lm)
+pipeline.fit(X_train, y_train)
+y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
+fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
+
+# Supervised transformation based on random forests
+rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
+rf_enc = OneHotEncoder()
+rf_lm = LogisticRegression(max_iter=1000)
+rf.fit(X_train, y_train)
+rf_enc.fit(rf.apply(X_train))
+rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
+
+y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
+fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
+
+# Supervised transformation based on gradient boosted trees
+grd = GradientBoostingClassifier(n_estimators=n_estimator)
+grd_enc = OneHotEncoder()
+grd_lm = LogisticRegression(max_iter=1000)
+grd.fit(X_train, y_train)
+grd_enc.fit(grd.apply(X_train)[:, :, 0])
+grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
+
+y_pred_grd_lm = grd_lm.predict_proba(
+    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
+fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
+
+# The gradient boosted model by itself
+y_pred_grd = grd.predict_proba(X_test)[:, 1]
+fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
+
+# The random forest model by itself
+y_pred_rf = rf.predict_proba(X_test)[:, 1]
+fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
+
+plt.figure(1)
+plt.plot([0, 1], [0, 1], 'k--')
+plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
+plt.plot(fpr_rf, tpr_rf, label='RF')
+plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
+plt.plot(fpr_grd, tpr_grd, label='GBT')
+plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
+plt.xlabel('False positive rate')
+plt.ylabel('True positive rate')
+plt.title('ROC curve')
+plt.legend(loc='best')
+plt.show()
+
+plt.figure(2)
+plt.xlim(0, 0.2)
+plt.ylim(0.8, 1)
+plt.plot([0, 1], [0, 1], 'k--')
+plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
+plt.plot(fpr_rf, tpr_rf, label='RF')
+plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
+plt.plot(fpr_grd, tpr_grd, label='GBT')
+plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
+plt.xlabel('False positive rate')
+plt.ylabel('True positive rate')
+plt.title('ROC curve (zoomed in at top left)')
+plt.legend(loc='best')
+plt.show()
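
For readers skimming the diff, the following is a minimal, self-contained sketch of the leaf-index embedding that the module docstring describes and that both versions of the example implement. It is not part of this commit; the dataset size and estimator settings are arbitrary illustration values.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=200, random_state=0)

# Fit a small ensemble; each of its trees maps a sample to exactly one leaf.
gbdt = GradientBoostingClassifier(n_estimators=5, max_depth=3, random_state=0)
gbdt.fit(X, y)

# apply() returns the index of the leaf reached in every tree. For the binary
# GradientBoostingClassifier the result has shape (n_samples, n_estimators, 1),
# which is why the example above slices with [:, :, 0].
leaves = gbdt.apply(X)[:, :, 0]
print(leaves.shape)      # (200, 5): one leaf id per sample and per tree

# One-hot encoding the leaf ids yields the sparse, high-dimensional embedding:
# every row has exactly one non-zero entry per tree.
embedding = OneHotEncoder().fit_transform(leaves)
print(embedding.shape)   # (200, total number of leaves across the 5 trees)
print(embedding[0].nnz)  # 5

A linear model is then trained on this embedding, as both versions of the example do, with the caveat noted in the diff that the trees and the linear model should be fit on disjoint subsets of the training data to avoid overfitting.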