
Commit d3da729

Pushing the docs to dev/ for branch: main, commit 4408a25218127164cac7594a1314c302d69cdd35
1 parent 8f34873 commit d3da729

File tree: 746 files changed (+2144 −2766 lines)


dev/_downloads/3a10dcfbc1a4bf1349c7101a429aa47b/plot_feature_transformation.py

Lines changed: 98 additions & 140 deletions
@@ -3,160 +3,118 @@
 Feature transformations with ensembles of trees
 ===============================================

-Transform your features into a higher dimensional, sparse space. Then train a
-linear model on these features.
+Transform your features into a higher dimensional, sparse space. Then
+train a linear model on these features.

-First fit an ensemble of trees (totally random trees, a random forest, or
-gradient boosted trees) on the training set. Then each leaf of each tree in the
-ensemble is assigned a fixed arbitrary feature index in a new feature space.
-These leaf indices are then encoded in a one-hot fashion.
+First fit an ensemble of trees (totally random trees, a random
+forest, or gradient boosted trees) on the training set. Then each leaf
+of each tree in the ensemble is assigned a fixed arbitrary feature
+index in a new feature space. These leaf indices are then encoded in a
+one-hot fashion.

-Each sample goes through the decisions of each tree of the ensemble and ends up
-in one leaf per tree. The sample is encoded by setting feature values for these
-leaves to 1 and the other feature values to 0.
+Each sample goes through the decisions of each tree of the ensemble
+and ends up in one leaf per tree. The sample is encoded by setting
+feature values for these leaves to 1 and the other feature values to 0.

 The resulting transformer has then learned a supervised, sparse,
 high-dimensional categorical embedding of the data.
+
 """

 # Author: Tim Head <[email protected]>
 #
 # License: BSD 3 clause

-print(__doc__)
-
-from sklearn import set_config
-set_config(display='diagram')
+import numpy as np
+np.random.seed(10)

-# %%
-# First, we will create a large dataset and split it into three sets:
-#
-# - a set to train the ensemble methods which are later used to as a feature
-#   engineering transformer;
-# - a set to train the linear model;
-# - a set to test the linear model.
-#
-# It is important to split the data in such way to avoid overfitting by leaking
-# data.
+import matplotlib.pyplot as plt

 from sklearn.datasets import make_classification
-from sklearn.model_selection import train_test_split
-
-X, y = make_classification(n_samples=80000, random_state=10)
-
-X_full_train, X_test, y_full_train, y_test = train_test_split(
-    X, y, test_size=0.5, random_state=10)
-X_train_ensemble, X_train_linear, y_train_ensemble, y_train_linear = \
-    train_test_split(X_full_train, y_full_train, test_size=0.5,
-                     random_state=10)
-
-# %%
-# For each of the ensemble methods, we will use 10 estimators and a maximum
-# depth of 3 levels.
-
-n_estimators = 10
-max_depth = 3
-
-# %%
-# First, we will start by training the random forest and gradient boosting on
-# the separated training set
-
-from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
-
-random_forest = RandomForestClassifier(
-    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
-random_forest.fit(X_train_ensemble, y_train_ensemble)
-
-gradient_boosting = GradientBoostingClassifier(
-    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
-_ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)
-
-# %%
-# The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
-# and thus does not required to be trained independently.
-
-from sklearn.ensemble import RandomTreesEmbedding
-
-random_tree_embedding = RandomTreesEmbedding(
-    n_estimators=n_estimators, max_depth=max_depth, random_state=0)
-
-# %%
-# Now, we will create three pipelines that will use the above embedding as
-# a preprocessing stage.
-#
-# The random trees embedding can be directly pipelined with the logistic
-# regression because it is a standard scikit-learn transformer.
-
 from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import make_pipeline
-
-rt_model = make_pipeline(
-    random_tree_embedding, LogisticRegression(max_iter=1000))
-rt_model.fit(X_train_linear, y_train_linear)
-
-# %%
-# Then, we can pipeline random forest or gradient boosting with a logistic
-# regression. However, the feature transformation will happen by calling the
-# method `apply`. The pipeline in scikit-learn expects a call to `transform`.
-# Therefore, we wrapped the call to `apply` within a `FunctionTransformer`.
-
-from sklearn.preprocessing import FunctionTransformer
+from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
+                              GradientBoostingClassifier)
 from sklearn.preprocessing import OneHotEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_curve
+from sklearn.pipeline import make_pipeline

-
-def rf_apply(X, model):
-    return model.apply(X)
-
-
-rf_leaves_yielder = FunctionTransformer(
-    rf_apply, kw_args={"model": random_forest})
-
-rf_model = make_pipeline(
-    rf_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
-    LogisticRegression(max_iter=1000))
-rf_model.fit(X_train_linear, y_train_linear)
-
-
-# %%
-def gbdt_apply(X, model):
-    return model.apply(X)[:, :, 0]
-
-
-gbdt_leaves_yielder = FunctionTransformer(
-    gbdt_apply, kw_args={"model": gradient_boosting})
-
-gbdt_model = make_pipeline(
-    gbdt_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
-    LogisticRegression(max_iter=1000))
-gbdt_model.fit(X_train_linear, y_train_linear)
-
-# %%
-# We can finally show the different ROC curves for all the models.
-
-import matplotlib.pyplot as plt
-from sklearn.metrics import plot_roc_curve
-
-fig, ax = plt.subplots()
-
-models = [
-    ("RT embedding -> LR", rt_model),
-    ("RF", random_forest),
-    ("RF embedding -> LR", rf_model),
-    ("GBDT", gradient_boosting),
-    ("GBDT embedding -> LR", gbdt_model),
-]
-
-model_displays = {}
-for name, pipeline in models:
-    model_displays[name] = plot_roc_curve(
-        pipeline, X_test, y_test, ax=ax, name=name)
-_ = ax.set_title('ROC curve')
-
-# %%
-fig, ax = plt.subplots()
-for name, pipeline in models:
-    model_displays[name].plot(ax=ax)
-
-ax.set_xlim(0, 0.2)
-ax.set_ylim(0.8, 1)
-_ = ax.set_title('ROC curve (zoomed in at top left)')
+n_estimator = 10
+X, y = make_classification(n_samples=80000)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
+
+# It is important to train the ensemble of trees on a different subset
+# of the training data than the linear regression model to avoid
+# overfitting, in particular if the total number of leaves is
+# similar to the number of training samples
+X_train, X_train_lr, y_train, y_train_lr = train_test_split(
+    X_train, y_train, test_size=0.5)
+
+# Unsupervised transformation based on totally random trees
+rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
+                          random_state=0)
+
+rt_lm = LogisticRegression(max_iter=1000)
+pipeline = make_pipeline(rt, rt_lm)
+pipeline.fit(X_train, y_train)
+y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
+fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
+
+# Supervised transformation based on random forests
+rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
+rf_enc = OneHotEncoder()
+rf_lm = LogisticRegression(max_iter=1000)
+rf.fit(X_train, y_train)
+rf_enc.fit(rf.apply(X_train))
+rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
+
+y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
+fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
+
+# Supervised transformation based on gradient boosted trees
+grd = GradientBoostingClassifier(n_estimators=n_estimator)
+grd_enc = OneHotEncoder()
+grd_lm = LogisticRegression(max_iter=1000)
+grd.fit(X_train, y_train)
+grd_enc.fit(grd.apply(X_train)[:, :, 0])
+grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
+
+y_pred_grd_lm = grd_lm.predict_proba(
+    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
+fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
+
+# The gradient boosted model by itself
+y_pred_grd = grd.predict_proba(X_test)[:, 1]
+fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
+
+# The random forest model by itself
+y_pred_rf = rf.predict_proba(X_test)[:, 1]
+fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
+
+plt.figure(1)
+plt.plot([0, 1], [0, 1], 'k--')
+plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
+plt.plot(fpr_rf, tpr_rf, label='RF')
+plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
+plt.plot(fpr_grd, tpr_grd, label='GBT')
+plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
+plt.xlabel('False positive rate')
+plt.ylabel('True positive rate')
+plt.title('ROC curve')
+plt.legend(loc='best')
+plt.show()
+
+plt.figure(2)
+plt.xlim(0, 0.2)
+plt.ylim(0.8, 1)
+plt.plot([0, 1], [0, 1], 'k--')
+plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
+plt.plot(fpr_rf, tpr_rf, label='RF')
+plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
+plt.plot(fpr_grd, tpr_grd, label='GBT')
+plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
+plt.xlabel('False positive rate')
+plt.ylabel('True positive rate')
+plt.title('ROC curve (zoomed in at top left)')
+plt.legend(loc='best')
+plt.show()
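
As a reading aid outside the diff, here is a minimal, self-contained sketch of the leaf-index encoding that both revisions of the example rely on. The variable names (X_demo, X_trees, X_lin, forest, leaf_encoder, leaf_lr, gbdt) and the small sample size are illustrative assumptions, not part of the example; the calls themselves (apply, OneHotEncoder, LogisticRegression) are the same scikit-learn APIs used above.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Illustrative data; split so the trees and the linear model never see the
# same samples, which avoids the leakage discussed in the comments above.
X_demo, y_demo = make_classification(n_samples=1000, random_state=0)
X_trees, X_lin, y_trees, y_lin = train_test_split(
    X_demo, y_demo, test_size=0.5, random_state=0)

forest = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=0)
forest.fit(X_trees, y_trees)

# apply() returns the index of the leaf each sample lands in, one column per
# tree: shape (n_samples, n_estimators).
leaves = forest.apply(X_lin)
print(leaves.shape)  # (500, 10)

# One-hot encode the leaf indices and train the linear model on the encoding.
leaf_encoder = OneHotEncoder(handle_unknown="ignore")
leaf_lr = LogisticRegression(max_iter=1000)
leaf_lr.fit(leaf_encoder.fit_transform(leaves), y_lin)

# GradientBoostingClassifier.apply() returns (n_samples, n_estimators,
# n_classes); with a binary target the last axis has length 1, which is why
# the example slices with [:, :, 0] before encoding.
gbdt = GradientBoostingClassifier(n_estimators=10, random_state=0)
gbdt.fit(X_trees, y_trees)
print(gbdt.apply(X_lin).shape)  # (500, 10, 1)

The deleted revision wraps the same apply() call in a FunctionTransformer so the whole chain fits into one Pipeline, while the added revision fits the OneHotEncoder and the logistic regression by hand; the resulting feature space is the same either way.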
