
Commit 1330eae

Pushing the docs to dev/ for branch: master, commit a2ebb8cfd2d126ad8e6fb36e0bdadba7de8fcd9f

1 parent 0086d2d, commit 1330eae
979 files changed: 3744 additions, 3065 deletions

Two binary files changed (15 Bytes each): not shown.

dev/_downloads/plot_feature_transformation.ipynb

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
- old line 29, removed:
"# Author: Tim Head <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nnp.random.seed(10)\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,\n GradientBoostingClassifier)\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_curve\nfrom sklearn.pipeline import make_pipeline\n\nn_estimator = 10\nX, y = make_classification(n_samples=80000)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)\n# It is important to train the ensemble of trees on a different subset\n# of the training data than the linear regression model to avoid\n# overfitting, in particular if the total number of leaves is\n# similar to the number of training samples\nX_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,\n y_train,\n test_size=0.5)\n\n# Unsupervised transformation based on totally random trees\nrt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,\n\trandom_state=0)\n\nrt_lm = LogisticRegression()\npipeline = make_pipeline(rt, rt_lm)\npipeline.fit(X_train, y_train)\ny_pred_rt = pipeline.predict_proba(X_test)[:, 1]\nfpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)\n\n# Supervised transformation based on random forests\nrf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)\nrf_enc = OneHotEncoder()\nrf_lm = LogisticRegression()\nrf.fit(X_train, y_train)\nrf_enc.fit(rf.apply(X_train))\nrf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)\n\ny_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]\nfpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)\n\ngrd = GradientBoostingClassifier(n_estimators=n_estimator)\ngrd_enc = OneHotEncoder()\ngrd_lm = LogisticRegression()\ngrd.fit(X_train, y_train)\ngrd_enc.fit(grd.apply(X_train)[:, :, 0])\ngrd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)\n\ny_pred_grd_lm = grd_lm.predict_proba(\n grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]\nfpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)\n\n\n# The gradient boosted model by itself\ny_pred_grd = grd.predict_proba(X_test)[:, 1]\nfpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)\n\n\n# The random forest model by itself\ny_pred_rf = rf.predict_proba(X_test)[:, 1]\nfpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)\n\nplt.figure(1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve')\nplt.legend(loc='best')\nplt.show()\n\nplt.figure(2)\nplt.xlim(0, 0.2)\nplt.ylim(0.8, 1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve (zoomed in at top left)')\nplt.legend(loc='best')\nplt.show()"
+ new line 29, added:
"# Author: Tim Head <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nnp.random.seed(10)\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,\n GradientBoostingClassifier)\nfrom sklearn.preprocessing import CategoricalEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_curve\nfrom sklearn.pipeline import make_pipeline\n\nn_estimator = 10\nX, y = make_classification(n_samples=80000)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)\n# It is important to train the ensemble of trees on a different subset\n# of the training data than the linear regression model to avoid\n# overfitting, in particular if the total number of leaves is\n# similar to the number of training samples\nX_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,\n y_train,\n test_size=0.5)\n\n# Unsupervised transformation based on totally random trees\nrt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,\n\trandom_state=0)\n\nrt_lm = LogisticRegression()\npipeline = make_pipeline(rt, rt_lm)\npipeline.fit(X_train, y_train)\ny_pred_rt = pipeline.predict_proba(X_test)[:, 1]\nfpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)\n\n# Supervised transformation based on random forests\nrf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)\nrf_enc = CategoricalEncoder()\nrf_lm = LogisticRegression()\nrf.fit(X_train, y_train)\nrf_enc.fit(rf.apply(X_train))\nrf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)\n\ny_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]\nfpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)\n\ngrd = GradientBoostingClassifier(n_estimators=n_estimator)\ngrd_enc = CategoricalEncoder()\ngrd_lm = LogisticRegression()\ngrd.fit(X_train, y_train)\ngrd_enc.fit(grd.apply(X_train)[:, :, 0])\ngrd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)\n\ny_pred_grd_lm = grd_lm.predict_proba(\n grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]\nfpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)\n\n\n# The gradient boosted model by itself\ny_pred_grd = grd.predict_proba(X_test)[:, 1]\nfpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)\n\n\n# The random forest model by itself\ny_pred_rf = rf.predict_proba(X_test)[:, 1]\nfpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)\n\nplt.figure(1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve')\nplt.legend(loc='best')\nplt.show()\n\nplt.figure(2)\nplt.xlim(0, 0.2)\nplt.ylim(0.8, 1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve (zoomed in at top left)')\nplt.legend(loc='best')\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_feature_transformation.py

Lines changed: 3 additions & 3 deletions

@@ -34,7 +34,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                               GradientBoostingClassifier)
-from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import CategoricalEncoder
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import roc_curve
 from sklearn.pipeline import make_pipeline

@@ -62,7 +62,7 @@

 # Supervised transformation based on random forests
 rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
-rf_enc = OneHotEncoder()
+rf_enc = CategoricalEncoder()
 rf_lm = LogisticRegression()
 rf.fit(X_train, y_train)
 rf_enc.fit(rf.apply(X_train))

@@ -72,7 +72,7 @@
 fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

 grd = GradientBoostingClassifier(n_estimators=n_estimator)
-grd_enc = OneHotEncoder()
+grd_enc = CategoricalEncoder()
 grd_lm = LogisticRegression()
 grd.fit(X_train, y_train)
 grd_enc.fit(grd.apply(X_train)[:, :, 0])
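The change in both files is the same: the encoder that one-hot encodes the tree leaf indices is switched from OneHotEncoder to CategoricalEncoder. For readers of the diff, below is a minimal runnable sketch of the underlying leaf-embedding technique. It is written against released scikit-learn, so it keeps OneHotEncoder; the assumption is that the development-branch CategoricalEncoder used in this commit is a drop-in replacement here. The handle_unknown='ignore' argument is an addition in this sketch, guarding against leaf indices that appear at transform time but not at fit time.

# Sketch of the leaf-embedding idea from plot_feature_transformation.py.
# Assumption: released scikit-learn, so OneHotEncoder stands in for the
# development-branch CategoricalEncoder used in this commit.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=10)
# Disjoint halves: one to grow the trees, one to fit the linear model.
X_tree, X_lr, y_tree, y_lr = train_test_split(X_train, y_train,
                                              test_size=0.5, random_state=10)

gbt = GradientBoostingClassifier(n_estimators=10, random_state=10)
gbt.fit(X_tree, y_tree)

# apply() maps each sample to the index of the leaf it reaches in each
# tree; for a binary GradientBoostingClassifier the result has shape
# (n_samples, n_estimators, 1), hence the [:, :, 0].
enc = OneHotEncoder(handle_unknown='ignore')  # tolerate unseen leaf ids
enc.fit(gbt.apply(X_tree)[:, :, 0])

lr = LogisticRegression()
lr.fit(enc.transform(gbt.apply(X_lr)[:, :, 0]), y_lr)

proba = lr.predict_proba(enc.transform(gbt.apply(X_test)[:, :, 0]))[:, 1]
print('GBT + LR test AUC: %.3f' % roc_auc_score(y_test, proba))

Training the trees and the linear model on disjoint halves mirrors the comment in the example itself: if the linear model were fit on the same samples the leaves were grown on, the one-hot leaf features could simply memorize them.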

dev/_downloads/scikit-learn-docs.pdf

Binary file not shown (8.62 KB; -563 Bytes).
