
Commit b30d4e1

Pushing the docs to dev/ for branch: master, commit 73b7d078a1f4614d955facec8166a14a759e22d7
1 parent f73e6a4 commit b30d4e1

1,091 files changed: +3542 / -3450 lines


dev/_downloads/plot_column_transformer_mixed_types.ipynb

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\nnp.random.seed(0)\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)],\n remainder='drop')\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX = data.drop('survived', axis=1)\ny = data['survived']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
+"# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\nnp.random.seed(0)\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)],\n remainder='drop')\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX = data.drop('survived', axis=1)\ny = data['survived']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
 ]
 },
 {

dev/_downloads/plot_column_transformer_mixed_types.py

Lines changed: 2 additions & 2 deletions

@@ -32,7 +32,7 @@
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler, CategoricalEncoder
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split, GridSearchCV

@@ -61,7 +61,7 @@
 categorical_features = ['embarked', 'sex', 'pclass']
 categorical_transformer = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-    ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])
+    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

 preprocessor = ColumnTransformer(
     transformers=[
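
Note: this change swaps the experimental CategoricalEncoder (removed from scikit-learn before the 0.20 release) for OneHotEncoder, which absorbed its 'onehot-dense' behaviour via sparse=False and its handle_unknown='ignore' option. Below is a minimal, self-contained sketch of the updated categorical pipeline; the toy DataFrame stands in for the Titanic columns and is illustrative, not from the diff, and it uses the 0.20-era API shown above (later releases renamed sparse to sparse_output):

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Toy stand-in for the Titanic categorical columns used in the example.
X = pd.DataFrame({'embarked': ['S', 'C', np.nan, 'Q'],
                  'sex': ['female', 'male', 'female', 'male'],
                  'pclass': [1, 3, 2, 3]})

# Same construction as the updated example: impute missing categories with a
# sentinel string, then one-hot encode. sparse=False returns a dense array,
# matching CategoricalEncoder's old 'onehot-dense' mode; handle_unknown='ignore'
# encodes categories unseen at fit time as all-zero columns instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

# One dense indicator column per (feature, category) pair.
print(categorical_transformer.fit_transform(X).shape)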

dev/_downloads/plot_feature_transformation.ipynb

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Tim Head <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nnp.random.seed(10)\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,\n GradientBoostingClassifier)\nfrom sklearn.preprocessing import CategoricalEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_curve\nfrom sklearn.pipeline import make_pipeline\n\nn_estimator = 10\nX, y = make_classification(n_samples=80000)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)\n# It is important to train the ensemble of trees on a different subset\n# of the training data than the linear regression model to avoid\n# overfitting, in particular if the total number of leaves is\n# similar to the number of training samples\nX_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,\n y_train,\n test_size=0.5)\n\n# Unsupervised transformation based on totally random trees\nrt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,\n random_state=0)\n\nrt_lm = LogisticRegression()\npipeline = make_pipeline(rt, rt_lm)\npipeline.fit(X_train, y_train)\ny_pred_rt = pipeline.predict_proba(X_test)[:, 1]\nfpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)\n\n# Supervised transformation based on random forests\nrf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)\nrf_enc = CategoricalEncoder()\nrf_lm = LogisticRegression()\nrf.fit(X_train, y_train)\nrf_enc.fit(rf.apply(X_train))\nrf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)\n\ny_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]\nfpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)\n\ngrd = GradientBoostingClassifier(n_estimators=n_estimator)\ngrd_enc = CategoricalEncoder()\ngrd_lm = LogisticRegression()\ngrd.fit(X_train, y_train)\ngrd_enc.fit(grd.apply(X_train)[:, :, 0])\ngrd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)\n\ny_pred_grd_lm = grd_lm.predict_proba(\n grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]\nfpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)\n\n\n# The gradient boosted model by itself\ny_pred_grd = grd.predict_proba(X_test)[:, 1]\nfpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)\n\n\n# The random forest model by itself\ny_pred_rf = rf.predict_proba(X_test)[:, 1]\nfpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)\n\nplt.figure(1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve')\nplt.legend(loc='best')\nplt.show()\n\nplt.figure(2)\nplt.xlim(0, 0.2)\nplt.ylim(0.8, 1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve (zoomed in at top left)')\nplt.legend(loc='best')\nplt.show()"
+"# Author: Tim Head <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nnp.random.seed(10)\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,\n GradientBoostingClassifier)\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_curve\nfrom sklearn.pipeline import make_pipeline\n\nn_estimator = 10\nX, y = make_classification(n_samples=80000)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)\n# It is important to train the ensemble of trees on a different subset\n# of the training data than the linear regression model to avoid\n# overfitting, in particular if the total number of leaves is\n# similar to the number of training samples\nX_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,\n y_train,\n test_size=0.5)\n\n# Unsupervised transformation based on totally random trees\nrt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,\n random_state=0)\n\nrt_lm = LogisticRegression()\npipeline = make_pipeline(rt, rt_lm)\npipeline.fit(X_train, y_train)\ny_pred_rt = pipeline.predict_proba(X_test)[:, 1]\nfpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)\n\n# Supervised transformation based on random forests\nrf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)\nrf_enc = OneHotEncoder()\nrf_lm = LogisticRegression()\nrf.fit(X_train, y_train)\nrf_enc.fit(rf.apply(X_train))\nrf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)\n\ny_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]\nfpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)\n\ngrd = GradientBoostingClassifier(n_estimators=n_estimator)\ngrd_enc = OneHotEncoder()\ngrd_lm = LogisticRegression()\ngrd.fit(X_train, y_train)\ngrd_enc.fit(grd.apply(X_train)[:, :, 0])\ngrd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)\n\ny_pred_grd_lm = grd_lm.predict_proba(\n grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]\nfpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)\n\n\n# The gradient boosted model by itself\ny_pred_grd = grd.predict_proba(X_test)[:, 1]\nfpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)\n\n\n# The random forest model by itself\ny_pred_rf = rf.predict_proba(X_test)[:, 1]\nfpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)\n\nplt.figure(1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve')\nplt.legend(loc='best')\nplt.show()\n\nplt.figure(2)\nplt.xlim(0, 0.2)\nplt.ylim(0.8, 1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve (zoomed in at top left)')\nplt.legend(loc='best')\nplt.show()"
 ]
 }
],

dev/_downloads/plot_feature_transformation.py

Lines changed: 3 additions & 3 deletions

@@ -34,7 +34,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                               GradientBoostingClassifier)
-from sklearn.preprocessing import CategoricalEncoder
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import roc_curve
 from sklearn.pipeline import make_pipeline

@@ -62,7 +62,7 @@

 # Supervised transformation based on random forests
 rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
-rf_enc = CategoricalEncoder()
+rf_enc = OneHotEncoder()
 rf_lm = LogisticRegression()
 rf.fit(X_train, y_train)
 rf_enc.fit(rf.apply(X_train))

@@ -72,7 +72,7 @@
 fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

 grd = GradientBoostingClassifier(n_estimators=n_estimator)
-grd_enc = CategoricalEncoder()
+grd_enc = OneHotEncoder()
 grd_lm = LogisticRegression()
 grd.fit(X_train, y_train)
 grd_enc.fit(grd.apply(X_train)[:, :, 0])
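
Note: in this example OneHotEncoder takes over CategoricalEncoder's role as the encoder for tree leaf indices. apply() returns, for every sample, the index of the leaf it reaches in each tree, and one-hot encoding those integers turns each (tree, leaf) pair into a binary feature for the downstream logistic regression. A minimal sketch of that leaf-encoding step follows; the small synthetic dataset and sizes are illustrative, and as in the example the forest is fit on data held out from the linear model to limit overfitting:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, random_state=0)
X_forest, X_lr, y_forest, y_lr = train_test_split(X, y, test_size=0.5,
                                                  random_state=0)

# Fit a small forest; apply() maps each sample to one leaf id per tree.
rf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=0)
rf.fit(X_forest, y_forest)
leaves = rf.apply(X_lr)        # shape (n_samples, n_estimators), integer leaf ids

# One-hot encode the integer leaf ids: each (tree, leaf) pair becomes a
# binary indicator column. The default sparse output suits the linear model.
enc = OneHotEncoder()
leaf_features = enc.fit_transform(leaves)

lm = LogisticRegression().fit(leaf_features, y_lr)
print(leaf_features.shape, lm.score(leaf_features, y_lr))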

dev/_downloads/scikit-learn-docs.pdf

-53.5 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes (binary file; size unchanged)
