scikit-learn
diff --git a/‎dev/_downloads/auto_examples_jupyter.zip
-3 Bytes b/‎dev/_downloads/auto_examples_jupyter.zip
-3 Bytes
diff --git a/‎dev/_downloads/auto_examples_python.zip
-4 Bytes b/‎dev/_downloads/auto_examples_python.zip
-4 Bytes
diff --git a/‎dev/_downloads/column_transformer_mixed_types.ipynb renamed to ‎dev/_downloads/plot_column_transformer_mixed_types.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/column_transformer_mixed_types.ipynb renamed to ‎dev/_downloads/plot_column_transformer_mixed_types.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/column_transformer_mixed_types.py renamed to ‎dev/_downloads/plot_column_transformer_mixed_types.py
Lines changed: 25 additions & 22 deletions b/‎dev/_downloads/column_transformer_mixed_types.py renamed to ‎dev/_downloads/plot_column_transformer_mixed_types.py
Lines changed: 25 additions & 22 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.pdf
-66.1 KB b/‎dev/_downloads/scikit-learn-docs.pdf
-66.1 KB
diff --git a/‎dev/_images/iris.png
0 Bytes b/‎dev/_images/iris.png
0 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
604 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
604 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
604 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
604 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-243 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-243 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0021.png
-243 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0021.png
-243 Bytes
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\n\nfrom sklearn.compose import make_column_transformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\nnumeric_features = ['age', 'fare']\ncategorical_features = ['embarked', 'sex', 'pclass']\n\n# Provisionally, use pd.fillna() to impute missing values for categorical\n# features; SimpleImputer will eventually support strategy=\"constant\".\ndata[categorical_features] = data[categorical_features].fillna(value='missing')\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())\ncategorical_transformer = CategoricalEncoder('onehot-dense',\n                                             handle_unknown='ignore')\n\npreprocessing_pl = make_column_transformer(\n    (numeric_features, numeric_transformer),\n    (categorical_features, categorical_transformer),\n    remainder='drop'\n)\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = make_pipeline(preprocessing_pl, LogisticRegression())\n\nX = data.drop('survived', axis=1)\ny = data.survived.values\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n                                                    shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %f\" % clf.score(X_test, y_test))"
+        "# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\nnp.random.seed(0)\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n    ('imputer', SimpleImputer(strategy='median')),\n    ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n    ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n    transformers=[\n        ('num', numeric_transformer, numeric_features),\n        ('cat', categorical_transformer, categorical_features)],\n    remainder='drop')\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n                      ('classifier', LogisticRegression())])\n\nX = data.drop('survived', axis=1)\ny = data['survived']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n                                                    shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
       ]
     },
     {
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "param_grid = {\n    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],\n    'logisticregression__C': [0.1, 1.0, 1.0],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %f\"\n       % grid_search.score(X_test, y_test)))"
+        "param_grid = {\n    'preprocessor__num__imputer__strategy': ['mean', 'median'],\n    'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n       % grid_search.score(X_test, y_test)))"
       ]
     }
   ],
 
@@ -27,14 +27,16 @@
 from __future__ import print_function
 
 import pandas as pd
+import numpy as np
 
-from sklearn.compose import make_column_transformer
-from sklearn.pipeline import make_pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, CategoricalEncoder
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split, GridSearchCV
 
+np.random.seed(0)
 
 # Read data from Titanic dataset.
 titanic_url = ('https://raw.githubusercontent.com/amueller/'
@@ -49,36 +51,37 @@
 # - embarked: categories encoded as strings {'C', 'S', 'Q'}.
 # - sex: categories encoded as strings {'female', 'male'}.
 # - pclass: ordinal integers {1, 2, 3}.
-numeric_features = ['age', 'fare']
-categorical_features = ['embarked', 'sex', 'pclass']
-
-# Provisionally, use pd.fillna() to impute missing values for categorical
-# features; SimpleImputer will eventually support strategy="constant".
-data[categorical_features] = data[categorical_features].fillna(value='missing')
 
 # We create the preprocessing pipelines for both numeric and categorical data.
-numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
-categorical_transformer = CategoricalEncoder('onehot-dense',
-                                             handle_unknown='ignore')
+numeric_features = ['age', 'fare']
+numeric_transformer = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy='median')),
+    ('scaler', StandardScaler())])
+
+categorical_features = ['embarked', 'sex', 'pclass']
+categorical_transformer = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+    ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])
 
-preprocessing_pl = make_column_transformer(
-    (numeric_features, numeric_transformer),
-    (categorical_features, categorical_transformer),
-    remainder='drop'
-)
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', numeric_transformer, numeric_features),
+        ('cat', categorical_transformer, categorical_features)],
+    remainder='drop')
 
 # Append classifier to preprocessing pipeline.
 # Now we have a full prediction pipeline.
-clf = make_pipeline(preprocessing_pl, LogisticRegression())
+clf = Pipeline(steps=[('preprocessor', preprocessor),
+                      ('classifier', LogisticRegression())])
 
 X = data.drop('survived', axis=1)
-y = data.survived.values
+y = data['survived']
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                     shuffle=True)
 
 clf.fit(X_train, y_train)
-print("model score: %f" % clf.score(X_test, y_test))
+print("model score: %.3f" % clf.score(X_test, y_test))
 
 
 ###############################################################################
@@ -93,12 +96,12 @@
 
 
 param_grid = {
-    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
-    'logisticregression__C': [0.1, 1.0, 1.0],
+    'preprocessor__num__imputer__strategy': ['mean', 'median'],
+    'classifier__C': [0.1, 1.0, 10, 100],
 }
 
 grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
 grid_search.fit(X_train, y_train)
 
-print(("best logistic regression from grid search: %f"
+print(("best logistic regression from grid search: %.3f"
        % grid_search.score(X_test, y_test)))
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@`
`26`	`26`	`},`
`27`	`27`	`"outputs": [],`
`28`	`28`	`"source": [`
`29`		- "# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\n\nfrom sklearn.compose import make_column_transformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\nnumeric_features = ['age', 'fare']\ncategorical_features = ['embarked', 'sex', 'pclass']\n\n# Provisionally, use pd.fillna() to impute missing values for categorical\n# features; SimpleImputer will eventually support strategy=\"constant\".\ndata[categorical_features] = data[categorical_features].fillna(value='missing')\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())\ncategorical_transformer = CategoricalEncoder('onehot-dense',\n handle_unknown='ignore')\n\npreprocessing_pl = make_column_transformer(\n (numeric_features, numeric_transformer),\n (categorical_features, categorical_transformer),\n remainder='drop'\n)\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = make_pipeline(preprocessing_pl, LogisticRegression())\n\nX = data.drop('survived', axis=1)\ny = data.survived.values\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %f\" % clf.score(X_test, y_test))"
	`29`	+ "# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\nnp.random.seed(0)\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)],\n remainder='drop')\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX = data.drop('survived', axis=1)\ny = data['survived']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
`30`	`30`	`]`
`31`	`31`	`},`
`32`	`32`	`{`
`@@ -44,7 +44,7 @@`
`44`	`44`	`},`
`45`	`45`	`"outputs": [],`
`46`	`46`	`"source": [`
`47`		`- "param_grid = {\n 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],\n 'logisticregression__C': [0.1, 1.0, 1.0],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %f\"\n % grid_search.score(X_test, y_test)))"`
	`47`	`+ "param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"`
`48`	`48`	`]`
`49`	`49`	`}`
`50`	`50`	`],`