
Commit 477424a

Pushing the docs to dev/ for branch: master, commit bd19a848578ebabe120313ac15db86efa9b133c4
1 parent 5179885 commit 477424a

File tree

1,072 files changed: +4168 additions, -3403 deletions

Binary file (5.18 KB): not shown.
Binary file (4 KB): not shown.
Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Column Transformer with Mixed Types\n\n\nThis example illustrates how to apply different preprocessing and\nfeature extraction pipelines to different subsets of features,\nusing :class:`sklearn.compose.ColumnTransformer`.\nThis is particularly handy for the case of datasets that contain\nheterogeneous data types, since we may want to scale the\nnumeric features and one-hot encode the categorical ones.\n\nIn this example, the numeric data is standard-scaled after\nmean-imputation, while the categorical data is one-hot\nencoded after imputing missing values with a new category\n(``'missing'``).\n\nFinally, the preprocessing pipeline is integrated in a\nfull prediction pipeline using :class:`sklearn.pipeline.Pipeline`,\ntogether with a simple classification model.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\n\nfrom sklearn.compose import make_column_transformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\nnumeric_features = ['age', 'fare']\ncategorical_features = ['embarked', 'sex', 'pclass']\n\n# Provisionally, use pd.fillna() to impute missing values for categorical\n# features; SimpleImputer will eventually support strategy=\"constant\".\ndata[categorical_features] = data[categorical_features].fillna(value='missing')\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())\ncategorical_transformer = CategoricalEncoder('onehot-dense',\n                                             handle_unknown='ignore')\n\npreprocessing_pl = make_column_transformer(\n    (numeric_features, numeric_transformer),\n    (categorical_features, categorical_transformer),\n    remainder='drop'\n)\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = make_pipeline(preprocessing_pl, LogisticRegression())\n\nX = data.drop('survived', axis=1)\ny = data.survived.values\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n                                                    shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %f\" % clf.score(X_test, y_test))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Using the prediction pipeline in a grid search\n##############################################################################\n Grid search can also be performed on the different preprocessing steps\n defined in the ``ColumnTransformer`` object, together with the classifier's\n hyperparameters as part of the ``Pipeline``.\n We will search for both the imputer strategy of the numeric preprocessing\n and the regularization parameter of the logistic regression using\n :class:`sklearn.model_selection.GridSearchCV`.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "param_grid = {\n    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],\n    'logisticregression__C': [0.1, 1.0, 10],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %f\"\n       % grid_search.score(X_test, y_test)))"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
"""
===================================
Column Transformer with Mixed Types
===================================

This example illustrates how to apply different preprocessing and
feature extraction pipelines to different subsets of features,
using :class:`sklearn.compose.ColumnTransformer`.
This is particularly handy for the case of datasets that contain
heterogeneous data types, since we may want to scale the
numeric features and one-hot encode the categorical ones.

In this example, the numeric data is standard-scaled after
mean-imputation, while the categorical data is one-hot
encoded after imputing missing values with a new category
(``'missing'``).

Finally, the preprocessing pipeline is integrated in a
full prediction pipeline using :class:`sklearn.pipeline.Pipeline`,
together with a simple classification model.
"""

# Author: Pedro Morales <[email protected]>
#
# License: BSD 3 clause

from __future__ import print_function

import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, CategoricalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV


# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Provisionally, use pd.fillna() to impute missing values for categorical
# features; SimpleImputer will eventually support strategy="constant".
data[categorical_features] = data[categorical_features].fillna(value='missing')

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder('onehot-dense',
                                             handle_unknown='ignore')

preprocessing_pl = make_column_transformer(
    (numeric_features, numeric_transformer),
    (categorical_features, categorical_transformer),
    remainder='drop'
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

X = data.drop('survived', axis=1)
y = data.survived.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    shuffle=True)

clf.fit(X_train, y_train)
print("model score: %f" % clf.score(X_test, y_test))


###############################################################################
# Using the prediction pipeline in a grid search
###############################################################################
# Grid search can also be performed on the different preprocessing steps
# defined in the ``ColumnTransformer`` object, together with the classifier's
# hyperparameters as part of the ``Pipeline``.
# We will search for both the imputer strategy of the numeric preprocessing
# and the regularization parameter of the logistic regression using
# :class:`sklearn.model_selection.GridSearchCV`.


param_grid = {
    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
    'logisticregression__C': [0.1, 1.0, 10],
}

grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %f"
       % grid_search.score(X_test, y_test)))
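
A note on the ``param_grid`` keys above: the double-underscore names follow the step names that ``make_pipeline`` and ``make_column_transformer`` assign automatically (the lowercased class name of each step, so the numeric ``Pipeline`` nested inside the column transformer is addressed as ``pipeline``). A quick sketch for listing every valid key, assuming the step-naming behavior of these helpers at this commit:

# List every tunable parameter of the composed estimator; any key used in
# param_grid must appear in this list. Assumes `clf` from the example script
# above is in scope.
for name in sorted(clf.get_params().keys()):
    print(name)
# Among others, this prints:
#   columntransformer__pipeline__simpleimputer__strategy
#   logisticregression__C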
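
The script also flags its ``pd.fillna()`` call as a provisional workaround until ``SimpleImputer`` supports ``strategy="constant"``. Once that strategy is available (it shipped in scikit-learn 0.20), the categorical imputation could move inside the pipeline itself. A hedged sketch, assuming ``SimpleImputer(strategy='constant', fill_value=...)`` accepts string data as in the released API:

# Hypothetical rewrite of the categorical preprocessing, assuming
# SimpleImputer(strategy='constant') is available: imputation then happens
# inside the pipeline, so the fillna() call on the raw dataframe is no
# longer needed.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    CategoricalEncoder('onehot-dense', handle_unknown='ignore'),
)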

dev/_downloads/scikit-learn-docs.pdf (910 KB): binary file not shown.

dev/_images/iris.png (0 Bytes)
