Skip to content

Commit ad8de25

Browse files
committed
Pushing the docs to dev/ for branch: master, commit
1 parent 08b2a01 commit ad8de25

File tree

3,287 files changed

+61755
-22235
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3,287 files changed

+61755
-22235
lines changed

dev/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 7f91379cee1de94d73196b0aacaf2c24
3+
config: c6497fb6e649c6f20da8e1e78e49ebd1
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
-9.24 KB
Binary file not shown.
-9.43 KB
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Column Transformer with Heterogeneous Data Sources\n\n\nDatasets can often contain components of that require different feature\nextraction and processing pipelines. This scenario might occur when:\n\n1. Your dataset consists of heterogeneous data types (e.g. raster images and\n text captions)\n2. Your dataset is stored in a Pandas DataFrame and different columns\n require different processing pipelines.\n\nThis example demonstrates how to use\n:class:`sklearn.compose.ColumnTransformer` on a dataset containing\ndifferent types of features. We use the 20-newsgroups dataset and compute\nstandard bag-of-words features for the subject line and body in separate\npipelines as well as ad hoc features on the body. We combine them (with\nweights) using a ColumnTransformer and finally train a classifier on the\ncombined set of features.\n\nThe choice of features is not particularly helpful, but serves to illustrate\nthe technique.\n\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Matt Terry <[email protected]>\n#\n# License: BSD 3 clause\nfrom __future__ import print_function\n\nimport numpy as np\n\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.svm import SVC\n\n\nclass TextStats(BaseEstimator, TransformerMixin):\n \"\"\"Extract features from each document for DictVectorizer\"\"\"\n\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n return [{'length': len(text),\n 'num_sentences': text.count('.')}\n for text in posts]\n\n\nclass SubjectBodyExtractor(BaseEstimator, TransformerMixin):\n \"\"\"Extract the subject & body from a usenet post in a single pass.\n\n Takes a sequence of strings and produces a dict of sequences. 
Keys are\n `subject` and `body`.\n \"\"\"\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n # construct object dtype array with two columns\n # first column = 'subject' and second column = 'body'\n features = np.empty(shape=(len(posts), 2), dtype=object)\n for i, text in enumerate(posts):\n headers, _, bod = text.partition('\\n\\n')\n bod = strip_newsgroup_footer(bod)\n bod = strip_newsgroup_quoting(bod)\n features[i, 1] = bod\n\n prefix = 'Subject:'\n sub = ''\n for line in headers.split('\\n'):\n if line.startswith(prefix):\n sub = line[len(prefix):]\n break\n features[i, 0] = sub\n\n return features\n\n\npipeline = Pipeline([\n # Extract the subject & body\n ('subjectbody', SubjectBodyExtractor()),\n\n # Use C toolumnTransformer to combine the features from subject and body\n ('union', ColumnTransformer(\n [\n # Pulling features from the post's subject line (first column)\n ('subject', TfidfVectorizer(min_df=50), 0),\n\n # Pipeline for standard bag-of-words model for body (second column)\n ('body_bow', Pipeline([\n ('tfidf', TfidfVectorizer()),\n ('best', TruncatedSVD(n_components=50)),\n ]), 1),\n\n # Pipeline for pulling ad hoc features from post's body\n ('body_stats', Pipeline([\n ('stats', TextStats()), # returns a list of dicts\n ('vect', DictVectorizer()), # list of dicts -> feature matrix\n ]), 1),\n ],\n\n # weight components in ColumnTransformer\n transformer_weights={\n 'subject': 0.8,\n 'body_bow': 0.5,\n 'body_stats': 1.0,\n }\n )),\n\n # Use a SVC classifier on the combined features\n ('svc', SVC(kernel='linear')),\n])\n\n# limit the list of categories to make running this example faster.\ncategories = ['alt.atheism', 'talk.religion.misc']\ntrain = fetch_20newsgroups(random_state=1,\n subset='train',\n categories=categories,\n )\ntest = fetch_20newsgroups(random_state=1,\n subset='test',\n categories=categories,\n )\n\npipeline.fit(train.data, train.target)\ny = pipeline.predict(test.data)\nprint(classification_report(y, 
test.target))"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name": "Python 3",
36+
"language": "python",
37+
"name": "python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name": "ipython",
42+
"version": 3
43+
},
44+
"file_extension": ".py",
45+
"mimetype": "text/x-python",
46+
"name": "python",
47+
"nbconvert_exporter": "python",
48+
"pygments_lexer": "ipython3",
49+
"version": "3.6.5"
50+
}
51+
},
52+
"nbformat": 4,
53+
"nbformat_minor": 0
54+
}

dev/_downloads/column_transformer.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""
2+
==================================================
3+
Column Transformer with Heterogeneous Data Sources
4+
==================================================
5+
6+
Datasets can often contain components that require different feature
7+
extraction and processing pipelines. This scenario might occur when:
8+
9+
1. Your dataset consists of heterogeneous data types (e.g. raster images and
10+
text captions)
11+
2. Your dataset is stored in a Pandas DataFrame and different columns
12+
require different processing pipelines.
13+
14+
This example demonstrates how to use
15+
:class:`sklearn.compose.ColumnTransformer` on a dataset containing
16+
different types of features. We use the 20-newsgroups dataset and compute
17+
standard bag-of-words features for the subject line and body in separate
18+
pipelines as well as ad hoc features on the body. We combine them (with
19+
weights) using a ColumnTransformer and finally train a classifier on the
20+
combined set of features.
21+
22+
The choice of features is not particularly helpful, but serves to illustrate
23+
the technique.
24+
"""
25+
26+
# Author: Matt Terry <[email protected]>
27+
#
28+
# License: BSD 3 clause
29+
from __future__ import print_function
30+
31+
import numpy as np
32+
33+
from sklearn.base import BaseEstimator, TransformerMixin
34+
from sklearn.datasets import fetch_20newsgroups
35+
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
36+
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
37+
from sklearn.decomposition import TruncatedSVD
38+
from sklearn.feature_extraction import DictVectorizer
39+
from sklearn.feature_extraction.text import TfidfVectorizer
40+
from sklearn.metrics import classification_report
41+
from sklearn.pipeline import Pipeline
42+
from sklearn.compose import ColumnTransformer
43+
from sklearn.svm import SVC
44+
45+
46+
class TextStats(BaseEstimator, TransformerMixin):
    """Extract per-document features (length, sentence count) for DictVectorizer."""

    @staticmethod
    def _describe(text):
        # One feature dict per document: raw character length and a crude
        # sentence count (number of '.' characters).
        return {'length': len(text),
                'num_sentences': text.count('.')}

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, posts):
        return [self._describe(text) for text in posts]
56+
57+
58+
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of post strings and produces an object-dtype array
    with two columns: subject (column 0) and body (column 1).
    """

    def fit(self, x, y=None):
        # Stateless transformer: nothing to fit.
        return self

    @staticmethod
    def _subject_of(headers):
        # Return the text following the first 'Subject:' header line,
        # or '' when no such header is present.
        prefix = 'Subject:'
        for header_line in headers.split('\n'):
            if header_line.startswith(prefix):
                return header_line[len(prefix):]
        return ''

    def transform(self, posts):
        # Object-dtype array: column 0 = 'subject', column 1 = 'body'.
        features = np.empty(shape=(len(posts), 2), dtype=object)
        for index, text in enumerate(posts):
            # Headers end at the first blank line of the post.
            headers, _, body = text.partition('\n\n')
            body = strip_newsgroup_footer(body)
            body = strip_newsgroup_quoting(body)
            features[index, 0] = self._subject_of(headers)
            features[index, 1] = body

        return features
86+
87+
88+
pipeline = Pipeline([
    # Extract the subject & body
    ('subjectbody', SubjectBodyExtractor()),

    # Use ColumnTransformer to combine the features from subject and body
    ('union', ColumnTransformer(
        [
            # Pulling features from the post's subject line (first column)
            ('subject', TfidfVectorizer(min_df=50), 0),

            # Pipeline for standard bag-of-words model for body (second column)
            ('body_bow', Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ]), 1),

            # Pipeline for pulling ad hoc features from post's body
            ('body_stats', Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ]), 1),
        ],

        # weight components in ColumnTransformer
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        }
    )),

    # Use a SVC classifier on the combined features
    ('svc', SVC(kernel='linear')),
])

# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(random_state=1,
                           subset='train',
                           categories=categories,
                           )
test = fetch_20newsgroups(random_state=1,
                          subset='test',
                          categories=categories,
                          )

pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
# classification_report expects (y_true, y_pred): ground truth first,
# predictions second — the original call had them swapped.
print(classification_report(test.target, y))

0 commit comments

Comments
 (0)