Skip to content

Commit ad8de25

Browse files
committed
Pushing the docs to dev/ for branch: master, commit
1 parent 08b2a01 commit ad8de25

File tree

3,287 files changed

+61755
-22235
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3,287 files changed

+61755
-22235
lines changed

dev/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 7f91379cee1de94d73196b0aacaf2c24
3+
config: c6497fb6e649c6f20da8e1e78e49ebd1
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
-9.24 KB
Binary file not shown.
-9.43 KB
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Column Transformer with Heterogeneous Data Sources\n\n\nDatasets can often contain components of that require different feature\nextraction and processing pipelines. This scenario might occur when:\n\n1. Your dataset consists of heterogeneous data types (e.g. raster images and\n text captions)\n2. Your dataset is stored in a Pandas DataFrame and different columns\n require different processing pipelines.\n\nThis example demonstrates how to use\n:class:`sklearn.compose.ColumnTransformer` on a dataset containing\ndifferent types of features. We use the 20-newsgroups dataset and compute\nstandard bag-of-words features for the subject line and body in separate\npipelines as well as ad hoc features on the body. We combine them (with\nweights) using a ColumnTransformer and finally train a classifier on the\ncombined set of features.\n\nThe choice of features is not particularly helpful, but serves to illustrate\nthe technique.\n\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Matt Terry <[email protected]>\n#\n# License: BSD 3 clause\nfrom __future__ import print_function\n\nimport numpy as np\n\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.svm import SVC\n\n\nclass TextStats(BaseEstimator, TransformerMixin):\n \"\"\"Extract features from each document for DictVectorizer\"\"\"\n\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n return [{'length': len(text),\n 'num_sentences': text.count('.')}\n for text in posts]\n\n\nclass SubjectBodyExtractor(BaseEstimator, TransformerMixin):\n \"\"\"Extract the subject & body from a usenet post in a single pass.\n\n Takes a sequence of strings and produces a dict of sequences. 
Keys are\n `subject` and `body`.\n \"\"\"\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n # construct object dtype array with two columns\n # first column = 'subject' and second column = 'body'\n features = np.empty(shape=(len(posts), 2), dtype=object)\n for i, text in enumerate(posts):\n headers, _, bod = text.partition('\\n\\n')\n bod = strip_newsgroup_footer(bod)\n bod = strip_newsgroup_quoting(bod)\n features[i, 1] = bod\n\n prefix = 'Subject:'\n sub = ''\n for line in headers.split('\\n'):\n if line.startswith(prefix):\n sub = line[len(prefix):]\n break\n features[i, 0] = sub\n\n return features\n\n\npipeline = Pipeline([\n # Extract the subject & body\n ('subjectbody', SubjectBodyExtractor()),\n\n # Use C toolumnTransformer to combine the features from subject and body\n ('union', ColumnTransformer(\n [\n # Pulling features from the post's subject line (first column)\n ('subject', TfidfVectorizer(min_df=50), 0),\n\n # Pipeline for standard bag-of-words model for body (second column)\n ('body_bow', Pipeline([\n ('tfidf', TfidfVectorizer()),\n ('best', TruncatedSVD(n_components=50)),\n ]), 1),\n\n # Pipeline for pulling ad hoc features from post's body\n ('body_stats', Pipeline([\n ('stats', TextStats()), # returns a list of dicts\n ('vect', DictVectorizer()), # list of dicts -> feature matrix\n ]), 1),\n ],\n\n # weight components in ColumnTransformer\n transformer_weights={\n 'subject': 0.8,\n 'body_bow': 0.5,\n 'body_stats': 1.0,\n }\n )),\n\n # Use a SVC classifier on the combined features\n ('svc', SVC(kernel='linear')),\n])\n\n# limit the list of categories to make running this example faster.\ncategories = ['alt.atheism', 'talk.religion.misc']\ntrain = fetch_20newsgroups(random_state=1,\n subset='train',\n categories=categories,\n )\ntest = fetch_20newsgroups(random_state=1,\n subset='test',\n categories=categories,\n )\n\npipeline.fit(train.data, train.target)\ny = pipeline.predict(test.data)\nprint(classification_report(y, 
test.target))"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name": "Python 3",
36+
"language": "python",
37+
"name": "python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name": "ipython",
42+
"version": 3
43+
},
44+
"file_extension": ".py",
45+
"mimetype": "text/x-python",
46+
"name": "python",
47+
"nbconvert_exporter": "python",
48+
"pygments_lexer": "ipython3",
49+
"version": "3.6.5"
50+
}
51+
},
52+
"nbformat": 4,
53+
"nbformat_minor": 0
54+
}

dev/_downloads/column_transformer.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""
2+
==================================================
3+
Column Transformer with Heterogeneous Data Sources
4+
==================================================
5+
6+
Datasets can often contain components that require different feature
7+
extraction and processing pipelines. This scenario might occur when:
8+
9+
1. Your dataset consists of heterogeneous data types (e.g. raster images and
10+
text captions)
11+
2. Your dataset is stored in a Pandas DataFrame and different columns
12+
require different processing pipelines.
13+
14+
This example demonstrates how to use
15+
:class:`sklearn.compose.ColumnTransformer` on a dataset containing
16+
different types of features. We use the 20-newsgroups dataset and compute
17+
standard bag-of-words features for the subject line and body in separate
18+
pipelines as well as ad hoc features on the body. We combine them (with
19+
weights) using a ColumnTransformer and finally train a classifier on the
20+
combined set of features.
21+
22+
The choice of features is not particularly helpful, but serves to illustrate
23+
the technique.
24+
"""
25+
26+
# Author: Matt Terry <[email protected]>
27+
#
28+
# License: BSD 3 clause
29+
from __future__ import print_function
30+
31+
import numpy as np
32+
33+
from sklearn.base import BaseEstimator, TransformerMixin
34+
from sklearn.datasets import fetch_20newsgroups
35+
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
36+
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
37+
from sklearn.decomposition import TruncatedSVD
38+
from sklearn.feature_extraction import DictVectorizer
39+
from sklearn.feature_extraction.text import TfidfVectorizer
40+
from sklearn.metrics import classification_report
41+
from sklearn.pipeline import Pipeline
42+
from sklearn.compose import ColumnTransformer
43+
from sklearn.svm import SVC
44+
45+
46+
class TextStats(BaseEstimator, TransformerMixin):
    """Extract per-document features (length, sentence count) for DictVectorizer."""

    @staticmethod
    def _describe(text):
        # One feature dict per document: raw character length and a crude
        # sentence count (number of '.' characters).
        return {'length': len(text),
                'num_sentences': text.count('.')}

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, posts):
        return [self._describe(text) for text in posts]
56+
57+
58+
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of post strings and produces an object-dtype array
    with two columns: subject (column 0) and body (column 1).
    """

    def fit(self, x, y=None):
        # Stateless transformer: nothing to fit.
        return self

    @staticmethod
    def _subject_of(headers):
        # Return the text following the first 'Subject:' header line,
        # or '' when no such header is present.
        prefix = 'Subject:'
        for header_line in headers.split('\n'):
            if header_line.startswith(prefix):
                return header_line[len(prefix):]
        return ''

    def transform(self, posts):
        # Object-dtype array: column 0 = 'subject', column 1 = 'body'.
        features = np.empty(shape=(len(posts), 2), dtype=object)
        for index, text in enumerate(posts):
            # Headers end at the first blank line of the post.
            headers, _, body = text.partition('\n\n')
            body = strip_newsgroup_footer(body)
            body = strip_newsgroup_quoting(body)
            features[index, 0] = self._subject_of(headers)
            features[index, 1] = body

        return features
86+
87+
88+
pipeline = Pipeline([
    # Extract the subject & body
    ('subjectbody', SubjectBodyExtractor()),

    # Use ColumnTransformer to combine the features from subject and body
    ('union', ColumnTransformer(
        [
            # Pulling features from the post's subject line (first column)
            ('subject', TfidfVectorizer(min_df=50), 0),

            # Pipeline for standard bag-of-words model for body (second column)
            ('body_bow', Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ]), 1),

            # Pipeline for pulling ad hoc features from post's body
            ('body_stats', Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ]), 1),
        ],

        # weight components in ColumnTransformer
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        }
    )),

    # Use a SVC classifier on the combined features
    ('svc', SVC(kernel='linear')),
])

# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(random_state=1,
                           subset='train',
                           categories=categories,
                           )
test = fetch_20newsgroups(random_state=1,
                          subset='test',
                          categories=categories,
                          )

pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
# classification_report expects (y_true, y_pred): ground truth first,
# predictions second — the original call had them swapped.
print(classification_report(test.target, y))

0 commit comments

Comments
 (0)