Skip to content

Commit 16af56f

Browse files
committed
Pushing the docs to 0.20/ for branch: 0.20.X, commit c40726e91e9a49f233318541366835adfb72d576
1 parent 2405087 commit 16af56f

File tree

3,026 files changed

+533223
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3,026 files changed

+533223
-0
lines changed

0.20/.buildinfo

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Sphinx build info version 1
2+
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3+
config: 7ec53ea0763238f487c61d94c6d77d05
4+
tags: 645f666f9bcd5a90fca523b33c5a78b7
1.08 MB
Binary file not shown.
867 KB
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Sample pipeline for text feature extraction and evaluation\n\n\nThe dataset used in this example is the 20 newsgroups dataset which will be\nautomatically downloaded and then cached and reused for the document\nclassification example.\n\nYou can adjust the number of categories by giving their names to the dataset\nloader or setting them to None to get the 20 of them.\n\nHere is a sample output of a run on a quad-core machine::\n\n Loading 20 newsgroups dataset for categories:\n ['alt.atheism', 'talk.religion.misc']\n 1427 documents\n 2 categories\n\n Performing grid search...\n pipeline: ['vect', 'tfidf', 'clf']\n parameters:\n {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),\n 'clf__max_iter': (10, 50, 80),\n 'clf__penalty': ('l2', 'elasticnet'),\n 'tfidf__use_idf': (True, False),\n 'vect__max_n': (1, 2),\n 'vect__max_df': (0.5, 0.75, 1.0),\n 'vect__max_features': (None, 5000, 10000, 50000)}\n done in 1737.030s\n\n Best score: 0.940\n Best parameters set:\n clf__alpha: 9.9999999999999995e-07\n clf__max_iter: 50\n clf__penalty: 'elasticnet'\n tfidf__use_idf: True\n vect__max_n: 2\n vect__max_df: 0.75\n vect__max_features: 50000\n\n\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 
'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, cv=5,\n n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name": "Python 3",
36+
"language": "python",
37+
"name": "python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name": "ipython",
42+
"version": 3
43+
},
44+
"file_extension": ".py",
45+
"mimetype": "text/x-python",
46+
"name": "python",
47+
"nbconvert_exporter": "python",
48+
"pygments_lexer": "ipython3",
49+
"version": "3.6.6"
50+
}
51+
},
52+
"nbformat": 4,
53+
"nbformat_minor": 0
54+
}
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
2+
"""
3+
==========================================================
4+
Sample pipeline for text feature extraction and evaluation
5+
==========================================================
6+
7+
The dataset used in this example is the 20 newsgroups dataset which will be
8+
automatically downloaded and then cached and reused for the document
9+
classification example.
10+
11+
You can adjust the number of categories by giving their names to the dataset
12+
loader or setting them to None to get all 20 of them.
13+
14+
Here is a sample output of a run on a quad-core machine::
15+
16+
Loading 20 newsgroups dataset for categories:
17+
['alt.atheism', 'talk.religion.misc']
18+
1427 documents
19+
2 categories
20+
21+
Performing grid search...
22+
pipeline: ['vect', 'tfidf', 'clf']
23+
parameters:
24+
{'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),
25+
'clf__max_iter': (10, 50, 80),
26+
'clf__penalty': ('l2', 'elasticnet'),
27+
'tfidf__use_idf': (True, False),
28+
'vect__max_n': (1, 2),
29+
'vect__max_df': (0.5, 0.75, 1.0),
30+
'vect__max_features': (None, 5000, 10000, 50000)}
31+
done in 1737.030s
32+
33+
Best score: 0.940
34+
Best parameters set:
35+
clf__alpha: 9.9999999999999995e-07
36+
clf__max_iter: 50
37+
clf__penalty: 'elasticnet'
38+
tfidf__use_idf: True
39+
vect__max_n: 2
40+
vect__max_df: 0.75
41+
vect__max_features: 50000
42+
43+
"""
44+
45+
# Author: Olivier Grisel <[email protected]>
46+
# Peter Prettenhofer <[email protected]>
47+
# Mathieu Blondel <[email protected]>
48+
# License: BSD 3 clause
49+
50+
from __future__ import print_function
51+
52+
from pprint import pprint
53+
from time import time
54+
import logging
55+
56+
from sklearn.datasets import fetch_20newsgroups
57+
from sklearn.feature_extraction.text import CountVectorizer
58+
from sklearn.feature_extraction.text import TfidfTransformer
59+
from sklearn.linear_model import SGDClassifier
60+
from sklearn.model_selection import GridSearchCV
61+
from sklearn.pipeline import Pipeline
62+
63+
print(__doc__)

# Emit INFO-and-above log records (timestamp, level, message) on the console
# so progress is visible while the example runs. With no explicit stream,
# logging.basicConfig attaches a handler that writes to sys.stderr.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
68+
69+
70+
# #############################################################################
# Choose the newsgroups to load from the training split
categories = ['alt.atheism', 'talk.religion.misc']
# To run the analysis on all twenty newsgroups instead, uncomment:
# categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(categories=categories, subset='train')
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()
86+
87+
# #############################################################################
# Chain the text feature extraction steps with a simple classifier:
# raw text -> token counts -> tf-idf weighting -> SGDClassifier.
text_clf_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]
pipeline = Pipeline(steps=text_clf_steps)
95+
96+
# Hyper-parameter grid explored by the search. Each key is written as
# "<pipeline step name>__<parameter name>". Enabling the commented-out
# entries widens the search, but the number of candidate combinations --
# and hence the processing time -- grows multiplicatively.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    # vect__max_features=(None, 5000, 10000, 50000),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    # tfidf__use_idf=(True, False),
    # tfidf__norm=('l1', 'l2'),
    clf__max_iter=(5,),
    clf__alpha=(1e-5, 1e-6),
    clf__penalty=('l2', 'elasticnet'),
    # clf__max_iter=(10, 50, 80),
)
109+
110+
if __name__ == "__main__":
    # The search runs its fits in parallel (n_jobs=-1); multiprocessing
    # requires that the workers be started from inside a __main__ guard.

    # Jointly tune the feature extraction and the classifier settings
    # using 5-fold cross-validation.
    grid_search = GridSearchCV(
        pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

    step_names = [step_name for step_name, _ in pipeline.steps]
    print("Performing grid search...")
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    started_at = time()
    grid_search.fit(data.data, data.target)
    elapsed = time() - started_at
    print("done in %0.3fs" % elapsed)
    print()

    # Report the winning score and, for every searched parameter, the value
    # held by the refitted best estimator.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for key in sorted(parameters):
        print("\t%s: %r" % (key, best_parameters[key]))
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Discrete versus Real AdaBoost\n\n\nThis example is based on Figure 10.2 from Hastie et al 2009 [1]_ and\nillustrates the difference in performance between the discrete SAMME [2]_\nboosting algorithm and real SAMME.R boosting algorithm. Both algorithms are\nevaluated on a binary classification task where the target Y is a non-linear\nfunction of 10 input features.\n\nDiscrete SAMME AdaBoost adapts based on errors in predicted class labels\nwhereas real SAMME.R uses the predicted class probabilities.\n\n.. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n Learning Ed. 2\", Springer, 2009.\n\n.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>,\n# Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import zero_one_loss\nfrom sklearn.ensemble import AdaBoostClassifier\n\n\nn_estimators = 400\n# A learning rate of 1. may not be optimal for both SAMME and SAMME.R\nlearning_rate = 1.\n\nX, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)\n\nX_test, y_test = X[2000:], y[2000:]\nX_train, y_train = X[:2000], y[:2000]\n\ndt_stump = DecisionTreeClassifier(max_depth=1)\ndt_stump.fit(X_train, y_train)\ndt_stump_err = 1.0 - dt_stump.score(X_test, y_test)\n\ndt = DecisionTreeClassifier(max_depth=9)\ndt.fit(X_train, y_train)\ndt_err = 1.0 - dt.score(X_test, y_test)\n\nada_discrete = AdaBoostClassifier(\n base_estimator=dt_stump,\n learning_rate=learning_rate,\n n_estimators=n_estimators,\n algorithm=\"SAMME\")\nada_discrete.fit(X_train, y_train)\n\nada_real = AdaBoostClassifier(\n base_estimator=dt_stump,\n learning_rate=learning_rate,\n n_estimators=n_estimators,\n algorithm=\"SAMME.R\")\nada_real.fit(X_train, y_train)\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',\n label='Decision Stump Error')\nax.plot([1, n_estimators], [dt_err] * 2, 'k--',\n label='Decision Tree Error')\n\nada_discrete_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):\n ada_discrete_err[i] = zero_one_loss(y_pred, y_test)\n\nada_discrete_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):\n ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)\n\nada_real_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_test)):\n ada_real_err[i] = zero_one_loss(y_pred, y_test)\n\nada_real_err_train = 
np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_train)):\n ada_real_err_train[i] = zero_one_loss(y_pred, y_train)\n\nax.plot(np.arange(n_estimators) + 1, ada_discrete_err,\n label='Discrete AdaBoost Test Error',\n color='red')\nax.plot(np.arange(n_estimators) + 1, ada_discrete_err_train,\n label='Discrete AdaBoost Train Error',\n color='blue')\nax.plot(np.arange(n_estimators) + 1, ada_real_err,\n label='Real AdaBoost Test Error',\n color='orange')\nax.plot(np.arange(n_estimators) + 1, ada_real_err_train,\n label='Real AdaBoost Train Error',\n color='green')\n\nax.set_ylim((0.0, 0.5))\nax.set_xlabel('n_estimators')\nax.set_ylabel('error rate')\n\nleg = ax.legend(loc='upper right', fancybox=True)\nleg.get_frame().set_alpha(0.7)\n\nplt.show()"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name": "Python 3",
36+
"language": "python",
37+
"name": "python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name": "ipython",
42+
"version": 3
43+
},
44+
"file_extension": ".py",
45+
"mimetype": "text/x-python",
46+
"name": "python",
47+
"nbconvert_exporter": "python",
48+
"pygments_lexer": "ipython3",
49+
"version": "3.6.6"
50+
}
51+
},
52+
"nbformat": 4,
53+
"nbformat_minor": 0
54+
}
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""
2+
=============================
3+
Discrete versus Real AdaBoost
4+
=============================
5+
6+
This example is based on Figure 10.2 from Hastie et al 2009 [1]_ and
7+
illustrates the difference in performance between the discrete SAMME [2]_
8+
boosting algorithm and the real SAMME.R boosting algorithm. Both algorithms are
9+
evaluated on a binary classification task where the target Y is a non-linear
10+
function of 10 input features.
11+
12+
Discrete SAMME AdaBoost adapts based on errors in predicted class labels
13+
whereas real SAMME.R uses the predicted class probabilities.
14+
15+
.. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
16+
Learning Ed. 2", Springer, 2009.
17+
18+
.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
19+
20+
"""
21+
print(__doc__)
22+
23+
# Author: Peter Prettenhofer <[email protected]>,
24+
# Noel Dawe <[email protected]>
25+
#
26+
# License: BSD 3 clause
27+
28+
import numpy as np
29+
import matplotlib.pyplot as plt
30+
31+
from sklearn import datasets
32+
from sklearn.tree import DecisionTreeClassifier
33+
from sklearn.metrics import zero_one_loss
34+
from sklearn.ensemble import AdaBoostClassifier
35+
36+
37+
# Number of boosting rounds, shared by both AdaBoost variants.
n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

# Generate 12000 samples; the first 2000 are used for training and the
# remaining 10000 for testing.
n_train = 2000
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
45+
46+
# Baselines: a depth-1 tree (a "stump") and a depth-9 tree, each summarised
# by its misclassification rate (1 - accuracy) on the test set.
dt_stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

dt = DecisionTreeClassifier(max_depth=9).fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)
53+
54+
# Both boosted ensembles share the same weak learner and hyper-parameters;
# only the boosting algorithm differs.
boosting_params = dict(
    base_estimator=dt_stump,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
)

ada_discrete = AdaBoostClassifier(algorithm="SAMME", **boosting_params)
ada_discrete.fit(X_train, y_train)

ada_real = AdaBoostClassifier(algorithm="SAMME.R", **boosting_params)
ada_real.fit(X_train, y_train)
67+
68+
fig, ax = plt.subplots()

# Horizontal reference lines for the two single-tree baselines, drawn across
# the full range of boosting rounds.
baselines = (
    (dt_stump_err, 'k-', 'Decision Stump Error'),
    (dt_err, 'k--', 'Decision Tree Error'),
)
for baseline_err, line_style, baseline_label in baselines:
    ax.plot([1, n_estimators], [baseline_err] * 2, line_style,
            label=baseline_label)
75+
76+
def _staged_error(model, X_eval, y_true):
    """Return the zero-one error of ``model`` after each boosting stage.

    Parameters
    ----------
    model : fitted ensemble exposing ``staged_predict``
        Queried once; each yielded prediction is scored in turn.
    X_eval : samples to predict on
    y_true : ground-truth labels matching ``X_eval``

    Returns
    -------
    errors : ndarray of shape (n_estimators,)
        ``errors[i]`` is the misclassification rate of the i-th yielded
        prediction. The array is pre-filled with zeros, so if fewer than
        ``n_estimators`` predictions are yielded the trailing entries
        remain 0.
    """
    errors = np.zeros((n_estimators,))
    for stage, y_pred in enumerate(model.staged_predict(X_eval)):
        # zero_one_loss is documented as (y_true, y_pred).
        errors[stage] = zero_one_loss(y_true, y_pred)
    return errors


# Test and train error curves for the discrete (SAMME) ensemble ...
ada_discrete_err = _staged_error(ada_discrete, X_test, y_test)
ada_discrete_err_train = _staged_error(ada_discrete, X_train, y_train)

# ... and for the real (SAMME.R) ensemble.
ada_real_err = _staged_error(ada_real, X_test, y_test)
ada_real_err_train = _staged_error(ada_real, X_train, y_train)
91+
92+
# One curve per (ensemble, data split), plotted against the 1-based number
# of boosting rounds.
stages = np.arange(n_estimators) + 1
curves = (
    (ada_discrete_err, 'Discrete AdaBoost Test Error', 'red'),
    (ada_discrete_err_train, 'Discrete AdaBoost Train Error', 'blue'),
    (ada_real_err, 'Real AdaBoost Test Error', 'orange'),
    (ada_real_err_train, 'Real AdaBoost Train Error', 'green'),
)
for curve_err, curve_label, curve_color in curves:
    ax.plot(stages, curve_err, label=curve_label, color=curve_color)

ax.set_ylim((0.0, 0.5))
ax.set_xlabel('n_estimators')
ax.set_ylabel('error rate')

# Semi-transparent legend so the curves underneath stay visible.
leg = ax.legend(loc='upper right', fancybox=True)
leg.get_frame().set_alpha(0.7)

plt.show()

0 commit comments

Comments
 (0)