
Commit 835430a

Pushing the docs to 0.23/ for branch: 0.23.X, commit 425564b24a87d043e1a46ebf38e60c6cdb7370ff
1 parent f597c27 commit 835430a

File tree

3,623 files changed (+664,866, -0 lines changed)


0.23/.buildinfo

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 39d2e4864eb47d9365787dcc7c68a004
tags: 645f666f9bcd5a90fca523b33c5a78b7
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Lasso path using LARS\n\n\nComputes Lasso Path along the regularization parameter using the LARS\nalgorithm on the diabetes dataset. Each color represents a different\nfeature of the coefficient vector, and this is displayed as a function\nof the regularization parameter.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Author: Fabian Pedregosa <[email protected]>\n# Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import linear_model\nfrom sklearn import datasets\n\nX, y = datasets.load_diabetes(return_X_y=True)\n\nprint(\"Computing regularization path using the LARS ...\")\n_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)\n\nxx = np.sum(np.abs(coefs.T), axis=1)\nxx /= xx[-1]\n\nplt.plot(xx, coefs.T)\nymin, ymax = plt.ylim()\nplt.vlines(xx, ymin, ymax, linestyle='dashed')\nplt.xlabel('|coef| / max|coef|')\nplt.ylabel('Coefficients')\nplt.title('LASSO Path')\nplt.axis('tight')\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Label Propagation digits active learning\n\n\nDemonstrates an active learning technique to learn handwritten digits\nusing label propagation.\n\nWe start by training a label propagation model with only 10 labeled points,\nthen we select the top five most uncertain points to label. Next, we train\nwith 15 labeled points (original 10 + 5 new ones). We repeat this process\nfour times to have a model trained with 30 labeled examples. Note you can\nincrease this to label more than 30 by changing `max_iterations`. Labeling\nmore than 30 can be useful to get a sense for the speed of convergence of\nthis active learning technique.\n\nA plot will appear showing the top 5 most uncertain digits for each iteration\nof training. These may or may not contain mistakes, but we will train the next\nmodel with their true labels.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\nfrom sklearn import datasets\nfrom sklearn.semi_supervised import LabelSpreading\nfrom sklearn.metrics import classification_report, confusion_matrix\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(0)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)\n\nX = digits.data[indices[:330]]\ny = digits.target[indices[:330]]\nimages = digits.images[indices[:330]]\n\nn_total_samples = len(y)\nn_labeled_points = 40\nmax_iterations = 5\n\nunlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]\nf = plt.figure()\n\nfor i in range(max_iterations):\n    if len(unlabeled_indices) == 0:\n        print(\"No unlabeled items left to label.\")\n        break\n    y_train = np.copy(y)\n    y_train[unlabeled_indices] = -1\n\n    lp_model = LabelSpreading(gamma=0.25, max_iter=20)\n    lp_model.fit(X, y_train)\n\n    predicted_labels = lp_model.transduction_[unlabeled_indices]\n    true_labels = y[unlabeled_indices]\n\n    cm = confusion_matrix(true_labels, predicted_labels,\n                          labels=lp_model.classes_)\n\n    print(\"Iteration %i %s\" % (i, 70 * \"_\"))\n    print(\"Label Spreading model: %d labeled & %d unlabeled (%d total)\"\n          % (n_labeled_points, n_total_samples - n_labeled_points,\n             n_total_samples))\n\n    print(classification_report(true_labels, predicted_labels))\n\n    print(\"Confusion matrix\")\n    print(cm)\n\n    # compute the entropies of transduced label distributions\n    pred_entropies = stats.distributions.entropy(\n        lp_model.label_distributions_.T)\n\n    # select up to 5 digit examples that the classifier is most uncertain about\n    uncertainty_index = np.argsort(pred_entropies)[::-1]\n    uncertainty_index = uncertainty_index[\n        np.in1d(uncertainty_index, unlabeled_indices)][:5]\n\n    # keep track of indices that we get labels for\n    delete_indices = np.array([], dtype=int)\n\n    # for more than 5 iterations, visualize the gain only on the first 5\n    if i < 5:\n        f.text(.05, (1 - (i + 1) * .183),\n               \"model %d\\n\\nfit with\\n%d labels\" %\n               ((i + 1), i * 5 + 10), size=10)\n    for index, image_index in enumerate(uncertainty_index):\n        image = images[image_index]\n\n        # for more than 5 iterations, visualize the gain only on the first 5\n        if i < 5:\n            sub = f.add_subplot(5, 5, index + 1 + (5 * i))\n            sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none')\n            sub.set_title(\"predict: %i\\ntrue: %i\" % (\n                lp_model.transduction_[image_index], y[image_index]), size=10)\n            sub.axis('off')\n\n        # labeling 5 points, remote from labeled set\n        delete_index, = np.where(unlabeled_indices == image_index)\n        delete_indices = np.concatenate((delete_indices, delete_index))\n\n    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)\n    n_labeled_points += len(uncertainty_index)\n\nf.suptitle(\"Active learning with Label Propagation.\\nRows show 5 most \"\n           \"uncertain labels to learn with the next model.\", y=1.15)\nplt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2,\n                    hspace=0.85)\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@
"""
=================================
Combine predictors using stacking
=================================

.. currentmodule:: sklearn

Stacking refers to a method to blend estimators. In this strategy, some
estimators are individually fitted on some training data while a final
estimator is trained using the stacked predictions of these base estimators.

In this example, we illustrate the use case in which different regressors are
stacked together and a final penalized linear regressor is used to output the
prediction. We compare the performance of each individual regressor with the
stacking strategy. Stacking slightly improves the overall performance.

"""
print(__doc__)

# Authors: Guillaume Lemaitre <[email protected]>
#          Maria Telenczuk <https://github.com/maikia>
# License: BSD 3 clause


###############################################################################
# Download the dataset
###############################################################################
#
# We will use the `Ames Housing`_ dataset, which was first compiled by Dean De
# Cock and became better known after it was used in a Kaggle challenge. It is
# a set of 1460 residential homes in Ames, Iowa, each described by 80
# features. We will use it to predict the final logarithmic price of the
# houses. In this example we will use only the 20 most interesting features,
# chosen using GradientBoostingRegressor(), and limit the number of entries
# (here we won't go into the details on how to select the most interesting
# features).
#
# The Ames housing dataset is not shipped with scikit-learn and therefore we
# will fetch it from `OpenML`_.
#
# .. _`Ames Housing`: http://jse.amstat.org/v19n3/decock.pdf
# .. _`OpenML`: https://www.openml.org/d/42165

import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle


def load_ames_housing():
    df = fetch_openml(name="house_prices", as_frame=True)
    X = df.data
    y = df.target

    features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating',
                'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea',
                'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars',
                'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1',
                'HouseStyle', 'MiscFeature', 'MoSold']

    X = X[features]
    X, y = shuffle(X, y, random_state=0)

    X = X[:600]
    y = y[:600]
    return X, np.log(y)


X, y = load_ames_housing()


###############################################################################
# Make pipeline to preprocess the data
###############################################################################
#
# Before we can use the Ames dataset we still need to do some preprocessing.
# First, the dataset has many missing values. To impute them, we will replace
# categorical missing values with the new category 'missing' and numerical
# missing values with the 'mean' of the column. We will also encode the
# categories with either :class:`sklearn.preprocessing.OneHotEncoder
# <sklearn.preprocessing.OneHotEncoder>` or
# :class:`sklearn.preprocessing.OrdinalEncoder
# <sklearn.preprocessing.OrdinalEncoder>` depending on which type of model we
# will use them with (linear or non-linear model). To facilitate this
# preprocessing we will make two pipelines.
# You can skip this section if your data is ready to use and does
# not need preprocessing.


from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


cat_cols = X.columns[X.dtypes == 'O']
num_cols = X.columns[X.dtypes == 'float64']

categories = [
    X[column].unique() for column in X[cat_cols]]

for cat in categories:
    cat[cat == None] = 'missing'  # noqa

cat_proc_nlin = make_pipeline(
    SimpleImputer(missing_values=None, strategy='constant',
                  fill_value='missing'),
    OrdinalEncoder(categories=categories)
)

num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))

cat_proc_lin = make_pipeline(
    SimpleImputer(missing_values=None,
                  strategy='constant',
                  fill_value='missing'),
    OneHotEncoder(categories=categories)
)

num_proc_lin = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# transformation to use for non-linear estimators
processor_nlin = make_column_transformer(
    (cat_proc_nlin, cat_cols),
    (num_proc_nlin, num_cols),
    remainder='passthrough')

# transformation to use for linear estimators
processor_lin = make_column_transformer(
    (cat_proc_lin, cat_cols),
    (num_proc_lin, num_cols),
    remainder='passthrough')


###############################################################################
# Stack of predictors on a single data set
###############################################################################
#
# It is sometimes tedious to find the model which will perform best on a given
# dataset. Stacking provides an alternative by combining the outputs of several
# learners, without the need to choose a model specifically. The performance of
# stacking is usually close to the best model and sometimes it can outperform
# the prediction performance of each individual model.
#
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs together.
#
# Note: although we will make new pipelines with the processors which we wrote
# in the previous section for the 3 learners, the final estimator RidgeCV()
# does not need preprocessing of the data as it will be fed with the already
# preprocessed output from the 3 learners.


from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV


lasso_pipeline = make_pipeline(processor_lin,
                               LassoCV())

rf_pipeline = make_pipeline(processor_nlin,
                            RandomForestRegressor(random_state=42))

gradient_pipeline = make_pipeline(
    processor_nlin,
    HistGradientBoostingRegressor(random_state=0))

estimators = [('Random Forest', rf_pipeline),
              ('Lasso', lasso_pipeline),
              ('Gradient Boosting', gradient_pipeline)]

stacking_regressor = StackingRegressor(estimators=estimators,
                                       final_estimator=RidgeCV())


###############################################################################
# Measure and plot the results
###############################################################################
#
# Now we can use the Ames Housing dataset to make the predictions. We check
# the performance of each individual predictor as well as of the stack of the
# regressors.
#
# The function ``plot_regression_results`` is used to plot the predicted and
# true targets.


import time
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_predict


def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Scatter plot of the predicted vs true targets."""
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()],
            '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([y_true.min(), y_true.max()])
    ax.set_ylim([y_true.min(), y_true.max()])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
    ax.legend([extra], [scores], loc='upper left')
    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
    ax.set_title(title)


fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X, y,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)

    plot_regression_results(
        ax, y, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()

###############################################################################
# The stacked regressor will combine the strengths of the different regressors.
# However, we also see that training the stacked regressor is much more
# computationally expensive.
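For readers who want to try the stacking pattern from the example above without the OpenML download, here is a minimal, self-contained sketch. It is not part of the committed files; the base-learner choices and the variable names (demo_stack, X_demo, y_demo) are illustrative assumptions only, reusing the scikit-learn built-in diabetes dataset.

# Hedged sketch: a StackingRegressor with two base learners and a RidgeCV blender,
# evaluated with cross-validated R^2 on the built-in diabetes data.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import cross_val_score

X_demo, y_demo = load_diabetes(return_X_y=True)

demo_stack = StackingRegressor(
    estimators=[('rf', RandomForestRegressor(random_state=42)),
                ('lasso', LassoCV())],
    final_estimator=RidgeCV())

# The stack's score is typically close to, and sometimes above, its best base learner.
print(cross_val_score(demo_stack, X_demo, y_demo, scoring='r2').mean())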
