
Commit 6d6066e

Pushing the docs to 0.22/ for branch: 0.22.X, commit bfa2750ad3ae2ec8c87195786abd8c5cf4448758
1 parent 36aa9db commit 6d6066e

3,557 files changed: +636,012 −0 lines changed


0.22/.buildinfo

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 506805eb51c13bf5284500a7ba8b5be1
tags: 645f666f9bcd5a90fca523b33c5a78b7
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Lasso path using LARS\n\n\nComputes Lasso Path along the regularization parameter using the LARS\nalgorithm on the diabetes dataset. Each color represents a different\nfeature of the coefficient vector, and this is displayed as a function\nof the regularization parameter.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Author: Fabian Pedregosa <[email protected]>\n# Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import linear_model\nfrom sklearn import datasets\n\nX, y = datasets.load_diabetes(return_X_y=True)\n\nprint(\"Computing regularization path using the LARS ...\")\n_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)\n\nxx = np.sum(np.abs(coefs.T), axis=1)\nxx /= xx[-1]\n\nplt.plot(xx, coefs.T)\nymin, ymax = plt.ylim()\nplt.vlines(xx, ymin, ymax, linestyle='dashed')\nplt.xlabel('|coef| / max|coef|')\nplt.ylabel('Coefficients')\nplt.title('LASSO Path')\nplt.axis('tight')\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
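For reference, a minimal sketch (not part of the committed files; it assumes scikit-learn >= 0.22 and the same diabetes data) of how the LARS-based path above relates to the coordinate-descent path returned by linear_model.lasso_path:

import numpy as np
from sklearn import datasets, linear_model

X, y = datasets.load_diabetes(return_X_y=True)

# LARS path: regularization values, active set, and coefficients along the path
alphas_lars, _, coefs_lars = linear_model.lars_path(X, y, method='lasso')

# Coordinate-descent path on the same data, for comparison
alphas_cd, coefs_cd, _ = linear_model.lasso_path(X, y)

# Both return one column of coefficients per regularization value
print(coefs_lars.shape, coefs_cd.shape)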
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Label Propagation digits active learning\n\n\nDemonstrates an active learning technique to learn handwritten digits\nusing label propagation.\n\nWe start by training a label propagation model with only 10 labeled points,\nthen we select the top five most uncertain points to label. Next, we train\nwith 15 labeled points (original 10 + 5 new ones). We repeat this process\nfour times to have a model trained with 30 labeled examples. Note you can\nincrease this to label more than 30 by changing `max_iterations`. Labeling\nmore than 30 can be useful to get a sense for the speed of convergence of\nthis active learning technique.\n\nA plot will appear showing the top 5 most uncertain digits for each iteration\nof training. These may or may not contain mistakes, but we will train the next\nmodel with their true labels.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
"print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\nfrom sklearn import datasets\nfrom sklearn.semi_supervised import LabelSpreading\nfrom sklearn.metrics import classification_report, confusion_matrix\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(0)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)\n\nX = digits.data[indices[:330]]\ny = digits.target[indices[:330]]\nimages = digits.images[indices[:330]]\n\nn_total_samples = len(y)\nn_labeled_points = 40\nmax_iterations = 5\n\nunlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]\nf = plt.figure()\n\nfor i in range(max_iterations):\n if len(unlabeled_indices) == 0:\n print(\"No unlabeled items left to label.\")\n break\n y_train = np.copy(y)\n y_train[unlabeled_indices] = -1\n\n lp_model = LabelSpreading(gamma=0.25, max_iter=20)\n lp_model.fit(X, y_train)\n\n predicted_labels = lp_model.transduction_[unlabeled_indices]\n true_labels = y[unlabeled_indices]\n\n cm = confusion_matrix(true_labels, predicted_labels,\n labels=lp_model.classes_)\n\n print(\"Iteration %i %s\" % (i, 70 * \"_\"))\n print(\"Label Spreading model: %d labeled & %d unlabeled (%d total)\"\n % (n_labeled_points, n_total_samples - n_labeled_points,\n n_total_samples))\n\n print(classification_report(true_labels, predicted_labels))\n\n print(\"Confusion matrix\")\n print(cm)\n\n # compute the entropies of transduced label distributions\n pred_entropies = stats.distributions.entropy(\n lp_model.label_distributions_.T)\n\n # select up to 5 digit examples that the classifier is most uncertain about\n uncertainty_index = np.argsort(pred_entropies)[::-1]\n uncertainty_index = uncertainty_index[\n np.in1d(uncertainty_index, unlabeled_indices)][:5]\n\n # keep track of indices that we get labels for\n delete_indices = np.array([], dtype=int)\n\n # for more than 5 iterations, visualize the gain only on the first 5\n if i < 5:\n f.text(.05, (1 - (i + 1) * .183),\n \"model %d\\n\\nfit with\\n%d labels\" %\n ((i + 1), i * 5 + 10), size=10)\n for index, image_index in enumerate(uncertainty_index):\n image = images[image_index]\n\n # for more than 5 iterations, visualize the gain only on the first 5\n if i < 5:\n sub = f.add_subplot(5, 5, index + 1 + (5 * i))\n sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none')\n sub.set_title(\"predict: %i\\ntrue: %i\" % (\n lp_model.transduction_[image_index], y[image_index]), size=10)\n sub.axis('off')\n\n # labeling 5 points, remote from labeled set\n delete_index, = np.where(unlabeled_indices == image_index)\n delete_indices = np.concatenate((delete_indices, delete_index))\n\n unlabeled_indices = np.delete(unlabeled_indices, delete_indices)\n n_labeled_points += len(uncertainty_index)\n\nf.suptitle(\"Active learning with Label Propagation.\\nRows show 5 most \"\n \"uncertain labels to learn with the next model.\", y=1.15)\nplt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2,\n hspace=0.85)\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
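For reference, a minimal sketch (not part of the committed files; assumptions: scikit-learn >= 0.22, the digits data, an arbitrary budget of 50 initial labels) isolating the entropy-based query step that the example above repeats at every iteration:

import numpy as np
from scipy.stats import entropy
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading

X, y = datasets.load_digits(return_X_y=True)
y_train = np.copy(y)
y_train[50:] = -1      # pretend only the first 50 points are labeled

model = LabelSpreading(gamma=0.25, max_iter=20)
model.fit(X, y_train)

# Entropy of the transduced label distributions: high entropy means uncertain
uncertainty = entropy(model.label_distributions_.T)
query = np.argsort(uncertainty)[::-1][:5]   # the 5 most uncertain samples to label next
print("indices to label next:", query)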
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
"""
=================================
Combine predictors using stacking
=================================

Stacking refers to a method of blending estimators. In this strategy, some
estimators are individually fitted on some training data while a final
estimator is trained using the stacked predictions of these base estimators.

In this example, we illustrate the use case in which different regressors are
stacked together and a final linear penalized regressor is used to output the
prediction. We compare the performance of each individual regressor with the
stacking strategy. Stacking slightly improves the overall performance.

"""
print(__doc__)

# Authors: Guillaume Lemaitre <[email protected]>
# License: BSD 3 clause

###############################################################################
# The function ``plot_regression_results`` is used to plot the predicted and
# true targets.

import matplotlib.pyplot as plt


def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Scatter plot of the predicted vs true targets."""
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()],
            '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([y_true.min(), y_true.max()])
    ax.set_ylim([y_true.min(), y_true.max()])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
    ax.legend([extra], [scores], loc='upper left')
    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
    ax.set_title(title)


###############################################################################
# Stack of predictors on a single data set
###############################################################################
# It is sometimes tedious to find the model which performs best on a given
# dataset. Stacking provides an alternative by combining the outputs of several
# learners, without the need to choose a model specifically. The performance of
# stacking is usually close to that of the best model, and sometimes it can
# outperform the prediction performance of each individual model.
#
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs together.

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

estimators = [
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Lasso', LassoCV()),
    ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)


###############################################################################
# We use the Boston dataset (prediction of house prices). We check the
# performance of each individual predictor as well as of the stack of the
# regressors.

import time
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_validate, cross_val_predict

X, y = load_boston(return_X_y=True)

fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X, y,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()

###############################################################################
# The stacked regressor combines the strengths of the different regressors.
# However, we also see that training the stacked regressor is much more
# computationally expensive.
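For reference, a minimal sketch (not part of the committed files; assuming scikit-learn >= 0.22 and a reduced pair of base learners) of fitting a stacked regressor once on a held-out split rather than cross-validating it:

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import train_test_split

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

reg = StackingRegressor(
    estimators=[('rf', RandomForestRegressor(random_state=42)),
                ('lasso', LassoCV())],
    final_estimator=RidgeCV())
reg.fit(X_train, y_train)

# The final RidgeCV is trained on out-of-fold predictions of the base
# estimators, so this single-split score is a reasonable sanity check.
print("held-out R^2: %.2f" % reg.score(X_test, y_test))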
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
"""
Online learning of a dictionary of parts of faces
==================================================

This example uses a large dataset of faces to learn a set of 20 x 20
image patches that constitute faces.

From the programming standpoint, it is interesting because it shows how
to use the online API of scikit-learn to process a very large
dataset by chunks. The way we proceed is that we load an image at a time
and randomly extract 50 patches from this image. Once we have accumulated
500 of these patches (using 10 images), we run the
:func:`~sklearn.cluster.MiniBatchKMeans.partial_fit` method
of the online KMeans object, MiniBatchKMeans.

The verbose setting on the MiniBatchKMeans enables us to see that some
clusters are reassigned during the successive calls to
partial_fit. This is because the number of patches that they represent
has become too low, and it is better to choose a random new
cluster.
"""
print(__doc__)

import time

import matplotlib.pyplot as plt
import numpy as np


from sklearn import datasets
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.image import extract_patches_2d

faces = datasets.fetch_olivetti_faces()

# #############################################################################
# Learn the dictionary of images

print('Learning the dictionary... ')
rng = np.random.RandomState(0)
kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
patch_size = (20, 20)

buffer = []
t0 = time.time()

# The online learning part: cycle over the whole dataset 6 times
index = 0
for _ in range(6):
    for img in faces.images:
        data = extract_patches_2d(img, patch_size, max_patches=50,
                                  random_state=rng)
        data = np.reshape(data, (len(data), -1))
        buffer.append(data)
        index += 1
        if index % 10 == 0:
            data = np.concatenate(buffer, axis=0)
            data -= np.mean(data, axis=0)
            data /= np.std(data, axis=0)
            kmeans.partial_fit(data)
            buffer = []
        if index % 100 == 0:
            print('Partial fit of %4i out of %i'
                  % (index, 6 * len(faces.images)))

dt = time.time() - t0
print('done in %.2fs.' % dt)

# #############################################################################
# Plot the results
plt.figure(figsize=(4.2, 4))
for i, patch in enumerate(kmeans.cluster_centers_):
    plt.subplot(9, 9, i + 1)
    plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())

# 6 passes over the dataset, 50 patches extracted per image
plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches' %
             (dt, 6 * 50 * len(faces.images)), fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

plt.show()
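For reference, a minimal sketch (not part of the committed files; synthetic data standing in for the buffered face patches) of the chunked partial_fit pattern the example above relies on:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
kmeans = MiniBatchKMeans(n_clusters=8, random_state=rng)

for _ in range(20):                   # stream 20 chunks, one at a time
    chunk = rng.randn(500, 400)       # stand-in for a buffer of flattened patches
    chunk -= chunk.mean(axis=0)       # same per-chunk normalization as above
    chunk /= chunk.std(axis=0)
    kmeans.partial_fit(chunk)

print(kmeans.cluster_centers_.shape)  # (8, 400): one centroid per cluster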
