
Commit 835430a

Pushing the docs to 0.23/ for branch: 0.23.X, commit 425564b24a87d043e1a46ebf38e60c6cdb7370ff
1 parent f597c27 commit 835430a

File tree

3,623 files changed (+664,866, -0 lines changed)


0.23/.buildinfo

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 39d2e4864eb47d9365787dcc7c68a004
tags: 645f666f9bcd5a90fca523b33c5a78b7
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Lasso path using LARS\n\n\nComputes Lasso Path along the regularization parameter using the LARS\nalgorithm on the diabetes dataset. Each color represents a different\nfeature of the coefficient vector, and this is displayed as a function\nof the regularization parameter.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Author: Fabian Pedregosa <[email protected]>\n# Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import linear_model\nfrom sklearn import datasets\n\nX, y = datasets.load_diabetes(return_X_y=True)\n\nprint(\"Computing regularization path using the LARS ...\")\n_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)\n\nxx = np.sum(np.abs(coefs.T), axis=1)\nxx /= xx[-1]\n\nplt.plot(xx, coefs.T)\nymin, ymax = plt.ylim()\nplt.vlines(xx, ymin, ymax, linestyle='dashed')\nplt.xlabel('|coef| / max|coef|')\nplt.ylabel('Coefficients')\nplt.title('LASSO Path')\nplt.axis('tight')\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Label Propagation digits active learning\n\n\nDemonstrates an active learning technique to learn handwritten digits\nusing label propagation.\n\nWe start by training a label propagation model with only 10 labeled points,\nthen we select the top five most uncertain points to label. Next, we train\nwith 15 labeled points (original 10 + 5 new ones). We repeat this process\nfour times to have a model trained with 30 labeled examples. Note you can\nincrease this to label more than 30 by changing `max_iterations`. Labeling\nmore than 30 can be useful to get a sense for the speed of convergence of\nthis active learning technique.\n\nA plot will appear showing the top 5 most uncertain digits for each iteration\nof training. These may or may not contain mistakes, but we will train the next\nmodel with their true labels.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\nfrom sklearn import datasets\nfrom sklearn.semi_supervised import LabelSpreading\nfrom sklearn.metrics import classification_report, confusion_matrix\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(0)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)\n\nX = digits.data[indices[:330]]\ny = digits.target[indices[:330]]\nimages = digits.images[indices[:330]]\n\nn_total_samples = len(y)\nn_labeled_points = 40\nmax_iterations = 5\n\nunlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]\nf = plt.figure()\n\nfor i in range(max_iterations):\n    if len(unlabeled_indices) == 0:\n        print(\"No unlabeled items left to label.\")\n        break\n    y_train = np.copy(y)\n    y_train[unlabeled_indices] = -1\n\n    lp_model = LabelSpreading(gamma=0.25, max_iter=20)\n    lp_model.fit(X, y_train)\n\n    predicted_labels = lp_model.transduction_[unlabeled_indices]\n    true_labels = y[unlabeled_indices]\n\n    cm = confusion_matrix(true_labels, predicted_labels,\n                          labels=lp_model.classes_)\n\n    print(\"Iteration %i %s\" % (i, 70 * \"_\"))\n    print(\"Label Spreading model: %d labeled & %d unlabeled (%d total)\"\n          % (n_labeled_points, n_total_samples - n_labeled_points,\n             n_total_samples))\n\n    print(classification_report(true_labels, predicted_labels))\n\n    print(\"Confusion matrix\")\n    print(cm)\n\n    # compute the entropies of transduced label distributions\n    pred_entropies = stats.distributions.entropy(\n        lp_model.label_distributions_.T)\n\n    # select up to 5 digit examples that the classifier is most uncertain about\n    uncertainty_index = np.argsort(pred_entropies)[::-1]\n    uncertainty_index = uncertainty_index[\n        np.in1d(uncertainty_index, unlabeled_indices)][:5]\n\n    # keep track of indices that we get labels for\n    delete_indices = np.array([], dtype=int)\n\n    # for more than 5 iterations, visualize the gain only on the first 5\n    if i < 5:\n        f.text(.05, (1 - (i + 1) * .183),\n               \"model %d\\n\\nfit with\\n%d labels\" %\n               ((i + 1), i * 5 + 10), size=10)\n    for index, image_index in enumerate(uncertainty_index):\n        image = images[image_index]\n\n        # for more than 5 iterations, visualize the gain only on the first 5\n        if i < 5:\n            sub = f.add_subplot(5, 5, index + 1 + (5 * i))\n            sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none')\n            sub.set_title(\"predict: %i\\ntrue: %i\" % (\n                lp_model.transduction_[image_index], y[image_index]), size=10)\n            sub.axis('off')\n\n        # labeling 5 points, remote from labeled set\n        delete_index, = np.where(unlabeled_indices == image_index)\n        delete_indices = np.concatenate((delete_indices, delete_index))\n\n    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)\n    n_labeled_points += len(uncertainty_index)\n\nf.suptitle(\"Active learning with Label Propagation.\\nRows show 5 most \"\n           \"uncertain labels to learn with the next model.\", y=1.15)\nplt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2,\n                    hspace=0.85)\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@
"""
=================================
Combine predictors using stacking
=================================

.. currentmodule:: sklearn

Stacking refers to a method to blend estimators. In this strategy, some
estimators are individually fitted on some training data while a final
estimator is trained using the stacked predictions of these base estimators.

In this example, we illustrate the use case in which different regressors are
stacked together and a final penalized linear regressor is used to output the
prediction. We compare the performance of each individual regressor with the
stacking strategy. Stacking slightly improves the overall performance.

"""
print(__doc__)

# Authors: Guillaume Lemaitre <[email protected]>
#          Maria Telenczuk <https://github.com/maikia>
# License: BSD 3 clause


###############################################################################
# Download the dataset
###############################################################################
#
# We will use the `Ames Housing`_ dataset, which was first compiled by Dean De
# Cock and became better known after it was used in a Kaggle challenge. It is
# a set of 1460 residential homes in Ames, Iowa, each described by 80
# features. We will use it to predict the final logarithmic price of the
# houses. In this example we will use only the 20 most interesting features,
# chosen using GradientBoostingRegressor(), and limit the number of entries
# (here we won't go into the details on how to select the most interesting
# features).
#
# The Ames housing dataset is not shipped with scikit-learn and therefore we
# will fetch it from `OpenML`_.
#
# .. _`Ames Housing`: http://jse.amstat.org/v19n3/decock.pdf
# .. _`OpenML`: https://www.openml.org/d/42165

import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle


def load_ames_housing():
    df = fetch_openml(name="house_prices", as_frame=True)
    X = df.data
    y = df.target

    features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating',
                'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea',
                'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars',
                'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1',
                'HouseStyle', 'MiscFeature', 'MoSold']

    X = X[features]
    X, y = shuffle(X, y, random_state=0)

    X = X[:600]
    y = y[:600]
    return X, np.log(y)


X, y = load_ames_housing()


###############################################################################
# Make pipeline to preprocess the data
###############################################################################
#
# Before we can use the Ames dataset we still need to do some preprocessing.
# First, the dataset has many missing values. To impute them, we will replace
# categorical missing values with the new category 'missing' and numerical
# missing values with the 'mean' of the column. We will also encode the
# categories with either :class:`sklearn.preprocessing.OneHotEncoder
# <sklearn.preprocessing.OneHotEncoder>` or
# :class:`sklearn.preprocessing.OrdinalEncoder
# <sklearn.preprocessing.OrdinalEncoder>` depending on which type of model we
# will use them with (linear or non-linear model). To facilitate this
# preprocessing we will make two pipelines.
# You can skip this section if your data is ready to use and does
# not need preprocessing.


from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


cat_cols = X.columns[X.dtypes == 'O']
num_cols = X.columns[X.dtypes == 'float64']

categories = [
    X[column].unique() for column in X[cat_cols]]

for cat in categories:
    cat[cat == None] = 'missing'  # noqa

cat_proc_nlin = make_pipeline(
    SimpleImputer(missing_values=None, strategy='constant',
                  fill_value='missing'),
    OrdinalEncoder(categories=categories)
)

num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))

cat_proc_lin = make_pipeline(
    SimpleImputer(missing_values=None,
                  strategy='constant',
                  fill_value='missing'),
    OneHotEncoder(categories=categories)
)

num_proc_lin = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# transformation to use for non-linear estimators
processor_nlin = make_column_transformer(
    (cat_proc_nlin, cat_cols),
    (num_proc_nlin, num_cols),
    remainder='passthrough')

# transformation to use for linear estimators
processor_lin = make_column_transformer(
    (cat_proc_lin, cat_cols),
    (num_proc_lin, num_cols),
    remainder='passthrough')


###############################################################################
# Stack of predictors on a single data set
###############################################################################
#
# It is sometimes tedious to find the model which will perform best on a given
# dataset. Stacking provides an alternative by combining the outputs of several
# learners, without the need to choose a model specifically. The performance of
# stacking is usually close to the best model and sometimes it can outperform
# the prediction performance of each individual model.
#
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs together.
#
# Note: although we will make new pipelines with the processors which we wrote
# in the previous section for the 3 learners, the final estimator RidgeCV()
# does not need preprocessing of the data as it will be fed with the already
# preprocessed output from the 3 learners.


from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV


lasso_pipeline = make_pipeline(processor_lin,
                               LassoCV())

rf_pipeline = make_pipeline(processor_nlin,
                            RandomForestRegressor(random_state=42))

gradient_pipeline = make_pipeline(
    processor_nlin,
    HistGradientBoostingRegressor(random_state=0))

estimators = [('Random Forest', rf_pipeline),
              ('Lasso', lasso_pipeline),
              ('Gradient Boosting', gradient_pipeline)]

stacking_regressor = StackingRegressor(estimators=estimators,
                                       final_estimator=RidgeCV())


###############################################################################
# Measure and plot the results
###############################################################################
#
# Now we can use the Ames Housing dataset to make the predictions. We check
# the performance of each individual predictor as well as of the stack of the
# regressors.
#
# The function ``plot_regression_results`` is used to plot the predicted and
# true targets.


import time
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_predict


def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Scatter plot of the predicted vs true targets."""
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()],
            '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([y_true.min(), y_true.max()])
    ax.set_ylim([y_true.min(), y_true.max()])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
    ax.legend([extra], [scores], loc='upper left')
    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
    ax.set_title(title)


fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X, y,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)

    plot_regression_results(
        ax, y, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()

###############################################################################
# The stacked regressor will combine the strengths of the different regressors.
# However, we also see that training the stacked regressor is much more
# computationally expensive.
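For readers who want to try the stacking pattern from the example above without the OpenML download, here is a minimal, self-contained sketch. It is not part of the committed files; the base-learner choices and the variable names (demo_stack, X_demo, y_demo) are illustrative assumptions only, reusing the scikit-learn built-in diabetes dataset.

# Hedged sketch: a StackingRegressor with two base learners and a RidgeCV blender,
# evaluated with cross-validated R^2 on the built-in diabetes data.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import cross_val_score

X_demo, y_demo = load_diabetes(return_X_y=True)

demo_stack = StackingRegressor(
    estimators=[('rf', RandomForestRegressor(random_state=42)),
                ('lasso', LassoCV())],
    final_estimator=RidgeCV())

# The stack's score is typically close to, and sometimes above, its best base learner.
print(cross_val_score(demo_stack, X_demo, y_demo, scoring='r2').mean())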
