
Commit c4feed2

Commit message: Pushing the docs to dev/ for branch: master, commit 2d232acdeb66f1f983d389115de83c33e23b1276

1 parent dc37a0b, commit c4feed2

1,092 files changed: +4085 additions, -3934 deletions

1.36 KB: Binary file not shown.
1.28 KB: Binary file not shown.

dev/_downloads/plot_all_scaling.ipynb

Lines changed: 6 additions & 6 deletions
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Raghav RV <[email protected]>\n# Guillaume Lemaitre <[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\nprint(__doc__)\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\n\n# Take only 2 features to make visualization easier\n# Feature 0 has a long tail distribution.\n# Feature 5 has a few but very large outliers.\n\nX = X_full[:, [0, 5]]\n\ndistributions = [\n ('Unscaled data', X),\n ('Data after standard scaling',\n StandardScaler().fit_transform(X)),\n ('Data after min-max scaling',\n MinMaxScaler().fit_transform(X)),\n ('Data after max-abs scaling',\n MaxAbsScaler().fit_transform(X)),\n ('Data after robust scaling',\n RobustScaler(quantile_range=(25, 75)).fit_transform(X)),\n ('Data after power transformation (Box-Cox)',\n PowerTransformer(method='box-cox').fit_transform(X)),\n ('Data after quantile transformation (gaussian pdf)',\n QuantileTransformer(output_distribution='normal')\n .fit_transform(X)),\n ('Data after quantile transformation (uniform pdf)',\n QuantileTransformer(output_distribution='uniform')\n .fit_transform(X)),\n ('Data after sample-wise L2 normalizing',\n Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, 'plasma_r', cm.hot_r)\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return ((ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar)\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\",\n x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines['left'].set_position(('outward', 10))\n ax.spines['bottom'].set_position(('outward', 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',\n color='grey', ec='grey')\n hist_X1.axis('off')\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',\n color='grey', ec='grey')\n hist_X0.axis('off')"
+"# Author: Raghav RV <[email protected]>\n# Guillaume Lemaitre <[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\nprint(__doc__)\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\n\n# Take only 2 features to make visualization easier\n# Feature 0 has a long tail distribution.\n# Feature 5 has a few but very large outliers.\n\nX = X_full[:, [0, 5]]\n\ndistributions = [\n ('Unscaled data', X),\n ('Data after standard scaling',\n StandardScaler().fit_transform(X)),\n ('Data after min-max scaling',\n MinMaxScaler().fit_transform(X)),\n ('Data after max-abs scaling',\n MaxAbsScaler().fit_transform(X)),\n ('Data after robust scaling',\n RobustScaler(quantile_range=(25, 75)).fit_transform(X)),\n ('Data after power transformation (Yeo-Johnson)',\n PowerTransformer(method='yeo-johnson').fit_transform(X)),\n ('Data after power transformation (Box-Cox)',\n PowerTransformer(method='box-cox').fit_transform(X)),\n ('Data after quantile transformation (gaussian pdf)',\n QuantileTransformer(output_distribution='normal')\n .fit_transform(X)),\n ('Data after quantile transformation (uniform pdf)',\n QuantileTransformer(output_distribution='uniform')\n .fit_transform(X)),\n ('Data after sample-wise L2 normalizing',\n Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, 'plasma_r', cm.hot_r)\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return ((ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar)\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\",\n x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines['left'].set_position(('outward', 10))\n ax.spines['bottom'].set_position(('outward', 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',\n color='grey', ec='grey')\n hist_X1.axis('off')\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',\n color='grey', ec='grey')\n hist_X0.axis('off')"
@@ -141,7 +141,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"PowerTransformer (Box-Cox)\n--------------------------\n\n``PowerTransformer`` applies a power transformation to each\nfeature to make the data more Gaussian-like. Currently,\n``PowerTransformer`` implements the Box-Cox transform. The Box-Cox transform\nfinds the optimal scaling factor to stabilize variance and minimize skewness\nthrough maximum likelihood estimation. By default, ``PowerTransformer`` also\napplies zero-mean, unit variance normalization to the transformed output.\nNote that Box-Cox can only be applied to positive, non-zero data. Income and\nnumber of households happen to be strictly positive, but if negative values\nare present, a constant can be added to each feature to shift it into the\npositive range - this is known as the two-parameter Box-Cox transform.\n\n"
+"PowerTransformer\n----------------\n\n``PowerTransformer`` applies a power transformation to each feature to make\nthe data more Gaussian-like. Currently, ``PowerTransformer`` implements the\nYeo-Johnson and Box-Cox transforms. The power transform finds the optimal\nscaling factor to stabilize variance and minimize skewness through maximum\nlikelihood estimation. By default, ``PowerTransformer`` also applies\nzero-mean, unit variance normalization to the transformed output. Note that\nBox-Cox can only be applied to strictly positive data. Income and number of\nhouseholds happen to be strictly positive, but if negative values are present\nthe Yeo-Johnson transform is to be preferred.\n\n"
 ]
 },
 {
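The markdown cell above describes the two methods now exposed by ``PowerTransformer``. As a minimal sketch of that API (not part of this commit; it assumes scikit-learn >= 0.20, where the 'yeo-johnson' option was introduced):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    rng = np.random.RandomState(0)
    X_positive = rng.lognormal(size=(100, 1))  # strictly positive: Box-Cox applies
    X_signed = rng.normal(size=(100, 1))       # contains negatives: use Yeo-Johnson

    bc = PowerTransformer(method='box-cox').fit(X_positive)
    yj = PowerTransformer(method='yeo-johnson').fit(X_signed)

    # Both estimate one lambda per feature by maximum likelihood
    print(bc.lambdas_, yj.lambdas_)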
@@ -152,7 +152,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(5)"
+"make_plot(5)\nmake_plot(6)"
 ]
 },
 {
@@ -170,7 +170,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(6)"
+"make_plot(7)"
 ]
 },
 {
@@ -188,7 +188,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(7)"
+"make_plot(8)"
 ]
 },
 {
@@ -206,7 +206,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(8)\n\nplt.show()"
+"make_plot(9)\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_all_scaling.py

Lines changed: 17 additions & 15 deletions
@@ -87,6 +87,8 @@
      MaxAbsScaler().fit_transform(X)),
     ('Data after robust scaling',
      RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
+    ('Data after power transformation (Yeo-Johnson)',
+     PowerTransformer(method='yeo-johnson').fit_transform(X)),
     ('Data after power transformation (Box-Cox)',
      PowerTransformer(method='box-cox').fit_transform(X)),
     ('Data after quantile transformation (gaussian pdf)',
@@ -294,21 +296,21 @@ def make_plot(item_idx):
 make_plot(4)
 
 ##############################################################################
-# PowerTransformer (Box-Cox)
-# --------------------------
+# PowerTransformer
+# ----------------
 #
-# ``PowerTransformer`` applies a power transformation to each
-# feature to make the data more Gaussian-like. Currently,
-# ``PowerTransformer`` implements the Box-Cox transform. The Box-Cox transform
-# finds the optimal scaling factor to stabilize variance and minimize skewness
-# through maximum likelihood estimation. By default, ``PowerTransformer`` also
-# applies zero-mean, unit variance normalization to the transformed output.
-# Note that Box-Cox can only be applied to positive, non-zero data. Income and
-# number of households happen to be strictly positive, but if negative values
-# are present, a constant can be added to each feature to shift it into the
-# positive range - this is known as the two-parameter Box-Cox transform.
+# ``PowerTransformer`` applies a power transformation to each feature to make
+# the data more Gaussian-like. Currently, ``PowerTransformer`` implements the
+# Yeo-Johnson and Box-Cox transforms. The power transform finds the optimal
+# scaling factor to stabilize variance and minimize skewness through maximum
+# likelihood estimation. By default, ``PowerTransformer`` also applies
+# zero-mean, unit variance normalization to the transformed output. Note that
+# Box-Cox can only be applied to strictly positive data. Income and number of
+# households happen to be strictly positive, but if negative values are present
+# the Yeo-Johnson transform is to be preferred.
 
 make_plot(5)
+make_plot(6)
 
 ##############################################################################
 # QuantileTransformer (Gaussian output)
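To make the positivity constraint above concrete, a short sketch (not from the commit) shows Box-Cox rejecting non-positive input while Yeo-Johnson accepts it:

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    X = np.array([[-1.0], [0.5], [2.0]])  # contains a negative value

    try:
        PowerTransformer(method='box-cox').fit(X)
    except ValueError as exc:
        print(exc)  # Box-Cox requires strictly positive data

    # Yeo-Johnson handles zero and negative values directly
    X_trans = PowerTransformer(method='yeo-johnson').fit_transform(X)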
@@ -319,7 +321,7 @@ def make_plot(item_idx):
 # Note that this non-parametric transformer introduces saturation artifacts
 # for extreme values.
 
-make_plot(6)
+make_plot(7)
 
 ###################################################################
 # QuantileTransformer (uniform output)
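The saturation artifact mentioned above can be observed directly: values far outside the training range are clipped to the extreme empirical quantiles rather than extrapolated. A minimal sketch (not part of the commit, default parameters assumed):

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    qt = QuantileTransformer(output_distribution='normal', random_state=0)
    qt.fit(rng.normal(size=(1000, 1)))

    # Extreme inputs saturate near the largest/smallest mapped quantile
    print(qt.transform(np.array([[10.0], [-10.0]])))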
@@ -337,7 +339,7 @@ def make_plot(item_idx):
 # any outlier by setting them to the a priori defined range boundaries (0 and
 # 1).
 
-make_plot(7)
+make_plot(8)
 
 ##############################################################################
 # Normalizer
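Likewise, the collapse of outliers onto the output boundaries can be checked with a short sketch (not part of the commit):

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    X = rng.lognormal(size=(1000, 1))  # heavy right tail with large outliers

    qt = QuantileTransformer(output_distribution='uniform', random_state=0)
    X_uniform = qt.fit_transform(X)

    # Every value, outliers included, lands inside [0, 1]
    print(X_uniform.min(), X_uniform.max())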
@@ -350,6 +352,6 @@ def make_plot(item_idx):
 # transformed data only lie in the positive quadrant. This would not be the
 # case if some original features had a mix of positive and negative values.
 
-make_plot(8)
+make_plot(9)
 
 plt.show()
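Since ``Normalizer`` works per sample rather than per feature, a two-row sketch (not from the commit) makes the difference visible:

    import numpy as np
    from sklearn.preprocessing import Normalizer

    X = np.array([[3.0, 4.0],
                  [1.0, 1.0]])

    # Each row (sample) is rescaled to unit L2 norm; columns are untouched
    print(Normalizer(norm='l2').fit_transform(X))
    # [[0.6        0.8       ]
    #  [0.70710678 0.70710678]]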
dev/_downloads/plot_map_data_to_normal.ipynb (new file; name inferred from the notebook's title)

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n# Map data to a normal distribution\n\n\nThis example demonstrates the use of the Box-Cox and Yeo-Johnson transforms\nthrough :class:`preprocessing.PowerTransformer` to map data from various\ndistributions to a normal distribution.\n\nThe power transform is useful as a transformation in modeling problems where\nhomoscedasticity and normality are desired. Below are examples of Box-Cox and\nYeo-Johnson applied to six different probability distributions: Lognormal,\nChi-squared, Weibull, Gaussian, Uniform, and Bimodal.\n\nNote that the transformations successfully map the data to a normal\ndistribution when applied to certain datasets, but are ineffective with others.\nThis highlights the importance of visualizing the data before and after\ntransformation.\n\nAlso note that even though Box-Cox seems to perform better than Yeo-Johnson for\nlognormal and chi-squared distributions, keep in mind that Box-Cox does not\nsupport inputs with negative values.\n\nFor comparison, we also add the output from\n:class:`preprocessing.QuantileTransformer`. It can force any arbitrary\ndistribution into a gaussian, provided that there are enough training samples\n(thousands). Because it is a non-parametric method, it is harder to interpret\nthan the parametric ones (Box-Cox and Yeo-Johnson).\n\nOn \"small\" datasets (less than a few hundred points), the quantile transformer\nis prone to overfitting. The use of the power transform is then recommended.\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Author: Eric Chang <[email protected]>\n# Nicolas Hug <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.preprocessing import PowerTransformer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.model_selection import train_test_split\n\nprint(__doc__)\n\n\nN_SAMPLES = 1000\nFONT_SIZE = 6\nBINS = 30\n\n\nrng = np.random.RandomState(304)\nbc = PowerTransformer(method='box-cox')\nyj = PowerTransformer(method='yeo-johnson')\nqt = QuantileTransformer(output_distribution='normal', random_state=rng)\nsize = (N_SAMPLES, 1)\n\n\n# lognormal distribution\nX_lognormal = rng.lognormal(size=size)\n\n# chi-squared distribution\ndf = 3\nX_chisq = rng.chisquare(df=df, size=size)\n\n# weibull distribution\na = 50\nX_weibull = rng.weibull(a=a, size=size)\n\n# gaussian distribution\nloc = 100\nX_gaussian = rng.normal(loc=loc, size=size)\n\n# uniform distribution\nX_uniform = rng.uniform(low=0, high=1, size=size)\n\n# bimodal distribution\nloc_a, loc_b = 100, 105\nX_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size)\nX_bimodal = np.concatenate([X_a, X_b], axis=0)\n\n\n# create plots\ndistributions = [\n ('Lognormal', X_lognormal),\n ('Chi-squared', X_chisq),\n ('Weibull', X_weibull),\n ('Gaussian', X_gaussian),\n ('Uniform', X_uniform),\n ('Bimodal', X_bimodal)\n]\n\ncolors = ['firebrick', 'darkorange', 'goldenrod',\n 'seagreen', 'royalblue', 'darkorchid']\n\nfig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2))\naxes = axes.flatten()\naxes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),\n (13, 16, 19, 22), (14, 17, 20, 23)]\naxes_list = [(axes[i], axes[j], axes[k], axes[l])\n for (i, j, k, l) in axes_idxs]\n\n\nfor distribution, color, axes in zip(distributions, colors, axes_list):\n name, X = distribution\n X_train, X_test = train_test_split(X, test_size=.5)\n\n # perform power transforms and quantile transform\n X_trans_bc = bc.fit(X_train).transform(X_test)\n lmbda_bc = round(bc.lambdas_[0], 2)\n X_trans_yj = yj.fit(X_train).transform(X_test)\n lmbda_yj = round(yj.lambdas_[0], 2)\n X_trans_qt = qt.fit(X_train).transform(X_test)\n\n ax_original, ax_bc, ax_yj, ax_qt = axes\n\n ax_original.hist(X_train, color=color, bins=BINS)\n ax_original.set_title(name, fontsize=FONT_SIZE)\n ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)\n\n for ax, X_trans, meth_name, lmbda in zip(\n (ax_bc, ax_yj, ax_qt),\n (X_trans_bc, X_trans_yj, X_trans_qt),\n ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),\n (lmbda_bc, lmbda_yj, None)):\n ax.hist(X_trans, color=color, bins=BINS)\n title = 'After {}'.format(meth_name)\n if lmbda is not None:\n title += '\\n$\\lambda$ = {}'.format(lmbda)\n ax.set_title(title, fontsize=FONT_SIZE)\n ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)\n ax.set_xlim([-3.5, 3.5])\n\n\nplt.tight_layout()\nplt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
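The new notebook's closing remark about small datasets can be illustrated with a short sketch (not part of the commit; n_quantiles is capped at the sample count here to keep the quantile transform well defined):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer, QuantileTransformer

    rng = np.random.RandomState(304)
    X_small = rng.lognormal(size=(50, 1))  # "small" dataset

    # Parametric: estimates a single lambda, robust with few samples
    X_pt = PowerTransformer(method='yeo-johnson').fit_transform(X_small)

    # Non-parametric: memorizes empirical quantiles, prone to overfitting here
    qt = QuantileTransformer(output_distribution='normal', n_quantiles=50,
                             random_state=0)
    X_qt = qt.fit_transform(X_small)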
