
Commit c4feed2

Commit message: Pushing the docs to dev/ for branch: master, commit 2d232acdeb66f1f983d389115de83c33e23b1276

1 parent dc37a0b, commit c4feed2

1,092 files changed: +4085 additions, -3934 deletions

1.36 KB: Binary file not shown.
1.28 KB: Binary file not shown.

dev/_downloads/plot_all_scaling.ipynb

Lines changed: 6 additions & 6 deletions
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Raghav RV <[email protected]>\n# Guillaume Lemaitre <[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\nprint(__doc__)\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\n\n# Take only 2 features to make visualization easier\n# Feature 0 has a long tail distribution.\n# Feature 5 has a few but very large outliers.\n\nX = X_full[:, [0, 5]]\n\ndistributions = [\n ('Unscaled data', X),\n ('Data after standard scaling',\n StandardScaler().fit_transform(X)),\n ('Data after min-max scaling',\n MinMaxScaler().fit_transform(X)),\n ('Data after max-abs scaling',\n MaxAbsScaler().fit_transform(X)),\n ('Data after robust scaling',\n RobustScaler(quantile_range=(25, 75)).fit_transform(X)),\n ('Data after power transformation (Box-Cox)',\n PowerTransformer(method='box-cox').fit_transform(X)),\n ('Data after quantile transformation (gaussian pdf)',\n QuantileTransformer(output_distribution='normal')\n .fit_transform(X)),\n ('Data after quantile transformation (uniform pdf)',\n QuantileTransformer(output_distribution='uniform')\n .fit_transform(X)),\n ('Data after sample-wise L2 normalizing',\n Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, 'plasma_r', cm.hot_r)\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return ((ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar)\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\",\n x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines['left'].set_position(('outward', 10))\n ax.spines['bottom'].set_position(('outward', 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',\n color='grey', ec='grey')\n hist_X1.axis('off')\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',\n color='grey', ec='grey')\n hist_X0.axis('off')"
+"# Author: Raghav RV <[email protected]>\n# Guillaume Lemaitre <[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\nprint(__doc__)\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\n\n# Take only 2 features to make visualization easier\n# Feature 0 has a long tail distribution.\n# Feature 5 has a few but very large outliers.\n\nX = X_full[:, [0, 5]]\n\ndistributions = [\n ('Unscaled data', X),\n ('Data after standard scaling',\n StandardScaler().fit_transform(X)),\n ('Data after min-max scaling',\n MinMaxScaler().fit_transform(X)),\n ('Data after max-abs scaling',\n MaxAbsScaler().fit_transform(X)),\n ('Data after robust scaling',\n RobustScaler(quantile_range=(25, 75)).fit_transform(X)),\n ('Data after power transformation (Yeo-Johnson)',\n PowerTransformer(method='yeo-johnson').fit_transform(X)),\n ('Data after power transformation (Box-Cox)',\n PowerTransformer(method='box-cox').fit_transform(X)),\n ('Data after quantile transformation (gaussian pdf)',\n QuantileTransformer(output_distribution='normal')\n .fit_transform(X)),\n ('Data after quantile transformation (uniform pdf)',\n QuantileTransformer(output_distribution='uniform')\n .fit_transform(X)),\n ('Data after sample-wise L2 normalizing',\n Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, 'plasma_r', cm.hot_r)\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return ((ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar)\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\",\n x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines['left'].set_position(('outward', 10))\n ax.spines['bottom'].set_position(('outward', 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',\n color='grey', ec='grey')\n hist_X1.axis('off')\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',\n color='grey', ec='grey')\n hist_X0.axis('off')"
@@ -141,7 +141,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"PowerTransformer (Box-Cox)\n--------------------------\n\n``PowerTransformer`` applies a power transformation to each\nfeature to make the data more Gaussian-like. Currently,\n``PowerTransformer`` implements the Box-Cox transform. The Box-Cox transform\nfinds the optimal scaling factor to stabilize variance and minimize skewness\nthrough maximum likelihood estimation. By default, ``PowerTransformer`` also\napplies zero-mean, unit variance normalization to the transformed output.\nNote that Box-Cox can only be applied to positive, non-zero data. Income and\nnumber of households happen to be strictly positive, but if negative values\nare present, a constant can be added to each feature to shift it into the\npositive range - this is known as the two-parameter Box-Cox transform.\n\n"
+"PowerTransformer\n----------------\n\n``PowerTransformer`` applies a power transformation to each feature to make\nthe data more Gaussian-like. Currently, ``PowerTransformer`` implements the\nYeo-Johnson and Box-Cox transforms. The power transform finds the optimal\nscaling factor to stabilize variance and minimize skewness through maximum\nlikelihood estimation. By default, ``PowerTransformer`` also applies\nzero-mean, unit variance normalization to the transformed output. Note that\nBox-Cox can only be applied to strictly positive data. Income and number of\nhouseholds happen to be strictly positive, but if negative values are present\nthe Yeo-Johnson transform is to be preferred.\n\n"
 ]
 },
 {
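The markdown cell above describes the two methods now exposed by ``PowerTransformer``. As a minimal sketch of that API (not part of this commit; it assumes scikit-learn >= 0.20, where the 'yeo-johnson' option was introduced):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    rng = np.random.RandomState(0)
    X_positive = rng.lognormal(size=(100, 1))  # strictly positive: Box-Cox applies
    X_signed = rng.normal(size=(100, 1))       # contains negatives: use Yeo-Johnson

    bc = PowerTransformer(method='box-cox').fit(X_positive)
    yj = PowerTransformer(method='yeo-johnson').fit(X_signed)

    # Both estimate one lambda per feature by maximum likelihood
    print(bc.lambdas_, yj.lambdas_)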
@@ -152,7 +152,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(5)"
+"make_plot(5)\nmake_plot(6)"
 ]
 },
 {
@@ -170,7 +170,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(6)"
+"make_plot(7)"
 ]
 },
 {
@@ -188,7 +188,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(7)"
+"make_plot(8)"
 ]
 },
 {
@@ -206,7 +206,7 @@
 },
 "outputs": [],
 "source": [
-"make_plot(8)\n\nplt.show()"
+"make_plot(9)\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_all_scaling.py

Lines changed: 17 additions & 15 deletions
@@ -87,6 +87,8 @@
      MaxAbsScaler().fit_transform(X)),
     ('Data after robust scaling',
      RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
+    ('Data after power transformation (Yeo-Johnson)',
+     PowerTransformer(method='yeo-johnson').fit_transform(X)),
     ('Data after power transformation (Box-Cox)',
      PowerTransformer(method='box-cox').fit_transform(X)),
     ('Data after quantile transformation (gaussian pdf)',
@@ -294,21 +296,21 @@ def make_plot(item_idx):
 make_plot(4)
 
 ##############################################################################
-# PowerTransformer (Box-Cox)
-# --------------------------
+# PowerTransformer
+# ----------------
 #
-# ``PowerTransformer`` applies a power transformation to each
-# feature to make the data more Gaussian-like. Currently,
-# ``PowerTransformer`` implements the Box-Cox transform. The Box-Cox transform
-# finds the optimal scaling factor to stabilize variance and minimize skewness
-# through maximum likelihood estimation. By default, ``PowerTransformer`` also
-# applies zero-mean, unit variance normalization to the transformed output.
-# Note that Box-Cox can only be applied to positive, non-zero data. Income and
-# number of households happen to be strictly positive, but if negative values
-# are present, a constant can be added to each feature to shift it into the
-# positive range - this is known as the two-parameter Box-Cox transform.
+# ``PowerTransformer`` applies a power transformation to each feature to make
+# the data more Gaussian-like. Currently, ``PowerTransformer`` implements the
+# Yeo-Johnson and Box-Cox transforms. The power transform finds the optimal
+# scaling factor to stabilize variance and minimize skewness through maximum
+# likelihood estimation. By default, ``PowerTransformer`` also applies
+# zero-mean, unit variance normalization to the transformed output. Note that
+# Box-Cox can only be applied to strictly positive data. Income and number of
+# households happen to be strictly positive, but if negative values are present
+# the Yeo-Johnson transform is to be preferred.
 
 make_plot(5)
+make_plot(6)
 
 ##############################################################################
 # QuantileTransformer (Gaussian output)
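To make the positivity constraint above concrete, a short sketch (not from the commit) shows Box-Cox rejecting non-positive input while Yeo-Johnson accepts it:

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    X = np.array([[-1.0], [0.5], [2.0]])  # contains a negative value

    try:
        PowerTransformer(method='box-cox').fit(X)
    except ValueError as exc:
        print(exc)  # Box-Cox requires strictly positive data

    # Yeo-Johnson handles zero and negative values directly
    X_trans = PowerTransformer(method='yeo-johnson').fit_transform(X)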
@@ -319,7 +321,7 @@ def make_plot(item_idx):
 # Note that this non-parametric transformer introduces saturation artifacts
 # for extreme values.
 
-make_plot(6)
+make_plot(7)
 
 ###################################################################
 # QuantileTransformer (uniform output)
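The saturation artifact mentioned above can be observed directly: values far outside the training range are clipped to the extreme empirical quantiles rather than extrapolated. A minimal sketch (not part of the commit, default parameters assumed):

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    qt = QuantileTransformer(output_distribution='normal', random_state=0)
    qt.fit(rng.normal(size=(1000, 1)))

    # Extreme inputs saturate near the largest/smallest mapped quantile
    print(qt.transform(np.array([[10.0], [-10.0]])))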
@@ -337,7 +339,7 @@ def make_plot(item_idx):
 # any outlier by setting them to the a priori defined range boundaries (0 and
 # 1).
 
-make_plot(7)
+make_plot(8)
 
 ##############################################################################
 # Normalizer
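Likewise, the collapse of outliers onto the output boundaries can be checked with a short sketch (not part of the commit):

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    X = rng.lognormal(size=(1000, 1))  # heavy right tail with large outliers

    qt = QuantileTransformer(output_distribution='uniform', random_state=0)
    X_uniform = qt.fit_transform(X)

    # Every value, outliers included, lands inside [0, 1]
    print(X_uniform.min(), X_uniform.max())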
@@ -350,6 +352,6 @@ def make_plot(item_idx):
 # transformed data only lie in the positive quadrant. This would not be the
 # case if some original features had a mix of positive and negative values.
 
-make_plot(8)
+make_plot(9)
 
 plt.show()
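Since ``Normalizer`` works per sample rather than per feature, a two-row sketch (not from the commit) makes the difference visible:

    import numpy as np
    from sklearn.preprocessing import Normalizer

    X = np.array([[3.0, 4.0],
                  [1.0, 1.0]])

    # Each row (sample) is rescaled to unit L2 norm; columns are untouched
    print(Normalizer(norm='l2').fit_transform(X))
    # [[0.6        0.8       ]
    #  [0.70710678 0.70710678]]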
dev/_downloads/plot_map_data_to_normal.ipynb (new file; name inferred from the notebook's title)

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n# Map data to a normal distribution\n\n\nThis example demonstrates the use of the Box-Cox and Yeo-Johnson transforms\nthrough :class:`preprocessing.PowerTransformer` to map data from various\ndistributions to a normal distribution.\n\nThe power transform is useful as a transformation in modeling problems where\nhomoscedasticity and normality are desired. Below are examples of Box-Cox and\nYeo-Johnson applied to six different probability distributions: Lognormal,\nChi-squared, Weibull, Gaussian, Uniform, and Bimodal.\n\nNote that the transformations successfully map the data to a normal\ndistribution when applied to certain datasets, but are ineffective with others.\nThis highlights the importance of visualizing the data before and after\ntransformation.\n\nAlso note that even though Box-Cox seems to perform better than Yeo-Johnson for\nlognormal and chi-squared distributions, keep in mind that Box-Cox does not\nsupport inputs with negative values.\n\nFor comparison, we also add the output from\n:class:`preprocessing.QuantileTransformer`. It can force any arbitrary\ndistribution into a gaussian, provided that there are enough training samples\n(thousands). Because it is a non-parametric method, it is harder to interpret\nthan the parametric ones (Box-Cox and Yeo-Johnson).\n\nOn \"small\" datasets (less than a few hundred points), the quantile transformer\nis prone to overfitting. The use of the power transform is then recommended.\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Author: Eric Chang <[email protected]>\n# Nicolas Hug <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.preprocessing import PowerTransformer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.model_selection import train_test_split\n\nprint(__doc__)\n\n\nN_SAMPLES = 1000\nFONT_SIZE = 6\nBINS = 30\n\n\nrng = np.random.RandomState(304)\nbc = PowerTransformer(method='box-cox')\nyj = PowerTransformer(method='yeo-johnson')\nqt = QuantileTransformer(output_distribution='normal', random_state=rng)\nsize = (N_SAMPLES, 1)\n\n\n# lognormal distribution\nX_lognormal = rng.lognormal(size=size)\n\n# chi-squared distribution\ndf = 3\nX_chisq = rng.chisquare(df=df, size=size)\n\n# weibull distribution\na = 50\nX_weibull = rng.weibull(a=a, size=size)\n\n# gaussian distribution\nloc = 100\nX_gaussian = rng.normal(loc=loc, size=size)\n\n# uniform distribution\nX_uniform = rng.uniform(low=0, high=1, size=size)\n\n# bimodal distribution\nloc_a, loc_b = 100, 105\nX_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size)\nX_bimodal = np.concatenate([X_a, X_b], axis=0)\n\n\n# create plots\ndistributions = [\n ('Lognormal', X_lognormal),\n ('Chi-squared', X_chisq),\n ('Weibull', X_weibull),\n ('Gaussian', X_gaussian),\n ('Uniform', X_uniform),\n ('Bimodal', X_bimodal)\n]\n\ncolors = ['firebrick', 'darkorange', 'goldenrod',\n 'seagreen', 'royalblue', 'darkorchid']\n\nfig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2))\naxes = axes.flatten()\naxes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),\n (13, 16, 19, 22), (14, 17, 20, 23)]\naxes_list = [(axes[i], axes[j], axes[k], axes[l])\n for (i, j, k, l) in axes_idxs]\n\n\nfor distribution, color, axes in zip(distributions, colors, axes_list):\n name, X = distribution\n X_train, X_test = train_test_split(X, test_size=.5)\n\n # perform power transforms and quantile transform\n X_trans_bc = bc.fit(X_train).transform(X_test)\n lmbda_bc = round(bc.lambdas_[0], 2)\n X_trans_yj = yj.fit(X_train).transform(X_test)\n lmbda_yj = round(yj.lambdas_[0], 2)\n X_trans_qt = qt.fit(X_train).transform(X_test)\n\n ax_original, ax_bc, ax_yj, ax_qt = axes\n\n ax_original.hist(X_train, color=color, bins=BINS)\n ax_original.set_title(name, fontsize=FONT_SIZE)\n ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)\n\n for ax, X_trans, meth_name, lmbda in zip(\n (ax_bc, ax_yj, ax_qt),\n (X_trans_bc, X_trans_yj, X_trans_qt),\n ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),\n (lmbda_bc, lmbda_yj, None)):\n ax.hist(X_trans, color=color, bins=BINS)\n title = 'After {}'.format(meth_name)\n if lmbda is not None:\n title += '\\n$\\lambda$ = {}'.format(lmbda)\n ax.set_title(title, fontsize=FONT_SIZE)\n ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)\n ax.set_xlim([-3.5, 3.5])\n\n\nplt.tight_layout()\nplt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
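The new notebook's closing remark about small datasets can be illustrated with a short sketch (not part of the commit; n_quantiles is capped at the sample count here to keep the quantile transform well defined):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer, QuantileTransformer

    rng = np.random.RandomState(304)
    X_small = rng.lognormal(size=(50, 1))  # "small" dataset

    # Parametric: estimates a single lambda, robust with few samples
    X_pt = PowerTransformer(method='yeo-johnson').fit_transform(X_small)

    # Non-parametric: memorizes empirical quantiles, prone to overfitting here
    qt = QuantileTransformer(output_distribution='normal', n_quantiles=50,
                             random_state=0)
    X_qt = qt.fit_transform(X_small)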
