"# Author: Raghav RV <
[email protected]>\n# Guillaume Lemaitre <
[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\nfeature_names = dataset.feature_names\n\nfeature_mapping = {\n \"MedInc\": \"Median income in block\",\n \"HouseAge\": \"Median house age in block\",\n \"AveRooms\": \"Average number of rooms\",\n \"AveBedrms\": \"Average number of bedrooms\",\n \"Population\": \"Block population\",\n \"AveOccup\": \"Average house occupancy\",\n \"Latitude\": \"House block latitude\",\n \"Longitude\": \"House block longitude\",\n}\n\n# Take only 2 features to make visualization easier\n# Feature MedInc has a long tail distribution.\n# Feature AveOccup has a few but very large outliers.\nfeatures = [\"MedInc\", \"AveOccup\"]\nfeatures_idx = [feature_names.index(feature) for feature in features]\nX = X_full[:, features_idx]\ndistributions = [\n (\"Unscaled data\", X),\n (\"Data after standard scaling\", StandardScaler().fit_transform(X)),\n (\"Data after min-max scaling\", MinMaxScaler().fit_transform(X)),\n (\"Data after max-abs scaling\", MaxAbsScaler().fit_transform(X)),\n (\n \"Data after robust scaling\",\n RobustScaler(quantile_range=(25, 75)).fit_transform(X),\n ),\n (\n \"Data after power transformation (Yeo-Johnson)\",\n PowerTransformer(method=\"yeo-johnson\").fit_transform(X),\n ),\n (\n \"Data after power transformation (Box-Cox)\",\n PowerTransformer(method=\"box-cox\").fit_transform(X),\n ),\n (\n \"Data after quantile transformation (uniform pdf)\",\n QuantileTransformer(output_distribution=\"uniform\").fit_transform(X),\n ),\n (\n \"Data after quantile transformation (gaussian pdf)\",\n QuantileTransformer(output_distribution=\"normal\").fit_transform(X),\n ),\n (\"Data after sample-wise L2 normalizing\", Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, \"plasma_r\", cm.hot_r)\n\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return (\n (ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar,\n )\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\", x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker=\"o\", s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines[\"left\"].set_position((\"outward\", 10))\n ax.spines[\"bottom\"].set_position((\"outward\", 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(\n X[:, 1], bins=hist_nbins, orientation=\"horizontal\", color=\"grey\", ec=\"grey\"\n )\n hist_X1.axis(\"off\")\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(\n X[:, 0], bins=hist_nbins, orientation=\"vertical\", color=\"grey\", ec=\"grey\"\n )\n hist_X0.axis(\"off\")"
0 commit comments