"""
Effect of feature scaling on PCA followed by Gaussian naive Bayes.

The script loads the wine dataset, holds out 30% of the samples as a test
set, and fits two pipelines on the remaining training samples:

* ``PCA(n_components=2) -> GaussianNB`` on the raw features
* ``StandardScaler -> PCA(n_components=2) -> GaussianNB``

It then prints the test accuracy of both pipelines, prints the first
principal component found by each PCA step, and plots the training samples
projected onto the two principal components, side by side, for the raw and
the standardized variant.
"""
from __future__ import print_function

import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The module docstring doubles as the banner printed when the script runs.
print(__doc__)

# Code source: Tyler Lanigan
#              Sebastian Raschka
# (contact addresses were obfuscated in the copy this file was recovered from)
#
# License: BSD 3 clause

RANDOM_STATE = 42
FIG_SIZE = (10, 7)


features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=RANDOM_STATE
)

# Fit to data and predict using pipelined PCA and GNB (no scaling).
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, PCA and GNB.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

# Extract the fitted PCA steps; make_pipeline names each step after its
# lowercased class name.
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']

# Show first principal components.
print('\nPC 1 without scaling:\n', pca.components_[0])
print('\nPC 1 with scaling:\n', pca_std.components_[0])

# Use PCA without and with scale on X_train data for visualization.
# The standardized projection must go through the fitted scaler first,
# because pca_std was fitted on scaled features.
X_train_transformed = pca.transform(X_train)
scaler = std_clf.named_steps['standardscaler']
X_train_std_transformed = pca_std.transform(scaler.transform(X_train))

# Visualize standardized vs. untouched dataset with PCA performed.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)

# One (class label, color, marker) triple per wine class, shared by both
# panels so the two plots are directly comparable.
class_styles = tuple(
    zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o'))
)
panels = (
    (ax1, X_train_transformed),
    (ax2, X_train_std_transformed),
)

for ax, projected in panels:
    for label, color, marker in class_styles:
        in_class = y_train == label
        ax.scatter(
            projected[in_class, 0],
            projected[in_class, 1],
            color=color,
            label='class %s' % label,
            alpha=0.5,
            marker=marker,
        )

ax1.set_title('Training dataset after PCA')
ax2.set_title('Standardized training dataset after PCA')

for ax in (ax1, ax2):
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()

plt.show()
0 commit comments