22 | 22 | height of one meter can be considered much more important than the
23 | 23 | change in weight of one kilogram, this is clearly incorrect.
24 | 24 |
25 |    | -To illustrate this, PCA is performed comparing the use of data with
   | 25 | +To illustrate this, :class:`PCA <sklearn.decomposition.PCA>`
   | 26 | +is performed comparing the use of data with
26 | 27 | :class:`StandardScaler <sklearn.preprocessing.StandardScaler>` applied,
27 | 28 | to unscaled data. The results are visualized and a clear difference noted.
28 | 29 | The 1st principal component in the unscaled set can be seen. It can be seen
33 | 34 |
34 | 35 | The dataset used is the Wine Dataset available at UCI. This dataset
35 | 36 | has continuous features that are heterogeneous in scale due to differing
36 |    | -properties that they measure (i.e alcohol content, and malic acid).
   | 37 | +properties that they measure (i.e. alcohol content and malic acid).
37 | 38 |
38 | 39 | The transformed data is then used to train a naive Bayes classifier, and a
39 | 40 | clear difference in prediction accuracies is observed wherein the dataset
40 | 41 | which is scaled before PCA vastly outperforms the unscaled version.
41 | 42 |
42 | 43 | """
   | 44 | +import matplotlib.pyplot as plt
43 | 45 |
44 | 46 | from sklearn.model_selection import train_test_split
45 | 47 | from sklearn.preprocessing import StandardScaler
46 | 48 | from sklearn.decomposition import PCA
47 | 49 | from sklearn.naive_bayes import GaussianNB
48 |    | -from sklearn import metrics
49 |    | -import matplotlib.pyplot as plt
   | 50 | +from sklearn.metrics import accuracy_score
50 | 51 | from sklearn.datasets import load_wine
51 | 52 | from sklearn.pipeline import make_pipeline
52 | 53 |
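The diff skips over a stretch of unchanged lines at this point, and the next hunk picks up in the middle of the train_test_split call. The elided lines presumably define the constants and load the data used below; here is a minimal sketch reconstructed from the names visible in the diff (the concrete values of RANDOM_STATE and FIG_SIZE are assumptions, not taken from the file):

# Sketch of the unshown setup; only the names below appear in the diff itself.
RANDOM_STATE = 42  # assumed value
FIG_SIZE = (10, 7)  # assumed value

# Load the Wine dataset as (features, target) arrays.
features, target = load_wine(return_X_y=True)

# 30% test split, continuing into the hunk shown next.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=RANDOM_STATE
)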
66 | 67 |     features, target, test_size=0.30, random_state=RANDOM_STATE
67 | 68 | )
68 | 69 |
69 |    | -# Fit to data and predict using pipelined GNB and PCA.
   | 70 | +# Fit to data and predict using pipelined GNB and PCA
70 | 71 | unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
71 | 72 | unscaled_clf.fit(X_train, y_train)
72 | 73 | pred_test = unscaled_clf.predict(X_test)
73 | 74 |
74 |    | -# Fit to data and predict using pipelined scaling, GNB and PCA.
   | 75 | +# Fit to data and predict using pipelined scaling, GNB and PCA
75 | 76 | std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
76 | 77 | std_clf.fit(X_train, y_train)
77 | 78 | pred_test_std = std_clf.predict(X_test)
78 | 79 |
79 | 80 | # Show prediction accuracies in scaled and unscaled data.
80 | 81 | print("\nPrediction accuracy for the normal test dataset with PCA")
81 |    | -print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test)))
   | 82 | +print(f"{accuracy_score(y_test, pred_test):.2%}\n")
82 | 83 |
83 | 84 | print("\nPrediction accuracy for the standardized test dataset with PCA")
84 |    | -print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test_std)))
   | 85 | +print(f"{accuracy_score(y_test, pred_test_std):.2%}\n")
85 | 86 |
86 | 87 | # Extract PCA from pipeline
87 | 88 | pca = unscaled_clf.named_steps["pca"]
88 | 89 | pca_std = std_clf.named_steps["pca"]
89 | 90 |
90 | 91 | # Show first principal components
91 |    | -print("\nPC 1 without scaling:\n", pca.components_[0])
92 |    | -print("\nPC 1 with scaling:\n", pca_std.components_[0])
   | 92 | +print(f"\nPC 1 without scaling:\n{pca.components_[0]}")
   | 93 | +print(f"\nPC 1 with scaling:\n{pca_std.components_[0]}")
93 | 94 |
94 | 95 | # Use PCA without and with scale on X_train data for visualization.
95 | 96 | X_train_transformed = pca.transform(X_train)
   | 97 | +
96 | 98 | scaler = std_clf.named_steps["standardscaler"]
97 |    | -X_train_std_transformed = pca_std.transform(scaler.transform(X_train))
   | 99 | +scaled_X_train = scaler.transform(X_train)
   | 100 | +X_train_std_transformed = pca_std.transform(scaled_X_train)
98 | 101 |
99 | 102 | # visualize standardized vs. untouched dataset with PCA performed
100 | 103 | fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)
101 | 104 |
    | 105 | +target_classes = range(0, 3)
    | 106 | +colors = ("blue", "red", "green")
    | 107 | +markers = ("^", "s", "o")
102 | 108 |
103 |     | -for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")):
    | 109 | +for target_class, color, marker in zip(target_classes, colors, markers):
104 | 110 |     ax1.scatter(
105 |     | -        X_train_transformed[y_train == l, 0],
106 |     | -        X_train_transformed[y_train == l, 1],
107 |     | -        color=c,
108 |     | -        label="class %s" % l,
    | 111 | +        x=X_train_transformed[y_train == target_class, 0],
    | 112 | +        y=X_train_transformed[y_train == target_class, 1],
    | 113 | +        color=color,
    | 114 | +        label=f"class {target_class}",
109 | 115 |         alpha=0.5,
110 |     | -        marker=m,
    | 116 | +        marker=marker,
111 | 117 |     )
112 | 118 |
113 |     | -for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")):
114 | 119 |     ax2.scatter(
115 |     | -        X_train_std_transformed[y_train == l, 0],
116 |     | -        X_train_std_transformed[y_train == l, 1],
117 |     | -        color=c,
118 |     | -        label="class %s" % l,
    | 120 | +        x=X_train_std_transformed[y_train == target_class, 0],
    | 121 | +        y=X_train_std_transformed[y_train == target_class, 1],
    | 122 | +        color=color,
    | 123 | +        label=f"class {target_class}",
119 | 124 |         alpha=0.5,
120 |     | -        marker=m,
    | 125 | +        marker=marker,
121 | 126 |     )
122 | 127 |
123 | 128 | ax1.set_title("Training dataset after PCA")
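To make the docstring's point concrete outside this example, the following self-contained sketch (synthetic data; every name and number is illustrative and not part of the patch) shows the first principal component snapping to the large-scale feature when the inputs are not standardized, and both features contributing once they are:

import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)

# Two correlated synthetic features on very different scales:
# "height" in meters and "weight" in kilograms.
height = rng.normal(loc=1.7, scale=0.1, size=500)
weight = 45 + 40 * height + rng.normal(loc=0.0, scale=5.0, size=500)
X = np.column_stack([height, weight])

# Unscaled: PC 1 is almost exactly the weight axis, since weight has
# by far the largest variance.
print(PCA(n_components=1).fit(X).components_[0])

# Standardized: both features contribute comparably to PC 1.
print(PCA(n_components=1).fit(StandardScaler().fit_transform(X)).components_[0])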