Commit 75189e4

Pushing the docs to dev/ for branch: main, commit 35af6dc808c8d317eb7017d1b16e271c9c8bba77
1 parent 56a06b4 commit 75189e4

1,231 files changed: +4506, -4481 lines


dev/_downloads/4ef6a0e5e8f2fe6463d63928373e5f91/plot_scaling_importance.py

Lines changed: 28 additions & 23 deletions
@@ -22,7 +22,8 @@
 height of one meter can be considered much more important than the
 change in weight of one kilogram, this is clearly incorrect.
 
-To illustrate this, PCA is performed comparing the use of data with
+To illustrate this, :class:`PCA <sklearn.decomposition.PCA>`
+is performed comparing the use of data with
 :class:`StandardScaler <sklearn.preprocessing.StandardScaler>` applied,
 to unscaled data. The results are visualized and a clear difference noted.
 The 1st principal component in the unscaled set can be seen. It can be seen
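
To see the docstring's point in isolation: without standardization, PCA's first component is dominated by whichever feature has the largest raw variance. A minimal standalone sketch of that effect (not part of this commit; the two-feature height/weight data below is made up for illustration):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
# Hypothetical data: height in meters (std ~0.1) and a correlated
# weight in kilograms (std ~6.7) -- wildly different scales.
height = rng.normal(1.7, 0.1, 500)
weight = 45.0 * height + rng.normal(0.0, 5.0, 500)
X = np.column_stack([height, weight])

# Unscaled: PC 1 is ~[0.01, 1.00], i.e. essentially the weight axis.
print(PCA(n_components=1).fit(X).components_[0])

# Standardized: PC 1 is ~[0.71, 0.71] (up to sign), i.e. both
# features now contribute comparably.
X_std = StandardScaler().fit_transform(X)
print(PCA(n_components=1).fit(X_std).components_[0])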
@@ -33,20 +34,20 @@
 
 The dataset used is the Wine Dataset available at UCI. This dataset
 has continuous features that are heterogeneous in scale due to differing
-properties that they measure (i.e alcohol content, and malic acid).
+properties that they measure (i.e. alcohol content and malic acid).
 
 The transformed data is then used to train a naive Bayes classifier, and a
 clear difference in prediction accuracies is observed wherein the dataset
 which is scaled before PCA vastly outperforms the unscaled version.
 
 """
+import matplotlib.pyplot as plt
 
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
 from sklearn.naive_bayes import GaussianNB
-from sklearn import metrics
-import matplotlib.pyplot as plt
+from sklearn.metrics import accuracy_score
 from sklearn.datasets import load_wine
 from sklearn.pipeline import make_pipeline
 
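The narrowed import matches how the example uses it: accuracy_score(y_true, y_pred) returns the fraction of exactly matching labels, which the new call sites render with the :.2% format spec (multiply by 100, two decimals, percent sign). A quick sketch with made-up labels:

from sklearn.metrics import accuracy_score

# Hypothetical labels: 3 of 4 predictions match.
y_true = [0, 1, 2, 2]
y_pred = [0, 1, 2, 1]
print(f"{accuracy_score(y_true, y_pred):.2%}")  # prints: 75.00%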
@@ -66,58 +67,62 @@
     features, target, test_size=0.30, random_state=RANDOM_STATE
 )
 
-# Fit to data and predict using pipelined GNB and PCA.
+# Fit to data and predict using pipelined GNB and PCA
 unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
 unscaled_clf.fit(X_train, y_train)
 pred_test = unscaled_clf.predict(X_test)
 
-# Fit to data and predict using pipelined scaling, GNB and PCA.
+# Fit to data and predict using pipelined scaling, GNB and PCA
 std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
 std_clf.fit(X_train, y_train)
 pred_test_std = std_clf.predict(X_test)
 
 # Show prediction accuracies in scaled and unscaled data.
 print("\nPrediction accuracy for the normal test dataset with PCA")
-print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test)))
+print(f"{accuracy_score(y_test, pred_test):.2%}\n")
 
 print("\nPrediction accuracy for the standardized test dataset with PCA")
-print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test_std)))
+print(f"{accuracy_score(y_test, pred_test_std):.2%}\n")
 
 # Extract PCA from pipeline
 pca = unscaled_clf.named_steps["pca"]
 pca_std = std_clf.named_steps["pca"]
 
 # Show first principal components
-print("\nPC 1 without scaling:\n", pca.components_[0])
-print("\nPC 1 with scaling:\n", pca_std.components_[0])
+print(f"\nPC 1 without scaling:\n{pca.components_[0]}")
+print(f"\nPC 1 with scaling:\n{pca_std.components_[0]}")
 
 # Use PCA without and with scale on X_train data for visualization.
 X_train_transformed = pca.transform(X_train)
+
 scaler = std_clf.named_steps["standardscaler"]
-X_train_std_transformed = pca_std.transform(scaler.transform(X_train))
+scaled_X_train = scaler.transform(X_train)
+X_train_std_transformed = pca_std.transform(scaled_X_train)
 
 # visualize standardized vs. untouched dataset with PCA performed
 fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)
 
+target_classes = range(0, 3)
+colors = ("blue", "red", "green")
+markers = ("^", "s", "o")
 
-for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")):
+for target_class, color, marker in zip(target_classes, colors, markers):
     ax1.scatter(
-        X_train_transformed[y_train == l, 0],
-        X_train_transformed[y_train == l, 1],
-        color=c,
-        label="class %s" % l,
+        x=X_train_transformed[y_train == target_class, 0],
+        y=X_train_transformed[y_train == target_class, 1],
+        color=color,
+        label=f"class {target_class}",
         alpha=0.5,
-        marker=m,
+        marker=marker,
     )
 
-for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")):
     ax2.scatter(
-        X_train_std_transformed[y_train == l, 0],
-        X_train_std_transformed[y_train == l, 1],
-        color=c,
-        label="class %s" % l,
+        x=X_train_std_transformed[y_train == target_class, 0],
+        y=X_train_std_transformed[y_train == target_class, 1],
+        color=color,
+        label=f"class {target_class}",
         alpha=0.5,
-        marker=m,
+        marker=marker,
     )
 
 ax1.set_title("Training dataset after PCA")
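
Two asides on the refactor above (not part of the commit): make_pipeline names each step after its lowercased class name, which is why named_steps["standardscaler"] and named_steps["pca"] resolve; and the manual scale-then-project sequence can equivalently be written with scikit-learn's pipeline slicing. A minimal sketch under those assumptions:

import numpy as np
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_wine(return_X_y=True)
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X, y)

# The commit's two-step version: scale, then project.
scaler = std_clf.named_steps["standardscaler"]
pca_std = std_clf.named_steps["pca"]
manual = pca_std.transform(scaler.transform(X))

# Equivalent: slice off the final estimator and transform in one call
# (slicing a fitted Pipeline returns a sub-pipeline of its fitted steps).
sliced = std_clf[:-1].transform(X)
print(np.allclose(manual, sliced))  # True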
