Skip to content

Commit fd95155

Browse files
committed
feat(alg): add pca
1 parent 398b471 commit fd95155

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

algorithms/load_data.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import pandas as pd
2+
from sklearn.preprocessing import StandardScaler
3+
4+
def load_iris(std=False, source=None):
    """Load the Iris dataset as a feature matrix and a label vector.

    Args:
        std: When true, the feature matrix is passed through
            ``StandardScaler().fit_transform`` before being returned.
        source: Optional path, URL or file-like object handed to
            ``pandas.read_csv``. Defaults to the UCI copy of ``iris.data``.
            The data must be header-less, comma-separated, with five columns.

    Returns:
        A tuple ``(X, y)``: ``X`` holds the values of the first four columns
        (sepal/petal length and width) and ``y`` the values of the fifth
        column (the class label).
    """
    if source is None:
        source = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

    df = pd.read_csv(
        filepath_or_buffer=source,
        header=None,
        sep=',')

    df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
    df.dropna(how="all", inplace=True)  # drops the empty line at file-end

    # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` performs the same
    # positional selection the original code relied on.
    X = df.iloc[:, 0:4].values
    y = df.iloc[:, 4].values

    if std:
        X = StandardScaler().fit_transform(X)

    return X, y

algorithms/pca.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""
2+
Principal Components Analysis (PCA) using NumPy
3+
4+
Dataset: Iris dataset.
5+
Adapted from Plotly PCA tutorial
6+
https://plot.ly/ipython-notebooks/principal-component-analysis/
7+
"""
8+
9+
from load_data import load_iris
10+
import numpy as np
11+
12+
13+
# Load the Iris features (standardized, because std=True) and the class labels.
X_std, y = load_iris(std=True)
14+
15+
16+
def pca(X_std):
    """Compute principal components of a data matrix via SVD.

    The input is mean-centered per column before decomposition, so the
    returned eigenvalues/eigenvectors are those of the sample covariance
    matrix (``np.cov(X_std.T)``) even if the caller did not center the data.

    Args:
        X_std: 2-D array-like of shape (n_samples, n_features).

    Returns:
        A list of ``(eigenvalue, eigenvector)`` tuples sorted by decreasing
        eigenvalue. Each eigenvector is a 1-D array of length n_features.

    Raises:
        ValueError: If the input is not 2-D or has fewer than two samples
            (the ``N - 1`` normalization would divide by zero).

    Side effects:
        Prints the sorted pairs to stdout.
    """
    X = np.asarray(X_std, dtype=float)
    if X.ndim != 2:
        raise ValueError("pca expects a 2-D array of shape (n_samples, n_features)")
    N = X.shape[0]
    if N < 2:
        raise ValueError("pca needs at least two samples")

    # 1. Center the data. The covariance matrix is
    #    centered.T.dot(centered) / (N - 1), same as np.cov(X.T); it is not
    #    formed explicitly because the SVD below yields its eigensystem.
    centered = X - np.mean(X, axis=0)

    # 2. Find eigenvectors and eigenvalues by SVD of the centered data.
    #    full_matrices=False avoids building the unused N x N right factor.
    #    (An eigendecomposition of the covariance matrix, np.linalg.eig,
    #    would give the same result.)
    u, s, _ = np.linalg.svd(centered.T, full_matrices=False)

    eig_vals = s**2 / (N - 1)
    eig_vecs = u

    # 3. Select PCs
    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(val), vec) for val, vec in zip(eig_vals, eig_vecs.T)]

    # Sort from high to low on the eigenvalue only: a plain tuple sort would
    # compare the eigenvector arrays when eigenvalues tie and raise ValueError.
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    print('Eigenvalues and eigenectors, in descending order of eigenvalues:')
    for pair in eig_pairs:
        print(pair)
    return eig_pairs
48+
49+
# Run PCA on the standardized Iris features; pca() also prints the sorted pairs.
eig_pairs = pca(X_std)

0 commit comments

Comments
 (0)