diff --git a/algorithms/k_means.py b/algorithms/k_means.py
new file mode 100644
index 0000000..d83a21f
--- /dev/null
+++ b/algorithms/k_means.py
@@ -0,0 +1,63 @@
+"""
+K-means implementation
+
+Seems okay: Have not tested this rigorously, but this separates 'Iris-setosa'
+pretty well from 'Iris-versicolor' and 'Iris-virginica',
+but mixes the latter two.
+
+Dataset: Iris dataset.
+"""
+
+from load_data import load_iris
+import numpy as np
+
+
+X_std, y = load_iris(std=True)
+
+
+def k_means(X, num_means=3, num_iterations=10):
+    """Cluster the rows of X by alternating assignment and mean updates.
+
+    :param X: 2D array of shape (N, D); each row is one datapoint.
+    :param num_means: number of clusters.
+    :param num_iterations: number of assign/update rounds to run.
+    :return: (means, assignments), where means has shape (num_means, D)
+        and assignments is a length-N float array holding each
+        datapoint's cluster index.
+    """
+    # data dim
+    N, D = X.shape
+
+    # initialise vars
+    assignments = np.zeros(N)
+    dists = np.zeros((N, num_means))
+
+    # 1. Init means on randomly chosen datapoints. Uniform random means
+    # can land away from the data and leave clusters empty.
+    # Sample without replacement unless there are fewer points than means.
+    init_idx = np.random.choice(N, size=num_means, replace=N < num_means)
+    means = X[init_idx].astype(float)
+
+    # 2. Iterate
+    for _ in range(num_iterations):
+        # 2a(i) Squared Euclidean distance from every point to every mean
+        for k in range(num_means):
+            dists[:, k] = np.sum((X - means[k]) ** 2, axis=1)
+
+        # 2a(ii) Assign each point to its nearest mean
+        assignments[:] = np.argmin(dists, axis=1)
+
+        # 2b. Recalculate cluster means
+        for k in range(num_means):
+            members = X[assignments == k]
+            # An empty cluster keeps its previous mean; averaging zero
+            # points would turn the mean into NaN.
+            if len(members) > 0:
+                means[k] = members.mean(axis=0)
+
+    return means, assignments
+
+
+means, assignments = k_means(X_std)
+
+print(assignments, y)
\ No newline at end of file
diff --git a/algorithms/load_data.py b/algorithms/load_data.py
new file mode 100644
index 0000000..eef5a9a
--- /dev/null
+++ b/algorithms/load_data.py
@@ -0,0 +1,29 @@
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+
+def load_iris(std=False):
+    """Download the UCI Iris dataset and split it into features and labels.
+
+    :param std: if True, standardise the feature columns with
+        StandardScaler before returning them.
+    :return: (X, y): the four measurement columns as a 2D array and the
+        class column as a 1D array.
+    """
+    df = pd.read_csv(
+        filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
+        header=None,
+        sep=',')
+
+    df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
+    df.dropna(how="all", inplace=True)  # drops the empty line at file-end
+
+    # Select by position with .iloc; the .ix indexer was removed in
+    # pandas 1.0.
+    X = df.iloc[:, 0:4].values
+    y = df.iloc[:, 4].values
+
+    if std:
+        X = StandardScaler().fit_transform(X)
+
+    return X, y
\ No newline at end of file
diff --git a/algorithms/pca.py b/algorithms/pca.py
new file mode 100644
index 0000000..2c629c8
--- /dev/null
+++ b/algorithms/pca.py
@@ -0,0 +1,55 @@
+"""
+Principal Components Analysis (PCA) using NumPy
+
+Dataset: Iris dataset.
+Adapted from Plotly PCA tutorial
+https://plot.ly/ipython-notebooks/principal-component-analysis/
+"""
+
+from load_data import load_iris
+import numpy as np
+
+
+X_std, y = load_iris(std=True)
+
+
+def pca(X_std):
+    """Compute principal components of X_std by SVD.
+
+    :param X_std: 2D array of shape (N, D), one sample per row.
+    :return: list of (eigenvalue, eigenvector) tuples of the sample
+        covariance matrix, sorted by decreasing eigenvalue.
+    """
+    # 1. Centre the data. The covariance matrix is then
+    # centred.T.dot(centred) / (N-1), same as np.cov(X_std.T).
+    N = X_std.shape[0]
+    centred = X_std - np.mean(X_std, axis=0)
+
+    # 2. Find eigenvectors and eigenvalues by SVD of the centred data, so
+    # the result is also right for input that is not already zero-mean.
+    # full_matrices=False skips the N x N right factor, which is unused.
+    u, s, _ = np.linalg.svd(centred.T, full_matrices=False)
+
+    eig_vals = s**2/(N-1)
+    eig_vecs = u
+
+    # Can also do by eigendecomposition of the covariance matrix
+    # (np.linalg.eig) -> less efficient, O(N^3) vs O(min(M,N)MN).
+    # Can also do for the correlation matrix np.corrcoef(X_std.T).
+
+    # 3. Select PCs
+    # Make a list of (eigenvalue, eigenvector) tuples
+    eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))
+
+    # Sort from high to low on the eigenvalue alone. Comparing whole tuples
+    # falls through to comparing the arrays when two eigenvalues tie, and
+    # that raises ValueError.
+    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
+
+    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
+    print('Eigenvalues and eigenectors, in descending order of eigenvalues:')
+    for i in eig_pairs:
+        print(i)
+    return eig_pairs
+
+eig_pairs = pca(X_std)
\ No newline at end of file
diff --git a/algorithms/travelling_salesman.py b/algorithms/travelling_salesman.py
new file mode 100644
index 0000000..e7f0b47
--- /dev/null
+++ b/algorithms/travelling_salesman.py
@@ -0,0 +1,99 @@
+"""
+Travelling Salesman Problem:
+An Exact Dynamic Programming Solution in Python
+
+Jessica Yung
+Dec 2018
+"""
+import numpy as np
+
+
+class TravellingSalesman:
+    """Exact Travelling Salesman solver using memoised dynamic programming."""
+
+    def __init__(self, graph, start=0):
+        """Initialise with graph and node you start from.
+
+        :param graph: takes the form of an adjacency matrix
+            (suitable since we are given a fully connected graph).
+        :param start: an int (index in adj matrix).
+            Node you start from doesn't make a difference since this is a tour.
+        """
+        self.graph = graph
+        self.start = start
+        self.nodes = list(np.arange(len(graph)))
+        # Memo of solved subproblems, keyed by (tuple(nodes), end).
+        self.cost_dict = {}
+
+    def cost(self, nodes, end):
+        """Return calc_cost(nodes, end), computing each subproblem only once."""
+        key = (tuple(nodes), end)
+        if key not in self.cost_dict:
+            self.cost_dict[key] = self.calc_cost(nodes, end)
+        return self.cost_dict[key]
+
+    def calc_cost(self, nodes, end):
+        """Return the cost of the cheapest path that visits every node in
+        nodes exactly once and finishes at end.
+
+        Callers are expected to include self.start in nodes (dp() does);
+        the path then begins there.
+
+        :param nodes: list of node indices to visit.
+        :param end: node index the path must finish on.
+        :raises ValueError: if end is not one of nodes.
+        """
+        if end not in nodes:
+            # Raise rather than return the exception object, which callers
+            # would otherwise mistake for a cost.
+            raise ValueError("Endpoint not in nodes to visit.")
+        if len(nodes) == 1:
+            return 0
+        non_end_nodes = nodes.copy()
+        non_end_nodes.remove(end)
+        if len(nodes) == 2:
+            # A single edge, read in the direction of travel so that
+            # asymmetric graphs are handled too.
+            return self.graph[non_end_nodes[0], end]
+        # The start is never the second-to-last stop: the path begins
+        # there and visits each node once.
+        return min(self.cost(non_end_nodes, j) + self.graph[j, end]
+                   for j in non_end_nodes if j != self.start)
+
+    def dp(self):
+        """Dynamic programming solution to Travelling Salesman problem.
+
+        :return: cost of the cheapest tour that leaves self.start, visits
+            every node once and returns to self.start; 0 when there is no
+            other node to visit.
+        """
+        last_stops = [node for node in self.nodes if node != self.start]
+        if not last_stops:
+            return 0
+        # Close the tour: cheapest path to each possible last stop plus the
+        # edge from there back to the start (not hard-coded node 0).
+        return min(self.cost(self.nodes, i) + self.graph[i, self.start]
+                   for i in last_stops)
+
+
+# test case:
+def create_adj_matrix(distances):
+    """Build a symmetric adjacency matrix from upper-triangle distances.
+
+    :param distances: n-1 rows; row i holds the dists from node i to nodes
+        i+1, ..., n-1. Cells that don't represent dists in input may not
+        exist or can exist but are ignored.
+    :return: n x n array with inf on the diagonal.
+    """
+    n = len(distances) + 1
+    mat = np.diag(np.ones(n)*np.inf)
+    for i in range(n-1):
+        for j in range(n-i-1):
+            mat[i, j+i+1] = mat[j+i+1, i] = distances[i][j]
+    return mat
+
+
+dists = create_adj_matrix([[4, 3],[2]])
+ts = TravellingSalesman(dists, 0)
+soln = ts.dp()
+print("Min dist:", soln)