diff --git a/algorithms/k_means.py b/algorithms/k_means.py
new file mode 100644
index 0000000..d83a21f
--- /dev/null
+++ b/algorithms/k_means.py
@@ -0,0 +1,53 @@
+"""
+K-means implementation
+
+Seems okay: I have not tested this rigorously, but it separates 'Iris-setosa'
+pretty well from 'Iris-versicolor' and 'Iris-virginica',
+though it mixes the latter two.
+
+Dataset: Iris dataset.
+"""
+
+from load_data import load_iris
+import numpy as np
+
+
+X_std, y = load_iris(std=True)
+
+
+def k_means(X, num_means=3, num_iterations=10):
+    """K-means. Assumes each datapoint is a 1D array."""
+    # data dimensions
+    N, D = X.shape
+
+    # initialise variables
+    assignments = np.zeros(N, dtype=int)
+    dists = np.zeros((N, num_means))
+
+    # 1. Initialise means from randomly chosen datapoints (rather than
+    # uniform noise) so that no cluster starts out empty
+    means = X[np.random.choice(N, num_means, replace=False)]
+
+    # 2. Iterate
+    for i in range(num_iterations):
+        # 2a(i). Calculate squared distances to each mean
+        # (broadcasting replaces the original np.tile)
+        for k in range(num_means):
+            dists[:, k] = np.sum((X - means[k]) ** 2, axis=1)
+
+        # 2a(ii). Assign each point to its nearest mean
+        assignments = np.argmin(dists, axis=1)
+
+        # 2b. Recalculate cluster means, keeping the old mean if a
+        # cluster is empty (np.mean of no points would give nan)
+        for k in range(num_means):
+            members = X[assignments == k]
+            if len(members) > 0:
+                means[k] = members.mean(axis=0)
+
+    return means, assignments
+
+
+means, assignments = k_means(X_std)
+
+print(assignments, y)
diff --git a/algorithms/load_data.py b/algorithms/load_data.py
new file mode 100644
index 0000000..eef5a9a
--- /dev/null
+++ b/algorithms/load_data.py
@@ -0,0 +1,20 @@
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+def load_iris(std=False):
+    df = pd.read_csv(
+        filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
+        header=None,
+        sep=',')
+
+    df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
+    df.dropna(how="all", inplace=True)  # drops the empty line at file-end
+
+    # .ix is deprecated in recent pandas; .iloc does positional indexing
+    X = df.iloc[:, 0:4].values
+    y = df.iloc[:, 4].values
+
+    if std:
+        X = StandardScaler().fit_transform(X)
+
+    return X, y
diff --git a/algorithms/pca.py b/algorithms/pca.py
new file mode 100644
index 0000000..2c629c8
--- /dev/null
+++ b/algorithms/pca.py
@@ -0,0 +1,53 @@
+"""
+Principal Components Analysis (PCA) using NumPy
+
+Dataset: Iris dataset.
+Adapted from the Plotly PCA tutorial:
+https://plot.ly/ipython-notebooks/principal-component-analysis/
+"""
+
+from load_data import load_iris
+import numpy as np
+
+
+X_std, y = load_iris(std=True)
+
+
+def pca(X_std):
+
+    # 1. Calculate the covariance matrix
+    mean_vec = np.mean(X_std, axis=0)
+    N = X_std.shape[0]
+    # same as np.cov(X_std.T)
+    cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (N - 1)
+
+    # 2. Find eigenvectors and eigenvalues by SVD
+    # (X_std is standardised, so its columns are zero-mean, which is what
+    # makes the SVD of X_std.T equivalent to eigendecomposing cov_mat)
+    u, s, v = np.linalg.svd(X_std.T)
+
+    eig_vals = s ** 2 / (N - 1)
+    eig_vecs = u
+
+    # Could also eigendecompose directly, but that is less efficient:
+    # O(N^3) vs O(min(M, N) * M * N) for SVD. The same applies to
+    # cor_mat1 = np.corrcoef(X_std.T):
+    # eig_vals, eig_vecs = np.linalg.eig(cov_mat)
+
+    # 3. Select principal components
+    # Make a list of (eigenvalue, eigenvector) tuples
+    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
+
+    # Sort the (eigenvalue, eigenvector) tuples from high to low.
+    # Sort on the eigenvalue alone: plain tuple sorting would compare the
+    # eigenvector arrays on ties, which raises an error.
+    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
+
+    # Visually confirm that the list is sorted by decreasing eigenvalue
+    print('Eigenvalues and eigenvectors, in descending order of eigenvalues:')
+    for pair in eig_pairs:
+        print(pair)
+    return eig_pairs
+
+
+eig_pairs = pca(X_std)
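+
+
+# Illustrative sketch (my addition, not from the Plotly tutorial): project the
+# data onto the top k principal components found above. `project` is a
+# hypothetical helper name.
+def project(X, eig_pairs, k=2):
+    # Stack the top-k eigenvectors into a D x k projection matrix W
+    W = np.hstack([eig_pairs[i][1].reshape(-1, 1) for i in range(k)])
+    return X.dot(W)
+
+# e.g. X_proj = project(X_std, eig_pairs, k=2) gives N x 2 coordinates to plot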
diff --git a/algorithms/travelling_salesman.py b/algorithms/travelling_salesman.py
new file mode 100644
index 0000000..e7f0b47
--- /dev/null
+++ b/algorithms/travelling_salesman.py
@@ -0,0 +1,71 @@
+"""
+Travelling Salesman Problem:
+An Exact Dynamic Programming Solution in Python
+
+Jessica Yung
+Dec 2018
+"""
+import numpy as np
+
+
+class TravellingSalesman:
+
+    def __init__(self, graph, start=0):
+        """Initialise with the graph and the node you start from.
+        :param graph: takes the form of an adjacency matrix
+            (suitable since we are given a fully connected graph).
+        :param start: an int (index into the adjacency matrix).
+            Which node you start from makes no difference since this is a tour.
+        """
+        self.graph = graph
+        self.start = start
+        self.nodes = list(np.arange(len(graph)))
+        self.cost_dict = {}
+
+    def cost(self, nodes, end):
+        """Memoised cost of visiting `nodes` and ending at `end`."""
+        if (tuple(nodes), end) not in self.cost_dict:
+            self.cost_dict[tuple(nodes), end] = self.calc_cost(nodes, end)
+        return self.cost_dict[tuple(nodes), end]
+
+    def calc_cost(self, nodes, end):
+        if end not in nodes:
+            raise ValueError("Endpoint not in nodes to visit.")
+        if len(nodes) == 1:
+            return 0
+        if len(nodes) == 2:
+            return self.graph[nodes[0], nodes[1]]
+        # Recurrence: the best path through `nodes` ending at `end` extends
+        # the best path through the remaining nodes ending at some j
+        # by the edge (j, end).
+        non_end_nodes = nodes.copy()
+        non_end_nodes.remove(end)
+        candidates = [self.cost(non_end_nodes, j) + self.graph[j, end]
+                      for j in non_end_nodes if j != self.start]
+        return min(candidates)
+
+    def dp(self):
+        """Dynamic programming solution to the Travelling Salesman Problem:
+        visit every node once and return to the start."""
+        return min(self.cost(self.nodes, i) + self.graph[i, self.start]
+                   for i in self.nodes if i != self.start)
+
+
+# Test case:
+def create_adj_matrix(distances):
+    """Build an n x n adjacency matrix from an upper-triangular list of lists
+    with n*(n-1)/2 entries: distances from node 0 to nodes 1, ..., n-1, then
+    from node 1 to nodes 2, ..., n-1, and so on.
+    """
+    n = len(distances) + 1
+    mat = np.diag(np.ones(n) * np.inf)
+    for i in range(n - 1):
+        for j in range(n - i - 1):
+            mat[i, j + i + 1] = mat[j + i + 1, i] = distances[i][j]
+    return mat
+
+
+dists = create_adj_matrix([[4, 3], [2]])
+ts = TravellingSalesman(dists, 0)
+soln = ts.dp()
+print("Min dist:", soln)
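+
+
+# Sanity check (my addition, not in the original file): brute force over all
+# tours with itertools.permutations. O(n!), so only feasible for tiny graphs
+# like the test case above.
+from itertools import permutations
+
+def brute_force(graph, start=0):
+    n = len(graph)
+    others = [i for i in range(n) if i != start]
+    best = np.inf
+    for perm in permutations(others):
+        order = [start] + list(perm) + [start]
+        best = min(best, sum(graph[order[i], order[i + 1]] for i in range(n)))
+    return best
+
+assert brute_force(dists) == soln  # should agree with the DP answer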
diff --git a/flashcards/python-knowledge.html b/flashcards/python-knowledge.html
new file mode 100644
index 0000000..6831974
--- /dev/null
+++ b/flashcards/python-knowledge.html
@@ -0,0 +1,28 @@
+What does 'first class object' mean?; It can be assigned to a variable, passed as an argument and returned from a function, i.e. treated like any other object.
+
+Examples of first-class objects in Python; Functions, classes.
+
+Is Python interpreted or compiled?; Interpreted (source is compiled to bytecode, which the interpreter then executes).
+
+Typing of Python; Dynamically typed (vs statically typed, where you specify types).
+
+Does Python have access specifiers?; No (e.g. C++'s public, private), because 'we are all adults here'.
+
+Compiled vs interpreted languages; - Compiled: the compiled code can be executed directly in the CPU's 'native' language. - Interpreted: must be translated at runtime from the original format into CPU machine instructions.
+
+Deep vs shallow copy; - Shallow copy copies reference pointers, so changes to the copy change the original. Faster. - Deep copy copies values but not reference pointers; slower.
+
+Deep copy; Stores the copied values separately. - Doesn't copy reference pointers. - Changes made in the copy won't affect the original. - Slower than a shallow copy.
+
+Shallow copy; Copies reference pointers just as it copies values. - Changes made via the copy will also affect the original. - Faster execution (how much faster depends on the size of the data).
+
+Lists vs tuples; Lists are mutable, tuples are not.
+
+How multithreading is done in Python; - Global Interpreter Lock (GIL): makes sure only one thread executes Python bytecode at a time. A thread acquires the GIL, does some work, then passes the GIL to the next thread. - i.e. threads take turns using the same CPU core. - Passing the GIL around adds overhead (slower execution).
+
+Example of ternary operator; 5 if x > 4 else 4
+
+Memory management in Python; Objects live on a private heap managed by the Python memory manager; CPython reclaims them via reference counting plus a cyclic garbage collector.
+
+Sources:
+- https://www.edureka.co/blog/interview-questions/python-interview-questions/
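+
+Illustrative snippet (my addition, not from the source cards) showing the deep vs shallow copy cards in action:
+    import copy
+    a = [[1, 2], [3, 4]]
+    shallow = copy.copy(a)    # new outer list, same inner lists
+    deep = copy.deepcopy(a)   # new outer and inner lists
+    a[0][0] = 99
+    print(shallow[0][0])      # 99: the change shows through the shallow copy
+    print(deep[0][0])         # 1: the deep copy is unaffected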
diff --git a/flashcards/pytorch.txt b/flashcards/pytorch.txt
new file mode 100644
index 0000000..96653c4
--- /dev/null
+++ b/flashcards/pytorch.txt
@@ -0,0 +1,5 @@
+Reshape tensor `x` from 2x4 to 4x2; x = x.view(4, 2)
+Reshape tensor `x` to 5 rows; x = x.view(5, -1)
+What does the -1 in x.view(-1) refer to?; Infer this dimension from the others.
+
+How to set up autodifferentiation; Create the tensor with requires_grad=True: `x = torch.tensor([2., 3.], requires_grad=True)`, then do operations on x. (Note: the torch.Tensor constructor does not accept requires_grad; use the lowercase torch.tensor factory.) Can also set it afterwards via `x.requires_grad = True`.
diff --git a/flashcards/tensorflow.txt b/flashcards/tensorflow.txt
new file mode 100644
index 0000000..d3b855b
--- /dev/null
+++ b/flashcards/tensorflow.txt
@@ -0,0 +1,45 @@
+What is a tf Session?; A connection to the C++ backend, which does the computation.
+
+Why might using NumPy have high overhead?; The cost of switching back to Python after every operation (e.g. after doing a matrix multiplication outside Python). Especially bad when running computations on GPUs or in a distributed manner, where transferring data is expensive. TensorFlow instead defines the entire graph so it can run outside Python: the Python code builds the graph and specifies which parts of it should be run.
+
+
+What does the first dimension of x = tf.placeholder(tf.float32, shape=[None, 784]) correspond to?; Batch size.
+
+What does it mean when a dimension of shape is None?; It can be of any size.
+
+Is the shape argument to placeholder compulsory?; No, it's optional, but it helps with debugging.
+
+What is the difference between a tf.placeholder and a tf.Variable?; Placeholders are supplied when we ask tf to run a computation and cannot be modified by it (I think). Variables can be modified by the computation.
+
+What are the tf types of model parameters?; Usually tf.Variable, e.g. tf.Variable(tf.zeros([784]))
+
+Do you have to initialise variables before using them in a session?; Yes: sess.run(tf.global_variables_initializer())
+
+
+Matrix multiplication in tf; y = tf.matmul(x, W) + b
+
+Categorical cross-entropy loss; loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=yhat). Then you use e.g. step = tf.train.GradientDescentOptimizer(0.5).minimize(loss), where 0.5 is the learning rate.
+
+Gradient descent step in tf; step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss), then step.run(feed_dict={x: blah, y: blah})
+
+Take an average in tf; tf.reduce_mean(things_to_average)
+
+Can you replace a variable in your computation graph with other input using feed_dict?; Yes, you can replace any tensor in your graph using feed_dict.
+
+Cast booleans to floats; tf.cast(list_of_booleans, tf.float32)
+
+Why should you initialise weights with a small amount of noise?; (1) Symmetry breaking (todo: expand) and (2) to prevent 0 gradients.
+
+When using ReLU neurons, how should you initialise them?; With a slight positive bias (e.g. 0.1) to avoid 'dead neurons'.
+
+How might you initialise weights with a small amount of noise?; tf.Variable(tf.truncated_normal(shape, stddev=0.1))
+
+2D convolution layer in tf; tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
+
+I want a conv layer to compute 32 features for each 5x5 patch, with a 1-channel input (e.g. greyscale). What should the shape of the weights be?; [5, 5, 1, 32], i.e. (patch_dim1, patch_dim2, num_input_channels, num_output_channels)
+
+Specify shape in round or square brackets in tf?; Square brackets, e.g. [5, 5, 1, 32].
+
+How can you turn dropout on during training and off during testing?; Create a placeholder keep_prob = tf.placeholder(tf.float32) and a layer dropout = tf.nn.dropout(prev_layer, keep_prob), then feed the corresponding value when training and testing (keep_prob=1 at test time).
+
+Scaling used in tf.nn.dropout; Outputs are scaled up by 1/keep_prob, so the expected sum is unchanged.
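+
+Putting the cards above together, a minimal sketch of a training step (my addition; assumes the TF 1.x API used throughout these cards):
+    import tensorflow as tf
+    x = tf.placeholder(tf.float32, shape=[None, 784])     # None = any batch size
+    y_true = tf.placeholder(tf.float32, shape=[None, 10])
+    W = tf.Variable(tf.zeros([784, 10]))
+    b = tf.Variable(tf.zeros([10]))
+    logits = tf.matmul(x, W) + b
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=logits))
+    step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
+    with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        # per batch: sess.run(step, feed_dict={x: batch_x, y_true: batch_y})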