Cwickniss
diff --git a/‎dev/_downloads/3409d9766d352cc9f9b169d4a799a87a/auto_examples_python.zip
5.98 KB b/‎dev/_downloads/3409d9766d352cc9f9b169d4a799a87a/auto_examples_python.zip
5.98 KB
diff --git a/‎dev/_downloads/5612f9c55259a4294f34843655f9c6af/plot_gpr_on_structured_data.ipynb
Lines changed: 54 additions & 0 deletions b/‎dev/_downloads/5612f9c55259a4294f34843655f9c6af/plot_gpr_on_structured_data.ipynb
Lines changed: 54 additions & 0 deletions
diff --git a/‎dev/_downloads/d2c3d354a93eca3b78b2436d5a8e7164/plot_gpr_on_structured_data.py
Lines changed: 174 additions & 0 deletions b/‎dev/_downloads/d2c3d354a93eca3b78b2436d5a8e7164/plot_gpr_on_structured_data.py
Lines changed: 174 additions & 0 deletions
diff --git a/‎dev/_downloads/d34667f097c619f8afda4bc936e7af21/auto_examples_jupyter.zip
6.97 KB b/‎dev/_downloads/d34667f097c619f8afda4bc936e7af21/auto_examples_jupyter.zip
6.97 KB
diff --git a/‎dev/_downloads/scikit-learn-docs.pdf
62.7 KB b/‎dev/_downloads/scikit-learn-docs.pdf
62.7 KB
diff --git a/‎dev/_images/iris.png
0 Bytes b/‎dev/_images/iris.png
0 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-619 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-619 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0021.png
-619 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0021.png
-619 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
-114 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
-114 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0031.png
-114 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0031.png
-114 Bytes
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Gaussian processes on discrete data structures\n\n\nThis example illustrates the use of Gaussian processes for regression and\nclassification tasks on data that are not in fixed-length feature vector form.\nThis is achieved through the use of kernel functions that operate directly\non discrete structures such as variable-length sequences, trees, and graphs.\n\nSpecifically, here the input variables are some gene sequences stored as\nvariable-length strings consisting of letters 'A', 'T', 'C', and 'G',\nwhile the output variables are floating point numbers and True/False labels\nin the regression and classification tasks, respectively.\n\nA kernel between the gene sequences is defined using R-convolution [1]_ by\nintegrating a binary letter-wise kernel over all pairs of letters among a pair\nof strings.\n\nThis example will generate three figures.\n\nIn the first figure, we visualize the value of the kernel, i.e. the similarity\nof the sequences, using a colormap. Brighter color here indicates higher\nsimilarity.\n\nIn the second figure, we show regression results on a dataset of 6\nsequences. Here we use the 1st, 2nd, 4th, and 5th sequences as the training set\nto make predictions on the 3rd and 6th sequences.\n\nIn the third figure, we demonstrate a classification model by training on 6\nsequences and making predictions on another 5 sequences. The ground truth here\nis simply whether there is at least one 'A' in the sequence. Here the model\nmakes four correct classifications and fails on one.\n\n.. [1] Haussler, D. (1999). Convolution kernels on discrete structures\n(Vol. 646). Technical report, Department of Computer Science, University of\nCalifornia at Santa Cruz.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.gaussian_process.kernels import Kernel, Hyperparameter\nfrom sklearn.gaussian_process.kernels import GenericKernelMixin\nfrom sklearn.gaussian_process import GaussianProcessRegressor\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.base import clone\n\n\nclass SequenceKernel(GenericKernelMixin, Kernel):\n    '''\n    A minimal (but valid) convolutional kernel for sequences of variable\n    lengths.'''\n    def __init__(self,\n                 baseline_similarity=0.5,\n                 baseline_similarity_bounds=(1e-5, 1)):\n        self.baseline_similarity = baseline_similarity\n        self.baseline_similarity_bounds = baseline_similarity_bounds\n\n    @property\n    def hyperparameter_baseline_similarity(self):\n        return Hyperparameter(\"baseline_similarity\",\n                              \"numeric\",\n                              self.baseline_similarity_bounds)\n\n    def _f(self, s1, s2):\n        '''\n        kernel value between a pair of sequences\n        '''\n        return sum([1.0 if c1 == c2 else self.baseline_similarity\n                   for c1 in s1\n                   for c2 in s2])\n\n    def _g(self, s1, s2):\n        '''\n        kernel derivative between a pair of sequences\n        '''\n        return sum([0.0 if c1 == c2 else 1.0\n                    for c1 in s1\n                    for c2 in s2])\n\n    def __call__(self, X, Y=None, eval_gradient=False):\n        if Y is None:\n            Y = X\n\n        if eval_gradient:\n            return (np.array([[self._f(x, y) for y in Y] for x in X]),\n                    np.array([[[self._g(x, y)] for y in Y] for x in X]))\n        else:\n            return np.array([[self._f(x, y) for y in Y] for x in X])\n\n    def diag(self, X):\n        return np.array([self._f(x, x) for x in X])\n\n    def is_stationary(self):\n        return False\n\n    def 
clone_with_theta(self, theta):\n        cloned = clone(self)\n        cloned.theta = theta\n        return cloned\n\n\nkernel = SequenceKernel()\n\n'''\nSequence similarity matrix under the kernel\n===========================================\n'''\n\nX = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])\n\nK = kernel(X)\nD = kernel.diag(X)\n\nplt.figure(figsize=(8, 5))\nplt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))\nplt.xticks(np.arange(len(X)), X)\nplt.yticks(np.arange(len(X)), X)\nplt.title('Sequence similarity under the kernel')\n\n'''\nRegression\n==========\n'''\n\nX = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])\nY = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])\n\ntraining_idx = [0, 1, 3, 4]\ngp = GaussianProcessRegressor(kernel=kernel)\ngp.fit(X[training_idx], Y[training_idx])\n\nplt.figure(figsize=(8, 5))\nplt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction')\nplt.bar(training_idx, Y[training_idx], width=0.2, color='r',\n        alpha=1, label='training')\nplt.xticks(np.arange(len(X)), X)\nplt.title('Regression on sequences')\nplt.legend()\n\n'''\nClassification\n==============\n'''\n\nX_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])\n# whether there are 'A's in the sequence\nY_train = np.array([True, True, True, False, False, False])\n\ngp = GaussianProcessClassifier(kernel)\ngp.fit(X_train, Y_train)\n\nX_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']\nY_test = [True, True, False, False, False]\n\nplt.figure(figsize=(8, 5))\nplt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train],\n            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),\n            label='training')\nplt.scatter(len(X_train) + np.arange(len(X_test)),\n            [1.0 if c else -1.0 for c in Y_test],\n            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')\nplt.scatter(len(X_train) + np.arange(len(X_test)),\n            [1.0 if c else -1.0 for c in gp.predict(X_test)],\n     
       s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,\n            label='prediction')\nplt.xticks(np.arange(len(X_train) + len(X_test)),\n           np.concatenate((X_train, X_test)))\nplt.yticks([-1, 1], [False, True])\nplt.title('Classification on sequences')\nplt.legend()\n\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.7.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
@@ -0,0 +1,174 @@
+"""
+==========================================================================
+Gaussian processes on discrete data structures
+==========================================================================
+
+This example illustrates the use of Gaussian processes for regression and
+classification tasks on data that are not in fixed-length feature vector form.
+This is achieved through the use of kernel functions that operate directly
+on discrete structures such as variable-length sequences, trees, and graphs.
+
+Specifically, here the input variables are some gene sequences stored as
+variable-length strings consisting of letters 'A', 'T', 'C', and 'G',
+while the output variables are floating point numbers and True/False labels
+in the regression and classification tasks, respectively.
+
+A kernel between the gene sequences is defined using R-convolution [1]_ by
+integrating a binary letter-wise kernel over all pairs of letters among a pair
+of strings.
+
+This example will generate three figures.
+
+In the first figure, we visualize the value of the kernel, i.e. the similarity
+of the sequences, using a colormap. Brighter color here indicates higher
+similarity.
+
+In the second figure, we show regression results on a dataset of 6
+sequences. Here we use the 1st, 2nd, 4th, and 5th sequences as the training set
+to make predictions on the 3rd and 6th sequences.
+
+In the third figure, we demonstrate a classification model by training on 6
+sequences and making predictions on another 5 sequences. The ground truth here
+is simply whether there is at least one 'A' in the sequence. Here the model
+makes four correct classifications and fails on one.
+
+.. [1] Haussler, D. (1999). Convolution kernels on discrete structures
+       (Vol. 646). Technical report, Department of Computer Science, University
+       of California at Santa Cruz.
+"""
+print(__doc__)
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
+from sklearn.gaussian_process.kernels import GenericKernelMixin
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.base import clone
+
+
+# Kernel over variable-length strings, built on sklearn's Kernel and
+# GenericKernelMixin. It declares a single Hyperparameter named
+# "baseline_similarity".
+class SequenceKernel(GenericKernelMixin, Kernel):
+    '''
+    A minimal (but valid) convolutional kernel for sequences of variable
+    lengths.'''
+    # baseline_similarity: score contributed by each pair of letters that do
+    #     not match (matching pairs always contribute 1.0, see ``_f``).
+    # baseline_similarity_bounds: bounds passed to the Hyperparameter
+    #     descriptor returned by ``hyperparameter_baseline_similarity``.
+    def __init__(self,
+                 baseline_similarity=0.5,
+                 baseline_similarity_bounds=(1e-5, 1)):
+        self.baseline_similarity = baseline_similarity
+        self.baseline_similarity_bounds = baseline_similarity_bounds
+
+    # Describes ``baseline_similarity`` as a numeric hyperparameter with the
+    # bounds stored on this instance.
+    @property
+    def hyperparameter_baseline_similarity(self):
+        return Hyperparameter("baseline_similarity",
+                              "numeric",
+                              self.baseline_similarity_bounds)
+
+    # Sums a letter-wise score over every (c1, c2) pair taken from s1 x s2:
+    # 1.0 when the letters are equal, ``baseline_similarity`` otherwise.
+    def _f(self, s1, s2):
+        '''
+        kernel value between a pair of sequences
+        '''
+        return sum([1.0 if c1 == c2 else self.baseline_similarity
+                   for c1 in s1
+                   for c2 in s2])
+
+    # Counts the letter pairs that differ, which is the partial derivative of
+    # ``_f`` with respect to ``baseline_similarity``.
+    # NOTE(review): scikit-learn's built-in kernels return gradients with
+    # respect to the log of each hyperparameter; this value is the derivative
+    # with respect to baseline_similarity itself. Confirm whether a factor of
+    # self.baseline_similarity is missing.
+    def _g(self, s1, s2):
+        '''
+        kernel derivative between a pair of sequences
+        '''
+        return sum([0.0 if c1 == c2 else 1.0
+                    for c1 in s1
+                    for c2 in s2])
+
+    # Evaluates the kernel for every (x, y) with x in X and y in Y; Y falls
+    # back to X when omitted. The result is a (len(X), len(Y)) array of ``_f``
+    # values. With eval_gradient=True a tuple is returned whose second item
+    # holds the ``_g`` values in a (len(X), len(Y), 1) array.
+    def __call__(self, X, Y=None, eval_gradient=False):
+        if Y is None:
+            Y = X
+
+        if eval_gradient:
+            return (np.array([[self._f(x, y) for y in Y] for x in X]),
+                    np.array([[[self._g(x, y)] for y in Y] for x in X]))
+        else:
+            return np.array([[self._f(x, y) for y in Y] for x in X])
+
+    # Kernel value of each sequence with itself, computed directly so the
+    # off-diagonal entries of the full matrix are never evaluated.
+    def diag(self, X):
+        return np.array([self._f(x, x) for x in X])
+
+    # Always reports the kernel as non-stationary.
+    def is_stationary(self):
+        return False
+
+    # Makes a copy with sklearn.base.clone, assigns ``theta`` on the copy and
+    # returns it.
+    def clone_with_theta(self, theta):
+        cloned = clone(self)
+        cloned.theta = theta
+        return cloned
+
+
+# One kernel instance (default baseline_similarity=0.5) is reused by the
+# similarity plot, the regressor and the classifier below.
+kernel = SequenceKernel()
+
+'''
+Sequence similarity matrix under the kernel
+===========================================
+'''
+
+X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
+
+# Full pairwise kernel matrix and the self-similarity of each sequence.
+K = kernel(X)
+D = kernel.diag(X)
+
+plt.figure(figsize=(8, 5))
+# Scale K[i, j] by 1 / sqrt(D[i] * D[j]) so that every diagonal entry
+# becomes 1 before the matrix is drawn as an image.
+plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))
+# Label both axes with the sequences themselves.
+plt.xticks(np.arange(len(X)), X)
+plt.yticks(np.arange(len(X)), X)
+plt.title('Sequence similarity under the kernel')
+
+'''
+Regression
+==========
+'''
+
+# Six sequences with one float target each.
+X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
+Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])
+
+# Fit on the sequences at indices 0, 1, 3 and 4 only; indices 2 and 5 are
+# never shown to the model.
+training_idx = [0, 1, 3, 4]
+gp = GaussianProcessRegressor(kernel=kernel)
+gp.fit(X[training_idx], Y[training_idx])
+
+plt.figure(figsize=(8, 5))
+# Blue bars: predictions for all six sequences. Narrow red bars: the training
+# targets, drawn at the training positions only.
+plt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction')
+plt.bar(training_idx, Y[training_idx], width=0.2, color='r',
+        alpha=1, label='training')
+plt.xticks(np.arange(len(X)), X)
+plt.title('Regression on sequences')
+plt.legend()
+
+'''
+Classification
+==============
+'''
+
+X_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])
+# whether there are 'A's in the sequence
+Y_train = np.array([True, True, True, False, False, False])
+
+# The kernel is passed as the first positional argument.
+gp = GaussianProcessClassifier(kernel)
+gp.fit(X_train, Y_train)
+
+# Five further sequences with their expected labels under the same
+# "contains an 'A'" rule.
+X_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']
+Y_test = [True, True, False, False, False]
+
+plt.figure(figsize=(8, 5))
+# Labels are plotted as +1.0 (True) / -1.0 (False). Training points occupy
+# x = 0..len(X_train)-1; test points follow immediately to their right.
+plt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train],
+            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),
+            label='training')
+# Expected test labels as red circles.
+plt.scatter(len(X_train) + np.arange(len(X_test)),
+            [1.0 if c else -1.0 for c in Y_test],
+            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')
+# Predicted test labels as 'x' markers at the same x positions, so a
+# prediction that agrees with the truth lands on its circle.
+plt.scatter(len(X_train) + np.arange(len(X_test)),
+            [1.0 if c else -1.0 for c in gp.predict(X_test)],
+            s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,
+            label='prediction')
+# Tick every position with its sequence; relabel the two y levels.
+plt.xticks(np.arange(len(X_train) + len(X_test)),
+           np.concatenate((X_train, X_test)))
+plt.yticks([-1, 1], [False, True])
+plt.title('Classification on sequences')
+plt.legend()
+
+plt.show()