Commit e9f2708
Pushing the docs to dev/ for branch: master, commit 8360786895e12a63afe07aa66ad8a3fbf438b729
1 parent 4e1df6b commit e9f2708

1,226 files changed: +5415 −4093 lines
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Gaussian processes on discrete data structures\n\n\nThis example illustrates the use of Gaussian processes for regression and\nclassification tasks on data that are not in fixed-length feature vector form.\nThis is achieved through the use of kernel functions that operate directly\non discrete structures such as variable-length sequences, trees, and graphs.\n\nSpecifically, here the input variables are some gene sequences stored as\nvariable-length strings consisting of letters 'A', 'T', 'C', and 'G',\nwhile the output variables are floating point numbers and True/False labels\nin the regression and classification tasks, respectively.\n\nA kernel between the gene sequences is defined using R-convolution [1]_ by\nintegrating a binary letter-wise kernel over all pairs of letters among a pair\nof strings.\n\nThis example will generate three figures.\n\nIn the first figure, we visualize the value of the kernel, i.e. the similarity\nof the sequences, using a colormap. Brighter color here indicates higher\nsimilarity.\n\nIn the second figure, we show the regression results on a dataset of 6\nsequences. Here we use the 1st, 2nd, 4th, and 5th sequences as the training set\nto make predictions on the 3rd and 6th sequences.\n\nIn the third figure, we demonstrate a classification model by training on 6\nsequences and making predictions on another 5 sequences. The ground truth here\nis simply whether there is at least one 'A' in the sequence. Here the model\nmakes four correct classifications and fails on one.\n\n.. [1] Haussler, D. (1999). Convolution kernels on discrete structures\n(Vol. 646). Technical report, Department of Computer Science, University of\nCalifornia at Santa Cruz.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.gaussian_process.kernels import Kernel, Hyperparameter\nfrom sklearn.gaussian_process.kernels import GenericKernelMixin\nfrom sklearn.gaussian_process import GaussianProcessRegressor\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.base import clone\n\n\nclass SequenceKernel(GenericKernelMixin, Kernel):\n    '''\n    A minimal (but valid) convolutional kernel for sequences of variable\n    lengths.'''\n    def __init__(self,\n                 baseline_similarity=0.5,\n                 baseline_similarity_bounds=(1e-5, 1)):\n        self.baseline_similarity = baseline_similarity\n        self.baseline_similarity_bounds = baseline_similarity_bounds\n\n    @property\n    def hyperparameter_baseline_similarity(self):\n        return Hyperparameter(\"baseline_similarity\",\n                              \"numeric\",\n                              self.baseline_similarity_bounds)\n\n    def _f(self, s1, s2):\n        '''\n        kernel value between a pair of sequences\n        '''\n        return sum([1.0 if c1 == c2 else self.baseline_similarity\n                    for c1 in s1\n                    for c2 in s2])\n\n    def _g(self, s1, s2):\n        '''\n        kernel derivative between a pair of sequences\n        '''\n        return sum([0.0 if c1 == c2 else 1.0\n                    for c1 in s1\n                    for c2 in s2])\n\n    def __call__(self, X, Y=None, eval_gradient=False):\n        if Y is None:\n            Y = X\n\n        if eval_gradient:\n            return (np.array([[self._f(x, y) for y in Y] for x in X]),\n                    np.array([[[self._g(x, y)] for y in Y] for x in X]))\n        else:\n            return np.array([[self._f(x, y) for y in Y] for x in X])\n\n    def diag(self, X):\n        return np.array([self._f(x, x) for x in X])\n\n    def is_stationary(self):\n        return False\n\n    def clone_with_theta(self, theta):\n        cloned = clone(self)\n        cloned.theta = theta\n        return cloned\n\n\nkernel = SequenceKernel()\n\n'''\nSequence similarity matrix under the kernel\n===========================================\n'''\n\nX = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])\n\nK = kernel(X)\nD = kernel.diag(X)\n\nplt.figure(figsize=(8, 5))\nplt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))\nplt.xticks(np.arange(len(X)), X)\nplt.yticks(np.arange(len(X)), X)\nplt.title('Sequence similarity under the kernel')\n\n'''\nRegression\n==========\n'''\n\nX = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])\nY = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])\n\ntraining_idx = [0, 1, 3, 4]\ngp = GaussianProcessRegressor(kernel=kernel)\ngp.fit(X[training_idx], Y[training_idx])\n\nplt.figure(figsize=(8, 5))\nplt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction')\nplt.bar(training_idx, Y[training_idx], width=0.2, color='r',\n        alpha=1, label='training')\nplt.xticks(np.arange(len(X)), X)\nplt.title('Regression on sequences')\nplt.legend()\n\n'''\nClassification\n==============\n'''\n\nX_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])\n# whether there are 'A's in the sequence\nY_train = np.array([True, True, True, False, False, False])\n\ngp = GaussianProcessClassifier(kernel)\ngp.fit(X_train, Y_train)\n\nX_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']\nY_test = [True, True, False, False, False]\n\nplt.figure(figsize=(8, 5))\nplt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train],\n            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),\n            label='training')\nplt.scatter(len(X_train) + np.arange(len(X_test)),\n            [1.0 if c else -1.0 for c in Y_test],\n            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')\nplt.scatter(len(X_train) + np.arange(len(X_test)),\n            [1.0 if c else -1.0 for c in gp.predict(X_test)],\n            s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,\n            label='prediction')\nplt.xticks(np.arange(len(X_train) + len(X_test)),\n           np.concatenate((X_train, X_test)))\nplt.yticks([-1, 1], [False, True])\nplt.title('Classification on sequences')\nplt.legend()\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
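
For anyone who wants to inspect or run the generated notebook programmatically rather than in Jupyter, here is a minimal sketch using the nbformat package ("example.ipynb" is a placeholder file name, since the diff does not show the notebook's path):

import nbformat

# Load the notebook JSON shown above (placeholder file name).
nb = nbformat.read("example.ipynb", as_version=4)

# The three cells: a %matplotlib setup cell, a markdown description,
# and the example code itself.
for cell in nb.cells:
    print(cell.cell_type, len(cell.source), "characters")
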
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
"""
==========================================================================
Gaussian processes on discrete data structures
==========================================================================

This example illustrates the use of Gaussian processes for regression and
classification tasks on data that are not in fixed-length feature vector form.
This is achieved through the use of kernel functions that operate directly
on discrete structures such as variable-length sequences, trees, and graphs.

Specifically, here the input variables are some gene sequences stored as
variable-length strings consisting of letters 'A', 'T', 'C', and 'G',
while the output variables are floating point numbers and True/False labels
in the regression and classification tasks, respectively.

A kernel between the gene sequences is defined using R-convolution [1]_ by
integrating a binary letter-wise kernel over all pairs of letters among a pair
of strings.

This example will generate three figures.

In the first figure, we visualize the value of the kernel, i.e. the similarity
of the sequences, using a colormap. Brighter color here indicates higher
similarity.

In the second figure, we show the regression results on a dataset of 6
sequences. Here we use the 1st, 2nd, 4th, and 5th sequences as the training set
to make predictions on the 3rd and 6th sequences.

In the third figure, we demonstrate a classification model by training on 6
sequences and making predictions on another 5 sequences. The ground truth here
is simply whether there is at least one 'A' in the sequence. Here the model
makes four correct classifications and fails on one.

.. [1] Haussler, D. (1999). Convolution kernels on discrete structures
(Vol. 646). Technical report, Department of Computer Science, University of
California at Santa Cruz.
"""
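
# A compact restatement of the R-convolution construction from the docstring
# (a sketch in pseudo-notation; baseline_similarity is the hyperparameter of
# the SequenceKernel defined below):
#
#     k(s1, s2) = sum over c1 in s1, c2 in s2 of k_letter(c1, c2)
#     k_letter(c1, c2) = 1.0 if c1 == c2 else baseline_similarity
#
# Each pair of letters thus contributes either a full match or the tunable
# baseline similarity value.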
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
from sklearn.gaussian_process.kernels import GenericKernelMixin
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.base import clone


class SequenceKernel(GenericKernelMixin, Kernel):
    '''
    A minimal (but valid) convolutional kernel for sequences of variable
    lengths.'''
    def __init__(self,
                 baseline_similarity=0.5,
                 baseline_similarity_bounds=(1e-5, 1)):
        self.baseline_similarity = baseline_similarity
        self.baseline_similarity_bounds = baseline_similarity_bounds

    @property
    def hyperparameter_baseline_similarity(self):
        return Hyperparameter("baseline_similarity",
                              "numeric",
                              self.baseline_similarity_bounds)

    def _f(self, s1, s2):
        '''
        kernel value between a pair of sequences
        '''
        return sum([1.0 if c1 == c2 else self.baseline_similarity
                    for c1 in s1
                    for c2 in s2])

    def _g(self, s1, s2):
        '''
        kernel derivative between a pair of sequences
        '''
        return sum([0.0 if c1 == c2 else 1.0
                    for c1 in s1
                    for c2 in s2])

    def __call__(self, X, Y=None, eval_gradient=False):
        if Y is None:
            Y = X

        if eval_gradient:
            return (np.array([[self._f(x, y) for y in Y] for x in X]),
                    np.array([[[self._g(x, y)] for y in Y] for x in X]))
        else:
            return np.array([[self._f(x, y) for y in Y] for x in X])

    def diag(self, X):
        return np.array([self._f(x, x) for x in X])

    def is_stationary(self):
        return False

    def clone_with_theta(self, theta):
        cloned = clone(self)
        cloned.theta = theta
        return cloned
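
# A quick sanity check of the pairwise sum (an illustrative check; _f is the
# helper defined above). Comparing 'AGC' with 'AA' gives 2 matching pairs and
# 4 mismatching ones, so with the default baseline_similarity=0.5 the kernel
# value is 2 * 1.0 + 4 * 0.5 = 4.0:
assert SequenceKernel()._f('AGC', 'AA') == 2 * 1.0 + 4 * 0.5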

kernel = SequenceKernel()

'''
Sequence similarity matrix under the kernel
===========================================
'''

X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])

K = kernel(X)
D = kernel.diag(X)

plt.figure(figsize=(8, 5))
plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))
plt.xticks(np.arange(len(X)), X)
plt.yticks(np.arange(len(X)), X)
plt.title('Sequence similarity under the kernel')
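
# The matrix shown above is the cosine-normalized kernel,
# K_norm[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]), which is exactly what the
# two np.diag(D**-0.5) factors compute, so its diagonal is identically 1.
# A brief check using the K and D computed above:
K_norm = np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5))
assert np.allclose(np.diag(K_norm), 1.0)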

'''
Regression
==========
'''

X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])

training_idx = [0, 1, 3, 4]
gp = GaussianProcessRegressor(kernel=kernel)
gp.fit(X[training_idx], Y[training_idx])

plt.figure(figsize=(8, 5))
plt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction')
plt.bar(training_idx, Y[training_idx], width=0.2, color='r',
        alpha=1, label='training')
plt.xticks(np.arange(len(X)), X)
plt.title('Regression on sequences')
plt.legend()
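
# Besides point predictions, the posterior of the fitted regressor provides
# uncertainty estimates; a small sketch for the two held-out sequences
# (return_std is part of the standard GaussianProcessRegressor.predict
# signature):
y_mean, y_std = gp.predict(X[[2, 5]], return_std=True)
for seq, mean, std in zip(X[[2, 5]], y_mean, y_std):
    print('%s: %.2f +/- %.2f' % (seq, mean, std))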

'''
Classification
==============
'''

X_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])
# whether there are 'A's in the sequence
Y_train = np.array([True, True, True, False, False, False])

gp = GaussianProcessClassifier(kernel)
gp.fit(X_train, Y_train)

X_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']
Y_test = [True, True, False, False, False]

plt.figure(figsize=(8, 5))
plt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train],
            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),
            label='training')
plt.scatter(len(X_train) + np.arange(len(X_test)),
            [1.0 if c else -1.0 for c in Y_test],
            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')
plt.scatter(len(X_train) + np.arange(len(X_test)),
            [1.0 if c else -1.0 for c in gp.predict(X_test)],
            s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,
            label='prediction')
plt.xticks(np.arange(len(X_train) + len(X_test)),
           np.concatenate((X_train, X_test)))
plt.yticks([-1, 1], [False, True])
plt.title('Classification on sequences')
plt.legend()
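
# The classifier also exposes class probabilities; a short sketch using the
# fitted model above (predict_proba is part of the standard
# GaussianProcessClassifier API; column 1 corresponds to the class True since
# classes_ is sorted as [False, True]):
proba = gp.predict_proba(X_test)
print('P(sequence contains an A):', dict(zip(X_test, proba[:, 1])))
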
plt.show()

dev/_downloads/scikit-learn-docs.pdf (62.7 KB): Binary file not shown.
dev/_images/iris.png (0 Bytes): Binary file not shown.
