
Commit 36d8028

Pushing the docs to dev/ for branch: main, commit d8592a682aef7ffcb88932610bf8c1cc44739ebd
1 parent 9a70a47 commit 36d8028

1,232 files changed: 4,570 additions & 4,450 deletions


dev/_downloads/348dd747b709a747e14c8bcdddf0a9b6/plot_gpr_on_structured_data.py

Lines changed: 15 additions & 15 deletions
@@ -38,8 +38,8 @@
 
 """
 
+# %%
 import numpy as np
-import matplotlib.pyplot as plt
 from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
 from sklearn.gaussian_process.kernels import GenericKernelMixin
 from sklearn.gaussian_process import GaussianProcessRegressor
@@ -102,10 +102,11 @@ def clone_with_theta(self, theta):
 
 kernel = SequenceKernel()
 
-"""
-Sequence similarity matrix under the kernel
-===========================================
-"""
+# %%
+# Sequence similarity matrix under the kernel
+# ===========================================
+
+import matplotlib.pyplot as plt
 
 X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"])
 
@@ -117,11 +118,11 @@ def clone_with_theta(self, theta):
 plt.xticks(np.arange(len(X)), X)
 plt.yticks(np.arange(len(X)), X)
 plt.title("Sequence similarity under the kernel")
+plt.show()
 
-"""
-Regression
-==========
-"""
+# %%
+# Regression
+# ==========
 
 X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"])
 Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])
@@ -136,11 +137,11 @@ def clone_with_theta(self, theta):
 plt.xticks(np.arange(len(X)), X)
 plt.title("Regression on sequences")
 plt.legend()
+plt.show()
 
-"""
-Classification
-==============
-"""
+# %%
+# Classification
+# ==============
 
 X_train = np.array(["AGCT", "CGA", "TAAC", "TCG", "CTTT", "TGCT"])
 # whether there are 'A's in the sequence
@@ -176,13 +177,12 @@ def clone_with_theta(self, theta):
     [1.0 if c else -1.0 for c in gp.predict(X_test)],
     s=100,
     marker="x",
-    edgecolor=(0, 1.0, 0.3),
+    facecolor="b",
     linewidth=2,
     label="prediction",
 )
 plt.xticks(np.arange(len(X_train) + len(X_test)), np.concatenate((X_train, X_test)))
 plt.yticks([-1, 1], [False, True])
 plt.title("Classification on sequences")
 plt.legend()
-
 plt.show()
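
For context, `# %%` is the cell-marker syntax used by sphinx-gallery's notebook-style examples: each `# %%` line starts a new cell, and the comment lines immediately following it are rendered as the cell's text, while the remaining lines run as ordinary Python. A minimal illustrative sketch of that layout (a hypothetical file, not part of this commit):

# %%
# Section title
# =============
#
# Comment lines directly under the marker become the rendered text of the cell.

import numpy as np

# %%
# Next section
# ------------
#
# Code under each marker executes as a normal script.
print(np.arange(3))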

dev/_downloads/46c19b52b5a5ab5796725eb7e0688309/plot_gpr_on_structured_data.ipynb

Lines changed: 55 additions & 1 deletion
@@ -26,7 +26,61 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.gaussian_process.kernels import Kernel, Hyperparameter\nfrom sklearn.gaussian_process.kernels import GenericKernelMixin\nfrom sklearn.gaussian_process import GaussianProcessRegressor\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.base import clone\n\n\nclass SequenceKernel(GenericKernelMixin, Kernel):\n \"\"\"\n A minimal (but valid) convolutional kernel for sequences of variable\n lengths.\"\"\"\n\n def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):\n self.baseline_similarity = baseline_similarity\n self.baseline_similarity_bounds = baseline_similarity_bounds\n\n @property\n def hyperparameter_baseline_similarity(self):\n return Hyperparameter(\n \"baseline_similarity\", \"numeric\", self.baseline_similarity_bounds\n )\n\n def _f(self, s1, s2):\n \"\"\"\n kernel value between a pair of sequences\n \"\"\"\n return sum(\n [1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2]\n )\n\n def _g(self, s1, s2):\n \"\"\"\n kernel derivative between a pair of sequences\n \"\"\"\n return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])\n\n def __call__(self, X, Y=None, eval_gradient=False):\n if Y is None:\n Y = X\n\n if eval_gradient:\n return (\n np.array([[self._f(x, y) for y in Y] for x in X]),\n np.array([[[self._g(x, y)] for y in Y] for x in X]),\n )\n else:\n return np.array([[self._f(x, y) for y in Y] for x in X])\n\n def diag(self, X):\n return np.array([self._f(x, x) for x in X])\n\n def is_stationary(self):\n return False\n\n def clone_with_theta(self, theta):\n cloned = clone(self)\n cloned.theta = theta\n return cloned\n\n\nkernel = SequenceKernel()\n\n\"\"\"\nSequence similarity matrix under the kernel\n===========================================\n\"\"\"\n\nX = np.array([\"AGCT\", \"AGC\", \"AACT\", \"TAA\", \"AAA\", \"GAACA\"])\n\nK = kernel(X)\nD = kernel.diag(X)\n\nplt.figure(figsize=(8, 5))\nplt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))\nplt.xticks(np.arange(len(X)), X)\nplt.yticks(np.arange(len(X)), X)\nplt.title(\"Sequence similarity under the kernel\")\n\n\"\"\"\nRegression\n==========\n\"\"\"\n\nX = np.array([\"AGCT\", \"AGC\", \"AACT\", \"TAA\", \"AAA\", \"GAACA\"])\nY = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])\n\ntraining_idx = [0, 1, 3, 4]\ngp = GaussianProcessRegressor(kernel=kernel)\ngp.fit(X[training_idx], Y[training_idx])\n\nplt.figure(figsize=(8, 5))\nplt.bar(np.arange(len(X)), gp.predict(X), color=\"b\", label=\"prediction\")\nplt.bar(training_idx, Y[training_idx], width=0.2, color=\"r\", alpha=1, label=\"training\")\nplt.xticks(np.arange(len(X)), X)\nplt.title(\"Regression on sequences\")\nplt.legend()\n\n\"\"\"\nClassification\n==============\n\"\"\"\n\nX_train = np.array([\"AGCT\", \"CGA\", \"TAAC\", \"TCG\", \"CTTT\", \"TGCT\"])\n# whether there are 'A's in the sequence\nY_train = np.array([True, True, True, False, False, False])\n\ngp = GaussianProcessClassifier(kernel)\ngp.fit(X_train, Y_train)\n\nX_test = [\"AAA\", \"ATAG\", \"CTC\", \"CT\", \"C\"]\nY_test = [True, True, False, False, False]\n\nplt.figure(figsize=(8, 5))\nplt.scatter(\n np.arange(len(X_train)),\n [1.0 if c else -1.0 for c in Y_train],\n s=100,\n marker=\"o\",\n edgecolor=\"none\",\n facecolor=(1, 0.75, 0),\n label=\"training\",\n)\nplt.scatter(\n len(X_train) + np.arange(len(X_test)),\n [1.0 if c else -1.0 for c in Y_test],\n s=100,\n marker=\"o\",\n edgecolor=\"none\",\n facecolor=\"r\",\n label=\"truth\",\n)\nplt.scatter(\n len(X_train) + np.arange(len(X_test)),\n [1.0 if c else -1.0 for c in gp.predict(X_test)],\n s=100,\n marker=\"x\",\n edgecolor=(0, 1.0, 0.3),\n linewidth=2,\n label=\"prediction\",\n)\nplt.xticks(np.arange(len(X_train) + len(X_test)), np.concatenate((X_train, X_test)))\nplt.yticks([-1, 1], [False, True])\nplt.title(\"Classification on sequences\")\nplt.legend()\n\nplt.show()"
+"import numpy as np\nfrom sklearn.gaussian_process.kernels import Kernel, Hyperparameter\nfrom sklearn.gaussian_process.kernels import GenericKernelMixin\nfrom sklearn.gaussian_process import GaussianProcessRegressor\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.base import clone\n\n\nclass SequenceKernel(GenericKernelMixin, Kernel):\n \"\"\"\n A minimal (but valid) convolutional kernel for sequences of variable\n lengths.\"\"\"\n\n def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):\n self.baseline_similarity = baseline_similarity\n self.baseline_similarity_bounds = baseline_similarity_bounds\n\n @property\n def hyperparameter_baseline_similarity(self):\n return Hyperparameter(\n \"baseline_similarity\", \"numeric\", self.baseline_similarity_bounds\n )\n\n def _f(self, s1, s2):\n \"\"\"\n kernel value between a pair of sequences\n \"\"\"\n return sum(\n [1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2]\n )\n\n def _g(self, s1, s2):\n \"\"\"\n kernel derivative between a pair of sequences\n \"\"\"\n return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])\n\n def __call__(self, X, Y=None, eval_gradient=False):\n if Y is None:\n Y = X\n\n if eval_gradient:\n return (\n np.array([[self._f(x, y) for y in Y] for x in X]),\n np.array([[[self._g(x, y)] for y in Y] for x in X]),\n )\n else:\n return np.array([[self._f(x, y) for y in Y] for x in X])\n\n def diag(self, X):\n return np.array([self._f(x, x) for x in X])\n\n def is_stationary(self):\n return False\n\n def clone_with_theta(self, theta):\n cloned = clone(self)\n cloned.theta = theta\n return cloned\n\n\nkernel = SequenceKernel()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Sequence similarity matrix under the kernel\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nX = np.array([\"AGCT\", \"AGC\", \"AACT\", \"TAA\", \"AAA\", \"GAACA\"])\n\nK = kernel(X)\nD = kernel.diag(X)\n\nplt.figure(figsize=(8, 5))\nplt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))\nplt.xticks(np.arange(len(X)), X)\nplt.yticks(np.arange(len(X)), X)\nplt.title(\"Sequence similarity under the kernel\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Regression\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X = np.array([\"AGCT\", \"AGC\", \"AACT\", \"TAA\", \"AAA\", \"GAACA\"])\nY = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])\n\ntraining_idx = [0, 1, 3, 4]\ngp = GaussianProcessRegressor(kernel=kernel)\ngp.fit(X[training_idx], Y[training_idx])\n\nplt.figure(figsize=(8, 5))\nplt.bar(np.arange(len(X)), gp.predict(X), color=\"b\", label=\"prediction\")\nplt.bar(training_idx, Y[training_idx], width=0.2, color=\"r\", alpha=1, label=\"training\")\nplt.xticks(np.arange(len(X)), X)\nplt.title(\"Regression on sequences\")\nplt.legend()\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Classification\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X_train = np.array([\"AGCT\", \"CGA\", \"TAAC\", \"TCG\", \"CTTT\", \"TGCT\"])\n# whether there are 'A's in the sequence\nY_train = np.array([True, True, True, False, False, False])\n\ngp = GaussianProcessClassifier(kernel)\ngp.fit(X_train, Y_train)\n\nX_test = [\"AAA\", \"ATAG\", \"CTC\", \"CT\", \"C\"]\nY_test = [True, True, False, False, False]\n\nplt.figure(figsize=(8, 5))\nplt.scatter(\n np.arange(len(X_train)),\n [1.0 if c else -1.0 for c in Y_train],\n s=100,\n marker=\"o\",\n edgecolor=\"none\",\n facecolor=(1, 0.75, 0),\n label=\"training\",\n)\nplt.scatter(\n len(X_train) + np.arange(len(X_test)),\n [1.0 if c else -1.0 for c in Y_test],\n s=100,\n marker=\"o\",\n edgecolor=\"none\",\n facecolor=\"r\",\n label=\"truth\",\n)\nplt.scatter(\n len(X_train) + np.arange(len(X_test)),\n [1.0 if c else -1.0 for c in gp.predict(X_test)],\n s=100,\n marker=\"x\",\n facecolor=\"b\",\n linewidth=2,\n label=\"prediction\",\n)\nplt.xticks(np.arange(len(X_train) + len(X_test)), np.concatenate((X_train, X_test)))\nplt.yticks([-1, 1], [False, True])\nplt.title(\"Classification on sequences\")\nplt.legend()\nplt.show()"
 ]
 }
],
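
For context, each `# %%` block of the script maps to a separate cell in the generated `.ipynb`, which is plain JSON like the fragments above. A minimal sketch of building such cells programmatically, assuming the `nbformat` package (illustrative only; not necessarily how the docs build produces this file):

import nbformat
from nbformat import v4

# Build a notebook whose cells mirror the markdown/code split shown in the diff.
nb = v4.new_notebook()
nb.cells.append(v4.new_code_cell("import numpy as np"))
nb.cells.append(v4.new_markdown_cell("## Regression\n"))
nb.cells.append(v4.new_code_cell("print(np.arange(3))"))

# Serialize to the JSON structure seen above.
nbformat.write(nb, "sketch.ipynb")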
dev/_downloads/scikit-learn-docs.zip: -1.94 KB (binary file, contents not shown)

Other binary files changed: -124 Bytes, -130 Bytes, 44 Bytes, 7 Bytes, 24 Bytes (contents not shown).

0 commit comments
