Skip to content

Commit f900e09

Browse files
committed
language modelling with w2v
1 parent a81a18f commit f900e09

File tree

8 files changed

+5790
-6
lines changed

8 files changed

+5790
-6
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
class_03/data/*
22
*.pyc
3+
4+
*.npy

class_12/.ipynb_checkpoints/CNN-checkpoint.ipynb

Lines changed: 370 additions & 0 deletions
Large diffs are not rendered by default.

class_12/.ipynb_checkpoints/Convolutions-checkpoint.ipynb

Lines changed: 105 additions & 3 deletions
Large diffs are not rendered by default.
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": [
11+
"import numpy as np\n",
12+
"from matplotlib import pyplot as plt\n",
13+
"%matplotlib inline"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": 2,
19+
"metadata": {
20+
"collapsed": true
21+
},
22+
"outputs": [],
23+
"source": [
24+
"f = open('./data.txt')\n",
25+
"d = f.read()\n",
26+
"f.close()"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": 102,
32+
"metadata": {
33+
"collapsed": false
34+
},
35+
"outputs": [],
36+
"source": [
37+
"data = d[1260:]\n",
38+
"data = data.lower().decode('utf-8')\n",
39+
"import re"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": 99,
45+
"metadata": {
46+
"collapsed": false
47+
},
48+
"outputs": [],
49+
"source": [
50+
"p = re.sub('[^A-Za-z]+', ' ', data)\n",
51+
"ds = p.split()"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 100,
57+
"metadata": {
58+
"collapsed": false
59+
},
60+
"outputs": [],
61+
"source": [
62+
"u = np.unique(ds, return_counts=True)"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 145,
68+
"metadata": {
69+
"collapsed": false,
70+
"scrolled": false
71+
},
72+
"outputs": [],
73+
"source": [
74+
"bow = {}\n",
75+
"rev_bow = {}\n",
76+
"i = 0\n",
77+
"for ix in range(len(u[0])):\n",
78+
" if u[1][ix] > 2:\n",
79+
" bow[i] = u[0][ix]\n",
80+
" rev_bow[u[0][ix]] = i\n",
81+
" i += 1"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 146,
87+
"metadata": {
88+
"collapsed": false
89+
},
90+
"outputs": [
91+
{
92+
"data": {
93+
"text/plain": [
94+
"1781"
95+
]
96+
},
97+
"execution_count": 146,
98+
"metadata": {},
99+
"output_type": "execute_result"
100+
}
101+
],
102+
"source": [
103+
"len(bow)"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": 147,
109+
"metadata": {
110+
"collapsed": true
111+
},
112+
"outputs": [],
113+
"source": [
114+
"def get_one_hot_vector(word):\n",
115+
" vec = np.zeros((len(bow),))\n",
116+
" vec[rev_bow[word]] = 1.0\n",
117+
" \n",
118+
" return vec\n",
119+
"\n",
120+
"def get_word_from_vec(vec):\n",
121+
" ind = np.argmax(vec)\n",
122+
" \n",
123+
" return bow[ind]"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": 148,
129+
"metadata": {
130+
"collapsed": false,
131+
"scrolled": false
132+
},
133+
"outputs": [
134+
{
135+
"name": "stdout",
136+
"output_type": "stream",
137+
"text": [
138+
"tree\n"
139+
]
140+
}
141+
],
142+
"source": [
143+
"a = get_one_hot_vector('tree')\n",
144+
"a_ = get_word_from_vec(a)\n",
145+
"\n",
146+
"print a_"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 156,
152+
"metadata": {
153+
"collapsed": false,
154+
"scrolled": false
155+
},
156+
"outputs": [],
157+
"source": [
158+
"all_data = p.split()\n",
159+
"len(all_data)\n",
160+
"\n",
161+
"dataset = []#np.zeros((len(all_data), len(bow)))\n",
162+
"# print dataset.shape"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": 157,
168+
"metadata": {
169+
"collapsed": false
170+
},
171+
"outputs": [],
172+
"source": [
173+
"for w in range(len(all_data)):\n",
174+
" try:\n",
175+
" dataset.append(get_one_hot_vector(all_data[w]))\n",
176+
" except:\n",
177+
" pass"
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": 158,
183+
"metadata": {
184+
"collapsed": false
185+
},
186+
"outputs": [
187+
{
188+
"name": "stdout",
189+
"output_type": "stream",
190+
"text": [
191+
"(35456, 1781)\n"
192+
]
193+
}
194+
],
195+
"source": [
196+
"dataset = np.asarray(dataset)\n",
197+
"print dataset.shape"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": 159,
203+
"metadata": {
204+
"collapsed": true
205+
},
206+
"outputs": [],
207+
"source": [
208+
"np.save('all_word_data', dataset)"
209+
]
210+
}
211+
],
212+
"metadata": {
213+
"kernelspec": {
214+
"display_name": "Python 2",
215+
"language": "python",
216+
"name": "python2"
217+
},
218+
"language_info": {
219+
"codemirror_mode": {
220+
"name": "ipython",
221+
"version": 2
222+
},
223+
"file_extension": ".py",
224+
"mimetype": "text/x-python",
225+
"name": "python",
226+
"nbconvert_exporter": "python",
227+
"pygments_lexer": "ipython2",
228+
"version": "2.7.12"
229+
}
230+
},
231+
"nbformat": 4,
232+
"nbformat_minor": 2
233+
}

class_12/CNN.ipynb

Lines changed: 381 additions & 0 deletions
Large diffs are not rendered by default.

class_12/Convolutions.ipynb

Lines changed: 150 additions & 3 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)