Skip to content

Commit 49cb1ba

Browse files
committed
markov chain language model
1 parent 4ce600c commit 49cb1ba

File tree

2 files changed

+506
-20
lines changed

2 files changed

+506
-20
lines changed

class_17/.ipynb_checkpoints/MackovChain_LM-checkpoint.ipynb

Lines changed: 253 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,194 @@
2626
},
2727
{
2828
"cell_type": "code",
29-
"execution_count": 7,
29+
"execution_count": 25,
30+
"metadata": {
31+
"collapsed": false
32+
},
33+
"outputs": [],
34+
"source": [
35+
"d = d.decode('utf-8')"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 26,
41+
"metadata": {
42+
"collapsed": false
43+
},
44+
"outputs": [
45+
{
46+
"name": "stdout",
47+
"output_type": "stream",
48+
"text": [
49+
"1690964\n"
50+
]
51+
}
52+
],
53+
"source": [
54+
"print len(d)/2"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 27,
60+
"metadata": {
61+
"collapsed": false
62+
},
63+
"outputs": [],
64+
"source": [
65+
"data = d.strip().lower()"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 34,
71+
"metadata": {
72+
"collapsed": false
73+
},
74+
"outputs": [
75+
{
76+
"name": "stdout",
77+
"output_type": "stream",
78+
"text": [
79+
".....\n",
80+
".....\n",
81+
".....\n",
82+
".....\n",
83+
".....\n",
84+
".....\n",
85+
".....\n",
86+
".....\n",
87+
".....\n",
88+
".....\n"
89+
]
90+
}
91+
],
92+
"source": [
"# language model\n",
"# Character-level Markov chain: lm[ctx][nxt] counts how often the\n",
"# character nxt follows the order-character context ctx in data.\n",
"order = 3\n",
"lm = {}\n",
"# Cap at the corpus length so dt[ix+order] can never run off the end.\n",
"n = min(100000, len(data))\n",
"\n",
"# Pad once, outside the loop; rebuilding this copy of the whole corpus\n",
"# on every iteration was the dominant cost of the cell.\n",
"dt = '*'*order + data\n",
"\n",
"for ix in range(n):\n",
"    ctx = dt[ix:ix+order]\n",
"    nxt = dt[ix+order]\n",
"\n",
"    # Progress marker at every 10% of the run (integer form of\n",
"    # (100.0*ix/n) % 10 == 0, which avoids a float equality test).\n",
"    if (ix*100) % (n*10) == 0:\n",
"        print('.....')\n",
"\n",
"    # Create the per-context table on first sight, then bump the count.\n",
"    followers = lm.setdefault(ctx, {})\n",
"    followers[nxt] = followers.get(nxt, 0) + 1"
115+
]
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": 79,
120+
"metadata": {
121+
"collapsed": false,
122+
"scrolled": true
123+
},
124+
"outputs": [],
125+
"source": [
"# Convert each context's follower counts into relative frequencies.\n",
"for followers in lm.values():\n",
"    total = float(sum(followers.values()))\n",
"    for ch in list(followers):\n",
"        followers[ch] = followers[ch] / total"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": 52,
138+
"metadata": {
139+
"collapsed": false
140+
},
141+
"outputs": [
142+
{
143+
"data": {
144+
"text/plain": [
145+
"{u' ': 0.2, u\"'\": 0.2, u'k': 0.1, u'l': 0.3, u'y': 0.2}"
146+
]
147+
},
148+
"execution_count": 52,
149+
"metadata": {},
150+
"output_type": "execute_result"
151+
}
152+
],
153+
"source": [
154+
"lm['er']"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": 72,
160+
"metadata": {
161+
"collapsed": true
162+
},
163+
"outputs": [],
164+
"source": [
165+
"def generate_chars(ctx, model):\n",
166+
" r = np.random.random()\n",
167+
" try:\n",
168+
" possible = model[ctx]\n",
169+
" for k in possible.keys():\n",
170+
" if r-possible[k] < 0:\n",
171+
" return k\n",
172+
" return possible.keys()[int(len(possible.keys())*r)]\n",
173+
" except:\n",
174+
" return ' '"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": 74,
180+
"metadata": {
181+
"collapsed": false
182+
},
183+
"outputs": [
184+
{
185+
"data": {
186+
"text/plain": [
187+
"' '"
188+
]
189+
},
190+
"execution_count": 74,
191+
"metadata": {},
192+
"output_type": "execute_result"
193+
}
194+
],
195+
"source": [
196+
"generate_chars('sg', lm)\n"
197+
]
198+
},
199+
{
200+
"cell_type": "code",
201+
"execution_count": 77,
202+
"metadata": {
203+
"collapsed": true
204+
},
205+
"outputs": [],
206+
"source": [
"def sample(start='*', length=500):\n",
"    \"\"\"Generate text from the global model `lm`.\n",
"\n",
"    The text is seeded with `start` repeated `order` times, then\n",
"    `length` characters are appended one at a time, each drawn by\n",
"    generate_chars from the last `order` characters produced so far.\n",
"    Returns the seed followed by the generated characters.\n",
"    \"\"\"\n",
"    text = start*order\n",
"    for _ in range(length):\n",
"        # Always condition on the tail of the text. Slicing from the\n",
"        # loop index only matched the tail when start was one character.\n",
"        ctx = text[max(len(text) - order, 0):]\n",
"        text += generate_chars(ctx, lm)\n",
"    return text"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": 78,
30217
"metadata": {
31218
"collapsed": false
32219
},
@@ -35,20 +222,76 @@
35222
"name": "stdout",
36223
"output_type": "stream",
37224
"text": [
38-
" The Sign of the Four\n",
225+
"**the specherk\n",
226+
" ther's of th the arthe five advents\n",
227+
" th thes\n",
228+
" thure fache a silvenge adve pips\n",
229+
"\n",
230+
"\n",
231+
"\n",
232+
" thery\n",
233+
" advents\n",
234+
"\n",
235+
" the fache musg th therlet\n",
236+
"\n",
237+
" th the adventured bache vallock holmes\n",
238+
" the the adventure table the a she face\n",
239+
" the ented lip\n",
240+
" twity\n",
241+
" the the ster copperk\n",
242+
" the pips\n",
243+
"\n",
244+
" tabletengin sperleague\n",
245+
"\n",
246+
"\n",
247+
" twisted bacher's of twistock holmes\n",
248+
"\n",
249+
"\n",
250+
" the yellock holmes\n",
251+
" the facherk\n",
252+
" the four bace\n",
253+
"\n",
254+
" a sign doyl can of ste scarle\n",
255+
" th th the of the yelow face\n",
256+
"\n",
257+
" table noblaze\n",
258+
" twistockleted lips\n",
259+
" table pips\n",
260+
" arthes\n",
261+
" the of tablue stured-herlock holmes\n",
262+
" tableadented ley ided le noblue adventure aded lips\n",
39263
"\n",
40-
" The Adventures of Sherlock Holmes\n",
41-
" A Scandal in Bohemia\n",
42-
" The Red-Headed League\n",
43-
" A Case of Identity\n",
44-
" The Boscombe Valley Mystery\n",
45-
" The Five Orange Pips\n",
46-
" The Man wit\n"
264+
" the sign of the nobled-he adventure pips\n",
265+
" the a sign of th the foure mes\n",
266+
" the ented bl\n"
267+
]
268+
}
269+
],
270+
"source": [
271+
"print sample(length=1000)"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": 62,
277+
"metadata": {
278+
"collapsed": false
279+
},
280+
"outputs": [
281+
{
282+
"name": "stdout",
283+
"output_type": "stream",
284+
"text": [
285+
"**\n",
286+
"**1\n"
47287
]
48288
}
49289
],
50290
"source": [
51-
"print d[200:500]"
291+
"a = '**'\n",
292+
"print a[0:0+2]\n",
293+
"a += '1'\n",
294+
"print a"
52295
]
53296
}
54297
],

0 commit comments

Comments
 (0)