|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": 2, |
| 14 | + "execution_count": 1, |
15 | 15 | "metadata": { |
16 | 16 | "collapsed": false |
17 | 17 | }, |
|
24 | 24 | }, |
25 | 25 | { |
26 | 26 | "cell_type": "code", |
27 | | - "execution_count": 3, |
| 27 | + "execution_count": 2, |
28 | 28 | "metadata": { |
29 | 29 | "collapsed": false |
30 | 30 | }, |
|
35 | 35 | "[u'PRIDE AND PREJUDICE', u'', u'By Jane Austen', u'', u'']" |
36 | 36 | ] |
37 | 37 | }, |
38 | | - "execution_count": 3, |
| 38 | + "execution_count": 2, |
39 | 39 | "metadata": {}, |
40 | 40 | "output_type": "execute_result" |
41 | 41 | } |
42 | 42 | ], |
43 | 43 | "source": [ |
44 | 44 | "# first 5 elements of the RDD\n", |
45 | | - "text_file.take(5)" |
| 45 | + "text_file.take(5)+" |
46 | 46 | ] |
47 | 47 | }, |
48 | 48 | { |
49 | 49 | "cell_type": "code", |
50 | | - "execution_count": 4, |
| 50 | + "execution_count": 3, |
51 | 51 | "metadata": { |
52 | 52 | "collapsed": false |
53 | 53 | }, |
|
58 | 58 | "unicode" |
59 | 59 | ] |
60 | 60 | }, |
61 | | - "execution_count": 4, |
| 61 | + "execution_count": 3, |
62 | 62 | "metadata": {}, |
63 | 63 | "output_type": "execute_result" |
64 | 64 | } |
|
69 | 69 | }, |
70 | 70 | { |
71 | 71 | "cell_type": "code", |
72 | | - "execution_count": 5, |
| 72 | + "execution_count": 4, |
73 | 73 | "metadata": { |
74 | 74 | "collapsed": false |
75 | 75 | }, |
|
80 | 80 | "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'" |
81 | 81 | ] |
82 | 82 | }, |
83 | | - "execution_count": 5, |
| 83 | + "execution_count": 4, |
84 | 84 | "metadata": {}, |
85 | 85 | "output_type": "execute_result" |
86 | 86 | } |
|
91 | 91 | }, |
92 | 92 | { |
93 | 93 | "cell_type": "code", |
94 | | - "execution_count": 6, |
| 94 | + "execution_count": 5, |
95 | 95 | "metadata": { |
96 | 96 | "collapsed": true |
97 | 97 | }, |
|
181 | 181 | }, |
182 | 182 | { |
183 | 183 | "cell_type": "code", |
184 | | - "execution_count": 10, |
| 184 | + "execution_count": 22, |
185 | 185 | "metadata": { |
186 | 186 | "collapsed": false |
187 | 187 | }, |
|
192 | 192 | "[['pride', 'and', 'prejudice'], [], ['by', 'jane', 'austen'], [], []]" |
193 | 193 | ] |
194 | 194 | }, |
195 | | - "execution_count": 10, |
| 195 | + "execution_count": 22, |
196 | 196 | "metadata": {}, |
197 | 197 | "output_type": "execute_result" |
198 | 198 | } |
|
204 | 204 | }, |
205 | 205 | { |
206 | 206 | "cell_type": "code", |
207 | | - "execution_count": 11, |
| 207 | + "execution_count": 10, |
208 | 208 | "metadata": { |
209 | 209 | "collapsed": false |
210 | 210 | }, |
|
215 | 215 | "['pride', 'and', 'prejudice', 'by', 'jane']" |
216 | 216 | ] |
217 | 217 | }, |
218 | | - "execution_count": 11, |
| 218 | + "execution_count": 10, |
219 | 219 | "metadata": {}, |
220 | 220 | "output_type": "execute_result" |
221 | 221 | } |
|
227 | 227 | }, |
228 | 228 | { |
229 | 229 | "cell_type": "code", |
230 | | - "execution_count": 12, |
| 230 | + "execution_count": 11, |
231 | 231 | "metadata": { |
232 | 232 | "collapsed": false |
233 | 233 | }, |
|
238 | 238 | "[('pride', 1), ('and', 1), ('prejudice', 1), ('by', 1), ('jane', 1)]" |
239 | 239 | ] |
240 | 240 | }, |
241 | | - "execution_count": 12, |
| 241 | + "execution_count": 11, |
242 | 242 | "metadata": {}, |
243 | 243 | "output_type": "execute_result" |
244 | 244 | } |
|
251 | 251 | }, |
252 | 252 | { |
253 | 253 | "cell_type": "code", |
254 | | - "execution_count": 13, |
| 254 | + "execution_count": 12, |
255 | 255 | "metadata": { |
256 | 256 | "collapsed": false |
257 | 257 | }, |
|
276 | 276 | " ('sweetest', 2)]" |
277 | 277 | ] |
278 | 278 | }, |
279 | | - "execution_count": 13, |
| 279 | + "execution_count": 12, |
280 | 280 | "metadata": {}, |
281 | 281 | "output_type": "execute_result" |
282 | 282 | } |
|
290 | 290 | }, |
291 | 291 | { |
292 | 292 | "cell_type": "code", |
293 | | - "execution_count": 15, |
| 293 | + "execution_count": 13, |
294 | 294 | "metadata": { |
295 | 295 | "collapsed": false |
296 | 296 | }, |
|
315 | 315 | " (1336, 'he')]" |
316 | 316 | ] |
317 | 317 | }, |
318 | | - "execution_count": 15, |
| 318 | + "execution_count": 13, |
319 | 319 | "metadata": {}, |
320 | 320 | "output_type": "execute_result" |
321 | 321 | } |
|
324 | 324 | "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n", |
325 | 325 | "one_RDD = one_RDD.map(lambda x: (x,1)) \n", |
326 | 326 | "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n", |
| 327 | + "one_RDD.take(5)\n", |
327 | 328 | "one_RDD = one_RDD.map(lambda x:(x[1],x[0])) \n", |
| 329 | + "one_RDD.take(5)\n", |
328 | 330 | "one_RDD.sortByKey(False).take(15)" |
329 | 331 | ] |
330 | 332 | }, |
331 | | - { |
332 | | - "cell_type": "markdown", |
333 | | - "metadata": {}, |
334 | | - "source": [ |
335 | | - "<h2 align=\"center\"> Mistake </h2>" |
336 | | - ] |
337 | | - }, |
338 | 333 | { |
339 | 334 | "cell_type": "code", |
340 | | - "execution_count": 49, |
| 335 | + "execution_count": null, |
341 | 336 | "metadata": { |
342 | 337 | "collapsed": true |
343 | 338 | }, |
344 | 339 | "outputs": [], |
345 | | - "source": [ |
346 | | - "def clean_str(x):\n", |
347 | | - " converted = x.encode('utf-8')\n", |
348 | | - " lowercased_str = converted.lower()\n", |
349 | | - " clean_str = lowercased_str.translate(None,string.punctuation)\n", |
350 | | - " return clean_str" |
351 | | - ] |
352 | | - }, |
353 | | - { |
354 | | - "cell_type": "code", |
355 | | - "execution_count": 50, |
356 | | - "metadata": { |
357 | | - "collapsed": false |
358 | | - }, |
359 | | - "outputs": [ |
360 | | - { |
361 | | - "data": { |
362 | | - "text/plain": [ |
363 | | - "[('disgracewhen', 1),\n", |
364 | | - " ('pardon', 16),\n", |
365 | | - " ('expostulation', 1),\n", |
366 | | - " ('desirable', 11),\n", |
367 | | - " ('foul', 1)]" |
368 | | - ] |
369 | | - }, |
370 | | - "execution_count": 50, |
371 | | - "metadata": {}, |
372 | | - "output_type": "execute_result" |
373 | | - } |
374 | | - ], |
375 | | - "source": [ |
376 | | - "one_RDD = text_file.flatMap(lambda x: clean_str(x).split())\n", |
377 | | - "one_RDD = one_RDD.map(lambda x: (x,1))\n", |
378 | | - "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n", |
379 | | - "one_RDD.take(5) # notice the issue with disgracewhen" |
380 | | - ] |
| 340 | + "source": [] |
381 | 341 | } |
382 | 342 | ], |
383 | 343 | "metadata": { |
|
0 commit comments