|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": 132, |
| 14 | + "execution_count": 40, |
15 | 15 | "metadata": { |
16 | 16 | "collapsed": false |
17 | 17 | }, |
|
24 | 24 | }, |
25 | 25 | { |
26 | 26 | "cell_type": "code", |
27 | | - "execution_count": 135, |
| 27 | + "execution_count": 41, |
28 | 28 | "metadata": { |
29 | 29 | "collapsed": false |
30 | 30 | }, |
|
35 | 35 | "[u'PRIDE AND PREJUDICE', u'', u'By Jane Austen', u'', u'']" |
36 | 36 | ] |
37 | 37 | }, |
38 | | - "execution_count": 135, |
| 38 | + "execution_count": 41, |
39 | 39 | "metadata": {}, |
40 | 40 | "output_type": "execute_result" |
41 | 41 | } |
42 | 42 | ], |
43 | 43 | "source": [ |
44 | | - "# first 5 lines of the RDD\n", |
| 44 | + "# first 5 elements of the RDD\n", |
45 | 45 | "text_file.take(5)" |
46 | 46 | ] |
47 | 47 | }, |
48 | 48 | { |
49 | 49 | "cell_type": "code", |
50 | | - "execution_count": 141, |
| 50 | + "execution_count": 42, |
51 | 51 | "metadata": { |
52 | 52 | "collapsed": false |
53 | 53 | }, |
|
58 | 58 | "unicode" |
59 | 59 | ] |
60 | 60 | }, |
61 | | - "execution_count": 141, |
| 61 | + "execution_count": 42, |
62 | 62 | "metadata": {}, |
63 | 63 | "output_type": "execute_result" |
64 | 64 | } |
|
69 | 69 | }, |
70 | 70 | { |
71 | 71 | "cell_type": "code", |
72 | | - "execution_count": 165, |
| 72 | + "execution_count": 43, |
73 | 73 | "metadata": { |
74 | 74 | "collapsed": true |
75 | 75 | }, |
|
84 | 84 | }, |
85 | 85 | { |
86 | 86 | "cell_type": "code", |
87 | | - "execution_count": 166, |
| 87 | + "execution_count": 44, |
88 | 88 | "metadata": { |
89 | 89 | "collapsed": false |
90 | 90 | }, |
|
95 | 95 | "['pride and prejudice', '', 'by jane austen', '', '']" |
96 | 96 | ] |
97 | 97 | }, |
98 | | - "execution_count": 166, |
| 98 | + "execution_count": 44, |
99 | 99 | "metadata": {}, |
100 | 100 | "output_type": "execute_result" |
101 | 101 | } |
|
107 | 107 | }, |
108 | 108 | { |
109 | 109 | "cell_type": "code", |
110 | | - "execution_count": 167, |
| 110 | + "execution_count": 45, |
111 | 111 | "metadata": { |
112 | 112 | "collapsed": false |
113 | 113 | }, |
|
118 | 118 | "[['pride', 'and', 'prejudice'], [], ['by', 'jane', 'austen'], [], []]" |
119 | 119 | ] |
120 | 120 | }, |
121 | | - "execution_count": 167, |
| 121 | + "execution_count": 45, |
122 | 122 | "metadata": {}, |
123 | 123 | "output_type": "execute_result" |
124 | 124 | } |
|
130 | 130 | }, |
131 | 131 | { |
132 | 132 | "cell_type": "code", |
133 | | - "execution_count": 168, |
| 133 | + "execution_count": 46, |
134 | 134 | "metadata": { |
135 | 135 | "collapsed": false |
136 | 136 | }, |
|
141 | 141 | "['pride', 'and', 'prejudice', 'by', 'jane']" |
142 | 142 | ] |
143 | 143 | }, |
144 | | - "execution_count": 168, |
| 144 | + "execution_count": 46, |
145 | 145 | "metadata": {}, |
146 | 146 | "output_type": "execute_result" |
147 | 147 | } |
|
153 | 153 | }, |
154 | 154 | { |
155 | 155 | "cell_type": "code", |
156 | | - "execution_count": 161, |
| 156 | + "execution_count": 47, |
157 | 157 | "metadata": { |
158 | 158 | "collapsed": false |
159 | 159 | }, |
|
164 | 164 | "[('pride', 1), ('and', 1), ('prejudice', 1), ('by', 1), ('jane', 1)]" |
165 | 165 | ] |
166 | 166 | }, |
167 | | - "execution_count": 161, |
| 167 | + "execution_count": 47, |
168 | 168 | "metadata": {}, |
169 | 169 | "output_type": "execute_result" |
170 | 170 | } |
|
177 | 177 | }, |
178 | 178 | { |
179 | 179 | "cell_type": "code", |
180 | | - "execution_count": 162, |
| 180 | + "execution_count": 48, |
181 | 181 | "metadata": { |
182 | 182 | "collapsed": false |
183 | 183 | }, |
|
192 | 192 | " ('foul', 1)]" |
193 | 193 | ] |
194 | 194 | }, |
195 | | - "execution_count": 162, |
| 195 | + "execution_count": 48, |
196 | 196 | "metadata": {}, |
197 | 197 | "output_type": "execute_result" |
198 | 198 | } |
|
201 | 201 | "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n", |
202 | 202 | "one_RDD = one_RDD.map(lambda x: (x,1))\n", |
203 | 203 | "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n", |
204 | | - "one_RDD.take(5)" |
| 204 | + "one_RDD.take(5) # notice the issue with disgracewhen" |
205 | 205 | ] |
206 | 206 | }, |
207 | 207 | { |
208 | 208 | "cell_type": "code", |
209 | | - "execution_count": 48, |
| 209 | + "execution_count": 22, |
210 | 210 | "metadata": { |
211 | 211 | "collapsed": false |
212 | 212 | }, |
|
217 | 217 | "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'" |
218 | 218 | ] |
219 | 219 | }, |
220 | | - "execution_count": 48, |
| 220 | + "execution_count": 22, |
221 | 221 | "metadata": {}, |
222 | 222 | "output_type": "execute_result" |
223 | 223 | } |
|
226 | 226 | "string.punctuation" |
227 | 227 | ] |
228 | 228 | }, |
| 229 | + { |
| 230 | + "cell_type": "code", |
| 231 | + "execution_count": 23, |
| 232 | + "metadata": { |
| 233 | + "collapsed": true |
| 234 | + }, |
| 235 | + "outputs": [], |
| 236 | + "source": [ |
| 237 | + "punc = '!\"#$%&\\'()*+,./:;<=>?@[\\\\]^_`{|}~'" |
| 238 | + ] |
| 239 | + }, |
| 240 | + { |
| 241 | + "cell_type": "code", |
| 242 | + "execution_count": 38, |
| 243 | + "metadata": { |
| 244 | + "collapsed": true |
| 245 | + }, |
| 246 | + "outputs": [], |
| 247 | + "source": [ |
| 248 | + "def uni_to_clean_str(x):\n", |
| 249 | + " converted = x.encode('utf-8')\n", |
| 250 | + " lowercased_str = converted.lower()\n", |
| 251 | + " # for more difficult cases use re.split(' A|B')\n", |
| 252 | + " lowercased_str = lowercased_str.replace('--',' ')\n", |
| 253 | + " clean_str = lowercased_str.translate(None, punc) #Change 1\n", |
| 254 | + " return clean_str" |
| 255 | + ] |
| 256 | + }, |
| 257 | + { |
| 258 | + "cell_type": "code", |
| 259 | + "execution_count": 39, |
| 260 | + "metadata": { |
| 261 | + "collapsed": false |
| 262 | + }, |
| 263 | + "outputs": [ |
| 264 | + { |
| 265 | + "data": { |
| 266 | + "text/plain": [ |
| 267 | + "[('pardon', 16),\n", |
| 268 | + " ('expostulation', 1),\n", |
| 269 | + " ('desirable', 11),\n", |
| 270 | + " ('bathing-place', 1),\n", |
| 271 | + " ('four', 33),\n", |
| 272 | + " ('brightening', 1),\n", |
| 273 | + " ('straws', 1),\n", |
| 274 | + " ('sleep', 4),\n", |
| 275 | + " ('mansion', 2),\n", |
| 276 | + " ('appetite', 2),\n", |
| 277 | + " ('hate', 9),\n", |
| 278 | + " ('looking', 29),\n", |
| 279 | + " ('unfeeling', 3),\n", |
| 280 | + " ('reproofs', 2),\n", |
| 281 | + " ('sweetest', 2),\n", |
| 282 | + " ('seriously', 13),\n", |
| 283 | + " ('presents', 3),\n", |
| 284 | + " ('neighbours', 11),\n", |
| 285 | + " ('sorry', 34),\n", |
| 286 | + " ('sinking', 3),\n", |
| 287 | + " ('conjecture', 7),\n", |
| 288 | + " ('rational', 10),\n", |
| 289 | + " ('compassion', 14),\n", |
| 290 | + " ('void', 1),\n", |
| 291 | + " ('georgianas', 3),\n", |
| 292 | + " ('preservation', 2),\n", |
| 293 | + " ('suggesting', 1),\n", |
| 294 | + " ('every', 198),\n", |
| 295 | + " ('foul', 1),\n", |
| 296 | + " ('softened', 6),\n", |
| 297 | + " ('frailty', 1),\n", |
| 298 | + " ('conception', 1),\n", |
| 299 | + " ('bringing', 12),\n", |
| 300 | + " ('vast', 2),\n", |
| 301 | + " ('tickets', 3),\n", |
| 302 | + " ('school', 2),\n", |
| 303 | + " ('attentively', 3),\n", |
| 304 | + " ('conceive', 3),\n", |
| 305 | + " ('protest', 1),\n", |
| 306 | + " ('convenience', 3),\n", |
| 307 | + " ('red', 3),\n", |
| 308 | + " ('guiding', 1),\n", |
| 309 | + " ('whist', 5),\n", |
| 310 | + " ('happier', 3),\n", |
| 311 | + " ('triumph', 10),\n", |
| 312 | + " ('enjoy', 8),\n", |
| 313 | + " ('disclose', 1),\n", |
| 314 | + " ('parted', 14),\n", |
| 315 | + " ('force', 8),\n", |
| 316 | + " ('construed', 1),\n", |
| 317 | + " ('lessen', 4),\n", |
| 318 | + " ('cordially', 3),\n", |
| 319 | + " ('diffuseness', 1),\n", |
| 320 | + " ('tires', 1),\n", |
| 321 | + " ('scheming', 1),\n", |
| 322 | + " ('second', 24),\n", |
| 323 | + " ('signify', 3),\n", |
| 324 | + " ('consenting', 1),\n", |
| 325 | + " ('expressive', 2),\n", |
| 326 | + " ('meditation', 1),\n", |
| 327 | + " ('blue', 2),\n", |
| 328 | + " ('hide', 2),\n", |
| 329 | + " ('solemn', 5),\n", |
| 330 | + " ('saved', 3),\n", |
| 331 | + " ('contributed', 1),\n", |
| 332 | + " ('liberty', 7),\n", |
| 333 | + " ('spokesman', 1),\n", |
| 334 | + " ('superciliousness', 1),\n", |
| 335 | + " ('above', 21),\n", |
| 336 | + " ('suppose', 48),\n", |
| 337 | + " ('net', 1),\n", |
| 338 | + " ('ever', 130),\n", |
| 339 | + " ('manager', 1),\n", |
| 340 | + " ('exclamations', 3),\n", |
| 341 | + " ('rightly', 2),\n", |
| 342 | + " ('deemed', 1),\n", |
| 343 | + " ('repugnance', 1),\n", |
| 344 | + " ('work-bags', 1),\n", |
| 345 | + " ('glimpse', 2),\n", |
| 346 | + " ('here', 68),\n", |
| 347 | + " ('pratt', 2),\n", |
| 348 | + " ('china', 1),\n", |
| 349 | + " ('represented', 7),\n", |
| 350 | + " ('shortness', 1),\n", |
| 351 | + " ('obtained', 4),\n", |
| 352 | + " ('daughter', 77),\n", |
| 353 | + " ('study', 7),\n", |
| 354 | + " ('reports', 1),\n", |
| 355 | + " ('cogent', 1),\n", |
| 356 | + " ('ductility', 1),\n", |
| 357 | + " ('special', 2),\n", |
| 358 | + " ('settled', 48),\n", |
| 359 | + " ('verdure', 1),\n", |
| 360 | + " ('criticise', 1),\n", |
| 361 | + " ('scold', 1),\n", |
| 362 | + " ('indolence', 2),\n", |
| 363 | + " ('brought', 37),\n", |
| 364 | + " ('moral', 3),\n", |
| 365 | + " ('opportunities', 1),\n", |
| 366 | + " ('glance', 7)]" |
| 367 | + ] |
| 368 | + }, |
| 369 | + "execution_count": 39, |
| 370 | + "metadata": {}, |
| 371 | + "output_type": "execute_result" |
| 372 | + } |
| 373 | + ], |
| 374 | + "source": [ |
| 375 | + "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n", |
| 376 | + "one_RDD = one_RDD.map(lambda x: (x,1))\n", |
| 377 | + "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n", |
| 378 | + "one_RDD.take(100)" |
| 379 | + ] |
| 380 | + }, |
229 | 381 | { |
230 | 382 | "cell_type": "code", |
231 | 383 | "execution_count": null, |
|
0 commit comments