Skip to content

Commit d00fed4

Browse files
committed
word count almost done
1 parent b147d72 commit d00fed4

2 files changed

Lines changed: 344 additions & 40 deletions

File tree

PySpark_Basics/.ipynb_checkpoints/PySpark_Part1_Word_Count_Removing_Punctuation_Pride_Prejudice-checkpoint.ipynb

Lines changed: 172 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
},
1212
{
1313
"cell_type": "code",
14-
"execution_count": 132,
14+
"execution_count": 40,
1515
"metadata": {
1616
"collapsed": false
1717
},
@@ -24,7 +24,7 @@
2424
},
2525
{
2626
"cell_type": "code",
27-
"execution_count": 135,
27+
"execution_count": 41,
2828
"metadata": {
2929
"collapsed": false
3030
},
@@ -35,19 +35,19 @@
3535
"[u'PRIDE AND PREJUDICE', u'', u'By Jane Austen', u'', u'']"
3636
]
3737
},
38-
"execution_count": 135,
38+
"execution_count": 41,
3939
"metadata": {},
4040
"output_type": "execute_result"
4141
}
4242
],
4343
"source": [
44-
"# first 5 lines of the RDD\n",
44+
"# first 5 elements of the RDD\n",
4545
"text_file.take(5)"
4646
]
4747
},
4848
{
4949
"cell_type": "code",
50-
"execution_count": 141,
50+
"execution_count": 42,
5151
"metadata": {
5252
"collapsed": false
5353
},
@@ -58,7 +58,7 @@
5858
"unicode"
5959
]
6060
},
61-
"execution_count": 141,
61+
"execution_count": 42,
6262
"metadata": {},
6363
"output_type": "execute_result"
6464
}
@@ -69,7 +69,7 @@
6969
},
7070
{
7171
"cell_type": "code",
72-
"execution_count": 165,
72+
"execution_count": 43,
7373
"metadata": {
7474
"collapsed": true
7575
},
@@ -84,7 +84,7 @@
8484
},
8585
{
8686
"cell_type": "code",
87-
"execution_count": 166,
87+
"execution_count": 44,
8888
"metadata": {
8989
"collapsed": false
9090
},
@@ -95,7 +95,7 @@
9595
"['pride and prejudice', '', 'by jane austen', '', '']"
9696
]
9797
},
98-
"execution_count": 166,
98+
"execution_count": 44,
9999
"metadata": {},
100100
"output_type": "execute_result"
101101
}
@@ -107,7 +107,7 @@
107107
},
108108
{
109109
"cell_type": "code",
110-
"execution_count": 167,
110+
"execution_count": 45,
111111
"metadata": {
112112
"collapsed": false
113113
},
@@ -118,7 +118,7 @@
118118
"[['pride', 'and', 'prejudice'], [], ['by', 'jane', 'austen'], [], []]"
119119
]
120120
},
121-
"execution_count": 167,
121+
"execution_count": 45,
122122
"metadata": {},
123123
"output_type": "execute_result"
124124
}
@@ -130,7 +130,7 @@
130130
},
131131
{
132132
"cell_type": "code",
133-
"execution_count": 168,
133+
"execution_count": 46,
134134
"metadata": {
135135
"collapsed": false
136136
},
@@ -141,7 +141,7 @@
141141
"['pride', 'and', 'prejudice', 'by', 'jane']"
142142
]
143143
},
144-
"execution_count": 168,
144+
"execution_count": 46,
145145
"metadata": {},
146146
"output_type": "execute_result"
147147
}
@@ -153,7 +153,7 @@
153153
},
154154
{
155155
"cell_type": "code",
156-
"execution_count": 161,
156+
"execution_count": 47,
157157
"metadata": {
158158
"collapsed": false
159159
},
@@ -164,7 +164,7 @@
164164
"[('pride', 1), ('and', 1), ('prejudice', 1), ('by', 1), ('jane', 1)]"
165165
]
166166
},
167-
"execution_count": 161,
167+
"execution_count": 47,
168168
"metadata": {},
169169
"output_type": "execute_result"
170170
}
@@ -177,7 +177,7 @@
177177
},
178178
{
179179
"cell_type": "code",
180-
"execution_count": 162,
180+
"execution_count": 48,
181181
"metadata": {
182182
"collapsed": false
183183
},
@@ -192,7 +192,7 @@
192192
" ('foul', 1)]"
193193
]
194194
},
195-
"execution_count": 162,
195+
"execution_count": 48,
196196
"metadata": {},
197197
"output_type": "execute_result"
198198
}
@@ -201,12 +201,12 @@
201201
"one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
202202
"one_RDD = one_RDD.map(lambda x: (x,1))\n",
203203
"one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
204-
"one_RDD.take(5)"
204+
"one_RDD.take(5) # notice the issue with disgracewhen"
205205
]
206206
},
207207
{
208208
"cell_type": "code",
209-
"execution_count": 48,
209+
"execution_count": 22,
210210
"metadata": {
211211
"collapsed": false
212212
},
@@ -217,7 +217,7 @@
217217
"'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
218218
]
219219
},
220-
"execution_count": 48,
220+
"execution_count": 22,
221221
"metadata": {},
222222
"output_type": "execute_result"
223223
}
@@ -226,6 +226,158 @@
226226
"string.punctuation"
227227
]
228228
},
229+
{
230+
"cell_type": "code",
231+
"execution_count": 23,
232+
"metadata": {
233+
"collapsed": true
234+
},
235+
"outputs": [],
236+
"source": [
237+
"punc = '!\"#$%&\\'()*+,./:;<=>?@[\\\\]^_`{|}~'"
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": 38,
243+
"metadata": {
244+
"collapsed": true
245+
},
246+
"outputs": [],
247+
"source": [
248+
"def uni_to_clean_str(x):\n",
249+
" converted = x.encode('utf-8')\n",
250+
" lowercased_str = converted.lower()\n",
251+
" # for more difficult cases use re.split(' A|B')\n",
252+
" lowercased_str = lowercased_str.replace('--',' ')\n",
253+
" clean_str = lowercased_str.translate(None, punc) #Change 1\n",
254+
" return clean_str"
255+
]
256+
},
257+
{
258+
"cell_type": "code",
259+
"execution_count": 39,
260+
"metadata": {
261+
"collapsed": false
262+
},
263+
"outputs": [
264+
{
265+
"data": {
266+
"text/plain": [
267+
"[('pardon', 16),\n",
268+
" ('expostulation', 1),\n",
269+
" ('desirable', 11),\n",
270+
" ('bathing-place', 1),\n",
271+
" ('four', 33),\n",
272+
" ('brightening', 1),\n",
273+
" ('straws', 1),\n",
274+
" ('sleep', 4),\n",
275+
" ('mansion', 2),\n",
276+
" ('appetite', 2),\n",
277+
" ('hate', 9),\n",
278+
" ('looking', 29),\n",
279+
" ('unfeeling', 3),\n",
280+
" ('reproofs', 2),\n",
281+
" ('sweetest', 2),\n",
282+
" ('seriously', 13),\n",
283+
" ('presents', 3),\n",
284+
" ('neighbours', 11),\n",
285+
" ('sorry', 34),\n",
286+
" ('sinking', 3),\n",
287+
" ('conjecture', 7),\n",
288+
" ('rational', 10),\n",
289+
" ('compassion', 14),\n",
290+
" ('void', 1),\n",
291+
" ('georgianas', 3),\n",
292+
" ('preservation', 2),\n",
293+
" ('suggesting', 1),\n",
294+
" ('every', 198),\n",
295+
" ('foul', 1),\n",
296+
" ('softened', 6),\n",
297+
" ('frailty', 1),\n",
298+
" ('conception', 1),\n",
299+
" ('bringing', 12),\n",
300+
" ('vast', 2),\n",
301+
" ('tickets', 3),\n",
302+
" ('school', 2),\n",
303+
" ('attentively', 3),\n",
304+
" ('conceive', 3),\n",
305+
" ('protest', 1),\n",
306+
" ('convenience', 3),\n",
307+
" ('red', 3),\n",
308+
" ('guiding', 1),\n",
309+
" ('whist', 5),\n",
310+
" ('happier', 3),\n",
311+
" ('triumph', 10),\n",
312+
" ('enjoy', 8),\n",
313+
" ('disclose', 1),\n",
314+
" ('parted', 14),\n",
315+
" ('force', 8),\n",
316+
" ('construed', 1),\n",
317+
" ('lessen', 4),\n",
318+
" ('cordially', 3),\n",
319+
" ('diffuseness', 1),\n",
320+
" ('tires', 1),\n",
321+
" ('scheming', 1),\n",
322+
" ('second', 24),\n",
323+
" ('signify', 3),\n",
324+
" ('consenting', 1),\n",
325+
" ('expressive', 2),\n",
326+
" ('meditation', 1),\n",
327+
" ('blue', 2),\n",
328+
" ('hide', 2),\n",
329+
" ('solemn', 5),\n",
330+
" ('saved', 3),\n",
331+
" ('contributed', 1),\n",
332+
" ('liberty', 7),\n",
333+
" ('spokesman', 1),\n",
334+
" ('superciliousness', 1),\n",
335+
" ('above', 21),\n",
336+
" ('suppose', 48),\n",
337+
" ('net', 1),\n",
338+
" ('ever', 130),\n",
339+
" ('manager', 1),\n",
340+
" ('exclamations', 3),\n",
341+
" ('rightly', 2),\n",
342+
" ('deemed', 1),\n",
343+
" ('repugnance', 1),\n",
344+
" ('work-bags', 1),\n",
345+
" ('glimpse', 2),\n",
346+
" ('here', 68),\n",
347+
" ('pratt', 2),\n",
348+
" ('china', 1),\n",
349+
" ('represented', 7),\n",
350+
" ('shortness', 1),\n",
351+
" ('obtained', 4),\n",
352+
" ('daughter', 77),\n",
353+
" ('study', 7),\n",
354+
" ('reports', 1),\n",
355+
" ('cogent', 1),\n",
356+
" ('ductility', 1),\n",
357+
" ('special', 2),\n",
358+
" ('settled', 48),\n",
359+
" ('verdure', 1),\n",
360+
" ('criticise', 1),\n",
361+
" ('scold', 1),\n",
362+
" ('indolence', 2),\n",
363+
" ('brought', 37),\n",
364+
" ('moral', 3),\n",
365+
" ('opportunities', 1),\n",
366+
" ('glance', 7)]"
367+
]
368+
},
369+
"execution_count": 39,
370+
"metadata": {},
371+
"output_type": "execute_result"
372+
}
373+
],
374+
"source": [
375+
"one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
376+
"one_RDD = one_RDD.map(lambda x: (x,1))\n",
377+
"one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
378+
"one_RDD.take(100)"
379+
]
380+
},
229381
{
230382
"cell_type": "code",
231383
"execution_count": null,

0 commit comments

Comments
 (0)