Skip to content

Commit e40bf06

Browse files
committed
done
1 parent 5f2a57d commit e40bf06

1 file changed

Lines changed: 23 additions & 63 deletions

File tree

PySpark_Basics/PySpark_Part1_Word_Count_Removing_Punctuation_Pride_Prejudice.ipynb

Lines changed: 23 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
},
1212
{
1313
"cell_type": "code",
14-
"execution_count": 2,
14+
"execution_count": 1,
1515
"metadata": {
1616
"collapsed": false
1717
},
@@ -24,7 +24,7 @@
2424
},
2525
{
2626
"cell_type": "code",
27-
"execution_count": 3,
27+
"execution_count": 2,
2828
"metadata": {
2929
"collapsed": false
3030
},
@@ -35,19 +35,19 @@
3535
"[u'PRIDE AND PREJUDICE', u'', u'By Jane Austen', u'', u'']"
3636
]
3737
},
38-
"execution_count": 3,
38+
"execution_count": 2,
3939
"metadata": {},
4040
"output_type": "execute_result"
4141
}
4242
],
4343
"source": [
4444
"# first 5 elements of the RDD\n",
45-
"text_file.take(5)"
45+
"text_file.take(5)"
4646
]
4747
},
4848
{
4949
"cell_type": "code",
50-
"execution_count": 4,
50+
"execution_count": 3,
5151
"metadata": {
5252
"collapsed": false
5353
},
@@ -58,7 +58,7 @@
5858
"unicode"
5959
]
6060
},
61-
"execution_count": 4,
61+
"execution_count": 3,
6262
"metadata": {},
6363
"output_type": "execute_result"
6464
}
@@ -69,7 +69,7 @@
6969
},
7070
{
7171
"cell_type": "code",
72-
"execution_count": 5,
72+
"execution_count": 4,
7373
"metadata": {
7474
"collapsed": false
7575
},
@@ -80,7 +80,7 @@
8080
"'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
8181
]
8282
},
83-
"execution_count": 5,
83+
"execution_count": 4,
8484
"metadata": {},
8585
"output_type": "execute_result"
8686
}
@@ -91,7 +91,7 @@
9191
},
9292
{
9393
"cell_type": "code",
94-
"execution_count": 6,
94+
"execution_count": 5,
9595
"metadata": {
9696
"collapsed": true
9797
},
@@ -181,7 +181,7 @@
181181
},
182182
{
183183
"cell_type": "code",
184-
"execution_count": 10,
184+
"execution_count": 22,
185185
"metadata": {
186186
"collapsed": false
187187
},
@@ -192,7 +192,7 @@
192192
"[['pride', 'and', 'prejudice'], [], ['by', 'jane', 'austen'], [], []]"
193193
]
194194
},
195-
"execution_count": 10,
195+
"execution_count": 22,
196196
"metadata": {},
197197
"output_type": "execute_result"
198198
}
@@ -204,7 +204,7 @@
204204
},
205205
{
206206
"cell_type": "code",
207-
"execution_count": 11,
207+
"execution_count": 10,
208208
"metadata": {
209209
"collapsed": false
210210
},
@@ -215,7 +215,7 @@
215215
"['pride', 'and', 'prejudice', 'by', 'jane']"
216216
]
217217
},
218-
"execution_count": 11,
218+
"execution_count": 10,
219219
"metadata": {},
220220
"output_type": "execute_result"
221221
}
@@ -227,7 +227,7 @@
227227
},
228228
{
229229
"cell_type": "code",
230-
"execution_count": 12,
230+
"execution_count": 11,
231231
"metadata": {
232232
"collapsed": false
233233
},
@@ -238,7 +238,7 @@
238238
"[('pride', 1), ('and', 1), ('prejudice', 1), ('by', 1), ('jane', 1)]"
239239
]
240240
},
241-
"execution_count": 12,
241+
"execution_count": 11,
242242
"metadata": {},
243243
"output_type": "execute_result"
244244
}
@@ -251,7 +251,7 @@
251251
},
252252
{
253253
"cell_type": "code",
254-
"execution_count": 13,
254+
"execution_count": 12,
255255
"metadata": {
256256
"collapsed": false
257257
},
@@ -276,7 +276,7 @@
276276
" ('sweetest', 2)]"
277277
]
278278
},
279-
"execution_count": 13,
279+
"execution_count": 12,
280280
"metadata": {},
281281
"output_type": "execute_result"
282282
}
@@ -290,7 +290,7 @@
290290
},
291291
{
292292
"cell_type": "code",
293-
"execution_count": 15,
293+
"execution_count": 13,
294294
"metadata": {
295295
"collapsed": false
296296
},
@@ -315,7 +315,7 @@
315315
" (1336, 'he')]"
316316
]
317317
},
318-
"execution_count": 15,
318+
"execution_count": 13,
319319
"metadata": {},
320320
"output_type": "execute_result"
321321
}
@@ -324,60 +324,20 @@
324324
"one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
325325
"one_RDD = one_RDD.map(lambda x: (x,1)) \n",
326326
"one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
327+
"one_RDD.take(5)\n",
327328
"one_RDD = one_RDD.map(lambda x:(x[1],x[0])) \n",
329+
"one_RDD.take(5)\n",
328330
"one_RDD.sortByKey(False).take(15)"
329331
]
330332
},
331-
{
332-
"cell_type": "markdown",
333-
"metadata": {},
334-
"source": [
335-
"<h2 align=\"center\"> Mistake </h2>"
336-
]
337-
},
338333
{
339334
"cell_type": "code",
340-
"execution_count": 49,
335+
"execution_count": null,
341336
"metadata": {
342337
"collapsed": true
343338
},
344339
"outputs": [],
345-
"source": [
346-
"def clean_str(x):\n",
347-
" converted = x.encode('utf-8')\n",
348-
" lowercased_str = converted.lower()\n",
349-
" clean_str = lowercased_str.translate(None,string.punctuation)\n",
350-
" return clean_str"
351-
]
352-
},
353-
{
354-
"cell_type": "code",
355-
"execution_count": 50,
356-
"metadata": {
357-
"collapsed": false
358-
},
359-
"outputs": [
360-
{
361-
"data": {
362-
"text/plain": [
363-
"[('disgracewhen', 1),\n",
364-
" ('pardon', 16),\n",
365-
" ('expostulation', 1),\n",
366-
" ('desirable', 11),\n",
367-
" ('foul', 1)]"
368-
]
369-
},
370-
"execution_count": 50,
371-
"metadata": {},
372-
"output_type": "execute_result"
373-
}
374-
],
375-
"source": [
376-
"one_RDD = text_file.flatMap(lambda x: clean_str(x).split())\n",
377-
"one_RDD = one_RDD.map(lambda x: (x,1))\n",
378-
"one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
379-
"one_RDD.take(5) # notice the issue with disgracewhen"
380-
]
340+
"source": []
381341
}
382342
],
383343
"metadata": {

0 commit comments

Comments
 (0)