Skip to content

Commit 9d9066c

Browse files
authored
Merge pull request jleetutorial#1 from jleetutorial/pedromb-scala_to_python
Scala to Python - rdd folder
2 parents 50dcbc6 + f637b18 commit 9d9066c

29 files changed

Lines changed: 196 additions & 273 deletions

commons/Utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import re


class Utils():
    """Shared parsing helpers for the airports CSV exercises."""

    # Splits on commas that sit OUTSIDE quoted fields: the lookahead only
    # succeeds when the rest of the line is a sequence of unquoted chars or
    # complete 'single'/"double" quoted runs, so a comma inside a quoted
    # value (e.g. "Goroka, Papua New Guinea") is never a split point.
    COMMA_DELIMITER = re.compile(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''')

rdd/WordCount.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
from pyspark import SparkContext
33

44
if __name__ == "__main__":
    # Local-mode Spark context; silence the chatty INFO logs so the
    # word counts below are readable on the console.
    sc = SparkContext("local", "word count")
    sc.setLogLevel("ERROR")

    # Split every line on single spaces and tally occurrences of each token.
    lines = sc.textFile("in/word_count.text")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCounts = words.countByValue()

    for word, count in wordCounts.items():
        print(word, count)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext

# Exercise stub: no implementation yet — the docstring below states the task.
if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude are bigger than 40.
    Then output the airport's name and the airport's latitude to out/airports_by_latitude.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "St Anthony", 51.391944
    "Tofino", 49.082222
    ...
    '''

rdd/airports/AirportsByLatitudeProblem.scala

Lines changed: 0 additions & 20 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext
from commons.Utils import Utils


def splitComma(line: str):
    """Return '<airport name>, <latitude>' extracted from one airports.text CSV row.

    Uses the quote-aware delimiter so commas inside quoted fields (e.g. city
    names like "Goroka, Papua New Guinea") do not break column positions.
    Column 1 is the airport name, column 6 the latitude.
    """
    splits = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[6])


if __name__ == "__main__":
    sc = SparkContext("local", "airports")
    # Quiet the INFO noise, consistent with rdd/WordCount.py.
    sc.setLogLevel("ERROR")

    airports = sc.textFile("in/airports.text")

    # Column 6 is the latitude; keep airports above 40 degrees.
    # Renamed from the misleading 'airportsInUSA' (copy-paste leftover from
    # the airports-in-USA exercise — this filter has nothing to do with country).
    airportsByLatitude = airports.filter(
        lambda line: float(Utils.COMMA_DELIMITER.split(line)[6]) > 40)

    airportsNameAndLatitudes = airportsByLatitude.map(splitComma)

    airportsNameAndLatitudes.saveAsTextFile("out/airports_by_latitude.text")

rdd/airports/AirportsByLatitudeSolution.scala

Lines changed: 0 additions & 23 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext

# Exercise stub: no implementation yet — the docstring below states the task.
if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text, find all the airports which are located in United States
    and output the airport's name and the city's name to out/airports_in_usa.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "Putnam County Airport", "Greencastle"
    "Dowagiac Municipal Airport", "Dowagiac"
    ...
    '''

rdd/airports/AirportsInUsaProblem.scala

Lines changed: 0 additions & 19 deletions
This file was deleted.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from pyspark import SparkContext
from commons.Utils import Utils


def splitComma(line: str):
    """Return '<airport name>, <city>' extracted from one airports.text CSV row."""
    # Quote-aware split keeps commas inside quoted fields intact.
    fields = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(fields[1], fields[2])


if __name__ == "__main__":
    sc = SparkContext("local", "airports")

    # Column 3 holds the (quoted) country name; keep only US airports.
    airportsInUSA = sc.textFile("in/airports.text") \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[3] == "\"United States\"")

    airportsNameAndCityNames = airportsInUSA.map(splitComma)
    airportsNameAndCityNames.saveAsTextFile("out/airports_in_usa.text")

rdd/airports/AirportsInUsaSolution.scala

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)