Skip to content

Commit 9d9066c

Browse files
authored
Merge pull request jleetutorial#1 from jleetutorial/pedromb-scala_to_python
Scala to Python - rdd folder
2 parents 50dcbc6 + f637b18 commit 9d9066c

29 files changed

Lines changed: 196 additions & 273 deletions

commons/Utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import re


class Utils():
    """Shared parsing helpers for the airports CSV exercises."""

    # Splits on commas that sit OUTSIDE quoted fields: the lookahead only
    # succeeds when the rest of the line is a sequence of unquoted chars or
    # complete 'single'/"double" quoted runs, so a comma inside a quoted
    # value (e.g. "Goroka, Papua New Guinea") is never a split point.
    COMMA_DELIMITER = re.compile(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''')

rdd/WordCount.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
from pyspark import SparkContext
33

44
if __name__ == "__main__":
    # Local-mode Spark context; silence the chatty INFO logs so the
    # word counts below are readable on the console.
    sc = SparkContext("local", "word count")
    sc.setLogLevel("ERROR")

    # Split every line on single spaces and tally occurrences of each token.
    lines = sc.textFile("in/word_count.text")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCounts = words.countByValue()

    for word, count in wordCounts.items():
        print(word, count)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext

# Exercise stub: no implementation yet — the docstring below states the task.
if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude are bigger than 40.
    Then output the airport's name and the airport's latitude to out/airports_by_latitude.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "St Anthony", 51.391944
    "Tofino", 49.082222
    ...
    '''

rdd/airports/AirportsByLatitudeProblem.scala

Lines changed: 0 additions & 20 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext
from commons.Utils import Utils


def splitComma(line: str):
    """Return '<airport name>, <latitude>' extracted from one airports.text CSV row.

    Uses the quote-aware delimiter so commas inside quoted fields (e.g. city
    names like "Goroka, Papua New Guinea") do not break column positions.
    Column 1 is the airport name, column 6 the latitude.
    """
    splits = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[6])


if __name__ == "__main__":
    sc = SparkContext("local", "airports")
    # Quiet the INFO noise, consistent with rdd/WordCount.py.
    sc.setLogLevel("ERROR")

    airports = sc.textFile("in/airports.text")

    # Column 6 is the latitude; keep airports above 40 degrees.
    # Renamed from the misleading 'airportsInUSA' (copy-paste leftover from
    # the airports-in-USA exercise — this filter has nothing to do with country).
    airportsByLatitude = airports.filter(
        lambda line: float(Utils.COMMA_DELIMITER.split(line)[6]) > 40)

    airportsNameAndLatitudes = airportsByLatitude.map(splitComma)

    airportsNameAndLatitudes.saveAsTextFile("out/airports_by_latitude.text")

rdd/airports/AirportsByLatitudeSolution.scala

Lines changed: 0 additions & 23 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext

# Exercise stub: no implementation yet — the docstring below states the task.
if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text, find all the airports which are located in United States
    and output the airport's name and the city's name to out/airports_in_usa.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "Putnam County Airport", "Greencastle"
    "Dowagiac Municipal Airport", "Dowagiac"
    ...
    '''

rdd/airports/AirportsInUsaProblem.scala

Lines changed: 0 additions & 19 deletions
This file was deleted.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from pyspark import SparkContext
from commons.Utils import Utils


def splitComma(line: str):
    """Return '<airport name>, <city>' extracted from one airports.text CSV row."""
    # Quote-aware split keeps commas inside quoted fields intact.
    fields = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(fields[1], fields[2])


if __name__ == "__main__":
    sc = SparkContext("local", "airports")

    # Column 3 holds the (quoted) country name; keep only US airports.
    airportsInUSA = sc.textFile("in/airports.text") \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[3] == "\"United States\"")

    airportsNameAndCityNames = airportsInUSA.map(splitComma)
    airportsNameAndCityNames.saveAsTextFile("out/airports_in_usa.text")

rdd/airports/AirportsInUsaSolution.scala

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)