Skip to content

Commit 652d9ff

Browse files
author
Pedro Bernardo
committed
Added rdd/nasaApacheWebLogs/*.py
1 parent 7b40998 commit 652d9ff

4 files changed

Lines changed: 67 additions & 0 deletions

File tree

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from pyspark import SparkContext


# Exercise stub: only the problem statement is provided here; the solution
# lives in a sibling file. Nothing executes beyond the no-op docstring below.
if __name__ == "__main__":

    '''
    "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache servers for July 1st, 1995.
    "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD which contains the hosts which are accessed on BOTH days.
    Save the resulting RDD to "out/nasa_logs_same_hosts.csv" file.

    Example output:
    vagrant.vf.mmc.com
    www-a1.proxy.aol.com
    .....

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from pyspark import SparkContext

if __name__ == "__main__":
    # Local, single-threaded Spark context for this exercise.
    sc = SparkContext("local", "sameHosts")
    try:
        # Each .tsv file holds one day of NASA Apache access logs; the host
        # name is the first tab-separated column of every line.
        julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
        augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

        julyFirstHosts = julyFirstLogs.map(lambda line: line.split("\t")[0])
        augustFirstHosts = augustFirstLogs.map(lambda line: line.split("\t")[0])

        # Hosts that appear on BOTH days.
        intersection = julyFirstHosts.intersection(augustFirstHosts)

        # Both files carry a header row whose first column is the literal
        # string "host"; drop it so only real host names are written out.
        cleanedHostIntersection = intersection.filter(lambda host: host != "host")

        # NOTE(review): saveAsTextFile raises if the output directory already
        # exists — remove "out/nasa_logs_same_hosts.csv" between runs.
        cleanedHostIntersection.saveAsTextFile("out/nasa_logs_same_hosts.csv")
    finally:
        # Bug fix: the original never stopped the SparkContext, leaving the
        # backing JVM alive until process exit. Always release it.
        sc.stop()
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from pyspark import SparkContext
2+
3+
if __name__ == "__main__":
4+
5+
'''
6+
"in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache server for July 1st, 1995.
7+
"in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
8+
Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
9+
take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv" file.
10+
11+
Keep in mind, that the original log files contains the following header lines.
12+
host logname time method url response bytes
13+
14+
Make sure the head lines are removed in the resulting RDD.
15+
'''
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark import SparkContext
2+
3+
def isNotHeader(line:str):
4+
return not (line.startswith("host") and "bytes" in line)
5+
6+
if __name__ == "__main__":
    # Local, single-threaded Spark context for this exercise.
    sc = SparkContext("local", "unionLogs")
    try:
        julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
        augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

        # union() keeps duplicates, which is fine here: we only want a pool
        # of log lines to sample from.
        aggregatedLogLines = julyFirstLogs.union(augustFirstLogs)

        # Drop the header row present in each input file.
        cleanLogLines = aggregatedLogLines.filter(isNotHeader)

        # 10% sample; withReplacement=True means a given line may appear
        # more than once in the output.
        sample = cleanLogLines.sample(withReplacement=True, fraction=0.1)

        # NOTE(review): the companion problem statement names
        # "out/sample_nasa_logs.tsv" but this writes ".csv" — confirm which
        # extension is intended before changing it.
        sample.saveAsTextFile("out/sample_nasa_logs.csv")
    finally:
        # Bug fix: the original never stopped the SparkContext, leaving the
        # backing JVM alive until process exit. Always release it.
        sc.stop()

0 commit comments

Comments
 (0)