forked from jonesberg/DataAnalysisWithPythonAndPySpark
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_count_submit.py
More file actions
24 lines (19 loc) · 775 Bytes
/
word_count_submit.py
File metadata and controls
24 lines (19 loc) · 775 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Obtain the session for this job through the builder; getOrCreate() hands
# back an already-active session when there is one instead of building a
# second one.
spark = (
    SparkSession.builder
    .appName("Counting word occurences from a book.")
    .getOrCreate()
)

# Set the log level of the underlying SparkContext to WARN.
spark.sparkContext.setLogLevel("WARN")
# Word-frequency pipeline: produces one row per distinct word together with
# the number of times it occurs in the input text.
# If you need to read multiple text files, replace `1342-0` by `*`.
results = (
    spark.read.text("../../data/gutenberg_books/1342-0.txt")
    # Split the `value` column on single spaces into an array of tokens.
    .select(F.split(F.col("value"), " ").alias("line"))
    # Flatten the arrays: one row per token.
    .select(F.explode(F.col("line")).alias("word"))
    # Case-fold so "The" and "the" are counted together.
    .select(F.lower(F.col("word")).alias("word"))
    # Keep the first run of letters/apostrophes in the token. The quantifier
    # must be `+`, not `*`: `*` also matches the empty string, and since the
    # first match wins, any token starting with another character (an opening
    # quote, `_`, `(` ...) would match "" at position 0 and lose its word.
    .select(F.regexp_extract(F.col("word"), r"[a-z']+", 0).alias("word"))
    # Tokens with no letters at all (numbers, bare punctuation, the empty
    # strings produced by consecutive spaces) extract to "" and are dropped.
    .where(F.col("word") != "")
    .groupby(F.col("word"))
    .count()
)
# Print the ten most frequent words, highest count first.
results.orderBy(F.col("count").desc()).show(10)

# Collapse to a single partition before writing so the CSV output is produced
# by one task. The save mode is left at its default, so this raises if the
# target path already exists rather than overwriting it.
results.coalesce(1).write.format("csv").save("./results_single_partition.csv")