Skip to content

Commit e4e9377

Browse files
author
James Lee
committed
use reduce by key to implement word count
1 parent afbb93e commit e4e9377

1 file changed

Lines changed: 37 additions & 0 deletions

File tree

  • src/main/java/com/sparkTutorial/pairRdd/aggregation/reducebykey
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package com.sparkTutorial.pairRdd.aggregation.reducebykey;
2+
3+
4+
import org.apache.log4j.Level;
5+
import org.apache.log4j.Logger;
6+
import org.apache.spark.SparkConf;
7+
import org.apache.spark.api.java.JavaPairRDD;
8+
import org.apache.spark.api.java.JavaRDD;
9+
import org.apache.spark.api.java.JavaSparkContext;
10+
import org.apache.spark.api.java.function.Function2;
11+
import org.apache.spark.api.java.function.PairFunction;
12+
import scala.Tuple2;
13+
14+
import java.util.Arrays;
15+
import java.util.Map;
16+
17+
public class WorldCount {
18+
19+
public static void main(String[] args) throws Exception {
20+
21+
Logger.getLogger("org").setLevel(Level.ERROR);
22+
SparkConf conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]");
23+
JavaSparkContext sc = new JavaSparkContext(conf);
24+
25+
JavaRDD<String> lines = sc.textFile("in/word_count.text");
26+
JavaRDD<String> wordRdd = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
27+
28+
JavaPairRDD<String, Integer> wordPairRdd = wordRdd.mapToPair((PairFunction<String, String, Integer>) word -> new Tuple2<>(word, 1));
29+
30+
JavaPairRDD<String, Integer> wordCounts = wordPairRdd.reduceByKey((Function2<Integer, Integer, Integer>) (x, y) -> x + y);
31+
32+
for (Map.Entry<String, Integer> wordCountPair : wordCounts.collectAsMap().entrySet()) {
33+
System.out.println(wordCountPair.getKey() + " : " + wordCountPair.getValue());
34+
35+
}
36+
}
37+
}

0 commit comments

Comments
 (0)