话不多说,直接上码
package com.myspark.wordcount.java_version;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * Spark word-count example written against the Java RDD API.
 *
 * <p>Reads a text file, splits every line into whitespace-separated words, counts how
 * often each word occurs, and prints the words ordered by descending count.
 *
 * <p>Usage: the optional first program argument is the path of the input file. When no
 * argument is given, {@code DEFAULT_INPUT_PATH} is read.
 *
 * @author Yh.Huang (2019/4/26)
 */
public class WordCount_Java {

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "D:\\hdfs\\spark\\input\\1.txt";

    /**
     * Runs the word-count job and prints one line per distinct word to standard output.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // 1. Build the Spark configuration (application name, local master with 2 threads).
        SparkConf conf = new SparkConf().setAppName("WordCount_Java").setMaster("local[2]");

        // 2. Create the context inside try-with-resources so that it is stopped even when
        //    one of the stages below throws (e.g. the input file does not exist).
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            jsc.setLogLevel("WARN");

            // 3. Read the input file, one RDD element per line.
            JavaRDD<String> dataJavaRDD = jsc.textFile(inputPath);

            // 4. Split every line into words.
            JavaRDD<String> wordsJavaRDD = dataJavaRDD.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String line) throws Exception {
                    // Trim first and split on runs of whitespace so that blank lines,
                    // leading/trailing blanks and repeated separators never produce an
                    // empty-string "word" that would be counted like a real one.
                    String trimmed = line.trim();
                    if (trimmed.isEmpty()) {
                        return Arrays.<String>asList().iterator();
                    }
                    return Arrays.asList(trimmed.split("\\s+")).iterator();
                }
            });

            // 5. Pair every word with an initial count of 1.
            JavaPairRDD<String, Integer> wordAndOneJavaPairRDD =
                    wordsJavaRDD.mapToPair(new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String word) throws Exception {
                            return new Tuple2<String, Integer>(word, 1);
                        }
                    });

            // 6. Sum the counts of identical words.
            JavaPairRDD<String, Integer> resultJavaPairRDD =
                    wordAndOneJavaPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer v1, Integer v2) throws Exception {
                            return v1 + v2;
                        }
                    });

            /*
             * Extension: order the words by descending number of occurrences.
             * sortByKey() sorts on the key only, so:
             *   1. swap each (word, count) pair into (count, word),
             *   2. sort with sortByKey(false), i.e. descending,
             *   3. swap the pairs back into (word, count).
             */
            JavaPairRDD<Integer, String> reverseJavaPairRDD =
                    resultJavaPairRDD.mapToPair(
                            new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                                @Override
                                public Tuple2<Integer, String> call(Tuple2<String, Integer> tuple2)
                                        throws Exception {
                                    return new Tuple2<Integer, String>(tuple2._2, tuple2._1);
                                }
                            });

            // Sort descending by count, then restore the (word, count) layout.
            JavaPairRDD<String, Integer> sortJavaPairRDD =
                    reverseJavaPairRDD.sortByKey(false).mapToPair(
                            new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                                @Override
                                public Tuple2<String, Integer> call(Tuple2<Integer, String> tuple2)
                                        throws Exception {
                                    return new Tuple2<String, Integer>(tuple2._2, tuple2._1);
                                }
                            });

            // 7. Bring the result to the driver.
            List<Tuple2<String, Integer>> finalResult = sortJavaPairRDD.collect();

            // Print the result, one word per line.
            for (Tuple2<String, Integer> tuple : finalResult) {
                System.out.println(tuple._1 + " 出现的次数: " + tuple._2);
            }
        }
        // 8. The context is stopped automatically when the try block exits.
    }
}
希望能对你有所帮助,谢谢观看!