package sanjin;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
/**
 * Description: Java version of a multi-file word count (wc) program.
 *
 * Build and run:
 *   mvn clean install
 *   rz  (upload the built jar to the cluster, e.g. via ZMODEM)
 *   ./bin/spark-submit --class sanjin.JavaWordCount /home/hadoop/jar/test1.0.jar /spark-test/input /spark-test/javaoutput
 *   (arguments in order: main class, jar location, input path, output path)
 * @Author: 留歌36
 * @Date: 2019/3/6 16:42
 */
public class JavaWordCount {
    public static void main(String[] args) {
        String inputPath = args[0];
        String outputPath = args[1];
        SparkSession sparkSession = SparkSession.builder()
                // use the fully qualified name of this class as the application name
                .appName(Thread.currentThread().getStackTrace()[1].getClassName())
                // uncomment to run locally instead of via spark-submit
                // .master("local[2]")
                .getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
        // wholeTextFiles reads every file under inputPath as one (path, content) pair
        JavaPairRDD<String, String> textFiles = sc.wholeTextFiles(inputPath);
        // split each file's content on whitespace, emit (word, 1), then sum the counts per word
        JavaPairRDD<String, Integer> counts = textFiles
                .flatMap(s -> Arrays.asList(s._2.split("\\s+")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((x, y) -> x + y);
        // each output part file holds lines of the form (word,count)
        counts.saveAsTextFile(outputPath);
        sparkSession.stop();
    }
}
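
/*
 * Note (not in the original post): wholeTextFiles materializes each file as a
 * single string, which suits many small files but can be costly for large ones.
 * Below is a minimal sketch of the same job using textFile, which also accepts
 * a directory and reads its files line by line. The class name
 * JavaWordCountLines is a hypothetical addition for illustration; it reuses
 * the imports already at the top of this file.
 */
class JavaWordCountLines {
    public static void main(String[] args) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("JavaWordCountLines")
                .getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
        // textFile yields one record per line across all files in the input directory
        JavaPairRDD<String, Integer> counts = sc.textFile(args[0])
                .flatMap(line -> Arrays.asList(line.split("\\s+")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((x, y) -> x + y);
        counts.saveAsTextFile(args[1]);
        sparkSession.stop();
    }
}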