package com.hj.spark;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
public class PartitionRDD {

    public static void main(String[] args) {
        // Configure SparkConf: set the application name and run in local mode
        SparkConf conf = new SparkConf().setAppName("PartitionRDD").setMaster("local");
        // Create the SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mapDemo(sc);
        // flatMapDemo(sc);
        // groupByKeyDemo(sc);
        reduceByKey(sc);
        sc.close();
    }
    // Concatenate strings of the same length into one string
    private static void reduceByKey(JavaSparkContext sc) {
        List<String> data = Arrays.asList("hadoop", "hdfs", "hello", "world", "hadoop", "Java", "Java", "hello");
        // keyBy: key each string by its length, e.g. <6,hadoop> <4,hdfs> <5,hello> ...
        JavaPairRDD<Integer, String> rdd = sc.parallelize(data).keyBy(new Function<String, Integer>() {
            @Override
            public Integer call(String str) throws Exception {
                return str.length();
            }
        });
        // reduceByKey: merge the values of each key by joining them with "_"
        JavaPairRDD<Integer, String> reduceByKeyRDD = rdd.reduceByKey(new Function2<String, String, String>() {
            @Override
            public String call(String str1, String str2) throws Exception {
                return str1 + "_" + str2;
            }
        });
        // e.g. (4,hdfs_Java_Java), (5,hello_world_hello), (6,hadoop_hadoop) -- key order may vary
        System.out.println(reduceByKeyRDD.collect());
    }
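
    // A minimal lambda-based sketch of the same reduceByKey logic, assuming Java 8+ and the
    // Spark 2.x Java API (Function/Function2 are functional interfaces, so lambdas work here).
    // The method name reduceByKeyLambda is illustrative only and is not called from main.
    private static void reduceByKeyLambda(JavaSparkContext sc) {
        List<String> data = Arrays.asList("hadoop", "hdfs", "hello", "world", "hadoop", "Java", "Java", "hello");
        JavaPairRDD<Integer, String> rdd = sc.parallelize(data)
                .keyBy(str -> str.length());             // key each string by its length
        JavaPairRDD<Integer, String> reduced = rdd
                .reduceByKey((s1, s2) -> s1 + "_" + s2); // concatenate values that share a key
        System.out.println(reduced.collect());
    }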
    // Group strings of the same length together (all values with the same key end up in the same partition)
    private static void groupByKeyDemo(JavaSparkContext sc) {
        List<String> data = Arrays.asList("hadoop", "hdfs", "hello", "world", "hadoop", "Java", "Java", "hello");
        JavaPairRDD<Integer, String> rdd = sc.parallelize(data).keyBy(new Function<String, Integer>() {
            @Override
            public Integer call(String str) throws Exception {
                return str.length();
            }
        });
        /*
         * rdd now contains:
         * <6,hadoop> <4,hdfs> <5,hello> <5,world> ...
         */
        JavaPairRDD<Integer, Iterable<String>> groupbyRDD = rdd.groupByKey();
        // Groups: length 4 -> {hdfs, Java, Java}, 5 -> {hello, world, hello}, 6 -> {hadoop, hadoop}
        System.out.println(groupbyRDD.collect());
    }
    // flatMap: flatten a JavaRDD<List<String>> into a JavaRDD<String> of individual words
    private static void flatMapDemo(JavaSparkContext sc) {
        List<List<String>> list = Arrays.asList(
                Arrays.asList("hadoop spark hbase java", "java scala hive"),
                Arrays.asList("java scala python"));
        JavaRDD<List<String>> strRDD = sc.parallelize(list);
        // strRDD: [[hadoop spark hbase java, java scala hive], [java scala python]]
        JavaRDD<String> flatMapRDD = strRDD.flatMap(new FlatMapFunction<List<String>, String>() {
            @Override
            public Iterator<String> call(List<String> strList) throws Exception {
                // Join the sentences of this element with spaces, split into words,
                // collect the words into a list, and return the list's iterator
                // (in Spark 2.x, FlatMapFunction.call must return an Iterator).
                StringBuilder stringBuilder = new StringBuilder();
                for (int i = 0; i < strList.size(); i++) {
                    stringBuilder.append(strList.get(i) + " ");
                }
                String[] words = stringBuilder.toString().split(" ");
                List<String> wordList = new ArrayList<String>();
                for (String word : words) {
                    wordList.add(word);
                }
                return wordList.iterator();
            }
        });
        System.out.println("Original data: " + strRDD.collect());
        System.out.println("After flatMap: " + flatMapRDD.collect());
        // Original data: [[hadoop spark hbase java, java scala hive], [java scala python]]
        // After flatMap: [hadoop, spark, hbase, java, java, scala, hive, java, scala, python]
    }
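
    // A shorter sketch of the same flattening, assuming Java 8+: split each sentence directly
    // and collect every word, avoiding the StringBuilder round trip. The method name
    // flatMapDemoCompact is illustrative only and is not called from main.
    private static void flatMapDemoCompact(JavaSparkContext sc) {
        List<List<String>> list = Arrays.asList(
                Arrays.asList("hadoop spark hbase java", "java scala hive"),
                Arrays.asList("java scala python"));
        JavaRDD<String> flatMapRDD = sc.parallelize(list).flatMap(strList -> {
            List<String> words = new ArrayList<String>();
            for (String sentence : strList) {
                words.addAll(Arrays.asList(sentence.split(" ")));
            }
            return words.iterator();                     // flatMap expects an Iterator
        });
        System.out.println(flatMapRDD.collect());
        // Expected: [hadoop, spark, hbase, java, java, scala, hive, java, scala, python]
    }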
    // map: split each input line into a String[] of words (one array per line)
    private static void mapDemo(JavaSparkContext sc) {
        // Create test data
        List<String> list = Arrays.asList("hadoop spark hbase java", "java scala hive");
        JavaRDD<String> strRDD = sc.parallelize(list);
        // strRDD: [hadoop spark hbase java, java scala hive]
        // Function<String, String[]>: first type parameter is the input type, second is the output type
        JavaRDD<String[]> splitRDD = strRDD.map(new Function<String, String[]>() {
            @Override
            public String[] call(String v1) throws Exception {
                return v1.split(" ");
            }
        });
        /*
         * splitRDD now holds:
         * [[hadoop, spark, hbase, java],
         *  [java, scala, hive]]
         */
        List<String[]> result = splitRDD.collect();
        for (int i = 0; i < result.size(); i++) {
            for (String s : result.get(i)) {
                System.out.println("Array " + i + " data: " + s);
            }
        }
    }
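
    // A lambda-based sketch of the same map step, assuming Java 8+; mapDemoLambda is
    // illustrative only and is not called from main.
    private static void mapDemoLambda(JavaSparkContext sc) {
        JavaRDD<String[]> splitRDD = sc
                .parallelize(Arrays.asList("hadoop spark hbase java", "java scala hive"))
                .map(line -> line.split(" "));           // one String[] per input line
        for (String[] words : splitRDD.collect()) {
            System.out.println(Arrays.toString(words));  // e.g. [hadoop, spark, hbase, java]
        }
    }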
}