package com.fei.simple_project;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
public class App
{
public static void main( String[] args )
{
SparkConf conf = new SparkConf().setAppName("Simple Application");
JavaSparkContext sc = new JavaSparkContext(conf);
//使用parallelize创建RDD
JavaRDD<Integer> lines1 = sc.parallelize(Arrays.asList(1,2,3,4));
lines1.persist(StorageLevel.MEMORY_ONLY());
System.out.println("222222222Count is:"+lines1.count());
JavaRDD<String> lines2 = sc.parallelize(Arrays.asList("pandas", "like","i like pandas"));
lines2.persist(StorageLevel.MEMORY_ONLY());
System.out.println("333333333333Count is:"+lines2.count());
//转化操作
//map, 每个元素返回一个对象
JavaRDD<String> str1RDD = lines2.map(new Function<String, String>(){
public String call(String x) {
return x.split(" ")[0];
}
});
System.out.println("44444444444likeRDD Count is:"+str1RDD.count()+" "+ str1RDD.first());
for(String it:str1RDD.collect()){
System.out.println(it+" ");
}
//flat map,所有元素放入一个对象中返回,返回可迭代内容
JavaRDD<String> flatMapRDD = lines2.flatMap(new FlatMapFunction<String, String>(){
public Iterable<String> call(String x) throws Exception {
return Arrays.asList(x.split(" "));
}
}) ;
System.out.println("55555555flatMapRDD Count is:"+flatMapRDD.count()+" "+ flatMapRDD.first());
for(String it:flatMapRDD.collect()){
System.out.println(it+" ");
}
//distinct,集合去重,不常用,开销大
JavaRDD<String> distinctRDD = flatMapRDD.distinct();
System.out.println("6666666666distinctRDD Count is:"+distinctRDD.count()+" "+ distinctRDD.first());
for(String it:distinctRDD.collect()){
System.out.println(it+" ");
}
//union,合并,包括重复
JavaRDD<String> unionRDD = flatMapRDD.union(str1RDD);
System.out.println("777777777777777unionRDD Count is:"+unionRDD.count()+" "+ unionRDD.first());
for(String it:unionRDD.collect()){
System.out.println(it+" ");
}
//intersection,返回共有的
JavaRDD<String> intersectionRDD = flatMapRDD.intersection(str1RDD);
System.out.println("88888888888intersectionRDD Count is:"+intersectionRDD.count()+" "+ intersectionRDD.first());
for(String it:intersectionRDD.collect()){
System.out.println(it+" ");
}
//subtract,返回只在第一个中
JavaRDD<String> subtractRDD = flatMapRDD.subtract(str1RDD);
System.out.println("9999999999subtractRDD Count is:"+subtractRDD.count());
for(String it:subtractRDD.collect()){
System.out.println(it+" ");
}
//cartesian,返回笛卡尔积
JavaPairRDD<String, String> cartesianRDD = flatMapRDD.cartesian(str1RDD);
System.out.println("aaaaaaaaacartesianRDD Count is:"+cartesianRDD.count()+" "+ cartesianRDD.first());
for(Tuple2<String, String> it:cartesianRDD.collect()){
System.out.println(it._1+" "+it._2+"\n");
}
//mapToDouble,RDD类型转换
//其他转换还有:flatMapToDouble,flatMapToPair, mapToPair
JavaDoubleRDD doubleRDD = lines1.mapToDouble(new DoubleFunction<Integer>(){
public double call(Integer x) throws Exception {
return (double)x*x;
}
});
System.out.println("aaaaaaaaadoubleRDD Count is:"+doubleRDD.count()+" "+ doubleRDD.first());
for(double it:doubleRDD.collect()){
System.out.println(it+" ");
}
//mean是平均数
System.out.println("9999999999doubleRDD mean is:"+doubleRDD.mean());
}
}
222222222Count is:4
333333333333Count is:3
44444444444likeRDD Count is:3 pandas
pandas
like
i
55555555flatMapRDD Count is:5 pandas
pandas
like
i
like
pandas
6666666666distinctRDD Count is:3 pandas
pandas
i
like
777777777777777unionRDD Count is:8 pandas
pandas
like
i
like
pandas
pandas
like
i
88888888888intersectionRDD Count is:3 pandas
pandas
i
like
9999999999subtractRDD Count is:0
aaaaaaaaacartesianRDD Count is:15 (pandas,pandas)
pandas pandas
pandas like
pandas i
like pandas
like like
like i
i pandas
like pandas
pandas pandas
i like
like like
pandas like
i i
like i
pandas i
aaaaaaaaadoubleRDD Count is:4 1.0
1.0
4.0
9.0
16.0
9999999999doubleRDD mean is:7.5