Spark RDD 转化

该博客演示了如何在 Apache Spark 中使用 Java 进行 RDD 的转化操作,包括 map、flatMap、distinct、union、intersection、subtract、cartesian、mapToDouble 等,并在代码之后给出了每个操作的实际运行输出。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package com.fei.simple_project;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.storage.StorageLevel;

import scala.Tuple2;


public class App 
{
    public static void main( String[] args )
    {
        SparkConf conf = new SparkConf().setAppName("Simple Application");
        JavaSparkContext sc = new JavaSparkContext(conf);
        
        //使用parallelize创建RDD
        JavaRDD<Integer> lines1 = sc.parallelize(Arrays.asList(1,2,3,4));
        lines1.persist(StorageLevel.MEMORY_ONLY());
        System.out.println("222222222Count is:"+lines1.count());
        
        JavaRDD<String> lines2 = sc.parallelize(Arrays.asList("pandas", "like","i like pandas"));
        lines2.persist(StorageLevel.MEMORY_ONLY());
        System.out.println("333333333333Count is:"+lines2.count());
        
        //转化操作
        //map, 每个元素返回一个对象
      JavaRDD<String> str1RDD = lines2.map(new Function<String, String>(){
		public String call(String x)  {
			return x.split(" ")[0];
		}
      });
      System.out.println("44444444444likeRDD Count is:"+str1RDD.count()+" "+ str1RDD.first());
 
      for(String it:str1RDD.collect()){
    	  System.out.println(it+"   ");
      }
      //flat map,所有元素放入一个对象中返回,返回可迭代内容
      JavaRDD<String> flatMapRDD = lines2.flatMap(new FlatMapFunction<String, String>(){
		public Iterable<String> call(String x) throws Exception {
			return Arrays.asList(x.split(" "));
		}
      }) ;
      System.out.println("55555555flatMapRDD Count is:"+flatMapRDD.count()+" "+ flatMapRDD.first());
      
      for(String it:flatMapRDD.collect()){
    	  System.out.println(it+"   ");
      }
      
      //distinct,集合去重,不常用,开销大
      JavaRDD<String> distinctRDD = flatMapRDD.distinct();
      System.out.println("6666666666distinctRDD Count is:"+distinctRDD.count()+" "+ distinctRDD.first());
      
      for(String it:distinctRDD.collect()){
    	  System.out.println(it+"   ");
      }
      //union,合并,包括重复
      JavaRDD<String> unionRDD = flatMapRDD.union(str1RDD);
      System.out.println("777777777777777unionRDD Count is:"+unionRDD.count()+" "+ unionRDD.first());
      
      for(String it:unionRDD.collect()){
    	  System.out.println(it+"   ");
      }
      //intersection,返回共有的
      JavaRDD<String> intersectionRDD = flatMapRDD.intersection(str1RDD);
      System.out.println("88888888888intersectionRDD Count is:"+intersectionRDD.count()+" "+ intersectionRDD.first());
      
      for(String it:intersectionRDD.collect()){
    	  System.out.println(it+"   ");
      }      
      //subtract,返回只在第一个中
      JavaRDD<String> subtractRDD = flatMapRDD.subtract(str1RDD);
      System.out.println("9999999999subtractRDD Count is:"+subtractRDD.count());
      
      for(String it:subtractRDD.collect()){
    	  System.out.println(it+"   ");
      }            
      //cartesian,返回笛卡尔积
      JavaPairRDD<String, String> cartesianRDD = flatMapRDD.cartesian(str1RDD);
      System.out.println("aaaaaaaaacartesianRDD Count is:"+cartesianRDD.count()+" "+ cartesianRDD.first());
      
      for(Tuple2<String, String> it:cartesianRDD.collect()){
    	  System.out.println(it._1+"   "+it._2+"\n");
      }      
      
      //mapToDouble,RDD类型转换
      //其他转换还有:flatMapToDouble,flatMapToPair, mapToPair
      JavaDoubleRDD doubleRDD = lines1.mapToDouble(new DoubleFunction<Integer>(){
		public double call(Integer x) throws Exception {
			return (double)x*x;
		}
      });
      System.out.println("aaaaaaaaadoubleRDD Count is:"+doubleRDD.count()+" "+ doubleRDD.first());
      for(double it:doubleRDD.collect()){
    	  System.out.println(it+"   ");
      } 
      //mean是平均数
      System.out.println("9999999999doubleRDD mean is:"+doubleRDD.mean());
      
    }
}

运行输出:

222222222Count is:4
333333333333Count is:3
44444444444likeRDD Count is:3 pandas
pandas   
like   
i   
55555555flatMapRDD Count is:5 pandas
pandas   
like   
i   
like   
pandas   
6666666666distinctRDD Count is:3 pandas
pandas   
i   
like   
777777777777777unionRDD Count is:8 pandas
pandas   
like   
i   
like   
pandas   
pandas   
like   
i   
88888888888intersectionRDD Count is:3 pandas
pandas   
i   
like   
9999999999subtractRDD Count is:0
aaaaaaaaacartesianRDD Count is:15 (pandas,pandas)
pandas   pandas

pandas   like

pandas   i

like   pandas

like   like

like   i

i   pandas

like   pandas

pandas   pandas

i   like

like   like

pandas   like

i   i

like   i

pandas   i

aaaaaaaaadoubleRDD Count is:4 1.0
1.0   
4.0   
9.0   
16.0   
9999999999doubleRDD mean is:7.5


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值