Spark with Java (RDD)
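
This class demonstrates four basic RDD transformations in Spark's Java API: map, flatMap, groupByKey, and reduceByKey, written with anonymous inner classes. The code targets Spark 2.x, where FlatMapFunction.call returns an Iterator.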

package com.hj.spark;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;


public class PartitionRDD {

	public static void main(String[] args) {
		// Configure SparkConf: set the application name and run in local mode
		SparkConf conf = new SparkConf().setAppName("PartitionRDD").setMaster("local");
		// Create the JavaSparkContext
		JavaSparkContext sc = new JavaSparkContext(conf);
//		mapDemo(sc);
//		flatMapDemo(sc);
//		groupByKeyDemo(sc);
		reduceByKey(sc);
		sc.stop();
	}
	// Concatenate strings of the same length into a single string
	private static void reduceByKey(JavaSparkContext sc) {
		List<String> data = Arrays.asList("hadoop","hdfs","hello","world","hadoop","Java","Java","hello");
		// keyBy: key each string by its length, e.g. (6,hadoop) (4,hdfs) (5,hello) ...
		JavaPairRDD<Integer, String> rdd = sc.parallelize(data).keyBy(new Function<String, Integer>() {

			@Override
			public Integer call(String str) throws Exception {
				return str.length();
			}
		});
		// reduceByKey: merge the values of each key pairwise with the given function
		JavaPairRDD<Integer, String> reduceByKeyRDD = rdd.reduceByKey(new Function2<String, String, String>() {

			@Override
			public String call(String str1, String str2) throws Exception {
				return str1 + "_" + str2;
			}
		});
		System.out.println(reduceByKeyRDD.collect());
	}
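
	// A minimal lambda-based sketch of the same pipeline (assumes Java 8+; the
	// method name reduceByKeyLambda is added here for illustration). It keys each
	// string by length and concatenates the values that share a key, as above.
	private static void reduceByKeyLambda(JavaSparkContext sc) {
		List<String> data = Arrays.asList("hadoop","hdfs","hello","world","hadoop","Java","Java","hello");
		JavaPairRDD<Integer, String> rdd = sc.parallelize(data).keyBy(str -> str.length());
		// e.g. [(4,hdfs_Java_Java), (5,hello_world_hello), (6,hadoop_hadoop)] in local mode
		System.out.println(rdd.reduceByKey((s1, s2) -> s1 + "_" + s2).collect());
	}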
	
	// Group strings of the same length under the same key
	private static void groupByKeyDemo(JavaSparkContext sc) {
		List<String> data = Arrays.asList("hadoop","hdfs","hello","world","hadoop","Java","Java","hello");
		JavaPairRDD<Integer, String> rdd = sc.parallelize(data).keyBy(new Function<String, Integer>() {

			@Override
			public Integer call(String str) throws Exception {
				return str.length();
			}
		});
		/*
		 * rdd:
		 * (6,hadoop) (4,hdfs) (5,hello) (5,world) ...
		 */
		JavaPairRDD<Integer, Iterable<String>> groupbyRDD = rdd.groupByKey();
		System.out.println(groupbyRDD.collect());
	}
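
	// With the data above, groupByKey() yields one (length, values) pair per key,
	// e.g. [(4,[hdfs, Java, Java]), (5,[hello, world, hello]), (6,[hadoop, hadoop])].
	// Note that groupByKey shuffles every value across the network; when the grouped
	// values are only going to be combined anyway, reduceByKey (above) is cheaper
	// because it combines values map-side before the shuffle.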

	// flatMap: one input element may produce zero or more output elements
	private static void flatMapDemo(JavaSparkContext sc) {
		List<List<String>> list = Arrays.asList(Arrays.asList("hadoop spark hbase java","java scala hive"),
											Arrays.asList("java scala python"));
		JavaRDD<List<String>> strRDD = sc.parallelize(list);
		// [[hadoop spark hbase java, java scala hive], [java scala python]]
		JavaRDD<String> flatMapRDD = strRDD.flatMap(new FlatMapFunction<List<String>, String>() {

			@Override
			public Iterator<String> call(List<String> strList) throws Exception {
				List<String> result = new ArrayList<String>();
				StringBuilder stringBuilder = new StringBuilder();

				// Join this element's sentences into one space-separated string
				for (int i = 0; i < strList.size(); i++){
					stringBuilder.append(strList.get(i)).append(" ");
				}

				// Split on spaces and collect the words into a list
				String[] words = stringBuilder.toString().split(" ");
				for (String word : words){
					result.add(word);
				}
				/*
				 * array -x-> iterator (no direct conversion here)
				 * array ---> list
				 * list  ---> iterator
				 */
				return result.iterator();
			}

		});
		System.out.println("Original:  " + strRDD.collect());
		System.out.println("Flattened: " + flatMapRDD.collect());
		// Original:  [[hadoop spark hbase java, java scala hive], [java scala python]]
		// Flattened: [hadoop, spark, hbase, java, java, scala, hive, java, scala, python]
	}
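
	// A minimal lambda-based sketch of the same flattening (assumes Java 8+; the
	// method name flatMapLambdaDemo is added here for illustration): split each
	// sentence into words directly instead of joining through a StringBuilder.
	private static void flatMapLambdaDemo(JavaSparkContext sc) {
		List<List<String>> list = Arrays.asList(
				Arrays.asList("hadoop spark hbase java", "java scala hive"),
				Arrays.asList("java scala python"));
		JavaRDD<String> words = sc.parallelize(list).flatMap(strList -> {
			List<String> out = new ArrayList<String>();
			for (String sentence : strList) {
				out.addAll(Arrays.asList(sentence.split(" ")));
			}
			return out.iterator();
		});
		// [hadoop, spark, hbase, java, java, scala, hive, java, scala, python]
		System.out.println(words.collect());
	}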

	// map: one input element produces exactly one output element
	private static void mapDemo(JavaSparkContext sc) {
		// Create test data
		List<String> list = Arrays.asList("hadoop spark hbase java","java scala hive");
		JavaRDD<String> strRDD = sc.parallelize(list);
		// strRDD: [hadoop spark hbase java, java scala hive]
		// Function's first type parameter is the input type, the second the output type
		JavaRDD<String[]> splitRDD = strRDD.map(new Function<String, String[]>() {

			@Override
			public String[] call(String v1) throws Exception {
				return v1.split(" ");
			}
		});
		/*
		 * splitRDD:
		 * [[hadoop, spark, hbase, java],
		 *  [java, scala, hive]]
		 */
		List<String[]> result = splitRDD.collect();
		for (int i = 0; i < result.size(); i++){
			for (String s : result.get(i)){
				System.out.println("Array " + i + " data: " + s);
			}
		}
	}
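
	// A minimal sketch of mapToPair (assumes Java 8+; the method name mapToPairDemo
	// is added here for illustration): an alternative to keyBy(...) above that
	// builds the (length, string) pairs explicitly. scala.Tuple2 is fully
	// qualified so that no extra import is needed.
	private static void mapToPairDemo(JavaSparkContext sc) {
		List<String> data = Arrays.asList("hadoop", "hdfs", "hello", "world");
		JavaPairRDD<Integer, String> pairs = sc.parallelize(data)
				.mapToPair(s -> new scala.Tuple2<Integer, String>(s.length(), s));
		// [(6,hadoop), (4,hdfs), (5,hello), (5,world)]
		System.out.println(pairs.collect());
	}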

}
