If the data already has its keys extracted in the expected form, groupByKey() groups the data by the keys of the RDD. For an RDD composed of keys of type K and values of type V, the resulting RDD is of type [K, Iterable[V]].
To group data from multiple RDDs, use cogroup(). cogroup() groups several RDDs that share the same key type. When cogroup() is called on two RDDs whose keys are both of type K and whose values are of types V and W respectively, the resulting RDD has type [(K, (Iterable[V], Iterable[W]))]. If one RDD has no records for a key that exists in the other RDD, the corresponding iterator is simply empty. cogroup() therefore provides a way to group data across multiple RDDs.
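As a minimal spark-shell sketch of that empty-iterator behaviour (toy data invented here just for illustration; the fuller example follows below):

val users = sc.parallelize(List((1, "alice"), (2, "bob")))
val orders = sc.parallelize(List((1, "book")))   // no record for key 2
users.cogroup(orders).collect()
// roughly: Array((1,(CompactBuffer(alice),CompactBuffer(book))),
//                (2,(CompactBuffer(bob),CompactBuffer())))   <- empty iterator for key 2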
var rdd1 = sc.makeRDD(Array(("A", 0), ("A", 2), ("B", 1), ("B", 2), ("C", 1)))
rdd1.groupByKey().collect
// Use reduceByKey to combine the values of each key in the RDD[K, V] with the given function
var rdd2 = rdd1.reduceByKey((x, y) => x + y)
// Use reduceByKey with an explicit HashPartitioner to repartition rdd1 while aggregating
var rdd2 = rdd1.reduceByKey(new org.apache.spark.HashPartitioner(2), (x, y) => x + y)
rdd2.collect
var rdd2 = sc.makeRDD(Array(("A", "a"), ("C", "c"), ("D", "d")), 2)
var rdd3 = sc.makeRDD(Array(("A", "A"), ("E", "E")), 2)
var rdd4 = rdd1.cogroup(rdd2, rdd3)
rdd4.collect
val pairs = sc.parallelize(List((1, 1), (2, 2), (3, 3)))
pairs.partitioner
val partitioned = pairs.partitionBy(new org.apache.spark.HashPartitioner(2))
partitioned.partitioner
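For reference, the two partitioner checks above behave roughly as follows in the spark-shell (the exact HashPartitioner rendering varies by Spark version), and the retained partitioner is what lets later key-based operations skip a shuffle:

pairs.partitioner        // None: parallelize() assigns no partitioner
partitioned.partitioner  // Some(org.apache.spark.HashPartitioner@...), with 2 partitions
// Because `partitioned` already carries a HashPartitioner, a subsequent
// reduceByKey on it can reuse that partitioning instead of shuffling again.
partitioned.reduceByKey(_ + _).collect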
// Override the partition-lookup function: route each key by its last character
override def getPartition(key: Any): Int = {
  val ckey: String = key.toString
  ckey.substring(ckey.length - 1).toInt % numParts
}
}
val result = data.partitionBy(new CustomerPartitioner(4))
result.mapPartitionsWithIndex((index, iter) => Iterator(index.toString + " : " + iter.mkString("|"))).collect
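The `data` RDD fed into partitionBy above is not defined in this excerpt; a hypothetical sample input whose keys end in a digit (which is what getPartition parses) would be:

// Hypothetical sample input for CustomerPartitioner: every key ends in a digit
val data = sc.parallelize(List(("aa1", 1), ("bb2", 2), ("cc3", 3), ("dd4", 4), ("ee5", 5)), 3)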
val data = sc.parallelize(List((1, "hphblog"), (2, "Spark"), (3, "Flink"), (4, "SpringBoot"), (5, "SpringCloud")))
data.saveAsObjectFile("hdfs://datanode1:9000/objfile")
val objrdd: org.apache.spark.rdd.RDD[(Int, String)] = sc.objectFile[(Int, String)]("hdfs://datanode1:9000/objfile/p*")
objrdd.collect()
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapred.TextOutputFormat

val data = sc.parallelize(Array((1, "Hadoop"), (2, "Spark"), (3, "Flink")))
// Declare the key/value classes to match the RDD's (Int, String) pairs
data.saveAsHadoopFile("hdfs://datanode1:9000/output/hdfs_spark", classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]])
val data = sc.parallelize(Array(("Hadoop",1), ("Spark",2), ("Flink",3))) data.saveAsNewAPIHadoopFile("hdfs://datanod1:9000/output/NewAPI/",classOf[Text],classOf[IntWritable] , classOf[org.apache.hadoop.mapreduce.OutputFormat[Text,IntWritable]])
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by 清风笑丶 Cotter on 2019/5/30.
  */
object JDBCRDD2MySQL {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("JDBCRDD2MySQL")
    val sc = new SparkContext(sparkConf)
    val data = sc.parallelize(List("JDBC2Mysql", "JDBCSaveToMysql", "RDD2Mysql"))

    data.foreachPartition(insertData)
    sc.stop()
  }

  def insertData(iterator: Iterator[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver").newInstance()
    val conn = java.sql.DriverManager.getConnection("jdbc:mysql://datanode1:3306/rdd", "root", "hive")
    iterator.foreach(data => {
      val ps = conn.prepareStatement("insert into rddtable(name) values (?)")
      ps.setString(1, data)
      ps.executeUpdate()
      ps.close()
    })
    // Close the connection once the whole partition has been written
    conn.close()
  }
}
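A slightly more efficient variant of insertData prepares the statement once per partition and uses JDBC batching; a sketch reusing the same table and connection settings as above:

def insertDataBatched(iterator: Iterator[String]): Unit = {
  Class.forName("com.mysql.jdbc.Driver").newInstance()
  val conn = java.sql.DriverManager.getConnection("jdbc:mysql://datanode1:3306/rdd", "root", "hive")
  val ps = conn.prepareStatement("insert into rddtable(name) values (?)")
  iterator.foreach { data =>
    ps.setString(1, data)
    ps.addBatch() // queue the row instead of issuing one round trip per record
  }
  ps.executeBatch() // send all queued rows at once
  ps.close()
  conn.close()
}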
create 'fruit','info'
put 'fruit','1001','info:name','Apple'
put 'fruit','1001','info:color','Red'
put 'fruit','1002','info:name','Banana'
put 'fruit','1002','info:color','Yellow'
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by 清风笑丶 Cotter on 2019/5/30.
  */
object Write2Hbase {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HBaseApp")
    val sc = new SparkContext(sparkConf)

    sc.setLogLevel("ERROR")
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "192.168.1.101")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    // Configure the old-API TableOutputFormat to write into the fruit_spark table
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "fruit_spark")

    // (Re)create the target table with a single column family "info"
    val fruitTable = TableName.valueOf("fruit_spark")
    val tableDescr = new HTableDescriptor(fruitTable)
    tableDescr.addFamily(new HColumnDescriptor("info".getBytes))

    val admin = new HBaseAdmin(conf)
    if (admin.tableExists(fruitTable)) {
      admin.disableTable(fruitTable)
      admin.deleteTable(fruitTable)
    }
    admin.createTable(tableDescr)

    // Convert each (id, name, price) triple into an HBase Put keyed by the id
    def convert(triple: (Int, String, Int)) = {
      val put = new Put(Bytes.toBytes(triple._1))
      put.addImmutable(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(triple._2))
      put.addImmutable(Bytes.toBytes("info"), Bytes.toBytes("price"), Bytes.toBytes(triple._3))
      (new ImmutableBytesWritable, put)
    }

    val initialRDD = sc.parallelize(List((1, "apple", 11), (2, "banana", 12), (3, "pear", 13)))
    val localData = initialRDD.map(convert)
    // Write the RDD to HBase through the TableOutputFormat configured in jobConf
    localData.saveAsHadoopDataset(jobConf)

    sc.stop()
  }
}
  override def add(v: String): Unit = {
    _logArray.add(v)
  }

  override def merge(other: org.apache.spark.util.AccumulatorV2[String, java.util.Set[String]]): Unit = {
    other match {
      case o: LogAccumulator => _logArray.addAll(o.value)
    }
  }
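These two overrides belong to a custom AccumulatorV2[String, java.util.Set[String]]. In case the rest of the class is not at hand, here is a sketch of how it might look, assuming _logArray is a java.util.HashSet as the add/merge bodies suggest:

import org.apache.spark.util.AccumulatorV2

class LogAccumulator extends AccumulatorV2[String, java.util.Set[String]] {
  private val _logArray: java.util.Set[String] = new java.util.HashSet[String]()

  override def isZero: Boolean = _logArray.isEmpty
  override def reset(): Unit = _logArray.clear()
  override def value: java.util.Set[String] = java.util.Collections.unmodifiableSet(_logArray)

  override def copy(): LogAccumulator = {
    val newAcc = new LogAccumulator()
    _logArray.synchronized { newAcc._logArray.addAll(_logArray) }
    newAcc
  }

  override def add(v: String): Unit = {
    _logArray.add(v)
  }

  override def merge(other: AccumulatorV2[String, java.util.Set[String]]): Unit = other match {
    case o: LogAccumulator => _logArray.addAll(o.value)
  }
}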
import org.apache.spark.{SparkConf, SparkContext}

// Filter out the elements that contain letters and collect them in the accumulator
object LogAccumulator {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("LogAccumulator").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val accum = new LogAccumulator
    sc.register(accum, "logAccum")
    val sum = sc.parallelize(Array("1", "2a", "3", "4b", "5", "6", "7cd", "8", "9"), 2).filter(line => {
      val pattern = """^-?(\d+)"""
      val flag = line.matches(pattern)
      if (!flag) {
        accum.add(line)
      }
      flag
    }).map(_.toInt).reduce(_ + _) // 1 + 3 + 5 + 6 + 8 + 9 = 32

    println("sum: " + sum)
    for (v <- accum.value) print(v + " ")
    println()
    sc.stop()
  }
}