union: merge result sets (duplicates are kept)
scala> val a = sc.parallelize(List(1,2,3,4,5))
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[63] at parallelize at <console>:24
scala> val b = sc.parallelize(List(1,2,5,6,7,8))
b: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[64] at parallelize at <console>:24
scala> val c = a.union(b)
c: org.apache.spark.rdd.RDD[Int] = UnionRDD[65] at union at <console>:28
scala> val d = c.collect
d: Array[Int] = Array(1, 2, 3, 4, 5, 1, 2, 5, 6, 7, 8)
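The RDD API also defines ++ as an alias for union, so the same merge can be written more tersely; a minimal sketch against the a and b above:
scala> (a ++ b).collect    // same as c.collect: duplicates are kept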
Deduplicate and sort (collect is an action). a, b, and c are redefined here with new values:
scala> val a = sc.parallelize(List(1,2,3,4))
scala> val b = sc.parallelize(List(4,5,3,6,8,9))
scala> val c = a.union(b)
scala> c.distinct.sortBy(x=>x).collect
res61: Array[Int] = Array(1, 2, 3, 4, 5, 6, 8, 9)
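sortBy also accepts an ascending flag (default true), so the same pipeline can sort in descending order; a minimal sketch:
scala> c.distinct.sortBy(x=>x, ascending = false).collect    // expected: Array(9, 8, 6, 5, 4, 3, 2, 1)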
Intersection (elements present in both RDDs)
scala> a.collect
res64: Array[Int] = Array(1, 2, 3, 4)
scala> b.collect
res65: Array[Int] = Array(4, 5, 3, 6, 8, 9)
scala> a.intersection(b).collect
res67: Array[Int] = Array(4, 3)
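The complement of intersection is subtract, which keeps the elements of a that are absent from b; a minimal sketch with the same a and b:
scala> a.subtract(b).collect    // expected elements: 1 and 2 (order may vary)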
Joins, analogous to MySQL's JOIN. join itself is an inner join (only keys present in both RDDs appear); in leftOuterJoin/rightOuterJoin a matching value shows as Some and a missing one as None.
scala> val a = sc.parallelize(List(("hi",1),("hello",3),("ws",6)))
a: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[100] at parallelize at <console>:24
scala> val b = sc.parallelize(List(("ws",2),("hi",3),("good",4),("hello",6)))
b: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[101] at parallelize at <console>:24
scala> a.join(b)
res68: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[104] at join at <console>:29
scala> a.join(b).collect
res69: Array[(String, (Int, Int))] = Array((hi,(1,3)), (ws,(6,2)), (hello,(3,6)))
scala> a.leftOuterJoin(b)    // like MySQL's LEFT JOIN
res70: org.apache.spark.rdd.RDD[(String, (Int, Option[Int]))] = MapPartitionsRDD[110] at leftOuterJoin at <console>:29
scala> a.leftOuterJoin(b).collect
res71: Array[(String, (Int, Option[Int]))] = Array((hi,(1,Some(3))), (ws,(6,Some(2))), (hello,(3,Some(6))))
scala> a.rightOuterJoin(b)    // like MySQL's RIGHT JOIN
res72: org.apache.spark.rdd.RDD[(String, (Option[Int], Int))] = MapPartitionsRDD[116] at rightOuterJoin at <console>:29
scala> a.rightOuterJoin(b).collect
res73: Array[(String, (Option[Int], Int))] = Array((hi,(Some(1),3)), (ws,(Some(6),2)), (hello,(Some(3),6)), (good,(None,4)))
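Spark also provides fullOuterJoin (analogous to a full outer join), which wraps both sides in Option; a minimal sketch with the same a and b:
scala> a.fullOuterJoin(b).collect    // expected to contain (good,(None,Some(4))) plus (Some,Some) pairs for the shared keys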
groupByKey: group all values sharing a key into an Iterable
scala> a.collect
res81: Array[(String, Int)] = Array((hi,1), (hello,3), (ws,6))
scala> b.collect
res82: Array[(String, Int)] = Array((ws,2), (hi,3), (good,4), (hello,6))
scala> val g = a.union(b).groupByKey
g: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[123] at groupByKey at <console>:28
scala> g.collect
res83: Array[(String, Iterable[Int])] = Array((ws,CompactBuffer(6, 2)), (hello,CompactBuffer(3, 6)), (hi,CompactBuffer(1, 3)), (good,CompactBuffer(4)))
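groupByKey on the union interleaves values from both RDDs under each key; cogroup keeps the two sources separate, returning a pair of Iterables per key. A minimal sketch:
scala> a.cogroup(b).collect    // expected shape: (ws,(CompactBuffer(6),CompactBuffer(2))), (good,(CompactBuffer(),CompactBuffer(4))), ...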
Operating on the values: sum the values for each key
scala> g.map(x=>(x._1,x._2.sum))
res84: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[124] at map at <console>:31
scala> g.map(x=>(x._1,x._2.sum)).collect
res85: Array[(String, Int)] = Array((ws,8), (hello,9), (hi,4), (good,4))
scala> g.mapValues(_.sum).collect
res86: Array[(String, Int)] = Array((ws,8), (hello,9), (hi,4), (good,4))
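For an aggregation like this sum, reduceByKey is generally preferred over groupByKey because it combines values on the map side before the shuffle; a minimal sketch producing the same sums:
scala> a.union(b).reduceByKey(_ + _).collect    // expected: (ws,8), (hello,9), (hi,4), (good,4) in some order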