import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
object Spark {
final val conf:SparkConf=new SparkConf().setAppName("Spark").setMaster("local")
final val sc:SparkContext=new SparkContext(conf)
// exercising Spark's RDD API from Scala
def main(args:Array[String]):Unit={
//map
val arr=Array("dd","aa","cc")
val arr1=arr.map(x=>x.substring(0,1))
for(i<-arr1) println(i)
//filter: keep only the matching elements
val arr2=arr.filter(x=>x=="aa")
//show(arr2)
//flatMap
val str1=Array("张无忌 宋青书","张媛媛 宋佳")
val st2=str1.flatMap(x=>x.split(" "))
show(st2)
val str3=st2.map("hello "+_)
show(str3)
//mapPartitions
val ap=Array(1,5,21,64,1,77,42,4)
val list: List[Int] = ap.toList
val rdd01=sc.parallelize(list,3)// create an RDD from the list through the SparkContext
rdd01.collect().foreach(println)
// collect() is needed to bring the data back to the driver before it can be printed
// iterate over rdd01
// mapPartitions processes one whole partition of data per call
println("----mapPartition-----")
mapPartitions()
//println("----sample-----")
//sample()
//println("----union-----")
//union()
/*println("---intersection---")
intersection()
println("---disinct---")
distinct()*/
/*println("___cartesian___")
cartesian()*/
/*println("___repartition___")
repartition()
println("___repartitionAndSortWithinPartitions___")
repartitionAndSortWithinPartitions()*/
/*println("__------cogroup-------")
cogroup
println("__------aggregateByKey-------")
aggregateByKey*/
join()
}
// print helper
def show(arr:Array[String]):Unit={
for(i<-arr) print(i+" ")
println()
}
def mapPartitions():Unit={
val ll:List[Int]=List(1,2,3,4,5,6)
val rdd01=sc.parallelize(ll,2)
// doubleP is called once per partition with an iterator over that partition's elements
val rdd02=rdd01.mapPartitions(doubleP)
println(rdd02.collect().mkString(" "))
}
final def doubleP(iter:Iterator[Int]):Iterator[(Int,Int)]={
// build a list of (value, value*2) pairs for every element of this partition
var res=List[(Int,Int)]()
while(iter.hasNext){
val cur=iter.next()
res=(cur,cur*2)::res// prepend, so the partition's output comes back in reverse order
}
res.iterator
}
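// A minimal sketch of mapPartitionsWithIndex, which also passes the partition index so
// you can see which partition each element landed in. The method name and sample data
// here are illustrative additions, not part of the original example.
def mapPartitionsWithIndexSketch():Unit={
val rdd=sc.parallelize(List(1,2,3,4,5,6),2)
val tagged=rdd.mapPartitionsWithIndex((idx,it)=>it.map(x=>(idx,x)))
println(tagged.collect().mkString(" "))
}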
//sample
//take a random sample, with or without replacement
def sample():Unit={
val listA:List[Int]=List(1,3,5,13,56,31,44,12,33,2)
//create the source list
val rdd01=sc.parallelize(listA)
// withReplacement=false, fraction=0.5: each element is kept with probability 0.5,
// so the sample size is only roughly half the input, not an exact count
val random=rdd01.sample(false,0.5)
println(random.map(x=>" "+x).collect().mkString)
}
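// If an exact sample size is needed, takeSample returns that many elements directly to
// the driver as an Array. A minimal sketch; the method name and the count 5 are illustrative.
def takeSampleSketch():Unit={
val rdd=sc.parallelize(List(1,3,5,13,56,31,44,12,33,2))
println(rdd.takeSample(false,5).mkString(" "))// no replacement, exactly 5 elements
}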
def union():Unit={// concatenates the two RDDs, keeping duplicates
val list1=List(1,3,5,6)
val list2=List(1,5,2,1,4)
val rdd1=sc.parallelize(list1)
val rdd2=sc.parallelize(list2)
val ui=rdd1.union(rdd2)
println(ui.map(x=>" "+x).collect().mkString)
}
def intersection():Unit={// elements present in both RDDs
val list1=List(1,3,5,6)
val list2=List(1,5,2,1,4)
val rdd1=sc.parallelize(list1)
val rdd2=sc.parallelize(list2)
val in=rdd1.intersection(rdd2)
println(in.map(x=>" "+x).collect().mkString)
}
def distinct():Unit={// remove duplicate elements
val list2=List(1,5,2,1,4,6,6,3,2,1,3,4,5,1)
val rdd2=sc.parallelize(list2)
val in=rdd2.distinct()
println(in.map(x=>" "+x).collect().mkString)
}
def cartesian():Unit={// Cartesian product of the two RDDs
val list1=List(1,3,5,6)
val list2=List(1,5,2,1,4)
val rdd1=sc.parallelize(list1)
val rdd2=sc.parallelize(list2)
val cart=rdd1.cartesian(rdd2)
println(cart.map(" "+_).collect().mkString)
}
/**
 * Repartition.
 * HDFS -> hello.txt occupies 2 blocks (replicas not counted).
 * 2 blocks -> 2 partitions -> when the Spark job runs, one task is launched per partition.
 *
 * Problem it solves: too few partitions to begin with -> increase the partition count
 * (see the small partition-count check after the method below).
 */
def repartition():Unit={
val list2=List(1,5,2,1,4,1,3,5,6,12,13,14,15,16)
val rdd2=sc.parallelize(list2,1)
val rdd3=rdd2.repartition(4)// shuffle the single partition into 4
println(rdd3.map(" "+_).collect().mkString)
}
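// A minimal check of the doc comment above: partition counts before and after
// repartition. The method name and sample data are illustrative additions.
def repartitionCountSketch():Unit={
val rdd=sc.parallelize(1 to 16,2)
println("before: "+rdd.partitions.length)// 2 partitions -> 2 tasks
println("after: "+rdd.repartition(4).partitions.length)// 4 partitions -> 4 tasks
}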
def repartitionAndSortWithinPartitions():Unit={
// emulates the effect with repartition + sortBy; the real operator is shown in the sketch after this method
val list2=List(1,5,2,1,4,1,3,5,6,12,13,14,15,16)
val rdd2=sc.parallelize(list2,1)
println(rdd2.repartition(4).map(" "+_).sortBy(x=>x).collect().mkString)
val rdd3=rdd2.repartition(4).map(" "+_).sortBy(x=>x)
println("shrink the number of partitions")
val rdd4=rdd3.coalesce(1)// coalesce merges partitions without a full shuffle
println(rdd4.sortBy(x=>x).collect().mkString)
}
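// A minimal sketch of the actual repartitionAndSortWithinPartitions operator, which is
// only defined on key/value RDDs and sorts by key inside each partition as part of the
// shuffle. The method name and sample pairs are illustrative additions.
def repartitionAndSortWithinPartitionsSketch():Unit={
import org.apache.spark.HashPartitioner
val pairs=sc.parallelize(List((3,"c"),(1,"a"),(2,"b"),(1,"aa")),1)
val sorted=pairs.repartitionAndSortWithinPartitions(new HashPartitioner(2))
sorted.collect().foreach(println)
}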
def cogroup():Unit={// groups the values of two key/value RDDs by their common key
val arr1=Array(
("1","李小龙"),("2","刀剑笑"),("3","蜘蛛侠")
)
val arr2=Array(
("1","双节棍"),("2","赤血剑"),("3","危急感官")
)
val rdd1=sc.parallelize(arr1)
val rdd2=sc.parallelize(arr2)
val grouped: RDD[(String, (Iterable[String], Iterable[String]))] = rdd1.cogroup(rdd2)
grouped.foreach(println)// foreach prints each record directly; no explicit formatting needed
//grouped.foreach(x=>println(x._1+"---"+x._2._1+"|"+x._2._2))
}
def aggregateByKey():Unit={// Scala needs no mapToPair; mapping to a tuple is enough
// word count done with plain Scala collections rather than the RDD operator;
// the RDD aggregateByKey itself is sketched after this method
val str=Array("marry i marry who","who will marry you i ?")
val sup=str.flatMap(_.split(" ")).map((_,1)).groupBy(_._1).map(x=>(x._1,x._2.length)).toList.sortBy(_._2).reverse
println(sup.mkString(" "))
}
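// A minimal sketch of the RDD aggregateByKey operator itself: zero value 0, one function
// that adds counts within a partition and one that merges partial counts across partitions.
// The method and variable names are illustrative additions.
def aggregateByKeyOnRddSketch():Unit={
val str=Array("marry i marry who","who will marry you i ?")
val words=sc.parallelize(str).flatMap(_.split(" ")).map((_,1))
val counts=words.aggregateByKey(0)(_+_,_+_)
println(counts.sortBy(_._2,false).collect().mkString(" "))// most frequent words first
}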
def join():Unit={
val arr1=Array(
("1",23),("2",11),("3",22)
)
val arr2=Array(
("1","双节棍"),("2","赤血剑"),("3","危急感官"),("2","天地无双剑")
)
val rdd1=sc.parallelize(arr1)
val rdd2=sc.parallelize(arr2)
val joined=rdd1.join(rdd2)// inner join on the key; key "2" matches twice
val reduced=rdd1.reduceByKey((x,y)=>x+y)// unrelated to join: sums the values per key
println(reduced.collect().mkString(" "))// collect first, otherwise only the RDD's toString is printed
println(joined.map(x=>"+++++"+x._1+"--"+x._2._1+"--"+x._2._2).collect().mkString)
}
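// A minimal sketch of leftOuterJoin: every key from the left RDD is kept and the
// right-hand value becomes an Option, None when there is no match. The sample pairs
// are illustrative additions.
def leftOuterJoinSketch():Unit={
val left=sc.parallelize(Array(("1",23),("2",11),("4",99)))
val right=sc.parallelize(Array(("1","双节棍"),("2","赤血剑")))
val joined=left.leftOuterJoin(right)// RDD[(String,(Int,Option[String]))]
println(joined.collect().mkString(" "))
}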
}