package rddbasic
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by hanq0 on 2017/10/23.
*/
object stuRdds {
val conf = new SparkConf().setAppName("WordCount").setMaster("local")
val sc = new SparkContext(conf)
//内部集合
def innnerRdds(): Unit = {
val data = Array(1, 2, 3, 4, 5)
val distData = sc.parallelize(data)
}
//外部数据
def outterRdds(): Unit = {
val distFile = sc.textFile("data.txt")
}
//基本操作
def basicRdd(): Unit = {
val lines = sc.textFile("data.txt")
val lineLengths = lines.map(s => s.length)
val totalLength = lineLengths.reduce((a, b) => a + b)
}
//调用函数
def passFunctions(): Unit = {
val lines = sc.textFile("data.txt")
val lineLengths = lines.map(MyFunctions.mymap)
//val totalLength = lineLengths.reduce(MyFunctions.myreduce())
}
//闭包,集群模式下变量被分隔
def valuesDisparched(): Unit = {
var counter = 0
val data = Array(1, 2, 3, 4, 5)
var rdd = sc.parallelize(data)
// Wrong: Don't do this!!
rdd.foreach(x => counter += x)
println("Counter value: " + counter)
}
//对键值对(key-value)的操作
def KeyValueactions(): Unit = {
val lines = sc.textFile("data.txt")
val pairs = lines.map(s => (s, 1))
val counts = pairs.reduceByKey((a, b) => a + b)
}
//Rdd的转换Transformations
def RddTransformations(): Unit = {
val data0 = Array(1, 2, 3, 4, 5)
val data1 = Array(6, 7, 8, 9, 0)
val data2 = data0.union(data1)
//sortByKey([ascending], [numTasks]),sample(withReplacement, fraction, seed),repartition(numPartitions)......
}
//Rdd的操作Actions
def RddActions(): Unit = {
val data = Array(1, 2, 3, 4, 5)
var rdd = sc.parallelize(data)
rdd.foreach(x => x+1)
//reduce(func),collect(),count()......
}
//Rdd的持久化
def RddStorage(): Unit = {
val data = sc.textFile("data.txt")
data.persist(StorageLevel.MEMORY_ONLY) // persist() 方法进行设置
data.cache() //cache() 方法是使用默认存储级别的快捷设置方法
}
//共享变量
//Broadcast variables(广播变量)允许程序员将一个 read-only(只读的)变量缓存到每台机器上
def BroadcastVariables(): Unit = {
val broadcastVar = sc.broadcast(Array(1, 2, 3))
print(broadcastVar.value)
}
//Accumulators(累加器)是一个仅可以执行 “added”(添加)的变量来通过一个关联和交换操作
def AccumulatorsVariables(): Unit = {
val accum = sc.longAccumulator("My Accumulator")
sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum.add(x))
print(accum.value)
}
}
10.RDD基本操作
最新推荐文章于 2023-03-11 15:33:28 发布