Method 1: via createDataset(seq, list, rdd)
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Dataset, SparkSession}

object CreateDataset {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[4]").appName(this.getClass.getName).getOrCreate()
    // The implicit conversions are required to get the encoders
    import spark.implicits._
    val sc: SparkContext = spark.sparkContext
    // Create a Dataset from a Seq
    val seqDs: Dataset[Int] = spark.createDataset(1 to 10)
    // Create a Dataset from a List
    val listDs: Dataset[(String, Int)] = spark.createDataset(List(("a", 1), ("b", 2), ("c", 3)))
    // Create a Dataset from an RDD
    val rddDs: Dataset[(String, Int, Int)] = spark.createDataset(sc.parallelize(List(("a", 1, 2), ("b", 2, 3), ("c", 3, 4))))
    seqDs.show()
    listDs.show()
    rddDs.show()
  }
}
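The `import spark.implicits._` line is what supplies the implicit Encoder that `createDataset` needs. If you prefer, the Encoder can also be passed explicitly. A minimal sketch of that variant (the object name and value names here are illustrative, not from the original post):

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

object CreateDatasetWithEncoders {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[4]").appName("CreateDatasetWithEncoders").getOrCreate()
    // Pass the Encoder explicitly instead of importing spark.implicits._
    val seqDs: Dataset[Int] = spark.createDataset(1 to 10)(Encoders.scalaInt)
    // Encoders.tuple composes element encoders into a tuple encoder
    val listDs: Dataset[(String, Int)] =
      spark.createDataset(List(("a", 1), ("b", 2)))(Encoders.tuple(Encoders.STRING, Encoders.scalaInt))
    seqDs.show()
    listDs.show()
    spark.stop()
  }
}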
Method 2: via a case class
1. Create a Seq, List, Array, or RDD of case-class instances, then call .toDS to convert it to a Dataset
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Dataset, SparkSession}

object CreateDataSetByCaseClass {
  case class Point(label: String, x: Double, y: Double)
  case class Category(id: Long, name: String)

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[4]").appName(this.getClass.getName).getOrCreate()
    // The implicit conversions are required for .toDS()
    import spark.implicits._
    val sc: SparkContext = spark.sparkContext
    // Create a Seq of Point case-class instances and convert it to a Dataset
    val points: Dataset[Point] = Seq(Point("bar", 2.6, 3.5), Point("foo", 4.0, 3.7)).toDS()
    // Create a Seq of Category case-class instances and convert it to a Dataset
    val categories: Dataset[Category] = Seq(Category(1, "bar"), Category(2, "foo")).toDS()
    // Join them; note the triple equals (===), which is a Column method, not Scala's ==
    points.join(categories, points("label") === categories("name")).show()
    // Create a List of Point case-class instances and convert it to a Dataset
    val points2: Dataset[Point] = List(Point("bar", 2.6, 3.5), Point("foo", 4.0, 3.7)).toDS()
    // Create a List of Category case-class instances and convert it to a Dataset
    val categories2: Dataset[Category] = List(Category(1, "bar"), Category(2, "foo")).toDS()
    // Join them with the same === condition
    points2.join(categories2, points2("label") === categories2("name")).show()
    // Create an RDD of Point case-class instances and convert it to a Dataset
    val points3: Dataset[Point] = sc.parallelize(List(Point("bar", 2.6, 3.5), Point("foo", 4.0, 3.7))).toDS()
    // Create an RDD of Category case-class instances and convert it to a Dataset
    val categories3: Dataset[Category] = sc.parallelize(List(Category(1, "bar"), Category(2, "foo"))).toDS()
    points3.join(categories3, points3("label") === categories3("name")).show()
  }
}
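Because these Datasets are typed with case classes, you can also use the typed operators on top of the column-based join shown above. A small sketch of what that could look like, assuming the points and categories Datasets, the Point and Category case classes, and spark.implicits._ from the listing above (farPoints and joined are illustrative names):

// Typed filter works directly on the case-class fields
val farPoints: Dataset[Point] = points.filter(p => p.x > 3.0)
// joinWith keeps both sides typed and returns a Dataset[(Point, Category)]
val joined: Dataset[(Point, Category)] = points.joinWith(categories, points("label") === categories("name"))
farPoints.show()
joined.show()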
2. Create an RDD first, map it onto the case class, then call .toDS to convert it to a Dataset
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

object CreateDataSetByCaseClass {
  case class Point(label: String, x: Double, y: Double)
  case class Category(id: Long, name: String)

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[4]").appName(this.getClass.getName).getOrCreate()
    // The implicit conversions are required for .toDS()
    import spark.implicits._
    val sc: SparkContext = spark.sparkContext
    // Create an RDD of Point data as plain tuples
    val pointRdd: RDD[(String, Double, Double)] = sc.parallelize(List(("bar", 2.6, 3.5), ("foo", 4.0, 3.7)))
    // Create an RDD of Category data as plain tuples
    val categoriesRdd: RDD[(Int, String)] = sc.parallelize(List((1, "bar"), (2, "foo")))
    // Map both RDDs onto their case classes and convert them to Datasets
    val pointsDS: Dataset[Point] = pointRdd.map(x => Point(x._1, x._2, x._3)).toDS()
    val categoriesDs: Dataset[Category] = categoriesRdd.map(x => Category(x._1, x._2)).toDS()
    // Join the two Datasets and print the result
    pointsDS.join(categoriesDs, pointsDS("label") === categoriesDs("name")).show()
  }
}
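When mapping tuple RDDs onto case classes, a pattern-matching function often reads better than the positional ._1/._2 accessors. A sketch of the same mapping step written that way, assuming the pointRdd and categoriesRdd values from the listing above (pointsDs2 and categoriesDs2 are illustrative names):

// Same conversion as above, with the tuple fields named via pattern matching
val pointsDs2: Dataset[Point] = pointRdd.map { case (label, x, y) => Point(label, x, y) }.toDS()
val categoriesDs2: Dataset[Category] = categoriesRdd.map { case (id, name) => Category(id, name) }.toDS()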
This post covered the ways to create a Dataset in Spark: first via createDataset (from a Seq, List, or RDD), and second via case classes, either by building a Seq, List, etc. of case-class instances and converting with .toDS, or by creating an RDD first, mapping it onto a case class, and then converting it to a Dataset.