Spark SQL Programming
1. Add the dependency
First, add the Spark SQL dependency to the pom.xml of your Maven project:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>1.5.2</version>
</dependency>
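If the project is built with sbt instead of Maven, the equivalent line (assuming the same Scala 2.10 / Spark 1.5.2 versions as the coordinates above) would be:

// build.sbt -- sbt equivalent of the Maven dependency above
libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.5.2"

spark-sql pulls in spark-core transitively, so no separate core dependency is needed for these examples.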
2. Inferring the Schema via Reflection
- Create an RDD
- Define a case class
- Associate the RDD with the case class
- Convert the RDD into a DataFrame
- Register the DataFrame as a temporary table
- Execute SQL: sqlContext.sql("sql")
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object PersonSql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PersonSql").setMaster("local[3]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Create the RDD: split each line of a.txt into fields on spaces
    val line = sc.textFile("D:\\sparkdata\\input\\a.txt").map(_.split(" "))
    // Associate the RDD with the Person case class
    val personRdd = line.map(x => Person(x(0).toInt, x(1), x(2).toInt))
    // Convert the RDD into a DataFrame (requires the implicits import)
    import sqlContext.implicits._
    val df = personRdd.toDF
    //df.select("age").show
    //df.filter(df.col("age") >= 23).show
    df.groupBy(df.col("age")).count().show()
    sc.stop()
  }
}

// The case class defines the schema that is inferred via reflection
case class Person(id: Int, name: String, age: Int)
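For reference, a hypothetical a.txt matching the id/name/age layout the code expects (the contents are an assumption, not part of the original post):

1 tom 23
2 jerry 25
3 kate 23

With this input, the groupBy/count query above prints one row per distinct age, e.g. age 23 with a count of 2.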
3. Specifying the Schema Directly with StructType
- Create an RDD
- Specify the schema of each field directly with StructType
- Map the RDD to a rowRDD
- Apply the schema information to the rowRDD
- Register the table
- Execute SQL: sqlContext.sql("sql")
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object SpecifyingSchema {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PersonSql").setMaster("local[3]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Create the RDD
    val personRDD = sc.textFile("D:\\sparkdata\\input\\a.txt").map(_.split(" "))
    // Specify the schema of each field directly with StructType
    val schema = StructType(
      List(
        StructField("id", IntegerType, true),   // field name, type, nullable
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )
    // Map the RDD to a rowRDD
    val rowRDD = personRDD.map(p => Row(p(0).toInt, p(1).trim, p(2).toInt))
    // Apply the schema information to the rowRDD
    val personDataFrame = sqlContext.createDataFrame(rowRDD, schema)
    // Register the table
    personDataFrame.registerTempTable("t_person")
    // Execute SQL
    val df = sqlContext.sql("select * from t_person order by age desc limit 2")
    df.show()
    // Stop the SparkContext
    sc.stop()
  }
}
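The same query can also be written with the DataFrame DSL instead of SQL. A minimal sketch, reusing the personDataFrame built above:

// DSL equivalent of: select * from t_person order by age desc limit 2
personDataFrame.orderBy(personDataFrame("age").desc).limit(2).show()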
4. Saving Results and Reading Data
1. Saving data
val result = sqlContext.sql("select * from t_person order by age desc")
//save to HDFS in the default format (Parquet)
result.save("hdfs://hadoop.itcast.cn:9000/sql/res1")
//save to HDFS as JSON
result.save("hdfs://hadoop.itcast.cn:9000/sql/res2", "json")
2. Reading data
//the data read back in is a DataFrame
val df = sqlContext.load("hdfs://hadoop.itcast.cn:9000/sql/res2", "json")
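Note that DataFrame.save and sqlContext.load are the older entry points; since Spark 1.4 the same operations are usually expressed through the read/write interfaces. A minimal sketch using the same paths as above:

import org.apache.spark.sql.SaveMode

// writer API: same JSON output as result.save(..., "json"), with an explicit save mode
result.write.mode(SaveMode.Overwrite).format("json").save("hdfs://hadoop.itcast.cn:9000/sql/res2")
// reader API: same as sqlContext.load(..., "json")
val df2 = sqlContext.read.format("json").load("hdfs://hadoop.itcast.cn:9000/sql/res2")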