Spark: running SQL queries through DataFrame operations

package com.spark.sql
import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
object DataOperation {
  // On Windows, hadoop.home.dir must point to a Hadoop install containing bin\winutils.exe
  System.setProperty("hadoop.home.dir", "D:\\soft\\hadoop\\hadoop-2.7.3")
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("data-operation").getOrCreate()
    // 1. Read JSON and query it with SQL
    jsonFile(spark)
    // 2. Read plain text and query it with SQL
    // textFile(spark)
    // 3. Case study: movie data
    // movieFile(spark)
    // spark.close()
  }
  def movieFile(spark: SparkSession): Unit = {
    val parquetFile = "G:\\data\\movies.parquet"
    val moviesDF = spark.read.parquet(parquetFile)
    moviesDF.createOrReplaceTempView("movies")
    // Movies featuring Jolie produced after 2009
    // spark.sql("select * from movies where actor_name like '%Jolie%' and produced_year > 2009").show()
    // Number of movies per actor, keeping only actors with more than 30, in descending order
    // spark.sql("select actor_name, count(*) as count from movies group by actor_name having count > 30 order by count desc").show()
    // Number of movies released per year, in descending order
    // spark.sql("""select produced_year, count(*) as count
    //              from (select distinct movie_title, produced_year from movies)
    //              group by produced_year order by count desc""").show(5)
    // Register a global temp view, shared across sessions under the global_temp database
    moviesDF.createOrReplaceGlobalTempView("movies_g")
    spark.sql("select count(*) as total from global_temp.movies_g").show()
  }
  def textFile(spark: SparkSession): Unit = {
    // Each line of people.txt is expected to be "name age", e.g. "Andy 30"
    val file = "G:\\data\\people.txt"
    implicit val personEncoder: Encoder[Person] = ExpressionEncoder()
    val df = spark.read.text(file)
      .map(row => {
        val arr = row.getString(0).split(" ")
        Person(arr(0), arr(1).toInt)
      })/*.toDF()*/
    // Alternative: build the DataFrame from an RDD instead
    // val rdd = spark.sparkContext.textFile(file)
    // import spark.implicits._
    // val rdd1 = rdd.map(line => Person(line.split(" ")(0), line.split(" ")(1).toInt))
    // val df = rdd1.toDF()                  // via implicits
    // val df = spark.createDataFrame(rdd1)  // or explicitly
    df.createOrReplaceTempView("people")
    val df1 = spark.sql("select name,age from people where age > 20 limit 2")
    df1.show()
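    // The same filter can stay in the typed Dataset API, with no SQL string at all.
    // A sketch, operating on the Dataset[Person] built above:
    // df.filter(_.age > 20).show(2)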
    // List the tables and views currently registered in the catalog
    spark.catalog.listTables().show()
  }
  def jsonFile(spark: SparkSession): Unit = {
    val file = "G:\\data\\people.json"
    val df = spark.read.json(file)
    // val ds = df.as[Person]
    // Register a session-scoped temporary view (kept in memory)
    df.createOrReplaceTempView("person")
    val df1 = spark.sql("select * from person where age is not null")
    df1.show()
    val df2 = spark.sql("select name,age from person where age is not null order by age desc limit 1")
    df2.show()
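    // spark.read.json expects one JSON object per line (JSON Lines format), e.g.:
    //   {"name":"Andy", "age":30}
    //   {"name":"Justin"}
    // Objects without an age produce null, which is why the queries above
    // filter on "age is not null".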
  }
}
case class Person(name:String, age:Int)
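To compile and run this locally, a minimal build.sbt along these lines should work. The Scala and Spark versions below are assumptions; match them to your environment (the Hadoop 2.7.3 path above suggests a Spark 2.x setup):

name := "spark-sql-demo"
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.8"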