(1) The reflection-based approach
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
object sprkSql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("JdbcRdd").setMaster("local[2]")
val sc = new SparkContext(conf)
//create the SQLContext entry point
val sqlContext = new SQLContext(sc)
val linesrdd = sc.textFile("D:\\数据\\person.txt")
//map each line to a Person; the case class supplies the schema via reflection
val personRdd = linesrdd.map(line=>{
val fields = line.split(",")
val id = fields(0).toLong
val name = fields(1)
val age = fields(2).toInt
val facevalue = fields(3).toDouble
Person(id,name,age,facevalue)
})
//convert the RDD to a DataFrame
import sqlContext.implicits._
val df: DataFrame = personRdd.toDF() //the implicit conversion turns the RDD of Person into a DataFrame
//operate on the DataFrame
//1. to use SQL (table-based operations), the DataFrame must first be registered as a temporary table
df.registerTempTable("t_person")
//SQL statements are executed through the sqlContext
val res = sqlContext.sql("SELECT * FROM t_person ")
//show the result
res.show()
//release resources
sc.stop()
}
}
//reflection-based: the case class defines the schema
case class Person(id:Long,name:String,age:Int,facevalue:Double)
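Once the DataFrame is registered as the temp table t_person, any SQL statement can be run against it through the sqlContext, not only SELECT *. A minimal sketch (the WHERE/ORDER BY clause below is illustrative and not part of the original example):
//hypothetical follow-up query against the same temp table
val adults = sqlContext.sql("SELECT name, age, facevalue FROM t_person WHERE age >= 18 ORDER BY age DESC")
adults.show()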
(2) The approach based on an RDD of Rows plus an explicit schema
import org.apache.spark.sql.types._
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
object RDDtoDF2 {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("JdbcRdd").setMaster("local[2]")
val sc = new SparkContext(conf)
//create the SQLContext entry point
val sqlContext = new SQLContext(sc)
val linesrdd = sc.textFile("D:\\数据\\person.txt")
val personRdd = linesrdd.map(line=>{
val fields = line.split(",")
val id = fields(0).toLong
val name = fields(1)
val age = fields(2).toInt
val facevalue = fields(3).toDouble
Row(id,name,age,facevalue)
})
//define the schema of the table
val sch: StructType = StructType(List(
StructField("id",LongType,true),//field name, data type, nullable
StructField("name",StringType,true),
StructField("age",IntegerType,true),
StructField("facevalue",DoubleType,true)
))
//bind the RDD of Rows to the schema
val df = sqlContext.createDataFrame(personRdd,sch)
//operate on the data through the DataFrame API
val res = df.select("id","name","age")
//sorting: the $"col" syntax needs sqlContext.implicits in scope
import sqlContext.implicits._
val sorted = res.orderBy($"name", $"age".asc)
//rows are sorted by name first; when names are equal, they are then sorted by age
sorted.show()
sc.stop()
}
}
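The explicit-schema approach is mainly useful when the columns are only known at runtime. A minimal sketch of building the StructType dynamically (the columns list below is hypothetical, e.g. read from a config file or a header row):
import org.apache.spark.sql.types._ //same import as in the example above
//hypothetical column list known only at runtime
val columns = List(("id", LongType), ("name", StringType), ("age", IntegerType), ("facevalue", DoubleType))
val dynamicSchema = StructType(columns.map { case (colName, colType) =>
  StructField(colName, colType, nullable = true)
})
//dynamicSchema can then be passed to sqlContext.createDataFrame(rowRdd, dynamicSchema) exactly as above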
Also attached: querying SQL via JdbcRDD. This approach is rather limited: the SQL must contain exactly two ? placeholders for numeric bounds, and every row has to be mapped by hand.
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
object JdbcDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("JdbcRdd").setMaster("local[2]")
val sc = new SparkContext(conf)
//MySQL connection parameters
val driver="com.mysql.jdbc.Driver"
val user="root"
val password="root"
val jdbcUrl="jdbc:mysql://192.168.88.130:3306/sessioanalyze?useUnicode=true"
//function that opens a JDBC connection
val conn = ()=>{
Class.forName(driver) //loading the driver class registers it with DriverManager
DriverManager.getConnection(jdbcUrl,user,password)
}
//prepare the SQL statement; JdbcRDD requires two ? placeholders for the partition bounds
val sql="select * from city_info where city_id >= ? and city_id <= ? "
//run the query through a JdbcRDD
val jdbcRDD:JdbcRDD[(Int,String,String)] = new JdbcRDD(
sc,conn,sql,0,10,1,res=>{ //0 and 10 are the lower/upper bounds bound to the two ? placeholders; 1 is the number of partitions
val id=res.getInt("city_id")
val name=res.getString("city_name")
val area=res.getString("area")
(id,name,area)
}
)
//show the result
jdbcRDD.foreach(println)
sc.stop()
}
}
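If a SQLContext is available as in the earlier examples, a less restrictive alternative is Spark SQL's JDBC data source (Spark 1.4+), which loads the table into a DataFrame without hand-written bounds or row mapping. A minimal sketch assuming the same jdbcUrl, user, password and driver values as above:
//hypothetical sketch: read the same table through the DataFrame JDBC reader
val props = new java.util.Properties()
props.setProperty("user", user)
props.setProperty("password", password)
props.setProperty("driver", driver)
val cityDf = sqlContext.read.jdbc(jdbcUrl, "city_info", props)
cityDf.filter("city_id >= 0 AND city_id <= 10").show()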
Stick to the plan, one day at a time!!