JSON Data Source
Spark SQL can automatically infer the schema of a JSON dataset, load the data, and create a DataFrame. Use the SQLContext.read.json() method on either an RDD whose elements are JSON strings, or on a JSON file.
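A minimal sketch of the two entry points (sqlContext, jsonStringRDD, and the path are placeholder names assumed for illustration; the full example below shows both in context):

Dataset<Row> fromFile = sqlContext.read().json("path/to/students.json"); // load from a JSON file
Dataset<Row> fromRDD = sqlContext.read().json(jsonStringRDD); // load from a JavaRDD<String> of JSON objects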
Note, however, that the JSON files used here differ from conventional JSON files: each line must contain exactly one self-contained, valid JSON object (the line-delimited "JSON Lines" format). A single JSON object must not span multiple lines; otherwise parsing fails with an error.
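For example, a students.json of the following form parses correctly (these field names and values are an assumption chosen to match the queries below; the original file's contents are not shown):

{"name":"Leo", "score":85}
{"name":"Marry", "score":99}
{"name":"Jack", "score":74}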
Example:
Java version:
package Spark_SQL.Hive_sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.StringType;
/**
* @Date: 2019/3/16 9:13
* @Author Angle
*/
/*
 * Run SQL against a JSON data source:
 * query the information of students whose score is greater than 80.
 *
 * 1. Read the JSON file and query the names of students with score >= 80.
 * 2. Create a DataFrame from JSON strings of student info and query the ages of the qualifying names.
 * 3. Join the two result sets and save the output to a file.
 */
public class JSONDataSource {
    public static void main(String[] args){
        SparkConf conf = new SparkConf().setAppName("JSONDataSource").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Create a DataFrame from the JSON file
        Dataset<Row> stuDF = sqlContext.read().json("E:\\IDEA\\textFile\\hive\\students.json");

        // Register a temporary table and query the students whose score is at least 80
        stuDF.registerTempTable("students");
        Dataset<Row> stuScore = sqlContext.sql("select name,score from students where score>=80");

        // Convert the DataFrame to an RDD, extract the names, and collect them to the driver
        List<String> goodStuName = stuScore.javaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.getString(0);
            }
        }).collect();
        // Create a DataFrame from a JavaRDD<String> of JSON strings
        ArrayList<String> stuInfoJSON = new ArrayList<String>();
        stuInfoJSON.add("{\"name\":\"Leo\", \"age\":15}");
        stuInfoJSON.add("{\"name\":\"Marry\", \"age\":19}");
        stuInfoJSON.add("{\"name\":\"Jack\", \"age\":27}");
        JavaRDD<String> stuInfoJSONRDD = sc.parallelize(stuInfoJSON, 1);
        Dataset<Row> stuInfoDF = sqlContext.read().json(stuInfoJSONRDD);
        // Register a temporary table, then build an IN clause over the qualifying names
        stuInfoDF.registerTempTable("student_info");
        String sql = "select name,age from student_info where name in (";
        for (int i = 0; i < goodStuName.size(); i++){
            sql += "'" + goodStuName.get(i) + "'";
            if (i < goodStuName.size() - 1){
                sql += ",";
            }
        }
        sql += ")";
        // Query name + age for the qualifying students
        Dataset<Row> goodStuInfoDF = sqlContext.sql(sql);

        // Convert both DataFrames to JavaPairRDDs keyed by name, then join them
        // (JSON numeric fields are inferred as Long, hence row.getLong())
        JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentRDD = goodStuInfoDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                // (name, age)
                return new Tuple2<String, Integer>(row.getString(0), (int) row.getLong(1));
            }
        }).join(stuScore.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                // (name, score)
                return new Tuple2<String, Integer>(row.getString(0), (int) row.getLong(1));
            }
        }));
        // The join yields (name, (age, score)); rebuild each entry as a Row of (name, score, age)
        JavaRDD<Row> goodStudentRowRDD = goodStudentRDD.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
            @Override
            public Row call(Tuple2<String, Tuple2<Integer, Integer>> v1) throws Exception {
                return RowFactory.create(v1._1, v1._2._2, v1._2._1);
            }
        });
        // Build the schema and convert the JavaRDD<Row> back into a DataFrame
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", StringType, true));
        structFields.add(DataTypes.createStructField("score", IntegerType, true));
        structFields.add(DataTypes.createStructField("age", IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);
        Dataset<Row> goodStudentDF = sqlContext.createDataFrame(goodStudentRowRDD, structType);

        // Save the result as JSON
        goodStudentDF.write().format("json").save("E:\\IDEA\\textFile\\hive\\goodStudentsDF.json");
    }
}
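The join above is performed at the RDD level (mapToPair plus join) to demonstrate mixing the DataFrame and RDD APIs; the same result could equally be expressed as a DataFrame join on the name column.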
Scala version:
package SparkSQL_Scala.HiveSql
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* @Date: 2019/3/16 14:06
* @Author Angle
*/
object JSONDataSource_s {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JSONDataSource_s").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Read the data --> register a temp table --> run the SQL query --> collect the names to the driver
    val stuScoreDF = sqlContext.read.json("E:\\IDEA\\textFile\\hive\\students.json")
    stuScoreDF.registerTempTable("student_score")
    val goodStuDF = sqlContext.sql("select name,score from student_score where score>=80")
    val goodStuName = goodStuDF.rdd.map(row => row(0)).collect()
    // JSON strings of student info --> parallelize into an RDD --> convert to a DataFrame --> register a temp table
    // (ages aligned with the Java version; the original listing appears to have pasted scores here by mistake)
    val stuInfoJSON = Array(
      "{\"name\":\"Leo\", \"age\":15}",
      "{\"name\":\"Marry\", \"age\":19}",
      "{\"name\":\"Jack\", \"age\":27}")
    val stuInfoJSONRDD = sc.parallelize(stuInfoJSON, 1)
    val stuInfoDF = sqlContext.read.json(stuInfoJSONRDD)
    stuInfoDF.registerTempTable("student_info")
    // Build an IN clause over the qualifying names and query their basic info
    val sql = "select name,age from student_info where name in (" +
      goodStuName.map(name => s"'$name'").mkString(",") + ")"
    val goodStuInfoDF = sqlContext.sql(sql)
    // Join the (name, score) pairs with the (name, age) pairs by name
    // (JSON numeric fields are inferred as Long, hence getAs[Long])
    val goodStuRDD = goodStuDF.rdd
      .map(row => (row.getAs[String]("name"), row.getAs[Long]("score")))
      .join(goodStuInfoDF.rdd.map(row => (row.getAs[String]("name"), row.getAs[Long]("age"))))

    // The join yields (name, (score, age)); rebuild each entry as a Row of (name, score, age)
    val goodStuRowsRDD = goodStuRDD.map(info => Row(info._1, info._2._1.toInt, info._2._2.toInt))
    // Build the schema and convert the RDD[Row] back into a DataFrame
    val structType = StructType(Array(
      StructField("name", StringType, true),
      StructField("score", IntegerType, true),
      StructField("age", IntegerType, true)
    ))
    val goodStudentsDF = sqlContext.createDataFrame(goodStuRowsRDD, structType)

    // Save the DataFrame as JSON
    goodStudentsDF.write.format("json").save("E:\\IDEA\\textFile\\hive\\goodStudentsDF.json")
  }
}
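In both versions, the path passed to save() is treated as a directory: Spark creates goodStudentsDF.json as a folder containing part-* files, each line holding one JSON object (e.g. {"name":"Marry","score":99,"age":19}, assuming the sample data above), i.e. the same line-delimited format that read.json expects.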