Data:
val eventsFromJSONDF = Seq(
  (0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }""")
).toDF("id", "json")
// convert to a DataFrame with two columns: one named id, one named json
------------ get_json_object: extract the device_id value directly from the json column
val jsDF = eventsFromJSONDF.select($"id", get_json_object($"json", "$.device_id").as("device_id"))
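As a minimal sketch of the same pattern (multiDF is a hypothetical name; the data is the single row defined above), several fields can be extracted in one select. Note that get_json_object always returns the value as a string:

import org.apache.spark.sql.functions.get_json_object

val multiDF = eventsFromJSONDF.select(
  $"id",
  get_json_object($"json", "$.device_type").as("device_type"),
  get_json_object($"json", "$.cca3").as("cca3"),
  get_json_object($"json", "$.c02_level").as("c02_level"))
multiDF.show()
// +---+-----------+----+---------+
// | id|device_type|cca3|c02_level|
// +---+-----------+----+---------+
// |  0|sensor-ipad| USA|      917|
// +---+-----------+----+---------+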
------------ from_json: define a schema for the JSON and parse a column of JSON strings into a struct column; typically used when one JSON document is nested inside another
First define the schema:
val jsonSchema = new StructType()
  .add("battery_level", LongType)
  .add("c02_level", LongType)
  .add("cca3", StringType)
  .add("cn", StringType)
  .add("device_id", LongType)
  .add("device_type", StringType)
  .add("signal", LongType)
  .add("ip", StringType)
  .add("temp", LongType)
  .add("timestamp", TimestampType)
val frame: DataFrame = eventsFromJSONDF.select($"id", from_json($"json", jsonSchema).as("column1"))
// $"id" and $"json" are column selectors, picking the id and json columns of the DataFrame.
// from_json($"json", jsonSchema) parses the JSON string in the json column into a struct column
// according to jsonSchema, the schema defined above. When one JSON document nests an array of
// JSON objects and each element is a record in its own right, explode can then be used to
// "blow" the elements out into separate rows, as in the following example:

In that data the teachers field holds two JSON objects. Calling explode on the parsed array splits them apart, producing one row for 张老师 (Teacher Zhang) and one for 王老师 (Teacher Wang).
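A self-contained sketch of that explode pattern (teacherDF, its field names, and the values are hypothetical, chosen to echo the exercise data below; assumes a SparkSession named spark):

import org.apache.spark.sql.functions.{explode, from_json}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
import spark.implicits._

val teacherDF = Seq(
  (1, """[{"name": "张老师", "year": 50}, {"name": "王老师", "year": 60}]""")
).toDF("id", "teachers")

// the teachers column is a JSON array, so the schema is an ArrayType of structs
val teacherSchema = ArrayType(new StructType()
  .add("name", StringType)
  .add("year", IntegerType))

teacherDF
  .select($"id", explode(from_json($"teachers", teacherSchema)).as("teacher"))
  .select($"id", $"teacher.name", $"teacher.year") // struct fields are accessed with dot syntax
  .show()
// the single input row becomes two rows, one for 张老师 and one for 王老师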
---------------- to_json: convert columns back into a JSON string
val jsonDF2: DataFrame = aaa.select(to_json(struct("id", "device_type", "ip")).as("jsonstr"))
// struct("id", "device_type", "ip") packs the columns into a single struct value;
// to_json(...) then serializes that struct to a JSON string.
// (aaa here is assumed to be some DataFrame with id, device_type, and ip columns.)
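A minimal round-trip sketch built on eventsFromJSONDF from above (roundTrip is a hypothetical name): get_json_object pulls device_id out as a string, struct packs it together with id, and to_json serializes the pair back to a JSON string:

import org.apache.spark.sql.functions.{get_json_object, struct, to_json}

val roundTrip = eventsFromJSONDF.select(
  to_json(struct(
    $"id",
    get_json_object($"json", "$.device_id").as("device_id"))).as("jsonstr"))
roundTrip.show(false)
// {"id":0,"device_id":"0"}   <- device_id is quoted because get_json_object returns strings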
Exercise
Given the following data:
{"address":"中国.江苏.徐州","area":4300.21,"classess":[{"classesName":"地下矿山瓦斯检测实验室","num":10}],"level":"211","schoolName":"中国矿业大学","teachers":[{"name":"张院士","year":50},{"name":"王院士","year":60}]}
{"address":"中国.江苏.南京","area":1000.21,"classess":[{"classesName":"园林设计","num":20}],"level":"双一流","schoolName":"南京林业大学","teachers":[{"name":"张院士","year":50},{"name":"王院士","year":60}]}
try to extract every field.
Solution code:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{explode, from_json, get_json_object}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}

object fff {
  def main(args: Array[String]): Unit = {
    // create the SparkSession
    val spark = SparkSession.builder().appName("jsonchang").master("local[*]").getOrCreate()
    // import implicit conversions (needed for toDF and the $"..." column syntax)
    import spark.implicits._
    // read the text file, one JSON document per line
    val aaa = spark.sparkContext.textFile("D:/File/wechat/op2.log")
    // convert the RDD of strings to a single-column DataFrame
    val bbb = aaa.toDF("aa")
    // extract the top-level fields from the JSON strings
    val jsonObject1 = bbb.select(
      get_json_object($"aa", "$.address").as("address"),
      get_json_object($"aa", "$.area").as("area"),
      get_json_object($"aa", "$.classess").as("classess"),
      get_json_object($"aa", "$.level").as("level"),
      get_json_object($"aa", "$.schoolName").as("schoolName"),
      get_json_object($"aa", "$.teachers").as("teachers"))
    // schema of the teachers array
    val schema: ArrayType = ArrayType(
      StructType(
        Array(
          StructField("name", StringType),
          StructField("year", IntegerType)
        )
      )
    )
    // schema of the classess array
    val schema2: ArrayType = ArrayType(
      StructType(
        Array(
          StructField("classesName", StringType),
          StructField("num", IntegerType)
        )
      )
    )
    // parse the classess JSON string into an array of structs
    val jsonObject2 = jsonObject1.select($"address", $"area",
      from_json($"classess", schema2).as("classess"),
      $"level", $"schoolName",
      $"teachers")
    // explode the classess array: one row per class
    val jsonObject3 = jsonObject2.select($"address", $"area", explode($"classess").as("classess"),
      $"level", $"schoolName",
      $"teachers")
    // flatten the classess struct and parse the teachers JSON string into an array of structs
    val jsonObject4 = jsonObject3.select($"address", $"area", $"classess.*", $"level",
      $"schoolName", from_json($"teachers", schema).as("teachers"))
    // explode the teachers array: one row per teacher
    val jsonObject5 = jsonObject4.select($"address", $"area", $"classesName", $"num", $"level",
      $"schoolName", explode($"teachers").as("teachers"))
    // display the result
    jsonObject5.show()
    // write the DataFrame to MySQL
    //MysqlConnect.dataFrameToMysql(jsonObject5, MysqlConnect.table, 1)
  }
}
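For reference, with the two input lines above jsonObject5 should end up with four rows: each school has one class and two teachers, so the two explode calls turn each input line into two rows. The output of show() should look roughly like this (struct rendering varies by Spark version):

+--------------+-------+----------------------+---+------+------------+------------+
|address       |area   |classesName           |num|level |schoolName  |teachers    |
+--------------+-------+----------------------+---+------+------------+------------+
|中国.江苏.徐州|4300.21|地下矿山瓦斯检测实验室|10 |211   |中国矿业大学|[张院士, 50]|
|中国.江苏.徐州|4300.21|地下矿山瓦斯检测实验室|10 |211   |中国矿业大学|[王院士, 60]|
|中国.江苏.南京|1000.21|园林设计              |20 |双一流|南京林业大学|[张院士, 50]|
|中国.江苏.南京|1000.21|园林设计              |20 |双一流|南京林业大学|[王院士, 60]|
+--------------+-------+----------------------+---+------+------------+------------+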