package com.qf.sparkstreaming.day04
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{DataTypes, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Structured Streaming example: consume JSON camera events from Kafka and keep a
 * running count, per `start_time`, of the events in which a person was detected.
 *
 * Each Kafka record value is expected to be a JSON document shaped like:
 * {{{
 * {
 *   "devices": {
 *     "cameras": {
 *       "device_id": "awJo6rH",
 *       "last_event": {
 *         "has_sound": true,
 *         "has_motion": true,
 *         "has_person": true,
 *         "start_time": "2016-12-29T00:00:00.000Z",
 *         "end_time": "2016-12-29T18:42:00.000Z"
 *       }
 *     }
 *   }
 * }
 * }}}
 */
object _03KafkaSourceJson {

  /**
   * Builds the streaming query and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .appName("test1")
      .master("local[*]")
      .getOrCreate()
    // Only ERROR-level Spark logs are printed so the console sink output stays readable.
    session.sparkContext.setLogLevel("ERROR")

    // Act as a Kafka consumer. The rows returned by the Kafka source carry the columns
    // key | value | topic | partition | offset | timestamp | timestampType.
    val frame: DataFrame = session.readStream.format("kafka")
      .option("kafka.bootstrap.servers", "qianfeng01:9092,qianfeng02:9092,qianfeng03:9092")
      .option("startingOffsets", "earliest")
      .option("subscribe", "pet")
      .load()

    // Schema of the JSON payload, built from the innermost object outwards.
    // start_time / end_time are timestamps: the "timestampFormat" option below is only
    // applied to TimestampType fields, and a date type would drop the time of day.
    val last_event = new StructType()
      .add("has_sound", DataTypes.BooleanType)
      .add("has_motion", DataTypes.BooleanType)
      .add("has_person", DataTypes.BooleanType)
      .add("start_time", DataTypes.TimestampType)
      .add("end_time", DataTypes.TimestampType)
    val cameras = new StructType()
      .add("device_id", DataTypes.StringType)
      .add("last_event", last_event)
    val devices = new StructType()
      .add("cameras", cameras)
    val schema = new StructType()
      .add("devices", devices)

    // Timestamp pattern of the incoming data: `SSS` is milliseconds (lower-case `sss`
    // would be read as seconds again) and `XXX` parses the trailing `Z` as a UTC offset
    // instead of skipping it as a literal character.
    val jsonOptions = Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")

    import session.implicits._
    import org.apache.spark.sql.functions._

    // The Kafka value is binary: cast it to a string, then parse the JSON text into a
    // struct column that is again named "value".
    val frame1: DataFrame = frame.selectExpr("cast(value as String)")
      .select(from_json($"value", schema, jsonOptions).alias("value"))

    // Project has_person, start_time and end_time out of the parsed struct, keep only
    // the events with a person, and count them per (has_person, start_time).
    val frame2: DataFrame = frame1
      .selectExpr(
        "value.devices.cameras.last_event.has_person",
        "value.devices.cameras.last_event.start_time",
        "value.devices.cameras.last_event.end_time"
      )
      .filter($"has_person" === true)
      .groupBy($"has_person", $"start_time")
      .count()

    // Update mode prints only the groups whose count changed in each micro-batch.
    frame2.writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .start()
      .awaitTermination()
  }
}
/*
Start a Kafka producer and send the following input:
{ "devices": {
"cameras": {
"device_id": "awJo6rH",
"last_event": {
"has_sound": true,
"has_motion": true,
"has_person": true,
"start_time": "2016-12-29T00:00:00.000Z",
"end_time": "2016-12-29T18:42:00.000Z"
}
} } }
*/