Spark Streaming case study notes
1. Reading files from the last N minutes
// Union the parquet files written over the past `big_time` minutes,
// stepping back one `batchTime`-minute window at a time.
def readFile(batchTime: Int, big_time: Int, frame: DataFrame, template_path: String, spark: SparkSession): DataFrame = {
  val nowMinute: String = getNowTime()._3
  // Offset of the current minute within its batch window.
  val remainder = nowMinute.toInt % batchTime
  var file_df: DataFrame = frame
  for (i <- Range(0, big_time.abs / batchTime)) {
    // i = 0 hits the current window, i = 1 the previous one, and so on.
    val cut_time = i * batchTime * (-1) - remainder
    val time = getBeforeTime(cut_time)
    try {
      val df: DataFrame = spark.read.parquet(s"${template_path}/dt=${time._1}/hour=${time._2}/minute=${time._3}/*")
      file_df = file_df.union(df)
    } catch {
      // The directory for a given minute may not exist; skip it and move on.
      case e: Exception => println(e)
    }
  }
  file_df
}
When Spark Streaming writes files to HDFS, it writes one batch per window, aligned to minute 00 of each hour. The code above first takes the current minute modulo the window length; the most recent file is therefore at the current time minus that remainder, the next one at the remainder plus one window length back, the one after at the remainder plus two window lengths back, and so on.
Every candidate path is read unconditionally, whether or not the file exists; if it does not, the exception handler simply skips it.
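A minimal call sketch, assuming the empty-DataFrame seed from section 3 below and a hypothetical base path:

// Union everything written over the last 30 minutes, in 5-minute windows.
// "/data/template" is a placeholder path.
val emptyDf = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], templateSchema)
val recentDf = readFile(5, 30, emptyDf, "/data/template", spark)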
2. Time handling in Scala
Current time:
// Imports needed: java.text.SimpleDateFormat, java.util.Date.
// Use lowercase "yyyy": uppercase "YYYY" is the week-based year and
// produces wrong dates around New Year.
def getNowTime() = {
  val format = new SimpleDateFormat("yyyyMMddHHmmss")
  val date = new Date()
  val formatDate = format.format(date)
  val day = formatDate.substring(0, 8)
  val hour = formatDate.substring(8, 10)
  val minute = formatDate.substring(10, 12)
  (day, hour, minute, formatDate)
}
N minutes ago:
def getBeforeTime(bef_time: Int) = {
  // A negative offset walks back in time (Calendar is in java.util).
  val cal = Calendar.getInstance()
  cal.setTime(new Date())
  cal.add(Calendar.MINUTE, bef_time)
  val format = new SimpleDateFormat("yyyyMMddHHmmss")
  val formatDate = format.format(cal.getTime)
  val day = formatDate.substring(0, 8)
  val hour = formatDate.substring(8, 10)
  val minute = formatDate.substring(10, 12)
  (day, hour, minute, formatDate)
}
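For example, stepping five minutes back (the exact values depend on the wall clock):

val (day, hour, minute, full) = getBeforeTime(-5)
// e.g. ("20240101", "09", "55", "20240101095500")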
3. Creating a schema
// Field name -> Spark SQL type name (StructType.add accepts the type as a string).
val templateElement = Seq("now_time" -> "long", "channel" -> "string",
  "serverName" -> "string", "requestTime" -> "string",
  "responseTime" -> "string", "msgId" -> "string",
  "shortUrl" -> "string", "sceneId" -> "string",
  "shopId" -> "string", "number" -> "string",
  "contentSign" -> "string", "ip" -> "string", "deviceId" -> "string",
  "timestamp" -> "string", "nonce" -> "string", "sign" -> "string",
  "subCode" -> "string", "message" -> "string",
  "cardTplId" -> "string", "dyncParams" -> "string") // 20 fields in total
var templateSchema: StructType = new StructType()
for (i <- templateElement) {
  templateSchema = templateSchema.add(i._1, i._2)
}
Creating an empty DataFrame (the union seed used in section 1; sc is the SparkContext):
val file_df = spark.createDataFrame(sc.emptyRDD[Row], templateSchema)
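On Spark 2.3+ the same schema can also be built in one step from a DDL string; a minimal sketch reusing the templateElement pairs above:

import org.apache.spark.sql.types.StructType

val ddl = templateElement.map { case (name, tpe) => s"$name $tpe" }.mkString(", ")
val templateSchemaFromDdl = StructType.fromDDL(ddl)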
4. Saving a DataFrame to HDFS
def save2hdfs(path: String, df: DataFrame) = {
  val dt: (String, String, String, String) = getNowTime()
  val day = dt._1
  val hour = dt._2
  val minute = dt._3
  // Skip empty batches so no empty parquet directories get created.
  if (!myIsEmpty(df)) {
    df.write.mode(SaveMode.Append).parquet(s"${path}/dt=${day}/hour=${hour}/minute=${minute}")
  }
}
def myIsEmpty(df: DataFrame): Boolean = {
  try {
    // head() throws NoSuchElementException on an empty DataFrame.
    df.head()
    false
  } catch {
    case e: NoSuchElementException =>
      println(e.getMessage)
      true
  }
}
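From Spark 2.4 on, the built-in df.isEmpty does the same job; on older versions an exception-free equivalent (a hypothetical variant, not the helper above) is:

// Pull at most one row; an empty result means an empty DataFrame.
def myIsEmptyNoCatch(df: DataFrame): Boolean = df.take(1).isEmpty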
5. Writing a DataFrame to MySQL
val mysqlUrl = "jdbc:mysql://localhost:3306/test?characterEncoding=utf8&allowMultiQueries=true&useTimezone=true&serverTimezone=Asia/Shanghai"
val user = "root"
val passwd = "root"
// Open one connection and one batched statement per partition, not per row.
joinDf.foreachPartition(rows => {
  val connection = DriverManager.getConnection(mysqlUrl, user, passwd)
  val sql = "insert into smi_pull_parsing_data_info(create_time,parsing_date,parsing_time,message_id,template_id,code,firm,short_url,scene_id) values(?,?,?,?,?,?,?,?,?)"
  val ps = connection.prepareStatement(sql)
  try {
    for (row <- rows) {
      for (i <- Range(0, row.length)) {
        // JDBC parameter indexes are 1-based.
        ps.setString(i + 1, row.get(i).toString)
      }
      ps.addBatch()
    }
    ps.executeBatch()
  } finally {
    // Release resources even if a row fails, so connections do not leak.
    ps.close()
    connection.close()
  }
})
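If no hand-written SQL is needed, Spark's built-in JDBC writer achieves the same result; a minimal sketch, assuming the DataFrame's column names match the table's columns and the MySQL driver is on the classpath:

import java.util.Properties

val props = new Properties()
props.setProperty("user", user)
props.setProperty("password", passwd)
// Column names in joinDf must line up with the table's columns.
joinDf.write.mode(SaveMode.Append).jdbc(mysqlUrl, "smi_pull_parsing_data_info", props)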
6. Custom functions for DataFrame operators (example: timestamp handling)
// Requires: import org.apache.spark.sql.functions.udf
// Millisecond timestamp string -> "yyyyMMdd" day string.
val stamp2Time_day = (str: String) => {
  val format = new SimpleDateFormat("yyyyMMdd")
  val timeLong = str.toLong
  format.format(timeLong)
}
// Millisecond timestamp string -> "yyyy-MM-dd HH:mm:ss" date-time string.
val stamp2Time = (str: String) => {
  val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  val timeLong = str.toLong
  format.format(timeLong)
}
val udf_stamp2Time = udf(stamp2Time)
val udf_stamp2Time_day = udf(stamp2Time_day)
// Equi-join the callback and template frames on msgid, formatting the
// timestamp with the UDFs on the way out.
val joinDf: DataFrame = callbackDf.join(templateDf).where(callbackDf("msgid") === templateDf("msgid"))
  .select(callbackDf("now_time"), udf_stamp2Time_day(callbackDf("timestamp")),
    udf_stamp2Time(callbackDf("timestamp")), callbackDf("msgid"), callbackDf("cardTplId"),
    callbackDf("code"), callbackDf("channel"), templateDf("shorturl"), templateDf("sceneId")
  )
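If the timestamp field holds epoch milliseconds (an assumption here), the built-in from_unixtime, which expects seconds, can replace both UDFs; a sketch:

import org.apache.spark.sql.functions.from_unixtime

// Convert the string column to whole seconds, then format.
val secs = (callbackDf("timestamp").cast("long") / 1000).cast("long")
val dayCol  = from_unixtime(secs, "yyyyMMdd")
val timeCol = from_unixtime(secs, "yyyy-MM-dd HH:mm:ss")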
7. Parsing JSON from an RDD
// Uses fastjson (com.alibaba.fastjson.{JSON, JSONArray, JSONObject}) and java.util.
// value holds the raw log lines; keep only this topic's lines and drop any
// rows that failed to parse.
val templateRdd: RDD[Row] = value.filter(_.contains(template_topic)).map(templateJson2row).filter(_ != null)
def templateJson2row: String => Row = {
  x => {
    try {
      // The raw log line carries five whitespace-separated prefix fields;
      // the JSON payload is everything after them.
      val jsonText: String = x.split("\\s+", 6).last
      val jsonobject = JSON.parseObject(jsonText)
      val channel: String = jsonobject.getString("channel")
      val serverName: String = jsonobject.getString("serverName")
      val requestTime: String = jsonobject.getString("requestTime")
      val responseTime: String = jsonobject.getString("responseTime")
      // Defaults for request fields that may be missing from the payload.
      var msgId, shortUrl, sceneId, shopId, number, contentSign, ip, deviceId, timestamp, nonce, sign = ""
      if (jsonobject.containsKey("request")) {
        val requestObject: JSONObject = jsonobject.getJSONObject("request")
        msgId = requestObject.getString("msgId")
        shortUrl = requestObject.getString("shortUrl")
        sceneId = requestObject.getString("sceneId")
        shopId = requestObject.getString("shopId")
        number = requestObject.getString("number")
        ip = requestObject.getString("ip")
        deviceId = requestObject.getString("deviceId")
        timestamp = requestObject.getString("timestamp")
        nonce = requestObject.getString("nonce")
        sign = requestObject.getString("sign")
        // Flatten the contentSign array into a comma-separated string.
        val array: JSONArray = requestObject.getJSONArray("contentSign")
        if (array.size() > 0) {
          for (a <- Range(0, array.size())) {
            contentSign = contentSign + "," + array.get(a)
          }
          contentSign = contentSign.substring(1) // drop the leading comma
        }
      }
      var subCode, message, cardTplId, dyncParams = ""
      if (jsonobject.containsKey("response")) {
        val responseObject: JSONObject = jsonobject.getJSONObject("response")
        subCode = responseObject.getString("subCode")
        message = responseObject.getString("message")
        if (responseObject.containsKey("data")) {
          val dataObject: JSONObject = responseObject.getJSONObject("data")
          cardTplId = dataObject.getString("cardTplId")
          // Flatten the dyncParams object into comma-separated "key:value" pairs.
          if (dataObject.containsKey("dyncParams")) {
            val dyncParams_object: JSONObject = dataObject.getJSONObject("dyncParams")
            val set: util.Set[String] = dyncParams_object.keySet()
            if (set.size() > 0) {
              val it: util.Iterator[String] = set.iterator()
              while (it.hasNext) {
                val key_it = it.next()
                dyncParams = dyncParams + "," + key_it + ":" + dyncParams_object.getString(key_it)
              }
              if (dyncParams.length > 1) {
                dyncParams = dyncParams.substring(1)
              }
            }
          }
        }
      }
      // Field order must match templateSchema from section 3.
      Row(getNowTime()._4.toLong, channel, serverName, requestTime, responseTime,
        msgId, shortUrl, sceneId, shopId, number,
        contentSign, ip, deviceId, timestamp, nonce,
        sign, subCode, message, cardTplId, dyncParams)
    } catch {
      // Malformed lines are logged and mapped to null; the caller filters them out.
      case e: Exception =>
        println(e)
        println(x)
        null
    }
  }
}
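To tie this back to sections 3 and 4, the parsed rows can be turned into a DataFrame with the schema built earlier and then persisted; a sketch with a placeholder path:

val templateDf = spark.createDataFrame(templateRdd, templateSchema)
save2hdfs("/data/template", templateDf) // "/data/template" is a placeholder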
This post walked through several Spark Streaming recipes: reading the files produced within a given time window, time-handling helpers, DataFrame and schema operations, saving results to HDFS and MySQL, custom UDFs, and extracting fields from JSON log lines. Together they show how Spark Streaming can handle real-time data processing and storage end to end.