Structured Streaming SQL: UDF and UDAF

This post walks through a concrete real-time stream-processing example with Apache Spark: receiving a data stream from a given host and port, then cleaning, transforming, and aggregating it with Spark SQL. It shows in detail how a UDF, a custom UDAF, and window functions are used to carry out the more involved processing steps.


import java.sql.Timestamp

import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}

object StructuredNetworkWordCount {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("Usage: StructuredNetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    val host = args(0)
    val port = args(1).toInt

    val spark = SparkSession
      .builder
      .appName("StructuredNetworkWordCount")
      .config("spark.default.parallelism", 3)
      .getOrCreate()

    import spark.implicits._

    // Create a streaming DataFrame representing the lines received from host:port.
    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    // Parse each line of "dim1 <epoch-millis> imeisi" into typed columns,
    // dropping any line that does not have exactly three fields.
    val words = lines.as[String]
      .map { x => println("**:" + x); x.split(" ") }
      .filter(_.length == 3)
      .map(x => (x(0), new Timestamp(x(1).toLong), x(2)))
      .toDF("dim1", "time", "imeisi")
    words.printSchema()

    // Tolerate events arriving up to 10 minutes late.
    val wordCounts2 = words.withWatermark("time", "10 minutes")
    wordCounts2.createOrReplaceTempView("uv") // registerTempTable is deprecated

    spark.udf.register("doubleString", Utils.udfDoubleString _)
    spark.udf.register("HLLCUDAFInt", new HLLCUDAFInt())

    val wordCounts = spark.sql(
      """select * from (
        |  select time,
        |         doubleString(dim1) as aa,
        |         doubleString(dim1) as bb,
        |         HLLCUDAFInt(imeisi) as uv
        |  from uv
        |  group by time, doubleString(dim1)
        |) tampa""".stripMargin)
    // Split the result into a new table with INSERT INTO? Actually unnecessary:
    // a temp view is enough, and it keeps more of the logic on the SQL side.

    val query = wordCounts.writeStream
      .outputMode("update")
      .foreach(new ForeachWriter[Row] {
        override def open(partitionId: Long, version: Long): Boolean = true
        override def process(value: Row): Unit = {
          // Column 0 is a Timestamp and column 3 is a Long (the UDAF's dataType),
          // so read them with matching types rather than String/Int.
          println(s" ${value.getAs[Timestamp](0)} ${value.getAs[String](1)} ${value.getAs[String](2)} ${value.getAs[Long](3)} ")
        }
        override def close(errorOrNull: Throwable): Unit = {}
      }).start()

    query.awaitTermination()
  }
}
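The listing registers Utils.udfDoubleString without ever showing it. A minimal sketch of what such a helper could look like, assuming it simply derives a second grouping string from dim1 (the body below is a guess, not the post's actual implementation):

object Utils {
  // Hypothetical body: the original post never shows this helper. Its name and
  // its use as a GROUP BY key suggest it maps dim1 to a derived string; here it
  // simply repeats the input.
  def udfDoubleString(s: String): String = s + s
}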

import java.nio.ByteBuffer

import org.apache.kylin.measure.hllc.HLLCounter // assumed HLL implementation (Apache Kylin)
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// Wraps an HLL sketch in a UserDefinedAggregateFunction so it can be called from SQL.
// The aggregation buffer holds the counter's serialized registers as a byte array.
// (Tip: Ctrl+I in IntelliJ generates these override stubs.)
class HLLCUDAFInt extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = StructType(Array(StructField("input", StringType, true)))
  override def bufferSchema: StructType = StructType(Array(StructField("hllcbyte", BinaryType, true)))
  override def dataType: DataType = LongType
  override def deterministic: Boolean = true

  // Initialize the buffer with the registers of an empty counter (precision 14).
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    val hllc = new HLLCounter(14)
    val bytes1 = ByteBuffer.allocate(hllc.maxLength())
    hllc.writeRegisters(bytes1)
    buffer(0) = bytes1.array
  }

  // Deserialize the buffer, add the incoming value, and serialize it back.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val hllc = new HLLCounter(14)
    hllc.readRegisters(ByteBuffer.wrap(buffer.getAs[Array[Byte]](0)))
    hllc.add(input.getAs[String](0))
    val bytes1 = ByteBuffer.allocate(hllc.maxLength())
    hllc.writeRegisters(bytes1)
    buffer(0) = bytes1.array
  }

  // Combine two partial aggregates by merging their deserialized counters.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val hllc = new HLLCounter(14)
    hllc.readRegisters(ByteBuffer.wrap(buffer1.getAs[Array[Byte]](0)))
    val hllc2 = new HLLCounter(14)
    hllc2.readRegisters(ByteBuffer.wrap(buffer2.getAs[Array[Byte]](0)))
    hllc.merge(hllc2)
    val bytes1 = ByteBuffer.allocate(hllc.maxLength())
    hllc.writeRegisters(bytes1)
    buffer1(0) = bytes1.array
  }

  // Produce the final approximate distinct count.
  override def evaluate(buffer: Row): Any = {
    val hllc = new HLLCounter(14)
    hllc.readRegisters(ByteBuffer.wrap(buffer.getAs[Array[Byte]](0)))
    hllc.getCountEstimate
  }
}
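A UserDefinedAggregateFunction behaves identically in batch queries, so one way to sanity-check HLLCUDAFInt before wiring it into the stream is to run it over a small static DataFrame. A sketch, assuming spark and import spark.implicits._ from the main listing are in scope:

val df = Seq(
  ("a", "imei-1"), ("a", "imei-2"), ("a", "imei-1"),
  ("b", "imei-3")
).toDF("dim1", "imeisi")
df.createOrReplaceTempView("uv_static")
spark.udf.register("HLLCUDAFInt", new HLLCUDAFInt())
// Expect uv ≈ 2 for dim1 = "a" and uv ≈ 1 for dim1 = "b" (HLL counts are approximate).
spark.sql("select dim1, HLLCUDAFInt(imeisi) as uv from uv_static group by dim1").show()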


The Aggregator approach

import org.apache.spark.sql.{Encoder, Encoders, Row}
import org.apache.spark.sql.expressions.Aggregator

class HllcdistinctValue extends Aggregator[Row, HLLCounter, Long] {
  // A zero value for this aggregation. Should satisfy the property that any b + zero = b.
  def zero: HLLCounter = new HLLCounter()
  // Combine two values to produce a new value. For performance, the function may modify `buffer`
  // and return it instead of constructing a new object.
  def reduce(buffer: HLLCounter, row: Row): HLLCounter = {
    buffer.add(row.getString(0))
    buffer
  }
  // Merge two intermediate values.
  def merge(b1: HLLCounter, b2: HLLCounter): HLLCounter = {
    b1.merge(b2)
    b1
  }
  // Transform the output of the reduction.
  def finish(reduction: HLLCounter): Long = reduction.getCountEstimate
  // Specifies the Encoder for the intermediate value type.
  def bufferEncoder: Encoder[HLLCounter] = Encoders.javaSerialization[HLLCounter]
  // Specifies the Encoder for the final output value type.
  def outputEncoder: Encoder[Long] = Encoders.scalaLong
}

Usage:

import org.apache.spark.sql.functions.window

// The Spark docs example this was adapted from aggregated salaries; here the
// column is really an HLL distinct count, so give it a matching name.
val hllcDistinct = new HllcdistinctValue().toColumn.name("uv")
// words is already a DataFrame at this point.
// Generate a running distinct count per window. windowDuration and slideDuration
// are assumed to be defined elsewhere, e.g. "10 minutes" and "5 minutes".
val windowedCounts = words
  // .groupBy("word", "timestamp").count()
  .withWatermark("timestamp", "10 minutes")
  // .groupBy(window($"timestamp", windowDuration, slideDuration), $"word").agg(hllcDistinct)
  .groupBy(window($"timestamp", windowDuration, slideDuration)).agg(hllcDistinct)
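The snippet stops at defining windowedCounts; to actually run it, the streaming query still has to be started. A minimal sketch using the built-in console sink, with update mode to match the watermarked aggregation above:

val query = windowedCounts.writeStream
  .outputMode("update")          // emit updated window rows as the watermark advances
  .format("console")
  .option("truncate", "false")
  .start()
query.awaitTermination()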