import java.nio.ByteBuffer
import java.sql.Timestamp

import org.apache.kylin.measure.hllc.HLLCounter
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.types._

object StructuredNetworkWordCount {
def main(args: Array[String]): Unit = {
if (args.length < 2) {
System.err.println("Usage: StructuredNetworkWordCount <hostname> <port>")
System.exit(1)
}
val host = args(0)
val port = args(1).toInt
val spark = SparkSession
.builder
.appName("StructuredNetworkWordCount")
.config("spark.default.parallelism",3)
.getOrCreate()
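// Note: spark.default.parallelism mainly affects RDD operations; the shuffle in the
// SQL aggregation below is controlled by spark.sql.shuffle.partitions (200 by default).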
import spark.implicits._
// Create DataFrame representing the stream of input lines from connection to host:port
val lines = spark.readStream
.format("socket")
.option("host", host)
.option("port", port)
.load()
val words = lines.as[String]
.map { x => println("**:" + x); x.split(" ") } // debug: echo each raw input line
.filter(_.length == 3) // keep only well-formed three-field records
.map(x => (x(0), new Timestamp(x(1).toLong), x(2)))
.toDF("dim1", "time", "imeisi")
words.printSchema()
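// Expected input: three space-separated fields -- a dimension value, an epoch-millis
// timestamp, and a device id, e.g. "beijing 1528850000000 860000000000001" (a made-up sample).
// printSchema() above should print:
// root
//  |-- dim1: string (nullable = true)
//  |-- time: timestamp (nullable = true)
//  |-- imeisi: string (nullable = true)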
val wordCounts2 = words
.withWatermark("time", "10 minutes")
wordCounts2.createOrReplaceTempView("uv") // registerTempTable is deprecated since Spark 2.0
spark.udf.register("doubleString",Utils.udfDoubleString _ )
spark.udf.register("HLLCUDAFInt", new HLLCUDAFInt() )
val wordCounts = spark.sql(
"select * from (select time, doubleString(dim1) as aa, doubleString(dim1) as bb, HLLCUDAFInt(imeisi) as uv from uv group by time, doubleString(dim1)) tampa")
// No need to split the result into a new table with insert into -- a temp view is enough; let SQL carry a bit more of the logic.
val query = wordCounts.writeStream
.outputMode("update")
.foreach(new ForeachWriter[Row] {
override def process(value: Row): Unit = {
println(s" ${value.getAs[String](0)} ${value.getAs[String](1)} ${value.getAs[String](2)} ${value.getAs[Int](3)} ")
}
override def close(errorOrNull: Throwable): Unit = {}
override def open(partitionId: Long, version: Long): Boolean = true
}).start()
query.awaitTermination()
}
}
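To try the example end to end, start a socket server first (for example nc -lk 9999 on the target host), submit the job with matching <hostname> <port> arguments, and type lines in the three-field format above; each micro-batch then prints the updated distinct-count estimates.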
class HLLCUDAFInt extends UserDefinedAggregateFunction { // in the IDE, Ctrl+I generates these override stubs
override def inputSchema: StructType = StructType(Array(StructField("input", StringType, nullable = true)))
override def bufferSchema: StructType = StructType(Array(StructField("hllcbyte", BinaryType, nullable = true)))
override def dataType: DataType = LongType
override def deterministic: Boolean = true
// Seed the buffer with the registers of an empty HLLC (precision 14, i.e. 2^14 registers).
override def initialize(buffer: MutableAggregationBuffer): Unit = {
val hllc = new HLLCounter(14)
val bytes = ByteBuffer.allocate(hllc.maxLength())
hllc.writeRegisters(bytes)
buffer(0) = bytes.array
}
// Deserialize the current registers, fold in the new value, and write the registers back.
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
val hllc = new HLLCounter(14)
hllc.readRegisters(ByteBuffer.wrap(buffer.getAs[Array[Byte]](0)))
hllc.add(input.getAs[String](0))
val bytes1 = ByteBuffer.allocate(hllc.maxLength())
hllc.writeRegisters(bytes1)
buffer(0) = bytes1.array
}
// Merge two partial counters and store the combined registers back into buffer1.
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
val hllc = new HLLCounter(14)
hllc.readRegisters(ByteBuffer.wrap(buffer1.getAs[Array[Byte]](0)))
val hllc2 = new HLLCounter(14)
hllc2.readRegisters(ByteBuffer.wrap(buffer2.getAs[Array[Byte]](0)))
hllc.merge(hllc2)
val bytes1 = ByteBuffer.allocate(hllc.maxLength())
hllc.writeRegisters(bytes1)
buffer1(0) = bytes1.array
}
// Final value: the HLLC cardinality estimate as a Long (matches dataType above).
override def evaluate(buffer: Row): Any = {
val hllc = new HLLCounter(14)
hllc.readRegisters(ByteBuffer.wrap(buffer.getAs[Array[Byte]](0)))
hllc.getCountEstimate
}
}
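Note that the UDAF pays a full serialize/deserialize round trip of the HLLC registers on every update and merge call. A minimal standalone sketch of that round trip (same Kylin HLLCounter API the class above uses; the sample values are made up):

import java.nio.ByteBuffer
import org.apache.kylin.measure.hllc.HLLCounter

val hllc = new HLLCounter(14) // precision 14 => 2^14 registers
Seq("imei-1", "imei-2", "imei-1").foreach(s => hllc.add(s))

// Serialize the registers into a byte array -- the same bytes the UDAF
// keeps in its BinaryType buffer column.
val out = ByteBuffer.allocate(hllc.maxLength())
hllc.writeRegisters(out)
val bytes = out.array

// Restore into a fresh counter and read the estimate back.
val restored = new HLLCounter(14)
restored.readRegisters(ByteBuffer.wrap(bytes))
println(restored.getCountEstimate) // expect an estimate close to 2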
The same logic written as a typed Aggregator:
class HllcdistinctValue extends Aggregator[Row, HLLCounter, Long] {
// A zero value for this aggregation. Should satisfy the property that any b + zero = b
def zero: HLLCounter = new HLLCounter()
// Combine two values to produce a new value. For performance, the function may modify `buffer`
// and return it instead of constructing a new object
def reduce(buffer: HLLCounter, row: Row): HLLCounter = {
buffer.add(row.getString(0))
buffer
}
// Merge two intermediate values
def merge(b1: HLLCounter, b2: HLLCounter): HLLCounter = {
b1.merge(b2)
b1
}
// Transform the output of the reduction
def finish(reduction: HLLCounter): Long = reduction.getCountEstimate
// Specifies the Encoder for the intermediate value type
def bufferEncoder: Encoder[HLLCounter] = Encoders.javaSerialization[HLLCounter]
// Specifies the Encoder for the final output value type
def outputEncoder: Encoder[Long] = Encoders.scalaLong
}
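Note: Encoders.javaSerialization requires the buffer class (here HLLCounter) to be Serializable; Encoders.kryo[HLLCounter] is a drop-in alternative that usually yields a more compact binary format.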
Usage:
val hllcDistinct = new HllcdistinctValue().toColumn
// words is already a DataFrame at this point.
// Generate the running distinct-count estimate per window.
val windowedCounts = words
.withWatermark("time", "10 minutes")
// windowDuration and slideDuration are duration strings, e.g. "10 minutes" and "5 minutes".
// Add a dimension column to the groupBy to key the estimate per dimension as well:
// .groupBy(window($"time", windowDuration, slideDuration), $"dim1").agg(hllcDistinct)
.groupBy(window($"time", windowDuration, slideDuration)).agg(hllcDistinct)