1. Network traffic statistics based on tracking-log (buried-point) data
Counting the site's total page views (PV)
Counting the site's unique visitors (UV)
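Both jobs below consume the same tracking log, UserBehavior.csv, in which every line is one user action with five comma-separated fields: userId, itemId, categoryId, behavior (a type tag such as "pv" for a page view), and a timestamp in seconds. A hypothetical sample line, shaped to match the parsing code below rather than taken from the real file:
543462,1715,1464116,pv,1511658000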
package com.chuangyan.network35
import java.time.Duration
import org.apache.flink.api.common.RuntimeExecutionMode
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
// One record of the raw behavior log; timestamp is in seconds
case class UserBehavior(userId: Long, itemId: Long, categoryId: Long, behavior: String, timestamp: Long)
object PageView {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC)
    env.setParallelism(1)
    val source: DataStream[String] = env.readTextFile("D:\\study\\Code\\UserBehavior\\NetworkFlowAnalysis\\src\\main\\resources\\UserBehavior.csv")
    // Parse each CSV line into a UserBehavior record
    val dataStream: DataStream[UserBehavior] = source.map(line => {
      val split = line.split(",")
      val userId = split(0).trim.toLong
      val itemId = split(1).trim.toLong
      val categoryId = split(2).trim.toLong
      val behavior = split(3).trim
      val timestamp = split(4).trim.toLong
      UserBehavior(userId, itemId, categoryId, behavior, timestamp)
    })
      // Timestamps in the log are in seconds: convert to milliseconds and tolerate 2s of out-of-order events
      .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(2))
        .withTimestampAssigner(new SerializableTimestampAssigner[UserBehavior] {
          override def extractTimestamp(element: UserBehavior, recordTimestamp: Long): Long = element.timestamp * 1000L
        }))
    // PV: keep only "pv" events, map each to ("pv", 1), and count them once per hour
    val ds: DataStream[(String, Int)] = dataStream.filter(_.behavior == "pv")
      .map(_ => ("pv", 1))
    ds.keyBy(_._1)
      .window(TumblingEventTimeWindows.of(Time.hours(1))) // 1-hour tumbling event-time window
      .sum(1)
      .print()
    env.execute("pv job")
  }
}
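With parallelism 1, the PV job prints one (pv, count) tuple each time a one-hour event-time window closes. A hypothetical output line (the count depends on the dataset and is only illustrative):
(pv,41890)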
package com.chuangyan.network35
import java.time.Duration
import org.apache.flink.api.common.RuntimeExecutionMode
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
// UserBehavior is already defined in this package (see the PV job above), so it is not redeclared here.
object UniqueVisitor {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC)
    env.setParallelism(1)
    val source = env.readTextFile("D:\\study\\Code\\UserBehavior\\NetworkFlowAnalysis\\src\\main\\resources\\UserBehavior.csv")
    val dataStream: DataStream[UserBehavior] = source.map(line => {
      val split = line.split(",")
      val userId = split(0).trim.toLong
      val itemId = split(1).trim.toLong
      val categoryId = split(2).trim.toLong
      val behavior = split(3).trim
      val timestamp = split(4).trim.toLong
      UserBehavior(userId, itemId, categoryId, behavior, timestamp)
    })
      .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(2))
        .withTimestampAssigner(new SerializableTimestampAssigner[UserBehavior] {
          override def extractTimestamp(element: UserBehavior, recordTimestamp: Long): Long = element.timestamp * 1000L
        }))
    // UV: count distinct users per window
    dataStream.filter(_.behavior == "pv")
      .windowAll(TumblingEventTimeWindows.of(Time.hours(1))) // 1-hour tumbling event-time window
      // compute the number of unique visitors in each window
      .apply(new UvCountByWindow())
      .print()
    env.execute("uv job")
  }
}
case class UvCount(windowEnd: Long, count: Long)
class UvCountByWindow() extends AllWindowFunction[UserBehavior, UvCount, TimeWindow] {
  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UvCount]): Unit = {
    // Deduplicate all records in the window. Common options:
    // 1. an in-memory Set (used here; works while one window's users fit in memory)
    // 2. Redis (move the dedup state to external storage)
    // 3. a Bloom filter (approximate dedup; see the sketch after this class)
    import scala.collection.mutable
    val set = mutable.Set[Long]()
    val it = input.iterator
    while (it.hasNext) {
      set += it.next().userId
    }
    val windowEnd = window.getEnd
    val count = set.size
    out.collect(UvCount(windowEnd, count))
  }
}
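The comments in UvCountByWindow list Redis and Bloom filters as alternatives when a window's distinct-user set no longer fits in memory. Below is a minimal sketch of the Bloom-filter option, assuming Guava is on the classpath; the class name BloomUvCountByWindow and the 10-million-user / 1% sizing are illustrative choices, not part of the original code:
import com.google.common.hash.{BloomFilter, Funnels}

// Hypothetical alternative to UvCountByWindow: approximate the distinct count with
// a per-window Bloom filter instead of a Set. The sizing below is an assumption.
class BloomUvCountByWindow extends AllWindowFunction[UserBehavior, UvCount, TimeWindow] {
  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UvCount]): Unit = {
    // expect up to 10 million distinct users per window with a 1% false-positive rate
    val bloom = BloomFilter.create(Funnels.longFunnel(), 10000000, 0.01)
    var count = 0L
    for (behavior <- input) {
      // a negative membership test guarantees this userId has not been counted yet
      if (!bloom.mightContain(behavior.userId)) {
        bloom.put(behavior.userId)
        count += 1
      }
    }
    out.collect(UvCount(window.getEnd, count))
  }
}
Because false positives make the filter occasionally skip a genuinely new user, this variant can only undercount, and the window still buffers its raw records; the exact Set-based version above is preferable whenever its state fits in memory.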