输入bean
package com.atguigu.networkflow_analysis.beans;
/**
 * Input bean holding the fields of one access-log record.
 *
 * <p>A plain mutable holder: it offers a no-argument constructor, an all-fields constructor,
 * and one getter/setter pair per field. No validation is performed on any value.
 */
public class ApacheEventLog {
    private String ip;
    private String userId;
    private Long timestamp;
    private String method;
    private String url;

    /** Creates an empty record; every field starts out as {@code null}. */
    public ApacheEventLog() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param ip        client address as it appears in the log line
     * @param userId    user identifier column of the log line
     * @param timestamp event time of the request
     * @param method    HTTP method of the request
     * @param url       requested URL
     */
    public ApacheEventLog(String ip, String userId, Long timestamp, String method, String url) {
        this.ip = ip;
        this.userId = userId;
        this.timestamp = timestamp;
        this.method = method;
        this.url = url;
    }

    // ---- getters ----

    public String getIp() {
        return this.ip;
    }

    public String getUserId() {
        return this.userId;
    }

    public Long getTimestamp() {
        return this.timestamp;
    }

    public String getMethod() {
        return this.method;
    }

    public String getUrl() {
        return this.url;
    }

    // ---- setters ----

    public void setIp(String ip) {
        this.ip = ip;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all fields for debugging; string fields are wrapped in single quotes and
     * unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ApacheEventLog{");
        text.append("ip='").append(ip).append('\'');
        text.append(", userId='").append(userId).append('\'');
        text.append(", timestamp=").append(timestamp);
        text.append(", method='").append(method).append('\'');
        text.append(", url='").append(url).append('\'');
        return text.append('}').toString();
    }
}
输出bean
package com.atguigu.networkflow_analysis.beans;
/**
 * Output bean: the number of views counted for one URL in one window.
 *
 * <p>A plain mutable holder with a no-argument constructor, an all-fields constructor, and a
 * getter/setter pair per field. The window is identified by its end value only.
 */
public class PageViewCount {
    private String url;
    private Long windowEnd;
    private Long count;

    /** Creates an empty result; every field starts out as {@code null}. */
    public PageViewCount() {
    }

    /**
     * Creates a fully populated result.
     *
     * @param url       URL the count belongs to
     * @param windowEnd end of the window the count was taken in
     * @param count     number of views counted
     */
    public PageViewCount(String url, Long windowEnd, Long count) {
        this.url = url;
        this.windowEnd = windowEnd;
        this.count = count;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Long getWindowEnd() {
        return this.windowEnd;
    }

    public void setWindowEnd(Long windowEnd) {
        this.windowEnd = windowEnd;
    }

    public Long getCount() {
        return this.count;
    }

    public void setCount(Long count) {
        this.count = count;
    }

    /**
     * Renders all fields for debugging; the URL is wrapped in single quotes and unset fields
     * print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("PageViewCount{");
        text.append("url='").append(url).append('\'');
        text.append(", windowEnd=").append(windowEnd);
        text.append(", count=").append(count);
        return text.append('}').toString();
    }
}
代码
package com.atguigu.networkflow_analysis.Ahotpages;
import com.atguigu.networkflow_analysis.beans.ApacheEventLog;
import com.atguigu.networkflow_analysis.beans.PageViewCount;
import org.apache.commons.compress.utils.Lists;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Streaming job that reports the top 3 most-viewed pages per sliding window.
 *
 * <p>Pipeline: parse space-separated access-log lines into {@link ApacheEventLog} events,
 * keep GET requests whose URL does not end in a static-resource extension, count hits per
 * URL over a 10-minute window sliding every 5 seconds, then rank the URLs of each window
 * with {@link TopNHotPages}. Records that arrive after a window's allowed lateness has
 * expired are printed from a side output instead of being counted.
 */
public class HotPages {
    /** Matches URLs that do NOT end in .css, .js, .png or .ico; compiled once instead of per record. */
    private static final Pattern NON_STATIC_URL = Pattern.compile("^((?!\\.(css|js|png|ico)$).)*$");

    /**
     * Allowed lateness of the count window in milliseconds. The same value drives the
     * state-cleanup timer in {@link TopNHotPages}, so both are derived from this constant.
     */
    private static final long ALLOWED_LATENESS_MS = 60 * 1000L;

    /**
     * Builds and runs the job, reading log lines from a socket on localhost:7777.
     *
     * @param args unused
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Alternative source: read the log file from the classpath instead of the socket.
        // Enable these two lines (and import java.net.URL) and disable the socket source below;
        // only one declaration of inputStream may be active at a time.
        // URL resource = HotPages.class.getResource("/apache.log");
        // DataStreamSource<String> inputStream = env.readTextFile(resource.getPath());
        DataStreamSource<String> inputStream = env.socketTextStream("localhost", 7777);

        SingleOutputStreamOperator<ApacheEventLog> mapStream = inputStream.map(new MapFunction<String, ApacheEventLog>() {
            @Override
            public ApacheEventLog map(String value) throws Exception {
                // Expected layout: ip userId - dd/MM/yyyy:HH:mm:ss zoneOffset method url
                String[] str = value.split(" ");
                // Day comes first in the data (e.g. 17/05/2015), so the pattern must be dd/MM.
                // The zone-offset column (str[4]) is not parsed; the formatter's default
                // time zone is used.
                SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");
                long ts = sdf.parse(str[3]).getTime(); // getTime() returns milliseconds
                return new ApacheEventLog(str[0], str[1], ts, str[5], str[6]);
            }
        }).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheEventLog>(Time.seconds(1)) {
            @Override
            public long extractTimestamp(ApacheEventLog apacheEventLog) {
                return apacheEventLog.getTimestamp();
            }
        });

        // Step 1: per-URL counts for every sliding window.
        OutputTag<ApacheEventLog> tag = new OutputTag<ApacheEventLog>("late-date") { };
        SingleOutputStreamOperator<PageViewCount> aggStream = mapStream
                .filter(line -> "GET".equals(line.getMethod()))
                .filter(line -> NON_STATIC_URL.matcher(line.getUrl()).matches()) // drop static resources
                .keyBy(ApacheEventLog::getUrl)
                .timeWindow(Time.minutes(10), Time.seconds(5))
                .allowedLateness(Time.milliseconds(ALLOWED_LATENESS_MS)) // maximum accepted lateness
                .sideOutputLateData(tag)                                 // records later than that go here
                .aggregate(new PageCountAgg(), new PageCountResult());

        // Print the records that were too late to be counted.
        aggStream.getSideOutput(tag).print("late");

        // Step 2: rank the URLs that share the same window end.
        SingleOutputStreamOperator<String> result = aggStream.keyBy(PageViewCount::getWindowEnd)
                .process(new TopNHotPages(3));

        result.print();

        env.execute("通知热门网页");
    }

    /**
     * Incremental counter: adds one per event.
     *
     * <p>Type parameters: input type, accumulator type, output type.
     */
    public static class PageCountAgg implements AggregateFunction<ApacheEventLog, Long, Long> {
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(ApacheEventLog value, Long accumulator) {
            return accumulator + 1;
        }

        @Override
        public Long getResult(Long accumulator) {
            return accumulator;
        }

        @Override
        public Long merge(Long a, Long b) {
            return a + b;
        }
    }

    /**
     * Wraps the aggregated count of one URL and window into a {@link PageViewCount}.
     *
     * <p>Type parameters: input type (the count), output type, key type (the URL), window type.
     */
    public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow> {
        @Override
        public void apply(String url, TimeWindow window, Iterable<Long> iterable, Collector<PageViewCount> collector) throws Exception {
            // The iterable is read once: only its first element is used as the count.
            collector.collect(new PageViewCount(url, window.getEnd(), iterable.iterator().next()));
        }
    }

    /**
     * Emits a formatted ranking of the most-viewed URLs for each window end.
     *
     * <p>Type parameters: key type (the window end), input type, output type.
     */
    public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount, String> {
        private final Integer topSize;
        private MapState<String, Long> mapState; // url -> latest count for the current window end

        /**
         * @param topSize maximum number of URLs listed per ranking
         */
        public TopNHotPages(Integer topSize) {
            this.topSize = topSize;
        }

        /** Obtains the map state handle. */
        @Override
        public void open(Configuration parameters) throws Exception {
            mapState = getRuntimeContext().getMapState(new MapStateDescriptor<String, Long>("map:url-count", String.class, Long.class));
        }

        /**
         * Stores the count and registers the timers for its window end.
         *
         * <p>A map keyed by URL is used so that a repeated result for the same URL overwrites
         * the earlier count instead of adding a second entry.
         */
        @Override
        public void processElement(PageViewCount page, Context context, Collector<String> collector) throws Exception {
            mapState.put(page.getUrl(), page.getCount());
            // Output timer; registering the same timestamp again for the same key is harmless.
            context.timerService().registerEventTimeTimer(page.getWindowEnd());
            // Cleanup timer: fires once the allowed lateness has passed.
            context.timerService().registerEventTimeTimer(page.getWindowEnd() + ALLOWED_LATENESS_MS);
        }

        /**
         * Either clears the state (cleanup timer) or emits the current ranking (output timer).
         */
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // The current key is the window end; a timer at windowEnd + allowed lateness is the
            // cleanup timer, so the state is dropped and nothing is emitted.
            if (timestamp == ctx.getCurrentKey() + ALLOWED_LATENESS_MS) {
                mapState.clear();
                return;
            }

            ArrayList<Map.Entry<String, Long>> entries = Lists.newArrayList(mapState.entries().iterator());
            // Descending by count. Long.compare compares the numeric values; the previous
            // comparator returned -1 for both "greater" and "less" and compared boxed Longs
            // with ==, which breaks the ordering.
            entries.sort((left, right) -> Long.compare(right.getValue(), left.getValue()));

            StringBuilder resultBuilder = new StringBuilder();
            resultBuilder.append("===================\n");
            resultBuilder.append("窗口结束时间:").append(new Timestamp(timestamp)).append("\n"); // window end

            int limit = Math.min(topSize, entries.size());
            for (int i = 0; i < limit; i++) {
                Map.Entry<String, Long> currentPageViewCount = entries.get(i);
                resultBuilder.append("Number").append(i + 1).append(":")
                        .append("网页地址:").append(currentPageViewCount.getKey())
                        .append("热门度:").append(currentPageViewCount.getValue())
                        .append("\n");
            }
            resultBuilder.append("===================\n\n");

            Thread.sleep(1000L); // throttles the output rate so the printed rankings are readable
            out.collect(resultBuilder.toString());
        }

        /**
         * Releases nothing beyond what the superclass does.
         *
         * <p>The map state is intentionally not cleared here: it is keyed state and this
         * method is not invoked for a particular key. Per-key cleanup is done by the
         * cleanup timer in {@link #onTimer}.
         */
        @Override
        public void close() throws Exception {
            super.close();
        }
    }
}
窗口分析
1、前置知识
watermark延迟为1s
allowedLateness为1min
窗口长度10min 5s滑动一次
定时器触发时间为窗口结束时间
83.149.9.216 - - 17/05/2015:10:25:49 +0000 GET /presentations/
watermark为 10:25:48
不输出
83.149.9.216 - - 17/05/2015:10:25:50 +0000 GET /presentations/
watermark为 10:25:49
不输出
83.149.9.216 - - 17/05/2015:10:25:51 +0000 GET /presentations/
watermark为 10:25:50 到了窗口[10:15:50,10:25:50)结束时间
窗口[10:15:50,10:25:50)触发计算
输出第1条数据
83.149.9.216 - - 17/05/2015:10:25:52 +0000 GET /presentations/
watermark为 10:25:51
不输出
83.149.9.216 - - 17/05/2015:10:25:46 +0000 GET /presentations/
watermark为 10:25:51 属于窗口[10:15:50,10:25:50)的延迟数据
窗口[10:15:50,10:25:50)等待触发计算 触发时机:watermark更新
不输出
83.149.9.216 - - 17/05/2015:10:25:53 +0000 GET /presentations/
watermark为 10:25:52 watermark更新
第5条延迟数据触发计算
输出第1、5条数据
83.149.9.216 - - 17/05/2015:10:25:43 +0000 GET /presentations/
watermark为 10:25:52 属于窗口[10:15:45,10:25:45)和[10:15:50,10:25:50)的延迟数据
窗口[10:15:45,10:25:45)、[10:15:50,10:25:50)等待触发计算 触发时机:watermark更新
不输出
83.149.9.216 - - 17/05/2015:10:25:47 +0000 GET /present
watermark为 10:25:52 属于窗口[10:15:50,10:25:50)的延迟数据
窗口[10:15:50,10:25:50)等待触发计算 触发时机:watermark更新
不输出
83.149.9.216 - - 17/05/2015:10:25:54 +0000 GET /presentations/
watermark为 10:25:53 watermark更新
第7、8条延迟数据触发计算
[10:15:45,10:25:45)输出第7条数据
[10:15:50,10:25:50)输出第1、5、7、8条数据(第7条数据10:25:43同样落在该窗口内)
83.149.9.216 - - 17/05/2015:10:25:55 +0000 GET /presentations/
watermark为 10:25:54
不输出
83.149.9.216 - - 17/05/2015:10:14:55 +0000 GET /presentations/
watermark为 10:25:54 属于窗口[10:14:55,10:24:55)的延迟数据
窗口[10:14:55,10:24:55)等待触发计算 触发时机:watermark更新
不输出
83.149.9.216 - - 17/05/2015:10:14:53 +0000 GET /presentations/
watermark为 10:25:54 属于窗口[10:14:50,10:24:50)的延迟数据
窗口结束时间与watermark差值超过allowedLateness规定的1min
直接以侧道输出流输出,不参与逻辑计算
83.149.9.216 - - 17/05/2015:10:25:56 +0000 GET /presentations/
watermark为 10:25:55 watermark更新
第11条延迟数据触发计算
[10:14:55,10:24:55)输出第11条数据
watermark为 10:25:55 到了窗口[10:15:55,10:25:55)结束时间
窗口[10:15:55,10:25:55)触发计算
输出第1、2、3、4、5、6、7、8、9条数据
83.149.9.216 - - 17/05/2015:10:14:55 +0000 GET /presentations/
watermark为 10:25:55 属于窗口[10:14:55,10:24:55)延迟数据
窗口结束时间与watermark差值超过allowedLateness规定的1min
直接以侧道输出流输出,不参与逻辑计算
83.149.9.216 - - 17/05/2015:10:25:58 +0000 GET /presentations/
watermark为 10:25:57
不输出