Flink入门第十六课:DataStream api在数据延迟下统计热门网页并进行窗口分析

本篇博客介绍了如何使用Flink的DataStream API进行窗口分析,针对数据延迟情况统计热门网页。通过定义输入和输出Bean,详细展示了相关代码实现,并深入探讨了窗口分析的应用。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

输入bean

package com.atguigu.networkflow_analysis.beans;

/**
 * Mutable bean describing one parsed access-log record.
 *
 * <p>Exposes a public no-argument constructor plus a getter/setter pair for every field.
 */
public class ApacheEventLog {
    /** Client address taken from the log line. */
    private String ip;
    /** User identifier column of the log line. */
    private String userId;
    /** Event time of the request; the HotPages job fills it with epoch milliseconds. */
    private Long timestamp;
    /** HTTP method of the request. */
    private String method;
    /** Requested URL. */
    private String url;

    /** Creates an empty record; every field starts out as {@code null}. */
    public ApacheEventLog() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param ip        client address
     * @param userId    user identifier
     * @param timestamp event time of the request
     * @param method    HTTP method
     * @param url       requested URL
     */
    public ApacheEventLog(String ip, String userId, Long timestamp, String method, String url) {
        this.ip = ip;
        this.userId = userId;
        this.timestamp = timestamp;
        this.method = method;
        this.url = url;
    }

    // ---- getters ----

    public String getIp() {
        return this.ip;
    }

    public String getUserId() {
        return this.userId;
    }

    public Long getTimestamp() {
        return this.timestamp;
    }

    public String getMethod() {
        return this.method;
    }

    public String getUrl() {
        return this.url;
    }

    // ---- setters ----

    public void setIp(String ip) {
        this.ip = ip;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Renders all five fields; string fields are wrapped in single quotes. */
    @Override
    public String toString() {
        return String.format(
                "ApacheEventLog{ip='%s', userId='%s', timestamp=%s, method='%s', url='%s'}",
                ip, userId, timestamp, method, url);
    }
}

 

输出bean

package com.atguigu.networkflow_analysis.beans;

/**
 * Mutable bean holding the number of views of one URL within one window.
 *
 * <p>Exposes a public no-argument constructor plus a getter/setter pair for every field.
 */
public class PageViewCount {
    /** URL that was counted. */
    private String url;
    /** End timestamp of the window the count belongs to. */
    private Long windowEnd;
    /** Number of views counted for {@link #url} in that window. */
    private Long count;

    /** Creates an empty result; every field starts out as {@code null}. */
    public PageViewCount() {
    }

    /**
     * Creates a fully populated result.
     *
     * @param url       URL that was counted
     * @param windowEnd end timestamp of the window
     * @param count     number of views in the window
     */
    public PageViewCount(String url, Long windowEnd, Long count) {
        this.url = url;
        this.windowEnd = windowEnd;
        this.count = count;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Long getWindowEnd() {
        return this.windowEnd;
    }

    public void setWindowEnd(Long windowEnd) {
        this.windowEnd = windowEnd;
    }

    public Long getCount() {
        return this.count;
    }

    public void setCount(Long count) {
        this.count = count;
    }

    /** Renders all three fields; the URL is wrapped in single quotes. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("PageViewCount{");
        text.append("url='").append(url).append('\'');
        text.append(", windowEnd=").append(windowEnd);
        text.append(", count=").append(count);
        text.append('}');
        return text.toString();
    }
}

 代码

package com.atguigu.networkflow_analysis.Ahotpages;

import com.atguigu.networkflow_analysis.beans.ApacheEventLog;
import com.atguigu.networkflow_analysis.beans.PageViewCount;
import org.apache.commons.compress.utils.Lists;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Flink streaming job that reports the top 3 most-requested pages for every sliding
 * event-time window.
 *
 * <p>Pipeline: parse access-log lines, assign timestamps and watermarks (1 s bounded
 * out-of-orderness), keep GET requests whose URL is not a static resource, count requests per URL
 * in 10-minute windows sliding every 5 seconds (1 minute allowed lateness; anything later goes to
 * a side output), then rank the per-URL counts of each window.
 */
public class HotPages {
    /** Timestamp layout of the log lines, day first (e.g. {@code 17/05/2015:10:25:49}). */
    private static final String LOG_TIME_PATTERN = "dd/MM/yyyy:HH:mm:ss";

    /** Matches URLs that do NOT end in .css, .js, .png or .ico; compiled once, not per record. */
    private static final Pattern NON_STATIC_RESOURCE_URL =
            Pattern.compile("^((?!\\.(css|js|png|ico)$).)*$");

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Lines are read from a socket. To replay a file from the classpath instead, use:
        //     env.readTextFile(HotPages.class.getResource("/apache.log").getPath());
        DataStreamSource<String> inputStream = env.socketTextStream("localhost", 7777);

        SingleOutputStreamOperator<ApacheEventLog> mapStream = inputStream
                .map(new MapFunction<String, ApacheEventLog>() {
                    @Override
                    public ApacheEventLog map(String value) throws Exception {
                        // Space-separated columns: 0 = ip, 1 = user id, 3 = time, 5 = method, 6 = url.
                        String[] fields = value.split(" ");
                        // SimpleDateFormat is not thread-safe, so it is created per call, not shared.
                        SimpleDateFormat sdf = new SimpleDateFormat(LOG_TIME_PATTERN);
                        long ts = sdf.parse(fields[3]).getTime(); // epoch milliseconds
                        return new ApacheEventLog(fields[0], fields[1], ts, fields[5], fields[6]);
                    }
                })
                .assignTimestampsAndWatermarks(
                        new BoundedOutOfOrdernessTimestampExtractor<ApacheEventLog>(Time.seconds(1)) {
                            @Override
                            public long extractTimestamp(ApacheEventLog apacheEventLog) {
                                return apacheEventLog.getTimestamp();
                            }
                        });

        // Step 1: count views per URL and window.
        OutputTag<ApacheEventLog> tag = new OutputTag<ApacheEventLog>("late-date") { };
        SingleOutputStreamOperator<PageViewCount> aggStream = mapStream
                .filter(line -> "GET".equals(line.getMethod()))
                .filter(line -> NON_STATIC_RESOURCE_URL.matcher(line.getUrl()).matches())
                .keyBy(ApacheEventLog::getUrl)
                .timeWindow(Time.minutes(10), Time.seconds(5))
                .allowedLateness(Time.minutes(1))   // keep fired windows open for late records
                .sideOutputLateData(tag)            // records later than that go to the side output
                .aggregate(new PageCountAgg(), new PageCountResult());

        // Print the records that were too late for every window.
        aggStream.getSideOutput(tag).print("late");

        // Step 2: rank the URLs inside each window.
        SingleOutputStreamOperator<String> result = aggStream
                .keyBy(PageViewCount::getWindowEnd)
                .process(new TopNHotPages(3));
        result.print();

        env.execute("通知热门网页");
    }

    /**
     * Incremental counter: adds one per incoming record.
     *
     * <p>Type parameters: input record, accumulator, result.
     */
    public static class PageCountAgg implements AggregateFunction<ApacheEventLog, Long, Long> {

        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(ApacheEventLog value, Long accumulator) {
            return accumulator + 1;
        }

        @Override
        public Long getResult(Long accumulator) {
            return accumulator;
        }

        @Override
        public Long merge(Long a, Long b) {
            return a + b;
        }
    }

    /**
     * Wraps the pre-aggregated count of one URL together with the end of its window.
     *
     * <p>Type parameters: input (the count), output, key (the URL), window type.
     */
    public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow> {

        @Override
        public void apply(String url, TimeWindow window, Iterable<Long> iterable,
                          Collector<PageViewCount> collector) throws Exception {
            // The iterable holds exactly the single value produced by PageCountAgg.
            collector.collect(new PageViewCount(url, window.getEnd(), iterable.iterator().next()));
        }
    }

    /**
     * Emits a formatted ranking of the most-viewed URLs for each window end.
     *
     * <p>Type parameters: key (window end), input, output. Counts are kept in a map keyed by URL,
     * so a late re-firing of a window overwrites the earlier count of that URL. The state of a
     * window is released by a cleanup timer once the allowed lateness has passed; there is no
     * {@code close()} hook because keyed state is only accessible while a key is in context.
     */
    public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount, String> {
        /** How long after the window end the state is kept; must equal the job's allowed lateness. */
        private static final long STATE_RETENTION_MS = 60 * 1000L;

        private final Integer topSize;
        /** URL -> latest count, for the window that is the current key. */
        private MapState<String, Long> mapState;

        /**
         * @param topSize maximum number of URLs listed per window
         */
        public TopNHotPages(Integer topSize) {
            this.topSize = topSize;
        }

        /** Obtains the keyed map state handle. */
        @Override
        public void open(Configuration parameters) throws Exception {
            mapState = getRuntimeContext().getMapState(
                    new MapStateDescriptor<String, Long>("map:url-count", String.class, Long.class));
        }

        /** Stores the count and registers the output timer and the cleanup timer for the window. */
        @Override
        public void processElement(PageViewCount page, Context context, Collector<String> collector)
                throws Exception {
            mapState.put(page.getUrl(), page.getCount());
            // Timers with the same timestamp for the same key are registered only once.
            context.timerService().registerEventTimeTimer(page.getWindowEnd());
            context.timerService().registerEventTimeTimer(page.getWindowEnd() + STATE_RETENTION_MS);
        }

        /** Clears the state when the cleanup timer fires; otherwise emits the current ranking. */
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // The key is the window end, so this identifies the cleanup timer: no further late
            // records can update this window after the allowed lateness.
            if (timestamp == ctx.getCurrentKey() + STATE_RETENTION_MS) {
                mapState.clear();
                return;
            }

            ArrayList<Map.Entry<String, Long>> entries = Lists.newArrayList(mapState.entries().iterator());

            // Highest count first. Long.compare compares by value and yields a consistent ordering.
            entries.sort((left, right) -> Long.compare(right.getValue(), left.getValue()));

            StringBuilder resultBuilder = new StringBuilder();
            resultBuilder.append("===================\n");
            resultBuilder.append("窗口结束时间:").append(new Timestamp(timestamp)).append("\n");

            int limit = Math.min(topSize, entries.size());
            for (int i = 0; i < limit; i++) {
                Map.Entry<String, Long> currentPageViewCount = entries.get(i);
                resultBuilder.append("Number").append(i + 1).append(":")
                        .append("网页地址:").append(currentPageViewCount.getKey())
                        .append("热门度:").append(currentPageViewCount.getValue())
                        .append("\n");
            }

            resultBuilder.append("===================\n\n");

            // Demo-only throttle so the console output stays readable; blocks this operator.
            Thread.sleep(1000L);
            out.collect(resultBuilder.toString());
        }
    }
}

窗口分析

1、前置知识
    watermark延迟为1s
    allowedLateness为1min
    窗口长度10min 5s滑动一次
    定时器触发时间为窗口结束时间

83.149.9.216 - - 17/05/2015:10:25:49 +0000 GET /presentations/
    watermark为 10:25:48
    不输出

83.149.9.216 - - 17/05/2015:10:25:50 +0000 GET /presentations/
    watermark为 10:25:49
    不输出

83.149.9.216 - - 17/05/2015:10:25:51 +0000 GET /presentations/
    watermark为 10:25:50  到了窗口[10:15:50,10:25:50)结束时间
    窗口[10:15:50,10:25:50)触发计算
    输出第1条数据

83.149.9.216 - - 17/05/2015:10:25:52 +0000 GET /presentations/
    watermark为 10:25:51
    不输出 

83.149.9.216 - - 17/05/2015:10:25:46 +0000 GET /presentations/
    watermark为 10:25:51 属于窗口[10:15:50,10:25:50)的延迟数据
    窗口[10:15:50,10:25:50)等待触发计算 触发时机:watermark更新
    不输出

83.149.9.216 - - 17/05/2015:10:25:53 +0000 GET /presentations/
    watermark为 10:25:52  watermark更新
    第5条延迟数据触发计算
    输出第1、5条数据

83.149.9.216 - - 17/05/2015:10:25:43 +0000 GET /presentations/
    watermark为 10:25:52 属于窗口[10:15:45,10:25:45)的延迟数据
    窗口[10:15:45,10:25:45)等待触发计算  触发时机:watermark更新
    不输出

83.149.9.216 - - 17/05/2015:10:25:47 +0000 GET /presentations/
    watermark为 10:25:52 属于窗口[10:15:50,10:25:50)的延迟数据
    窗口[10:15:50,10:25:50)等待触发计算   触发时机:watermark更新
    不输出

83.149.9.216 - - 17/05/2015:10:25:54 +0000 GET /presentations/
    watermark为 10:25:53  watermark更新
    第7、8条延迟数据触发计算
    [10:15:45,10:25:45)输出第7条数据
    [10:15:50,10:25:50)输出第1、5、7、8条数据(第7条数据10:25:43同样落在该窗口内)

83.149.9.216 - - 17/05/2015:10:25:55 +0000 GET /presentations/
    watermark为 10:25:54 
    不输出

83.149.9.216 - - 17/05/2015:10:14:55 +0000 GET /presentations/
    watermark为 10:25:54  属于窗口[10:14:55,10:24:55)的延迟数据
    窗口[10:14:55,10:24:55)等待触发计算  触发时机:watermark更新
    不输出
    
83.149.9.216 - - 17/05/2015:10:14:53 +0000 GET /presentations/
    watermark为 10:25:54  属于窗口[10:14:50,10:24:50)的延迟数据
    窗口结束时间与watermark差值超过allowedLateness规定的1min
    直接以侧道输出流输出,不参与逻辑计算

83.149.9.216 - - 17/05/2015:10:25:56 +0000 GET /presentations/
    watermark为 10:25:55  watermark更新
    第11条延迟数据触发计算  
    [10:14:55,10:24:55)输出第11条数据

    watermark为 10:25:55   到了窗口[10:15:55,10:25:55)结束时间
    窗口[10:15:55,10:25:55)触发计算
    输出第1、2、3、4、5、6、7、8、9条数据

83.149.9.216 - - 17/05/2015:10:14:55 +0000 GET /presentations/
    watermark为 10:25:55  属于窗口[10:14:55,10:24:55)延迟数据
    窗口结束时间与watermark差值超过allowedLateness规定的1min
    直接以侧道输出流输出,不参与逻辑计算

83.149.9.216 - - 17/05/2015:10:25:58 +0000 GET /presentations/
    watermark为 10:25:57 
    不输出

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

二百四十九先森

你的打赏是我努力的最大动力~

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值