Apache Flink Java 示例:实时流量统计(窗口聚合)
本文将详细讲解如何使用 Apache Flink 实现实时流量统计系统,重点演示窗口聚合的应用。该示例将统计网站用户的访问数据,计算各种维度的实时指标。
应用场景说明
我们创建一个实时流量统计系统,用于分析:
- 用户级别:用户每分钟的访问量(PV)
- 页面级别:各页面最近10秒的访问量(滑动窗口,每5秒更新一次)
- 系统级别:网站每小时的UV统计(独立访客)
完整实现代码
1. 数据模型定义
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.functions.*;
import org.apache.flink.api.java.tuple.*;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.*;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * A single user page-view event.
 *
 * <p>The class follows Flink's POJO rules (public class, public no-argument
 * constructor, every field reachable through a getter and a setter) so that
 * Flink can use its POJO serializer instead of falling back to generic
 * serialization.
 */
public class UserBehaviorEvent {
    /** User ID. */
    private String userId;
    /** Visited page. */
    private String page;
    /** Client IP; stays {@code null} unless set through {@link #setIp(String)}. */
    private String ip;
    /** Browser user agent; stays {@code null} unless set through {@link #setUserAgent(String)}. */
    private String userAgent;
    /** Event time in epoch milliseconds. */
    private long timestamp;

    /** No-argument constructor required for POJO-style (de)serialization. */
    public UserBehaviorEvent() {
    }

    /**
     * Creates an event with the fields used by the aggregations.
     *
     * @param userId    user ID
     * @param page      visited page
     * @param timestamp event time in epoch milliseconds
     */
    public UserBehaviorEvent(String userId, String page, long timestamp) {
        this.userId = userId;
        this.page = page;
        this.timestamp = timestamp;
    }

    public long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPage() {
        return page;
    }

    public void setPage(String page) {
        this.page = page;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getUserAgent() {
        return userAgent;
    }

    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }
}
2. 核心处理逻辑与窗口聚合
public class RealTimeTrafficAnalysis {
/**
 * Builds and runs the real-time traffic job.
 *
 * <p>Three event-time window aggregations are derived from one timestamped stream:
 * <ul>
 *   <li>per-user PV in 1-minute tumbling windows, emitted as (userId, windowEnd, count);</li>
 *   <li>per-page PV in 10-second windows sliding every 5 seconds, emitted as
 *       (page, windowEnd, count);</li>
 *   <li>site-wide UV in 1-hour tumbling windows, emitted as (windowEnd, distinctUsers).</li>
 * </ul>
 *
 * @param args command-line arguments (unused)
 * @throws Exception if job execution fails
 */
public static void main(String[] args) throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // 1. Simulated source (use Kafka/Pulsar in production).
    DataStream<UserBehaviorEvent> eventStream = env.addSource(new UserBehaviorSource())
            .name("user-behavior-source")
            .uid("user-behavior-source");

    // 2. Assign event-time timestamps and watermarks, tolerating 5 s of out-of-orderness.
    DataStream<UserBehaviorEvent> timestampedStream = eventStream
            .assignTimestampsAndWatermarks(
                    WatermarkStrategy.<UserBehaviorEvent>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                            .withTimestampAssigner((SerializableTimestampAssigner<UserBehaviorEvent>)
                                    (event, recordTimestamp) -> event.getTimestamp()))
            .name("assign-timestamps")
            .uid("assign-timestamps");

    // 3. Per-user PV per minute (tumbling window).
    // The count is pre-aggregated incrementally; the window function then attaches the
    // key and the window end. The stream is no longer keyed after the window operator,
    // so key and window metadata must be captured here rather than in a later map().
    DataStream<Tuple3<String, Long, Integer>> userPvStream = timestampedStream
            .keyBy(UserBehaviorEvent::getUserId)
            .window(TumblingEventTimeWindows.of(Time.minutes(1)))
            .aggregate(new EventCounter(), new KeyedCountEmitter())
            .name("user-pv-aggregation")
            .uid("user-pv-aggregation");

    // 4. Per-page PV: every 5 s, count the events of the last 10 s (sliding window).
    DataStream<Tuple3<String, Long, Integer>> pagePvStream = timestampedStream
            .keyBy(UserBehaviorEvent::getPage)
            .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
            .aggregate(new EventCounter(), new KeyedCountEmitter())
            .name("page-pv-aggregation")
            .uid("page-pv-aggregation");

    // 5. Site-wide UV per hour (non-keyed tumbling window).
    // The result carries the window end instead of the wall-clock time at emission,
    // so it stays stable across replays.
    DataStream<Tuple2<Long, Integer>> siteUvStream = timestampedStream
            .windowAll(TumblingEventTimeWindows.of(Time.hours(1)))
            .aggregate(new DistinctUserCounter(), new WindowEndEmitter())
            .name("site-uv-aggregation")
            .uid("site-uv-aggregation");

    // 6. Emit results (write to Kafka or a database in production).
    // No null filtering is needed: the window functions never emit null records.
    userPvStream.print("User-PV");
    pagePvStream.print("Page-PV-Top");
    siteUvStream.print("Site-UV");

    env.execute("Real-time Traffic Analysis");
}

/** Incrementally counts the events of a window; the accumulator is the running count. */
private static final class EventCounter
        implements AggregateFunction<UserBehaviorEvent, Integer, Integer> {
    private static final long serialVersionUID = 1L;

    @Override
    public Integer createAccumulator() {
        return 0;
    }

    @Override
    public Integer add(UserBehaviorEvent value, Integer accumulator) {
        return accumulator + 1; // each event contributes one page view
    }

    @Override
    public Integer getResult(Integer accumulator) {
        return accumulator;
    }

    @Override
    public Integer merge(Integer a, Integer b) {
        return a + b;
    }
}

/** Wraps a pre-aggregated count into (key, windowEnd, count). */
private static final class KeyedCountEmitter
        extends ProcessWindowFunction<Integer, Tuple3<String, Long, Integer>, String, TimeWindow> {
    private static final long serialVersionUID = 1L;

    @Override
    public void process(String key, Context context, Iterable<Integer> counts,
            Collector<Tuple3<String, Long, Integer>> out) {
        // With incremental aggregation the iterable holds exactly one element: the final count.
        Integer count = counts.iterator().next();
        out.collect(Tuple3.of(key, context.window().getEnd(), count));
    }
}

/** Collects distinct user IDs in a set and reports the set size as the UV. */
private static final class DistinctUserCounter
        implements AggregateFunction<UserBehaviorEvent, Set<String>, Integer> {
    private static final long serialVersionUID = 1L;

    @Override
    public Set<String> createAccumulator() {
        return new HashSet<>();
    }

    @Override
    public Set<String> add(UserBehaviorEvent value, Set<String> accumulator) {
        accumulator.add(value.getUserId());
        return accumulator;
    }

    @Override
    public Integer getResult(Set<String> accumulator) {
        return accumulator.size();
    }

    @Override
    public Set<String> merge(Set<String> a, Set<String> b) {
        a.addAll(b);
        return a;
    }
}

/** Wraps a pre-aggregated UV value into (windowEnd, uv). */
private static final class WindowEndEmitter
        extends ProcessAllWindowFunction<Integer, Tuple2<Long, Integer>, TimeWindow> {
    private static final long serialVersionUID = 1L;

    @Override
    public void process(Context context, Iterable<Integer> counts,
            Collector<Tuple2<Long, Integer>> out) {
        // With incremental aggregation the iterable holds exactly one element: the final UV.
        Integer uv = counts.iterator().next();
        out.collect(Tuple2.of(context.window().getEnd(), uv));
    }
}
/**
 * Demo source that emits randomly generated {@link UserBehaviorEvent}s until cancelled.
 *
 * <p>Each event gets a random user and page from fixed lists and an event time up to
 * 5 seconds in the past, which produces out-of-order timestamps.
 */
private static class UserBehaviorSource implements SourceFunction<UserBehaviorEvent> {
    private volatile boolean running = true;
    private final Random random = new Random();
    private final List<String> userIds = Arrays.asList("u1001", "u1002", "u1003", "u1004", "u1005");
    private final List<String> pages = Arrays.asList("/home", "/products", "/cart", "/checkout", "/profile");

    @Override
    public void run(SourceContext<UserBehaviorEvent> ctx) throws Exception {
        for (; running; Thread.sleep(random.nextInt(500))) { // pause 0-499 ms between events
            ctx.collect(nextEvent());
        }
    }

    @Override
    public void cancel() {
        running = false;
    }

    /** Draws user, page and time offset (in that order) and builds the event. */
    private UserBehaviorEvent nextEvent() {
        final String chosenUser = pickFrom(userIds);
        final String chosenPage = pickFrom(pages);
        // Shift the timestamp back by 0-4999 ms to simulate out-of-order arrival.
        final long now = System.currentTimeMillis();
        final long shiftedTime = now - random.nextInt(5000);
        return new UserBehaviorEvent(chosenUser, chosenPage, shiftedTime);
    }

    /** Returns a uniformly chosen element of the given list. */
    private String pickFrom(List<String> candidates) {
        return candidates.get(random.nextInt(candidates.size()));
    }
}
}
3. 核心组件详解
A. 时间语义与水印
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehaviorEvent>forBoundedOutOfOrderness(Duration.ofSeconds(5))
.withTimestampAssigner((event, recordTimestamp) -> event.getTimestamp())
)
- 事件时间:使用事件自身的时间戳(而不是系统处理时间)
- 水印机制:允许5秒数据乱序,处理延迟到达的数据
- 触发条件:当水印超过窗口结束时间时触发计算
B. 窗口类型
-
滚动窗口 (Tumbling Window)
.window(TumblingEventTimeWindows.of(Time.minutes(1)))
- 固定大小、无重叠的窗口
- 如:每分钟用户PV统计
-
滑动窗口 (Sliding Window)
.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
- 固定大小、有重叠的窗口
- 如:每5秒计算过去10秒的页面访问量
-
非键控窗口 (windowAll / Non-Keyed Window)
.windowAll(TumblingEventTimeWindows.of(Time.hours(1)))
- 适用于无分组的全局统计;注意它不同于 GlobalWindows 分配器,且 windowAll 算子只能以并行度 1 运行
- 如:整站小时级UV统计
C. 聚合函数
.aggregate(new AggregateFunction<UserBehaviorEvent, Set<String>, Integer>() {
@Override
public Set<String> createAccumulator() {
return new HashSet<>(); // 初始化累加器
}
@Override
public Set<String> add(UserBehaviorEvent value, Set<String> accumulator) {
accumulator.add(value.getUserId()); // 增量更新
return accumulator;
}
@Override
public Integer getResult(Set<String> accumulator) {
return accumulator.size(); // 返回最终结果
}
// ... merge方法用于会话窗口
})
窗口聚合可视化说明
窗口触发时机示例
事件时间线(单位:秒):
0 10 20 30 40 50 60 70 80 90 100
|----|----|----|----|----|----|----|----|----|----|
1分钟滚动窗口:
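[0-60) --- 水印到达60时触发(水印 = 最大事件时间 - 5秒,即已观察到事件时间≥65秒的事件)→ 输出结果
[60-120) -- 水印到达120时触发(已观察到事件时间≥125秒的事件)
10秒滑动窗口(5秒滑动):
[0-10) -- 水印到达10时触发(已观察到事件时间≥15秒的事件)
[5-15) -- 水印到达15时触发(已观察到事件时间≥20秒的事件)
[10-20) -- 水印到达20时触发(已观察到事件时间≥25秒的事件)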
生产环境优化建议
-
状态后端配置
env.setStateBackend(new EmbeddedRocksDBStateBackend()); env.enableCheckpointing(60000); // 1分钟checkpoint
-
数据源/汇对接
// Kafka源 Properties props = new Properties(); props.setProperty("bootstrap.servers", "kafka-broker:9092"); DataStream<UserBehaviorEvent> kafkaSource = env .addSource(new FlinkKafkaConsumer<>("user_events", new JsonDeserialization(), props)); // JDBC输出 siteUvStream.addSink(JdbcSink.sink( "INSERT INTO site_uv(window_end, uv) VALUES (?, ?)", (statement, t) -> { statement.setTimestamp(1, new Timestamp(t.f0)); statement.setInt(2, t.f1); }, JdbcExecutionOptions.builder().build(), new JdbcConnectionOptions.JdbcConnectionOptionsBuilder() .withUrl("jdbc:mysql://db-host:3306/analytics") .withDriverName("com.mysql.cj.jdbc.Driver") .withUsername("flink") .withPassword("password") .build() ));
-
性能优化
- 计数/求和等简单聚合可使用 reduce 代替 aggregate;两者都是增量聚合,可避免缓存并遍历窗口内的全部元素
- 对于UV统计使用 HyperLogLog 算法减少内存占用
- 开启对象重用减少GC压力:env.getConfig().enableObjectReuse()
完整应用流程图
通过此示例,您可以掌握Flink窗口聚合的核心概念和技术细节,并应用于各种实时分析场景。