import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class test {
/**
 * Builds and runs the "Page UV Top10" job.
 *
 * <p>Pipeline: an in-memory stream of user-behavior events is given event-time
 * timestamps, the unique visitors (UV) of each page are counted per 10-minute
 * tumbling event-time window, and for every window end the pages are ranked by
 * UV and the top 10 are printed.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if job execution fails
 */
public static void main(String[] args) throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // Simulated user-behavior stream: (user id, page id, visit timestamp).
    DataStream<UserBehavior> behaviors = env.fromElements(
            new UserBehavior("user1", "pageA", 1000L),
            new UserBehavior("user2", "pageA", 2000L),
            new UserBehavior("user1", "pageB", 3000L),
            new UserBehavior("user3", "pageA", 4000L),
            new UserBehavior("user2", "pageB", 5000L),
            new UserBehavior("user4", "pageC", 6000L),
            new UserBehavior("user3", "pageB", 7000L),
            new UserBehavior("user5", "pageC", 8000L)
    );

    // Assign timestamps and watermarks; the event time is the record's own timestamp.
    SerializableTimestampAssigner<UserBehavior> eventTimeOf =
            (element, recordTimestamp) -> element.getTimestamp();
    WatermarkStrategy<UserBehavior> watermarks = WatermarkStrategy
            .<UserBehavior>forMonotonousTimestamps()
            .withTimestampAssigner(eventTimeOf);
    DataStream<UserBehavior> timedStream = behaviors.assignTimestampsAndWatermarks(watermarks);

    // Compute the UV of every page per 10-minute tumbling window.
    DataStream<PageViewCount> uvPerPage = timedStream
            .keyBy(UserBehavior::getPageId)
            .window(TumblingEventTimeWindows.of(Time.minutes(10)))
            .aggregate(new UVAggregate(), new UVWindowResult());

    // Group the per-page counts by window end and rank them.
    DataStream<String> ranking = uvPerPage
            .keyBy(PageViewCount::getWindowEnd)
            .process(new TopNPages(10));

    // The sink identifier is what produces the "Top10 Pages> " prefix on the first output line.
    ranking.print("Top10 Pages");

    env.execute("Page UV Top10");
}
/**
 * UV aggregate function: collects the distinct user ids seen so far in a
 * {@link Set} accumulator and reports the size of that set as the result.
 */
public static class UVAggregate implements AggregateFunction<UserBehavior, Set<String>, Long> {

    /** Starts every aggregation with an empty set of user ids. */
    @Override
    public Set<String> createAccumulator() {
        return new HashSet<>();
    }

    /** Records the user id of {@code value}; duplicates are absorbed by the set. */
    @Override
    public Set<String> add(UserBehavior value, Set<String> accumulator) {
        final String userId = value.getUserId();
        accumulator.add(userId);
        return accumulator;
    }

    /** Returns the number of distinct user ids collected. */
    @Override
    public Long getResult(Set<String> accumulator) {
        final long distinctUsers = accumulator.size();
        return distinctUsers;
    }

    /** Folds every user id of {@code b} into {@code a} and returns {@code a}. */
    @Override
    public Set<String> merge(Set<String> a, Set<String> b) {
        for (String userId : b) {
            a.add(userId);
        }
        return a;
    }
}
/**
 * Window result function: wraps the UV count of one page and window into a
 * {@link PageViewCount} tagged with the window's end timestamp.
 */
public static class UVWindowResult extends ProcessWindowFunction<Long, PageViewCount, String, TimeWindow> {

    /**
     * Emits one {@link PageViewCount} for the given page.
     *
     * @param pageId   key of the window (the page id)
     * @param context  window context, used to read the window end
     * @param elements window contents; only the first element is read (in this
     *                 file it is the value produced by {@code UVAggregate})
     * @param out      collector receiving the result
     */
    @Override
    public void process(String pageId,
                        Context context,
                        Iterable<Long> elements,
                        Collector<PageViewCount> out) {
        final Long uniqueVisitors = elements.iterator().next();
        final long windowEnd = context.window().getEnd();
        PageViewCount pageViewCount = new PageViewCount(pageId, uniqueVisitors, windowEnd);
        out.collect(pageViewCount);
    }
}
/**
 * Top-N function: buffers the {@link PageViewCount} records of the current key
 * (in this file the key is the window end) in keyed list state, and when the
 * event-time timer fires emits the {@code topSize} pages with the highest count
 * as a single string, one {@code "pageId, count"} entry per line.
 *
 * <p>Pages with equal counts are ordered by page id so the ranking does not
 * depend on the order in which records arrived.
 */
public static class TopNPages extends KeyedProcessFunction<Long, PageViewCount, String> {

    /** Ranking order: highest count first, ties broken by page id (nulls last). */
    private static final Comparator<PageViewCount> RANKING_ORDER =
            Comparator.comparing(PageViewCount::getCount)
                    .reversed()
                    .thenComparing(
                            PageViewCount::getPageId,
                            Comparator.nullsLast(Comparator.<String>naturalOrder()));

    private final int topSize;
    private transient ListState<PageViewCount> pageViewState;

    /**
     * @param topSize maximum number of pages to emit per ranking
     * @throws IllegalArgumentException if {@code topSize} is negative
     */
    public TopNPages(int topSize) {
        // Fail fast here instead of letting subList() throw later inside onTimer().
        if (topSize < 0) {
            throw new IllegalArgumentException("topSize must be non-negative, got " + topSize);
        }
        this.topSize = topSize;
    }

    /** Initializes the list state that buffers the page counts. */
    @Override
    public void open(Configuration parameters) {
        ListStateDescriptor<PageViewCount> descriptor =
                new ListStateDescriptor<>("pageViewState", PageViewCount.class);
        pageViewState = getRuntimeContext().getListState(descriptor);
    }

    /**
     * Buffers the incoming page count and registers the timer that triggers the ranking.
     */
    @Override
    public void processElement(
            PageViewCount value,
            Context ctx,
            Collector<String> out
    ) throws Exception {
        // Add the UV record of this page to the state.
        pageViewState.add(value);
        // Register an event-time timer 100 past the record's window end; the
        // ranking is produced when that timer fires.
        ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 100);
    }

    /**
     * Drains the buffered page counts, ranks them and emits the top entries.
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        List<PageViewCount> allPageViews = new ArrayList<>();
        for (PageViewCount pageView : pageViewState.get()) {
            allPageViews.add(pageView);
        }
        // The buffered records are no longer needed once copied out.
        pageViewState.clear();

        allPageViews.sort(RANKING_ORDER);

        int resultSize = Math.min(topSize, allPageViews.size());
        List<PageViewCount> topPages = allPageViews.subList(0, resultSize);

        StringBuilder sb = new StringBuilder();
        for (PageViewCount page : topPages) {
            sb.append(page.getPageId())
                    .append(", ")
                    .append(page.getCount())
                    .append("\n");
        }
        out.collect(sb.toString());
    }
}
/**
 * Data structure: one user-behavior event (user id, page id, visit timestamp).
 *
 * <p>Has a public no-argument constructor plus a getter and setter for every
 * field so that Flink can treat it as a POJO type instead of falling back to
 * generic serialization.
 */
public static class UserBehavior {
    private String userId;
    private String pageId;
    private Long timestamp;

    /** No-argument constructor required for POJO-style instantiation. */
    public UserBehavior() {
    }

    /**
     * @param userId    id of the visiting user
     * @param pageId    id of the visited page
     * @param timestamp visit timestamp
     */
    public UserBehavior(String userId, String pageId, Long timestamp) {
        this.userId = userId;
        this.pageId = pageId;
        this.timestamp = timestamp;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPageId() {
        return pageId;
    }

    public void setPageId(String pageId) {
        this.pageId = pageId;
    }

    public Long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        return "UserBehavior{userId=" + userId
                + ", pageId=" + pageId
                + ", timestamp=" + timestamp + "}";
    }
}
/**
 * UV result of one page for one window: page id, count and the window end
 * timestamp.
 *
 * <p>Has a public no-argument constructor plus a getter and setter for every
 * field so that Flink can treat it as a POJO type (it is also stored in keyed
 * state) instead of falling back to generic serialization.
 */
public static class PageViewCount {
    private String pageId;
    private Long count;
    private Long windowEnd;

    /** No-argument constructor required for POJO-style instantiation. */
    public PageViewCount() {
    }

    /**
     * @param pageId    id of the page
     * @param count     count computed for the page
     * @param windowEnd end timestamp of the window the count belongs to
     */
    public PageViewCount(String pageId, Long count, Long windowEnd) {
        this.pageId = pageId;
        this.count = count;
        this.windowEnd = windowEnd;
    }

    public String getPageId() {
        return pageId;
    }

    public void setPageId(String pageId) {
        this.pageId = pageId;
    }

    public Long getCount() {
        return count;
    }

    public void setCount(Long count) {
        this.count = count;
    }

    public Long getWindowEnd() {
        return windowEnd;
    }

    public void setWindowEnd(Long windowEnd) {
        this.windowEnd = windowEnd;
    }

    @Override
    public String toString() {
        return "PageViewCount{pageId=" + pageId
                + ", count=" + count
                + ", windowEnd=" + windowEnd + "}";
    }
}
}
这段代码的最终输出结果为:
Top10 Pages> pageA, 3
pageB, 3
pageC, 2
如何修改代码,使最终输出结果为:
pageA, 3
pageB, 3
pageC, 2
最新发布