问题:用户访问某网站页面,每次打开页面时都会记录一条(userid, pageurl, timestamp)信息,请设计一套流式计算(Flink),实现以下功能:(考察点:考虑页面pv数据倾斜问题)
最近1小时内实时的页面uv量top10统计,输出结果如下:
page2, 200000
page10, 1000
page15, 500
page7, 300
代码如下:
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class test {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 模拟用户行为数据流 (用户ID, 页面ID, 访问时间戳)
DataStream<UserBehavior> dataStream = env.fromElements(
new UserBehavior("user1", "pageA", 1000L),
new UserBehavior("user2", "pageA", 2000L),
new UserBehavior("user1", "pageB", 3000L),
new UserBehavior("user3", "pageA", 4000L),
new UserBehavior("user2", "pageB", 5000L),
new UserBehavior("user4", "pageC", 6000L),
new UserBehavior("user3", "pageB", 7000L),
new UserBehavior("user5", "pageC", 8000L)
);
// 分配时间戳和水位线
DataStream<UserBehavior> timedStream = dataStream
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forMonotonousTimestamps()
.withTimestampAssigner(
(SerializableTimestampAssigner<UserBehavior>) (element, recordTimestamp) -> element.timestamp
)
);
// 计算每个页面的UV
DataStream<PageViewCount> uvStream = timedStream
.keyBy(UserBehavior::getPageId)
.window(TumblingEventTimeWindows.of(Time.minutes(10)))
.aggregate(new UVAggregate(), new UVWindowResult());
DataStream<String> result = uvStream
.keyBy(PageViewCount::getWindowEnd)
.process(new TopNPages(10));
result.print("Top10 Pages");
env.execute("Page UV Top10");
}
// UV统计聚合函数
public static class UVAggregate implements AggregateFunction<UserBehavior, Set<String>, Long> {
@Override
public Set<String> createAccumulator() {
return new HashSet<>();
}
@Override
public Set<String> add(UserBehavior value, Set<String> accumulator) {
accumulator.add(value.getUserId());
return accumulator;
}
@Override
public Long getResult(Set<String> accumulator) {
return (long) accumulator.size();
}
@Override
public Set<String> merge(Set<String> a, Set<String> b) {
a.addAll(b);
return a;
}
}
// 窗口结果处理函数
public static class UVWindowResult extends ProcessWindowFunction<Long, PageViewCount, String, TimeWindow> {
@Override
public void process(String pageId,
Context context,
Iterable<Long> elements,
Collector<PageViewCount> out) {
Long uv = elements.iterator().next();
out.collect(new PageViewCount(pageId, uv, context.window().getEnd()));
}
}
// TopN处理函数
public static class TopNPages extends KeyedProcessFunction<Long, PageViewCount, String> {
private final int topSize;
private transient ListState<PageViewCount> pageViewState;
public TopNPages(int topSize) {
this.topSize = topSize;
}
@Override
public void open(Configuration parameters) {
// 初始化状态存储
ListStateDescriptor<PageViewCount> descriptor =
new ListStateDescriptor<>("pageViewState", PageViewCount.class);
pageViewState = getRuntimeContext().getListState(descriptor);
}
@Override
public void processElement(
PageViewCount value,
Context ctx,
Collector<String> out
) throws Exception {
// 将每个页面UV数据添加到状态
pageViewState.add(value);
// 注册定时器,在窗口结束时触发排序
ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 100);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
List<PageViewCount> allPageViews = new ArrayList<>();
for (PageViewCount pageView : pageViewState.get()) {
allPageViews.add(pageView);
}
pageViewState.clear();
allPageViews.sort(Comparator.comparing(PageViewCount::getCount).reversed());
int resultSize = Math.min(topSize, allPageViews.size());
List<PageViewCount> topPages = allPageViews.subList(0, resultSize);
StringBuilder sb = new StringBuilder();
for (PageViewCount page : topPages) {
sb.append(page.getPageId())
.append(", ")
.append(page.getCount())
.append("\n");
}
out.collect(sb.toString());
}
}
// 数据结构定义
public static class UserBehavior {
private String userId;
private String pageId;
private Long timestamp;
public UserBehavior() {
}
public UserBehavior(String userId, String pageId, Long timestamp) {
this.userId = userId;
this.pageId = pageId;
this.timestamp = timestamp;
}
public String getUserId() {
return userId;
}
public String getPageId() {
return pageId;
}
public Long getTimestamp() {
return timestamp;
}
}
public static class PageViewCount {
private String pageId;
private Long count;
private Long windowEnd;
public PageViewCount() {
}
public PageViewCount(String pageId, Long count, Long windowEnd) {
this.pageId = pageId;
this.count = count;
this.windowEnd = windowEnd;
}
public String getPageId() {
return pageId;
}
public Long getCount() {
return count;
}
public Long getWindowEnd() {
return windowEnd;
}
}
}
这段代码,是不是没有考虑页面pv数据倾斜问题,应该改成什么样的?请将完整代码返回我
最新发布