问题:用户访问某网站页面,每次打开页面时都会记录一条(userid, pageurl, timestamp)信息,请设计一套流式计算(Flink or Spark Streaming),实现以下功能:(考察点:考虑页面pv数据倾斜问题)
最近1小时内实时的页面uv量top10统计,输出结果如下:
page2, 200000
page10, 1000
page15, 500
page7, 300
代码如下:
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.util.*;
public class test {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(4); // 提高并行度
// 模拟数据源
DataStream<UserBehavior> dataStream = env.fromElements(
new UserBehavior("user1", "pageA", 1000L),
new UserBehavior("user2", "pageA", 2000L),
new UserBehavior("user1", "pageB", 3000L),
new UserBehavior("user3", "pageA", 4000L),
new UserBehavior("user2", "pageB", 5000L),
new UserBehavior("user4", "pageC", 6000L),
new UserBehavior("user3", "pageB", 7000L),
new UserBehavior("user5", "pageC", 8000L)
);
// 修复1:明确时间戳分配器类型
DataStream<UserBehavior> timedStream = dataStream
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forMonotonousTimestamps()
.withTimestampAssigner(
(SerializableTimestampAssigner<UserBehavior>) (element, recordTimestamp) -> element.timestamp
)
);
// 阶段1:分桶聚合(解决倾斜)
DataStream<PageBucketCount> bucketStream = timedStream
.map(new BucketMapper(10)) // 10个桶
.keyBy(value -> Tuple2.of(value.pageId, value.bucketId)) // 修复2:使用Tuple2明确类型
.window(TumblingEventTimeWindows.of(Time.minutes(10)))
.aggregate(new BucketAggregate(), new BucketWindowResult());
// 阶段2:合并分桶结果
DataStream<PageViewCount> uvStream = bucketStream
.keyBy(PageBucketCount::getPageId)
.window(TumblingEventTimeWindows.of(Time.minutes(10)))
.reduce((value1, value2) ->
new PageBucketCount(
value1.pageId,
-1, // 桶ID不再需要
value1.count + value2.count,
Math.max(value1.windowEnd, value2.windowEnd)
)
)
.map(count -> new PageViewCount(count.pageId, count.count, count.windowEnd));
// TopN处理(优化状态管理)
DataStream<String> result = uvStream
.keyBy(PageViewCount::getWindowEnd)
.process(new TopNPages(10));
result.print("Top10 Pages");
env.execute("Optimized Page UV TopN");
}
// ===== 分桶映射(核心倾斜解决方案) =====
public static class BucketMapper implements org.apache.flink.api.common.functions.MapFunction<UserBehavior, PageBucket> {
private final int bucketCount;
private final Random random = new Random();
public BucketMapper(int bucketCount) {
this.bucketCount = bucketCount;
}
@Override
public PageBucket map(UserBehavior value) {
int bucketId = random.nextInt(bucketCount);
return new PageBucket(value.userId, value.pageId, bucketId, value.timestamp);
}
}
// ===== 分桶聚合函数 =====
public static class BucketAggregate implements AggregateFunction<PageBucket, Set<String>, Long> {
@Override
public Set<String> createAccumulator() {
return new HashSet<>();
}
@Override
public Set<String> add(PageBucket value, Set<String> accumulator) {
accumulator.add(value.userId);
return accumulator;
}
@Override
public Long getResult(Set<String> accumulator) {
return (long) accumulator.size();
}
@Override
public Set<String> merge(Set<String> a, Set<String> b) {
a.addAll(b);
return a;
}
}
// ===== 分桶窗口结果 =====
public static class BucketWindowResult extends ProcessWindowFunction<Long, PageBucketCount, Tuple2<String, Integer>, TimeWindow> {
@Override
public void process(Tuple2<String, Integer> key,
Context context,
Iterable<Long> elements,
Collector<PageBucketCount> out) {
String pageId = key.f0;
int bucketId = key.f1;
Long count = elements.iterator().next();
out.collect(new PageBucketCount(pageId, bucketId, count, context.window().getEnd()));
}
}
// ===== TopN处理(优化状态) =====
public static class TopNPages extends KeyedProcessFunction<Long, PageViewCount, String> {
private final int topSize;
private transient MapState<String, Long> pageViewState;
private transient ValueState<Long> triggerTimeState;
public TopNPages(int topSize) {
this.topSize = topSize;
}
@Override
public void open(Configuration parameters) {
// 修复3:使用MapState优化状态访问
MapStateDescriptor<String, Long> stateDescriptor =
new MapStateDescriptor<>("pageViewState", String.class, Long.class);
pageViewState = getRuntimeContext().getMapState(stateDescriptor);
ValueStateDescriptor<Long> timeDescriptor =
new ValueStateDescriptor<>("triggerTime", Long.class);
triggerTimeState = getRuntimeContext().getState(timeDescriptor);
}
@Override
public void processElement(PageViewCount value, Context ctx, Collector<String> out) throws Exception {
pageViewState.put(value.pageId, value.count);
// 注册延迟触发定时器(避免重复注册)
Long windowEnd = value.windowEnd;
if (triggerTimeState.value() == null) {
long triggerTime = windowEnd + 100;
ctx.timerService().registerEventTimeTimer(triggerTime);
triggerTimeState.update(triggerTime);
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
// 获取并排序TopN
List<Map.Entry<String, Long>> allEntries = new ArrayList<>();
for (Iterator<Map.Entry<String, Long>> it = pageViewState.iterator(); it.hasNext(); ) {
allEntries.add(it.next());
}
allEntries.sort((e1, e2) -> Long.compare(e2.getValue(), e1.getValue()));
StringBuilder result = new StringBuilder("===== Top10 Pages (");
result.append("窗口结束: ").append(timestamp - 100).append(") =====\n");
int count = Math.min(topSize, allEntries.size());
for (int i = 0; i < count; i++) {
Map.Entry<String, Long> entry = allEntries.get(i);
result.append(entry.getKey()).append(", ").append(entry.getValue()).append("\n");
}
// 清理状态
pageViewState.clear();
triggerTimeState.clear();
out.collect(result.toString());
}
}
// ===== 数据结构 =====
public static class UserBehavior {
public String userId;
public String pageId;
public Long timestamp;
public UserBehavior(String userId, String pageId, Long timestamp) {
this.userId = userId;
this.pageId = pageId;
this.timestamp = timestamp;
}
}
public static class PageBucket {
public String userId;
public String pageId;
public int bucketId;
public Long timestamp;
public PageBucket(String userId, String pageId, int bucketId, Long timestamp) {
this.userId = userId;
this.pageId = pageId;
this.bucketId = bucketId;
this.timestamp = timestamp;
}
}
public static class PageBucketCount {
public String pageId;
public int bucketId;
public Long count;
public Long windowEnd;
public PageBucketCount(String pageId, int bucketId, Long count, Long windowEnd) {
this.pageId = pageId;
this.bucketId = bucketId;
this.count = count;
this.windowEnd = windowEnd;
}
public String getPageId() {
return pageId;
}
}
public static class PageViewCount {
public String pageId;
public Long count;
public Long windowEnd;
public PageViewCount(String pageId, Long count, Long windowEnd) {
this.pageId = pageId;
this.count = count;
this.windowEnd = windowEnd;
}
public Long getWindowEnd() {
return windowEnd;
}
}
}
报错如下:
Exception in thread "main" org.apache.flink.api.common.functions.InvalidTypesException: The generic type parameters of 'Tuple2' are missing. In many cases lambda methods don't provide enough information for automatic type extraction when Java generics are involved. An easy workaround is to use an (anonymous) class instead that implements the 'org.apache.flink.api.java.functions.KeySelector' interface. Otherwise the type has to be specified explicitly using type information.
请给我修改后可以正常运行的代码
最新发布