Author: chen_h
WeChat & QQ: 862251340
WeChat official account: coderpai
Flink Learning (3): Data Transformations
Stateless Transformations
map()
In the first exercise, we filtered a stream of taxi ride events. That same codebase contains a GeoUtils class that provides a static method GeoUtils.mapToGridCell(float lon, float lat), which maps a location (longitude, latitude) to a grid cell covering an area of roughly 100x100 meters.
Now let's enrich our stream of taxi ride objects by adding startCell and endCell fields to each event. We can create an EnrichedRide object that extends TaxiRide and adds these fields:
public static class EnrichedRide extends TaxiRide {
    public int startCell;
    public int endCell;

    public EnrichedRide() {}

    public EnrichedRide(TaxiRide ride) {
        this.rideId = ride.rideId;
        this.isStart = ride.isStart;
        this.startTime = ride.startTime;
        this.endTime = ride.endTime;
        this.startCell = GeoUtils.mapToGridCell(ride.startLon, ride.startLat);
        this.endCell = GeoUtils.mapToGridCell(ride.endLon, ride.endLat);
    }

    @Override
    public String toString() {
        return super.toString() + "," +
                Integer.toString(this.startCell) + "," +
                Integer.toString(this.endCell) + "," + "hello Map";
    }
}
With that in place, we can build an application that performs the transformation:
DataStream<TaxiRide> rides = env.addSource(new TaxiRideSource(...));

DataStream<EnrichedRide> enrichedNYCRides = rides
        .filter(new NYCFilter())
        .map(new Enrichment());

enrichedNYCRides.print();
Here, the MapFunction is implemented as follows:
public static class Enrichment implements MapFunction<TaxiRide, EnrichedRide> {
    @Override
    public EnrichedRide map(TaxiRide taxiRide) throws Exception {
        return new EnrichedRide(taxiRide);
    }
}
The complete code is as follows:
/*
* Copyright 2015 data Artisans GmbH, 2019 Ververica GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ververica.flinktraining.exercises.datastream_java.basics;
import com.ververica.flinktraining.exercises.datastream_java.sources.TaxiRideSource;
import com.ververica.flinktraining.exercises.datastream_java.datatypes.TaxiRide;
import com.ververica.flinktraining.exercises.datastream_java.utils.ExerciseBase;
import com.ververica.flinktraining.exercises.datastream_java.utils.GeoUtils;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* The "Ride Cleansing" exercise from the Flink training
* (http://training.ververica.com).
* The task of the exercise is to filter a data stream of taxi ride records to keep only rides that
* start and end within New York City. The resulting stream should be printed.
*
* Parameters:
* -input path-to-input-file
*
*/
public class Transfor extends ExerciseBase {
    public static void main(String[] args) throws Exception {
        ParameterTool params = ParameterTool.fromArgs(args);
        final String input = params.get("input", ExerciseBase.pathToRideData);

        final int maxEventDelay = 60;       // events are out of order by max 60 seconds
        final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

        // set up streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(ExerciseBase.parallelism);

        // start the data generator
        DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)));

        DataStream<EnrichedRide> enrichedNYCRides = rides
                // filter out rides that do not start or stop in NYC
                .filter(new NYCFilter())
                .map(new Enrichment());

        // print the filtered stream
        enrichedNYCRides.print();

        // run the cleansing pipeline
        env.execute("Taxi Ride Cleansing");
    }

    private static class NYCFilter implements FilterFunction<TaxiRide> {
        @Override
        public boolean filter(TaxiRide taxiRide) throws Exception {
            return GeoUtils.isInNYC(taxiRide.startLon, taxiRide.startLat)
                    && GeoUtils.isInNYC(taxiRide.endLon, taxiRide.endLat);
        }
    }

    public static class Enrichment implements MapFunction<TaxiRide, EnrichedRide> {
        @Override
        public EnrichedRide map(TaxiRide taxiRide) throws Exception {
            return new EnrichedRide(taxiRide);
        }
    }

    public static class EnrichedRide extends TaxiRide {
        public int startCell;
        public int endCell;

        public EnrichedRide() {}

        public EnrichedRide(TaxiRide ride) {
            this.rideId = ride.rideId;
            this.isStart = ride.isStart;
            this.startTime = ride.startTime;
            this.endTime = ride.endTime;
            this.startCell = GeoUtils.mapToGridCell(ride.startLon, ride.startLat);
            this.endCell = GeoUtils.mapToGridCell(ride.endLon, ride.endLat);
        }

        @Override
        public String toString() {
            return super.toString() + "," +
                    Integer.toString(this.startCell) + "," +
                    Integer.toString(this.endCell) + "," + "hello Map";
        }
    }
}
flatMap()
A MapFunction is suitable only for one-to-one transformations: for each stream element that comes in, map() emits exactly one transformed element. If you want to emit more than one element (or none at all), use flatMap() instead.
DataStream<TaxiRide> rides = env.addSource(new TaxiRideSource(...));

DataStream<EnrichedRide> enrichedNYCRides = rides
        .flatMap(new NYCEnrichment());

enrichedNYCRides.print();
The FlatMapFunction is implemented as follows:
public static class NYCEnrichment implements FlatMapFunction<TaxiRide, EnrichedRide> {
    @Override
    public void flatMap(TaxiRide taxiRide, Collector<EnrichedRide> out) throws Exception {
        FilterFunction<TaxiRide> valid = new NYCFilter();
        if (valid.filter(taxiRide)) {
            out.collect(new EnrichedRide(taxiRide));
        }
    }
}
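Note that this particular flatMap emits either zero or one element per input, so it effectively fuses the earlier filter and map. The genuinely one-to-many case is the string split in WordCount. A minimal sketch of that pattern (not part of this exercise; it reuses the FlatMapFunction and Collector imports from above):
public static class Tokenizer implements FlatMapFunction<String, String> {
    @Override
    public void flatMap(String line, Collector<String> out) {
        // emit one element per word: a one-to-many transformation
        for (String word : line.split("\\s+")) {
            if (!word.isEmpty()) {
                out.collect(word);
            }
        }
    }
}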
Keyed Streams
keyBy: It is often useful to be able to partition a stream around one of its attributes, so that all events sharing the same value of that attribute are grouped together.
For example, suppose we want to find the longest taxi rides starting in each of the grid cells.
In SQL terms, that means grouping by the start cell and, within each group, sorting by duration DESC and taking LIMIT 1.
rides
    .flatMap(new NYCEnrichment())
    .keyBy("startCell")
Every keyBy causes a network shuffle that repartitions the stream. In general this is expensive, since it involves network communication together with serialization and deserialization.
In the example above, the key has been specified by its field name, "startCell". This style of key selection has the drawback that the compiler cannot infer the type of the field used for keying, so Flink will pass around the key values as Tuples, which can be awkward. It is usually better to use a properly typed KeySelector, e.g.:
rides
    .flatMap(new NYCEnrichment())
    .keyBy(
        new KeySelector<EnrichedRide, Integer>() {
            @Override
            public Integer getKey(EnrichedRide ride) throws Exception {
                return ride.startCell;
            }
        })
(Note that the key type must be the boxed Integer rather than the primitive int, which Java generics do not allow.)
This can be expressed more succinctly with a lambda:
rides
    .flatMap(new NYCEnrichment())
    .keyBy(ride -> ride.startCell)
Aggregations on Keyed Streams
This code snippet creates a new stream of tuples containing the startCell and the duration (in minutes) of each taxi ride END event:
// Tuple2 fields: Integer -> startCell, Minutes -> duration
DataStream<Tuple2<Integer, Minutes>> minutesByStartCell = enrichedNYCRides
    .flatMap(new FlatMapFunction<EnrichedRide, Tuple2<Integer, Minutes>>() {
        @Override
        public void flatMap(EnrichedRide ride,
                            Collector<Tuple2<Integer, Minutes>> out) throws Exception {
            // only react to taxi ride END events
            if (!ride.isStart) {
                // compute the interval from the start to the end of the ride
                Interval rideInterval = new Interval(ride.startTime.toDate().getTime(), ride.endTime.toDate().getTime());
                Minutes duration = rideInterval.toDuration().toStandardMinutes();
                out.collect(new Tuple2<>(ride.startCell, duration));
            }
        }
    });
minutesByStartCell
    .keyBy(0)   // startCell (keyed by field position)
    .maxBy(1)   // duration
    .print();
The output looks like this (the n> prefix identifies the parallel subtask that printed the record; since the aggregation is not windowed, maxBy emits an updated running maximum for a key whenever a new record for that key arrives):
...
3> (55032,11)
3> (55041,9)
3> (37568,7)
3> (49543,12)
3> (42817,9)
2> (55291,4)
2> (56029,6)
2> (57529,0)
2> (53295,10)
2> (44572,4)
2> (54286,7)
2> (37819,4)
2> (54545,8)
2> (48540,7)
2> (40077,4)
2> (38324,2)
1> (55048,1)
4> (49548,10)
2> (47061,8)
...
State
This keyBy is our first encounter with stateful stream processing. Although the state handling is transparent, Flink has to keep track of the maximum duration for each distinct key. Whenever state gets involved in a Flink application, you should think about how large that state might become. If the key space is unbounded, then the state Flink needs is unbounded as well. When working with streams, it generally makes more sense to think in terms of aggregations over finite windows rather than over the entire stream.
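If the unbounded key space is a concern, one option is to aggregate over time windows instead. A minimal, hypothetical sketch (not part of the exercise code; the one-hour window size is an arbitrary choice, and it requires importing org.apache.flink.streaming.api.windowing.time.Time):
minutesByStartCell
    .keyBy(0)                       // startCell
    .timeWindow(Time.hours(1))      // hypothetical: bound the aggregation to one-hour windows
    .maxBy(1)                       // duration; per-key state is dropped as each window closes
    .print();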
The complete code is as follows:
package com.ververica.flinktraining.exercises.datastream_java.basics;
import com.ververica.flinktraining.exercises.datastream_java.sources.TaxiRideSource;
import com.ververica.flinktraining.exercises.datastream_java.datatypes.TaxiRide;
import com.ververica.flinktraining.exercises.datastream_java.utils.ExerciseBase;
import com.ververica.flinktraining.exercises.datastream_java.utils.GeoUtils;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch2.shaded.org.joda.time.Interval;
import org.apache.flink.streaming.connectors.elasticsearch2.shaded.org.joda.time.Minutes;
import org.apache.flink.util.Collector;
/**
* The "Ride Cleansing" exercise from the Flink training
* (http://training.ververica.com).
* The task of the exercise is to filter a data stream of taxi ride records to keep only rides that
* start and end within New York City. The resulting stream should be printed.
*
* Parameters:
* -input path-to-input-file
*
*/
public class Transfor extends ExerciseBase {
    public static void main(String[] args) throws Exception {
        ParameterTool params = ParameterTool.fromArgs(args);
        final String input = params.get("input", ExerciseBase.pathToRideData);

        final int maxEventDelay = 60;       // events are out of order by max 60 seconds
        final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

        // set up streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(ExerciseBase.parallelism);

        // start the data generator
        DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)));

        // DataStream<EnrichedRide> enrichedNYCRides = rides
        //         // filter out rides that do not start or stop in NYC
        //         .filter(new NYCFilter())
        //         .map(new Enrichment());

        // DataStream<EnrichedRide> enrichedNYCRides = rides
        //         // filter out rides that do not start or stop in NYC
        //         .flatMap(new NYCEnrichment());

        DataStream<EnrichedRide> enrichedNYCRides = rides.flatMap(new NYCEnrichment())
                .keyBy(
                    new KeySelector<EnrichedRide, Integer>() {
                        @Override
                        public Integer getKey(EnrichedRide ride) throws Exception {
                            return ride.startCell;
                        }
                    });

        // Tuple2 fields: Integer -> startCell, Minutes -> duration
        DataStream<Tuple2<Integer, Minutes>> minutesByStartCell = enrichedNYCRides
                .flatMap(new FlatMapFunction<EnrichedRide, Tuple2<Integer, Minutes>>() {
                    @Override
                    public void flatMap(EnrichedRide ride,
                                        Collector<Tuple2<Integer, Minutes>> out) throws Exception {
                        // only react to taxi ride END events
                        if (!ride.isStart) {
                            // compute the interval from the start to the end of the ride
                            Interval rideInterval = new Interval(ride.startTime.toDate().getTime(), ride.endTime.toDate().getTime());
                            Minutes duration = rideInterval.toDuration().toStandardMinutes();
                            out.collect(new Tuple2<>(ride.startCell, duration));
                        }
                    }
                });

        minutesByStartCell
                .keyBy(0)   // startCell (keyed by field position)
                .maxBy(1)   // duration
                .print();

        // print the filtered stream
        // enrichedNYCRides.print();

        // run the cleansing pipeline
        env.execute("Taxi Ride Cleansing");
    }

    private static class NYCFilter implements FilterFunction<TaxiRide> {
        @Override
        public boolean filter(TaxiRide taxiRide) throws Exception {
            return GeoUtils.isInNYC(taxiRide.startLon, taxiRide.startLat)
                    && GeoUtils.isInNYC(taxiRide.endLon, taxiRide.endLat);
        }
    }

    public static class Enrichment implements MapFunction<TaxiRide, EnrichedRide> {
        @Override
        public EnrichedRide map(TaxiRide taxiRide) throws Exception {
            return new EnrichedRide(taxiRide);
        }
    }

    public static class NYCEnrichment implements FlatMapFunction<TaxiRide, EnrichedRide> {
        @Override
        public void flatMap(TaxiRide taxiRide, Collector<EnrichedRide> out) throws Exception {
            // Note: this is not an ideal flatMap example, since it is one-to-one
            // or one-to-zero; the string split in WordCount is the classic flatMap.
            FilterFunction<TaxiRide> valid = new NYCFilter();
            if (valid.filter(taxiRide)) {
                out.collect(new EnrichedRide(taxiRide));
            }
        }
    }

    public static class EnrichedRide extends TaxiRide {
        public int startCell;
        public int endCell;

        public EnrichedRide() {}

        public EnrichedRide(TaxiRide ride) {
            this.rideId = ride.rideId;
            this.isStart = ride.isStart;
            this.startTime = ride.startTime;
            this.endTime = ride.endTime;
            this.startCell = GeoUtils.mapToGridCell(ride.startLon, ride.startLat);
            this.endCell = GeoUtils.mapToGridCell(ride.endLon, ride.endLat);
        }

        @Override
        public String toString() {
            return super.toString() + "," +
                    Integer.toString(this.startCell) + "," +
                    Integer.toString(this.endCell) + "," + "hello Map";
        }
    }
}