Flink Table SQL in Practice: StreamTableEnvironment (Part 10, continued)

StreamTableEnvironment is mainly for stream processing. This article implements GroupBy Window Aggregation and Over Window Aggregation through the SQL API, and also demonstrates joining two streaming tables.

The code uses sockets as the data source; you can feed them with a tool such as netcat (e.g. nc -l 9999). Without further ado, the code follows:

1. The entity classes and ScalarFunction the examples depend on:

import lombok.Data;
import java.sql.Timestamp;

@Data
public class StudentInfo{

    private String name;
    private String sex;
    private String course;
    private Float score;
    private Long timestamp;  // "timestamp" is a SQL keyword and cannot be used directly in SQL
    private Timestamp sysDate;
}
import lombok.Data;

@Data
public class Teacher {

    private String teacherName;
    private String studentName;

}
import lombok.Data;
import java.sql.Timestamp;

@Data
public class StudentAndTeacher {
    private String name;
    private String sex;
    private String course;
    private Float score;
    private Timestamp sysDate;
    private String teacherName;

}
import org.apache.flink.table.functions.ScalarFunction;
import java.sql.Timestamp;

public class UTC2Local extends ScalarFunction {

    public Timestamp eval(Timestamp s) {
        long timestamp = s.getTime() + 28800000; // Flink uses UTC by default; we are in UTC+8, so add eight hours to the timestamp
        return new Timestamp(timestamp);
    }

}
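Note that the eight-hour offset above is hard-coded. As a sketch of an alternative (my own variant, not from the original article), you could derive the offset from the JVM's default time zone so the function also works outside UTC+8:

import org.apache.flink.table.functions.ScalarFunction;
import java.sql.Timestamp;
import java.util.TimeZone;

// Hypothetical variant of UTC2Local: derive the UTC offset from the JVM's
// default time zone instead of hard-coding 28800000 ms (UTC+8).
public class UTC2LocalTz extends ScalarFunction {

    public Timestamp eval(Timestamp s) {
        long offset = TimeZone.getDefault().getOffset(s.getTime());
        return new Timestamp(s.getTime() + offset);
    }

}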

2. GroupBy Window Aggregation and Over Window Aggregation via SQL:

import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;

import javax.annotation.Nullable;
import java.sql.Timestamp;

public class FlinkTableSQLStreamingExample {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        // Source: read input data over a socket connection
        DataStreamSource<String> textStudent = env.socketTextStream("127.0.0.1", 9999, "\n");

        // Parse the input stream into StudentInfo objects for downstream processing
        SingleOutputStreamOperator<StudentInfo> dataStreamStudent = textStudent.flatMap(new FlatMapFunction<String, StudentInfo>() {
            @Override
            public void flatMap(String s, Collector<StudentInfo> collector){
                String[] infos = s.split(",");
                if(StringUtils.isNotBlank(s) && infos.length==5){
                    StudentInfo studentInfo = new StudentInfo();
                    studentInfo.setName(infos[0]);
                    studentInfo.setSex(infos[1]);
                    studentInfo.setCourse(infos[2]);
                    studentInfo.setScore(Float.parseFloat(infos[3]));
                    studentInfo.setTimestamp(Long.parseLong(infos[4]));
                    studentInfo.setSysDate(new Timestamp(studentInfo.getTimestamp())); // event time as java.sql.Timestamp
                    collector.collect(studentInfo);
                }
            }
        });

        // Assign event timestamps and periodic watermarks; the watermark trails
        // the current processing time by a fixed five seconds.
        DataStream<StudentInfo> dataStream = dataStreamStudent.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<StudentInfo>() {
            private final long maxTimeLag = 5000; // 5 seconds
            @Nullable
            @Override
            public Watermark getCurrentWatermark() {
                return new Watermark(System.currentTimeMillis() - maxTimeLag);
            }
            @Override
            public long extractTimestamp(StudentInfo studentInfo, long l) {
                return studentInfo.getTimestamp();
            }
        });

        // Register the stream as table "studentInfo"; the .rowtime suffix
        // declares sysDate as the event-time attribute.
        tEnv.registerDataStream("studentInfo",dataStream,"name,sex,course,score,timestamp,sysDate.rowtime");

        tEnv.registerFunction("utc2local",new UTC2Local());

        //GroupBy Window Aggregation: per 2-minute tumbling window, group by name and sum the scores
        Table groupWindAggrTable = tEnv.sqlQuery("SELECT  " +
                "utc2local(TUMBLE_START(sysDate, INTERVAL '2' MINUTE)) as wStart, " +
                "name, SUM(score) as score " +
                "FROM  studentInfo " +
                "GROUP BY TUMBLE(sysDate, INTERVAL '2' MINUTE), name");
        DataStream<Row> groupWindAggrTableResult = tEnv.toAppendStream(groupWindAggrTable, Row.class);
        groupWindAggrTableResult.print();

        Table overWindowAggr = tEnv.sqlQuery(
                "select name,count(course) over(" +
                    "   partition by name order by sysDate" +
                    "   rows between 2 preceding and current row" +
                     ") " +
                "from studentInfo");
        DataStream<Row> overWindowAggrResult = tEnv.toAppendStream(overWindowAggr, Row.class);
        overWindowAggrResult.print();

        Table overWindowAggr2 = tEnv.sqlQuery(
                "select name,COUNT(course) OVER w ,SUM(score) OVER w " +
                "from studentInfo " +
                "window w as ( " +
                        "   partition by name " +
                        "   order by sysDate " +
                        "   rows between 2 preceding and current row " +
                        ") ");
        DataStream<Tuple2<Boolean, Row>> overWindowAggr2Result = tEnv.toRetractStream(overWindowAggr2, Row.class);
        overWindowAggr2Result.print();

        env.execute("studentScoreAnalyse");
    }
}
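A note on the two conversion methods used above: both the windowed GROUP BY and the OVER aggregation emit append-only results, so toAppendStream is sufficient; toRetractStream also works and wraps each row in a Tuple2<Boolean, Row> whose flag marks an add (true) or a retraction (false). A minimal sketch of unwrapping such a stream (my own illustration, the same idea as the flatMap in the join example below):

DataStream<Row> unwrapped = overWindowAggr2Result
        .filter(t -> t.f0)   // keep only accumulate messages
        .map(t -> t.f1)      // drop the flag, keep the Row
        .returns(Row.class); // help type extraction for the generic lambda
unwrapped.print();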

Input data:
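The flatMap above expects five comma-separated fields per line: name, sex, course, score, and an epoch timestamp in milliseconds. A hypothetical input line (illustrative values, not from the original) typed into the netcat session on port 9999 would look like:

tom,male,math,89.5,1565149500000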

Output:

3. Joining tables, and using a GroupBy Window after the join:

Rowtime attributes must not be in the input rows of a regular join (this is the error Flink raises if you try). As a workaround, you can cast the time attribute of the input table to TIMESTAMP before the join, or re-assign the time attribute after the join.
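The code below takes the second route: studentInfo is registered without a rowtime attribute, the regular join is converted to a retract stream, and the rowtime attribute is re-assigned on the joined stream. The first route, casting the attribute before the join, would look roughly like this (a sketch, assuming studentInfo had been registered with sysDate.rowtime):

Table casted = tEnv.sqlQuery(
        "SELECT name, sex, course, score, CAST(sysDate AS TIMESTAMP) AS sysDate " +
        "FROM studentInfo");

After the cast, sysDate is a plain TIMESTAMP column and the table can participate in a regular join.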

import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;

import javax.annotation.Nullable;
import java.sql.Timestamp;

public class FlinkTableSQLStreamingJoinExample {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        // Source: read both streams over socket connections
        DataStreamSource<String> textStudent = env.socketTextStream("127.0.0.1", 9999, "\n");

        DataStreamSource<String> textTeacher = env.socketTextStream("127.0.0.1", 9998, "\n");

        // Parse the student input stream into StudentInfo objects for downstream processing
        SingleOutputStreamOperator<StudentInfo> dataStreamStudent = textStudent.flatMap(new FlatMapFunction<String, StudentInfo>() {
            @Override
            public void flatMap(String s, Collector<StudentInfo> collector){
                String[] infos = s.split(",");
                if(StringUtils.isNotBlank(s) && infos.length==5){
                    StudentInfo studentInfo = new StudentInfo();
                    studentInfo.setName(infos[0]);
                    studentInfo.setSex(infos[1]);
                    studentInfo.setCourse(infos[2]);
                    studentInfo.setScore(Float.parseFloat(infos[3]));
                    studentInfo.setTimestamp(Long.parseLong(infos[4]));
                    studentInfo.setSysDate(new Timestamp(studentInfo.getTimestamp())); // event time as java.sql.Timestamp
                    collector.collect(studentInfo);
                }
            }
        });
        // Register the student stream as table "studentInfo". Note: sysDate is a plain
        // field here (no .rowtime), because a rowtime attribute is not allowed in the
        // input of a regular join.
        tEnv.registerDataStream("studentInfo",dataStreamStudent,"name,sex,course,score,timestamp,sysDate");


        // Parse the teacher stream
        SingleOutputStreamOperator<Teacher> dataStreamTeacher = textTeacher.flatMap(new FlatMapFunction<String, Teacher>() {
            @Override
            public void flatMap(String s, Collector<Teacher> collector){
                String[] infos = s.split(",");
                if(StringUtils.isNotBlank(s) && infos.length==2){
                    Teacher teacher = new Teacher();
                    teacher.setTeacherName(infos[0]);
                    teacher.setStudentName(infos[1]);
                    collector.collect(teacher);
                }
            }
        });
        tEnv.registerTable("teacher",tEnv.fromDataStream(dataStreamTeacher));

        Table joinTable = tEnv.sqlQuery("SELECT name,sex,course,score,teacherName,sysDate from ( " +
                                         "  select *  " +
                                         "  FROM studentInfo LEFT JOIN teacher " +
                                         "  ON studentInfo.name = teacher.studentName)");
        DataStream<Tuple2<Boolean, StudentAndTeacher>> joinTableResult = tEnv.toRetractStream(joinTable,StudentAndTeacher.class);
        joinTableResult.print();

        // Convert Tuple2<Boolean, StudentAndTeacher> back to StudentAndTeacher so that the
        // rowtime attribute can be re-assigned for the window aggregation below. When a
        // retract stream is converted to a Table, the fields default to f0 (the add/retract
        // flag) and f1 (the record). Only accumulate messages (f0 == true) are forwarded;
        // retractions are dropped.
        SingleOutputStreamOperator<StudentAndTeacher> dataStreamFormat = joinTableResult.flatMap(new FlatMapFunction<Tuple2<Boolean, StudentAndTeacher>, StudentAndTeacher>() {
            @Override
            public void flatMap(Tuple2<Boolean, StudentAndTeacher> record, Collector<StudentAndTeacher> collector) throws Exception {
                if (record.f0) { // keep only accumulate messages, drop retractions
                    collector.collect(record.f1);
                }
            }
        });

        // Assign event timestamps (from sysDate) and periodic watermarks trailing
        // processing time by five seconds, as in the first example.
        DataStream<StudentAndTeacher> dataStream = dataStreamFormat.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<StudentAndTeacher>() {
            private final long maxTimeLag = 5000; // 5 seconds
            @Override
            public long extractTimestamp(StudentAndTeacher studentAndTeacher, long l) {
                return studentAndTeacher.getSysDate().getTime();
            }

            @Nullable
            @Override
            public Watermark getCurrentWatermark() {
                return new Watermark(System.currentTimeMillis() - maxTimeLag);
            }
        });

        // Register the joined stream as table "studentAndTeacherInfo"; the .rowtime
        // suffix declares sysDate as the event-time attribute.
        tEnv.registerDataStream("studentAndTeacherInfo",dataStream,"name,sex,course,score,teacherName,sysDate.rowtime");

        tEnv.registerFunction("utc2local",new UTC2Local());

        //GroupBy Window Aggregation: per 2-minute tumbling window, group by name and sum the scores
        Table groupWindAggrTable = tEnv.sqlQuery("SELECT  " +
                "utc2local(TUMBLE_START(sysDate, INTERVAL '2' MINUTE)) as wStart, " +
                "name, SUM(score) as score " +
                "FROM  studentAndTeacherInfo " +
                "GROUP BY TUMBLE(sysDate, INTERVAL '2' MINUTE), name");
        DataStream<Row> groupWindAggrTableResult = tEnv.toAppendStream(groupWindAggrTable, Row.class);
        groupWindAggrTableResult.print();

        env.execute("studentScoreAnalyse");

    }
}

Input data:
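Both sockets need to be fed. Hypothetical lines (illustrative values, not from the original): student records on port 9999 in the same five-field format as before, e.g.

tom,male,math,89.5,1565149500000

and teacher records on port 9998 as teacherName,studentName pairs, e.g.

mr.wang,tom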

Output:
