Flink示例——Table、SQL
版本信息
| 产品 | 版本 |
|---|---|
| Flink | 1.7.2 |
| Java | 1.8.0_231 |
| Scala | 2.11.12 |
Maven依赖
- pom.xml 依赖部分
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<!-- Table API & SQL support -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<!-- The Table API implementation is written in Scala, so the Scala artifacts must be on the classpath -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
自定义SourceFunction
- 提供一个SourceFunction,方便后面测试
public class CustomSourceFunction extends RichSourceFunction<String> { private boolean flag = true; private long idAdder = 0L; @Override public void run(SourceContext<String> ctx) throws Exception { List<String> nameList = Arrays.asList("xiaowang", "lilei", "yangyang", "zhangsan", "lisi", "wangwu", "meimei"); List<String> addressList = Arrays.asList("beijing", "chongqing", "shanghai", "nanjing", "chengdu", "guangzhou"); Random random = new Random(); while (flag) { Thread.sleep(100); // 模拟数据 id,name,age,address,eventTime long id = idAdder++; String name = nameList.get(random.nextInt(nameList.size())); int age = random.nextInt(20) + 10; String address = addressList.get(random.nextInt(addressList.size())); // 随机模拟业务时间 long eventTime = System.currentTimeMillis() + (random.nextInt(nameList.size()) - nameList.size() / 2) * 1000; ctx.collect(id + "," + name+ "," + age + "," + address + "," + eventTime); } } @Override public void cancel() { flag = false; } }
TableAPI、SQL 简单示例
- 使用Tuple方式转换DataStream(较繁琐)
public class SimpleDemo { public static void main(String[] args) { // Environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(3); // 获取TableEnvironment StreamTableEnvironment tableEnv = StreamTableEnvironment.getTableEnvironment(env); // 自定义数据源 CustomSourceFunction sourceFunction = new CustomSourceFunction(); DataStreamSource<String> customDS = env.addSource(sourceFunction); // 使用TableAPI处理DataStream // 使用Tuple方式,较繁琐 // returns必须写 DataStream<Tuple5<Long, String, Integer, String, Long>> personDS = customDS.map(line -> { String[] fields = line.split(","); long id = Long.parseLong(fields[0]); String name = fields[1]; int age = Integer.parseInt(fields[2]); String address = fields[3]; long eventTime = Long.parseLong(fields[4]); return Tuple5.of(id, name, age, address, eventTime); }).returns(Types.TUPLE(Types.LONG, Types.STRING, Types.INT, Types.STRING, Types.LONG)); // 将DataStream转为Table Table srcTable = tableEnv.fromDataStream(personDS, "id, name, age, address, eventTime"); // 打印Schema srcTable.printSchema(); // 注意此处SQL较标准SQL不同 // 数字20不可写为'20' // &&不可写为AND Table table = srcTable .filter("age > 20 && age < 25") .select("name, age, address"); // 将Table转为DataStream tableEnv.toAppendStream(table, Row.class) .print(); try { env.execute(); } catch (Exception e) { e.printStackTrace(); } } } - 使用POJO方式转换DataStrem(需要定义POJO类)
- POJO类 Person
public class Person { // 字段必须public public long id; public String name; public int age; public String address; public long eventTime; // 必须声明public的无参构造 public Person() { } public Person(long id, String name, int age, String address, long eventTime) { this.id = id; this.name = name; this.age = age; this.address = address; this.eventTime = eventTime; } /** * 定义一个工厂方法,用于解析CSV格式的数据 * @param line 逗号分隔的csv格式数据, * @return 对象 {@link Person} */ public static Person parseCSV(String line) { String[] fields = line.split(","); long id = Long.parseLong(fields[0]); String name = fields[1]; int age = Integer.parseInt(fields[2]); String address = fields[3]; long eventTime = Long.parseLong(fields[4]); return new Person(id, name, age, address, eventTime); } } - 处理代码
// 使用POJO方式转换DataStrem(需要定义POJO类) // 处理,解析一行数据为Person对象 DataStream<Person> personDS = customDS.map(Person::parseCSV); Table table = tableEnv.fromDataStream(personDS) .filter("age > 20 && age < 25") .select("name, age, address"); tableEnv.toAppendStream(table, Row.class) .print();
- POJO类 Person
- 使用SQL处理DataStream
DataStream<Person> personDS = customDS.map(Person::parseCSV);
// Register personDS as a table named tb_person so SQL can reference it
tableEnv.registerDataStream("tb_person", personDS);
// SQL requires AND here; && is Table API syntax, not valid SQL
// NOTE(review): age is an INT but is compared against string literals '20'/'25';
// Calcite coerces the literals, but plain numeric literals (age > 20 AND age < 25)
// would be the conventional form — confirm before relying on the quoted variant.
Table table = tableEnv.sqlQuery(
        "SELECT name, age, address FROM tb_person WHERE age > '20' AND age < '25'"
);
tableEnv.toAppendStream(table, Row.class).print();
TableAPI、SQL 窗口聚合示例
- 代码 WindowAggDemo
public class WindowAggDemo { public static void main(String[] args) { // Environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(3); // 获取TableEnvironment StreamTableEnvironment tableEnv = StreamTableEnvironment.getTableEnvironment(env); // 自定义数据源 CustomSourceFunction sourceFunction = new CustomSourceFunction(); DataStreamSource<String> customDS = env.addSource(sourceFunction); // 处理,解析一行数据为Person对象 DataStream<Person> personDS = customDS.map(Person::parseCSV); // 将DataStream转为Table // UserActionTime.proctime表示使用处理时间 // UserActionTime.rowtime表示事件时间 Table srcTable = tableEnv.fromDataStream(personDS, "id, name, age, address, eventTime, UserActionTime.proctime"); // 创建窗口,5秒一个窗口,将UserActionTime字段作为时间,窗口别名为myWindow Window myWindow = Tumble.over("5.seconds").on("UserActionTime").as("myWindow"); // 窗口+聚合 Table table = srcTable.window(myWindow) .groupBy("myWindow, address") // 第一个字段为窗口别名 .select("address, age.avg"); // age.avg即对年龄求均值 // 输出 tableEnv.toRetractStream(table, Row.class) .filter(tuple2 -> tuple2.f0) // 只要新增数据 .print(); try { env.execute(); } catch (Exception e) { e.printStackTrace(); } } }

本文介绍如何在Apache Flink 1.7.2版本中使用Table API和SQL进行数据处理,包括自定义SourceFunction的实现,以及通过Table API和SQL对DataStream进行简单示例操作和窗口聚合示例。文章详细展示了如何使用Java和Scala编写Flink应用程序,并提供了从DataStream到Table的转换方法,以及使用SQL查询Table的示例。
2309

被折叠的 条评论
为什么被折叠?



