BatchTableEnvironment: Custom Scalar Functions
Scalar functions: map scalar values to a new scalar value.
Table functions: map scalar values to new rows.
Aggregate functions: map the scalar values of multiple rows to a new scalar value.
Table aggregate functions: map the scalar values of multiple rows to new rows.
Async table functions: special functions for table sources that perform lookup queries.
Maven dependencies:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.9.1</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.9.1</version>
</dependency>
<!-- Table & SQL API support for the DataStream / DataSet APIs, using the Java programming language -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.11</artifactId>
<version>1.9.1</version>
<!--<scope>provided</scope>-->
</dependency>
<!-- Table program planner and runtime -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.11</artifactId>
<version>1.9.1</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-jdbc_2.11</artifactId>
<version>1.9.1</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.18</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.49</version>
</dependency>
// Custom function 1
import org.apache.flink.table.functions.ScalarFunction;
import java.text.SimpleDateFormat;
import java.util.Date;
// A simple scalar function that parses a "yyyy-MM-dd HH:mm:ss" string into a Unix timestamp in seconds
public class StrToTimestamp extends ScalarFunction {
    private SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    public Long eval(String dateStr) {
        long timestamp = 0L;
        try {
            Date date = sdf.parse(dateStr);
            // Convert milliseconds since the epoch to seconds
            timestamp = date.getTime() / 1000;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return timestamp;
    }
}
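Side note: SimpleDateFormat is not thread-safe, which can matter once Flink evaluates a UDF in several parallel tasks. A java.time-based variant is one option (my sketch; the class name StrToTimestampJavaTime is made up here):
import org.apache.flink.table.functions.ScalarFunction;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
public class StrToTimestampJavaTime extends ScalarFunction {
    // DateTimeFormatter is immutable and thread-safe, unlike SimpleDateFormat
    private static final DateTimeFormatter FMT = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    public Long eval(String dateStr) {
        // Parse the string and convert it to epoch seconds in the JVM's default time zone
        return LocalDateTime.parse(dateStr, FMT).atZone(ZoneId.systemDefault()).toEpochSecond();
    }
}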
// Custom function 2
import org.apache.flink.table.functions.ScalarFunction;
import java.sql.Date;
import java.text.SimpleDateFormat;
// A simple scalar function that formats a java.sql.Date as a "yyyy-MM-dd" string
public class DateToStr extends ScalarFunction {
    private SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
    public String eval(Date date) {
        return sdf.format(date);
    }
}
// Custom function 3
import org.apache.flink.table.functions.ScalarFunction;
// Returns the max total when it is present, otherwise falls back to the first total (like SQL COALESCE)
public class FirstOrMaxQuantity extends ScalarFunction {
    public Float eval(Float firstTotalChargeQuantity, Float maxTotalChargeQuantity) {
        return maxTotalChargeQuantity == null ? firstTotalChargeQuantity : maxTotalChargeQuantity;
    }
}
// Custom function 4
import org.apache.flink.table.functions.ScalarFunction;
import java.sql.Timestamp;
import java.text.DecimalFormat;
// Rounds a timestamp's time of day up to the next half-hour boundary ("HH:mm:00")
public class TimeCeil extends ScalarFunction {
    public String eval(Timestamp column) {
        String field = column.toString().split(" ")[1];
        String[] timeSplit = field.split(":");
        // Zero-pad the hour to two digits
        DecimalFormat g1 = new DecimalFormat("00");
        String hour = timeSplit[0];
        String standard;
        // Round up: minutes >= 30 go to the next full hour, otherwise to the half hour
        if (Integer.parseInt(timeSplit[1]) >= 30) {
            hour = g1.format(Integer.parseInt(hour) + 1);
            standard = "00";
        } else {
            standard = "30";
        }
        // Clamp the 24:00:00 overflow (from times in [23:30, 24:00)) to the end of the day
        if ((hour + ":" + standard + ":00").equals("24:00:00")) {
            return "23:59:59";
        } else {
            return hour + ":" + standard + ":00";
        }
    }
}
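A quick sanity check of the rounding behavior (a standalone snippet added here; the last expected value assumes the 24:00 clamp above):
public class TimeCeilTest {
    public static void main(String[] args) {
        TimeCeil f = new TimeCeil();
        System.out.println(f.eval(java.sql.Timestamp.valueOf("2024-12-16 11:23:45"))); // 11:30:00
        System.out.println(f.eval(java.sql.Timestamp.valueOf("2024-12-16 11:45:00"))); // 12:00:00
        System.out.println(f.eval(java.sql.Timestamp.valueOf("2024-12-16 23:45:00"))); // 23:59:59
    }
}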
// Custom function 5
import org.apache.flink.table.functions.ScalarFunction;
// Returns the half-hour bucket length in seconds: 1799 for the day's last bucket ("23:30:00",
// which ends at 23:59:59) and 1800 otherwise
public class TsMinusHalf extends ScalarFunction {
    public Long eval(String dateStr) {
        long timestamp = 0L;
        try {
            if (dateStr.equals("23:30:00")) {
                timestamp = 1799;
            } else {
                timestamp = 1800;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return timestamp;
    }
}
Note: BatchTableEnvironment still works in Flink 1.9.1, was already deprecated in 1.13.6, and has been removed completely in 1.17.2.
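On those newer versions the replacement is the unified TableEnvironment created in batch mode; a minimal sketch (my addition, assuming Flink 1.13+ APIs, not part of the original example):
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
public class BatchEnvSketch {
    public static void main(String[] args) {
        // Batch execution mode replaces the old BatchTableEnvironment
        EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build();
        TableEnvironment tEnv = TableEnvironment.create(settings);
        // registerFunction(...) is superseded by createTemporarySystemFunction(...)
        tEnv.createTemporarySystemFunction("StrToTimestamp", StrToTimestamp.class);
    }
}
The 1.9.1-style example below still uses BatchTableEnvironment: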
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.jdbc.JDBCInputFormat;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.types.Row;
import java.sql.Date;
import java.sql.Timestamp;
public class FlinkTest {
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
BatchTableEnvironment bTableEnv = BatchTableEnvironment.create(env);
bTableEnv.registerFunction("TsMinusHalf", new TsMinusHalf());
bTableEnv.registerFunction("DateToStr", new DateToStr());
bTableEnv.registerFunction("TimeCeil", new TimeCeil());
bTableEnv.registerFunction("StrToTimestamp", new StrToTimestamp());
bTableEnv.registerFunction("FirstOrMaxQuantity", new FirstOrMaxQuantity());
// Option 1: build the test data by hand
// DataSet<Row> sourceData = env.fromElements(
// Row.of("cabinet", "0723HBGCES5050-ssgj", "0723HBGCES5050", "des-1691575468267", new java.sql.Date(new java.util.Date().getTime()), new Timestamp(System.currentTimeMillis()), "des-1691575468267-subSys-1", "9", 189501.0f, 179381.0f),
// Row.of("cabinet", "0723HBGCES5050-ssgj", "0723HBGCES5050", "des-1691575468267", new java.sql.Date(new java.util.Date().getTime()), new Timestamp(System.currentTimeMillis()), "des-1691575468267-subSys-1", "9", 189505.0f, 179387.0f),
// Row.of("cabinet", "0723HBGCES5050-ssgj", "0723HBGCES5050", "des-1691575468267", new java.sql.Date(new java.util.Date().getTime()), new Timestamp(System.currentTimeMillis()), "des-1691575468267-subSys-1", "9", 189505.0f, 179387.0f)
// );
// Option 2: read the data from StarRocks
TypeInformation[] fieldTypes = {
Types.STRING,
Types.STRING,
Types.STRING,
Types.STRING,
Types.SQL_DATE,
Types.SQL_TIMESTAMP,
Types.STRING,
Types.INT,
Types.FLOAT,
Types.FLOAT
};
RowTypeInfo rowTypeInfo = new RowTypeInfo(fieldTypes);
JDBCInputFormat jdbcInputFormat = JDBCInputFormat.buildJDBCInputFormat()
    .setDrivername("com.mysql.jdbc.Driver")
    .setDBUrl("jdbc:mysql://xxx.xxx.xxx.xxx:9030/dws?useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=Asia/Shanghai")
    .setUsername("root")
    .setPassword("")
    .setQuery("select student_type, student_id, teacher_id, school_id, data_date, data_fact_time, class_id, status, english_score_total, math_score_total from dwd.dwd_student where student_id = '4145646564erww-heheda' and data_date = '2024-12-16'")
    .setRowTypeInfo(rowTypeInfo)
    .finish();
DataSet<Row> sourceData = env.createInput(jdbcInputFormat);
Table dwdstudent = bTableEnv.fromDataSet(sourceData);
bTableEnv.registerTable("dwd_student", dwdstudent);
Table result = bTableEnv.sqlQuery("select\n" +
" studentType,\n" +
" studentId,\n" +
" teacherId,\n" +
" schoolId,\n" +
" classId,\n" +
" DateStr,\n" +
" HalfHour,\n" +
" StrToTimestamp(concat(DateToStr(DateStr), ' ', HalfHour)) as ts,\n" +
" min(englishScoreTotal) as first_englishScoreTotal,\n" +
" max(englishScoreTotal) as last_englishScoreTotal,\n" +
" min(mathScoreTotal) as first_mathScoreTotal,\n" +
" max(mathScoreTotal) as last_mathScoreTotal\n" +
"from\n" +
" (\n" +
" select\n" +
" f0 as studentType,\n" +
" f1 as studentId,\n" +
" f2 as teacherId,\n" +
" f3 as schoolId,\n" +
" f4 as DateStr,\n" +
" f5 as TimeStr,\n" +
" TimeCeil(f5) as HalfHour,\n" +
" f6 as classId,\n" +
" f7 as status,\n" +
" f8 as englishScoreTotal,\n" +
" f9 as mathScoreTotal\n" +
" from\n" +
" dwd_student\n" +
" )\n" +
"where\n" +
" englishScoreTotal <> 0 and mathScoreTotal <> 0\n" +
"group by\n" +
" studentType, studentId, teacherId, schoolId, classId, DateStr, HalfHour");
// result.printSchema();
Table result2 = result
.select("classId as classIdTmp, ts as tsTmp, ts + TsMinusHalf(HalfHour) as tsMinusHalf, last_englishScoreTotal as max_englishScoreTotal, last_mathScoreTotal as max_mathScoreTotal");
Table result3 = result
.leftOuterJoin(result2, "classIdTmp = classId && tsMinusHalf = ts")
.select("studentType, studentId, teacherId, schoolId, classId, DateStr, HalfHour, ts, FirstOrMaxQuantity(first_englishScoreTotal, max_englishScoreTotal) as first_englishScoreTotal, last_englishScoreTotal, FirstOrMaxQuantity(first_mathScoreTotal, max_mathScoreTotal) as first_mathScoreTotal, last_mathScoreTotal");
Table result4 = result3.select("studentType, studentId, teacherId, schoolId, classId, DateStr, ts, HalfHour, last_englishScoreTotal - first_englishScoreTotal as _diff_englishScoreTotal, last_mathScoreTotal - first_mathScoreTotal as _diff_mathScoreTotal");
DataSet<Row> rowDataset = bTableEnv.toDataSet(result4, Row.class);
rowDataset.print();
}
}
Execution result:
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734346800,19:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734352200,20:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734318000,11:00:00,0.0,51.40625
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734334200,15:30:00,0.0,50.890625
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734355800,21:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734361200,23:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734319800,11:30:00,0.0,50.796875
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734345000,18:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734283800,01:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734289200,03:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734298200,05:30:00,28.3125,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734364800,24:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734300000,06:00:00,28.09375,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734316200,10:30:00,0.0,49.09375
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734336000,16:00:00,0.0,28.109375
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734343200,18:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734303600,07:00:00,28.296875,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734354000,21:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734357600,22:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734339600,17:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734350400,20:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734292800,04:00:00,27.203125,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734310800,09:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734314400,10:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734363000,23:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734296400,05:00:00,28.390625,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734327000,13:30:00,53.09375,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734330600,14:30:00,0.0,49.296875
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734309000,08:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734312600,09:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734332400,15:00:00,0.0,51.40625
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734359400,22:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734285600,02:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734291000,03:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734301800,06:30:00,28.296875,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734305400,07:30:00,4.203125,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734321600,12:00:00,0.0,34.90625
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734337800,16:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734280200,00:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734282000,01:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734307200,08:00:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734323400,12:30:00,51.296875,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734348600,19:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734328800,14:00:00,35.40625,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734341400,17:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734325200,13:00:00,50.8125,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734287400,02:30:00,0.0,0.0
4145646564erww-heheda,fasdfasdf,new,student-abcdefg,student-abcdefg-class-1,2024-12-16,1734294600,04:30:00,28.0,0.0
References:
flink 定义一个全局的状态 flink 自定义udf
Flink1.9 UDF使用教程
在Apache Flink中,Java UDF(用户自定义函数)的使用涉及几个关键步骤
Flink UDF使用指南,你竟然不会用?
The future of the DataSet API: note that Flink's official roadmap no longer prioritizes new features for the DataSet API; future development will focus on the DataStream API, and even batch processing will eventually be implemented through it. Where possible, new projects should therefore prefer the DataStream API over the DataSet API. In particular, Flink's Table API and SQL API also cover both batch and stream processing, and these high-level APIs offer more concise syntax and stronger optimization capabilities. From: 大数据-118 - Flink DataSet 基本介绍 核心特性 创建、转换、输出等 (original post)
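As a small illustration of that direction, a bounded DataStream program can already be executed in batch mode (a sketch of mine, assuming Flink 1.12+; not from the cited article):
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class BatchModeSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // BATCH mode gives bounded DataStream programs DataSet-style batch scheduling
        env.setRuntimeMode(RuntimeExecutionMode.BATCH);
        env.fromElements(1, 2, 3)
                .map(i -> i * 2)
                .returns(Types.INT) // help Flink's type extraction for the lambda
                .print();
        env.execute("batch-mode sketch");
    }
}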
Streaming: Custom Aggregate Functions
Maven dependencies:
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>1.13.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>1.13.6</version>
</dependency>
<!-- Table program planner and runtime -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_2.12</artifactId>
<version>1.13.6</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId> <!-- the Scala suffix should match your project's Scala version -->
<version>1.13.6</version>
</dependency>
</dependencies>
Custom aggregate function: methods that AggregateFunction requires you to implement:
- createAccumulator()
- accumulate()
- getValue()
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.table.functions.AggregateFunction;
// A custom AggregateFunction<result type, accumulator type>
public class AvgTemp extends AggregateFunction<Double, Tuple2<Double, Integer>> {
    // Produce the output value from the accumulator
    @Override
    public Double getValue(Tuple2<Double, Integer> acc) {
        return acc.f0 / acc.f1;
    }
    // Initialize the accumulator
    @Override
    public Tuple2<Double, Integer> createAccumulator() {
        return new Tuple2<>(0.0, 0);
    }
    // An accumulate(accumulator, input) method is required; it updates the state for each incoming record
    public void accumulate(Tuple2<Double, Integer> acc, Double temp) {
        acc.f0 += temp;
        acc.f1 += 1;
    }
}
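Beyond the three required methods, an AggregateFunction may declare optional methods such as retract(...) and merge(...); merge is needed, for example, when the function is used with session windows. A sketch of what merge() could look like inside AvgTemp, following the documented merge(ACC, Iterable<ACC>) contract (my addition):
// Optional: merge several accumulators into one (needed e.g. for session windows)
public void merge(Tuple2<Double, Integer> acc, Iterable<Tuple2<Double, Integer>> others) {
    for (Tuple2<Double, Integer> other : others) {
        acc.f0 += other.f0;
        acc.f1 += other.f1;
    }
}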
Testing in a main method:
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
public class FlinkTest {
public static void main(String[] args) throws Exception {
// Set up the streaming execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// Create the streaming table environment
EnvironmentSettings environmentSettings = EnvironmentSettings
.newInstance()
.build();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, environmentSettings);
// Create a data stream and convert it to a table
DataStreamSource<Tuple3<String,Long,Double>> dataStream = env.fromElements(
new Tuple3<>("hehe", 1L, 0.23),
new Tuple3<>("haha", 2L, 3.5)
);
// Convert the stream (Tuple3<String, Long, Double>) into a table
Table sourceTable = tableEnv.fromDataStream(dataStream, "f0 as id, f1 as ts, f2 as temp, pt.proctime");
// Register the UDAF with the table environment
AvgTemp avgTemp = new AvgTemp();
tableEnv.registerFunction("avgTemp", avgTemp);
// Table API
Table resultTable = sourceTable.groupBy("id")
    .aggregate("avgTemp(temp) as avgtemp")
    .select("id, avgtemp");
tableEnv.toRetractStream(resultTable, Row.class).print();
// Execute the Flink job
env.execute("Flink sql SplitFunction Test");
System.out.println("---------------------");
// SQL
tableEnv.createTemporaryView("sensor", sourceTable);
Table resultSqlTable = tableEnv.sqlQuery("select id, avgTemp(temp) as avgtemp from sensor group by id");
tableEnv.toRetractStream(resultSqlTable, Row.class).print();
// Execute the Flink job
env.execute("Flink sql SplitFunction Test");
}
}
Execution result:
(true,+I[hehe, 0.23])
(true,+I[haha, 3.5])
---------------------
(true,+I[hehe, 0.23])
(true,+I[haha, 3.5])
Reference: FlinkSQL-自定义聚合函数AggregateFunction
Streaming: Custom Table Functions
Maven dependencies:
<!-- Version 1.13.6 -->
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>1.13.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>1.13.6</version>
</dependency>
<!-- Table program planner and runtime -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_2.12</artifactId>
<version>1.13.6</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId> <!-- the Scala suffix should match your project's Scala version -->
<version>1.13.6</version>
</dependency>
</dependencies>
<!-- Version 1.17.2 -->
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Table program planner and runtime -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-loader</artifactId>
<version>1.17.2</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients</artifactId> <!-- no Scala suffix needed as of Flink 1.15 -->
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-runtime</artifactId>
<version>1.17.2</version>
</dependency>
</dependencies>
Custom UDF: string splitting
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
@FunctionHint(output = @DataTypeHint("ROW<`str_value` STRING>"))
public class SplitFunction extends TableFunction<Row> {
    // Implement eval, which splits the input string and emits each substring
    public void eval(String str, String regex) {
        if (str != null) {
            // Split the input string with the given regular expression
            for (String s : str.split(regex)) {
                // Emit one row per substring via collect(...)
                collect(Row.of(s));
            }
        }
    }
}
Testing in a main method:
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
public class FlinkTest {
public static void main(String[] args) throws Exception {
// Set up the streaming execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// Create the streaming table environment
EnvironmentSettings environmentSettings = EnvironmentSettings
.newInstance()
.build();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, environmentSettings);
// Create a data stream and convert it to a table
DataStreamSource<String> dataStream = env.fromElements("hello,world");
Table table = tableEnv.fromDataStream(dataStream);
table.printSchema(); // print the table schema
// Create a temporary view
tableEnv.createTemporaryView("MyTable", table);
// Register the custom function SplitFunction
tableEnv.createTemporarySystemFunction("SplitFunction", SplitFunction.class);
// Run a SQL query that calls SplitFunction to split the string
Table result = tableEnv.sqlQuery(
"SELECT f0, str_value " +
"FROM MyTable " +
"LEFT JOIN LATERAL TABLE(SplitFunction(f0, ',')) ON TRUE");
// Convert the result into a data stream and print it
tableEnv.toDataStream(result, Row.class).print();
// Execute the Flink job
env.execute("Flink sql SplitFunction Test");
}
}
Execution result:
(
`f0` STRING
)
+I[hello,world, hello]
+I[hello,world, world]
Process finished with exit code 0
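For reference, the same lateral join can also be expressed with the Table API instead of SQL; a sketch using Expressions.call against the `table` object from the example above (my addition, not verified here):
import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;
// Table API equivalent of LEFT JOIN LATERAL TABLE(SplitFunction(f0, ',')) ON TRUE
Table result = table
        .leftOuterJoinLateral(call(SplitFunction.class, $("f0"), ","))
        .select($("f0"), $("str_value"));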
Examples of Operations Supported by the Flink Table API
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Table program planner and runtime -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-loader</artifactId>
<version>1.17.2</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients</artifactId> <!-- no Scala suffix needed as of Flink 1.15 -->
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-runtime</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
</dependency>
</dependencies>
GroupBy Window Aggregation:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.Tumble;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import java.time.Duration;
import java.util.Arrays;
import java.util.List;
import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.lit;
/**
* @author alanchan
*
*/
public class TestTableAPIOperationDemo2 {
final static List<User> userList = Arrays.asList(
new User(1L, "alan", 18, 1698742358391L),
new User(2L, "alan", 19, 1698742359396L),
new User(3L, "alan", 25, 1698742360407L),
new User(4L, "alanchan", 28, 1698742361409L),
new User(5L, "alanchan", 29, 1698742362424L)
);
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tenv = StreamTableEnvironment.create(env);
DataStream<User> users = env.fromCollection(userList)
.assignTimestampsAndWatermarks(
WatermarkStrategy
.<User>forBoundedOutOfOrderness(Duration.ofSeconds(1))
.withTimestampAssigner((user, recordTimestamp) -> user.getRowtime())
)
;
Table usersTable = tenv.fromDataStream(users, $("id"), $("name"), $("balance"), $("rowtime").rowtime());
// Group and aggregate the table using a group window combined with one or more grouping keys
Table result = usersTable
        .window(Tumble.over(lit(5).minutes()).on($("rowtime")).as("w")) // define the window
        .groupBy($("name"), $("w")) // group by key and window
        // access the window properties and aggregate
        .select(
                $("name"),
                $("w").start(),
                $("w").end(),
                $("w").rowtime(),
                $("balance").sum().as("sum(balance)")
        );
DataStream<Tuple2<Boolean, Row>> resultDS = tenv.toRetractStream(result, Row.class);
resultDS.print();
// 2> (true,+I[alan, 2023-10-31T08:50, 2023-10-31T08:55, 2023-10-31T08:54:59.999, 62])
// 16> (true,+I[alanchan, 2023-10-31T08:50, 2023-10-31T08:55, 2023-10-31T08:54:59.999, 57])
env.execute();
}
@Data
@NoArgsConstructor
@AllArgsConstructor
public static class User {
private long id;
private String name;
private int balance;
private Long rowtime;
}
}
Execution result:
2> (true,+I[alan, 2023-10-31T08:50, 2023-10-31T08:55, 2023-10-31T08:54:59.999, 62])
16> (true,+I[alanchan, 2023-10-31T08:50, 2023-10-31T08:55, 2023-10-31T08:54:59.999, 57])
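Besides Tumble, the Table API also provides Slide and Session group windows. A sliding-window variant of the query above (a sketch; it reuses usersTable and additionally imports org.apache.flink.table.api.Slide):
// 10-minute windows sliding every 5 minutes on the event-time attribute
Table sliding = usersTable
        .window(Slide.over(lit(10).minutes()).every(lit(5).minutes()).on($("rowtime")).as("w"))
        .groupBy($("name"), $("w"))
        .select($("name"), $("w").start(), $("w").end(), $("balance").sum().as("sum_balance"));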
From:
【flink番外篇】9、Flink Table API 支持的操作示例(1)-完整版
【flink番外篇】9、Flink Table API 支持的操作示例(6)- 表的聚合(group by、Distinct、GroupBy/Over Window Aggregation)操作
Flink教程(16)- Flink Table与SQL
Flink Program Launch Commands
Submit a job from the command line:
./bin/flink run -c com.my.program.Main /path/to/user/jar/my-program.jar
# Here, -c is followed by the fully qualified main class, and the .jar file is the JAR package of your program.
Submit a job with arguments:
./bin/flink run -c com.my.program.Main -p 5 /path/to/user/jar/my-program.jar --arg1 value1 --arg2 value2
# Here, -p sets the job's parallelism to 5, and --arg1 value1 --arg2 value2 are arguments passed to the main class.
Submit a YARN job:
./bin/flink run -m yarn-cluster -c com.my.program.Main /path/to/user/jar/my-program.jar
# Here, -m specifies the submission target; yarn-cluster means the job runs on a YARN cluster.
Submit a Kubernetes job:
./bin/flink run-application -t kubernetes-application -Dkubernetes.cluster-id=my-cluster -Dkubernetes.namespace=default -c com.my.program.Main /path/to/user/jar/my-program.jar
# Here, -t specifies the deployment target; kubernetes-application means the job runs on Kubernetes in application mode.
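A few other commonly used commands for managing submitted jobs (for reference; <jobId> is a placeholder):
./bin/flink list
# List the running and scheduled jobs.
./bin/flink cancel <jobId>
# Cancel a running job.
./bin/flink savepoint <jobId> /path/to/savepoints
# Trigger a savepoint for a running job and write it to the given directory.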