Apache Flink Java Example: Batch Data Analysis (Table/SQL API)
This article walks through how to use Apache Flink's Table API and SQL for efficient batch data analysis. Compared with the legacy DataSet API, the Table/SQL API offers a more concise, declarative programming model that is especially well suited to analysts and engineers who already know SQL.
E-commerce Data Analysis Scenario
We will implement the following analysis tasks with the Table/SQL API:
- Sales trend analysis (monthly/quarterly)
- Geographic sales distribution
- Product category performance
- User segmentation analysis
- Associated-purchase (market basket) mining
Complete Implementation
1. Environment Setup and Dependencies (pom.xml)
<dependencies>
<!-- Flink Table API/SQL -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge</artifactId>
<version>1.17.0</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.12</artifactId>
<version>1.17.0</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-csv</artifactId>
<version>1.17.0</version>
</dependency>
<!-- Needed to execute jobs from the IDE / a local JVM -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients</artifactId>
<version>1.17.0</version>
</dependency>
<!-- Other helper dependencies -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
<scope>provided</scope>
</dependency>
</dependencies>
2. Data Model Definitions
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.math.BigDecimal;
import java.time.LocalDate;
/**
 * Order entity (in a real project, each public class lives in its own file)
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Order {
private Long orderId; // order ID
private Long userId; // user ID
private Integer productId; // product ID
private String productCategory; // product category
private BigDecimal amount; // order amount
private LocalDate orderDate; // order date
private Integer quantity; // purchased quantity
private String country; // country
private String region; // region
private Integer rating; // rating (1-5)
}
/**
 * User entity
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class User {
private Long userId; // user ID
private String gender; // gender
private Integer age; // age
private String membershipLevel; // membership level
private String joinDate; // join date
}
/**
 * Product entity
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Product {
private Integer productId; // product ID
private String productName; // product name
private String category; // category
private BigDecimal price; // price
private Integer stock; // stock level
}
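Once these POJOs are registered as views (step 3 of the main program below), a quick sanity check is to print the schema Flink derives from them; a minimal sketch, assuming the orders view exists:
// Field names become column names; BigDecimal maps to DECIMAL and LocalDate to DATE
tableEnv.from("orders").printSchema();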
3. Batch Analysis Main Program
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import java.math.BigDecimal;
import java.time.LocalDate;
import java.time.Month;
import java.util.ArrayList;
import java.util.List;
import static org.apache.flink.table.api.Expressions.$;
public class EcommerceBatchAnalysisTableAPI {
public static void main(String[] args) throws Exception {
// 1. Create the execution environment (streaming API, switched to BATCH mode)
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(RuntimeExecutionMode.BATCH); // key setting: run in batch mode
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 2. Prepare mock data (in production, read from CSV/Parquet instead)
DataStream<Order> orders = createOrdersStream(env);
DataStream<User> users = createUsersStream(env);
DataStream<Product> products = createProductsStream(env);
// 3. Register the DataStreams as temporary views
tableEnv.createTemporaryView("orders", orders);
tableEnv.createTemporaryView("users", users);
tableEnv.createTemporaryView("products", products);
// ===================== Core analysis tasks =====================
// 4. Task 1: monthly sales trend analysis
System.out.println("========== Monthly Sales Trend ==========");
analyzeMonthlySales(tableEnv);
// 5. Task 2: geographic sales distribution
System.out.println("\n========== Geographic Sales Distribution ==========");
analyzeGeographicSales(tableEnv);
// 6. Task 3: product category performance
System.out.println("\n========== Product Category Performance ==========");
analyzeCategoryPerformance(tableEnv);
// 7. Task 4: user segment value analysis
System.out.println("\n========== User Segment Value Analysis ==========");
analyzeUserSegments(tableEnv);
// 8. Task 5: associated-purchase analysis
System.out.println("\n========== Associated-Purchase Analysis ==========");
analyzeProductAssociations(tableEnv);
// Note: every task above ends with execute().print(), which submits its own
// Flink job, so no final env.execute() is needed; calling it here with no
// pending DataStream operators would throw an exception.
}
// ===================== Analysis functions =====================
/**
 * Task 1: monthly sales trend analysis
 */
private static void analyzeMonthlySales(StreamTableEnvironment tableEnv) {
// Pure SQL query
String sql = "SELECT \n" +
" EXTRACT(YEAR FROM orderDate) AS year, \n" +
" EXTRACT(MONTH FROM orderDate) AS month, \n" +
" COUNT(orderId) AS total_orders,\n" +
" SUM(quantity) AS total_items,\n" +
" SUM(amount) AS total_sales,\n" +
" AVG(amount) AS avg_order_value\n" +
"FROM orders \n" +
"GROUP BY \n" +
" EXTRACT(YEAR FROM orderDate), \n" +
" EXTRACT(MONTH FROM orderDate)\n" +
"ORDER BY year DESC, month DESC";
tableEnv.sqlQuery(sql).execute().print();
}
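/**
 * Table API sketch equivalent to the monthly-sales SQL above; shown for
 * comparison and not called from main. Requires
 * import org.apache.flink.table.expressions.TimeIntervalUnit.
 */
private static void analyzeMonthlySalesTableApi(StreamTableEnvironment tableEnv) {
Table monthlySales = tableEnv.from("orders")
.addColumns(
$("orderDate").extract(TimeIntervalUnit.YEAR).as("order_year"),
$("orderDate").extract(TimeIntervalUnit.MONTH).as("order_month"))
.groupBy($("order_year"), $("order_month"))
.select(
$("order_year"),
$("order_month"),
$("orderId").count().as("total_orders"),
$("quantity").sum().as("total_items"),
$("amount").sum().as("total_sales"),
$("amount").avg().as("avg_order_value"))
.orderBy($("order_year").desc(), $("order_month").desc());
monthlySales.execute().print();
}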
/**
 * Task 2: geographic sales distribution
 */
private static void analyzeGeographicSales(StreamTableEnvironment tableEnv) {
// Table API version
Table result = tableEnv.from("orders")
.groupBy($("country"), $("region"))
.select(
$("country"),
$("region"),
$("amount").sum().as("total_sales"),
$("orderId").count().as("order_count")
)
.orderBy($("total_sales").desc());
result.execute().print();
}
/**
 * Task 3: product category performance
 */
private static void analyzeCategoryPerformance(StreamTableEnvironment tableEnv) {
// SQL with a rank computed over the grouped aggregates
String sql = "SELECT \n" +
" o.productCategory,\n" +
" COUNT(DISTINCT o.orderId) AS order_count,\n" +
" SUM(o.quantity) AS item_count,\n" +
" SUM(o.amount) AS total_sales,\n" +
" AVG(o.rating) AS avg_rating,\n" +
" RANK() OVER (ORDER BY SUM(o.amount) DESC) AS sales_rank\n" +
"FROM orders o\n" +
"GROUP BY o.productCategory\n" +
"ORDER BY total_sales DESC";
tableEnv.sqlQuery(sql).execute().print();
}
/**
 * Task 4: user segment value analysis
 */
private static void analyzeUserSegments(StreamTableEnvironment tableEnv) {
// Table API join: rename the orders-side key first so the two userId
// columns do not clash, and compute the age bucket before grouping
Table orders = tableEnv.from("orders")
.renameColumns($("userId").as("oUserId"));
Table result = tableEnv.from("users")
.join(orders, $("userId").isEqual($("oUserId")))
.addColumns($("age").dividedBy(10).times(10).as("age_group"))
.groupBy($("gender"), $("membershipLevel"), $("age_group"))
.select(
$("gender"),
$("membershipLevel"),
$("age_group"),
$("orderId").count().as("order_count"),
$("amount").sum().as("total_spending"),
$("rating").avg().as("avg_rating")
)
.orderBy($("total_spending").desc());
result.execute().print();
}
/**
 * Task 5: associated-purchase analysis
 */
private static void analyzeProductAssociations(StreamTableEnvironment tableEnv) {
// Advanced SQL (CTE + self-join + window function). This assumes orders are
// stored at line-item granularity, i.e. one orderId can span several
// rows/products; the one-product-per-order mock data above therefore yields
// no pairs — see the userId-based variant after this method.
String sql = "WITH OrderPairs AS (\n" +
" SELECT \n" +
" o1.productId AS product1,\n" +
" o2.productId AS product2,\n" +
" COUNT(*) AS pair_count\n" +
" FROM orders o1\n" +
" JOIN orders o2 ON o1.orderId = o2.orderId \n" +
" AND o1.productId < o2.productId\n" +
" GROUP BY o1.productId, o2.productId\n" +
")\n" +
"SELECT \n" +
" p1.productName AS product1_name,\n" +
" p2.productName AS product2_name,\n" +
" op.pair_count,\n" +
" PERCENT_RANK() OVER (ORDER BY op.pair_count DESC) AS percentile\n" +
"FROM OrderPairs op\n" +
"JOIN products p1 ON op.product1 = p1.productId\n" +
"JOIN products p2 ON op.product2 = p2.productId\n" +
"WHERE op.pair_count > 5\n" +
"ORDER BY op.pair_count DESC\n" +
"LIMIT 20";
tableEnv.sqlQuery(sql).execute().print();
}
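/**
 * Variant pairing products bought by the same user. With the mock data above
 * (one product per orderId), the orderId self-join finds no pairs, so this
 * user-level sketch is the one that produces output on this dataset; call it
 * from main alongside the other tasks if desired.
 */
private static void analyzeUserLevelAssociations(StreamTableEnvironment tableEnv) {
String sql = "SELECT o1.productId AS product1, o2.productId AS product2, COUNT(*) AS pair_count " +
"FROM orders o1 JOIN orders o2 " +
" ON o1.userId = o2.userId AND o1.productId < o2.productId " +
"GROUP BY o1.productId, o2.productId " +
"ORDER BY pair_count DESC LIMIT 20";
tableEnv.sqlQuery(sql).execute().print();
}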
// ===================== Mock data generation =====================
private static DataStream<Order> createOrdersStream(StreamExecutionEnvironment env) {
List<Order> orders = new ArrayList<>();
// Generate 1000 mock orders
for (int i = 1; i <= 1000; i++) {
orders.add(new Order(
(long) i,
(long) (i % 100 + 1), // 100 users
i % 50 + 1, // 50 products (IDs 1-50, matching the products table)
"CATEGORY-" + (i % 10), // 10 categories
BigDecimal.valueOf(50 + Math.random() * 450), // $50-$500
LocalDate.of(2023, Month.JANUARY, 1).plusDays(i % 365),
(int) (1 + Math.random() * 4), // 1-4 items
i % 2 == 0 ? "US" : (i % 3 == 0 ? "UK" : "CN"),
i % 2 == 0 ? "North" : (i % 3 == 0 ? "South" : "East"),
(int) (1 + Math.random() * 5) // 1-5 stars
));
}
return env.fromCollection(orders);
}
private static DataStream<User> createUsersStream(StreamExecutionEnvironment env) {
List<User> users = new ArrayList<>();
// Generate 100 mock users
for (int i = 1; i <= 100; i++) {
users.add(new User(
(long) i,
i % 2 == 0 ? "M" : "F",
18 + i % 50, // ages 18-67
i % 3 == 0 ? "Silver" : (i % 2 == 0 ? "Gold" : "Platinum"),
"202" + (i % 3) + "-" + String.format("%02d", 1 + (i % 12)) + "-" + String.format("%02d", 1 + (i % 28))
));
}
return env.fromCollection(users);
}
private static DataStream<Product> createProductsStream(StreamExecutionEnvironment env) {
List<Product> products = new ArrayList<>();
// Generate 50 mock products
for (int i = 1; i <= 50; i++) {
products.add(new Product(
i,
"Product " + i,
"CATEGORY-" + (i % 10), // 10 categories
BigDecimal.valueOf(20 + Math.random() * 480), // $20-$500
(int) (100 + Math.random() * 900) // stock 100-1000
));
}
return env.fromCollection(products);
}
}
4. Table/SQL API Core Features
A. Key Advantages
- Declarative programming: express complex logic with SQL-like syntax
- Unified API: one API for streaming and batch (batch is selected via setRuntimeMode)
- Optimizer support: execution plans are optimized automatically
- Ecosystem integration: connects easily to a wide range of sources and sinks
- Extensibility: user-defined functions (UDFs) are supported
B. Common Operations Compared (SQL vs Table API)

| Operation | SQL | Table API |
|---|---|---|
| Select columns | SELECT col1, col2 FROM t | table.select($("col1"), $("col2")) |
| Filter | WHERE col1 > 100 | table.filter($("col1").isGreater(100)) |
| Group & aggregate | SELECT col1, SUM(col2) FROM t GROUP BY col1 | table.groupBy($("col1")).select($("col1"), $("col2").sum()) |
| Sort | ORDER BY col1 DESC | table.orderBy($("col1").desc()) |
| Window functions | RANK() OVER (PARTITION BY ...) | table.window(Over.partitionBy(...).orderBy(...).as("w")) |
C. Window Function Example
// Rank products by order amount within each category. Ranking functions are
// expressed most directly in SQL; Table API Over windows cover running
// aggregates (see the sketch below)
Table rankedProducts = tableEnv.sqlQuery(
"SELECT productCategory, productId, amount, " +
"RANK() OVER (PARTITION BY productCategory ORDER BY amount DESC) AS sales_rank " +
"FROM orders");
rankedProducts.execute().print();
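By contrast, Table API Over windows target running aggregates rather than ranking. A minimal sketch over the same orders view (Over and UNBOUNDED_RANGE come from org.apache.flink.table.api; in BATCH mode the ordering column need not be a time attribute):
import org.apache.flink.table.api.Over;
import static org.apache.flink.table.api.Expressions.UNBOUNDED_RANGE;

// Running per-category total, ordered by order date
Table withCategoryTotal = tableEnv.from("orders")
.window(Over.partitionBy($("productCategory"))
.orderBy($("orderDate"))
.preceding(UNBOUNDED_RANGE)
.as("w"))
.select(
$("productCategory"),
$("orderDate"),
$("amount"),
$("amount").sum().over($("w")).as("running_category_total"));
withCategoryTotal.execute().print();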
5. Advanced Analysis Techniques
A. User-Defined Functions (UDFs)
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.functions.ScalarFunction;
import java.math.BigDecimal;
import java.math.RoundingMode;
// Register the UDF
tableEnv.createTemporarySystemFunction("CATEGORY_VALUE", new CategoryValueFunction());
// Use the UDF in SQL (aggregated, since the query groups by category)
String sql = "SELECT productCategory, AVG(CATEGORY_VALUE(amount, rating)) AS avg_category_value " +
"FROM orders GROUP BY productCategory";
// UDF implementation; the DECIMAL precision/scale of the result must be
// declared explicitly for BigDecimal return types
public static class CategoryValueFunction extends ScalarFunction {
public @DataTypeHint("DECIMAL(12, 2)") BigDecimal eval(BigDecimal amount, Integer rating) {
double value = amount.doubleValue() * (0.5 + rating / 10.0);
return BigDecimal.valueOf(value).setScale(2, RoundingMode.HALF_UP);
}
}
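The registered function can also be invoked from the Table API through call() (a short sketch; call is the static import org.apache.flink.table.api.Expressions.call):
import static org.apache.flink.table.api.Expressions.call;

// Same computation per row, without aggregation
Table categoryValue = tableEnv.from("orders")
.select(
$("productCategory"),
call("CATEGORY_VALUE", $("amount"), $("rating")).as("category_value"));
categoryValue.execute().print();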
B. Pattern Matching (MATCH_RECOGNIZE)
// Detect a per-user purchase pattern (a high-end purchase followed by one or
// more accessory purchases). Note: MATCH_RECOGNIZE requires the flink-cep
// dependency and, as of Flink 1.17, is supported in streaming mode only
String patternSql = "SELECT * FROM orders " +
"MATCH_RECOGNIZE ( " +
" PARTITION BY userId " +
" ORDER BY orderDate " +
" MEASURES " +
" FIRST(high.productId) AS high_product, " +
" COUNT(accessory.productId) AS accessory_count " +
" PATTERN (high accessory+) " +
" DEFINE " +
" high AS high.amount > 300, " +
" accessory AS accessory.amount < 100" +
")";
C. Complex Type Handling
// Extract fields from nested JSON payloads (example)
Table jsonTable = tableEnv.sqlQuery(
"SELECT " +
" JSON_VALUE(payload, '$.userId') AS userId," +
" JSON_VALUE(payload, '$.productId') AS productId " +
"FROM KafkaSource"
);
// JSON_VALUE returns STRING, so cast to proper SQL types
// (requires import org.apache.flink.table.api.DataTypes)
Table typedTable = jsonTable
.select(
$("userId").cast(DataTypes.BIGINT()).as("userId"),
$("productId").cast(DataTypes.INT()).as("productId")
);
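The query above assumes a KafkaSource table exposing the raw message as a string column named payload. A sketch of such a declaration (topic and bootstrap server are placeholders, and the Kafka connector dependency is assumed; a bounded scan range would also be needed in BATCH mode):
tableEnv.executeSql(
"CREATE TABLE KafkaSource (" +
" payload STRING" +
") WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'orders'," +
" 'properties.bootstrap.servers' = 'localhost:9092'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'raw'" +
")");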
6. Production Best Practices
A. Connecting External Data Sources
// Create a Hive catalog
String name = "myhive";
String defaultDatabase = "default";
String hiveConfDir = "/path/to/hive_conf_dir";
HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir);
tableEnv.registerCatalog("myhive", hive);
tableEnv.useCatalog("myhive");
// Query Hive tables directly
Table hiveTable = tableEnv.sqlQuery("SELECT * FROM sales_data");
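HiveCatalog additionally requires the Hive connector on the classpath; a sketch of the dependency, with illustrative version numbers that should match your Flink and Hive installations:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hive_2.12</artifactId>
<version>1.17.0</version>
</dependency>
<!-- plus the hive-exec artifact matching your Hive installation -->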
B. Using Parquet/ORC Formats
// DDL statements go through executeSql(), not sqlQuery(); the flink-parquet
// format dependency must be on the classpath
tableEnv.executeSql(
"CREATE TABLE orders_parquet (" +
" orderId BIGINT," +
" userId BIGINT," +
" productId INT," +
" ..." +
") WITH (" +
" 'connector' = 'filesystem'," +
" 'path' = 'hdfs:///data/orders'," +
" 'format' = 'parquet'" +
")");
Table sourceTable = tableEnv.sqlQuery("SELECT * FROM orders_parquet");
C. Working with Partitioned Data
// Query a partitioned table; predicates on partition columns enable partition pruning
Table partitionedTable = tableEnv.sqlQuery(
"SELECT * FROM orders_partitioned " +
"WHERE order_year = 2023 AND order_month = 7"
);
D. Performance Tuning
// Default parallelism
tableEnv.getConfig().set("parallelism.default", "4");
tableEnv.getConfig().set("table.exec.resource.default-parallelism", "4");
// Mini-batch settings optimize streaming aggregations; they have no effect
// in BATCH runtime mode and are shown here for the streaming case
tableEnv.getConfig().set("table.exec.mini-batch.enabled", "true");
tableEnv.getConfig().set("table.exec.mini-batch.size", "5000");
// Optimizer settings
tableEnv.getConfig().set("table.optimizer.join-reorder-enabled", "true");
tableEnv.getConfig().set("table.optimizer.join.broadcast-threshold", "1048576"); // 1 MB
7. Table/SQL API vs. DataSet API

| Feature | Table/SQL API | DataSet API (legacy) |
|---|---|---|
| Programming model | Declarative | Imperative |
| Unified stream/batch | Yes | No (batch only) |
| Optimizer | Yes | No |
| Ecosystem | Rich connector library | Limited |
| Learning curve | Low (SQL knowledge) | Higher (Java API) |
| UDF support | Well developed | Hand-rolled |
| Real-time capability | Streaming supported | Batch only |
| Window support | Advanced windowing | Basic windowing |
8. Visualizing Analysis Results
// Export rows as pipe-delimited text for the Python script below;
// executeAndCollect() pulls the job's results back to the client
tableEnv.toDataStream(resultTable)
.executeAndCollect()
.forEachRemaining(row -> {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < row.getArity(); i++) {
if (i > 0) sb.append(" | ");
sb.append(row.getField(i));
}
System.out.println(sb);
});
// Python side (pandas + matplotlib)
"""
import pandas as pd
import matplotlib.pyplot as plt
# Read the Flink output
data = []
with open('flink_output.txt') as f:
for line in f:
parts = line.split('|')
month = parts[0].strip()
sales = float(parts[1].strip())
data.append({'month': month, 'sales': sales})
df = pd.DataFrame(data)
# Plot the sales trend
plt.figure(figsize=(12, 6))
plt.plot(df['month'], df['sales'], marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales ($)')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('sales_trend.png')
"""
9. End-to-End Pipeline Architecture
(The architecture diagram from the original post is not reproduced here.)
Summary
This example has shown how to use the Flink Table/SQL API for efficient batch data analysis. Its main advantages:
- SQL first: data analysts can work in the SQL syntax they already know
- Optimized execution: the Flink planner optimizes query plans automatically
- Unified stream/batch: the same API handles batch and real-time data
- Advanced analytics: built-in window functions, MATCH_RECOGNIZE, and other advanced features
- Ecosystem integration: connects smoothly to Hive, HBase, JDBC, and other data sources
For periodic batch analysis workloads such as daily sales reports, user segment refreshes, and product association mining, the Flink Table/SQL API offers a more direct and typically more efficient alternative to hand-written MapReduce jobs, and a SQL-centric counterpart to Spark.