1. SELECT查询基础架构
1.1 流式SELECT查询的特殊性
与传统数据库查询不同,Flink SQL的SELECT查询针对持续变化的数据流设计:
连续查询特性
查询结果持续更新,而非一次性返回
支持无界数据流的增量计算
结果表随输入流的变化而动态变化
需要明确的时间语义定义
1.2 基础SELECT语法结构
[WITH with_query [, ...]]
SELECT [ALL | DISTINCT] select_expr [, ...]
FROM table_expression
[[LEFT|RIGHT|FULL] JOIN table_expression ON condition]
[WHERE condition]
[GROUP BY grouping_element [, ...]]
[HAVING condition]
[ORDER BY order_element [ASC | DESC] [, ...]]
[LIMIT count]
2. SELECT子句深度解析
2.1 列选择与表达式计算
基础列选择
-- 选择特定列
SELECT user_id, user_name, email FROM users;
-- 使用表别名
SELECT u.user_id, u.user_name FROM users AS u;
-- 选择所有列(生产环境慎用)
SELECT * FROM user_events;
表达式计算
-- 算术表达式
SELECT
price,
quantity,
price * quantity AS total_amount, -- 乘法计算
(price * quantity) * 0.9 AS discounted_total, -- 折扣计算
ROUND(price * 1.1, 2) AS price_with_tax -- 四舍五入
FROM order_items;
-- 字符串表达式
SELECT
first_name,
last_name,
first_name || ' ' || last_name AS full_name, -- 字符串拼接
UPPER(first_name) AS upper_name, -- 大写转换
LENGTH(first_name) AS name_length, -- 长度计算
SUBSTRING(email FROM 1 FOR 5) AS email_prefix -- 子字符串
FROM customers;
-- 时间表达式
SELECT
event_time,
event_time + INTERVAL '1' HOUR AS plus_one_hour, -- 时间加法
EXTRACT(HOUR FROM event_time) AS event_hour, -- 提取小时
DATE_FORMAT(event_time, 'yyyy-MM-dd') AS event_date -- 日期格式化
FROM events;
2.2 条件表达式与函数调用
CASE条件表达式
-- 简单CASE表达式
SELECT
user_id,
status,
CASE status
WHEN 'ACTIVE' THEN '活跃用户'
WHEN 'INACTIVE' THEN '非活跃用户'
WHEN 'SUSPENDED' THEN '冻结用户'
ELSE '未知状态'
END AS status_description
FROM users;
-- 搜索式CASE表达式
SELECT
order_amount,
CASE
WHEN order_amount >= 1000 THEN '大额订单'
WHEN order_amount >= 500 THEN '中额订单'
WHEN order_amount >= 100 THEN '小额订单'
ELSE '微小订单'
END AS order_level,
CASE
WHEN order_amount > 1000 AND status = 'COMPLETED' THEN 'VIP订单'
WHEN create_time > CURRENT_TIMESTAMP - INTERVAL '7' DAY THEN '近期订单'
ELSE '普通订单'
END AS order_type
FROM orders;
内置函数使用
-- 数学函数
SELECT
value,
ABS(value) AS absolute_value, -- 绝对值
CEIL(value) AS ceiling_value, -- 向上取整
FLOOR(value) AS floor_value, -- 向下取整
LOG(value) AS natural_log, -- 自然对数
POWER(value, 2) AS squared_value -- 平方
FROM metrics;
-- 字符串函数
SELECT
description,
TRIM(description) AS trimmed_desc, -- 去除空格
REPLACE(description, 'old', 'new') AS replaced_desc, -- 替换文本
POSITION('keyword' IN description) AS keyword_pos, -- 查找位置
REVERSE(description) AS reversed_desc -- 字符串反转
FROM products;
3. WHERE子句过滤技术
3.1 基础条件过滤
比较运算符
-- 数值比较
SELECT * FROM orders WHERE amount > 1000;
SELECT * FROM users WHERE age BETWEEN 18 AND 65;
SELECT * FROM products WHERE price IN (10, 20, 30);
-- 字符串比较
SELECT * FROM users WHERE name LIKE '张%'; -- 张姓用户
SELECT * FROM products WHERE name ILIKE '%phone%'; -- 不区分大小写
SELECT * FROM logs WHERE message SIMILAR TO '%(error|exception)%';
-- 时间比较
SELECT * FROM events
WHERE event_time >= CURRENT_TIMESTAMP - INTERVAL '1' HOUR; -- 最近1小时事件
SELECT * FROM orders
WHERE order_date BETWEEN '2023-01-01' AND '2023-12-31'; -- 2023年订单
逻辑运算符组合
-- AND/OR组合条件
SELECT * FROM users
WHERE (age >= 18 AND age <= 35) -- 年轻用户
AND (city = '北京' OR city = '上海') -- 一线城市
AND status = 'ACTIVE'; -- 活跃状态
-- NOT运算符
SELECT * FROM products
WHERE NOT discontinued -- 未下架商品
AND NOT price IS NULL; -- 价格不为空
-- 复杂逻辑组合
SELECT * FROM orders
WHERE (amount > 1000 OR item_count >= 10) -- 大额或多商品订单
AND status IN ('COMPLETED', 'SHIPPED') -- 已完成或已发货
AND NOT (payment_method = 'COD' AND amount > 5000); -- 排除大额货到付款
3.2 高级过滤技巧
NULL值处理
-- NULL值判断
SELECT * FROM users WHERE phone IS NULL; -- 手机号为NULL
SELECT * FROM products WHERE description IS NOT NULL; -- 描述不为NULL
-- 使用COALESCE处理NULL
SELECT
user_id,
COALESCE(nickname, real_name, '匿名用户') AS display_name, -- 优先使用昵称
COALESCE(balance, 0) AS effective_balance -- NULL转为0
FROM users;
-- 使用NULLIF避免除零错误
SELECT
total_sales,
customer_count,
total_sales / NULLIF(customer_count, 0) AS avg_sales -- 避免除零
FROM sales_summary;
子查询过滤
-- EXISTS子查询
SELECT u.*
FROM users u
WHERE EXISTS (
SELECT 1 FROM orders o
WHERE o.user_id = u.user_id
AND o.amount > 1000
); -- 存在大额订单的用户
-- IN子查询
SELECT * FROM products
WHERE category_id IN (
SELECT category_id FROM categories
WHERE parent_id = 1
); -- 属于某个父分类的商品
-- 相关子查询
SELECT
user_id,
(SELECT COUNT(*) FROM orders o WHERE o.user_id = u.user_id) AS order_count
FROM users u
WHERE (SELECT AVG(amount) FROM orders o WHERE o.user_id = u.user_id) > 500;
-- 平均订单金额大于500的用户
4. GROUP BY分组聚合
4.1 基础分组聚合
单字段分组
-- 按城市分组统计用户
SELECT
city,
COUNT(*) AS user_count,
AVG(age) AS avg_age,
MAX(age) AS max_age,
MIN(age) AS min_age
FROM users
WHERE status = 'ACTIVE'
GROUP BY city;
-- 按日期分组统计订单
SELECT
CAST(order_time AS DATE) AS order_date, -- 按日期分组
COUNT(DISTINCT user_id) AS daily_users, -- 日活跃用户数
SUM(amount) AS daily_revenue, -- 日收入
AVG(amount) AS avg_order_value -- 平均订单价值
FROM orders
GROUP BY CAST(order_time AS DATE);
多字段分组
-- 多维度分组统计
SELECT
category_id,
supplier_id,
COUNT(*) AS product_count,
AVG(price) AS avg_price,
SUM(stock) AS total_stock
FROM products
WHERE discontinued = FALSE
GROUP BY category_id, supplier_id; -- 按分类和供应商分组
-- 使用GROUPING SETS进行多维度聚合
SELECT
COALESCE(city, '所有城市') AS city,
COALESCE(age_group, '所有年龄段') AS age_group,
COUNT(*) AS user_count
FROM (
SELECT
city,
CASE
WHEN age < 20 THEN '20岁以下'
WHEN age < 40 THEN '20-40岁'
ELSE '40岁以上'
END AS age_group
FROM users
) user_groups
GROUP BY GROUPING SETS ((city, age_group), (city), (age_group), ());
4.2 流式分组聚合特性
窗口分组聚合
-- 滚动窗口分组
SELECT
window_start,
window_end,
user_id,
COUNT(*) AS event_count,
SUM(amount) AS total_amount
FROM user_events
FROM TUMBLE(TABLE user_events, DESCRIPTOR(event_time), INTERVAL '1' HOUR)
GROUP BY
window_start,
window_start,
user_id;
-- 滑动窗口分组
SELECT
window_start,
window_end,
page_id,
COUNT(DISTINCT user_id) AS uv -- 5分钟滑动窗口的独立访客
FROM page_views
FROM HOP(TABLE page_views, DESCRIPTOR(event_time), INTERVAL '1' SECOND, INTERVAL '5' MINUTES)
GROUP BY
window_start,
window_end,
page_id;
会话窗口分组
-- 会话窗口(按用户活跃程度分组)
SELECT
window_start AS session_start,
window_end AS session_end,
user_id,
COUNT(*) AS events_per_session,
MAX(event_time) - MIN(event_time) AS session_duration
FROM user_events
FROM SESSION(TABLE user_events PARTITION BY user_id, DESCRIPTOR(event_time), INTERVAL '5' MINUTES)
GROUP BY
window_start,
window_end,
user_id;
5. HAVING子句分组后过滤
5.1 HAVING与WHERE的区别
-- WHERE在分组前过滤,HAVING在分组后过滤
SELECT
city,
COUNT(*) AS user_count,
AVG(age) AS avg_age
FROM users
WHERE register_time > '2023-01-01' -- 分组前过滤:2023年后注册的用户
GROUP BY city
HAVING COUNT(*) > 1000 AND AVG(age) < 40; -- 分组后过滤:用户数>1000且平均年龄<40的城市
-- 错误示例:不能在WHERE中使用聚合函数
-- SELECT city, COUNT(*) FROM users WHERE COUNT(*) > 1000 GROUP BY city; -- 错误!
-- 正确用法:聚合函数条件放在HAVING中
SELECT city, COUNT(*) AS user_count
FROM users
GROUP BY city
HAVING COUNT(*) > 1000; -- 正确
5.2 复杂HAVING条件
-- 多条件HAVING过滤
SELECT
product_category,
DATE_FORMAT(order_time, 'yyyy-MM') AS order_month,
COUNT(DISTINCT user_id) AS unique_customers,
SUM(amount) AS total_sales,
AVG(amount) AS avg_order_value
FROM orders
WHERE order_time >= '2023-01-01'
GROUP BY product_category, DATE_FORMAT(order_time, 'yyyy-MM')
HAVING
COUNT(DISTINCT user_id) >= 1000 -- 客户数超过1000
AND SUM(amount) > 50000 -- 销售额超过5万
AND AVG(amount) BETWEEN 100 AND 500 -- 平均订单价值在100-500之间
ORDER BY total_sales DESC;
6. DISTINCT去重处理
6.1 基础去重操作
-- 单字段去重
SELECT DISTINCT city FROM users; -- 所有不重复的城市
SELECT DISTINCT status FROM orders; -- 所有不重复的订单状态
-- 多字段组合去重
SELECT DISTINCT province, city FROM users; -- 不重复的省市区组合
-- 在聚合函数中去重
SELECT
COUNT(DISTINCT user_id) AS unique_users, -- 独立用户数
COUNT(DISTINCT product_id) AS unique_products -- 独立商品数
FROM order_items;
6.2 流式环境下的去重挑战
-- 基于时间的去重(最近N分钟内去重)
SELECT
user_id,
SESSION_START(event_time, INTERVAL '5' MINUTE) AS window_start
FROM (
SELECT
user_id,
event_time,
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_time
FROM click_events
)
WHERE prev_time IS NULL OR event_time > prev_time + INTERVAL '5' MINUTE;
-- 使用ROW_NUMBER()去重(保留最新记录)
SELECT user_id, event_type, event_time
FROM (
SELECT *,
ROW_NUMBER() OVER (
PARTITION BY user_id, event_type
ORDER BY event_time DESC
) as rn
FROM user_events
)
WHERE rn = 1; -- 每个用户每种事件类型只保留最新记录
7. ORDER BY与LIMIT排序限制
7.1 流式排序注意事项
-- 有限流的排序(配合LIMIT使用)
SELECT user_id, total_amount
FROM (
SELECT
user_id,
SUM(amount) as total_amount
FROM orders
WHERE order_time >= '2023-01-01'
GROUP BY user_id
)
ORDER BY total_amount DESC
LIMIT 10; -- 2023年消费金额TOP10用户
-- 时间窗口内的排序
SELECT *
FROM (
SELECT
user_id,
amount,
event_time,
ROW_NUMBER() OVER (
PARTITION BY TUMBLE(event_time, INTERVAL '1' HOUR)
ORDER BY amount DESC
) as rank_in_hour
FROM transactions
)
WHERE rank_in_hour <= 5; -- 每小时交易金额TOP5
7.2 LIMIT限制结果集
-- 基础LIMIT使用
SELECT * FROM users ORDER BY register_time DESC LIMIT 100; -- 最新注册的100个用户
-- 分页查询(注意:流环境下分页语义不同)
SELECT * FROM products
ORDER BY price DESC
LIMIT 10 OFFSET 20; -- 价格第21-30高的商品
-- 采样查询
SELECT * FROM logs
WHERE MOD(ABS(HASH(log_id)), 100) = 0 -- 哈希采样1%
LIMIT 1000;
8. WITH子句(CTE)查询优化
8.1 公共表表达式使用
-- 使用CTE简化复杂查询
WITH
active_users AS (
SELECT user_id, user_name, city
FROM users
WHERE status = 'ACTIVE'
AND last_login_time > CURRENT_TIMESTAMP - INTERVAL '30' DAY
),
big_orders AS (
SELECT user_id, SUM(amount) as total_amount
FROM orders
WHERE order_time >= '2023-01-01'
GROUP BY user_id
HAVING SUM(amount) > 10000
)
SELECT
au.user_id,
au.user_name,
au.city,
bo.total_amount
FROM active_users au
JOIN big_orders bo ON au.user_id = bo.user_id
ORDER BY bo.total_amount DESC;
9. 流式查询优化技巧
9.1 谓词下推优化
-- 优化前:过滤在聚合之后
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING COUNT(*) > 100; -- 过滤在分组后
-- 优化后:尽早过滤
SELECT category, COUNT(*)
FROM products
WHERE price > 0 -- 尽早过滤无效数据
GROUP BY category
HAVING COUNT(*) > 100;
-- 使用派生表提前过滤
SELECT category, COUNT(*)
FROM (
SELECT * FROM products
WHERE discontinued = FALSE
AND stock > 0
) available_products
GROUP BY category;
9.2 状态后端优化
-- 为分组键设置状态TTL
SELECT
user_id,
COUNT(*) AS event_count
FROM user_events
GROUP BY user_id
-- 设置状态保留时间,避免状态无限增长
/*+ STATE_TTL('1 days') */;
-- 使用窗口缩小状态范围
SELECT
user_id,
start_window AS window_day,
COUNT(*) AS daily_events
FROM user_events
FROM TUMBLE(TABLE user_events, DESCRIPTOR(event_time), INTERVAL '1' DAY)
GROUP BY
user_id,
start_window
10. 实战案例:用户行为分析
10.1 复杂查询示例
-- 用户行为深度分析
WITH user_behavior AS (
SELECT
user_id,
event_type,
event_time,
LAG(event_time) OVER (
PARTITION BY user_id
ORDER BY event_time
) AS prev_event_time,
LEAD(event_type) OVER (
PARTITION BY user_id
ORDER BY event_time
) AS next_event_type
FROM user_events
WHERE event_time >= CURRENT_TIMESTAMP - INTERVAL '7' DAY
),
sessionized_events AS (
SELECT
user_id,
event_type,
event_time,
-- 会话标识:30分钟无活动为新会话
SUM(CASE
WHEN prev_event_time IS NULL
OR event_time > prev_event_time + INTERVAL '30' MINUTE
THEN 1 ELSE 0
END) OVER (
PARTITION BY user_id
ORDER BY event_time
) AS session_id
FROM user_behavior
)
SELECT
user_id,
COUNT(*) AS total_events,
COUNT(DISTINCT session_id) AS session_count,
AVG(session_events) AS avg_events_per_session,
MAX(session_duration) AS max_session_duration
FROM (
SELECT
user_id,
session_id,
COUNT(*) AS session_events,
MAX(event_time) - MIN(event_time) AS session_duration
FROM sessionized_events
GROUP BY user_id, session_id
) session_stats
GROUP BY user_id
HAVING COUNT(DISTINCT session_id) >= 3 -- 至少3个会话的用户
ORDER BY total_events DESC;
掌握SELECT、WHERE、GROUP BY等基础查询子句是构建复杂流处理应用的关键。
926

被折叠的 条评论
为什么被折叠?



