Flink SQL Query Fundamentals: SELECT, WHERE, and GROUP BY Clauses in Depth

1. SELECT Query Fundamentals

1.1 What Makes Streaming SELECT Queries Different

Unlike queries against a traditional database, Flink SQL SELECT queries are designed for continuously changing data streams:
Continuous query characteristics (illustrated by the sketch below)
Results are updated continuously instead of being returned once
Incremental computation over unbounded streams is supported
The result table evolves dynamically as the input stream changes
Explicit time semantics must be defined
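A minimal sketch of this behavior (table and column names are hypothetical): each arriving row re-triggers the aggregation, so in streaming mode the sink receives a changelog of inserts and updates rather than a one-shot result set.

-- Continuous aggregation over an unbounded source (e.g. a Kafka-backed table)
SELECT city, COUNT(*) AS user_cnt
FROM users
GROUP BY city;
-- Emitted as changelog rows, for example:
--   +I (city='Beijing', user_cnt=1)
--   -U/+U (city='Beijing', user_cnt=2) when another Beijing user arrives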

1.2 Basic SELECT Syntax Structure

[WITH with_query [, ...]]
SELECT [ALL | DISTINCT] select_expr [, ...]
FROM table_expression
[[LEFT|RIGHT|FULL] JOIN table_expression ON condition]
[WHERE condition]
[GROUP BY grouping_element [, ...]]
[HAVING condition]
[ORDER BY order_element [ASC | DESC] [, ...]]
[LIMIT count]
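To make the skeleton concrete, here is a small sketch that exercises most of the clauses above (the schema is hypothetical):

WITH recent_orders AS (
    SELECT user_id, amount
    FROM orders
    WHERE order_time >= CURRENT_TIMESTAMP - INTERVAL '1' DAY
)
SELECT user_id, SUM(amount) AS daily_amount
FROM recent_orders
GROUP BY user_id
HAVING SUM(amount) > 100
ORDER BY daily_amount DESC
LIMIT 10;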

2. The SELECT Clause in Depth

2.1 Column Selection and Expression Evaluation

Basic column selection

-- Select specific columns
SELECT user_id, user_name, email FROM users;

-- Use a table alias
SELECT u.user_id, u.user_name FROM users AS u;

-- Select all columns (use sparingly in production)
SELECT * FROM user_events;

Expression evaluation

-- Arithmetic expressions
SELECT 
    price,
    quantity,
    price * quantity AS total_amount,           -- multiplication
    (price * quantity) * 0.9 AS discounted_total, -- apply a 10% discount
    ROUND(price * 1.1, 2) AS price_with_tax     -- round to 2 decimal places
FROM order_items;

-- String expressions
SELECT
    first_name,
    last_name,
    first_name || ' ' || last_name AS full_name,  -- string concatenation
    UPPER(first_name) AS upper_name,             -- upper-case conversion
    LENGTH(first_name) AS name_length,           -- string length
    SUBSTRING(email FROM 1 FOR 5) AS email_prefix -- substring
FROM customers;

-- Time expressions
SELECT
    event_time,
    event_time + INTERVAL '1' HOUR AS plus_one_hour,    -- add one hour
    EXTRACT(HOUR FROM event_time) AS event_hour,        -- extract the hour
    DATE_FORMAT(event_time, 'yyyy-MM-dd') AS event_date  -- format as a date string
FROM events;

2.2 Conditional Expressions and Function Calls

CASE conditional expressions

-- Simple CASE expression
SELECT
    user_id,
    status,
    CASE status 
        WHEN 'ACTIVE' THEN 'Active user'
        WHEN 'INACTIVE' THEN 'Inactive user' 
        WHEN 'SUSPENDED' THEN 'Suspended user'
        ELSE 'Unknown status'
    END AS status_description
FROM users;

-- Searched CASE expression
SELECT
    order_amount,
    CASE 
        WHEN order_amount >= 1000 THEN 'Large order'
        WHEN order_amount >= 500 THEN 'Medium order'
        WHEN order_amount >= 100 THEN 'Small order'
        ELSE 'Tiny order'
    END AS order_level,
    CASE 
        WHEN order_amount > 1000 AND status = 'COMPLETED' THEN 'VIP order'
        WHEN create_time > CURRENT_TIMESTAMP - INTERVAL '7' DAY THEN 'Recent order'
        ELSE 'Regular order'
    END AS order_type
FROM orders;

Built-in functions

-- Math functions (`value` is quoted because VALUE is a reserved keyword)
SELECT
    `value`,
    ABS(`value`) AS absolute_value,        -- absolute value
    CEIL(`value`) AS ceiling_value,        -- round up
    FLOOR(`value`) AS floor_value,         -- round down
    LOG(`value`) AS natural_log,           -- natural logarithm
    POWER(`value`, 2) AS squared_value     -- square
FROM metrics;

-- String functions
SELECT
    description,
    TRIM(description) AS trimmed_desc,           -- strip surrounding whitespace
    REPLACE(description, 'old', 'new') AS replaced_desc, -- replace text
    POSITION('keyword' IN description) AS keyword_pos,   -- find a substring position
    REVERSE(description) AS reversed_desc       -- reverse the string
FROM products;

3. Filtering with the WHERE Clause

3.1 Basic Conditional Filtering

Comparison operators

-- Numeric comparisons
SELECT * FROM orders WHERE amount > 1000;
SELECT * FROM users WHERE age BETWEEN 18 AND 65;
SELECT * FROM products WHERE price IN (10, 20, 30);

-- String comparisons
SELECT * FROM users WHERE name LIKE '张%';          -- users surnamed Zhang (张)
SELECT * FROM products WHERE UPPER(name) LIKE '%PHONE%';  -- case-insensitive match (Flink SQL has no ILIKE)
SELECT * FROM logs WHERE message SIMILAR TO '%(error|exception)%';

-- Time comparisons
SELECT * FROM events 
WHERE event_time >= CURRENT_TIMESTAMP - INTERVAL '1' HOUR;  -- events from the last hour

SELECT * FROM orders 
WHERE order_date BETWEEN '2023-01-01' AND '2023-12-31';    -- orders placed in 2023

Combining logical operators

-- AND/OR combinations
SELECT * FROM users 
WHERE (age >= 18 AND age <= 35)        -- young users
   AND (city = 'Beijing' OR city = 'Shanghai')   -- tier-1 cities
   AND status = 'ACTIVE';               -- active accounts

-- NOT operator
SELECT * FROM products 
WHERE NOT discontinued                 -- products still on sale
  AND price IS NOT NULL;               -- price is present

-- Complex combinations
SELECT * FROM orders 
WHERE (amount > 1000 OR item_count >= 10)  -- large or multi-item orders
  AND status IN ('COMPLETED', 'SHIPPED')    -- completed or shipped
  AND NOT (payment_method = 'COD' AND amount > 5000);  -- exclude large cash-on-delivery orders

3.2 Advanced Filtering Techniques

Handling NULL values

-- NULL tests
SELECT * FROM users WHERE phone IS NULL;           -- phone number is NULL
SELECT * FROM products WHERE description IS NOT NULL; -- description is present

-- Use COALESCE to substitute defaults for NULL
SELECT 
    user_id,
    COALESCE(nickname, real_name, 'Anonymous') AS display_name,  -- prefer the nickname
    COALESCE(balance, 0) AS effective_balance     -- map NULL to 0
FROM users;

-- Use NULLIF to avoid division by zero
SELECT 
    total_sales,
    customer_count,
    total_sales / NULLIF(customer_count, 0) AS avg_sales  -- NULL instead of a division error
FROM sales_summary;

Filtering with subqueries

-- EXISTS subquery
SELECT u.* 
FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.user_id = u.user_id 
      AND o.amount > 1000
);  -- users that have at least one large order

-- IN subquery
SELECT * FROM products
WHERE category_id IN (
    SELECT category_id FROM categories 
    WHERE parent_id = 1
);  -- products under a given parent category

-- Correlated subqueries
SELECT 
    user_id,
    (SELECT COUNT(*) FROM orders o WHERE o.user_id = u.user_id) AS order_count
FROM users u
WHERE (SELECT AVG(amount) FROM orders o WHERE o.user_id = u.user_id) > 500;
-- users whose average order amount exceeds 500

4. Grouped Aggregation with GROUP BY

4.1 Basic Grouped Aggregation

Grouping by a single column

-- User statistics per city
SELECT 
    city,
    COUNT(*) AS user_count,
    AVG(age) AS avg_age,
    MAX(age) AS max_age,
    MIN(age) AS min_age
FROM users 
WHERE status = 'ACTIVE'
GROUP BY city;

-- Order statistics per day
SELECT
    CAST(order_time AS DATE) AS order_date,  -- group by calendar date
    COUNT(DISTINCT user_id) AS daily_users,   -- daily active users
    SUM(amount) AS daily_revenue,             -- daily revenue
    AVG(amount) AS avg_order_value            -- average order value
FROM orders
GROUP BY CAST(order_time AS DATE);

Grouping by multiple columns

-- Multi-dimensional statistics
SELECT
    category_id,
    supplier_id, 
    COUNT(*) AS product_count,
    AVG(price) AS avg_price,
    SUM(stock) AS total_stock
FROM products
WHERE discontinued = FALSE
GROUP BY category_id, supplier_id;  -- group by category and supplier

-- Multi-dimensional aggregation with GROUPING SETS
SELECT
    COALESCE(city, 'All cities') AS city,
    COALESCE(age_group, 'All age groups') AS age_group,
    COUNT(*) AS user_count
FROM (
    SELECT 
        city,
        CASE 
            WHEN age < 20 THEN 'Under 20'
            WHEN age < 40 THEN '20-40' 
            ELSE 'Over 40'
        END AS age_group
    FROM users
) user_groups
GROUP BY GROUPING SETS ((city, age_group), (city), (age_group), ());
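For hierarchical subsets, Flink SQL also accepts the ROLLUP and CUBE shorthands. A sketch (assuming the derived table above is registered as a view named user_groups): ROLLUP(city, age_group) expands to GROUPING SETS ((city, age_group), (city), ()).

-- Equivalent shorthand for hierarchical grouping sets
SELECT
    COALESCE(city, 'All cities') AS city,
    COALESCE(age_group, 'All age groups') AS age_group,
    COUNT(*) AS user_count
FROM user_groups
GROUP BY ROLLUP (city, age_group);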

4.2 Streaming Aggregation Characteristics

Windowed aggregation

-- Tumbling window aggregation
SELECT
    window_start,
    window_end,
    user_id,
    COUNT(*) AS event_count,
    SUM(amount) AS total_amount
FROM TABLE(TUMBLE(TABLE user_events, DESCRIPTOR(event_time), INTERVAL '1' HOUR))
GROUP BY 
    window_start,
    window_end,
    user_id;

-- Hopping (sliding) window aggregation
SELECT
    window_start,
    window_end,
    page_id,
    COUNT(DISTINCT user_id) AS uv  -- unique visitors per 5-minute window, sliding every second
FROM TABLE(HOP(TABLE page_views, DESCRIPTOR(event_time), INTERVAL '1' SECOND, INTERVAL '5' MINUTES))
GROUP BY 
    window_start,
    window_end,
    page_id;

Session window aggregation

-- Session windows (sessionize events by per-user activity gaps)
SELECT
    window_start AS session_start,
    window_end AS session_end,
    user_id,
    COUNT(*) AS events_per_session,
    TIMESTAMPDIFF(SECOND, MIN(event_time), MAX(event_time)) AS session_duration_s
FROM TABLE(SESSION(TABLE user_events PARTITION BY user_id, DESCRIPTOR(event_time), INTERVAL '5' MINUTES))
GROUP BY 
    window_start,
    window_end,
    user_id;

5. Post-Aggregation Filtering with HAVING

5.1 HAVING vs. WHERE

-- WHERE filters before grouping; HAVING filters after grouping
SELECT 
    city,
    COUNT(*) AS user_count,
    AVG(age) AS avg_age
FROM users
WHERE register_time > '2023-01-01'  -- pre-grouping filter: users registered after 2023-01-01
GROUP BY city
HAVING COUNT(*) > 1000 AND AVG(age) < 40;  -- post-grouping filter: cities with >1000 users and average age <40

-- Wrong: aggregate functions cannot appear in WHERE
-- SELECT city, COUNT(*) FROM users WHERE COUNT(*) > 1000 GROUP BY city;  -- invalid!

-- Correct: put aggregate conditions in HAVING
SELECT city, COUNT(*) AS user_count 
FROM users 
GROUP BY city
HAVING COUNT(*) > 1000;  -- correct

5.2 Complex HAVING Conditions

-- HAVING with multiple conditions
SELECT
    product_category,
    DATE_FORMAT(order_time, 'yyyy-MM') AS order_month,
    COUNT(DISTINCT user_id) AS unique_customers,
    SUM(amount) AS total_sales,
    AVG(amount) AS avg_order_value
FROM orders
WHERE order_time >= '2023-01-01'
GROUP BY product_category, DATE_FORMAT(order_time, 'yyyy-MM')
HAVING 
    COUNT(DISTINCT user_id) >= 1000      -- at least 1000 distinct customers
    AND SUM(amount) > 50000              -- total sales above 50,000
    AND AVG(amount) BETWEEN 100 AND 500  -- average order value between 100 and 500
ORDER BY total_sales DESC;

6. Deduplication with DISTINCT

6.1 Basic Deduplication

-- Single-column deduplication
SELECT DISTINCT city FROM users;          -- all distinct cities
SELECT DISTINCT status FROM orders;       -- all distinct order statuses

-- Multi-column deduplication
SELECT DISTINCT province, city FROM users;  -- distinct province/city combinations

-- DISTINCT inside aggregate functions
SELECT 
    COUNT(DISTINCT user_id) AS unique_users,     -- number of unique users
    COUNT(DISTINCT product_id) AS unique_products -- number of unique products
FROM order_items;

6.2 Deduplication Challenges in Streaming

-- Time-based deduplication (suppress repeat events within 5 minutes per user)
SELECT 
    user_id,
    event_time
FROM (
    SELECT 
        user_id,
        event_time,
        LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_time
    FROM click_events
) 
WHERE prev_time IS NULL OR event_time > prev_time + INTERVAL '5' MINUTE;

-- Deduplicate with ROW_NUMBER() (keep the latest record)
SELECT user_id, event_type, event_time
FROM (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY user_id, event_type 
            ORDER BY event_time DESC
        ) AS rn
    FROM user_events
) 
WHERE rn = 1;  -- keep only the latest record per user and event type
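Flipping the sort direction keeps the earliest record instead; when the ordering column is a time attribute, Flink's planner recognizes both forms of this ROW_NUMBER pattern as its deduplication query. A sketch of the keep-first variant:

-- Keep the first record per user and event type
SELECT user_id, event_type, event_time
FROM (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY user_id, event_type 
            ORDER BY event_time ASC
        ) AS rn
    FROM user_events
) 
WHERE rn = 1;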

7. Sorting and Limiting with ORDER BY and LIMIT

7.1 Sorting Caveats in Streaming Mode
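
On an unbounded stream, a bare ORDER BY is only supported on an ascending time attribute; sorting on any other column must be combined with LIMIT (the Top-N pattern shown below). A minimal sketch of the time-attribute case (hypothetical table):

-- Allowed on unbounded streams: sort by the ascending event-time attribute
SELECT user_id, amount, event_time
FROM transactions
ORDER BY event_time ASC;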

-- Sorting a bounded result (combined with LIMIT)
SELECT user_id, total_amount
FROM (
    SELECT 
        user_id, 
        SUM(amount) AS total_amount
    FROM orders 
    WHERE order_time >= '2023-01-01'
    GROUP BY user_id
)
ORDER BY total_amount DESC
LIMIT 10;  -- top 10 users by 2023 spend

-- Top-N within time windows
SELECT *
FROM (
    SELECT 
        user_id,
        amount,
        event_time,
        ROW_NUMBER() OVER (
            PARTITION BY FLOOR(event_time TO HOUR)  -- one partition per hour (OVER cannot take TUMBLE directly)
            ORDER BY amount DESC
        ) AS rank_in_hour
    FROM transactions
)
WHERE rank_in_hour <= 5;  -- top 5 transactions by amount per hour

7.2 Limiting Result Sets with LIMIT

-- Basic LIMIT usage
SELECT * FROM users ORDER BY register_time DESC LIMIT 100;  -- 100 most recently registered users

-- Pagination (note: pagination has different semantics on streams)
SELECT * FROM products 
ORDER BY price DESC 
LIMIT 10 OFFSET 20;  -- products ranked 21st to 30th by price

-- Sampling
SELECT * FROM logs 
WHERE MOD(ABS(HASH(log_id)), 100) = 0  -- 1% hash sample (HASH is a placeholder for a hash UDF; not a Flink built-in)
LIMIT 1000;

8. Structuring Queries with the WITH Clause (CTEs)

8.1 Using Common Table Expressions

-- Use CTEs to simplify a complex query
WITH 
active_users AS (
    SELECT user_id, user_name, city
    FROM users 
    WHERE status = 'ACTIVE' 
      AND last_login_time > CURRENT_TIMESTAMP - INTERVAL '30' DAY
),
big_orders AS (
    SELECT user_id, SUM(amount) as total_amount
    FROM orders 
    WHERE order_time >= '2023-01-01'
    GROUP BY user_id
    HAVING SUM(amount) > 10000
)
SELECT 
    au.user_id,
    au.user_name,
    au.city,
    bo.total_amount
FROM active_users au
JOIN big_orders bo ON au.user_id = bo.user_id
ORDER BY bo.total_amount DESC
LIMIT 100;  -- on an unbounded stream, sorting on a non-time attribute must be combined with LIMIT (Top-N)

9. Streaming Query Optimization Tips

9.1 Predicate Pushdown

-- Before: the only filter runs after aggregation
SELECT category, COUNT(*) 
FROM products
GROUP BY category
HAVING COUNT(*) > 100;  -- filter applied after grouping

-- After: filter as early as possible
SELECT category, COUNT(*) 
FROM products
WHERE price > 0  -- discard invalid rows before grouping
GROUP BY category
HAVING COUNT(*) > 100;

-- Filter early via a derived table
SELECT category, COUNT(*)
FROM (
    SELECT * FROM products 
    WHERE discontinued = FALSE 
      AND stock > 0
) available_products
GROUP BY category;
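To check whether a filter was actually pushed toward the source, the optimized plan can be inspected with EXPLAIN (output details vary by Flink version and connector capabilities):

-- Pushed-down predicates appear inside the TableSourceScan node of the plan
EXPLAIN
SELECT category, COUNT(*)
FROM products
WHERE price > 0
GROUP BY category;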

9.2 State Backend Optimization

-- Set a state TTL on the aggregation so per-key state does not grow without bound
-- (the STATE_TTL query hint is available from Flink 1.18)
SELECT /*+ STATE_TTL('user_events' = '1d') */
    user_id,
    COUNT(*) AS event_count
FROM user_events
GROUP BY user_id;
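On versions without the hint, a comparable effect comes from the job-wide table.exec.state.ttl option; note that it applies to every stateful operator in the job, not just this aggregation:

-- Job-wide idle-state retention (SQL client / TableEnvironment configuration)
SET 'table.exec.state.ttl' = '1 d';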

-- Use windows to bound the state kept per key
SELECT 
    user_id,
    window_start AS window_day,
    COUNT(*) AS daily_events
FROM TABLE(TUMBLE(TABLE user_events, DESCRIPTOR(event_time), INTERVAL '1' DAY))
GROUP BY 
    user_id,
    window_start;

10. Case Study: User Behavior Analysis

10.1 A Complex Query Example

-- In-depth user behavior analysis
WITH user_behavior AS (
    SELECT 
        user_id,
        event_type,
        event_time,
        LAG(event_time) OVER (
            PARTITION BY user_id 
            ORDER BY event_time
        ) AS prev_event_time,
        LEAD(event_type) OVER (
            PARTITION BY user_id 
            ORDER BY event_time
        ) AS next_event_type
    FROM user_events
    WHERE event_time >= CURRENT_TIMESTAMP - INTERVAL '7' DAY
),
sessionized_events AS (
    SELECT 
        user_id,
        event_type,
        event_time,
        -- Session id: 30 minutes of inactivity starts a new session
        SUM(CASE 
            WHEN prev_event_time IS NULL 
              OR event_time > prev_event_time + INTERVAL '30' MINUTE 
            THEN 1 ELSE 0 
        END) OVER (
            PARTITION BY user_id 
            ORDER BY event_time
        ) AS session_id
    FROM user_behavior
)
SELECT 
    user_id,
    COUNT(*) AS total_events,
    COUNT(DISTINCT session_id) AS session_count,
    AVG(session_events) AS avg_events_per_session,
    MAX(session_duration_s) AS max_session_duration_s
FROM (
    SELECT 
        user_id,
        session_id,
        COUNT(*) AS session_events,
        TIMESTAMPDIFF(SECOND, MIN(event_time), MAX(event_time)) AS session_duration_s  -- session length in seconds
    FROM sessionized_events
    GROUP BY user_id, session_id
) session_stats
GROUP BY user_id
HAVING COUNT(DISTINCT session_id) >= 3  -- users with at least 3 sessions
ORDER BY total_events DESC;

Mastering the fundamental query clauses such as SELECT, WHERE, and GROUP BY is the key to building complex stream processing applications.
