Flink ROW_NUMBER(): Top-N Leaderboards and Deduplication

This article introduces the ROW_NUMBER() function in Flink SQL, which is mainly used to implement Top-N rankings and data deduplication. It walks through the usage with the official documentation pattern and test code, and highlights the key roles that PARTITION BY and ORDER BY play in both features.
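The core pattern from the official documentation wraps ROW_NUMBER() in a subquery and filters on the generated rank column in the outer query. A minimal sketch of both uses follows; the ShopSales and Orders tables and their columns are hypothetical, for illustration only. PARTITION BY defines the group within which rows are ranked, ORDER BY defines how they are ranked, and Flink recognizes the outer filter rownum <= N as a Top-N query and rownum = 1 ordered on a time attribute as deduplication.

-- Top-N leaderboard: top 3 products per category, ranked by sales
SELECT category, product_id, sales
FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS rownum
    FROM ShopSales
) t
WHERE rownum <= 3;

-- Deduplication: keep only the latest row per order_id, ordered by a time attribute
SELECT order_id, price, order_time
FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY order_time DESC) AS rownum
    FROM Orders
) t
WHERE rownum = 1;

Leaving the rownum column out of the outer SELECT (and out of the sink schema) is also the documented optimization for reducing the volume of update records sent downstream.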


from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes
from pyflink.table.udf import udf
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


# UDF that maps CDC operation codes to readable operation names
@udf(result_type=DataTypes.STRING())
def process_cdc_op(op):
    return {
        'I': 'INSERT',
        'U': 'UPDATE',
        'D': 'DELETE'
    }.get(op, 'UNKNOWN')


def safe_execute_sql(t_env, sql, object_type="statement"):
    """Execute a SQL statement, treating 'already exists' errors as success."""
    try:
        t_env.execute_sql(sql)
        logger.info(f"Successfully executed: {sql[:60]}...")
        return True
    except Exception as e:
        if "already exists" in str(e).lower():
            logger.warning(f"Object already exists, skipping creation: {e}")
            return True
        else:
            logger.error(f"Error executing {object_type}: {sql}\n{str(e)}")
            return False


def create_database_if_not_exists(t_env, catalog, db_name):
    """Create a database in the given catalog only if it does not exist yet."""
    t_env.execute_sql(f"USE CATALOG {catalog}")
    dbs = [row[0] for row in t_env.execute_sql("SHOW DATABASES").collect()]
    if db_name not in dbs:
        create_db_sql = f"CREATE DATABASE {db_name}"
        if not safe_execute_sql(t_env, create_db_sql, "database"):
            return False
    else:
        logger.info(f"Database {db_name} already exists in catalog {catalog}")
    t_env.execute_sql(f"USE {db_name}")
    return True


def table_exists(t_env, table_name):
    """Check whether a table exists in the current catalog/database."""
    try:
        t_env.from_path(table_name)
        return True
    except Exception:
        return False


def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.enable_checkpointing(5000)

    # Add the required connector JARs
    flink_home = os.getenv('FLINK_HOME', '/opt/flink')
    required_jars = [
        f"file://{flink_home}/lib/flink-connector-kafka-1.17.1.jar",
        f"file://{flink_home}/lib/flink-connector-jdbc-1.17.1.jar",
        f"file://{flink_home}/lib/flink-sql-connector-hive-3.1.2_2.12-1.16.3.jar",
        f"file://{flink_home}/lib/mysql-connector-java-8.0.28.jar",
        f"file://{flink_home}/lib/hive-exec-3.1.2.jar",
    ]
    for jar in required_jars:
        env.add_jars(jar)
        logger.info(f"Added JAR: {jar}")

    settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=settings)

    # Register the UDF
    t_env.create_temporary_function("process_cdc_op", process_cdc_op)

    # 1. Create the Hive catalog (single-line statement to avoid parsing issues)
    hive_catalog_sql = (
        "CREATE CATALOG hive_catalog WITH ("
        "'type' = 'hive', "
        "'hive-conf-dir' = '/opt/hive/conf'"
        ")"
    )
    safe_execute_sql(t_env, hive_catalog_sql, "catalog")

    # 2. Use the built-in default catalog (no creation needed)
    t_env.use_catalog("default_catalog")

    # 3. Create the default database if it does not exist
    if not create_database_if_not_exists(t_env, "default_catalog", "default_database"):
        logger.error("Failed to create default database")
        return

    # 4. Create the Kafka source table. The plain 'kafka' connector with the json
    #    format does not accept a PRIMARY KEY constraint, so no key is declared here;
    #    deduplication is handled later with ROW_NUMBER().
    kafka_table_ddl = (
        "CREATE TABLE kafka_user_meal ("
        "id STRING, "
        "review STRING, "
        "rating DOUBLE, "
        "review_time BIGINT, "
        "user_id STRING, "
        "meal_id STRING, "
        "op STRING, "
        "ts AS TO_TIMESTAMP(FROM_UNIXTIME(review_time)), "
        "WATERMARK FOR ts AS ts - INTERVAL '5' SECOND"
        ") WITH ("
        "'connector' = 'kafka', "
        "'topic' = 'cleaned-user-meal-reviews', "
        "'properties.bootstrap.servers' = 'master:9092,slave01:9092,slave02:9092', "
        "'properties.group.id' = 'flink-cdc-group', "
        "'scan.startup.mode' = 'latest-offset', "
        "'format' = 'json', "
        "'json.ignore-parse-errors' = 'true'"
        ")"
    )
    if not safe_execute_sql(t_env, kafka_table_ddl, "table"):
        logger.error("Failed to create Kafka table")
        return

    # 5. Create the Hive database and table. Hive-style DDL (STORED AS / TBLPROPERTIES)
    #    requires the Hive dialect, and the partition column is declared in
    #    PARTITIONED BY rather than in the column list.
    t_env.execute_sql("USE CATALOG hive_catalog")
    if not create_database_if_not_exists(t_env, "hive_catalog", "user_meal"):
        return
    t_env.get_config().set("table.sql-dialect", "hive")
    hive_table_ddl = (
        "CREATE TABLE hive_user_meal_cdc ("
        "id STRING COMMENT 'unique id', "
        "review STRING COMMENT 'review content', "
        "rating DOUBLE COMMENT 'rating', "
        "review_time TIMESTAMP COMMENT 'review time', "
        "user_id STRING COMMENT 'user id', "
        "meal_id STRING COMMENT 'meal id', "
        "operation_type STRING COMMENT 'operation type', "
        "operation_ts TIMESTAMP COMMENT 'operation timestamp'"
        ") PARTITIONED BY (op STRING COMMENT 'raw operation code') "
        "STORED AS ORC "
        "TBLPROPERTIES ("
        "'sink.partition-commit.policy.kind' = 'metastore,success-file', "
        "'auto-compaction' = 'true'"
        ")"
    )
    if not safe_execute_sql(t_env, hive_table_ddl, "table"):
        return
    t_env.get_config().set("table.sql-dialect", "default")

    # 6. Create the MySQL sink table (back in the default catalog)
    t_env.execute_sql("USE CATALOG default_catalog")
    t_env.execute_sql("USE default_database")
    mysql_table_ddl = (
        "CREATE TABLE mysql_user_meal ("
        "id STRING PRIMARY KEY NOT ENFORCED COMMENT 'unique id', "
        "review STRING COMMENT 'review content', "
        "rating DOUBLE COMMENT 'rating', "
        "review_time TIMESTAMP(3) COMMENT 'review time', "
        "user_id STRING COMMENT 'user id', "
        "meal_id STRING COMMENT 'meal id', "
        "last_operation STRING COMMENT 'last operation type', "
        "update_time TIMESTAMP(3) COMMENT 'update time'"
        ") WITH ("
        "'connector' = 'jdbc', "
        "'url' = 'jdbc:mysql://mysql-host:3306/user_meal', "
        "'table-name' = 'user_meal_reviews', "
        "'username' = 'root', "
        "'password' = '5266', "
        "'driver' = 'com.mysql.cj.jdbc.Driver', "
        "'sink.buffer-flush.max-rows' = '100', "
        "'sink.buffer-flush.interval' = '5s'"
        ")"
    )
    if not safe_execute_sql(t_env, mysql_table_ddl, "table"):
        return

    # 7. Append every CDC event to Hive, partitioned by the raw operation code
    hive_insert_sql = (
        "INSERT INTO hive_catalog.user_meal.hive_user_meal_cdc "
        "SELECT "
        "id, "
        "review, "
        "rating, "
        "TO_TIMESTAMP(FROM_UNIXTIME(review_time)) AS review_time, "
        "user_id, "
        "meal_id, "
        "process_cdc_op(op) AS operation_type, "
        "CURRENT_TIMESTAMP AS operation_ts, "
        "op "
        "FROM default_catalog.default_database.kafka_user_meal"
    )

    # 8. Upsert the latest state of every id into MySQL. ROW_NUMBER() partitioned by id
    #    and ordered by ts DESC keeps only the most recent non-delete row per id,
    #    so the columns can be selected directly.
    mysql_insert_sql = (
        "INSERT INTO mysql_user_meal "
        "SELECT "
        "id, "
        "review, "
        "rating, "
        "ts AS review_time, "
        "user_id, "
        "meal_id, "
        "process_cdc_op(op) AS last_operation, "
        "CURRENT_TIMESTAMP AS update_time "
        "FROM ("
        "SELECT *, "
        "ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS row_num "
        "FROM default_catalog.default_database.kafka_user_meal "
        "WHERE op <> 'D'"
        ") "
        "WHERE row_num = 1"
    )

    # 9. Submit both INSERTs as one job. execute_sql() on an INSERT would launch its
    #    own asynchronous job, and a later env.execute() would fail because no
    #    DataStream operators are defined, so a statement set is used instead.
    stmt_set = t_env.create_statement_set()
    stmt_set.add_insert_sql(hive_insert_sql)
    stmt_set.add_insert_sql(mysql_insert_sql)

    logger.info("Starting pipeline execution...")
    try:
        stmt_set.execute().wait()
        logger.info("Pipeline execution completed successfully")
    except Exception as e:
        logger.error(f"Pipeline execution failed: {str(e)}")


if __name__ == '__main__':
    main()

The code can run, but the job ends up getting killed.
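For the leaderboard half of the topic, the same source can feed a Top-N query directly. The sketch below is illustrative only and assumes the kafka_user_meal table defined above; it ranks meals by average rating and keeps the top 3. Because the ranking is computed over an updating aggregate, the result is itself an updating stream and must be written to a sink that accepts updates, such as a JDBC table with a primary key or upsert-kafka.

-- Hypothetical leaderboard: top 3 meals by average rating, re-ranked as reviews arrive
SELECT meal_id, avg_rating
FROM (
    SELECT *,
        ROW_NUMBER() OVER (ORDER BY avg_rating DESC) AS rownum
    FROM (
        SELECT meal_id, AVG(rating) AS avg_rating
        FROM kafka_user_meal
        WHERE op <> 'D'
        GROUP BY meal_id
    ) agg
) ranked
WHERE rownum <= 3;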