Flink SQL: Queries (SELECT & WHERE)

This article introduces the basic usage of SELECT and WHERE in Flink SQL. SELECT * returns all columns, but in production it is recommended to list the required columns explicitly for better stability. The WHERE clause filters rows, and complex conditions can use both built-in and user-defined scalar functions. The examples show how to select specific columns from the Orders table and how to work with inline data through a VALUES clause.

SELECT & WHERE clause

Batch | Streaming

The general syntax of the SELECT statement is:

SELECT select_list FROM table_expression [ WHERE boolean_expression ]
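For example, a minimal query of this shape, where Orders is the table used throughout this article and the columns order_id and price are only illustrative:

SELECT order_id, price FROM Orders WHERE price > 10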

The table_expression refers to any source of data. It could be an existing table, view, or VALUES clause, the joined results of multiple existing tables, or a subquery. Assuming that the table is available in the catalog, the following would read all rows from Orders.

SELECT * FROM Orders
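As the summary notes, production queries should list the needed columns explicitly rather than relying on SELECT *. A minimal sketch, assuming Orders has columns order_id, price, and tax (these column names are illustrative, not defined by this article):

SELECT order_id, price + tax FROM Orders

Rows can also be generated inline with a VALUES clause instead of reading from a registered table; the alias t and its column names below are illustrative:

SELECT order_id, price
FROM (VALUES (1, 2.0), (2, 3.1)) AS t (order_id, price)

The WHERE clause filters rows with a boolean expression, which may combine built-in functions and user-defined scalar functions. In the sketch below, MY_FILTER is a hypothetical UDF that would have to be registered beforehand; it is not provided by Flink:

SELECT order_id, price + tax
FROM Orders
WHERE order_id = 10 OR (price < 20 AND MY_FILTER(order_id))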