from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes, SqlDialect
from pyflink.table.udf import udf
import os
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# UDF that maps CDC operation codes to readable labels
@udf(result_type=DataTypes.STRING())
def process_cdc_op(op):
    return {
        'I': 'INSERT',
        'U': 'UPDATE',
        'D': 'DELETE'
    }.get(op, 'UNKNOWN')
def safe_execute_sql(t_env, sql, object_type="statement"):
"""安全执行SQL语句,处理对象已存在的情况"""
try:
t_env.execute_sql(sql)
logger.info(f"Successfully executed: {sql[:60]}...")
return True
except Exception as e:
if "already exists" in str(e).lower():
logger.warning(f"Object already exists, skipping creation: {e}")
return True
else:
logger.error(f"Error executing {object_type}: {sql}\n{str(e)}")
return False
def create_database_if_not_exists(t_env, catalog, db_name):
"""安全创建数据库"""
t_env.execute_sql(f"USE CATALOG {catalog}")
dbs = [row[0] for row in t_env.execute_sql("SHOW DATABASES").collect()]
if db_name not in dbs:
create_db_sql = f"CREATE DATABASE {db_name}"
if not safe_execute_sql(t_env, create_db_sql, "database"):
return False
else:
logger.info(f"Database {db_name} already exists in catalog {catalog}")
t_env.execute_sql(f"USE {db_name}")
return True
def table_exists(t_env, table_name):
"""检查表是否存在"""
try:
t_env.from_path(table_name)
return True
except:
return False
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.enable_checkpointing(5000)

    # Add the required connector JARs
    flink_home = os.getenv('FLINK_HOME', '/opt/flink')
    required_jars = [
        f"file://{flink_home}/lib/flink-connector-kafka-1.17.1.jar",
        f"file://{flink_home}/lib/flink-connector-jdbc-1.17.1.jar",
        f"file://{flink_home}/lib/flink-sql-connector-hive-3.1.2_2.12-1.16.3.jar",
        f"file://{flink_home}/lib/mysql-connector-java-8.0.28.jar",
        f"file://{flink_home}/lib/hive-exec-3.1.2.jar",
    ]
    for jar in required_jars:
        env.add_jars(jar)
        logger.info(f"Added JAR: {jar}")

    settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=settings)

    # Register the UDF
    t_env.create_temporary_function("process_cdc_op", process_cdc_op)
    # 1. Create the Hive catalog (a single-line statement avoids parsing errors)
    hive_catalog_sql = (
        "CREATE CATALOG hive_catalog WITH ("
        "'type' = 'hive', "
        "'hive-conf-dir' = '/opt/hive/conf'"
        ")"
    )
    safe_execute_sql(t_env, hive_catalog_sql, "catalog")

    # 2. Use the default catalog (no creation needed)
    t_env.use_catalog("default_catalog")

    # 3. Create the default database if it does not exist
    if not create_database_if_not_exists(t_env, "default_catalog", "default_database"):
        logger.error("Failed to create default database")
        return
    # 4. Create the Kafka source table (safely)
    kafka_table_ddl = (
        "CREATE TABLE kafka_user_meal ("
        "id STRING, "
        "review STRING, "
        "rating DOUBLE, "
        "review_time BIGINT, "
        "user_id STRING, "
        "meal_id STRING, "
        "op STRING, "
        "ts AS TO_TIMESTAMP(FROM_UNIXTIME(review_time)), "
        "WATERMARK FOR ts AS ts - INTERVAL '5' SECOND, "
        "PRIMARY KEY (id) NOT ENFORCED"
        ") WITH ("
        "'connector' = 'kafka', "
        "'topic' = 'cleaned-user-meal-reviews', "
        "'properties.bootstrap.servers' = 'master:9092,slave01:9092,slave02:9092', "
        "'properties.group.id' = 'flink-cdc-group', "
        "'scan.startup.mode' = 'latest-offset', "
        "'format' = 'json', "
        "'json.ignore-parse-errors' = 'true'"
        ")"
    )
    if not safe_execute_sql(t_env, kafka_table_ddl, "table"):
        logger.error("Failed to create Kafka table")
        return
    # 5. Create the Hive database and table
    t_env.execute_sql("USE CATALOG hive_catalog")
    if not create_database_if_not_exists(t_env, "hive_catalog", "user_meal"):
        return

    # Hive-style DDL (PARTITIONED BY ... STORED AS ORC, TBLPROPERTIES) needs the Hive SQL dialect
    t_env.get_config().set_sql_dialect(SqlDialect.HIVE)
    hive_table_ddl = (
        "CREATE TABLE hive_user_meal_cdc ("
        "id STRING COMMENT 'unique id', "
        "review STRING COMMENT 'review text', "
        "rating DOUBLE COMMENT 'rating', "
        "review_time TIMESTAMP COMMENT 'review time', "
        "user_id STRING COMMENT 'user ID', "
        "meal_id STRING COMMENT 'meal ID', "
        "operation_type STRING COMMENT 'operation type', "
        "operation_ts TIMESTAMP COMMENT 'operation timestamp'"
        ") PARTITIONED BY (op STRING COMMENT 'raw CDC operation code') "
        "STORED AS ORC "
        "TBLPROPERTIES ("
        "'sink.partition-commit.policy.kind' = 'metastore,success-file', "
        "'auto-compaction' = 'true'"
        ")"
    )
    hive_table_created = safe_execute_sql(t_env, hive_table_ddl, "table")
    # Switch back to the default dialect for the remaining Flink DDL/DML
    t_env.get_config().set_sql_dialect(SqlDialect.DEFAULT)
    if not hive_table_created:
        return
    # 6. Create the MySQL sink table (in the default catalog)
    t_env.execute_sql("USE CATALOG default_catalog")
    t_env.execute_sql("USE default_database")
    mysql_table_ddl = (
        "CREATE TABLE mysql_user_meal ("
        "id STRING PRIMARY KEY NOT ENFORCED COMMENT 'unique id', "
        "review STRING COMMENT 'review text', "
        "rating DOUBLE COMMENT 'rating', "
        "review_time TIMESTAMP(3) COMMENT 'review time', "
        "user_id STRING COMMENT 'user ID', "
        "meal_id STRING COMMENT 'meal ID', "
        "last_operation STRING COMMENT 'last operation type', "
        "update_time TIMESTAMP(3) COMMENT 'update time'"
        ") WITH ("
        "'connector' = 'jdbc', "
        "'url' = 'jdbc:mysql://mysql-host:3306/user_meal', "
        "'table-name' = 'user_meal_reviews', "
        "'username' = 'root', "
        "'password' = '5266', "
        "'driver' = 'com.mysql.cj.jdbc.Driver', "
        "'sink.buffer-flush.max-rows' = '100', "
        "'sink.buffer-flush.interval' = '5s'"
        ")"
    )
    if not safe_execute_sql(t_env, mysql_table_ddl, "table"):
        return
    # 7. Write to Hive
    hive_insert_sql = (
        "INSERT INTO hive_catalog.user_meal.hive_user_meal_cdc "
        "SELECT "
        "id, "
        "review, "
        "rating, "
        "TO_TIMESTAMP(FROM_UNIXTIME(review_time)) AS review_time, "
        "user_id, "
        "meal_id, "
        "process_cdc_op(op) AS operation_type, "
        "CURRENT_TIMESTAMP AS operation_ts, "
        "op "
        "FROM default_catalog.default_database.kafka_user_meal"
    )
    if not safe_execute_sql(t_env, hive_insert_sql, "insert"):
        return
    # 8. Write to MySQL: keep only the latest record per id (deletes excluded).
    # The row_num = 1 filter already deduplicates to one row per id, so the
    # columns can be selected directly without aggregate functions.
    mysql_insert_sql = (
        "INSERT INTO mysql_user_meal "
        "SELECT "
        "id, "
        "review, "
        "rating, "
        "ts AS review_time, "
        "user_id, "
        "meal_id, "
        "process_cdc_op(op) AS last_operation, "
        "CURRENT_TIMESTAMP AS update_time "
        "FROM ("
        "SELECT *, "
        "ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS row_num "
        "FROM default_catalog.default_database.kafka_user_meal "
        "WHERE op <> 'D'"
        ") "
        "WHERE row_num = 1"
    )
    if not safe_execute_sql(t_env, mysql_insert_sql, "insert"):
        return
    # Run the job
    logger.info("Starting pipeline execution...")
    try:
        env.execute("Flink CDC to Hive and MySQL Pipeline")
        logger.info("Pipeline execution completed successfully")
    except Exception as e:
        logger.error(f"Pipeline execution failed: {str(e)}")
if __name__ == '__main__':
    main()
The code can run, but the job gets killed.
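One thing worth noting about this setup: each t_env.execute_sql("INSERT ...") call submits its own Flink job and returns immediately, so the final env.execute(...) has no operators left to run, and the Python client exits right away; when the script is run locally on the mini cluster, the just-submitted jobs are torn down with the process. Below is a minimal sketch of an alternative for steps 7 and 8, assuming the goal is to submit both INSERTs as one job and keep the client attached until it finishes; create_statement_set, add_insert_sql and wait are standard PyFlink Table API calls, and hive_insert_sql / mysql_insert_sql are the SQL strings already defined above.

    # Sketch only: bundle both INSERT statements into a single job and block until it finishes
    statement_set = t_env.create_statement_set()
    statement_set.add_insert_sql(hive_insert_sql)
    statement_set.add_insert_sql(mysql_insert_sql)
    # execute() launches one job containing both sinks; wait() keeps the client alive until it ends
    statement_set.execute().wait()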