from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes
from pyflink.table.expressions import col
from pyflink.table.udf import udf
import os
# Define a UDF that converts CDC operation codes into readable labels
@udf(result_type=DataTypes.STRING())
def process_cdc_op(op):
"""将CDC操作转换为可读的标签"""
return {
'I': 'INSERT',
'U': 'UPDATE',
'D': 'DELETE'
}.get(op, 'UNKNOWN')
def main():
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
env.enable_checkpointing(5000)
    # Add the required connector JARs
flink_home = os.getenv('FLINK_HOME', '/opt/flink')
env.add_jars(
f"file://{flink_home}/lib/flink-connector-kafka-1.17.1.jar",
f"file://{flink_home}/lib/flink-connector-jdbc-1.17.1.jar",
f"file://{flink_home}/lib/flink-sql-connector-hive-3.1.2_2.12-1.16.3.jar", # 注意Scala版本
f"file://{flink_home}/lib/mysql-connector-java-8.0.28.jar",
f"file://{flink_home}/lib/hive-exec-3.1.2.jar",
)
settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
t_env = StreamTableEnvironment.create(env, environment_settings=settings)
    # Register the UDF
t_env.create_temporary_function("process_cdc_op", process_cdc_op)
    # Register the Hive catalog
t_env.execute_sql("""
CREATE CATALOG hive_catalog WITH (
'type' = 'hive',
'hive-conf-dir' = '/opt/hive/conf',
'hive.metastore.uris' = 'thrift://master:9083',
'default-database' = 'user_meal'
)
""")
t_env.use_catalog("hive_catalog")
    # 1. Create the Kafka source table
t_env.execute_sql("""
CREATE TABLE kafka_user_meal (
id STRING,
review STRING,
rating DOUBLE,
review_time STRING,
user_id STRING,
meal_id STRING,
            op STRING, -- CDC operation type (I/U/D)
ts AS TO_TIMESTAMP(FROM_UNIXTIME(CAST(review_time AS BIGINT))),
WATERMARK FOR ts AS ts - INTERVAL '5' SECOND,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'kafka',
'topic' = 'cleaned-user-meal-reviews',
'properties.bootstrap.servers' = 'master:9092,slave01:9092,slave02:9092',
'properties.group.id' = 'flink-cdc-group',
'scan.startup.mode' = 'latest-offset',
'format' = 'json',
'json.ignore-parse-errors' = 'true'
)
""")
    # 2. Create the Hive table (stores the raw CDC operations)
t_env.execute_sql("""
CREATE TABLE IF NOT EXISTS hive_user_meal_cdc (
id STRING,
review STRING,
rating DOUBLE,
review_time STRING,
user_id STRING,
meal_id STRING,
            operation_type STRING, -- converted operation type
operation_ts TIMESTAMP(3),
            op STRING -- original operation type
        ) PARTITIONED BY (op) -- partitioned by operation type
TBLPROPERTIES (
'sink.partition-commit.policy.kind' = 'metastore,success-file',
'auto-compaction' = 'true'
)
""")
    # 3. Create the MySQL table (stores the latest state)
t_env.execute_sql("""
CREATE TABLE mysql_user_meal (
id STRING PRIMARY KEY NOT ENFORCED,
review STRING,
rating DOUBLE,
review_time TIMESTAMP(3),
user_id STRING,
meal_id STRING,
last_operation STRING,
update_time TIMESTAMP(3)
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://mysql-host:3306/user_meal',
'table-name' = 'user_meal_reviews',
'username' = 'root',
'password' = '5266',
'driver' = 'com.mysql.cj.jdbc.Driver',
'sink.buffer-flush.max-rows' = '100',
'sink.buffer-flush.interval' = '5s'
)
""")
    # 4. Write to Hive (store all CDC operations)
t_env.execute_sql("""
INSERT INTO hive_user_meal_cdc
SELECT
id,
review,
rating,
review_time,
user_id,
meal_id,
            process_cdc_op(op) AS operation_type, -- convert the operation type with the UDF
ts AS operation_ts,
op
FROM kafka_user_meal
""")
    # 5. Derive the latest state per id and write it to MySQL
    # Deduplicate with ROW_NUMBER so only the most recent record per key is kept
latest_data = t_env.sql_query("""
SELECT
id,
LAST_VALUE(review) AS review,
LAST_VALUE(rating) AS rating,
MAX(ts) AS review_time,
LAST_VALUE(user_id) AS user_id,
LAST_VALUE(meal_id) AS meal_id,
LAST_VALUE(process_cdc_op(op)) AS last_operation,
CURRENT_TIMESTAMP AS update_time
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS row_num
FROM kafka_user_meal
            WHERE op <> 'D' -- exclude delete operations
)
WHERE row_num = 1
""")
    # Create a temporary view
t_env.create_temporary_view("latest_user_meal", latest_data)
    # Write to MySQL
t_env.execute_sql("""
INSERT INTO mysql_user_meal
SELECT * FROM latest_user_meal
""")
    # Execute the job
env.execute("Flink CDC to Hive and MySQL Pipeline")
if __name__ == '__main__':
main()
/home/hadoop/桌面/pyflink/bin/python3.8 /home/hadoop/PycharmProjects/SparkProject/src/flinkCDC.py
Traceback (most recent call last):
File "/home/hadoop/PycharmProjects/SparkProject/src/flinkCDC.py", line 169, in <module>
main()
File "/home/hadoop/PycharmProjects/SparkProject/src/flinkCDC.py", line 42, in main
t_env.execute_sql("""
File "/home/hadoop/桌面/pyflink/lib/python3.8/site-packages/pyflink/table/table_environment.py", line 837, in execute_sql
return TableResult(self._j_tenv.executeSql(stmt))
File "/home/hadoop/桌面/pyflink/lib/python3.8/site-packages/py4j/java_gateway.py", line 1322, in __call__
return_value = get_return_value(
File "/home/hadoop/桌面/pyflink/lib/python3.8/site-packages/pyflink/util/exceptions.py", line 146, in deco
return f(*a, **kw)
File "/home/hadoop/桌面/pyflink/lib/python3.8/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o44.executeSql.
: org.apache.flink.table.api.ValidationException: Unable to create catalog 'hive_catalog'.
Catalog options are:
'default-database'='user_meal'
'hive-conf-dir'='/opt/hive/conf'
'hive.metastore.uris'='thrift://master:9083'
'type'='hive'
at org.apache.flink.table.factories.FactoryUtil.createCatalog(FactoryUtil.java:439)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.createCatalog(TableEnvironmentImpl.java:1468)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:1214)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeSql(TableEnvironmentImpl.java:765)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.table.api.ValidationException: Unsupported options found for 'hive'.
Unsupported options:
hive.metastore.uris
Supported options:
default-database
hadoop-conf-dir
hive-conf-dir
hive-version
property-version
at org.apache.flink.table.factories.FactoryUtil.validateUnconsumedKeys(FactoryUtil.java:632)
at org.apache.flink.table.factories.FactoryUtil$FactoryHelper.validate(FactoryUtil.java:931)
at org.apache.flink.table.catalog.hive.factories.HiveCatalogFactory.createCatalog(HiveCatalogFactory.java:69)
at org.apache.flink.table.factories.FactoryUtil.createCatalog(FactoryUtil.java:436)
... 14 more
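The stack trace is raised before any table DDL runs: Flink's HiveCatalogFactory validates the CREATE CATALOG options and, as the error itself lists, only accepts default-database, hadoop-conf-dir, hive-conf-dir, hive-version and property-version. Passing hive.metastore.uris as a catalog option is therefore rejected with "Unsupported options found for 'hive'"; the metastore address has to come from the hive-site.xml located under hive-conf-dir instead. Below is a minimal sketch of a drop-in replacement for the catalog registration block in the script above, assuming that /opt/hive/conf/hive-site.xml already defines the hive.metastore.uris property as thrift://master:9083 and is readable by the Flink processes.

# Register the Hive catalog without hive.metastore.uris; the metastore URI is
# read from hive-site.xml under 'hive-conf-dir' (assumption: that file sets
# hive.metastore.uris to thrift://master:9083).
t_env.execute_sql("""
    CREATE CATALOG hive_catalog WITH (
        'type' = 'hive',
        'hive-conf-dir' = '/opt/hive/conf',
        'default-database' = 'user_meal'
    )
""")
t_env.use_catalog("hive_catalog")

If hive-site.xml does not declare the metastore URI yet, adding the hive.metastore.uris property to that file (or pointing hadoop-conf-dir at the cluster configuration) should make the catalog resolvable; the optional 'hive-version' option can also be set (e.g. '3.1.2' to match hive-exec-3.1.2.jar) if Flink fails to auto-detect it.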