from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes
from pyflink.table.udf import udf
import os
# 定义处理CDC操作的UDF
@udf(result_type=DataTypes.STRING())
def process_cdc_op(op):
"""将CDC操作转换为可读的标签"""
return {
'I': 'INSERT',
'U': 'UPDATE',
'D': 'DELETE'
}.get(op, 'UNKNOWN')
def main():
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
env.enable_checkpointing(5000)
# 添加必要的连接器JAR包
flink_home = os.getenv('FLINK_HOME', '/opt/flink')
env.add_jars(
f"file://{flink_home}/lib/flink-connector-kafka-1.17.1.jar",
f"file://{flink_home}/lib/flink-connector-jdbc-1.17.1.jar",
f"file://{flink_home}/lib/flink-sql-connector-hive-3.1.2_2.12-1.16.3.jar",
f"file://{flink_home}/lib/mysql-connector-java-8.0.28.jar",
f"file://{flink_home}/lib/hive-exec-3.1.2.jar",
)
settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
t_env = StreamTableEnvironment.create(env, environment_settings=settings)
# 注册UDF
t_env.create_temporary_function("process_cdc_op", process_cdc_op)
# 解决方案1: 使用默认数据库
t_env.execute_sql("""
CREATE CATALOG hive_catalog WITH (
'type' = 'hive',
'hive-conf-dir' = '/opt/hive/conf'
)
""")
t_env.use_catalog("hive_catalog")
# 1. 创建Kafka源表
t_env.execute_sql("""
CREATE TABLE kafka_user_meal (
id STRING,
review STRING,
rating DOUBLE,
review_time STRING,
user_id STRING,
meal_id STRING,
op STRING,
ts AS TO_TIMESTAMP(FROM_UNIXTIME(CAST(review_time AS BIGINT))),
WATERMARK FOR ts AS ts - INTERVAL '5' SECOND,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'kafka',
'topic' = 'cleaned-user-meal-reviews',
'properties.bootstrap.servers' = 'master:9092,slave01:9092,slave02:9092',
'properties.group.id' = 'flink-cdc-group',
'scan.startup.mode' = 'latest-offset',
'format' = 'json',
'json.ignore-parse-errors' = 'true'
)
""")
# 2. 创建Hive表
t_env.execute_sql("""
CREATE TABLE IF NOT EXISTS hive_user_meal_cdc (
id STRING,
review STRING,
rating DOUBLE,
review_time STRING,
user_id STRING,
meal_id STRING,
operation_type STRING,
operation_ts TIMESTAMP(3),
op STRING
) PARTITIONED BY (op)
STORED AS ORC
TBLPROPERTIES (
'sink.partition-commit.policy.kind' = 'metastore,success-file',
'auto-compaction' = 'true'
)
""")
# 3. 创建MySQL表
t_env.execute_sql("""
CREATE TABLE mysql_user_meal (
id STRING PRIMARY KEY NOT ENFORCED,
review STRING,
rating DOUBLE,
review_time TIMESTAMP(3),
user_id STRING,
meal_id STRING,
last_operation STRING,
update_time TIMESTAMP(3)
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://mysql-host:3306/user_meal',
'table-name' = 'user_meal_reviews',
'username' = 'root',
'password' = '5266',
'driver' = 'com.mysql.cj.jdbc.Driver',
'sink.buffer-flush.max-rows' = '100',
'sink.buffer-flush.interval' = '5s'
)
""")
# 4. 写入Hive
t_env.execute_sql("""
INSERT INTO hive_user_meal_cdc
SELECT
id,
review,
rating,
review_time,
user_id,
meal_id,
process_cdc_op(op) AS operation_type,
ts AS operation_ts,
op
FROM kafka_user_meal
""")
# 5. 写入MySQL(优化查询)
latest_data = t_env.sql_query("""
SELECT
id,
LAST_VALUE(review IGNORE NULLS) AS review,
LAST_VALUE(rating IGNORE NULLS) AS rating,
MAX(ts) AS review_time,
LAST_VALUE(user_id IGNORE NULLS) AS user_id,
LAST_VALUE(meal_id IGNORE NULLS) AS meal_id,
LAST_VALUE(process_cdc_op(op) IGNORE NULLS) AS last_operation,
CURRENT_TIMESTAMP AS update_time
FROM kafka_user_meal
WHERE op <> 'D'
GROUP BY id
""")
t_env.create_temporary_view("latest_user_meal", latest_data)
t_env.execute_sql("""
INSERT INTO mysql_user_meal
SELECT * FROM latest_user_meal
""")
env.execute("Flink CDC to Hive and MySQL Pipeline")
if __name__ == '__main__':
main()
这个代码能不能走得通?已经运行了一遍,hive中已经存在表kafka_user_meal,但是无法查看数据
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:328)
at org.apache.hadoop.util.RunJar.main(RunJar.java:241)
FAILED: SemanticException Line 0:-1 Invalid column reference 'TOK_ALLCOLREF'
1962568 [199a4ed1-e39e-47c4-ade9-ff69770529cc main] ERROR org.apache.hadoop.hive.ql.Driver - FAILED: SemanticException Line 0:-1 Invalid column reference 'TOK_ALLCOLREF'
org.apache.hadoop.hive.ql.parse.SemanticException: Line 0:-1 Invalid column reference 'TOK_ALLCOLREF'
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genColListRegex(SemanticAnalyzer.java:3761)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genSelectPlan(SemanticAnalyzer.java:4526)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genSelectPlan(SemanticAnalyzer.java:4338)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPostGroupByBodyPlan(SemanticAnalyzer.java:10566)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(SemanticAnalyzer.java:10505)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:11418)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:11288)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genOPTree(SemanticAnalyzer.java:12074)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:593)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:12164)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:330)
at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:285)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:659)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1826)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1773)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1768)
at org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:126)
at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:214)
at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:239)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:188)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:402)
at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:821)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:759)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:683)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:328)
at org.apache.hadoop.util.RunJar.main(RunJar.java:241)
hive> DESCRIBE FORMATTED kafka_user_meal;
OK
# col_name data_type comment
# Detailed Table Information
Database: user_meal
OwnerType: USER
Owner: null
CreateTime: Wed Jun 18 17:53:08 CST 2025
LastAccessTime: UNKNOWN
Retention: 0
Location: hdfs://master:9000/user/hive/warehouse/user_meal.db/kafka_user_meal
Table Type: MANAGED_TABLE
Table Parameters:
flink.comment
flink.connector kafka
flink.format json
flink.json.ignore-parse-errors true
flink.properties.bootstrap.servers master:9092,slave01:9092,slave02:9092
flink.properties.group.id flink-cdc-group
flink.scan.startup.mode latest-offset
flink.schema.0.data-type VARCHAR(2147483647) NOT NULL
flink.schema.0.name id
flink.schema.1.data-type VARCHAR(2147483647)
flink.schema.1.name review
flink.schema.2.data-type DOUBLE
flink.schema.2.name rating
flink.schema.3.data-type VARCHAR(2147483647)
flink.schema.3.name review_time
flink.schema.4.data-type VARCHAR(2147483647)
flink.schema.4.name user_id
flink.schema.5.data-type VARCHAR(2147483647)
flink.schema.5.name meal_id
flink.schema.6.data-type VARCHAR(2147483647)
flink.schema.6.name op
flink.schema.7.data-type TIMESTAMP(3)
flink.schema.7.expr TO_TIMESTAMP(FROM_UNIXTIME(CAST(`review_time` AS BIGINT)))
flink.schema.7.name ts
flink.schema.primary-key.columns id
flink.schema.primary-key.name PK_3386
flink.schema.watermark.0.rowtime ts
flink.schema.watermark.0.strategy.data-type TIMESTAMP(3)
flink.schema.watermark.0.strategy.expr `ts` - INTERVAL '5' SECOND
flink.topic cleaned-user-meal-reviews
transient_lastDdlTime 1750240388
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
Time taken: 0.168 seconds, Fetched: 54 row(s)
hive> SELECT
> id,
> review,
> rating,
> review_time,
> user_id,
> meal_id,
> op,
> ts
> FROM kafka_user_meal
> LIMIT 10;
2138986 [199a4ed1-e39e-47c4-ade9-ff69770529cc main] ERROR org.apache.hadoop.hive.ql.parse.CalcitePlanner - CBO failed, skipping CBO.
org.apache.hadoop.hive.ql.parse.SemanticException: Line 2:4 Invalid table alias or column reference 'id': (possible column names are: )
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genAllExprNodeDesc(SemanticAnalyzer.java:12673)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(SemanticAnalyzer.java:12613)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.genSelectLogicalPlan(CalcitePlanner.java:4539)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.genLogicalPlan(CalcitePlanner.java:4876)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.apply(CalcitePlanner.java:1661)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.apply(CalcitePlanner.java:1609)
at org.apache.calcite.tools.Frameworks$1.apply(Frameworks.java:118)
at org.apache.calcite.prepare.CalcitePrepareImpl.perform(CalcitePrepareImpl.java:1052)
at org.apache.calcite.tools.Frameworks.withPrepare(Frameworks.java:154)
at org.apache.calcite.tools.Frameworks.withPlanner(Frameworks.java:111)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.logicalPlan(CalcitePlanner.java:1414)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.getOptimizedAST(CalcitePlanner.java:1430)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:450)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:12164)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:330)
at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:285)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:659)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1826)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1773)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1768)
at org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:126)
at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:214)
at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:239)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:188)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:402)
at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:821)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:759)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:683)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:328)
at org.apache.hadoop.util.RunJar.main(RunJar.java:241)
FAILED: SemanticException [Error 10004]: Line 2:4 Invalid table alias or column reference 'id': (possible column names are: )
2139034 [199a4ed1-e39e-47c4-ade9-ff69770529cc main] ERROR org.apache.hadoop.hive.ql.Driver - FAILED: SemanticException [Error 10004]: Line 2:4 Invalid table alias or column reference 'id': (possible column names are: )
org.apache.hadoop.hive.ql.parse.SemanticException: Line 2:4 Invalid table alias or column reference 'id': (possible column names are: )
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genAllExprNodeDesc(SemanticAnalyzer.java:12673)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(SemanticAnalyzer.java:12613)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genSelectPlan(SemanticAnalyzer.java:4565)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genSelectPlan(SemanticAnalyzer.java:4338)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPostGroupByBodyPlan(SemanticAnalyzer.java:10566)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(SemanticAnalyzer.java:10505)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:11418)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:11288)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genOPTree(SemanticAnalyzer.java:12074)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:593)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:12164)
at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:330)
at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:285)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:659)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1826)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1773)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1768)
at org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:126)
at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:214)
at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:239)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:188)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:402)
at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:821)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:759)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:683)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:328)
at org.apache.hadoop.util.RunJar.main(RunJar.java:241)
hive> FROM kafka_user_meal
是在pycharm中运行到一般后进程被杀死的结果
最新发布