Based on material from Jincheng Sun (孙金城) of Alibaba.
(1) Generate data with Flink's built-in datagen source and print it with the print sink
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import EnvironmentSettings, StreamTableEnvironment
def hello_world():
    """
    Read data from the random source, then emit it directly via the print sink.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)

    source_ddl = """
    CREATE TABLE random_source (
        f_sequence INT,
        f_random INT,
        f_random_str STRING
    ) WITH (
        'connector' = 'datagen',
        'rows-per-second' = '5',
        'fields.f_sequence.kind' = 'sequence',
        'fields.f_sequence.start' = '1',
        'fields.f_sequence.end' = '1000',
        'fields.f_random.min' = '1',
        'fields.f_random.max' = '1000',
        'fields.f_random_str.length' = '10'
    )
    """

    sink_ddl = """
    CREATE TABLE print_sink (
        f_sequence INT,
        f_random INT,
        f_random_str STRING
    ) WITH (
        'connector' = 'print'
    )
    """

    # Register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # Pull data from the source table
    tab = t_env.from_path("random_source")

    # We stick with the deprecated API here for now, because the new
    # async-submission API still needs more testing...
    tab.insert_into("print_sink")

    # Run the job
    t_env.execute("Flink Hello World")

if __name__ == '__main__':
    hello_world()
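The comment above refers to the deprecated insert_into/execute pair. For reference, the newer API is Table.execute_insert, which submits the job asynchronously; a minimal sketch, with the caveat that the blocking semantics differ across Flink versions:

    # Sketch of the non-deprecated path (same tab/print_sink as above):
    # execute_insert submits the job asynchronously and returns a TableResult,
    # so a standalone script can exit before the job has done any work.
    table_result = tab.execute_insert("print_sink")
    # In Flink 1.12+ you can block until the job finishes with:
    # table_result.wait()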
(2) From Kafka to MySQL
First create the Kafka topic (I run everything in Docker here):
kafka-topics.sh --create --bootstrap-server kafka:9092 --topic cdn-log
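To double-check that the topic exists, you can list topics with the same CLI:

kafka-topics.sh --list --bootstrap-server kafka:9092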
Then create the MySQL table:
create table cdn_log (msg VARCHAR(300));
Now the Flink code:
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import EnvironmentSettings, StreamTableEnvironment, DataTypes
def kafka_to_mysql():
    """
    Read JSON data such as {"msg": "welcome flink users..."} from a Kafka
    source and write it into MySQL.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    '''
    # Add the connector dependencies (kept commented out; adjust base_dir
    # to wherever your jars live)
    base_dir = "file:///Users/jincheng.sunjc/work/PlaygroundEnv/myJars/"
    kafka_jar = f"{base_dir}flink-sql-connector-kafka-0.11_2.11-1.11.1.jar"
    jdbc_jar = f"{base_dir}flink-connector-jdbc_2.11-1.11.1.jar"
    mysql_jar = f"{base_dir}mysql-connector-java-5.1.40.jar"
    json_format_jar = f"{base_dir}flink-json-1.11.1.jar"
    table_common_jar = f"{base_dir}flink-table-common-1.11.1.jar"
    jar_seq = [kafka_jar, jdbc_jar, mysql_jar, json_format_jar, table_common_jar]
    jars = ";".join(jar_seq)
    # pipeline.jars takes a ';'-separated list of jar URLs
    t_env.get_config().get_configuration().set_string("pipeline.jars", jars)
    '''

    source_ddl = """
    CREATE TABLE kafka_source (
        msg STRING
    ) WITH (
        'connector' = 'kafka-0.11',
        'topic' = 'cdn-log',
        'properties.bootstrap.servers' = 'kafka:9092',
        'format' = 'json',
        'scan.startup.mode' = 'latest-offset'
    )
    """

    sink_ddl = """
    CREATE TABLE mysql_sink (
        msg STRING
    ) WITH (
        'connector' = 'jdbc',
        'url' = 'jdbc:mysql://mysql:3306/data?characterEncoding=utf-8&useSSL=false',
        'table-name' = 'cdn_log',
        'username' = 'root',
        'password' = '123456',
        'sink.buffer-flush.max-rows' = '1'
    )
    """

    # Register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # Pull data from the source table
    tab = t_env.from_path("kafka_source")

    # We stick with the deprecated API here for now, because the new
    # async-submission API still needs more testing...
    tab.insert_into("mysql_sink")

    # Run the job
    t_env.execute("kafka_to_mysql")

if __name__ == '__main__':
    kafka_to_mysql()
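With the connector jars on the classpath (for example dropped into Flink's lib/ directory, or registered via pipeline.jars as sketched in the commented-out block above), run the script. Assuming you saved it as kafka_to_mysql.py (the filename is just for illustration):

python kafka_to_mysql.py
# or submit it to a cluster via Flink's CLI:
flink run -py kafka_to_mysql.py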
Now start producing to Kafka:

kafka-console-producer.sh --broker-list kafka:9092 --topic cdn-log
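The console producer reads one message per line from stdin; each line must be a JSON object matching the kafka_source schema, e.g. the payload from the docstring:

{"msg": "welcome flink users..."}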
At this point the rows start landing in MySQL; query the cdn_log table to verify:
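select * from cdn_log;

Each produced message should show up as its own row, since the sink is configured with 'sink.buffer-flush.max-rows' = '1' and flushes on every record.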