1. Environment Setup
This article uses the Kafka bundled with CDH 6.2.0; the Flink version is 1.16.3 and the Python version is 3.6.5.
# Install the Python dependency
python3 -m pip install apache-flink==1.16.3
# Configure user environment variables
# Note: the Python install path must be configured here, and its bin directory must contain an executable named `python` (not just `python3`)
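# For example, if only a `python3` binary exists there, a symlink is enough
# (the path is a placeholder): ln -s /path/python3.6.5/bin/python3 /path/python3.6.5/bin/python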
vi ~/.bash_profile
export HADOOP_CLASSPATH=`hadoop classpath`
export HADOOP_CONF_DIR=/etc/hadoop/conf
export JAVA_HOME=/path/jdk
# Flink installation path
export FLINK_HOME=/path/flink1.16.3
export HADOOP_COMMON_HOME=/opt/cloudera/parcels/CDH/lib/hadoop
export PATH=/path/python3.6.5/bin/:$FLINK_HOME/bin:$JAVA_HOME/bin:$PATH
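After sourcing the profile, a quick sanity check confirms that PyFlink is importable and can create an execution environment (a minimal sketch; it only verifies the local installation, not Kerberos or cluster connectivity):
from pyflink.datastream import StreamExecutionEnvironment

# Creating an execution environment exercises the Py4J gateway and the bundled Flink jars
env = StreamExecutionEnvironment.get_execution_environment()
print("PyFlink OK, default parallelism:", env.get_parallelism())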
2. Writing the Code
Prepare the krb5.conf and jaas.conf files (both are needed for local testing).
The contents of jaas.conf are as follows:
KafkaClient {
  com.sun.security.auth.module.Krb5LoginModule required
  useKeyTab=true
  keyTab="/path/user.keytab"
  principal="user/hostname@HADOOP.COM";
};
Client {
  com.sun.security.auth.module.Krb5LoginModule required
  useKeyTab=true
  keyTab="/path/user.keytab"
  principal="user/hostname@HADOOP.COM";
};
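krb5.conf is the standard Kerberos client configuration; on a CDH node you can usually copy /etc/krb5.conf as-is. A minimal sketch, with the realm and KDC host as placeholders:
[libdefaults]
  default_realm = HADOOP.COM
[realms]
  HADOOP.COM = {
    kdc = kdc.hadoop.com
    admin_server = kdc.hadoop.com
  }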
A simple demo in which PyFlink consumes from Kafka and writes the data into MySQL:
# -*- coding: UTF-8 -*-
import codecs
import json
import logging
import sys
from pyflink.common import Types, WatermarkStrategy, Row
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import FlinkKafkaProducer, FlinkKafkaConsumer, KafkaSource, KafkaOffsetsInitializer, KafkaSink
from pyflink.datastream.formats.json import JsonRowSerializationSchema, JsonRowDeserializationSchema
from pyflink.java_gateway import get_gateway
from pyflink.common.serialization import SimpleStringSchema
from pyflink.datastream.connectors.jdbc import JdbcSink, JdbcConnectionOptions, JdbcExecutionOptions
from pyflink.datastream.data_stream import DataStream
# Comment the following out when submitting to YARN;
# uncomment it for local testing.
# System = get_gateway().jvm.System
# System.setProperty("java.security.krb5.conf", "/path/krb5.conf")
# System.setProperty("java.security.auth.login.config", "/path/jaas.conf")
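# When submitting to YARN, the equivalent Kerberos settings normally live in
# flink-conf.yaml instead (standard Flink security options; the paths and
# principal below are placeholders):
#   security.kerberos.login.use-ticket-cache: false
#   security.kerberos.login.keytab: /path/user.keytab
#   security.kerberos.login.principal: user/hostname@HADOOP.COM
#   security.kerberos.login.contexts: Client,KafkaClient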
# Work around encoding issues when printing non-ASCII (e.g. Chinese) text
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
    env = StreamExecutionEnvironment.get_execution_environment()
    # Uncomment for local testing; these can be commented out when
    # submitting to the cluster.
    # env.add_jars("file:///path/flink-sql-connector-kafka-1.16.3.jar")
    # env.add_jars("file:///path/mysql-connector-java-8.0.22.jar")
    # env.add_jars("file:///path/flink-connector-jdbc-1.16.3.jar")
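    # On the cluster these connector jars are typically placed in $FLINK_HOME/lib
    # (or shipped with the submission) rather than added via add_jars().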
    print("add kafka source")
    # It is best to reach Kafka via hostnames rather than IPs; otherwise Kerberos
    # may report that no credential for user/ip@HADOOP.COM exists in its database.
    kafka_source = KafkaSource.builder() \
        .set_bootstrap_servers("xxxx:9092,xxxx:9092,xxxx:9092") \
        .set_topics("test1") \
        .set_group_id("test_group_1") \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .set_property("sasl.kerberos.service.name", "kafka") \
        .set_property("sasl.mechanism", "GSSAPI") \
        .set_property("security.protocol", "SASL_PLAINTEXT")