(Repost) PyFlink: consuming Kafka data in real time and writing it to another Kafka topic

rm -rf job.py
cat>job.py<<EOF
import os
from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, DataTypes, EnvironmentSettings
from pyflink.table.udf import udf


provinces = ("beijing", "shanghai", "hangzhou", "shenzhen", "jiangxi", "chongqing", "xizang")


# Map the numeric provinceId (0-6) to a province name from the tuple above.
@udf(input_types=[DataTypes.INT()], result_type=DataTypes.STRING())
def province_id_to_name(id):
    return provinces[id]

# Fill in the following values according to the Kafka cluster you created.
def log_processing():
    kafka_servers = "xx.xx.xx.xx:9092,xx.xx.xx.xx:9092,xx.xx.xx.xx:9092"
    kafka_zookeeper_servers = "xx.xx.xx.xx:2181,xx.xx.xx.xx:2181,xx.xx.xx.xx:2181"
    source_topic = "payment_msg"
    sink_topic = "results"
    kafka_consumer_group_id = "test_3"

    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    source_ddl = f"""
            CREATE TABLE payment_msg(
                createTime VARCHAR,
                rt as TO_TIMESTAMP(createTime),
                orderId BIGINT,
                payAmount DOUBLE,
                payPlatform INT,
                provinceId INT,
                WATERMARK FOR rt as rt - INTERVAL '2' SECOND
            ) WITH (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
              'connector.topic' = '{source_topic}',
              'connector.properties.bootstrap.servers' = '{kafka_servers}',
              'connector.properties.zookeeper.connect' = '{kafka_zookeeper_servers}',
              'connector.properties.group.id' = '{kafka_consumer_group_id}',
              'connector.startup-mode' = 'latest-offset',
              'format.type' = 'json'
            )
            """

    # The sink table keeps the name "es_sink" from the original example; it actually writes to the Kafka results topic.
    es_sink_ddl = f"""
            CREATE TABLE es_sink (
            province VARCHAR,
            pay_amount DOUBLE,
            rowtime TIMESTAMP(3)
            ) with (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
              'connector.topic' = '{sink_topic}',
              'connector.properties.bootstrap.servers' = '{kafka_servers}',
              'connector.properties.zookeeper.connect' = '{kafka_zookeeper_servers}',
              'connector.properties.group.id' = '{kafka_consumer_group_id}',
              'connector.startup-mode' = 'latest-offset',
              'format.type' = 'json'
            )
    """

    t_env.sql_update(source_ddl)
    t_env.sql_update(es_sink_ddl)

    t_env.register_function('province_id_to_name', province_id_to_name)

    # Sum payAmount per province over 5-second tumbling event-time windows.
    query = """
    select province_id_to_name(provinceId) as province, sum(payAmount) as pay_amount, tumble_start(rt, interval '5' second) as rowtime
    from payment_msg
    group by tumble(rt, interval '5' second), provinceId
    """

    t_env.sql_query(query).insert_into("es_sink")

    t_env.execute("payment_demo")


if __name__ == '__main__':
    log_processing()
EOF
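
To sanity-check the input format, one sample record matching the payment_msg DDL above can be pushed into the source topic. This is only a sketch: it assumes Kafka's console producer is available on the host, and the field values are made up for illustration.

# Optional: send one illustrative record to the source topic (replace the broker address).
echo '{"createTime":"2021-06-01 12:00:00","orderId":1,"payAmount":99.9,"payPlatform":0,"provinceId":2}' | \
  kafka-console-producer.sh --broker-list xx.xx.xx.xx:9092 --topic payment_msg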

rm -rf lib
mkdir lib
cd lib
wget https://maven.aliyun.com/nexus/content/groups/public/org/apache/flink/flink-sql-connector-kafka_2.11/1.10.1/flink-sql-connector-kafka_2.11-1.10.1.jar
wget https://maven.aliyun.com/nexus/content/groups/public/org/apache/flink/flink-json/1.10.1/flink-json-1.10.1-sql-jar.jar
cd ../
zip -r lib.jar lib/*
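
The two jars provide the Kafka SQL connector and the JSON format used by the DDL above, and lib.jar is the archive to upload together with job.py (see the first reference below). As a rough local-test sketch instead of the hosted flow, assuming a standalone Flink 1.10.1 installation at $FLINK_HOME with PyFlink installed, the jars can be placed on Flink's classpath and the script submitted with the flink CLI:

# Local test only; the hosted flow uploads job.py and lib.jar through the console instead.
cp lib/*.jar "$FLINK_HOME/lib/"
"$FLINK_HOME/bin/flink" run -py job.py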

References:
https://help.aliyun.com/document_detail/181568.html
https://blog.youkuaiyun.com/chenshijie2011/article/details/117399883
https://blog.youkuaiyun.com/chenshijie2011/article/details/117401621
https://www.cnblogs.com/maoxiangyi/p/13509782.html
https://www.cnblogs.com/Springmoon-venn/p/13726089.html
https://www.jianshu.com/p/295066a24092
https://blog.youkuaiyun.com/m0_37592814/article/details/108044830
 

### Processing a Kafka data stream with PyFlink

To connect to and process a data stream from Kafka with PyFlink, you first need a working development environment and the necessary dependencies. With those in place, a data pipeline built in Python with PyFlink and Kafka can handle real-time analytics efficiently.

#### Creating the execution environment and table environment

Initialization creates two main components: a `StreamExecutionEnvironment` and a `StreamTableEnvironment`. The former defines the overall runtime parameters of the application; the latter lets you work with structured, table-shaped sources and sinks.

```python
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)
```

#### Configuring the Kafka source descriptor

Next, specify how to connect to a particular Kafka topic as the input source. This means setting the broker addresses, the topic name, and any other properties that affect consumer behavior.

```python
from pyflink.table.descriptors import Schema, Kafka, Json

t_env.connect(
        Kafka()
        .version("universal")                        # connector version compatibility
        .topic("your_topic_name")
        .start_from_latest()                         # or start_from_earliest() to pick the starting offset
        .property("bootstrap.servers", "localhost:9092")) \
    .with_format(Json()) \
    .with_schema(Schema()
                 .field("id", "INT")
                 .field("name", "STRING")) \
    .in_append_mode() \
    .register_table_source("source_table")
```

The snippet above shows how the JSON format parses fields from each message body and maps them to columns of a Flink table.

#### Implementing a simple word-count-style aggregation

Once the Kafka source is registered, arbitrary transformation logic can be built on top of it. Here is a minimal count example, grouping on the `name` field registered above:

```python
from pyflink.common.typeinfo import Types

# Count the occurrences of each name.
result = t_env.from_path('source_table') \
    .group_by('name') \
    .select('name, name.count as cnt')

# The grouped aggregation is continuously updated, so convert it to a retract stream and print it.
t_env.to_retract_stream(result, Types.ROW([Types.STRING(), Types.LONG()])).print()

# Submit the job for execution.
env.execute("WordCount Example")
```

This script counts how many times each value appears and prints the results. Real applications usually involve considerably more complex business logic.