Flink清洗Kafka数据存入MySQL测试

本文介绍如何使用Apache Flink 1.6.2从Kafka读取日志数据,清洗并转换数据格式,然后将处理后的数据写入MySQL数据库。涉及Kafka消息消费、数据映射、过滤以及与MySQL的连接配置。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

版本信息:

Flink Version:1.6.2
Kafka Version:0.9.0.0
MySQL Version:5.6.21

Kafka 消息样例及格式:[IP TIME URL STATU_CODE REFERER]


1.74.103.143	2018-12-20 18:12:00	 "GET /class/130.html HTTP/1.1" 	404	https://search.yahoo.com/search?p=Flink实战

Pom.xml

<scala.version>2.11.8</scala.version>
<flink.version>1.6.2</flink.version>

 <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-java</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-java_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-clients_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <!--Flink-Kafka -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-kafka-0.9_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.39</version>
    </dependency>


sConf

package com.soul.conf;

/**
 * @author soulChun
 * @create 2018-12-20-15:11
 */
public class sConf {
    public static final String USERNAME = "root";
    public static final String PASSWORD = "root";
    public static final String DRIVERNAME = "com.mysql.jdbc.Driver";
    public static final String URL = "jdbc:mysql://localhost:3306/soul";
}

MySQLSlink

package com.soul.kafka;

import com.soul.conf.sConf;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

/**
 * @author soulChun
 * @create 2018-12-20-15:09
 */
public class MySQLSink extends RichSinkFunction<Tuple5<String, String, String, String, String>> {
    private static final long serialVersionUID = 1L;
    private Connection connection;
    private PreparedStatement preparedStatement;

    public void invoke(Tuple5<String, String, String, String, String> value) {

        try {
            if (connection == null) {
                Class.forName(sConf.DRIVERNAME);
                connection = DriverManager.getConnection(sConf.URL, sConf.USERNAME, sConf.PASSWORD);
            }
            String sql = "insert into log_info (ip,time,courseid,status_code,referer) values (?,?,?,?,?)";
            preparedStatement = connection.prepareStatement(sql);
            preparedStatement.setString(1, value.f0);
            preparedStatement.setString(2, value.f1);
            preparedStatement.setString(3, value.f2);
            preparedStatement.setString(4, value.f3);
            preparedStatement.setString(5, value.f4);
            System.out.println("Start insert");
            preparedStatement.executeUpdate();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void open(Configuration parms) throws Exception {
        Class.forName(sConf.DRIVERNAME);
        connection = DriverManager.getConnection(sConf.URL, sConf.USERNAME, sConf.PASSWORD);
    }

    public void close() throws Exception {

        if (preparedStatement != null) {
            preparedStatement.close();
        }

        if (connection != null) {
            connection.close();
        }

    }


}

数据清洗日期工具类

package com.soul.utils;

import org.apache.commons.lang3.time.FastDateFormat;

import java.util.Date;

/**
 * @author soulChun
 * @create 2018-12-19-18:44
 */
public class DateUtils {
    private static FastDateFormat SOURCE_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss");
    private static FastDateFormat TARGET_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss");

    public static Long  getTime(String  time) throws Exception{
        return SOURCE_FORMAT.parse(time).getTime();
    }

    public static String parseMinute(String time) throws  Exception{
        return TARGET_FORMAT.format(new Date(getTime(time)));
    }


    public static void main(String[] args) throws Exception{
        String time = "2018-12-19 18:55:00";

        System.out.println(parseMinute(time));
    }
}

MySQL建表

create table log_info(
ID INT NOT NULL AUTO_INCREMENT,
IP VARCHAR(50),
TIME VARCHAR(50),
CourseID VARCHAR(10),
Status_Code VARCHAR(10),
Referer VARCHAR(100),
PRIMARY KEY ( ID )
)ENGINE=InnoDB DEFAULT CHARSET=utf8;

主程序:主要是将time的格式转成yyyyMMddHHmmss,还有取URL中的课程ID,将不是/class开头的过滤掉。

package com.soul.kafka;

import com.soul.utils.DateUtils;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;

import java.util.Properties;

/**
 * @author soulChun
 * @create 2018-12-19-17:23
 */
public class FlinkCleanKafka {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(5000);
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092");//kafka的节点的IP或者hostName,多个使用逗号分隔
        properties.setProperty("zookeeper.connect", "localhost:2181");//zookeeper的节点的IP或者hostName,多个使用逗号进行分隔
        properties.setProperty("group.id", "test-consumer-group");//flink consumer flink的消费者的group.id

        FlinkKafkaConsumer09<String> myConsumer = new FlinkKafkaConsumer09<String>("imooc_topic", new SimpleStringSchema(), properties);

        DataStream<String> stream = env.addSource(myConsumer);
//        stream.print().setParallelism(2);

        DataStream CleanData = stream.map(new MapFunction<String, Tuple5<String, String, String, String, String>>() {
            @Override
            public Tuple5<String, String, String, String, String> map(String value) throws Exception {
                String[] data = value.split("\\\t");
                String CourseID = null;
                String url = data[2].split("\\ ")[2];
                if (url.startsWith("/class")) {
                    String CourseHTML = url.split("\\/")[2];
                    CourseID = CourseHTML.substring(0, CourseHTML.lastIndexOf("."));
//                    System.out.println(CourseID);
                }

                return Tuple5.of(data[0], DateUtils.parseMinute(data[1]), CourseID, data[3], data[4]);
            }
        }).filter(new FilterFunction<Tuple5<String, String, String, String, String>>() {
            @Override
            public boolean filter(Tuple5<String, String, String, String, String> value) throws Exception {
                return value.f2 != null;
            }
        });


        CleanData.addSink(new MySQLSink());

        env.execute("Flink kafka");
    }
}

启动主程序,查看表数据

mysql> select count(*) from log_info;
+----------+
| count(*) |
+----------+
|    15137 |
+----------+

Kafka过来的消息是我模拟的,一分钟产生100条。以上只是测试代码,对数据的准确性、程序性能没有做考虑,大家可以自己完善。还有WaterMark之类还在测试,测试完会做分享。

Apache Flink 1.15.4版本中使用DataStream API与数据库交互通常需要使用Flink SQL或表API。以下是使用Table API与MySQL连接并读取数据的一个简单示例,同时通过Kafka作为事件源更新内存中的数据: ```java import org.apache.flink.api.common.functions.MapFunction; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; import org.apache.flink.table.api.Table; import org.apache.flink.table.api.TableConfig; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.api.Types; import org.apache.flink.table.api.ValidationException; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; public class FlinkMySQLKafkaExample { public static void main(String[] args) throws Exception { // 初始化流处理环境 final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // 创建TableEnvironment final TableConfig tableConfig = new TableConfig(); tableConfig.set("format", "json"); // 如果MySQL数据是JSON格式 tableConfig.addConnectionProvider("jdbc", new JdbcConnectionProvider()); StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, tableConfig); // 定义表连接MySQL String mysqlUrl = "jdbc:mysql://localhost:3306/mydatabase"; String tableName = "my_table"; TableSchema schema = TableSchema.builder() .field("id", Types.BIGINT) .field("name", Types.STRING) .build(); try { Table mysqlTable = tEnv.fromDatabase("default", tableName, schema, mysqlUrl); } catch (ValidationException e) { System.err.println("Error connecting to MySQL: " + e.getMessage()); return; } // 从Kafka消费更新数据 Properties kafkaProps = new Properties(); kafkaProps.setProperty("bootstrap.servers", "localhost:9092"); kafkaProps.setProperty("group.id", "test-group"); FlinkKafkaConsumer<String> kafkaSource = new FlinkKafkaConsumer<>("my-topic", new SimpleStringMapper(), kafkaProps); DataStream<String> kafkaStream = env.addSource(kafkaSource); // 将Kafka的消息映射到表结构并更新内存 DataStream<Row> updatedTable = kafkaStream.map(new MapFunction<String, Row>() { @Override public Row map(String value) throws Exception { // 解析Kafka消息并转换成Row对象 Map<String, Object> rowMap = JSON.parseToObject(value, new TypeReference<Map<String, Object>>() {}); return Row.of(rowMap.get("id"), rowMap.get("name")); } }).toTable(tEnv, schema); // 更新MySQL表 Table updatedMySqlTable = updatedTable.update(mysqlTable); // 运行流处理任务 updatedMySqlTable.executeInsert("default").await(); // 关闭资源 env.execute("Flink MySQL Kafka Example"); } private static class SimpleStringMapper implements MapFunction<String, String> { @Override public String map(String value) { // 实现字符串解析逻辑 return value; } } } // 相关问题:
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值