Flink通过JDBC写ClickHouse

Flink写ClickHouse的分布式表
实现方式:数据直接通过JDBC写每个本地表

定义 AbstractClickHouseJDBCOutputFormat

package cn.org.hooli.flink.api.java.io.jdbc;

import org.apache.flink.api.common.io.RichOutputFormat;
import org.apache.flink.configuration.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public abstract class AbstractClickHouseJDBCOutputFormat<T> extends RichOutputFormat<T> {
    private static final long serialVersionUID = 1L;

    private static final Logger LOG = LoggerFactory.getLogger(AbstractClickHouseJDBCOutputFormat.class);


    private final String username;
    private final String password;
    private final String driverName = "ru.yandex.clickhouse.ClickHouseDriver";
    protected final String[] hosts;

    protected final List<Connection> connectionList;

    public AbstractClickHouseJDBCOutputFormat(String username, String password, String[] hosts) {
        this.username = username;
        this.password = password;
        this.hosts = hosts;
        this.connectionList = new ArrayList<>();
    }

    @Override
    public void configure(Configuration parameters) {
    }

    protected void establishConnection() throws SQLException, ClassNotFoundException {
        Class.forName(driverName);

        for (String host : this.hosts) {
            // jdbc:clickhouse://10.138.41.146:8123
            String url = "jdbc:clickhouse://" + host;
            Connection connection = DriverManager.getConnection(url, this.username, this.password);
            this.connectionList.add(connection);
        }
    }

    protected void closeDbConnection() throws IOException {
        for (Connection connection : this.connectionList) {
            if (connection != null) {
                try {
                    connection.close();
                } catch (SQLException se) {
                    LOG.warn("JDBC connection could not be closed: " + se.getMessage());
                } finally {
                    connection = null;
                }
            }
        }


    }
}

定义ClickHouseJDBCOutputFormat

package cn.org.hooli.flink.api.java.io.jdbc;

import org.apache.flink.types.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.*;

/**
 * OutputFormat to write Rows into a JDBC database.
 * The OutputFormat has to be configured using the supplied OutputFormatBuilder.
 *
 * @see Row
 * @see DriverManager
 */
public class ClickHouseJDBCOutputFormat extends AbstractClickHouseJDBCOutputFormat<Row> {

    private static final long serialVersionUID = 1L;

    private static final Logger LOG = LoggerFactory.getLogger(ClickHouseJDBCOutputFormat.class);

    private final String query;

    private final List<PreparedStatement> preparedStatementList;

    private final Map<Integer, List<Row>> ipWithDataList;

    private final long insertCkTimenterval; // 4000L
    // 插入的批次
    private final int insertCkBatchSize;  // 开发测试用10条
    // 上次写入时间
    private Long lastInsertTime;

    private final String dataBaseName;
    private final String tablename;
    private final String[] tableColums;

    /**
     *
     * @param username              用户名
     * @param password              密码
     * @param hosts                 格式 {"1.1.1.1:8123", "1.1.1.2:8123", "1.1.1.3:8123"}
     * @param insertCkTimenterval   flush数据到 ClickHouse (ms)
     * @param insertCkBatchSize     达到多少条写 ClickHouse
     * @param dataBaseName          数据库名
     * @param tablename             表名 (本地表名)
     * @param tableColums           列名
     */
    public ClickHouseJDBCOutputFormat(String username, String password, String[] hosts, long insertCkTimenterval, int insertCkBatchSize, String dataBaseName, String tablename, String[] tableColums) {
        super(username, password, hosts);
        this.insertCkTimenterval = insertCkTimenterval;
        this.insertCkBatchSize = insertCkBatchSize;
        this.lastInsertTime = System.currentTimeMillis();
        this.ipWithDataList = new HashMap<>();
        this.dataBaseName = dataBaseName;
        this.tablename = tablename;
        this.tableColums = tableColums;
        this.preparedStatementList = new ArrayList<>();
        this.query = clickhouseInsertValue(this.tableColums, this.tablename, this.dataBaseName);
    }

    /**
     * Connects to the target database and initializes the prepared statement.
     *
     * @param taskNumber The number of the parallel instance.
     * @throws IOException Thrown, if the output could not be opened due to an
     *                     I/O problem.
     */
    @Override
    public void open(int taskNumber, int numTasks) throws IOException {
        try {
            establishConnection();

            for (Connection connection : connectionList) {
                PreparedStatement preparedStatement = connection.prepareStatement(query);
                this.preparedStatementList.add(preparedStatement);
            }

        } catch (SQLException sqe) {
            throw new IllegalArgumentException("open() failed.", sqe);
        } catch (ClassNotFoundException cnfe) {
            throw new IllegalArgumentException("JDBC driver class not found.", cnfe);
        }
    }

    @Override
    public void writeRecord(Row row) throws IOException {
        /**
         * 1. 将数据写入CK
         */
        final int[] size = {0};
        ipWithDataList.values().forEach(rows -> {
            size[0] += rows.size();
        });

        if (size[0] >= this.insertCkBatchSize) {
            ipWithDataList.forEach((index, rows) -> {
                try {
                    flush(rows, preparedStatementList.get(index), connectionList.get(index));
                    LOG.info("insertCkBatchSize");
                } catch (SQLException e) {
                    throw new RuntimeException("Preparation of JDBC statement failed.", e);
                }
            });

            this.lastInsertTime = System.currentTimeMillis();

        }

        /**
         * 将当前行数据添加到List中
         */
        // 轮询写入各个local表,避免单节点数据过多
        if (null != row) {
            Random random = new Random();
            int index = random.nextInt(super.hosts.length);

            List<Row> rows = ipWithDataList.get(index);
            if (rows == null) {
                rows = new ArrayList<>();
            }

            rows.add(row);
            ipWithDataList.put(index, rows);
        }
    }


    // 插入数据
    public void flush(List<Row> rows, PreparedStatement preparedStatement, Connection connection) throws SQLException {

        for (int i = 0; i < rows.size(); ++i) {
            Row row = rows.get(i);
            for (int j = 0; j < this.tableColums.length; ++j) {

                if (null != row.getField(j)) {
                    preparedStatement.setObject(j + 1, row.getField(j));
                } else {
                    preparedStatement.setObject(j + 1, "null");
                }
            }
            preparedStatement.addBatch();
        }

        preparedStatement.executeBatch();
        connection.commit();
        preparedStatement.clearBatch();

        rows.clear();
    }

    public void snapshotStateFlush() {
        if (this.isTimeToDoInsert()) {

            LOG.info("timeToDoInsert");

            flush();
        }
    }

    public void flush() {
        ipWithDataList.forEach((index, rows) -> {
            try {
                flush(rows, preparedStatementList.get(index), connectionList.get(index));
            } catch (SQLException e) {
                throw new RuntimeException("Preparation of JDBC statement failed.", e);
            }
        });
    }

    /**
     * 根据时间判断是否插入数据
     *
     * @return
     */
    private boolean isTimeToDoInsert() {
        long currTime = System.currentTimeMillis();
        return currTime - this.lastInsertTime >= this.insertCkTimenterval;
    }

    /**
     * Executes prepared statement and closes all resources of this instance.
     *
     * @throws IOException Thrown, if the input could not be closed properly.
     */
    @Override
    public void close() throws IOException {
        for (PreparedStatement preparedStatement : this.preparedStatementList) {
            if (null != preparedStatement) {
                if (preparedStatement != null) {
                    flush();
                    try {
                        preparedStatement.close();
                    } catch (SQLException e) {
                        LOG.info("JDBC statement could not be closed: " + e.getMessage());
                    } finally {
                        preparedStatement = null;
                    }
                }
            }
        }

        closeDbConnection();
    }

    private String clickhouseInsertValue(String[] tableColums, String tablename, String dataBaseName) {
        StringBuffer sbCloums = new StringBuffer();
        StringBuffer sbValues = new StringBuffer();
        for (String s : tableColums) {
            sbCloums.append(s).append(",");
            sbValues.append("?").append(",");
        }
        String colums = sbCloums.toString().substring(0, sbCloums.toString().length() - 1);
        String values = sbValues.toString().substring(0, sbValues.toString().length() - 1);

        String insertSQL = "INSERT INTO " + dataBaseName + "." + tablename + " ( " + colums + " ) VALUES ( " + values + ")";
        return insertSQL;
    }

}

定义ClickHouseJDBCSinkFunction

package cn.org.hooli.flink.api.java.io.jdbc;

import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.types.Row;

public class ClickHouseJDBCSinkFunction extends RichSinkFunction<Row> implements CheckpointedFunction {
    final ClickHouseJDBCOutputFormat outputFormat;

    public ClickHouseJDBCSinkFunction(ClickHouseJDBCOutputFormat outputFormat) {
        this.outputFormat = outputFormat;
    }

    @Override
    public void invoke(Row value, Context context) throws Exception {
        outputFormat.writeRecord(value);
    }

    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
        outputFormat.snapshotStateFlush();
    }

    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {

    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        RuntimeContext ctx = getRuntimeContext();
        outputFormat.setRuntimeContext(ctx);
        outputFormat.open(ctx.getIndexOfThisSubtask(), ctx.getNumberOfParallelSubtasks());
    }

    @Override
    public void close() throws Exception {
        outputFormat.close();
        super.close();
    }
}

使用方式

// insertCkBatchSize 的计算方式:
// 比如500条为一个批次,并行度是3,每个并行度(task)只要达到 500 / 3 就可以写出
// insertCkBatchSize = 500 / 3

logDataStream.addSink(
	new ClickHouseJDBCSinkFunction(
		new ClickHouseJDBCOutputFormat(username, password, hosts, insertCkTimenterval, insertCkBatchSize, dataBaseName, tablename, tableColums)
	)
).name("clickhouse-sink");

Flink JDBC 批量ClickHouse 可以通过 Flink JDBC OutputFormat 实现,具体步骤如下: 1. 在 Flink 任务中引入 ClickHouse JDBC 驱动程序,例如: ```java Class.forName("ru.yandex.clickhouse.ClickHouseDriver"); ``` 2. 创建 ClickHouse JDBC 连接,例如: ```java Connection connection = DriverManager.getConnection("jdbc:clickhouse://localhost:8123/default", "default", ""); ``` 3. 创建 Flink JDBC OutputFormat 对象,例如: ```java JDBCOutputFormat jdbcOutputFormat = JDBCOutputFormat.buildJDBCOutputFormat() .setDrivername("ru.yandex.clickhouse.ClickHouseDriver") .setDBUrl("jdbc:clickhouse://localhost:8123/default") .setUsername("default") .setPassword("") .setQuery("INSERT INTO table_name (column1, column2) VALUES (?, ?)") .setBatchInterval(5000) .finish(); ``` 其中,setQuery 方法指定了 SQL 语句,例如 INSERT INTO table_name (column1, column2) VALUES (?, ?),并且使用 ? 占位符来表示参数。 setBatchInterval 方法指定了批量提交的时间间隔,例如 5000 毫秒。 4. 将 Flink DataStream 转换成 JDBCOutputFormat,并使用 addSink 方法将其ClickHouse,例如: ```java DataStream<Tuple2<String, Integer>> dataStream = ...; dataStream .map(new MapFunction<Tuple2<String, Integer>, Row>() { @Override public Row map(Tuple2<String, Integer> value) throws Exception { Row row = new Row(2); row.setField(0, value.f0); row.setField(1, value.f1); return row; } }) .output(jdbcOutputFormat); ``` 其中,将 Tuple2 转换成 Row 对象,并使用 output 方法将其JDBCOutputFormat。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值