Using Flume to capture newly inserted rows from multiple database tables in real time

There are very few write-ups online about syncing newly inserted SQL Server data. I started from a project by an author on GitHub (link here) and modified it so that it can monitor inserts on multiple tables at the same time, and so that each record carries its table name and column names.

Without further ado, here is the code.

package org.keedio.flume.source;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.boot.registry.StandardServiceRegistryBuilder;
import org.hibernate.cfg.Configuration;
import org.hibernate.service.ServiceRegistry;
import org.hibernate.transform.Transformers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Helper class to manage hibernate sessions and perform queries
 *
 * @author <a href="mailto:mvalle@keedio.com">Marcelo Valle</a>
 * @author tianqin
 */
public class HibernateHelper {

    private static final Logger LOG = LoggerFactory
            .getLogger(HibernateHelper.class);

    private static SessionFactory factory;
    private Session session;
    private Configuration config;
    private SQLSourceHelper sqlSourceHelper;

    /**
     * Constructor to initialize hibernate configuration parameters
     *
     * @param sqlSourceHelper Contains the configuration parameters from flume config file
     */
    public HibernateHelper(SQLSourceHelper sqlSourceHelper) {

        this.sqlSourceHelper = sqlSourceHelper;

        config = new Configuration()
                .setProperty("hibernate.connection.url", sqlSourceHelper.getConnectionURL())
                .setProperty("hibernate.connection.username", sqlSourceHelper.getUser())
                .setProperty("hibernate.connection.password", sqlSourceHelper.getPassword());

        if (sqlSourceHelper.getHibernateDialect() != null)
            config.setProperty("hibernate.dialect", sqlSourceHelper.getHibernateDialect());
        if (sqlSourceHelper.getHibernateDriver() != null)
            config.setProperty("hibernate.connection.driver_class", sqlSourceHelper.getHibernateDriver());
    }

    /**
     * Connect to database using hibernate
     */
    public void establishSession() {

        LOG.info("Opening hibernate session");

        ServiceRegistry serviceRegistry = new StandardServiceRegistryBuilder()
                .applySettings(config.getProperties()).build();
        factory = config.buildSessionFactory(serviceRegistry);
        session = factory.openSession();
    }

    /**
     * Close database connection
     */
    public void closeSession() {

        LOG.info("Closing hibernate session");

        session.close();
    }

    /**
     * Execute the selection query in the database
     *
     * @return This part was changed substantially: the original author's query result did not
     * include column names, which is unfriendly to business users. It now returns a map whose
     * key is the table name and whose value is the list of rows for that table, with column names included.
     */
    @SuppressWarnings("unchecked")
    public Map<String, List<List<Object>>> executeQuery(String[] tables) {

        // table name -> rows map returned to the caller
        Map<String, List<List<Object>>> tableRowsList = new HashMap<>();
        // query string for each table
        Map<String, String> querys = sqlSourceHelper.getQuery();
        // current index (offset) for each table
        Map<String, Integer> currentIndexMap = sqlSourceHelper.getCurrentIndex();


        for (String table : tables) {
            List<List<Object>> tableRows = new ArrayList<>();

            /*
             * The original author used .setResultTransformer(Transformers.TO_LIST()).list(),
             * which drops the column names. To keep the column names the result has to be
             * fetched as List<Map<String, Object>>.
             */
            List<Map<String, Object>> rowsList = session
                    .createSQLQuery(querys.get(table))
                    .setFirstResult(currentIndexMap.get(table))
                    .setMaxResults(sqlSourceHelper.getMaxRows())
                    .setResultTransformer(Transformers.ALIAS_TO_ENTITY_MAP).list();

            // Convert List<Map<String, Object>> into List<List<Object>>.
            // A fresh list is created for every row; reusing a single list for all rows was a bug.
            for (Map<String, Object> rows : rowsList) {
                List<Object> listObject = new ArrayList<>();
                for (String column : rows.keySet()) {
                    listObject.add("table:" + table + "," + column + ":" + String.valueOf(rows.get(column)));
                }
                tableRows.add(listObject);
            }

            // Append the table name to every row
            for (int i = 0; i < tableRows.size(); i++) {
                tableRows.get(i).add("table:" + table);
            }
            sqlSourceHelper.setCurrentIndex(table, currentIndexMap.get(table) + rowsList.size());
            tableRowsList.put(table, tableRows);
        }


        return tableRowsList;
    }
}
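
To make the emitted format concrete, here is a minimal standalone sketch (not part of the files above) that applies the same flattening step to one hand-built result row; the table name, column names, and values are made up for illustration.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class FlattenDemo {
    public static void main(String[] args) {
        // One row, shaped like what Hibernate's ALIAS_TO_ENTITY_MAP transformer returns
        Map<String, Object> row = new LinkedHashMap<>();
        row.put("id", 1);                    // hypothetical column
        row.put("usb_name", "kingston");     // hypothetical column

        String table = "test_usb";           // hypothetical table name
        List<Object> listObject = new ArrayList<>();
        for (String column : row.keySet()) {
            listObject.add("table:" + table + "," + column + ":" + row.get(column));
        }
        listObject.add("table:" + table);    // trailing table-name marker, as in executeQuery

        // Prints: [table:test_usb,id:1, table:test_usb,usb_name:kingston, table:test_usb]
        System.out.println(listObject);
    }
}
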
/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package org.keedio.flume.source;

import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;
import org.keedio.flume.metrics.SqlSourceCounter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import au.com.bytecode.opencsv.CSVWriter;


/**
 * A Source to read data from a SQL database. This source asks the database for new data at each configured interval.<p>
 *
 * @author <a href="mailto:mvalle@keedio.com">Marcelo Valle</a>
 * @author tianqin
 */
public class SQLSource extends AbstractSource implements Configurable, PollableSource {

    private static final Logger LOG = LoggerFactory.getLogger(SQLSource.class);
    protected SQLSourceHelper sqlSourceHelper;
    private SqlSourceCounter sqlSourceCounter;
    private CSVWriter csvWriter;
    private HibernateHelper hibernateHelper;

    private String[] tables;
    private Map<String, Integer> currentIndexMap;
    private Map<String, File> files;

    /**
     * Configure the source, load configuration properties and establish connection with database
     */
    @Override
    public void configure(Context context) {

        LOG.info("Reading and processing configuration values for source " + getName());

        /* Initialize configuration parameters */
        sqlSourceHelper = new SQLSourceHelper(context);

        tables = sqlSourceHelper.getTables();
        currentIndexMap = sqlSourceHelper.getCurrentIndex();
        files = sqlSourceHelper.getFiles();

        /* Initialize metric counters */
        sqlSourceCounter = new SqlSourceCounter("SOURCESQL." + this.getName());

        /* Establish connection with database */
        hibernateHelper = new HibernateHelper(sqlSourceHelper);
        hibernateHelper.establishSession();

        /* Instantiate the CSV Writer */
        csvWriter = new CSVWriter(new ChannelWriter());

    }

    /**
     * Process a batch of events performing SQL Queries
     */
    @Override
    public Status process() throws EventDeliveryException {

        try {
            sqlSourceCounter.startProcess();

            Map<String, List<List<Object>>> results = hibernateHelper.executeQuery(tables);
            int size = 0;
            if (!results.isEmpty()) {

                for (String table : tables) {
                    List<List<Object>> result = results.get(table);
                    csvWriter.writeAll(sqlSourceHelper.getAllRows(result));
                    sqlSourceHelper.updateStatusFile(files.get(table), table, currentIndexMap.get(table));
                    size += result.size();
                }
                csvWriter.flush();
                sqlSourceCounter.incrementEventCount(size);
            }

            sqlSourceCounter.endProcess(size);

            if (size < sqlSourceHelper.getMaxRows()) {
                Thread.sleep(sqlSourceHelper.getRunQueryDelay());
            }

            return Status.READY;

        } catch (IOException | InterruptedException e) {
            LOG.error("Error processing row", e);
            return Status.BACKOFF;
        }
    }

    /**
     * Starts the source. Starts the metrics counter.
     */
    @Override
    public void start() {

//        LOG.info("Starting sql source {} ...", getName());
//        sqlSourceCounter.start();
        super.start();
    }

    /**
     * Stop the source. Close database connection and stop metrics counter.
     */
    @Override
    public void stop() {

        LOG.info("Stopping sql source {} ...", getName());

        try {
            hibernateHelper.closeSession();
            csvWriter.close();
        } catch (IOException e) {
            LOG.warn("Error closing CSVWriter object", e);
        } finally {
//            this.sqlSourceCounter.stop();
            super.stop();
        }
    }

    private class ChannelWriter extends Writer {
        private List<Event> events = new ArrayList<>();

        @Override
        public void write(char[] cbuf, int off, int len) throws IOException {
            Event event = new SimpleEvent();

            String s = new String(cbuf);
            event.setBody(s.substring(off, len - 1).getBytes());

            Map<String, String> headers;
            headers = new HashMap<String, String>();
            headers.put("timestamp", String.valueOf(System.currentTimeMillis()));
            event.setHeaders(headers);

            events.add(event);

            if (events.size() >= sqlSourceHelper.getBatchSize())
                flush();
        }

        @Override
        public void flush() {
            getChannelProcessor().processEventBatch(events);
            events.clear();
        }

        @Override
        public void close() throws IOException {
            flush();
        }
    }
}
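
For reference, the standalone sketch below shows roughly what one CSV-encoded event body looks like once such a flattened row passes through opencsv's CSVWriter (the same library used above); the row values are the same made-up ones as in the previous sketch.

import java.io.IOException;
import java.io.StringWriter;

import au.com.bytecode.opencsv.CSVWriter;

public class CsvBodyDemo {
    public static void main(String[] args) throws IOException {
        String[] row = {
                "table:test_usb,id:1",                // hypothetical values
                "table:test_usb,usb_name:kingston",
                "table:test_usb"
        };

        StringWriter out = new StringWriter();
        CSVWriter writer = new CSVWriter(out);
        writer.writeNext(row);   // by default every field is quoted and fields are comma-separated
        writer.close();

        // Prints: "table:test_usb,id:1","table:test_usb,usb_name:kingston","table:test_usb"
        System.out.println(out.toString());
    }
}
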
package org.keedio.flume.source;


import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.flume.conf.ConfigurationException;
import org.apache.flume.Context;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Helper to manage configuration parameters and utility methods <p>
 * <p>
 * Configuration parameters read from the flume configuration file:
 * <tt>type: </tt> org.keedio.flume.source.SQLSource <p>
 * <tt>connection.url: </tt> database connection URL <p>
 * <tt>user: </tt> user to connect to database <p>
 * <tt>password: </tt> user password <p>
 * <tt>table: </tt> tables to read from, comma-separated <p>
 * <tt>columns.to.select: </tt> columns to select for import data (* will import all) <p>
 * <tt>incremental.value: </tt> Start value to import data <p>
 * <tt>run.query.delay: </tt> delay time to execute each query to database <p>
 * <tt>status.file.path: </tt> Directory to save status file <p>
 * <tt>status.file.name: </tt> Name for status file (saves last row index processed) <p>
 * <tt>batch.size: </tt> Batch size to send events from flume source to flume channel <p>
 * <tt>max.rows: </tt> Max rows to import from DB in one query <p>
 * <tt>custom.query: </tt> Custom query to execute to database (be careful) <p>
 *
 * @author <a href="mailto:mvalle@keedio.com">Marcelo Valle</a>
 * @author <a href="mailto:lalazaro@keedio.com">Luis Lazaro</a>
 * @author tianqin
 */

public class SQLSourceHelper {

    private static final Logger LOG = LoggerFactory.getLogger(SQLSourceHelper.class);

    private File file, directory;
    private int runQueryDelay, batchSize, maxRows, currentIndex;
    private String statusFilePath, statusFileName, connectionURL, table,
            columnsToSelect, user, password, customQuery, query, hibernateDialect,
            hibernateDriver;

    private static final String DEFAULT_STATUS_DIRECTORY = "/var/lib/flume";
    private static final int DEFAULT_QUERY_DELAY = 10000;
    private static final int DEFAULT_BATCH_SIZE = 1000;
    private static final int DEFAULT_MAX_ROWS = 10000;
    private static final int DEFAULT_INCREMENTAL_VALUE = 0;

    /**
     * Per-table maps holding the state for multiple tables; the key is the table name
     */
    private Map<String, File> files;
    private Map<String, String> querys;
    private Map<String, Integer> currentIndexMap;
    private String[] tables;

    /**
     * Builds an SQLSourceHelper containing the configuration parameters and
     * useful utilities for SQL Source
     *
     * @param context Flume source context, contains the properties from configuration file
     */
    public SQLSourceHelper(Context context) {

        statusFilePath = context.getString("status.file.path", DEFAULT_STATUS_DIRECTORY);
        statusFileName = context.getString("status.file.name");
        connectionURL = context.getString("connection.url");
        table = context.getString("table");
        columnsToSelect = context.getString("columns.to.select", "*");
        runQueryDelay = context.getInteger("run.query.delay", DEFAULT_QUERY_DELAY);
        user = context.getString("user");
        password = context.getString("password");
        directory = new File(statusFilePath);
        customQuery = context.getString("custom.query");
        batchSize = context.getInteger("batch.size", DEFAULT_BATCH_SIZE);
        maxRows = context.getInteger("max.rows", DEFAULT_MAX_ROWS);
        hibernateDialect = context.getString("hibernate.dialect");
        hibernateDriver = context.getString("hibernate.connection.driver_class");

        checkMandatoryProperties();

        // table names in the config file are separated by ASCII commas
        tables = table.split(",");

        if (!(isStatusDirectoryCreated())) {
            createDirectory();
        }
        files = new HashMap<>();
        querys = new HashMap<>();
        currentIndexMap = new HashMap<>();

        // populate the per-table values
        for (String table1 : tables) {

            file = new File(statusFilePath + "/" + statusFileName + "_" + table1);
            files.put(table1, file);
            currentIndex = getStatusFileIndex(context.getInteger("incremental.value", DEFAULT_INCREMENTAL_VALUE), file, table1);
            currentIndexMap.put(table1, currentIndex);
            query = buildQuery(table1);
            querys.put(table1, query);
        }
    }


    private String buildQuery(String tableName) {

        if (customQuery == null)
            return "SELECT " + columnsToSelect + " FROM " + tableName;
        else
            return customQuery;
    }


    private boolean isStatusFileCreated(File file) {

        return file.exists() && !file.isDirectory();
    }

    private boolean isStatusDirectoryCreated() {
        return directory.exists() && !directory.isFile();
    }

    /**
     * Converter from a List of Object List to a List of String arrays <p>
     * Useful for csvWriter
     *
     * @param queryResult Query Result from hibernate executeQuery method
     * @return A list of String arrays, ready for the csvWriter.writeAll method
     */
    public List<String[]> getAllRows(List<List<Object>> queryResult) {

        List<String[]> allRows = new ArrayList<String[]>();

        if (queryResult == null || queryResult.isEmpty())
            return allRows;

        String[] row = null;

        for (int i = 0; i < queryResult.size(); i++) {
            List<Object> rawRow = queryResult.get(i);
            row = new String[rawRow.size()];
            for (int j = 0; j < rawRow.size(); j++) {
                if (rawRow.get(j) != null)
                    row[j] = rawRow.get(j).toString();
                else
                    row[j] = "";
            }
            allRows.add(row);
        }

        return allRows;
    }


    /**
     * Update status file with last read row index
     */
    public void updateStatusFile(File file, String table, int currentIndex) {
        /* Status file creation or update */
        try {
            Writer writer = new FileWriter(file, false);
            writer.write(connectionURL + " ");
            writer.write(table + " ");
            writer.write(Integer.toString(currentIndex) + " \n");
            writer.close();
        } catch (IOException e) {
            LOG.error("Error writing incremental value to status file!!!", e);
        }
    }

    private int getStatusFileIndex(int configuredStartValue, File file, String table) {

        if (!isStatusFileCreated(file)) {
            LOG.info("Status file not created, using start value from config file");
            return configuredStartValue;
        } else {
            try {
                FileReader reader = new FileReader(file);
                char[] chars = new char[(int) file.length()];
                reader.read(chars);
                String[] statusInfo = new String(chars).split(" ");
                if (statusInfo[0].equals(connectionURL) && statusInfo[1].equals(table)) {
                    reader.close();
                    LOG.info(statusFilePath + "/" + statusFileName + "_" + table + " correctly formed");
                    return Integer.parseInt(statusInfo[2]);
                } else {
                    LOG.warn(statusFilePath + "/" + statusFileName + "_" + table + " corrupt!!! Deleting it.");
                    reader.close();
                    deleteStatusFile(file);
                    return configuredStartValue;
                }
            } catch (NumberFormatException | IOException e) {
                LOG.error("Corrupt index value in file!!! Deleting it.", e);
                deleteStatusFile(file);
                return configuredStartValue;
            }
        }
    }

    private void deleteStatusFile(File file) {
        if (file.delete()) {
            LOG.info("Deleted status file: {}", file.getAbsolutePath());
        } else {
            LOG.warn("Error deleting file: {}", file.getAbsolutePath());
        }
    }

    private void checkMandatoryProperties() {

        if (statusFileName == null) {
            throw new ConfigurationException("status.file.name property not set");
        }
        if (connectionURL == null) {
            throw new ConfigurationException("connection.url property not set");
        }
        if (table == null && customQuery == null) {
            throw new ConfigurationException("property table not set");
        }
        if (password == null) {
            throw new ConfigurationException("password property not set");
        }
        if (user == null) {
            throw new ConfigurationException("user property not set");
        }
    }

    /*
     * @return String connectionURL
     */
    String getConnectionURL() {
        return connectionURL;
    }

    /*
     * @return true if the status file directory was created
     */
    private boolean createDirectory() {
        return directory.mkdir();
    }

    /*
     * @return the latest table -> index map
     */
    Map<String, Integer> getCurrentIndex() {
        return currentIndexMap;
    }

    /**
     * Update the current index for a table
     * @param table    table name
     * @param newValue new index value
     */
    void setCurrentIndex(String table, int newValue) {
        currentIndexMap.replace(table, newValue);
    }

    /*
     * @return String user for database
     */
    String getUser() {
        return user;
    }

    /*
     * @return String password for user
     */
    String getPassword() {
        return password;
    }

    /*
     * @return int delay in ms
     */
    int getRunQueryDelay() {
        return runQueryDelay;
    }

    int getBatchSize() {
        return batchSize;
    }

    int getMaxRows() {
        return maxRows;
    }

    Map<String, String> getQuery() {
        return querys;
    }


    String getHibernateDialect() {
        return hibernateDialect;
    }


    String getHibernateDriver() {
        return hibernateDriver;
    }

    // returns the table -> status file map
    Map<String, File> getFiles() {
        return files;
    }

    // returns the array of table names
    String[] getTables() {
        return tables;
    }
}
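
A quick note on the status files: updateStatusFile writes a single line per table containing the connection URL, the table name, and the last processed row index, separated by spaces. With the connection URL from the configuration shown further down, the file status_test_usb might contain something like this (the index value 42 is just a placeholder):

jdbc:sqlserver://0.0.0.0:1433;DatabaseName=test test_usb 42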

Only these three files were changed; everything else can be taken from the original author's project as-is.

Demo

I used a Kafka sink to receive the data, monitoring the two tables test_usb and test_usb2.

(Screenshot: rows inserted into both tables at the same time)

(Screenshot: Kafka consuming the data)

The Flume configuration file is attached below.

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


# The configuration file needs to define the sources, 
# the channels and the sinks.
# Sources, channels and sinks are defined per agent, 
# in this case called 'agent'

agent.sources = sqlsSource
agent.channels = memoryChannel
agent.sinks = kafkaSink

# For each one of the sources, the type is defined
agent.sources.sqlsSource.type = org.keedio.flume.source.SQLSource

# Connection settings for SQL Server 2008
agent.sources.sqlsSource.connection.url = jdbc:sqlserver://0.0.0.0:1433;DatabaseName=test
agent.sources.sqlsSource.user = sa
agent.sources.sqlsSource.password = 123456
agent.sources.sqlsSource.hibernate.connection.autocommit = true
agent.sources.sqlsSource.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
agent.sources.sqlsSource.hibernate.connection.driver_class = com.microsoft.sqlserver.jdbc.SQLServerDriver
agent.sources.sqlsSource.hibernate.c3p0.min_size=1
agent.sources.sqlsSource.hibernate.c3p0.max_size=10
# Adjust the dialect to match your SQL Server version
agent.sources.sqlsSource.hibernate.dialect = org.hibernate.dialect.SQLServer2008Dialect


agent.sources.sqlsSource.max.rows = 1000000
# Table names must be separated by commas
agent.sources.sqlsSource.table = test_usb,test_usb2
agent.sources.sqlsSource.columns.to.select = *
agent.sources.sqlsSource.column.name = id
agent.sources.sqlsSource.incremental.value = 0
agent.sources.sqlsSource.delimiter.entry = ,
agent.sources.sqlsSource.enclose.by.quotes = false
agent.sources.sqlsSource.run.query.delay=5000

agent.sources.sqlsSource.status.file.path = D:/ChromeDownload/apache-flume-1.7.0-bin/status
agent.sources.sqlsSource.status.file.name = status
# The channel can be defined as follows.
agent.sources.sqlsSource.channels = memoryChannel


# Each sink's type must be defined
agent.sinks.kafkaSink.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.kafkaSink.kafka.bootstrap.servers=wind-bd-test02:9092,wind-bd-test03:9092,wind-bd-test04:9092
agent.sinks.kafkaSink.kafka.topic=test


#Specify the channel the sink should use
agent.sinks.kafkaSink.channel = memoryChannel

# Each channel's type is defined.
agent.channels.memoryChannel.type = memory

# Other config values specific to each type of channel(sink or source)
# can be defined as well
# In this case, it specifies the capacity of the memory channel
agent.channels.memoryChannel.capacity = 100
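
With the rebuilt jar on the Flume agent's classpath, the agent can be started in the usual way. The paths and config file name below are placeholders for your own installation; the consumer command only verifies that events reach the topic configured in the sink.

# Start the agent defined in this configuration (paths are examples)
bin/flume-ng agent --conf conf --conf-file conf/flume-sql.properties --name agent

# Check that events arrive on the Kafka topic used by the sink
bin/kafka-console-consumer.sh --bootstrap-server wind-bd-test02:9092 --topic test --from-beginning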

These are just the initial changes; there is still plenty of room for improvement. Feedback and discussion are welcome!
