Following up on the previous post: writing to a distributed table with the filter + SQL approach is inefficient, so this post digs into the source code and works out a scheme that writes directly to the local tables behind a ClickHouse distributed table.
Compiled class file (download): https://download.youkuaiyun.com/download/cakecc2008/81878962
### 1. Goal
Implement two new write modes, row-hash and row-random, while staying compatible with the existing single-node mode and partition-random mode.
### 2. Approach
Two new parameters are added (a sample config follows the pseudocode below):
- write_mode: how rows are written to the cluster
- hash_fields: the field(s) to hash on; multiple fields are supported, comma separated
Pseudocode:

    if ([cluster] is set and valid):
        if ([write_mode] is set):
            if (write_mode = 'rowhash' and [hash_fields] is valid):
                row-hash mode
            else if (write_mode = 'rowrandom'):
                row-random mode
            else:
                partition-random mode (default)
        else:
            partition-random mode (default)
    else:
        single-node mode
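For reference, a minimal output block using the two new options might look like the sketch below. This assumes the standard Waterdrop v1 `output { clickhouse { ... } }` layout; the host, database, table and cluster values are placeholders, and only `write_mode` and `hash_fields` are additions introduced by this post.

```hocon
output {
    clickhouse {
        host = "ck-node1:8123"
        database = "default"
        table = "event_local"
        cluster = "ck_cluster"
        fields = ["id", "event_date", "name"]
        username = "default"
        password = ""
        bulk_size = 20000
        # new options added by this modification
        write_mode = "rowhash"        # "rowhash", "rowrandom"; anything else falls back to partition-random
        hash_fields = "id,event_date" # comma separated, only used when write_mode = "rowhash"
    }
}
```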
### 3. Source code changes
Only the process method of the io.github.interestinglab.waterdrop.output.batch.Clickhouse class needs to be changed. The complete modified Clickhouse code (version 1.5.1) follows:
package io.github.interestinglab.waterdrop.output.batch
import java.text.SimpleDateFormat
import java.util
import java.util.Properties
import java.math.BigDecimal
import java.sql.ResultSet
import io.github.interestinglab.waterdrop.config.{Config, ConfigFactory}
import io.github.interestinglab.waterdrop.apis.BaseOutput
import io.github.interestinglab.waterdrop.config.ConfigRuntimeException
import io.github.interestinglab.waterdrop.config.TypesafeConfigUtils
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import ru.yandex.clickhouse.except.{ClickHouseException, ClickHouseUnknownException}
import ru.yandex.clickhouse.settings.ClickHouseProperties
import ru.yandex.clickhouse.{
BalancedClickhouseDataSource,
ClickHouseConnectionImpl,
ClickHousePreparedStatement,
ClickhouseJdbcUrlParser
}
import scala.collection.JavaConversions._
import scala.collection.immutable.HashMap
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.WrappedArray
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}
class Clickhouse extends BaseOutput {
var tableSchema: Map[String, String] = new HashMap[String, String]()
var jdbcLink: String = _
var initSQL: String = _
var table: String = _
var fields: java.util.List[String] = _
var cluster: String = _
// contains cluster basic info
var clusterInfo: ArrayBuffer[(String, Int, Int, String)] = _
var retryCodes: java.util.List[Integer] = _
var config: Config = ConfigFactory.empty()
val clickhousePrefix = "clickhouse."
val properties: Properties = new Properties()
var writeMode:String = "single"
var hashFieldsArray:Array[String] = _
/**
* Set Config.
* */
override def setConfig(config: Config): Unit = {
this.config = config
}
/**
* Get Config.
* */
override def getConfig(): Config = {
this.config
}
override def checkConfig(): (Boolean, String) = {
val requiredOptions = List("host", "table", "database")
val nonExistsOptions = requiredOptions.map(optionName => (optionName, config.hasPath(optionName))).filter { p =>
val (optionName, exists) = p
!exists
}
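// copy any clickhouse.* prefixed options straight into the ClickHouse JDBC connection properties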
if (TypesafeConfigUtils.hasSubConfig(config, clickhousePrefix)) {
val clickhouseConfig = TypesafeConfigUtils.extractSubConfig(config, clickhousePrefix, false)
clickhouseConfig
.entrySet()
.foreach(entry => {
val key = entry.getKey
val value = String.valueOf(entry.getValue.unwrapped())
properties.put(key, value)
})
}
val hasUserName = config.hasPath("username")
val hasPassword = config.hasPath("password")
if (nonExistsOptions.nonEmpty) {
  // report the missing required options
  (
    false,
    "please specify " + nonExistsOptions
      .map { option =>
        val (name, exists) = option
        "[" + name + "]"
      }
      .mkString(", ") + " as non-empty string")
} else if (hasUserName && !hasPassword || !hasUserName && hasPassword) {
  // username and password must be provided together
  (false, "please specify username and password at the same time")
} else {
  if (hasPassword) {
    properties.put("user", config.getString("username"))
    properties.put("password", config.getString("password"))
  }
  (true, "")
}
}
override def prepare(spark: SparkSession): Unit = {
this.jdbcLink = String.format("jdbc:clickhouse://%s/%s", config.getString("host"), config.getString("database"))
val balanced: BalancedClickhouseDataSource = new BalancedClickhouseDataSource(this.jdbcLink, properties)
val conn = balanced.getConnection.asInstanceOf[ClickHouseConnectionImpl]
this.table = config.getString("table")
this.tableSchema = getClickHouseSchema(conn, table)
if (this.config.hasPath("fields")) {
this.fields = config.getStringList("fields")
val (flag, msg) = acceptedClickHouseSchema()
if (!flag) {
throw new ConfigRuntimeException(msg)
}
}
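// fallback defaults applied when the options are not set: bulk_size = 20000, retry = 1, retry_codes = empty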
val defaultConfig = ConfigFactory.parseMap(
Map(
"bulk_size" -> 20000,
// "retry_codes" -> util.Arrays.asList(ClickHouseErrorCode.NETWORK_ERROR.code),
"retry_codes" -> util.Arrays.asList(),
"retry" -> 1
)
)
// check whether the config contains the cluster option
if (config.hasPath("cluster")) {
this.cluster = config.getString("cluster")
// fetch the cluster info from system.clusters; it is used later in process(), and clusterInfo is an array of shard entries
this.clusterInfo = getClickHouseClusterInfo(conn, cluster)
if (this.clusterInfo.size == 0) {
val errorInfo = s"could not find cluster config in system.clusters, config cluster = $cluster"
logError(errorInfo)
throw new RuntimeException(errorInfo)
}
logInfo(s"get [$cluster] config from system.clusters, the replica info is [$clusterInfo].")
}
config = config.withFallback(defaultConfig)
retryCodes = config.getIntList("retry_codes")
super.prepare(spark)
}
override def process(df: Dataset[Row]): Unit = {
val dfFields = df.schema.fieldNames
val bulkSize = config.getInt("bulk_size")
val retry = config.getInt("retry")
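// when no fields option is configured, write every column of the incoming dataframe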
if (!config.hasPath("fields")) {
fields = dfFields.toList
}
this.initSQL = initPrepareSQL()
logInfo(this.initSQL)
// if clusterInfo has entries, we are in cluster mode
if (this.clusterInfo != null && this.clusterInfo.size > 0) {
var writeMode = "single" // local copy of the configured value; the class field this.writeMode is set below
var hashFields = ""      // hash field list, multiple fields supported, comma separated
// read the write_mode and hash_fields options
if(config.hasPath("write_mode")) {
writeMode=config.getString("write_mode")
}
if(config.hasPath("hash_fields")) {
hashFields=config.getString("hash_fields")
hashFieldsArray=hashFields.split(",")
}
// use the hash mode only when write_mode is 'rowhash', hash_fields is non-empty, and every hash field exists in the dataframe; otherwise fall back to a random mode
logInfo(s"conf's writeMode: $writeMode")
logInfo(s"conf's HashFields: $hashFields")
//logDebug(s"dfFields.contains(HashField):"+dfFields.contains(HashField))
if("rowhash".equals(writeMode) && !"".equals(hashFields) && hashFields != null && checkFields(hashFields,dfFields)) {
this.writeMode = "rowHash"
logInfo(s"cluster rowHash mode, shard index selected per row in the iterator")
} else if ("rowrandom".equals(writeMode)) {
this.writeMode = "rowRandom"
logInfo(s"cluster rowRandom mode, shard index selected per row in the iterator")
} else {
this.writeMode = "partRandom"
logInfo(s"cluster partRandom mode, shard index selected in foreachPartition")
}
} else {
this.writeMode = "single"
logInfo(s"single mode, the jdbc url is created in foreachPartition.")
}
logDebug(s"ready foreachPartition...")
df.foreachPartition { iter =>

(The code excerpt ends here. The body of the foreachPartition loop, which performs the per-partition or per-row shard selection and the batched inserts with retry, and the helper methods referenced above, such as getClickHouseSchema, getClickHouseClusterInfo, initPrepareSQL and checkFields, are not reproduced in this excerpt.)

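Since the listing stops before the write loop, here is a rough sketch, not the author's code, of how the per-row shard choice for the rowHash and rowRandom modes and a possible checkFields helper could look. The object name ShardSelectionSketch, the helper pickShard and the parameter shardCount are invented for this illustration.

```scala
import org.apache.spark.sql.Row
import scala.util.Random

object ShardSelectionSketch {

  // Possible checkFields helper: every configured hash field must exist in the dataframe schema.
  def checkFields(hashFields: String, dfFields: Array[String]): Boolean =
    hashFields.split(",").map(_.trim).forall(dfFields.contains)

  // Pick a shard index for a single row.
  // rowHash: hash the concatenated hash-field values so identical keys always land on the same shard.
  // rowRandom: pick a shard uniformly at random for every row.
  def pickShard(row: Row, writeMode: String, hashFieldsArray: Array[String], shardCount: Int): Int =
    writeMode match {
      case "rowHash" =>
        val key = hashFieldsArray.map(f => String.valueOf(row.getAs[AnyRef](f))).mkString("|")
        (key.hashCode % shardCount + shardCount) % shardCount // keep the index non-negative
      case "rowRandom" =>
        Random.nextInt(shardCount)
      case _ =>
        0 // partRandom / single: the shard (or single url) is chosen once per partition, not per row
    }
}
```

Within foreachPartition, rows whose pickShard result is i would then presumably be batched and flushed through a connection built from the i-th clusterInfo entry, while the partRandom and single modes keep choosing one JDBC URL per partition, as the log messages above indicate.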