Following up on the previous post: writing to a distributed table with the filter + SQL approach is inefficient, so this post digs into the source code and works out a scheme that writes directly to the local tables behind a ClickHouse distributed table.
Compiled class file (download): https://download.youkuaiyun.com/download/cakecc2008/81878962
### 1. Goal
Implement two new write modes, row-hash and row-random, while staying compatible with the existing single-node mode and partition-random mode.
### 2. Approach
Two new parameters are added (a sample config follows the pseudocode below):
- write_mode: how rows are written to the cluster
- hash_fields: the field(s) to hash on; multiple fields are supported, comma separated
Pseudocode:

    if ([cluster] is set and valid):
        if ([write_mode] is set):
            if (write_mode = 'rowhash' and [hash_fields] is valid):
                row-hash mode
            else if (write_mode = 'rowrandom'):
                row-random mode
            else:
                partition-random mode (default)
        else:
            partition-random mode (default)
    else:
        single-node mode
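For reference, a minimal output block using the two new options might look like the sketch below. This assumes the standard Waterdrop v1 `output { clickhouse { ... } }` layout; the host, database, table and cluster values are placeholders, and only `write_mode` and `hash_fields` are additions introduced by this post.

```hocon
output {
    clickhouse {
        host = "ck-node1:8123"
        database = "default"
        table = "event_local"
        cluster = "ck_cluster"
        fields = ["id", "event_date", "name"]
        username = "default"
        password = ""
        bulk_size = 20000
        # new options added by this modification
        write_mode = "rowhash"        # "rowhash", "rowrandom"; anything else falls back to partition-random
        hash_fields = "id,event_date" # comma separated, only used when write_mode = "rowhash"
    }
}
```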
### 3. Source code changes
Only the process method of the io.github.interestinglab.waterdrop.output.batch.Clickhouse class needs to be changed. The complete modified Clickhouse code (version 1.5.1) follows:
package io.github.interestinglab.waterdrop.output.batch
import java.text.SimpleDateFormat
import java.util
import java.util.Properties
import java.math.BigDecimal
import java.sql.ResultSet
import io.github.interestinglab.waterdrop.config.{Config, ConfigFactory}
import io.github.interestinglab.waterdrop.apis.BaseOutput
import io.github.interestinglab.waterdrop.config.ConfigRuntimeException
import io.github.interestinglab.waterdrop.config.TypesafeConfigUtils
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import ru.yandex.clickhouse.except.{ClickHouseException, ClickHouseUnknownException}
import ru.yandex.clickhouse.settings.ClickHouseProperties
import ru.yandex.clickhouse.{
BalancedClickhouseDataSource,
ClickHouseConnectionImpl,
ClickHousePreparedStatement,
ClickhouseJdbcUrlParser
}
import scala.collection.JavaConversions._
import scala.collection.immutable.HashMap
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.WrappedArray
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}
class Clickhouse extends BaseOutput {
var tableSchema: Map[String, String] = new HashMap[String, String]()
var jdbcLink: String = _
var initSQL: String = _
var table: String = _
var fields: java.util.List[String] = _
var cluster: String = _
// contains cluster basic info
var clusterInfo: ArrayBuffer[(String, Int, Int, String)] = _
var retryCodes: java.util.List[Integer] = _
var config: Config = ConfigFactory.empty()
val clickhousePrefix = "clickhouse."
val properties: Properties = new Properties()
var writeMode:String = "single"
var hashFieldsArray:Array[String] = _
/**
* Set Config.
* */
override def setConfig(config: Config): Unit = {
this.config = config
}
/**
* Get Config.
* */
override def getConfig(): Config = {
this.config
}
override def checkConfig(): (Boolean, String) = {
val requiredOptions = List("host", "table", "database")
val nonExistsOptions = requiredOptions.map(optionName => (optionName, config.hasPath(optionName))).filter { p =>
val (optionName, exists) = p
!exists
}
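// copy any clickhouse.* prefixed options straight into the ClickHouse JDBC connection properties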
if (TypesafeConfigUtils.hasSubConfig(config, clickhousePrefix)) {
val clickhouseConfig = TypesafeConfigUtils.extractSubConfig(config, clickhousePrefix, false)
clickhouseConfig
.entrySet()
.foreach(entry => {
val key = entry.getKey
val value = String.valueOf(entry.getValue.unwrapped())
properties.put(key, value)
})
}
val hasUserName = config.hasPath("username")
val hasPassword = config.hasPath("password")
if (nonExistsOptions.nonEmpty) {
  // report the missing required options
  (
    false,
    "please specify " + nonExistsOptions
      .map { option =>
        val (name, exists) = option
        "[" + name + "]"
      }
      .mkString(", ") + " as non-empty string")
} else if (hasUserName && !hasPassword || !hasUserName && hasPassword) {
  // username and password must be provided together
  (false, "please specify username and password at the same time")
} else {
  if (hasPassword) {
    properties.put("user", config.getString("username"))
    properties.put("password", config.getString("password"))
  }
  (true, "")
}
}
override def prepare(spark: SparkSession): Unit = {
this.jdbcLink = String.format("jdbc:clickhouse://%s/%s", config.getString("host"), config.getString("database"))
val balanced: BalancedClickhouseDataSource = new BalancedClickhouseDataSource(this.jdbcLink, properties)
val conn = balanced.getConnection.asInstanceOf[ClickHouseConnectionImpl]
this.table = config.getString("table")
this.tableSchema = getClickHouseSchema(conn, table)
if (this.config.hasPath("fields")) {
this.fields = config.getStringList("fields")
val (flag, msg) = acceptedClickHouseSchema()
if (!flag) {
throw new ConfigRuntimeException(msg)
}
}
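// fallback defaults applied when the options are not set: bulk_size = 20000, retry = 1, retry_codes = empty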
val defaultConfig = ConfigFactory.parseMap(
Map(
"bulk_size" -> 20000,
// "retry_codes" -> util.Arrays.asList(ClickHouseErrorCode.NETWORK_ERROR.code),
"retry_codes" -> util.Arrays.asList(),
"retry" -> 1
)
)
// check whether the config contains the cluster option
if (config.hasPath("cluster")) {
this.cluster = config.getString("cluster")
// fetch the cluster info from system.clusters; it is used later in process(), and clusterInfo is an array of shard entries
this.clusterInfo = getClickHouseClusterInfo(conn, cluster)
if (this.clusterInfo.size == 0) {
val errorInfo = s"could not find cluster config in system.clusters, config cluster = $cluster"
logError(errorInfo)
throw new RuntimeException(errorInfo)
}
logInfo(s"get [$cluster] config from system.clusters, the replica info is [$clusterInfo].")
}
config = config.withFallback(defaultConfig)
retryCodes = config.getIntList("retry_codes")
super.prepare(spark)
}
override def process(df: Dataset[Row]): Unit = {
val dfFields = df.schema.fieldNames
val bulkSize = config.getInt("bulk_size")
val retry = config.getInt("retry")
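// when no fields option is configured, write every column of the incoming dataframe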
if (!config.hasPath("fields")) {
fields = dfFields.toList
}
this.initSQL = initPrepareSQL()
logInfo(this.initSQL)
// if clusterInfo has entries, we are in cluster mode
if (this.clusterInfo != null && this.clusterInfo.size > 0) {
var writeMode = "single" // local copy of the configured value; the class field this.writeMode is set below
var hashFields = ""      // hash field list, multiple fields supported, comma separated
// read the write_mode and hash_fields options
if(config.hasPath("write_mode")) {
writeMode=config.getString("write_mode")
}
if(config.hasPath("hash_fields")) {
hashFields=config.getString("hash_fields")
hashFieldsArray=hashFields.split(",")
}
// use the hash mode only when write_mode is 'rowhash', hash_fields is non-empty, and every hash field exists in the dataframe; otherwise fall back to a random mode
logInfo(s"conf's writeMode: $writeMode")
logInfo(s"conf's HashFields: $hashFields")
//logDebug(s"dfFields.contains(HashField):"+dfFields.contains(HashField))
if("rowhash".equals(writeMode) && !"".equals(hashFields) && hashFields != null && checkFields(hashFields,dfFields)) {
this.writeMode = "rowHash"
logInfo(s"cluster rowHash mode, shard index selected per row in the iterator")
} else if ("rowrandom".equals(writeMode)) {
this.writeMode = "rowRandom"
logInfo(s"cluster rowRandom mode, shard index selected per row in the iterator")
} else {
this.writeMode = "partRandom"
logInfo(s"cluster partRandom mode, shard index selected in foreachPartition")
}
} else {
this.writeMode = "single"
logInfo(s"single mode, the jdbc url is created in foreachPartition.")
}
logDebug(s"ready foreachPartition...")
df.foreachPartition { iter =>

(The code excerpt ends here. The body of the foreachPartition loop, which performs the per-partition or per-row shard selection and the batched inserts with retry, and the helper methods referenced above, such as getClickHouseSchema, getClickHouseClusterInfo, initPrepareSQL and checkFields, are not reproduced in this excerpt.)

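Since the listing stops before the write loop, here is a rough sketch, not the author's code, of how the per-row shard choice for the rowHash and rowRandom modes and a possible checkFields helper could look. The object name ShardSelectionSketch, the helper pickShard and the parameter shardCount are invented for this illustration.

```scala
import org.apache.spark.sql.Row
import scala.util.Random

object ShardSelectionSketch {

  // Possible checkFields helper: every configured hash field must exist in the dataframe schema.
  def checkFields(hashFields: String, dfFields: Array[String]): Boolean =
    hashFields.split(",").map(_.trim).forall(dfFields.contains)

  // Pick a shard index for a single row.
  // rowHash: hash the concatenated hash-field values so identical keys always land on the same shard.
  // rowRandom: pick a shard uniformly at random for every row.
  def pickShard(row: Row, writeMode: String, hashFieldsArray: Array[String], shardCount: Int): Int =
    writeMode match {
      case "rowHash" =>
        val key = hashFieldsArray.map(f => String.valueOf(row.getAs[AnyRef](f))).mkString("|")
        (key.hashCode % shardCount + shardCount) % shardCount // keep the index non-negative
      case "rowRandom" =>
        Random.nextInt(shardCount)
      case _ =>
        0 // partRandom / single: the shard (or single url) is chosen once per partition, not per row
    }
}
```

Within foreachPartition, rows whose pickShard result is i would then presumably be batched and flushed through a connection built from the i-th clusterInfo entry, while the partRandom and single modes keep choosing one JDBC URL per partition, as the log messages above indicate.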