Spark submit script
#!/bin/bash
spark2-submit \
--class it.luke.Mainapp \
--master yarn \
--deploy-mode cluster \
--driver-memory 4g \
--num-executors 10 \
--executor-memory 2g \
--executor-cores 2 \
--conf spark.sql.autoBroadcastJoinThreshold="10485760" \
--conf spark.sql.shuffle.partitions="200" \
--conf spark.shuffle.compress="true" \
--conf spark.shuffle.io.maxRetries="5" \
--conf spark.shuffle.io.retryWait="10s" \
--conf spark.broadcast.compress="true" \
--conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \
--conf spark.memory.fraction="0.6" \
--conf spark.memory.storageFraction="0.5" \
--conf spark.default.parallelism=200 \
--conf spark.locality.wait="3s" \
--conf spark.speculation="true" \
--conf spark.speculation.multiplier="1.5" \
/demo-1.0.jar \
'input argument 1' \
'input argument 2'
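The two quoted strings at the end are passed through to the main class as program arguments. A minimal sketch of how it.luke.Mainapp might pick them up (hypothetical; assumes a standard main signature):

package it.luke

object Mainapp {
  def main(args: Array[String]): Unit = {
    val input1 = args(0) // 'input argument 1' from the submit script
    val input2 = args(1) // 'input argument 2' from the submit script
    // ... job logic ...
  }
}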
Spark parameter explanations
# size threshold (bytes) below which Spark SQL automatically broadcasts small tables
spark.sql.autoBroadcastJoinThreshold="10485760"
# parallelism (number of partitions) for Spark SQL shuffles
spark.sql.shuffle.partitions="200"
# timeout for automatic broadcast joins
#spark.sql.broadcastTimeout=""
# whether to compress shuffle output
spark.shuffle.compress="true"
# max retries for failed shuffle fetches
spark.shuffle.io.maxRetries="5"
# wait between shuffle fetch retries
spark.shuffle.io.retryWait="10s"
# whether to compress broadcast variables
spark.broadcast.compress="true"
# serialization mechanism
spark.serializer="org.apache.spark.serializer.KryoSerializer"
# fraction of heap shared by execution and storage
spark.memory.fraction="0.6"
# fraction of that memory reserved for storage
spark.memory.storageFraction="0.5"
# parallelism (number of partitions) for Spark Core shuffles
spark.default.parallelism=200
# data-locality wait time
spark.locality.wait="3s"
# whether to enable speculative execution
spark.speculation="true"
# how many times slower than the median a task must run before it is speculated
spark.speculation.multiplier="1.5"
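Note that only the SQL-level settings (the first two above) can still be changed after the application starts; core settings such as the serializer, memory fractions, and shuffle compression are fixed once the SparkContext is up. A small sketch, assuming an existing SparkSession named spark:

// adjust a SQL setting per session at runtime
// (throws if the key is not runtime-modifiable)
spark.conf.set("spark.sql.shuffle.partitions", "400")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "10485760")
// read back the effective value
val parts = spark.conf.get("spark.sql.shuffle.partitions")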
Parameter tuning from Scala
application.conf
#Spark parameters
# size threshold (bytes) below which Spark SQL automatically broadcasts small tables
spark.sql.autoBroadcastJoinThreshold="10485760"
# parallelism (number of partitions) for Spark SQL shuffles
spark.sql.shuffle.partitions="200"
# timeout for automatic broadcast joins
#spark.sql.broadcastTimeout=""
# whether to compress shuffle output
spark.shuffle.compress="true"
# max retries for failed shuffle fetches
spark.shuffle.io.maxRetries="5"
# wait between shuffle fetch retries
spark.shuffle.io.retryWait="10s"
# whether to compress broadcast variables
spark.broadcast.compress="true"
# serialization mechanism
spark.serializer="org.apache.spark.serializer.KryoSerializer"
# fraction of heap shared by execution and storage
spark.memory.fraction="0.6"
# fraction of that memory reserved for storage
spark.memory.storageFraction="0.5"
# parallelism (number of partitions) for Spark Core shuffles
spark.default.parallelism=200
# data-locality wait time
spark.locality.wait="3s"
# whether to enable speculative execution (the .flag suffix keeps this key from
# clashing with spark.speculation.multiplier as a HOCON path)
spark.speculation.flag="true"
# how many times slower than the median a task must run before it is speculated
spark.speculation.multiplier="1.5"
Scala code
package utils

import com.typesafe.config.ConfigFactory

object ConfigUtils {
  val conf = ConfigFactory.load() // by default loads application.conf from the classpath resources

  // number of partitions for Spark SQL shuffles
  val SPARK_SQL_SHUFFLE_PARTITIONS = conf.getString("spark.sql.shuffle.partitions")
  // size threshold below which Spark SQL automatically broadcasts small tables
  val SPARK_SQL_AUTOBROADCASTJOINTHRESHOLD = conf.getString("spark.sql.autoBroadcastJoinThreshold")
  // whether to compress shuffle output
  val SPARK_SHUFFLE_COMPRESS = conf.getString("spark.shuffle.compress")
  // max retries for failed shuffle fetches
  val SPARK_SHUFFLE_IO_MAXRETRIES = conf.getString("spark.shuffle.io.maxRetries")
  // wait between shuffle fetch retries
  val SPARK_SHUFFLE_IO_RETRYWAIT = conf.getString("spark.shuffle.io.retryWait")
  // whether to compress broadcast variables
  val SPARK_BROADCAST_COMPRESS = conf.getString("spark.broadcast.compress")
  // Spark serialization mechanism
  val SPARK_SERIALIZER = conf.getString("spark.serializer")
  // fraction of heap shared by execution and storage
  val SPARK_MEMORY_FRACTION = conf.getString("spark.memory.fraction")
  // fraction of that memory reserved for storage
  val SPARK_MEMORY_STORAGEFRACTION = conf.getString("spark.memory.storageFraction")
  // number of partitions for Spark Core shuffles
  val SPARK_DEFAULT_PARALLELISM = conf.getString("spark.default.parallelism")
  // data-locality wait time
  val SPARK_LOCALITY_WAIT = conf.getString("spark.locality.wait")
  // whether to enable speculative execution (read from the custom .flag key)
  val SPARK_SPECULATION = conf.getString("spark.speculation.flag")
  // how many times slower than the median a task must run before it is speculated
  val SPARK_SPECULATION_MULTIPLIER = conf.getString("spark.speculation.multiplier")
}
For subsequent development, configuration can simply be pulled from this utility class.
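A minimal sketch of that wiring (the entry-point object and appName are illustrative; only the ConfigUtils fields come from the class above):

package it.luke

import org.apache.spark.sql.SparkSession
import utils.ConfigUtils

object Mainapp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("demo") // illustrative name
      .config("spark.sql.shuffle.partitions", ConfigUtils.SPARK_SQL_SHUFFLE_PARTITIONS)
      .config("spark.sql.autoBroadcastJoinThreshold", ConfigUtils.SPARK_SQL_AUTOBROADCASTJOINTHRESHOLD)
      .config("spark.serializer", ConfigUtils.SPARK_SERIALIZER)
      .config("spark.speculation", ConfigUtils.SPARK_SPECULATION) // custom .flag key maps to spark.speculation
      .getOrCreate()

    // ... job logic ...

    spark.stop()
  }
}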