Let's start with a command:
spark-submit \
--master yarn \
--deploy-mode client \
--driver-memory 10G \
--executor-memory 10G \
--num-executors 25 \
--executor-cores 4 \
--queue ltemr \
--conf "spark.driver.extraJavaOptions=-Dhdp.version=3.1.0.0-78" \
--conf "spark.yarn.am.extraJavaOptions=-Dhdp.version=3.1.0.0-78" \
--jars $(echo /home/ltemr/oozie_signal/spark/lib/*.jar | tr ' ' ',') \
--properties-file conf/spark-properties-uemr.conf \
uemr-streaming-driver-1.0-SNAPSHOT.jar \
UEMRFixLocationDriver
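One detail worth pointing out in the command above: --jars expects a comma-separated list of jars, but the shell glob /home/ltemr/oozie_signal/spark/lib/*.jar expands to space-separated paths, so the command pipes the expansion through tr ' ' ',' to convert it. A quick, self-contained way to see what that substitution produces (the /tmp/libs directory and jar names are made up purely for illustration):
# Hypothetical directory with a couple of empty jars, only to illustrate the substitution.
mkdir -p /tmp/libs && touch /tmp/libs/a.jar /tmp/libs/b.jar
echo /tmp/libs/*.jar              # /tmp/libs/a.jar /tmp/libs/b.jar
echo /tmp/libs/*.jar | tr ' ' ',' # /tmp/libs/a.jar,/tmp/libs/b.jar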
When we run this, the spark-submit script ends up executing:
exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"
Note: $@ stands for all of the arguments we passed on the spark-submit command line.
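To make the role of "$@" concrete, here is a minimal sketch (wrapper.sh and its arguments are made up and not part of Spark) of a wrapper script that hands its arguments through unchanged, the same way spark-submit forwards our whole command line to spark-class:
#!/usr/bin/env bash
# wrapper.sh (hypothetical): print each argument this script received on its own line.
# "$@" expands to every argument, each kept as a separate word with quoting preserved.
printf 'arg: %s\n' "$@"
Running ./wrapper.sh --deploy-mode client --conf "spark.foo=a b" prints three lines, and "spark.foo=a b" stays a single argument; spark-class sees our spark-submit arguments in exactly the same form.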
Now let's see what spark-class does. Below is the content of the spark-class script:
if [ -z "${SPARK_HOME}" ]; then
source "$(dirname "$0")"/find-spark-home
fi
#注解:加载Spark环境配置内容spark-env.sh
. "${SPARK_HOME}"/bin/load-spark-env.sh
#注解:获取java lib用来启动应用
if [ -n "${JAVA_HOME}" ]; then
RUNNER="${JAVA_HOME}/bin/java"
else
if [ "$(command -v java)" ]; then
RUNNER="java"
else
echo "JAVA_HOME is not set" >&2
exit 1
fi
fi
#注解:获取Spark jar包目录以便加载
if [ -d "${SPARK_HOME}/jars" ]; then
SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
fi
if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then
echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2
echo "You need to build Spark with the target \"package\" before running this program." 1>&2
exit 1
else
LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"
fi
# Add the launcher build dir to the classpath if requested.
# Note: SPARK_PREPEND_CLASSES is mainly used for iterative development: instead of
# rebuilding every dependency, you recompile and package only the parts of Spark you
# changed, which speeds up development. This is mentioned in the Useful Developer Tools docs.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
  LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
fi
# For tests
if [[ -n "$SPARK_TESTING" ]]; then
  unset YARN_CONF_DIR
  unset HADOOP_CONF_DIR
fi
# The launcher library will print arguments separated by a NULL character, to allow arguments with
# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
# an array that will be used to exec the final command.
#
# The exit code of the launcher is appended to the output, so the parent shell removes it from the
# command array and checks the value to see if the launcher succeeded.
build_command() {
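  # Note: when called from spark-submit, "$@" here begins with
  # org.apache.spark.deploy.SparkSubmit (prepended by the exec line in spark-submit shown
  # earlier), followed by everything we typed on the command line; launcher.Main builds
  # the final java command from these arguments and prints it.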
"$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
printf "%d\0" $?
}
# Turn off posix mode since it does not allow process substitution
# Note: set +o <option> turns a shell option off and set -o turns it on; POSIX is a
# software interface standard for Unix-like systems, intended for cross-platform
# portability, so software written against it can be moved between Unix operating systems.
# Note: IFS is the Input Field Separator; setting it to empty for the read prevents leading
# and trailing whitespace from being trimmed, and read -d '' makes read use the NUL
# character (not a newline) as the delimiter between values.
# Note: the -r flag keeps backslashes literal instead of treating them as escape characters.
# Note: CMD is an array; the loop reads the output of build_command into it one argument at a time.
set +o posix
CMD=()
while IFS= read -d '' -r ARG; do
  CMD+=("$ARG")
done < <(build_command "$@")
# Note: number of elements in the command array
COUNT=${#CMD[@]}
# Note: index of the last element of the array
LAST=$((COUNT - 1))
# Note: the last element is the exit code returned by build_command
LAUNCHER_EXIT_CODE=${CMD[$LAST]}
# Certain JVM failures result in errors being printed to stdout (instead of stderr), which causes
# the code that parses the output of the launcher to get confused. In those cases, check if the
# exit code is an integer, and if it's not, handle it as a special error case.
# Note: if the launcher exit code is not an integer, treat it as a special error case
if ! [[ $LAUNCHER_EXIT_CODE =~ ^[0-9]+$ ]]; then
  echo "${CMD[@]}" | head -n-1 1>&2
  exit 1
fi
if [ $LAUNCHER_EXIT_CODE != 0 ]; then
  exit $LAUNCHER_EXIT_CODE
fi
# Note: finally, execute the assembled submit command
# Note: an example of the fully assembled command (in reality it is a single line):
# JAVA_HOME/bin/java -cp SPARK_HOME/conf/:/Users/rttian/Documents/work/bigdata/spark-2.2.0-bin-hadoop2.7/jars/*
# -Xmx1g org.apache.spark.deploy.SparkSubmit
# --master local[3]
# --class org.apache.spark.examples.SparkPi
# examples/jars/spark-examples_2.11-2.2.0.jar 10
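# Note: drop the appended exit code (the last array element), then exec the real command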
CMD=("${CMD[@]:0:$LAST}")
exec "${CMD[@]}"