Background
Our data integration platform runs a large number of data integration tasks, currently implemented mainly with DataX. Since the open-source version only supports a local run mode, which puts heavy demands on local resources, this post records how to submit a DataX job to a YARN cluster as a YARN application via the API from Spring Boot, adding parallel capacity.
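Before diving into the implementation, here is roughly how the submission can be wrapped in a Spring Boot service. This is a minimal sketch rather than the project's actual code: DataxSubmitService is a made-up name, and a bare YarnConfiguration stands in for a properly loaded cluster configuration (see the createConfiguration sketch near the end of this post).
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.springframework.stereotype.Service;

@Service
public class DataxSubmitService {
    // Submits one DataX job to YARN and returns the application id.
    // Monitoring and error handling are left to the caller.
    public String submit(DataxJob job) throws Exception {
        Client client = new Client(new YarnConfiguration(), job);
        try {
            ApplicationId appId = client.run();
            return appId.toString();
        } finally {
            client.stop();
        }
    }
}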
Code Implementation
Client submission implementation
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class Client {
// Application master specific info to register a new Application with RM/ASM
private String appName = "";
private String pch = "";
// datax job json
private String dataxJson = "";
// datax home path
private String dataxHomeArchivePath = "";
// if log4j.properties file available, add to local resources and set into classpath
private String log4jPropFile = "";
// reflect run
private boolean reflect = true;
// DataX parameters
private final List<String> parameter = new ArrayList<>();
// Debug flag
private boolean debugFlag = false;
// Queue for App master
private String queue = "";
// Application master jar file
private String appMasterJar = "";
// Amt. of memory resource to request for to run the App Master
private long masterMemory = 128;
// Amt. of virtual core resource to request for to run the App Master
private int masterVCores = 1;
// App master priority
private int amPriority = 0;
// Amt of memory to request for container in which shell script will be executed
private int containerMemory = 10;
// Amt. of virtual cores to request for container in which shell script will be executed
private int containerVirtualCores = 1;
// No. of containers in which the shell script needs to be executed
private int numContainers = 1;
// flag to indicate whether to keep containers across application attempts.
private boolean keepContainers = false;
private int memoryOverhead = 50;
private String[] javaOpts = new String[]{};
private String[] shellArgs = new String[]{};
// Env variables to be setup for the shell command
private final Map<String, String> shellEnv = new HashMap<>();
// Main class to invoke application master
private final String appMasterMainClass;
// yarnManipulator
private static YarnManipulator yarnManipulator;
// Start time for client
private final long clientStartTime = System.currentTimeMillis();
// Timeout threshold for client. Kill app after time interval expires.
// -1 means no timeout so that the application will not be killed after timeout, in other words, long time running job will be kept running.
private long clientTimeout = -1;
private final String[] args;
// Command line options
private final Options opts;
// Configuration
private final Configuration conf;
// yarnClient
private YarnClient yarnClient;
private FileSystem fs;
// hadoop.security.authentication = kerberos
private String authentication;
private String krb5Conf;
private String kerberosUserName;
private String kerberosKeytab;
private String configDir;
/**
 * Creates a client with the default ApplicationMaster main class.
 */
public Client(Configuration conf, DataxJob job) {
this("com.on.yarn.ApplicationMaster", conf, job.toStringArray());
}
private Client(String appMasterMainClass, Configuration conf, String[] args) {
this.appMasterMainClass = appMasterMainClass;
yarnManipulator = new LocalYarnManipulator();
this.args = args;
this.conf = conf;
opts = new Options();
opts.addOption("appName", true, "Application Name. Default value - DistributedShell");
opts.addOption("pch", true, "Application pch. Default value - DistributedShell");
opts.addOption("mainJar", true, "Jar file containing the application master in local file system");
opts.addOption("jsonPath", true, "datax Json file");
opts.addOption("datax_home_hdfs", true, "Jar file containing the application master in HDFS");
opts.addOption("log_properties", true, "log4j.properties file");
opts.addOption("reflect", true, "DataX job是否反射运行");
opts.addOption("p", true, "DataX传参");
opts.addOption("debug", false, "Dump out debug information");
opts.addOption("queue", true, "RM Queue in which this application is to be submitted");
opts.addOption("master_memory", true, "Amount of memory in MB to be requested to run the application master");
opts.addOption("master_vcores", true, "Amount of virtual cores to be requested to run the application master");
opts.addOption("priority", true, "Application Priority. Default 0");
opts.addOption("container_memory", true, "Amount of memory in MB to be requested to run the shell command");
opts.addOption("container_vcores", true, "Amount of virtual cores to be requested to run the shell command");
opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed");
opts.addOption("keep_containers_across_application_attempts", false,
"Flag to indicate whether to keep containers across application attempts."
+ " If the flag is true, running containers will not be killed when"
+ " application attempt fails and these containers will be retrieved by"
+ " the new application attempt ");
opts.addOption("memory_overhead", true, "Amount of memory overhead in MB for application master and container");
opts.addOption("java_opts", true, "Java opts for container");
opts.addOption("shell_args", true, "Command line args for the shell script. Multiple args can be separated by empty space.");
opts.getOption("shell_args").setArgs(Option.UNLIMITED_VALUES);
opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs");
opts.addOption("timeout", true, "Application timeout in milliseconds");
opts.addOption("proxy_user", true, "proxy_user");
opts.addOption("help", false, "Print usage");
}
/**
* Parse command line options
*
* @return Whether the init was successful to run the client
*/
private boolean init() throws ParseException, IOException {
CommandLine cliParser = new GnuParser().parse(opts, args);
if (args.length == 0) {
throw new IllegalArgumentException("No args specified for client to initialize");
}
appName = cliParser.getOptionValue("appName", "dataxDemo");
pch = cliParser.getOptionValue("pch", "pch");
if (!cliParser.hasOption("mainJar")) {
throw new IllegalArgumentException("No mainJar file path specified for application master");
}
appMasterJar = cliParser.getOptionValue("mainJar");
if (!cliParser.hasOption("jsonPath")) {
throw new IllegalArgumentException("No datax_job file path specified for application master");
}
dataxJson = FileUtils.readFileToString(FileUtils.getFile(cliParser.getOptionValue("jsonPath")), Charset.defaultCharset());
if (cliParser.hasOption("log_properties")) {
String log4jPath = cliParser.getOptionValue("log_properties");
try {
Log4jPropertyHelper.updateLog4jConf(Client.class, log4jPath);
} catch (Exception e) {
yarnManipulator.warn("Can not set up custom log4j properties. " + e);
}
}
if (!cliParser.hasOption("datax_home_hdfs")) {
throw new IllegalArgumentException("No datax_home_hdfs file path specified for application master");
}
dataxHomeArchivePath = cliParser.getOptionValue("datax_home_hdfs");
log4jPropFile = cliParser.getOptionValue("log_properties", "");
reflect = Boolean.parseBoolean(cliParser.getOptionValue("reflect", "true"));
if (cliParser.hasOption("p")) {
String p = cliParser.getOptionValue("p");
if (StringUtils.isNotBlank(p)) {
for (String str : StringUtils.split(p, ",")) {
if (StringUtils.isNotBlank(str)) {
parameter.add(str);
}
}
}
}
if (cliParser.hasOption("debug")) {
debugFlag = true;
}
queue = cliParser.getOptionValue("queue", "default");
masterMemory = Integer.parseInt(cliParser.getOptionValue("master_memory", "128"));
if (masterMemory < 0) {
throw new IllegalArgumentException("Invalid memory specified for application master, exiting." + " Specified memory=" + masterMemory);
}
masterVCores = Integer.parseInt(cliParser.getOptionValue("master_vcores", "1"));
if (masterVCores < 0) {
throw new IllegalArgumentException("Invalid virtual cores specified for application master, exiting." + " Specified virtual cores=" + masterVCores);
}
amPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0"));
containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "10"));
containerVirtualCores = Integer.parseInt(cliParser.getOptionValue("container_vcores", "1"));
numContainers = Integer.parseInt(cliParser.getOptionValue("num_containers", "1"));
if (containerMemory < 0 || containerVirtualCores < 0 || numContainers < 1) {
throw new IllegalArgumentException("Invalid no. of containers or container memory/vcores specified,"
+ " exiting."
+ " Specified containerMemory=" + containerMemory
+ ", containerVirtualCores=" + containerVirtualCores
+ ", numContainer=" + numContainers);
}
if (cliParser.hasOption("keep_containers_across_application_attempts")) {
yarnManipulator.info("keep_containers_across_application_attempts");
keepContainers = true;
}
memoryOverhead = Integer.parseInt(cliParser.getOptionValue("memory_overhead", "2"));
if (cliParser.hasOption("java_opts")) {
javaOpts = cliParser.getOptionValues("java_opts");
}
if (cliParser.hasOption("shell_args")) {
shellArgs = cliParser.getOptionValues("shell_args");
}
if (cliParser.hasOption("shell_env")) {
String[] envs = cliParser.getOptionValues("shell_env");
for (String env : envs) {
env = env.trim();
int index = env.indexOf('=');
if (index == -1) {
shellEnv.put(env, "");
continue;
}
String key = env.substring(0, index);
String val = "";
if (index < (env.length() - 1)) {
val = env.substring(index + 1);
}
shellEnv.put(key, val);
}
}
clientTimeout = Integer.parseInt(cliParser.getOptionValue("timeout", "-1"));
String user = cliParser.getOptionValue("proxy_user");
if (StringUtils.isNotBlank(user)) {
String mkdirStr = "hdfs dfs -mkdir /user/%s";
String chownStr = "hdfs dfs -chown -R %s /user/%s";
Constants.exec(String.format(mkdirStr, user));
Constants.exec(String.format(chownStr, user, user));
System.setProperty("HADOOP_USER_NAME", user);
System.setProperty("HADOOP_PROXY_USER", user);
}
if (cliParser.hasOption("help")) {
new HelpFormatter().printHelp("Client", opts);
return false;
}
return true;
}
public void stop() {
if (null != yarnClient) {
try {
yarnClient.stop();
} catch (Exception e) {
yarnManipulator.error("Error stopping yarn client", e);
}
}
if (null != fs) {
try {
IOUtils.close(fs);
} catch (IOException e) {
yarnManipulator.warn("Error closing file system, " + e.getMessage());
}
}
}
/**
* Main run function for the client
*
* @return true if application completed successfully
*/
public ApplicationId run() throws IOException, YarnException, URISyntaxException, ParseException {
yarnManipulator.info("init args");
if (!init()) {
// init() returns false only when help was requested; nothing to submit.
return null;
}
yarnManipulator.info("init args success");
yarnManipulator.info("Running Client");
yarnClient = YarnClient.createYarnClient();
this.conf.set("mapreduce.am.max-attempts", "3");
this.conf.set("yarn.resourcemanager.am.max-attempts", "3");
this.conf.set("yarn.client.failover-sleep-base-ms", "1000");
if ("kerberos".equals(this.authentication)) {
loginKerberos();
}
yarnClient.init(conf);
yarnClient.start();
yarnManipulator.info("Client start");
YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
yarnManipulator.info("Got Cluster metric info from ASM , numNodeManagers=" + clusterMetrics.getNumNodeManagers());
List<NodeReport> clusterNodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
yarnManipulator.info("Got Cluster node info from ASM");
for (NodeReport node : clusterNodeReports) {
yarnManipulator.info("Got node report from ASM for, nodeId=" + node.getNodeId() + ", nodeAddress" + node.getHttpAddress()
+ ", nodeRackName" + node.getRackName() + ", nodeNumContainers" + node.getNumContainers());
}
QueueInfo queueInfo = yarnClient.getQueueInfo(this.queue);
yarnManipulator.info("Queue info" + ", queueName=" + queueInfo.getQueueName() + ", queueCurrentCapacity=" + queueInfo.getCurrentCapacity()
+ ", queueMaxCapacity=" + queueInfo.getMaximumCapacity() + ", queueApplicationCount=" + queueInfo.getApplications().size()
+ ", queueChildQueueCount=" + queueInfo.getChildQueues().size());
if (queueInfo.getQueueStatistics().getNumAppsPending() > 10) {
throw new RuntimeException("Queue has more than 10 pending applications, please wait for them to complete");
}
List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
for (QueueUserACLInfo aclInfo : listAclInfo) {
for (QueueACL userAcl : aclInfo.getUserAcls()) {
yarnManipulator.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl=" + userAcl.name());
}
}
YarnClientApplication app = yarnClient.createApplication();
GetNewApplicationResponse appResponse = app.getNewApplicationResponse();
// TODO get min/max resource capabilities from RM and change memory ask if needed
// If we do not have min/max, we may not be able to correctly request
// the required resources from the RM for the app master
// Memory ask has to be a multiple of min and less than max.
// Dump out information about cluster capability as seen by the resource manager
long maxMem = appResponse.getMaximumResourceCapability().getMemorySize();
yarnManipulator.info("Max mem capabililty of resources in this cluster " + maxMem);
// A resource ask cannot exceed the max.
if (masterMemory + memoryOverhead > maxMem) {
yarnManipulator.info("AM memory specified above max threshold of cluster. Using max value.specified=" + masterMemory + ", max=" + maxMem);
masterMemory = maxMem - memoryOverhead;
}
int maxVCores = appResponse.getMaximumResourceCapability().getVirtualCores();
yarnManipulator.info("Max virtual cores capabililty of resources in this cluster " + maxVCores);
if (masterVCores > maxVCores) {
yarnManipulator.info("AM virtual cores specified above max threshold of cluster. Using max value." + ", specified=" + masterVCores + ", max=" + maxVCores);
masterVCores = maxVCores;
}
// set the application name
ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
ApplicationId appId = appContext.getApplicationId();
appContext.setApplicationName(appName + ":" + pch);
appContext.setApplicationType("datax");
// Set tags for the application
Set<String> tags = new HashSet<>(1 << 2);
tags.add("dataxtag" + ";" + appName + ";" + pch);
appContext.setApplicationTags(tags);
// Set the queue to which this application is to be submitted in the RM
appContext.setQueue(queue);
// Set the priority for the application master, what is the range for priority? how to decide?
appContext.setPriority(Priority.newInstance(amPriority));
appContext.setKeepContainersAcrossApplicationAttempts(keepContainers);
appContext.setResource(Resource.newInstance(masterMemory + memoryOverhead, masterVCores));
// setUnmanagedAM defaults to false: the AM runs in a container on a cluster node. Setting it to true (together with other settings) lets the AM run in a chosen environment, which is convenient for debugging
// appContext.setUnmanagedAM(false);
// Whether tokens are cancelled when the job completes; defaults to true
appContext.setCancelTokensWhenComplete(true);
// Maximum number of attempts after the job fails
appContext.setMaxAppAttempts(1);
// Validity interval for counting application attempt failures
appContext.setAttemptFailuresValidityInterval(30 * 1000L);
// Log aggregation context
// appContext.setLogAggregationContext();
// set local resources for the application master local files or archives as needed
yarnManipulator.info("Copy App Master jar from local filesystem and add to local environment");
// In this scenario, the jar file for the application master is part of the local resources
Map<String, LocalResource> localResources = new HashMap<>();
// Copy the application master jar to the filesystem
// Create a local resource to point to the destination jar path
fs = getFileSystem(conf, appMasterJar);
Path dst;
if (StringUtils.startsWithAny(appMasterJar, "hdfs", Constants.S_3_A, Constants.S_3_N, Constants.S_3, Constants.OSS)) {
String path = new Path(appMasterJar).toUri().getPath();
dst = fs.makeQualified(new Path(path));
} else {
dst = upLoad(fs, appMasterJar);
}
addlocalResources(fs, Constants.APP_MASTER_JAR_PATH, localResources, dst);
// YarnHelper.addFrameworkToDistributedCache(fs, dst.toUri().toString(), localResources, conf);
if (null != dataxHomeArchivePath) {
if (StringUtils.endsWith(dataxHomeArchivePath, ".tar.gz")) {
// addToLocalResources(fs, dataxHomeArchivePath, Constants.DATAX, localResources);
}
}
// Set the log4j properties if needed
if (!log4jPropFile.isEmpty()) {
addToLocalResources(fs, log4jPropFile, Constants.LOG_4_J_PATH, appId.toString(), localResources, null);
}
if (javaOpts.length > 0) {
addToLocalResources(fs, null, Constants.JAVA_OPTS_PATH, appId.toString(), localResources, StringUtils.join(javaOpts, " "));
}
if (shellArgs.length > 0) {
addToLocalResources(fs, null, Constants.SHELL_ARGS_PATH, appId.toString(), localResources, StringUtils.join(shellArgs, " "));
}
if ("kerberos".equals(this.authentication)) {
addToLocalResources(fs, krb5Conf, Constants.KRB5, appId.toString(), localResources, null);
addToLocalResources(fs, kerberosKeytab, Constants.KEYTAB, appId.toString(), localResources, null);
addToLocalResources(fs, configDir + "/" + Constants.CORE_SITE, Constants.CORE_SITE, appId.toString(), localResources, null);
addToLocalResources(fs, configDir + "/" + Constants.HDFS_SITE, Constants.HDFS_SITE, appId.toString(), localResources, null);
addToLocalResources(fs, configDir + "/" + Constants.YARN_SITE, Constants.YARN_SITE, appId.toString(), localResources, null);
}
// Set the env variables to be setup in the env where the application master will be run
yarnManipulator.info("Set the environment for the application master");
Map<String, String> env = new HashMap<>();
// env.put("CLASSPATH", YarnHelper.buildClassPathEnv(conf));
env.put(Constants.JAR_FILE_PATH, dst.toUri().toString());
// Add AppMaster.jar location to classpath
// At some point we should not be required to add the hadoop specific classpaths to the env.
// It should be provided out of the box.
// For now setting all required classpaths including the classpath to "." for the application jar
StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$$())
.append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./*")
.append(ApplicationConstants.CLASS_PATH_SEPARATOR)
.append(Constants.getDataxDir() + Constants.DATAX + "/lib/*");
for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH, YarnConfiguration.DEFAULT_YARN_CROSS_PLATFORM_APPLICATION_CLASSPATH)) {
classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
classPathEnv.append(c.trim());
}
classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./logback.xml");
classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./parquet-logging.properties");
// add the runtime classpath needed for tests to work
if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
classPathEnv.append(':');
classPathEnv.append(System.getProperty("java.class.path"));
}
env.put("LANG", "zh_CN.UTF-8");
env.put("CLASSPATH", classPathEnv.toString());
// Set the necessary command to execute the application master
Vector<CharSequence> vargs = new Vector<>(30);
// Set java executable command
yarnManipulator.info("Setting up app master command");
vargs.add("$JAVA_HOME/bin/java");
// Set Xmx based on am memory size
vargs.add("-Xms" + 128 + "m");
if (reflect) {
vargs.add("-Xmx" + masterMemory + "m");
}
// vargs.add("-XX:+TraceClassLoading");
vargs.add("-Dreflect=" + reflect);
vargs.add("-Dloglevel=info");
vargs.add("-Djava.security.egd=file:///dev/urandom");
vargs.add("-Duser.language=zh");
vargs.add("-Dfile.encoding=utf-8");
vargs.add("-Ddatax=" + dataxHomeArchivePath);
vargs.add("-Ddatax.jobid=" + appName);
vargs.add("-Ddatax.pch=" + pch);
vargs.add("-Ddatax.json=" + Base64Util.encode(dataxJson));
parameter.add("logback.configurationFile=file:"+Constants.getDataxDir() + Constants.DATAX +"/conf/eslogback.xml");
if (CollectionUtils.isNotEmpty(parameter)) {
for (String p : parameter) {
vargs.add("-D" + p);
}
}
if ("kerberos".equals(this.authentication)) {
vargs.add("-Dhadoop.security.authentication=kerberos");
vargs.add("-Dkerberos.userName=" + kerberosUserName);
}
vargs.add("-Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener");
// Set class name
vargs.add(appMasterMainClass);
// Set params for Application Master
vargs.add("--master_memory " + masterMemory);
vargs.add("--container_memory " + containerMemory);
vargs.add("--container_vcores " + containerVirtualCores);
vargs.add("--num_containers " + numContainers);
vargs.add("--memory_overhead " + memoryOverhead);
// Shell Command Container priority
int shellCmdPriority = 0;
vargs.add("--priority " + shellCmdPriority);
if (debugFlag) {
vargs.add("--debug");
}
for (Map.Entry<String, String> entry : shellEnv.entrySet()) {
vargs.add("--shell_env " + entry.getKey() + "=" + entry.getValue());
}
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");
// Get final command
StringBuilder command = new StringBuilder();
for (CharSequence str : vargs) {
command.append(str).append(" ");
}
yarnManipulator.info("Completed setting up app master command " + command);
List<String> commands = new ArrayList<>();
commands.add(command.toString());
// Set up the container launch context for the application master
ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance(localResources, env, commands, null, null, null);
// Service data is a binary blob that can be passed to the application Not needed in this scenario
// amContainer.setServiceData(serviceData);
// Setup security tokens
if (UserGroupInformation.isSecurityEnabled()) {
// Note: Credentials class is marked as LimitedPrivate for HDFS and MapReduce
Credentials credentials = new Credentials();
String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
if (tokenRenewer == null || tokenRenewer.length() == 0) {
throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
}
// For now, only getting tokens for the default file-system.
final Token<?>[] tokens = fs.addDelegationTokens(tokenRenewer, credentials);
if (tokens != null) {
for (Token<?> token : tokens) {
yarnManipulator.info("Got dt for " + fs.getUri() + "; " + token);
}
}
DataOutputBuffer dob = new DataOutputBuffer();
credentials.writeTokenStorageToStream(dob);
ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
amContainer.setTokens(fsTokens);
}
appContext.setAMContainerSpec(amContainer);
// Optionally set an explicit resource request for the ApplicationMaster container
// String hostName = "127.0.0.1";
// int numContainers = 1;
// ResourceRequest amRequest = ResourceRequest.newInstance(Priority.newInstance(10), hostName, Resource.newInstance(memory, vCores), numContainers);
// applicationSubmissionContext.setAMContainerResourceRequest(amRequest);
// Submit the application to the applications manager
// SubmitApplicationResponse submitResp = applicationsManager.submitApplication(appRequest);
// Ignore the response as either a valid response object is returned on success
// or an exception thrown to denote some form of a failure
yarnManipulator.info("Submitting application to ASM");
yarnClient.submitApplication(appContext);
// TODO: try submitting the same request again on app submission failure?
// Monitor the application
return appId;
}
private void loginKerberos() {
// If the cluster has Kerberos enabled, perform Kerberos authentication first
if ("kerberos".equals(this.authentication) && StringUtils.isNotBlank(this.kerberosUserName) && StringUtils.isNotBlank(this.kerberosKeytab)) {
System.setProperty("java.security.krb5.conf", this.krb5Conf);
// Enable automatic Kerberos renewal so the login does not expire; otherwise tickets expire (typically after 24 hours)
conf.set("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
UserGroupInformation.setConfiguration(conf);
try {
yarnManipulator.info("loginKerberos: " + this.kerberosUserName + ", " + this.kerberosKeytab);
UserGroupInformation.loginUserFromKeytab(this.kerberosUserName, this.kerberosKeytab);
} catch (Exception e) {
yarnManipulator.error("loginKerberos fail:", e);
}
}
}
/**
* Monitor the submitted application for completion.
* Kill application if time expires.
*
* @param appId Application Id of application to be monitored
* @return true if application completed successfully
*/
public boolean monitorApplication(ApplicationId appId) throws YarnException, IOException {
while (true) {
// Check app status every 6 seconds.
try {
Thread.sleep(6000);
} catch (InterruptedException e) {
yarnManipulator.error("Thread sleep in monitoring loop interrupted");
}
// Get application report for the appId we are interested in
ApplicationReport report = yarnClient.getApplicationReport(appId);
yarnManipulator.info("Got application report from ASM for"
+ ", appId=" + appId.getId()
+ ", clientToAMToken=" + report.getClientToAMToken()
+ ", appDiagnostics=" + report.getDiagnostics()
+ ", appMasterHost=" + report.getHost()
+ ", appQueue=" + report.getQueue()
+ ", appMasterRpcPort=" + report.getRpcPort()
+ ", appStartTime=" + report.getStartTime()
+ ", yarnAppState=" + report.getYarnApplicationState().toString()
+ ", distributedFinalState=" + report.getFinalApplicationStatus().toString()
+ ", appTrackingUrl=" + report.getTrackingUrl()
+ ", appUser=" + report.getUser());
YarnApplicationState state = report.getYarnApplicationState();
FinalApplicationStatus dsStatus = report.getFinalApplicationStatus();
if (YarnApplicationState.FINISHED == state) {
if (FinalApplicationStatus.SUCCEEDED == dsStatus) {
yarnManipulator.info("Application has completed successfully. Breaking monitoring loop");
return true;
} else {
yarnManipulator.info("Application did finished unsuccessfully. " +
"=" + state + ", DSFinalStatus=" + dsStatus.toString() + ". Breaking monitoring loop");
return false;
}
} else if (YarnApplicationState.KILLED == state || YarnApplicationState.FAILED == state) {
yarnManipulator.info("Application did not finish."
+ " YarnState=" + state + ", DSFinalStatus=" + dsStatus.toString() + ". Breaking monitoring loop");
return false;
}
if (clientTimeout > 0 && System.currentTimeMillis() > (clientStartTime + clientTimeout)) {
yarnManipulator.info("Reached client specified timeout for application. Killing application");
forceKillApplication(appId);
return false;
}
}
}
/**
* Kill a submitted application by sending a call to the ASM
*
* @param appId Application Id to be killed.
*/
private void forceKillApplication(ApplicationId appId) throws YarnException, IOException {
// TODO clarify whether multiple jobs with the same app id can be submitted and be running at
// the same time.
// If yes, can we kill a particular attempt only?
// Response can be ignored as it is non-null on success or throws an exception in case of failures
yarnClient.killApplication(appId);
}
private Path upLoad(FileSystem fs, String path) throws IOException {
String name = FileUtils.getFile(path).getName();
Path yarnJar = new Path(fs.getHomeDirectory(), "datax_on_yarn_jar/" + name);
if (!fs.exists(yarnJar) || fs.getFileStatus(yarnJar).getLen() != FileUtils.getFile(path).length()) {
fs.copyFromLocalFile(new Path(path), yarnJar);
}
return yarnJar;
}
private void addToLocalResources(FileSystem fs, String fileSrcPath, String fileDstPath, String appId,
Map<String, LocalResource> localResources, String resources) throws IOException {
String suffix = appName + "/" + appId + "/" + fileDstPath;
Path dst = new Path(fs.getHomeDirectory(), suffix);
if (fileSrcPath == null) {
FSDataOutputStream ostream = null;
try {
ostream = FileSystem.create(fs, dst, new FsPermission((short) 0710));
ostream.writeUTF(resources);
} finally {
IOUtils.closeQuietly(ostream);
}
} else {
fs.copyFromLocalFile(new Path(fileSrcPath), dst);
}
addlocalResources(fs, fileDstPath, localResources, dst);
}
private static void addlocalResources(FileSystem fs, String fileDstPath, Map<String, LocalResource> localResources, Path dst) throws IOException {
FileStatus scFileStatus = fs.getFileStatus(dst);
LocalResource scRsrc =
LocalResource.newInstance(
ConverterUtils.getYarnUrlFromURI(dst.toUri()),
LocalResourceType.FILE, LocalResourceVisibility.APPLICATION,
scFileStatus.getLen(), scFileStatus.getModificationTime());
localResources.put(fileDstPath, scRsrc);
}
// private Path addToLocalResources(FileSystem fs, String path, String name, Map<String, LocalResource> localResources) throws IOException {
// Path dst = fs.makeQualified(new Path(path));
// FileStatus scFileStatus = fs.getFileStatus(dst);
// LocalResource scRsrc = LocalResource.newInstance(
// ConverterUtils.getYarnUrlFromURI(dst.toUri()),
// LocalResourceType.ARCHIVE, LocalResourceVisibility.APPLICATION,
// scFileStatus.getLen(), scFileStatus.getModificationTime());
// localResources.put(name, scRsrc);
// return dst;
// }
public static FileSystem getFileSystem(Configuration conf, String path) throws IOException, URISyntaxException {
FileSystem fs;
if (StringUtils.startsWithAny(path, Constants.S_3_A, Constants.S_3_N, Constants.S_3)) {
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
fs = FileSystem.get(new URI(path), conf);
} else if (StringUtils.startsWithAny(path, Constants.OSS)) {
fs = FileSystem.get(new URI(path), conf);
} else {
fs = FileSystem.get(conf);
}
return fs;
}
public void setKerberos(String krb5Conf, String kerberosUserName, String kerberosKeytab, String configDir) {
this.configDir = configDir;
this.krb5Conf = krb5Conf;
this.kerberosUserName = kerberosUserName;
this.kerberosKeytab = kerberosKeytab;
this.authentication = "kerberos";
System.setProperty("hadoop.security.authentication", "kerberos");
}
}
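For reference, DataxJob.toStringArray() must produce a flat argument array matching the options registered in the Client constructor; a hypothetical example of such an array (all values are placeholders, not from the original post):
// Hypothetical argument array matching the options parsed in Client#init:
String[] args = {
        "-appName", "dataxDemo",
        "-pch", "20240101",
        "-mainJar", "/work/datax-on-yarn-0.0.1-SNAPSHOT.jar",
        "-jsonPath", "/job/job.json",
        "-datax_home_hdfs", "hdfs:///datax/datax.tar.gz",
        "-master_memory", "1024",
        "-queue", "default",
        "-p", "k1=v1,k2=v2"
};
After run() returns the ApplicationId, monitorApplication(appId) can be used to block until the application reaches a terminal state; on a Kerberized cluster, setKerberos(...) should be called before run().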
ApplicationMaster
package com.on.yarn;
import com.google.common.annotations.VisibleForTesting;
import com.on.yarn.constant.Constants;
import com.on.yarn.datax.DataXExecutor;
import com.on.yarn.datax.DataXPidExecutor;
import com.on.yarn.datax.Executor;
import com.on.yarn.util.Log4jPropertyHelper;
import lombok.Data;
import org.apache.commons.cli.*;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.log4j.LogManager;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
@Data
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class ApplicationMaster {
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
// Configuration
private Configuration conf;
// Handle to communicate with the Resource Manager
private AMRMClientAsync amRMClient;
// In both secure and non-secure modes, this points to the job-submitter.
private UserGroupInformation appSubmitterUgi;
// Handle to communicate with the Node Manager
private NMClientAsync nmClientAsync;
// Listen to process the response from the Node Manager
private NMCallbackHandler containerListener;
// Application Attempt Id ( combination of attemptId and fail count )
@VisibleForTesting
protected ApplicationAttemptId appAttemptID;
// TODO
// For status update for clients - yet to be implemented
// Hostname of the container
private String appMasterHostname = "";
// Port on which the app master listens for status updates from clients
private int appMasterRpcPort = -1;
// Tracking url to which app master publishes info for clients to monitor
private String appMasterTrackingUrl = "";
// App Master configuration
// No. of containers to run shell command on
@VisibleForTesting
protected int numTotalContainers = 1;
// Memory to request for the container on which the shell command will run
private int containerMemory = 10;
// VirtualCores to request for the container on which the shell command will run
private int containerVirtualCores = 1;
// Priority of the request
private int requestPriority;
// Counter for completed containers ( complete denotes successful or failed )
private AtomicInteger numCompletedContainers = new AtomicInteger();
// Allocated container count so that we know how many containers has the RM allocated to use
@VisibleForTesting
protected AtomicInteger numAllocatedContainers = new AtomicInteger();
// Count of failed containers
private AtomicInteger numFailedContainers = new AtomicInteger();
// Count of containers already requested from the RM
// Needed as once requested, we should not request for containers again.
// Only request for more if the original requirement changes.
@VisibleForTesting
protected AtomicInteger numRequestedContainers = new AtomicInteger();
// Args to be passed to the shell command
private String shellArgs = "";
private String javaOpts = "";
// Env variables to be setup for the shell command
private Map<String, String> shellEnv = new HashMap<>();
private static volatile boolean done;
private static volatile boolean doneDataX = false;
private ByteBuffer allTokens;
// Launch threads
private List<Thread> launchThreads = new ArrayList<>();
private ConcurrentHashMap<ContainerId, Container> runningContainers = new ConcurrentHashMap<>();
//private YarnAppMasterHttpServer httpServer;
public static final int DEFAULT_APP_MASTER_TRACKING_URL_PORT = 8090;
// Container memory overhead in MB
private int memoryOverhead = 10;
private static int amMemory = 128;
private static Process pro;
private static Executor dataXExecutor = null;
public ApplicationMaster() {
// Set up the configuration
conf = new YarnConfiguration();
conf.set("yarn.resourcemanager.am.max-attempts", "3");
conf.set("yarn.client.failover-sleep-base-ms", "1000");
conf.set("mapreduce.am.max-attempts", "3");
}
public static void main(String[] args) {
boolean result = false;
ApplicationMaster appMaster = null;
try {
appMaster = new ApplicationMaster();
LOG.info("Initializing ApplicationMaster");
boolean doRun = appMaster.init(args);
if (!doRun) {
System.exit(0);
}
appMaster.run();
LOG.info("ApplicationMaster finish...");
if ("true".equals(System.getProperty("reflect"))) {
dataXExecutor = new DataXExecutor();
} else {
dataXExecutor = new DataXPidExecutor(amMemory);
}
dataXExecutor.run();
appMaster.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "FINISHED");
done = true;
doneDataX = true;
result = appMaster.finish();
LOG.info("ApplicationMaster finish");
} catch (Throwable t) {
LOG.fatal("Error running ApplicationMaster", t);
if (appMaster != null) {
appMaster.unregisterApplicationMaster(FinalApplicationStatus.FAILED, ExceptionUtils.getFullStackTrace(t));
}
LogManager.shutdown();
ExitUtil.terminate(1, t);
} finally {
if (result) {
LOG.info("Application Master completed successfully. exiting");
System.exit(0);
} else {
LOG.info("Application Master failed. exiting");
System.exit(2);
}
}
}
/**
* Parse command line options
*
* @param args Command line args
* @return Whether init successful and run should be invoked
* @throws ParseException
* @throws IOException
*/
public boolean init(String[] args) throws ParseException, IOException {
Options opts = new Options();
opts.addOption("app_attempt_id", true, "App Attempt ID. Not to be used unless for testing purposes");
opts.addOption("debug", false, "Dump out debug information");
opts.addOption("priority", true, "Application Priority. Default 0");
opts.addOption("master_memory", true, "Amount of memory in MB to be requested to run the application master");
opts.addOption("container_memory", true, "Amount of memory in MB to be requested to run the shell command");
opts.addOption("container_vcores", true, "Amount of virtual cores to be requested to run the shell command");
opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed");
opts.addOption("memory_overhead", true, "Amount of memory overhead in MB for container");
opts.addOption("java_opts", true, "Java opts for container");
opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs");
opts.addOption("help", false, "Print usage");
CommandLine cliParser = new GnuParser().parse(opts, args);
if (args.length == 0) {
new HelpFormatter().printHelp("ApplicationMaster", opts);
throw new IllegalArgumentException("No args specified for application master to initialize");
}
//Check whether customer log4j.properties file exists
if (fileExist(Constants.LOG_4_J_PATH)) {
try {
Log4jPropertyHelper.updateLog4jConf(ApplicationMaster.class, Constants.LOG_4_J_PATH);
} catch (Exception e) {
LOG.warn("Can not set up custom log4j properties. " + e);
}
}
if (cliParser.hasOption("debug")) {
dumpOutDebugInfo();
}
requestPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0"));
amMemory = Integer.parseInt(cliParser.getOptionValue("master_memory", "128"));
if (amMemory < 0) {
throw new IllegalArgumentException("Invalid memory specified for application master, exiting." + " Specified memory=" + amMemory);
}
Map<String, String> envs = System.getenv();
if (!envs.containsKey(Environment.CONTAINER_ID.name())) {
if (cliParser.hasOption("app_attempt_id")) {
appAttemptID = ConverterUtils.toApplicationAttemptId(cliParser.getOptionValue("app_attempt_id", ""));
} else {
throw new IllegalArgumentException("Application Attempt Id not set in the environment");
}
} else {
ContainerId containerId = ConverterUtils.toContainerId(envs.get(Environment.CONTAINER_ID.name()));
appAttemptID = containerId.getApplicationAttemptId();
}
if (!envs.containsKey(ApplicationConstants.APP_SUBMIT_TIME_ENV)) {
throw new RuntimeException(ApplicationConstants.APP_SUBMIT_TIME_ENV + " not set in the environment");
}
if (!envs.containsKey(Environment.NM_HOST.name())) {
throw new RuntimeException(Environment.NM_HOST.name() + " not set in the environment");
}
if (!envs.containsKey(Environment.NM_HTTP_PORT.name())) {
throw new RuntimeException(Environment.NM_HTTP_PORT + " not set in the environment");
}
if (!envs.containsKey(Environment.NM_PORT.name())) {
throw new RuntimeException(Environment.NM_PORT.name() + " not set in the environment");
}
LOG.info("Application master for app, appId=" + appAttemptID.getApplicationId().getId()
+ ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp()
+ ", attemptId=" + appAttemptID.getAttemptId());
containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "10"));
containerVirtualCores = Integer.parseInt(cliParser.getOptionValue("container_vcores", "1"));
numTotalContainers = Integer.parseInt(cliParser.getOptionValue("num_containers", "1"));
if (numTotalContainers == 0) {
throw new IllegalArgumentException("Cannot run distributed shell with no containers");
}
memoryOverhead = Integer.parseInt(cliParser.getOptionValue("memory_overhead", "1"));
if (fileExist(Constants.JAVA_OPTS_PATH)) {
javaOpts = readContent(Constants.JAVA_OPTS_PATH);
}
if (fileExist(Constants.SHELL_ARGS_PATH)) {
shellArgs = readContent(Constants.SHELL_ARGS_PATH);
}
if (cliParser.hasOption("shell_env")) {
String[] shellEnvs = cliParser.getOptionValues("shell_env");
for (String env : shellEnvs) {
env = env.trim();
int index = env.indexOf('=');
if (index == -1) {
shellEnv.put(env, "");
continue;
}
String key = env.substring(0, index);
String val = "";
if (index < (env.length() - 1)) {
val = env.substring(index + 1);
}
shellEnv.put(key, val);
}
}
if (cliParser.hasOption("help")) {
new HelpFormatter().printHelp("ApplicationMaster", opts);
return false;
}
return true;
}
/**
* Dump out contents of $CWD and the environment to stdout for debugging
*/
private void dumpOutDebugInfo() {
LOG.info("Dump debug output");
Map<String, String> envs = System.getenv();
for (Map.Entry<String, String> env : envs.entrySet()) {
LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue());
}
BufferedReader buf = null;
try {
String lines = Shell.WINDOWS ? Shell.execCommand("cmd", "/c", "dir") : Shell.execCommand("ls", "-al");
buf = new BufferedReader(new StringReader(lines));
String line;
while ((line = buf.readLine()) != null) {
LOG.info("System CWD content: " + line);
}
} catch (IOException e) {
LOG.error("Error in ApplicationMaster while dumping debug output", e);
} finally {
IOUtils.cleanup(LOG, buf);
}
}
/**
* Main run function for the application master
*
* @throws YarnException
* @throws IOException
*/
@SuppressWarnings({"unchecked"})
public void run() throws Throwable {
LOG.info("Starting ApplicationMaster");
// Note: Credentials, Token, UserGroupInformation, DataOutputBuffer class are marked as LimitedPrivate
DataOutputBuffer dob = new DataOutputBuffer();
Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
credentials.writeTokenStorageToStream(dob);
// Now remove the AM->RM token so that containers cannot access it.
Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
LOG.info("Executing with tokens:");
while (iter.hasNext()) {
Token<?> token = iter.next();
LOG.info(token);
if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
iter.remove();
}
}
allTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
// Create appSubmitterUgi and add original tokens to it
String appSubmitterUserName = System.getenv(Environment.USER.name());
appSubmitterUgi = UserGroupInformation.createRemoteUser(appSubmitterUserName);
appSubmitterUgi.addCredentials(credentials);
AMRMClientAsync.CallbackHandler allocListener = new RMCallbackHandler();
amRMClient = AMRMClientAsync.createAMRMClientAsync(1000, allocListener);
amRMClient.init(conf);
amRMClient.start();
containerListener = createNMCallbackHandler();
nmClientAsync = new NMClientAsyncImpl(containerListener);
nmClientAsync.init(conf);
nmClientAsync.start();
appMasterHostname = NetUtils.getHostname();
// Setup local RPC Server to accept status requests directly from clients
// TODO need to setup a protocol for client to be able to communicate to
// the RPC server
// TODO use the rpc port info to register with the RM for the client to
// send requests to this app master
// Register self with ResourceManager
// This will start heartbeating to the RM
RegisterApplicationMasterResponse response = amRMClient.registerApplicationMaster(appMasterHostname, appMasterRpcPort, appMasterTrackingUrl);
// Setup local RPC Server to accept status requests directly from clients
// TODO need to setup a protocol for client to be able to communicate to
// the RPC server
// TODO use the rpc port info to register with the RM for the client to
// send requests to this app master
// Dump out information about cluster capability as seen by the
// resource manager
int maxMem = response.getMaximumResourceCapability().getMemory();
LOG.info("Max mem capability of resources in this cluster " + maxMem);
int maxVCores = response.getMaximumResourceCapability().getVirtualCores();
LOG.info("Max vcores capability of resources in this cluster " + maxVCores);
// A resource ask cannot exceed the max.
if (containerMemory + memoryOverhead > maxMem) {
LOG.info("Container memory specified above max threshold of cluster."
+ " Using max value." + ", specified=" + (containerMemory + memoryOverhead) + ", max=" + maxMem);
containerMemory = maxMem - memoryOverhead;
}
if (containerVirtualCores > maxVCores) {
LOG.info("Container virtual cores specified above max threshold of cluster."
+ " Using max value." + ", specified=" + containerVirtualCores + ", max=" + maxVCores);
containerVirtualCores = maxVCores;
}
List<Container> previousAMRunningContainers = response.getContainersFromPreviousAttempts();
LOG.info(appAttemptID + " received " + previousAMRunningContainers.size() + " previous attempts' running containers on AM registration.");
numAllocatedContainers.addAndGet(previousAMRunningContainers.size());
recoverExecutors(previousAMRunningContainers);
int numTotalContainersToRequest = numTotalContainers - previousAMRunningContainers.size();
// Setup ask for containers from RM
// Send request for containers to RM
// Until we get our fully allocated quota, we keep on polling RM for
// containers
// Keep looping until all the containers are launched and shell script
// executed on them ( regardless of success/failure).
requestKContainers(numTotalContainersToRequest);
}
@VisibleForTesting
NMCallbackHandler createNMCallbackHandler() {
return new NMCallbackHandler(this);
}
@VisibleForTesting
protected boolean finish() {
// wait for completion.
while (!done) {
try {
Thread.sleep(1000);
} catch (InterruptedException ex) {
}
}
// Join all launched threads
// needed for when we time out
// and we need to release containers
for (Thread launchThread : launchThreads) {
try {
launchThread.join(10000);
} catch (InterruptedException e) {
LOG.error("Exception thrown in thread join: ", e);
}
}
// When the application completes, it should stop all running containers
LOG.info("Application completed. Stopping running containers");
nmClientAsync.stop();
// When the application completes, it should send a finish application
// signal to the RM
LOG.info("Application completed. Signalling finish to RM");
FinalApplicationStatus appStatus;
String appMessage = null;
boolean success = true;
// TODO: 2023/2/15 consider finishing directly here without waiting for the node threads
if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers) {
appStatus = FinalApplicationStatus.SUCCEEDED;
} else {
if (doneDataX) {
appStatus = FinalApplicationStatus.SUCCEEDED;
} else {
appStatus = FinalApplicationStatus.FAILED;
appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed=" + numCompletedContainers.get() + ", allocated="
+ numAllocatedContainers.get() + ", failed=" + numFailedContainers.get();
success = false;
}
}
unregisterApplicationMaster(appStatus, appMessage);
amRMClient.stop();
return success;
}
public void unregisterApplicationMaster(FinalApplicationStatus appStatus, String appMessage) {
try {
amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
} catch (YarnException | IOException ex) {
LOG.error("Failed to unregister application", ex);
}
}
/**
* RMCallbackHandler
*/
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
for (ContainerStatus containerStatus : completedContainers) {
LOG.info(appAttemptID + " got container status for containerID=" + containerStatus.getContainerId()
+ ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
// non complete containers should not be here
assert (containerStatus.getState() == ContainerState.COMPLETE);
// increment counters for completed/failed containers
int exitStatus = containerStatus.getExitStatus();
if (0 != exitStatus) {
// container failed
if (ContainerExitStatus.ABORTED != exitStatus) {
// shell script failed counts as completed
numCompletedContainers.incrementAndGet();
numFailedContainers.incrementAndGet();
} else {
// container was killed by framework, possibly preempted
// we should re-try as the container was lost for some reason
numAllocatedContainers.decrementAndGet();
numRequestedContainers.decrementAndGet();
// we do not need to release the container as it would be done by the RM
}
} else {
// nothing to do container completed successfully
numCompletedContainers.incrementAndGet();
LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
}
runningContainers.remove(containerStatus.getContainerId());
}
// ask for more containers if any failed
int askCount = numTotalContainers - numRequestedContainers.get();
numRequestedContainers.addAndGet(askCount);
if (askCount > 0) {
for (int i = 0; i < askCount; ++i) {
ContainerRequest containerAsk = setupContainerAskForRM();
amRMClient.addContainerRequest(containerAsk);
}
}
if (numCompletedContainers.get() == numTotalContainers) {
done = true;
}
}
@Override
public void onContainersAllocated(List<Container> allocatedContainers) {
LOG.info("Got response from RM for container ask, allocatedCnt=" + allocatedContainers.size());
// Extra allocations can arrive across multiple callbacks; ignore them once we already have the expected number of containers.
if (runningContainers.size() >= numTotalContainers) {
return;
}
numAllocatedContainers.addAndGet(allocatedContainers.size());
for (Container allocatedContainer : allocatedContainers) {
LOG.info("Launching shell command on a new container"
+ ", containerId=" + allocatedContainer.getId()
+ ", containerNode=" + allocatedContainer.getNodeId().getHost() + ":" + allocatedContainer.getNodeId().getPort()
+ ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress()
+ ", containerResourceMemory=" + allocatedContainer.getResource().getMemory()
+ ", containerResourceVirtualCores=" + allocatedContainer.getResource().getVirtualCores()
+ ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
LaunchContainerRunnable runnableLaunchContainer = new LaunchContainerRunnable(allocatedContainer, containerListener);
Thread launchThread = new Thread(runnableLaunchContainer);
// launch and start the container on a separate thread to keep the main thread unblocked as all containers may not be allocated at one go.
launchThreads.add(launchThread);
launchThread.start();
}
}
@Override
public void onShutdownRequest() {
done = true;
}
@Override
public void onNodesUpdated(List<NodeReport> updatedNodes) {
}
@Override
public float getProgress() {
// set progress to deliver to RM on next heartbeat
return (float) numCompletedContainers.get() / numTotalContainers;
}
@Override
public void onError(Throwable e) {
done = true;
amRMClient.stop();
}
}
/**
* NMCallbackHandler
*/
static class NMCallbackHandler implements NMClientAsync.CallbackHandler {
private ConcurrentMap<ContainerId, Container> containers = new ConcurrentHashMap<>();
private final ApplicationMaster applicationMaster;
NMCallbackHandler(ApplicationMaster applicationMaster) {
this.applicationMaster = applicationMaster;
}
void addContainer(ContainerId containerId, Container container) {
containers.putIfAbsent(containerId, container);
}
@Override
public void onContainerStopped(ContainerId containerId) {
LOG.info("Succeeded to stop Container " + containerId);
applicationMaster.runningContainers.remove(containerId);
containers.remove(containerId);
}
@Override
public void onContainerStatusReceived(ContainerId containerId, ContainerStatus containerStatus) {
LOG.debug("Container Status: id=" + containerId + ", status=" + containerStatus);
}
@Override
public void onContainerStarted(ContainerId containerId, Map<String, ByteBuffer> allServiceResponse) {
LOG.debug("Succeeded to start Container " + containerId);
Container container = containers.get(containerId);
if (container != null) {
applicationMaster.nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId());
}
}
@Override
public void onStartContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to start Container " + containerId);
containers.remove(containerId);
applicationMaster.runningContainers.remove(containerId);
applicationMaster.numCompletedContainers.incrementAndGet();
applicationMaster.numFailedContainers.incrementAndGet();
}
@Override
public void onGetContainerStatusError(ContainerId containerId, Throwable t) {
LOG.error("Failed to query the status of Container " + containerId);
}
@Override
public void onStopContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to stop Container " + containerId);
applicationMaster.runningContainers.remove(containerId);
containers.remove(containerId);
}
}
/**
* Thread to connect to the {@link ContainerManagementProtocol} and launch the container
* that will execute the shell command.
*/
private class LaunchContainerRunnable implements Runnable {
// Allocated container
Container container;
NMCallbackHandler containerListener;
/**
* @param lcontainer Allocated container
* @param containerListener Callback handler of the container
*/
LaunchContainerRunnable(Container lcontainer, NMCallbackHandler containerListener) {
this.container = lcontainer;
this.containerListener = containerListener;
}
/**
 * Connects to CM, sets up the container launch context
 * for the shell command and eventually dispatches the container
 * start request to the CM.
 */
@Override
public void run() {
LOG.info("Setting up container launch container for containerId=" + container.getId());
List<String> commands = new ArrayList<>();
commands.add("ls");
// Set up ContainerLaunchContext, setting local resource, environment, command and token for constructor.
// Note for tokens: set up tokens for the container too. Today, for normal shell commands, the container in distributed-shell doesn't need any tokens.
// We are populating them mainly for NodeManagers to be able to download any files in the distributed file-system.
// The tokens are otherwise also useful in cases, for e.g., when one is running a "hadoop dfs" command inside the distributed shell.
ContainerLaunchContext ctx = ContainerLaunchContext.newInstance(null, shellEnv, commands, null, allTokens.duplicate(), null);
runningContainers.putIfAbsent(container.getId(), container);
containerListener.addContainer(container.getId(), container);
nmClientAsync.startContainerAsync(container, ctx);
}
}
/**
* Setup the request that will be sent to the RM for the container ask.
*
* @return the setup ResourceRequest to be sent to RM
*/
private ContainerRequest setupContainerAskForRM() {
// setup requirements for hosts using * as any host will do for the distributed shell app
// Set up resource type requirements
// For now, memory and CPU are supported so we set memory and cpu requirements
Resource capability = Resource.newInstance(containerMemory + memoryOverhead, containerVirtualCores);
// set the priority for the request
// TODO - what is the range for priority? how to decide?
Priority pri = Priority.newInstance(requestPriority);
ContainerRequest request = new ContainerRequest(capability, null, null, pri);
LOG.info("Requested container ask: " + request);
return request;
}
private void recoverExecutors(List<Container> previousAMRunningContainers) {
for (Container container : previousAMRunningContainers) {
runningContainers.putIfAbsent(container.getId(), container);
}
}
private void requestKContainers(int askCount) {
LOG.info("Request new containers count:" + askCount);
for (int i = 0; i < askCount; ++i) {
ContainerRequest containerAsk = setupContainerAskForRM();
amRMClient.addContainerRequest(containerAsk);
}
numRequestedContainers.set(numTotalContainers);
}
private synchronized void askMoreContainersIfNecessary() {
int askCount = numTotalContainers - runningContainers.size();
if (askCount > 0) {
LOG.info("Request more containers count:" + askCount);
requestKContainers(askCount);
} else {
LOG.info("No more to ask for containers");
}
}
public ConcurrentHashMap<ContainerId, Container> getRunningContainers() {
return runningContainers;
}
private boolean fileExist(String filePath) {
return new File(filePath).exists();
}
private String readContent(String filePath) throws IOException {
DataInputStream ds = null;
try {
ds = new DataInputStream(new FileInputStream(filePath));
return ds.readUTF();
} finally {
org.apache.commons.io.IOUtils.closeQuietly(ds);
}
}
}
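The Executor implementations used in main (DataXExecutor for in-process reflection, DataXPidExecutor for a forked JVM) are part of the same project and not shown in this post; judging from their usage, the interface is presumably something like the following (an assumption, not the project's actual source):
public interface Executor {
    // Runs the DataX job to completion; implementations either invoke the
    // DataX engine in-process via reflection or fork a separate JVM.
    void run() throws Exception;
}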
Submission code example
public static void main(String[] args) throws Exception {
DataxJob job = new DataxJob();
job.setAppName("0101");
job.setPch("0101");
job.setMainJar("/work/datax-on-yarn-0.0.1-SNAPSHOT.jar");
job.setJsonPath(new File("/job/job.json"));
job.setDataxHome("/work/datax.tar.gz");
job.setReflect(true);
job.setParameter(Lists.newArrayList("datax.jobinfo=" +
Base64Util.encode("{" +
"\"jobId\":\"0101\"," +
"\"rwId\":\"0101\"," +
"\"zrwId\":\"0101\"," +
"\"zrwpch\":\"0101\"," +
"\"rwlyDm\":\"01\"," +
"\"elasticsearch.url\":\"esip:port\"," +
"\"elasticsearch.username\":\"elastic\"," +
"\"elasticsearch.password\":\"pass\"," +
"\"bootstrap.servers\":\"kafkaip:9591\"," +
"\"security.protocol\":\"SASL_PLAINTEXT\"," +
"\"sasl.mechanism\":\"SCRAM-SHA-256\"," +
"\"sasl.jaas.config\":\"org.apache.kafka.common.security.scram.ScramLoginModule\\u0020required username=\\\"admin\\\"\\u0020password=\\\"kafkapass\\\";\"," +
"}")));
Client client = new Client(createConfiguration(), job);
ApplicationId applicationId = client.run();
client.stop();
System.out.println(applicationId);
}
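The sample calls a createConfiguration() helper that is not shown above. A minimal sketch, assuming the cluster's config files live under /etc/hadoop/conf (the path is an assumption; adjust to your environment):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

private static Configuration createConfiguration() {
    // Load core/hdfs/yarn configs so the client can locate the ResourceManager and the default file system.
    Configuration conf = new YarnConfiguration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
    conf.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
    return conf;
}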
Reference
https://github.com/duhanmin/datax-on-yarn