datax on yarn(datax在yarn上运行)

背景

公司的数据集成平台有较多数据集成任务,目前主要通过 DataX 实现。由于开源版本只有本地运行模式,对本地资源要求较高,这里记录一下在 Spring Boot 中如何通过 API 把一个 DataX 任务以 application 的方式提交到 YARN 集群,以提高并行能力。

代码实现

client提交实现

@InterfaceAudience.Public
@InterfaceStability.Unstable
public class Client {

    // Application master specific info to register a new Application with RM/ASM
    private String appName = "";
    // Value of the "pch" option; it is appended to the application name and tags and
    // forwarded to the application master as -Ddatax.pch.
    // NOTE(review): presumably a batch identifier — confirm.
    private String pch = "";
    // datax job json
    private String dataxJson = "";
    // datax home path
    private String dataxHomeArchivePath = "";
    // if log4j.properties file available, add to local resources and set into classpath
    private String log4jPropFile = "";
    // reflect run
    private boolean reflect = true;
    // DataX parameters; each entry is passed to the application master JVM as a -D option
    private final List<String> parameter = new ArrayList<>();
    // Debug flag
    private boolean debugFlag = false;
    // Queue for App master
    private String queue = "";

    // Application master jar file
    private String appMasterJar = "";
    // Amt. of memory resource to request for to run the App Master
    private long masterMemory = 128;
    // Amt. of virtual core resource to request for to run the App Master
    private int masterVCores = 1;
    // App master priority
    private int amPriority = 0;
    // Amt of memory to request for container in which shell script will be executed
    private int containerMemory = 10;
    // Amt. of virtual cores to request for container in which shell script will be executed
    private int containerVirtualCores = 1;
    // No. of containers in which the shell script needs to be executed
    private int numContainers = 1;
    // flag to indicate whether to keep containers across application attempts.
    private boolean keepContainers = false;
    // Memory overhead in MB added to the application master memory request.
    // init() overwrites this with the memory_overhead option, which defaults to 2 when absent.
    private int memoryOverhead = 50;
    // Java opts for the container; written to a resource file when non-empty
    private String[] javaOpts = new String[]{};
    // Command line args for the shell script; written to a resource file when non-empty
    private String[] shellArgs = new String[]{};
    // Env variables to be setup for the shell command
    private final Map<String, String> shellEnv = new HashMap<>();

    // Main class to invoke application master
    private final String appMasterMainClass;
    // yarnManipulator: logging facade used by this client.
    // NOTE: static, but re-assigned by every constructor call.
    private static YarnManipulator yarnManipulator;
    // Start time for client
    private final long clientStartTime = System.currentTimeMillis();
    // Timeout threshold for client. Kill app after time interval expires.
    // -1 means no timeout so that the application will not be killed after timeout, in other words, long time running job will be kept running.
    private long clientTimeout = -1;

    // Raw command line arguments parsed by init()
    private final String[] args;
    // Command line options
    private final Options opts;
    // Configuration
    private final Configuration conf;
    // yarnClient
    private YarnClient yarnClient;
    // File system used to stage the application master jar and resource files
    private FileSystem fs;
    // hadoop.security.authentication = kerberos
    private String authentication;

    // Path of the krb5 configuration file (set through setKerberos)
    private String krb5Conf;

    // Kerberos principal used for the keytab login
    private String kerberosUserName;

    // Path of the Kerberos keytab file
    private String kerberosKeytab;

    // Directory containing the core-site, hdfs-site and yarn-site files shipped to the application master
    private String configDir;

    /**
     * Creates a client that submits the given DataX job, using
     * {@code com.on.yarn.ApplicationMaster} as the application master main class.
     *
     * @param conf Hadoop configuration used for the YARN client and the file system
     * @param job  job description; the result of its {@code toStrinArray()} is used as
     *             the command line arguments parsed later by {@code init()}
     */
    public Client(Configuration conf, DataxJob job) {
        this("com.on.yarn.ApplicationMaster", conf, job.toStrinArray());
    }

    private Client(String appMasterMainClass, Configuration conf, String[] args) {
        this.appMasterMainClass = appMasterMainClass;
        yarnManipulator = new LocalYarnManipulator();
        this.args = args;
        this.conf = conf;
        opts = new Options();
        opts.addOption("appName", true, "Application Name. Default value - DistributedShell");
        opts.addOption("pch", true, "Application pch. Default value - DistributedShell");
        opts.addOption("mainJar", true, "Jar file containing the application master in local file system");
        opts.addOption("jsonPath", true, "datax Json file");
        opts.addOption("datax_home_hdfs", true, "Jar file containing the application master in HDFS");
        opts.addOption("log_properties", true, "log4j.properties file");
        opts.addOption("reflect", true, "DataX job是否反射运行");
        opts.addOption("p", true, "DataX传参");
        opts.addOption("debug", false, "Dump out debug information");
        opts.addOption("queue", true, "RM Queue in which this application is to be submitted");

        opts.addOption("master_memory", true, "Amount of memory in MB to be requested to run the application master");
        opts.addOption("master_vcores", true, "Amount of virtual cores to be requested to run the application master");
        opts.addOption("priority", true, "Application Priority. Default 0");
        opts.addOption("container_memory", true, "Amount of memory in MB to be requested to run the shell command");
        opts.addOption("container_vcores", true, "Amount of virtual cores to be requested to run the shell command");
        opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed");
        opts.addOption("keep_containers_across_application_attempts", false,
                "Flag to indicate whether to keep containers across application attempts."
                        + " If the flag is true, running containers will not be killed when"
                        + " application attempt fails and these containers will be retrieved by"
                        + " the new application attempt ");
        opts.addOption("memory_overhead", true, "Amount of memory overhead in MB for application master and container");
        opts.addOption("java_opts", true, "Java opts for container");
        opts.addOption("shell_args", true, "Command line args for the shell script. Multiple args can be separated by empty space.");
        opts.getOption("shell_args").setArgs(Option.UNLIMITED_VALUES);
        opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs");

        opts.addOption("timeout", true, "Application timeout in milliseconds");
        opts.addOption("proxy_user", true, "proxy_user");

        opts.addOption("help", false, "Print usage");
    }

    /**
     * Parses the command line arguments into the fields of this client.
     *
     * <p>Required options are {@code mainJar}, {@code jsonPath} and {@code datax_home_hdfs}.
     * The {@code help} option is evaluated last, so the required options must be present
     * even when only the usage text is wanted.
     *
     * @return {@code false} if {@code help} was requested (usage is printed), {@code true} otherwise
     * @throws IllegalArgumentException if no arguments were given, a required option is missing
     *                                  or a resource amount is out of range
     * @throws NumberFormatException    if a numeric option cannot be parsed
     * @throws ParseException           if the arguments cannot be parsed against the registered options
     * @throws IOException              if the DataX job JSON file cannot be read
     */
    private boolean init() throws ParseException, IOException {
        // Reject an empty command line before spending any work on parsing it.
        if (args.length == 0) {
            throw new IllegalArgumentException("No args specified for client to initialize");
        }
        CommandLine cliParser = new GnuParser().parse(opts, args);
        appName = cliParser.getOptionValue("appName", "dataxDemo");
        pch = cliParser.getOptionValue("pch", "pch");
        if (!cliParser.hasOption("mainJar")) {
            throw new IllegalArgumentException("No mainJar file path specified for application master");
        }
        appMasterJar = cliParser.getOptionValue("mainJar");
        if (!cliParser.hasOption("jsonPath")) {
            throw new IllegalArgumentException("No datax_job file path specified for application master");
        }
        dataxJson = FileUtils.readFileToString(FileUtils.getFile(cliParser.getOptionValue("jsonPath")), Charset.defaultCharset());
        if (cliParser.hasOption("log_properties")) {
            String log4jPath = cliParser.getOptionValue("log_properties");
            try {
                Log4jPropertyHelper.updateLog4jConf(Client.class, log4jPath);
            } catch (Exception e) {
                // Best effort: a broken log4j file must not prevent the submission.
                yarnManipulator.warn("Can not set up custom log4j properties. " + e);
            }
        }
        if (!cliParser.hasOption("datax_home_hdfs")) {
            throw new IllegalArgumentException("No datax_home_hdfs file path specified for application master");
        }
        dataxHomeArchivePath = cliParser.getOptionValue("datax_home_hdfs");
        log4jPropFile = cliParser.getOptionValue("log_properties", "");
        reflect = Boolean.parseBoolean(cliParser.getOptionValue("reflect", "true"));
        if (cliParser.hasOption("p")) {
            // Comma separated list of DataX parameters; blank entries are dropped.
            String p = cliParser.getOptionValue("p");
            if (StringUtils.isNotBlank(p)) {
                for (String str : StringUtils.split(p, ",")) {
                    if (StringUtils.isNotBlank(str)) {
                        parameter.add(str);
                    }
                }
            }
        }
        if (cliParser.hasOption("debug")) {
            debugFlag = true;
        }

        queue = cliParser.getOptionValue("queue", "default");
        // masterMemory is a long field, so parse it as a long instead of truncating the range to int.
        masterMemory = Long.parseLong(cliParser.getOptionValue("master_memory", "128"));
        if (masterMemory < 0) {
            throw new IllegalArgumentException("Invalid memory specified for application master, exiting." + " Specified memory=" + masterMemory);
        }
        masterVCores = Integer.parseInt(cliParser.getOptionValue("master_vcores", "1"));
        if (masterVCores < 0) {
            throw new IllegalArgumentException("Invalid virtual cores specified for application master, exiting." + " Specified virtual cores=" + masterVCores);
        }
        amPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0"));
        containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "10"));
        containerVirtualCores = Integer.parseInt(cliParser.getOptionValue("container_vcores", "1"));
        numContainers = Integer.parseInt(cliParser.getOptionValue("num_containers", "1"));
        if (containerMemory < 0 || containerVirtualCores < 0 || numContainers < 1) {
            throw new IllegalArgumentException("Invalid no. of containers or container memory/vcores specified,"
                    + " exiting."
                    + " Specified containerMemory=" + containerMemory
                    + ", containerVirtualCores=" + containerVirtualCores
                    + ", numContainer=" + numContainers);
        }
        if (cliParser.hasOption("keep_containers_across_application_attempts")) {
            yarnManipulator.info("keep_containers_across_application_attempts");
            keepContainers = true;
        }
        memoryOverhead = Integer.parseInt(cliParser.getOptionValue("memory_overhead", "2"));
        if (cliParser.hasOption("java_opts")) {
            javaOpts = cliParser.getOptionValues("java_opts");
        }
        if (cliParser.hasOption("shell_args")) {
            shellArgs = cliParser.getOptionValues("shell_args");
        }
        if (cliParser.hasOption("shell_env")) {
            // Each value is "key=value"; a value without '=' becomes a key with an empty value.
            String[] envs = cliParser.getOptionValues("shell_env");
            for (String env : envs) {
                env = env.trim();
                int index = env.indexOf('=');
                if (index == -1) {
                    shellEnv.put(env, "");
                    continue;
                }
                String key = env.substring(0, index);
                String val = "";
                if (index < (env.length() - 1)) {
                    val = env.substring(index + 1);
                }
                shellEnv.put(key, val);
            }
        }

        // clientTimeout is a long (milliseconds); Integer.parseInt rejected anything above ~24.8 days.
        clientTimeout = Long.parseLong(cliParser.getOptionValue("timeout", "-1"));
        String user = cliParser.getOptionValue("proxy_user");
        if (StringUtils.isNotBlank(user)) {
            // Create the user's HDFS home directory and hand ownership to the user.
            // NOTE(review): the user name is interpolated into a command string handed to
            // Constants.exec; how that helper executes the string is not visible here.
            // Validate the user name if it can come from untrusted input.
            String mkdirStr = "hdfs dfs -mkdir /user/%s";
            String chownStr = "hdfs dfs -chown -R %s /user/%s";
            Constants.exec(String.format(mkdirStr, user));
            Constants.exec(String.format(chownStr, user, user));
            System.setProperty("HADOOP_USER_NAME", user);
            System.setProperty("HADOOP_PROXY_USER", user);
        }
        if (cliParser.hasOption("help")) {
            new HelpFormatter().printHelp("Client", opts);
            return false;
        }
        return true;
    }

    /**
     * Releases the YARN client and the file system handle if they were created.
     * Failures are logged and never propagated to the caller.
     */
    public void stop() {
        stopYarnClientQuietly();
        closeFileSystemQuietly();
    }

    /** Stops the YARN client, logging (not rethrowing) any failure. */
    private void stopYarnClientQuietly() {
        if (yarnClient == null) {
            return;
        }
        try {
            yarnClient.stop();
        } catch (Exception stopFailure) {
            yarnManipulator.error("Error stopping yarn client", stopFailure);
        }
    }

    /** Closes the file system, logging (not rethrowing) any I/O failure. */
    private void closeFileSystemQuietly() {
        if (fs == null) {
            return;
        }
        try {
            IOUtils.close(fs);
        } catch (IOException closeFailure) {
            yarnManipulator.warn("Error closing file system, " + closeFailure.getMessage());
        }
    }

    /**
     * Parses the client arguments, connects to the ResourceManager and submits the DataX
     * application master to YARN.
     *
     * <p>This method only submits the application; it does not wait for it to finish.
     * Use {@code monitorApplication(ApplicationId)} to wait for completion.
     *
     * @return the id of the submitted application
     * @throws IllegalArgumentException if the arguments are invalid or the target queue is
     *                                  not known to the scheduler
     * @throws RuntimeException         if the target queue has more than 10 pending applications
     * @throws IOException              on file system or RPC failures
     * @throws YarnException            if the ResourceManager rejects a request
     * @throws URISyntaxException       if the application master jar path is not a valid URI
     * @throws ParseException           if the command line arguments cannot be parsed
     */
    public ApplicationId run() throws IOException, YarnException, URISyntaxException, ParseException {
        yarnManipulator.info("init args");
        // NOTE(review): a false result (help requested) is only reflected in the log below;
        // the submission still proceeds, as it always did.
        if (init()) {
            yarnManipulator.info("init args success");
        }
        yarnManipulator.info("Running Client");
        yarnClient = YarnClient.createYarnClient();
        this.conf.set("mapreduce.am.max-attempts", "3");
        this.conf.set("yarn.resourcemanager.am.max-attempts", "3");
        this.conf.set("yarn.client.failover-sleep-base-ms", "1000");

        if ("kerberos".equals(this.authentication)) {
            loginKerberos();
        }
        yarnClient.init(conf);
        yarnClient.start();
        yarnManipulator.info("Client start");

        YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
        yarnManipulator.info("Got Cluster metric info from ASM , numNodeManagers=" + clusterMetrics.getNumNodeManagers());
        List<NodeReport> clusterNodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
        yarnManipulator.info("Got Cluster node info from ASM");
        for (NodeReport node : clusterNodeReports) {
            yarnManipulator.info("Got node report from ASM for, nodeId=" + node.getNodeId() + ", nodeAddress" + node.getHttpAddress()
                    + ", nodeRackName" + node.getRackName() + ", nodeNumContainers" + node.getNumContainers());
        }
        QueueInfo queueInfo = yarnClient.getQueueInfo(this.queue);
        // An unknown queue yields no queue info; fail with a clear message instead of an NPE below.
        if (queueInfo == null) {
            throw new IllegalArgumentException("Queue " + this.queue + " not present in scheduler configuration.");
        }
        yarnManipulator.info("Queue info" + ", queueName=" + queueInfo.getQueueName() + ", queueCurrentCapacity=" + queueInfo.getCurrentCapacity()
                + ", queueMaxCapacity=" + queueInfo.getMaximumCapacity() + ", queueApplicationCount=" + queueInfo.getApplications().size()
                + ", queueChildQueueCount=" + queueInfo.getChildQueues().size());
        // Refuse to pile up more work on a queue that already has a backlog.
        // Statistics may be absent from the queue report, in which case the check is skipped.
        if (queueInfo.getQueueStatistics() != null && queueInfo.getQueueStatistics().getNumAppsPending() > 10) {
            throw new RuntimeException("Queue has pending applications, please wait for them to complete");
        }
        List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
        for (QueueUserACLInfo aclInfo : listAclInfo) {
            for (QueueACL userAcl : aclInfo.getUserAcls()) {
                yarnManipulator.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl=" + userAcl.name());
            }
        }

        YarnClientApplication app = yarnClient.createApplication();
        GetNewApplicationResponse appResponse = app.getNewApplicationResponse();
        // TODO get min/max resource capabilities from RM and change memory ask if needed
        // If we do not have min/max, we may not be able to correctly request
        // the required resources from the RM for the app master
        // Memory ask has to be a multiple of min and less than max.
        // Dump out information about cluster capability as seen by the resource manager
        long maxMem = appResponse.getMaximumResourceCapability().getMemorySize();
        yarnManipulator.info("Max mem capabililty of resources in this cluster " + maxMem);
        // A resource ask cannot exceed the max.
        if (masterMemory + memoryOverhead > maxMem) {
            yarnManipulator.info("AM memory specified above max threshold of cluster. Using max value.specified=" + masterMemory + ", max=" + maxMem);
            masterMemory = maxMem - memoryOverhead;
        }
        int maxVCores = appResponse.getMaximumResourceCapability().getVirtualCores();
        yarnManipulator.info("Max virtual cores capabililty of resources in this cluster " + maxVCores);
        if (masterVCores > maxVCores) {
            yarnManipulator.info("AM virtual cores specified above max threshold of cluster. Using max value." + ", specified=" + masterVCores + ", max=" + maxVCores);
            masterVCores = maxVCores;
        }

        // set the application name
        ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
        ApplicationId appId = appContext.getApplicationId();
        appContext.setApplicationName(appName + ":" + pch);
        appContext.setApplicationType("datax");
        // Tag the application
        Set<String> tags = new HashSet<>(1 << 2);
        tags.add("dataxtag" +";"+ appName + ";" + pch);
        appContext.setApplicationTags(tags);
        // Set the queue to which this application is to be submitted in the RM
        appContext.setQueue(queue);
        // Set the priority for the application master, what is the range for priority? how to decide?
        appContext.setPriority(Priority.newInstance(amPriority));
        appContext.setKeepContainersAcrossApplicationAttempts(keepContainers);
        appContext.setResource(Resource.newInstance(masterMemory + memoryOverhead, masterVCores));
        // Unmanaged AM (default false): the AM normally runs in a container on a node; when set to
        // true, together with other settings, the AM can be started in a chosen environment,
        // which is convenient for debugging.
//        appContext.setUnmanagedAM(false);
        // Whether tokens are cancelled when the application completes (default true)
        appContext.setCancelTokensWhenComplete(true);
        // Maximum number of application attempts
        appContext.setMaxAppAttempts(1);
        // Attempt-failure validity interval (30 seconds)
        appContext.setAttemptFailuresValidityInterval(30 * 1000L);
        // Log aggregation context
//        appContext.setLogAggregationContext();

        // set local resources for the application master local files or archives as needed
        yarnManipulator.info("Copy App Master jar from local filesystem and add to local environment");
        // In this scenario, the jar file for the application master is part of the local resources
        Map<String, LocalResource> localResources = new HashMap<>();
        // Copy the application master jar to the filesystem
        // Create a local resource to point to the destination jar path
        fs = getFileSystem(conf, appMasterJar);
        Path dst;
        if (StringUtils.startsWithAny(appMasterJar, "hdfs", Constants.S_3_A, Constants.S_3_N, Constants.S_3, Constants.OSS)) {
            // The jar already lives on a remote file system: reference it in place.
            String path = new Path(appMasterJar).toUri().getPath();
            dst = fs.makeQualified(new Path(path));
        } else {
            // Local jar: upload it (skipped when an identical-size copy already exists).
            dst = upLoad(fs, appMasterJar);
        }
        addlocalResources(fs, Constants.APP_MASTER_JAR_PATH, localResources, dst);
//        YarnHelper.addFrameworkToDistributedCache(fs, dst.toUri().toString(), localResources, conf);
        // The DataX home archive (dataxHomeArchivePath) is not localized here; its location is only
        // passed to the application master through -Ddatax below.
        // Set the log4j properties if needed
        if (!log4jPropFile.isEmpty()) {
            addToLocalResources(fs, log4jPropFile, Constants.LOG_4_J_PATH, appId.toString(), localResources, null);
        }
        if (javaOpts.length > 0) {
            addToLocalResources(fs, null, Constants.JAVA_OPTS_PATH, appId.toString(), localResources, StringUtils.join(javaOpts, " "));
        }
        if (shellArgs.length > 0) {
            addToLocalResources(fs, null, Constants.SHELL_ARGS_PATH, appId.toString(), localResources, StringUtils.join(shellArgs, " "));
        }

        if ("kerberos".equals(this.authentication)) {
            // Ship the Kerberos material and cluster configuration to the application master.
            addToLocalResources(fs, krb5Conf, Constants.KRB5, appId.toString(), localResources, null);
            addToLocalResources(fs, kerberosKeytab, Constants.KEYTAB, appId.toString(), localResources, null);
            addToLocalResources(fs, configDir + "/" +Constants.CORE_SITE, Constants.CORE_SITE, appId.toString(), localResources, null);
            addToLocalResources(fs, configDir + "/" +Constants.HDFS_SITE, Constants.HDFS_SITE, appId.toString(), localResources, null);
            addToLocalResources(fs, configDir + "/" +Constants.YARN_SITE, Constants.YARN_SITE, appId.toString(), localResources, null);
        }

        // Set the env variables to be setup in the env where the application master will be run
        yarnManipulator.info("Set the environment for the application master");
        Map<String, String> env = new HashMap<>();
//        env.put("CLASSPATH", YarnHelper.buildClassPathEnv(conf));
        env.put(Constants.JAR_FILE_PATH, dst.toUri().toString());
        // Add AppMaster.jar location to classpath
        // At some point we should not be required to add the hadoop specific classpaths to the env.
        // It should be provided out of the box.
        // For now setting all required classpaths including the classpath to "." for the application jar
        StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$$())
                .append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./*")
                .append(ApplicationConstants.CLASS_PATH_SEPARATOR)
                .append(Constants.getDataxDir() + Constants.DATAX + "/lib/*");
        for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH, YarnConfiguration.DEFAULT_YARN_CROSS_PLATFORM_APPLICATION_CLASSPATH)) {
            classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
            classPathEnv.append(c.trim());
        }
        classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./logback.xml");
        classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./parquet-logging.properties");
        // add the runtime classpath needed for tests to work
        if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
            classPathEnv.append(':');
            classPathEnv.append(System.getProperty("java.class.path"));
        }

        env.put("LANG", "zh_CN.UTF-8");
        env.put("CLASSPATH", classPathEnv.toString());

        // Set the necessary command to execute the application master
        List<CharSequence> vargs = new ArrayList<>(30);
        // Set java executable command
        yarnManipulator.info("Setting up app master command");
        vargs.add("$JAVA_HOME/bin/java");
        // Set Xmx based on am memory size
        vargs.add("-Xms" + 128 + "m");
        if (reflect) {
            vargs.add("-Xmx" + masterMemory + "m");
        }
//        vargs.add("-XX:+TraceClassLoading");
        vargs.add("-Dreflect=" + reflect);
        vargs.add("-Dloglevel=info");
        vargs.add("-Djava.security.egd=file:///dev/urandom");
        vargs.add("-Duser.language=zh");
        vargs.add("-Dfile.encoding=utf-8");
        vargs.add("-Ddatax=" + dataxHomeArchivePath);
        vargs.add("-Ddatax.jobid=" + appName);
        vargs.add("-Ddatax.pch=" + pch);
        vargs.add("-Ddatax.json=" + Base64Util.encode(dataxJson));
        // Work on a copy so that the logback property is not appended to the field on every run().
        List<String> jvmProperties = new ArrayList<>(parameter);
        jvmProperties.add("logback.configurationFile=file:"+Constants.getDataxDir() + Constants.DATAX +"/conf/eslogback.xml");
        for (String p : jvmProperties) {
            vargs.add("-D" + p);
        }
        if ("kerberos".equals(this.authentication)) {
            vargs.add("-Dhadoop.security.authentication=kerberos");
            vargs.add("-Dkerberos.userName=" + kerberosUserName);
        }
        vargs.add("-Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener");

        // Set class name
        vargs.add(appMasterMainClass);
        // Set params for Application Master
        vargs.add("--master_memory " + masterMemory);
        vargs.add("--container_memory " + containerMemory);
        vargs.add("--container_vcores " + containerVirtualCores);
        vargs.add("--num_containers " + numContainers);
        vargs.add("--memory_overhead " + memoryOverhead);
        // Shell Command Container priority
        int shellCmdPriority = 0;
        vargs.add("--priority " + shellCmdPriority);
        if (debugFlag) {
            vargs.add("--debug");
        }
        for (Map.Entry<String, String> entry : shellEnv.entrySet()) {
            vargs.add("--shell_env " + entry.getKey() + "=" + entry.getValue());
        }
        vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
        vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");
        // Get final commmand
        StringBuilder command = new StringBuilder();
        for (CharSequence str : vargs) {
            command.append(str).append(" ");
        }
        yarnManipulator.info("Completed setting up app master command " + command);

        List<String> commands = new ArrayList<>();
        commands.add(command.toString());
        // Set up the container launch context for the application master
        ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance(localResources, env, commands, null, null, null);
        // Service data is a binary blob that can be passed to the application Not needed in this scenario
//        amContainer.setServiceData(serviceData);
        // Setup security tokens
        if (UserGroupInformation.isSecurityEnabled()) {
            // Note: Credentials class is marked as LimitedPrivate for HDFS and MapReduce
            Credentials credentials = new Credentials();
            String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
            if (tokenRenewer == null || tokenRenewer.length() == 0) {
                throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
            }
            // For now, only getting tokens for the default file-system.
            final Token<?>[] tokens = fs.addDelegationTokens(tokenRenewer, credentials);
            if (tokens != null) {
                for (Token<?> token : tokens) {
                    yarnManipulator.info("Got dt for " + fs.getUri() + "; " + token);
                }
            }
            DataOutputBuffer dob = new DataOutputBuffer();
            credentials.writeTokenStorageToStream(dob);
            ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
            amContainer.setTokens(fsTokens);
        }
        appContext.setAMContainerSpec(amContainer);

        // Resource request for the application master container
//        String hostName = "127.0.0.1";
//        int numContainers = 1;
//        ResourceRequest amRequest = ResourceRequest.newInstance(Priority.newInstance(10), hostName, Resource.newInstance(memory, vCores), numContainers);
//        applicationSubmissionContext.setAMContainerResourceRequest(amRequest);

        // Submit the application to the applications manager
        // SubmitApplicationResponse submitResp = applicationsManager.submitApplication(appRequest);
        // Ignore the response as either a valid response object is returned on success
        // or an exception thrown to denote some form of a failure
        yarnManipulator.info("Submitting application to ASM");
        yarnClient.submitApplication(appContext);

        // TODO
        // Try submitting the same request again app submission failure?
        // Monitor the application
        return appId;
    }

    /**
     * Logs in from the configured keytab when Kerberos authentication is enabled and both
     * the principal and the keytab path are set. A failed login is logged, not rethrown.
     */
    private void loginKerberos() {
        final boolean kerberosConfigured = "kerberos".equals(this.authentication)
                && StringUtils.isNotBlank(this.kerberosUserName)
                && StringUtils.isNotBlank(this.kerberosKeytab);
        if (!kerberosConfigured) {
            return;
        }

        // On a Kerberos-secured cluster the client has to authenticate before talking to it.
        System.setProperty("java.security.krb5.conf", this.krb5Conf);
        // Enable automatic renewal of the keytab login so that it does not expire
        // (tickets usually expire after about 24 hours).
        conf.set("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
        UserGroupInformation.setConfiguration(conf);
        try {
            yarnManipulator.info("loginKerberos: " + this.kerberosUserName + ", " + this.kerberosKeytab);
            UserGroupInformation.loginUserFromKeytab(this.kerberosUserName, this.kerberosKeytab);
        } catch (Exception loginFailure) {
            yarnManipulator.error("loginKerberos fail:", loginFailure);
        }
    }

    /**
     * Polls the ResourceManager every 6 seconds until the application reaches a terminal
     * state, killing the application if the client timeout expires first.
     *
     * <p>An interrupt does not stop the monitoring; the interrupt status of the calling
     * thread is restored when the method exits so that callers can still observe it.
     *
     * @param appId Application Id of application to be monitored
     * @return {@code true} if the application finished with final status SUCCEEDED;
     *         {@code false} if it finished unsuccessfully, failed, was killed, or was
     *         killed by this client because the timeout expired
     * @throws YarnException if the ResourceManager rejects a request
     * @throws IOException   on RPC failures
     */
    public boolean monitorApplication(ApplicationId appId) throws YarnException, IOException {
        // Remember an interrupt received while sleeping; it is re-asserted in the finally block.
        boolean interrupted = false;
        try {
            while (true) {
                // Check app status every 6 second.
                try {
                    Thread.sleep(6000);
                } catch (InterruptedException e) {
                    yarnManipulator.error("Thread sleep in monitoring loop interrupted");
                    interrupted = true;
                }

                // Get application report for the appId we are interested in
                ApplicationReport report = yarnClient.getApplicationReport(appId);
                yarnManipulator.info("Got application report from ASM for"
                        + ", appId=" + appId.getId()
                        + ", clientToAMToken=" + report.getClientToAMToken()
                        + ", appDiagnostics=" + report.getDiagnostics()
                        + ", appMasterHost=" + report.getHost()
                        + ", appQueue=" + report.getQueue()
                        + ", appMasterRpcPort=" + report.getRpcPort()
                        + ", appStartTime=" + report.getStartTime()
                        + ", yarnAppState=" + report.getYarnApplicationState().toString()
                        + ", distributedFinalState=" + report.getFinalApplicationStatus().toString()
                        + ", appTrackingUrl=" + report.getTrackingUrl()
                        + ", appUser=" + report.getUser());

                YarnApplicationState state = report.getYarnApplicationState();
                FinalApplicationStatus dsStatus = report.getFinalApplicationStatus();
                if (YarnApplicationState.FINISHED == state) {
                    if (FinalApplicationStatus.SUCCEEDED == dsStatus) {
                        yarnManipulator.info("Application has completed successfully. Breaking monitoring loop");
                        return true;
                    } else {
                        yarnManipulator.info("Application did finished unsuccessfully."
                                + " YarnState=" + state + ", DSFinalStatus=" + dsStatus.toString() + ". Breaking monitoring loop");
                        return false;
                    }
                } else if (YarnApplicationState.KILLED == state || YarnApplicationState.FAILED == state) {
                    yarnManipulator.info("Application did not finish."
                            + " YarnState=" + state + ", DSFinalStatus=" + dsStatus.toString() + ". Breaking monitoring loop");
                    return false;
                }
                // A non-positive timeout disables the client-side kill.
                if (clientTimeout > 0 && System.currentTimeMillis() > (clientStartTime + clientTimeout)) {
                    yarnManipulator.info("Reached client specified timeout for application. Killing application");
                    forceKillApplication(appId);
                    return false;
                }
            }
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Kill a submitted application by sending a call to the ASM
     *
     * @param appId Application Id to be killed.
     * @throws YarnException propagated from the YARN client's kill call
     * @throws IOException   propagated from the YARN client's kill call
     */
    private void forceKillApplication(ApplicationId appId) throws YarnException, IOException {
        // TODO clarify whether multiple jobs with the same app id can be submitted and be running at
        // the same time.
        // If yes, can we kill a particular attempt only?
        // Response can be ignored as it is non-null on success or throws an exception in case of failures
        yarnClient.killApplication(appId);
    }

    /**
     * Uploads a local file to {@code datax_on_yarn_jar/<file name>} under the file system's
     * home directory, unless a remote file of the same length already exists there.
     *
     * @param fs   target file system
     * @param path path of the local file
     * @return the remote path of the file
     * @throws IOException if the remote file cannot be inspected or written
     */
    private Path upLoad(FileSystem fs, String path) throws IOException {
        String jarName = FileUtils.getFile(path).getName();
        Path remoteJar = new Path(fs.getHomeDirectory(), "datax_on_yarn_jar/" + jarName);

        // Same length is taken as "already uploaded"; no content comparison is made.
        boolean alreadyUploaded = fs.exists(remoteJar)
                && fs.getFileStatus(remoteJar).getLen() == FileUtils.getFile(path).length();
        if (alreadyUploaded) {
            return remoteJar;
        }

        fs.copyFromLocalFile(new Path(path), remoteJar);
        return remoteJar;
    }

    /**
     * Stages one resource under {@code <home>/<appName>/<appId>/<fileDstPath>} and registers it
     * in {@code localResources} under the key {@code fileDstPath}.
     *
     * <p>When {@code fileSrcPath} is {@code null}, the text in {@code resources} is written to the
     * destination with {@code writeUTF}; otherwise the local file is copied to the destination.
     *
     * @param fs             file system on which the resource is staged
     * @param fileSrcPath    local file to copy, or {@code null} to write {@code resources} instead
     * @param fileDstPath    destination file name, also used as the local resource key
     * @param appId          application id used in the staging path
     * @param localResources map that receives the new local resource
     * @param resources      content to write when {@code fileSrcPath} is {@code null}
     * @throws IOException if the resource cannot be written, closed, copied or inspected
     */
    private void addToLocalResources(FileSystem fs, String fileSrcPath, String fileDstPath, String appId,
                                     Map<String, LocalResource> localResources, String resources) throws IOException {
        String suffix = appName + "/" + appId + "/" + fileDstPath;
        Path dst = new Path(fs.getHomeDirectory(), suffix);
        if (fileSrcPath == null) {
            // try-with-resources: a failure while closing (i.e. flushing) the stream must surface,
            // otherwise a truncated resource file would be registered silently.
            try (FSDataOutputStream ostream = FileSystem.create(fs, dst, new FsPermission((short) 0710))) {
                ostream.writeUTF(resources);
            }
        } else {
            fs.copyFromLocalFile(new Path(fileSrcPath), dst);
        }
        addlocalResources(fs, fileDstPath, localResources, dst);
    }

    /**
     * Registers an already staged file as an APPLICATION-visible FILE resource.
     *
     * @param fs             file system holding the staged file
     * @param fileDstPath    key under which the resource is stored in {@code localResources}
     * @param localResources map that receives the new local resource
     * @param dst            path of the staged file
     * @throws IOException if the file status cannot be read
     */
    private static void addlocalResources(FileSystem fs, String fileDstPath, Map<String, LocalResource> localResources, Path dst) throws IOException {
        // Length and modification time are taken from the staged file's status.
        FileStatus stagedStatus = fs.getFileStatus(dst);
        long length = stagedStatus.getLen();
        long modificationTime = stagedStatus.getModificationTime();

        LocalResource resource = LocalResource.newInstance(
                ConverterUtils.getYarnUrlFromURI(dst.toUri()),
                LocalResourceType.FILE,
                LocalResourceVisibility.APPLICATION,
                length,
                modificationTime);
        localResources.put(fileDstPath, resource);
    }

//    private Path addToLocalResources(FileSystem fs, String path, String name, Map<String, LocalResource> localResources) throws IOException {
//        Path dst = fs.makeQualified(new Path(path));
//        FileStatus scFileStatus = fs.getFileStatus(dst);
//        LocalResource scRsrc = LocalResource.newInstance(
//                ConverterUtils.getYarnUrlFromURI(dst.toUri()),
//                LocalResourceType.ARCHIVE, LocalResourceVisibility.APPLICATION,
//                scFileStatus.getLen(), scFileStatus.getModificationTime());
//        localResources.put(name, scRsrc);
//        return dst;
//    }

    /**
     * Resolves the {@link FileSystem} to use for {@code path}.
     *
     * <p>Paths with one of the S3 prefixes get {@code fs.s3a.impl} set on {@code conf} and are
     * resolved by URI; paths with the OSS prefix are resolved by URI; everything else uses the
     * default file system of {@code conf}.
     *
     * @param conf Hadoop configuration (modified for S3 paths)
     * @param path path whose prefix selects the file system
     * @return the resolved file system
     * @throws IOException        if the file system cannot be obtained
     * @throws URISyntaxException if {@code path} is not a valid URI
     */
    public static FileSystem getFileSystem(Configuration conf, String path) throws IOException, URISyntaxException {
        boolean s3Path = StringUtils.startsWithAny(path, Constants.S_3_A, Constants.S_3_N, Constants.S_3);
        if (s3Path) {
            conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
            return FileSystem.get(new URI(path), conf);
        }
        if (StringUtils.startsWithAny(path, Constants.OSS)) {
            return FileSystem.get(new URI(path), conf);
        }
        return FileSystem.get(conf);
    }

    /**
     * Switches this client to Kerberos authentication: records the given settings, marks the
     * authentication mode as {@code kerberos} and sets the JVM-wide system property
     * {@code hadoop.security.authentication} accordingly.
     *
     * @param krb5Conf         krb5 configuration value to store
     * @param kerberosUserName Kerberos user name to store
     * @param kerberosKeytab   keytab value to store
     * @param configDir        configuration directory to store
     */
    public void setKerberos(String krb5Conf, String kerberosUserName, String kerberosKeytab, String configDir) {
        System.setProperty("hadoop.security.authentication", "kerberos");
        this.authentication = "kerberos";
        this.krb5Conf = krb5Conf;
        this.kerberosUserName = kerberosUserName;
        this.kerberosKeytab = kerberosKeytab;
        this.configDir = configDir;
    }

}
ApplicationMaster
package com.on.yarn;

import com.google.common.annotations.VisibleForTesting;
import com.on.yarn.constant.Constants;
import com.on.yarn.datax.DataXExecutor;
import com.on.yarn.datax.DataXPidExecutor;
import com.on.yarn.datax.Executor;
import com.on.yarn.util.Log4jPropertyHelper;
import lombok.Data;
import org.apache.commons.cli.*;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.log4j.LogManager;

import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;

@Data
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class ApplicationMaster {

    private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);

    // Configuration
    private Configuration conf;

    // Handle to communicate with the Resource Manager
    private AMRMClientAsync amRMClient;

    // In both secure and non-secure modes, this points to the job-submitter.
    private UserGroupInformation appSubmitterUgi;

    // Handle to communicate with the Node Manager
    private NMClientAsync nmClientAsync;

    // Listen to process the response from the Node Manager
    private NMCallbackHandler containerListener;

    // Application Attempt Id ( combination of attemptId and fail count )
    @VisibleForTesting
    protected ApplicationAttemptId appAttemptID;

    // TODO
    // For status update for clients - yet to be implemented
    // Hostname of the container
    private String appMasterHostname = "";
    // Port on which the app master listens for status updates from clients
    private int appMasterRpcPort = -1;
    // Tracking url to which app master publishes info for clients to monitor
    private String appMasterTrackingUrl = "";

    // App Master configuration
    // No. of containers to run shell command on
    @VisibleForTesting
    protected int numTotalContainers = 1;
    // Memory to request for the container on which the shell command will run
    private int containerMemory = 10;
    // VirtualCores to request for the container on which the shell command will run
    private int containerVirtualCores = 1;
    // Priority of the request
    private int requestPriority;

    // Counter for completed containers ( complete denotes successful or failed )
    private AtomicInteger numCompletedContainers = new AtomicInteger();
    // Allocated container count so that we know how many containers has the RM allocated to use
    @VisibleForTesting
    protected AtomicInteger numAllocatedContainers = new AtomicInteger();
    // Count of failed containers
    private AtomicInteger numFailedContainers = new AtomicInteger();
    // Count of containers already requested from the RM
    // Needed as once requested, we should not request for containers again.
    // Only request for more if the original requirement changes.
    @VisibleForTesting
    protected AtomicInteger numRequestedContainers = new AtomicInteger();

    // Args to be passed to the shell command
    private String shellArgs = "";

    private String javaOpts = "";

    // Env variables to be setup for the shell command
    private Map<String, String> shellEnv = new HashMap<>();

    private static volatile boolean done;

    private static volatile boolean doneDataX = false;

    private ByteBuffer allTokens;

    // Launch threads
    private List<Thread> launchThreads = new ArrayList<>();

    private ConcurrentHashMap<ContainerId, Container> runningContainers = new ConcurrentHashMap<>();

    //private YarnAppMasterHttpServer httpServer;

    public static final int DEFAULT_APP_MASTER_TRACKING_URL_PORT = 8090;

    // Container memory overhead in MB
    private int memoryOverhead = 10;

    private static int amMemory = 128;

    private static Process pro;

    private static Executor dataXExecutor = null;

    /**
     * Creates the application master with a fresh YARN configuration on which the
     * attempt limits and the failover sleep base are preset.
     */
    public ApplicationMaster() {
        Configuration yarnConf = new YarnConfiguration();
        yarnConf.set("yarn.resourcemanager.am.max-attempts", "3");
        yarnConf.set("yarn.client.failover-sleep-base-ms", "1000");
        yarnConf.set("mapreduce.am.max-attempts", "3");
        this.conf = yarnConf;
    }

    /**
     * Process entry point: initialises the application master, registers it, runs the
     * DataX executor in this JVM and finally unregisters and exits.
     *
     * <p>The executor implementation is chosen by the {@code reflect} system property.
     * NOTE(review): the master is unregistered here and again inside {@link #finish()} -
     * confirm the second call is intended.
     *
     * @param args command line arguments, forwarded to {@link #init(String[])}
     */
    public static void main(String[] args) {
        boolean succeeded = false;
        ApplicationMaster master = null;
        try {
            master = new ApplicationMaster();
            LOG.info("Initializing ApplicationMaster");
            if (!master.init(args)) {
                System.exit(0);
            }
            master.run();
            LOG.info("ApplicationMaster finish...");

            boolean useReflect = "true".equals(System.getProperty("reflect"));
            if (useReflect) {
                dataXExecutor = new DataXExecutor();
            } else {
                dataXExecutor = new DataXPidExecutor(amMemory);
            }
            dataXExecutor.run();

            master.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "FINISHED");
            // Setting both flags lets finish() stop waiting and report success.
            done = true;
            doneDataX = true;
            succeeded = master.finish();
            LOG.info("ApplicationMaster finish");
        } catch (Throwable failure) {
            LOG.fatal("Error running ApplicationMaster", failure);
            if (master != null) {
                master.unregisterApplicationMaster(FinalApplicationStatus.FAILED, ExceptionUtils.getFullStackTrace(failure));
            }
            LogManager.shutdown();
            ExitUtil.terminate(1, failure);
        } finally {
            if (!succeeded) {
                LOG.info("Application Master failed. exiting");
                System.exit(2);
            } else {
                LOG.info("Application Master completed successfully. exiting");
                System.exit(0);
            }
        }
    }

    /**
     * Parses the command line and the container environment and populates this
     * application master's settings from them.
     *
     * <p>{@code -help} is handled before any environment validation, so the usage text can
     * be printed even when the process is not running inside a YARN container.
     *
     * @param args command line args
     * @return {@code true} if initialisation succeeded and {@link #run()} should be invoked;
     *         {@code false} if only the usage text was printed
     * @throws ParseException           if the command line cannot be parsed
     * @throws IOException              if the java-opts or shell-args file cannot be read
     * @throws IllegalArgumentException if no args are given, or the memory, container count or
     *                                  attempt id is invalid
     * @throws RuntimeException         if a required environment variable is missing
     */
    public boolean init(String[] args) throws ParseException, IOException {
        Options opts = new Options();
        opts.addOption("app_attempt_id", true, "App Attempt ID. Not to be used unless for testing purposes");
        opts.addOption("debug", false, "Dump out debug information");
        opts.addOption("priority", true, "Application Priority. Default 0");
        opts.addOption("master_memory", true, "Amount of memory in MB to be requested to run the application master");
        opts.addOption("container_memory", true, "Amount of memory in MB to be requested to run the shell command");
        opts.addOption("container_vcores", true, "Amount of virtual cores to be requested to run the shell command");
        opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed");
        opts.addOption("memory_overhead", true, "Amount of memory overhead in MB for container");
        opts.addOption("java_opts", true, "Java opts for container");
        opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs");
        opts.addOption("help", false, "Print usage");
        CommandLine cliParser = new GnuParser().parse(opts, args);
        if (args.length == 0) {
            new HelpFormatter().printHelp("ApplicationMaster", opts);
            throw new IllegalArgumentException("No args specified for application master to initialize");
        }
        // Answer -help first: it must not depend on the container environment being present.
        if (cliParser.hasOption("help")) {
            new HelpFormatter().printHelp("ApplicationMaster", opts);
            return false;
        }
        // Check whether a custom log4j.properties file exists
        if (fileExist(Constants.LOG_4_J_PATH)) {
            try {
                Log4jPropertyHelper.updateLog4jConf(ApplicationMaster.class, Constants.LOG_4_J_PATH);
            } catch (Exception e) {
                // Best effort: keep the default logging setup, but record the full cause.
                LOG.warn("Can not set up custom log4j properties. " + e, e);
            }
        }
        if (cliParser.hasOption("debug")) {
            dumpOutDebugInfo();
        }

        requestPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0"));
        amMemory = Integer.parseInt(cliParser.getOptionValue("master_memory", "128"));
        if (amMemory < 0) {
            throw new IllegalArgumentException("Invalid memory specified for application master, exiting." + " Specified memory=" + amMemory);
        }

        Map<String, String> envs = System.getenv();
        if (!envs.containsKey(Environment.CONTAINER_ID.name())) {
            // Outside a container the attempt id has to come from the command line.
            if (cliParser.hasOption("app_attempt_id")) {
                appAttemptID = ConverterUtils.toApplicationAttemptId(cliParser.getOptionValue("app_attempt_id", ""));
            } else {
                throw new IllegalArgumentException("Application Attempt Id not set in the environment");
            }
        } else {
            ContainerId containerId = ConverterUtils.toContainerId(envs.get(Environment.CONTAINER_ID.name()));
            appAttemptID = containerId.getApplicationAttemptId();
        }

        requireEnv(envs, ApplicationConstants.APP_SUBMIT_TIME_ENV);
        requireEnv(envs, Environment.NM_HOST.name());
        requireEnv(envs, Environment.NM_HTTP_PORT.name());
        requireEnv(envs, Environment.NM_PORT.name());
        LOG.info("Application master for app, appId=" + appAttemptID.getApplicationId().getId()
                + ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp()
                + ", attemptId=" + appAttemptID.getAttemptId());

        containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "10"));
        containerVirtualCores = Integer.parseInt(cliParser.getOptionValue("container_vcores", "1"));
        numTotalContainers = Integer.parseInt(cliParser.getOptionValue("num_containers", "1"));
        // Zero and negative counts are both unusable; the count is later used as a divisor for progress.
        if (numTotalContainers <= 0) {
            throw new IllegalArgumentException("Cannot run distributed shell with no containers");
        }
        memoryOverhead = Integer.parseInt(cliParser.getOptionValue("memory_overhead", "1"));
        if (fileExist(Constants.JAVA_OPTS_PATH)) {
            javaOpts = readContent(Constants.JAVA_OPTS_PATH);
        }
        if (fileExist(Constants.SHELL_ARGS_PATH)) {
            shellArgs = readContent(Constants.SHELL_ARGS_PATH);
        }
        if (cliParser.hasOption("shell_env")) {
            for (String env : cliParser.getOptionValues("shell_env")) {
                env = env.trim();
                int index = env.indexOf('=');
                if (index == -1) {
                    // A bare key means "defined, but empty".
                    shellEnv.put(env, "");
                } else {
                    // substring(index + 1) yields "" when '=' is the last character.
                    shellEnv.put(env.substring(0, index), env.substring(index + 1));
                }
            }
        }
        return true;
    }

    /**
     * Fails fast when a required environment variable is absent.
     *
     * @param envs process environment
     * @param name variable that must be present
     * @throws RuntimeException if {@code name} is not a key of {@code envs}
     */
    private static void requireEnv(Map<String, String> envs, String name) {
        if (!envs.containsKey(name)) {
            throw new RuntimeException(name + " not set in the environment");
        }
    }

    /**
     * Logs every environment variable and a listing of the current working directory
     * ({@code dir} on Windows, {@code ls -al} elsewhere) for debugging.
     */
    private void dumpOutDebugInfo() {
        LOG.info("Dump debug output");
        for (Map.Entry<String, String> entry : System.getenv().entrySet()) {
            LOG.info("System env: key=" + entry.getKey() + ", val=" + entry.getValue());
        }
        try {
            String listing = Shell.WINDOWS ? Shell.execCommand("cmd", "/c", "dir") : Shell.execCommand("ls", "-al");
            // The listing is already in memory; the reader only splits it into lines.
            try (BufferedReader reader = new BufferedReader(new StringReader(listing))) {
                for (String row = reader.readLine(); row != null; row = reader.readLine()) {
                    LOG.info("System CWD content: " + row);
                }
            }
        } catch (IOException e) {
            LOG.error("Error in ApplicationMaster while dumping debug output", e);
        }
    }

    /**
     * Main run function for the application master: captures the current user's
     * credentials, starts the RM and NM async clients, registers with the
     * ResourceManager, caps the container ask at the limits reported in the
     * registration response and finally queues the container requests.
     *
     * <p>The method ends with {@code requestKContainers(...)}; it does not itself wait
     * for containers to be allocated or completed.
     *
     * @throws Throwable declared broadly by the signature; the calls in the body are to
     *                   Hadoop/YARN client and security APIs
     */
    @SuppressWarnings({"unchecked"})
    public void run() throws Throwable {
        LOG.info("Starting ApplicationMaster");
        // Note: Credentials, Token, UserGroupInformation, DataOutputBuffer class are marked as LimitedPrivate
        DataOutputBuffer dob = new DataOutputBuffer();
        Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
        credentials.writeTokenStorageToStream(dob);
        // Now remove the AM->RM token so that containers cannot access it.
        // NOTE(review): the buffer above is filled BEFORE this removal, so the bytes later
        // wrapped into allTokens were serialized from the credentials as they were before
        // the token was removed - confirm this ordering is intended.
        Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
        LOG.info("Executing with tokens:");
        while (iter.hasNext()) {
            Token<?> token = iter.next();
            LOG.info(token);
            if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
                iter.remove();
            }
        }
        allTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());

        // Create appSubmitterUgi from the USER environment variable and add the
        // (now filtered) credentials to it
        String appSubmitterUserName = System.getenv(Environment.USER.name());
        appSubmitterUgi = UserGroupInformation.createRemoteUser(appSubmitterUserName);
        appSubmitterUgi.addCredentials(credentials);

        // RM client: created with an interval argument of 1000 and RMCallbackHandler as listener
        AMRMClientAsync.CallbackHandler allocListener = new RMCallbackHandler();
        amRMClient = AMRMClientAsync.createAMRMClientAsync(1000, allocListener);
        amRMClient.init(conf);
        amRMClient.start();

        // NM client: container start/stop/status events go to containerListener
        containerListener = createNMCallbackHandler();
        nmClientAsync = new NMClientAsyncImpl(containerListener);
        nmClientAsync.init(conf);
        nmClientAsync.start();

        appMasterHostname = NetUtils.getHostname();

        // Setup local RPC Server to accept status requests directly from clients
        // TODO need to setup a protocol for client to be able to communicate to
        // the RPC server
        // TODO use the rpc port info to register with the RM for the client to
        // send requests to this app master

        // Register self with ResourceManager
        // This will start heartbeating to the RM
        RegisterApplicationMasterResponse response = amRMClient.registerApplicationMaster(appMasterHostname, appMasterRpcPort, appMasterTrackingUrl);

        // Dump out information about cluster capability as seen by the
        // resource manager
        int maxMem = response.getMaximumResourceCapability().getMemory();
        LOG.info("Max mem capability of resources in this cluster " + maxMem);

        int maxVCores = response.getMaximumResourceCapability().getVirtualCores();
        LOG.info("Max vcores capability of resources in this cluster " + maxVCores);

        // A resource ask cannot exceed the max; the overhead is part of the ask, so it is
        // subtracted again when capping containerMemory.
        if (containerMemory + memoryOverhead > maxMem) {
            LOG.info("Container memory specified above max threshold of cluster."
                    + " Using max value." + ", specified=" + (containerMemory + memoryOverhead) + ", max=" + maxMem);
            containerMemory = maxMem - memoryOverhead;
        }

        if (containerVirtualCores > maxVCores) {
            LOG.info("Container virtual cores specified above max threshold of cluster."
                    + " Using max value." + ", specified=" + containerVirtualCores + ", max=" + maxVCores);
            containerVirtualCores = maxVCores;
        }

        // Containers reported from previous attempts count as already allocated and are
        // put back into runningContainers by recoverExecutors.
        List<Container> previousAMRunningContainers = response.getContainersFromPreviousAttempts();
        LOG.info(appAttemptID + " received " + previousAMRunningContainers.size() + " previous attempts' running containers on AM registration.");
        numAllocatedContainers.addAndGet(previousAMRunningContainers.size());

        recoverExecutors(previousAMRunningContainers);

        // Only the containers not already carried over need to be requested.
        int numTotalContainersToRequest = numTotalContainers - previousAMRunningContainers.size();

        // Queue the container requests with the RM client; allocations and completions
        // are delivered to RMCallbackHandler.
        requestKContainers(numTotalContainersToRequest);
    }

    /**
     * Factory for the NodeManager callback handler bound to this application master.
     *
     * @return a new handler that reports back to this instance
     */
    @VisibleForTesting
    NMCallbackHandler createNMCallbackHandler() {
        NMCallbackHandler handler = new NMCallbackHandler(this);
        return handler;
    }

    /**
     * Waits until the application is flagged as done, joins the container launch threads,
     * stops the NM client, reports the final status to the RM and stops the RM client.
     *
     * <p>The application counts as successful when every container completed without a
     * failure, or when the DataX run itself has been flagged as done.
     *
     * <p>An interrupt received while waiting does not shorten the wait; it is remembered
     * and the thread's interrupt status is restored before returning.
     *
     * @return {@code true} if the final status reported to the RM is SUCCEEDED
     */
    @VisibleForTesting
    protected boolean finish() {
        boolean interrupted = false;
        try {
            // wait for completion.
            while (!done) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ex) {
                    // Keep waiting, but do not lose the interrupt: it is restored below.
                    interrupted = true;
                }
            }

            // Join all launched threads
            // needed for when we time out
            // and we need to release containers
            for (Thread launchThread : launchThreads) {
                try {
                    launchThread.join(10000);
                } catch (InterruptedException e) {
                    interrupted = true;
                    LOG.error("Exception thrown in thread join: ", e);
                }
            }

            // When the application completes, it should stop all running containers
            LOG.info("Application completed. Stopping running containers");
            nmClientAsync.stop();

            // When the application completes, it should send a finish application
            // signal to the RM
            LOG.info("Application completed. Signalling finish to RM");

            // TODO: 2023/2/15 try finishing right here, ignoring the node threads
            boolean allContainersOk = numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers;
            boolean success = allContainersOk || doneDataX;

            FinalApplicationStatus appStatus;
            String appMessage = null;
            if (success) {
                appStatus = FinalApplicationStatus.SUCCEEDED;
            } else {
                appStatus = FinalApplicationStatus.FAILED;
                appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed=" + numCompletedContainers.get() + ", allocated="
                        + numAllocatedContainers.get() + ", failed=" + numFailedContainers.get();
            }

            unregisterApplicationMaster(appStatus, appMessage);
            amRMClient.stop();
            return success;
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Reports the final application status to the ResourceManager.
     *
     * <p>Failures are logged, not propagated. If the RM client has not been created yet
     * (for example because start-up failed before {@link #run()} created it) there is
     * nothing to unregister and the call only logs a warning.
     *
     * @param appStatus  final status to report
     * @param appMessage diagnostics message, may be {@code null}
     */
    public void unregisterApplicationMaster(FinalApplicationStatus appStatus, String appMessage) {
        if (amRMClient == null) {
            // Guard against an NPE on the failure path, which would mask the original error.
            LOG.warn("RM client not initialized, skipping unregister with status " + appStatus);
            return;
        }
        try {
            amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
        } catch (YarnException | IOException ex) {
            LOG.error("Failed to unregister application", ex);
        }
    }

    /**
     * Receives ResourceManager events for this application master: completed and newly
     * allocated containers, shutdown requests, progress polls and errors.
     */
    private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
        /**
         * Updates the completed/failed counters, re-requests containers whose exit status
         * is ABORTED and flags the application as done once every container has completed.
         */
        @SuppressWarnings("unchecked")
        @Override
        public void onContainersCompleted(List<ContainerStatus> completedContainers) {
            LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
            for (ContainerStatus containerStatus : completedContainers) {
                LOG.info(appAttemptID + " got container status for containerID=" + containerStatus.getContainerId()
                        + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
                // non complete containers should not be here
                assert (containerStatus.getState() == ContainerState.COMPLETE);

                // increment counters for completed/failed containers
                int exitStatus = containerStatus.getExitStatus();
                if (0 != exitStatus) {
                    // container failed
                    if (ContainerExitStatus.ABORTED != exitStatus) {
                        // shell script failed counts as completed
                        numCompletedContainers.incrementAndGet();
                        numFailedContainers.incrementAndGet();
                    } else {
                        // container was killed by framework, possibly preempted
                        // we should re-try as the container was lost for some reason
                        numAllocatedContainers.decrementAndGet();
                        numRequestedContainers.decrementAndGet();
                        // we do not need to release the container as it would be done by the RM
                    }
                } else {
                    // nothing to do container completed successfully
                    numCompletedContainers.incrementAndGet();
                    LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
                }
                runningContainers.remove(containerStatus.getContainerId());
            }
            // ask for more containers if any were lost
            int askCount = numTotalContainers - numRequestedContainers.get();
            numRequestedContainers.addAndGet(askCount);
            if (askCount > 0) {
                for (int i = 0; i < askCount; ++i) {
                    ContainerRequest containerAsk = setupContainerAskForRM();
                    amRMClient.addContainerRequest(containerAsk);
                }
            }

            if (numCompletedContainers.get() == numTotalContainers) {
                done = true;
            }
        }

        /**
         * Starts one launch thread per newly allocated container, unless enough containers
         * are already tracked as running.
         */
        @Override
        public void onContainersAllocated(List<Container> allocatedContainers) {
            LOG.info("Got response from RM for container ask, allocatedCnt=" + allocatedContainers.size());
            // There may be several allocation callbacks; once enough containers are tracked
            // as running, further allocations are ignored.
            // NOTE(review): ignored containers are not released back to the RM here - confirm intended.
            if (runningContainers.size() >= numTotalContainers) {
                return;
            }
            numAllocatedContainers.addAndGet(allocatedContainers.size());
            for (Container allocatedContainer : allocatedContainers) {
                LOG.info("Launching shell command on a new container"
                        + ", containerId=" + allocatedContainer.getId()
                        + ", containerNode=" + allocatedContainer.getNodeId().getHost() + ":" + allocatedContainer.getNodeId().getPort()
                        + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress()
                        + ", containerResourceMemory=" + allocatedContainer.getResource().getMemory()
                        + ", containerResourceVirtualCores=" + allocatedContainer.getResource().getVirtualCores()
                        + ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());

                LaunchContainerRunnable runnableLaunchContainer = new LaunchContainerRunnable(allocatedContainer, containerListener);
                Thread launchThread = new Thread(runnableLaunchContainer);

                // launch and start the container on a separate thread to keep the main thread unblocked as all containers may not be allocated at one go.
                launchThreads.add(launchThread);
                launchThread.start();
            }
        }

        /** Flags the application as done when the RM asks for a shutdown. */
        @Override
        public void onShutdownRequest() {
            done = true;
        }

        /** Node updates are ignored. */
        @Override
        public void onNodesUpdated(List<NodeReport> updatedNodes) {

        }

        /**
         * @return the fraction of containers completed so far
         */
        @Override
        public float getProgress() {
            // set progress to deliver to RM on next heartbeat
            return (float) numCompletedContainers.get() / numTotalContainers;
        }

        /**
         * Logs the error, flags the application as done and stops the RM client.
         */
        @Override
        public void onError(Throwable e) {
            // Record the cause; otherwise the shutdown below leaves no trace of why it happened.
            LOG.error("Error reported by the RM client, stopping", e);
            done = true;
            amRMClient.stop();
        }
    }

    /**
     * NMCallbackHandler
     */
    static class NMCallbackHandler implements NMClientAsync.CallbackHandler {

        private ConcurrentMap<ContainerId, Container> containers = new ConcurrentHashMap<>();
        private final ApplicationMaster applicationMaster;

        NMCallbackHandler(ApplicationMaster applicationMaster) {
            this.applicationMaster = applicationMaster;
        }

        void addContainer(ContainerId containerId, Container container) {
            containers.putIfAbsent(containerId, container);
        }

        @Override
        public void onContainerStopped(ContainerId containerId) {
            LOG.info("Succeeded to stop Container " + containerId);
            applicationMaster.runningContainers.remove(containerId);
            containers.remove(containerId);
        }

        @Override
        public void onContainerStatusReceived(ContainerId containerId, ContainerStatus containerStatus) {
            LOG.debug("Container Status: id=" + containerId + ", status=" + containerStatus);
        }

        @Override
        public void onContainerStarted(ContainerId containerId, Map<String, ByteBuffer> allServiceResponse) {
            LOG.debug("Succeeded to start Container " + containerId);
            Container container = containers.get(containerId);
            if (container != null) {
                applicationMaster.nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId());
            }
        }

        @Override
        public void onStartContainerError(ContainerId containerId, Throwable t) {
            LOG.error("Failed to start Container " + containerId);
            containers.remove(containerId);
            applicationMaster.runningContainers.remove(containerId);
            applicationMaster.numCompletedContainers.incrementAndGet();
            applicationMaster.numFailedContainers.incrementAndGet();
        }

        @Override
        public void onGetContainerStatusError(ContainerId containerId, Throwable t) {
            LOG.error("Failed to query the status of Container " + containerId);
        }

        @Override
        public void onStopContainerError(ContainerId containerId, Throwable t) {
            LOG.error("Failed to stop Container " + containerId);
            applicationMaster.runningContainers.remove(containerId);
            containers.remove(containerId);
        }
    }

    /**
     * Task that builds the launch context for one allocated container and asks the
     * NodeManager client to start it (see {@link ContainerManagementProtocol}).
     */
    private class LaunchContainerRunnable implements Runnable {
        // Allocated container
        Container container;
        // Callback handler that is told about the container before it is started
        NMCallbackHandler containerListener;

        /**
         * @param lcontainer        Allocated container
         * @param containerListener Callback handler of the container
         */
        LaunchContainerRunnable(Container lcontainer, NMCallbackHandler containerListener) {
            this.containerListener = containerListener;
            this.container = lcontainer;
        }

        /**
         * Sets up the container launch context (environment, the {@code ls} command and a
         * copy of the tokens), records the container as running and dispatches the
         * asynchronous start request.
         */
        @Override
        public void run() {
            ContainerId id = container.getId();
            LOG.info("Setting up container launch container for containerId=" + id);

            List<String> launchCommands = new ArrayList<>();
            launchCommands.add("ls");

            // No local resources, service data or ACLs are passed. Each container gets its own
            // duplicate of the token buffer.
            ContainerLaunchContext launchContext =
                    ContainerLaunchContext.newInstance(null, shellEnv, launchCommands, null, allTokens.duplicate(), null);

            // Register the container with both trackers before the start request goes out.
            runningContainers.putIfAbsent(container.getId(), container);
            containerListener.addContainer(container.getId(), container);
            nmClientAsync.startContainerAsync(container, launchContext);
        }
    }

    /**
     * Builds one container request for the RM: memory is the container memory plus the
     * overhead, with the configured virtual cores and priority, and no node or rack
     * constraints.
     *
     * @return the request to be sent to the RM
     */
    private ContainerRequest setupContainerAskForRM() {
        // TODO - what is the range for priority? how to decide?
        Priority priority = Priority.newInstance(requestPriority);
        int memoryWithOverhead = containerMemory + memoryOverhead;
        Resource resource = Resource.newInstance(memoryWithOverhead, containerVirtualCores);
        // Passing null for nodes and racks leaves the placement unconstrained.
        ContainerRequest ask = new ContainerRequest(resource, null, null, priority);
        LOG.info("Requested container ask: " + ask);
        return ask;
    }

    /**
     * Re-adds containers that survived a previous attempt to the running-container map;
     * entries already present are kept.
     *
     * @param previousAMRunningContainers containers reported from previous attempts
     */
    private void recoverExecutors(List<Container> previousAMRunningContainers) {
        Iterator<Container> survivors = previousAMRunningContainers.iterator();
        while (survivors.hasNext()) {
            Container survivor = survivors.next();
            runningContainers.putIfAbsent(survivor.getId(), survivor);
        }
    }

    /**
     * Queues {@code askCount} container requests with the RM client and then sets the
     * requested-container counter to the total container count.
     *
     * @param askCount number of requests to queue; nothing is queued when it is not positive
     */
    private void requestKContainers(int askCount) {
        LOG.info("Request new containers count:" + askCount);
        int remaining = askCount;
        while (remaining > 0) {
            amRMClient.addContainerRequest(setupContainerAskForRM());
            remaining--;
        }
        // The counter is set to the total regardless of how many requests were just queued.
        numRequestedContainers.set(numTotalContainers);
    }

    /**
     * Requests as many containers as are missing between the total container count and
     * the number currently tracked as running.
     */
    private synchronized void askMoreContainersIfNecessary() {
        int missing = numTotalContainers - runningContainers.size();
        if (missing <= 0) {
            LOG.info("No more to ask for containers");
            return;
        }
        LOG.info("Request more containers count:" + missing);
        requestKContainers(missing);
    }

    /**
     * @return the live map of containers currently tracked as running (not a copy)
     */
    public ConcurrentHashMap<ContainerId, Container> getRunningContainers() {
        return this.runningContainers;
    }

    /**
     * @param filePath path to check
     * @return whether something exists at {@code filePath}
     */
    private boolean fileExist(String filePath) {
        File candidate = new File(filePath);
        return candidate.exists();
    }

    /**
     * Reads a single string stored in {@code writeUTF} format from the start of a local file.
     *
     * @param filePath file to read
     * @return the decoded string
     * @throws IOException if the file cannot be opened or decoded
     */
    private String readContent(String filePath) throws IOException {
        // If opening fails there is nothing to close, so the stream is created before the try.
        DataInputStream in = new DataInputStream(new FileInputStream(filePath));
        try {
            String content = in.readUTF();
            return content;
        } finally {
            org.apache.commons.io.IOUtils.closeQuietly(in);
        }
    }

}

提交代码样例

/**
 * Sample submission: describes a DataX job, submits it to YARN through {@link Client}
 * and prints the resulting application id.
 *
 * <p>The {@code datax.jobinfo} parameter carries a Base64-encoded JSON object; all
 * hosts and credentials below are placeholders.
 *
 * @param args unused
 * @throws Exception if the submission fails
 */
public static void main(String[] args) throws Exception {
    DataxJob job = new DataxJob();
    job.setAppName("0101");
    job.setPch("0101");
    job.setMainJar("/work/datax-on-yarn-0.0.1-SNAPSHOT.jar");
    job.setJsonPath(new File("/job/job.json"));
    job.setDataxHome("/work/datax.tar.gz");
    job.setReflect(true);
    // The last JSON member must not be followed by a comma, otherwise the object is not valid JSON.
    job.setParameter(Lists.newArrayList("datax.jobinfo=" +
            Base64Util.encode("{" +
                    "\"jobId\":\"0101\"," +
                    "\"rwId\":\"0101\"," +
                    "\"zrwId\":\"0101\"," +
                    "\"zrwpch\":\"0101\"," +
                    "\"rwlyDm\":\"01\"," +
                    "\"elasticsearch.url\":\"esip:port\"," +
                    "\"elasticsearch.username\":\"elastic\"," +
                    "\"elasticsearch.password\":\"pass\"," +
                    "\"bootstrap.servers\":\"kafkaip:9591\"," +
                    "\"security.protocol\":\"SASL_PLAINTEXT\"," +
                    "\"sasl.mechanism\":\"SCRAM-SHA-256\"," +
                    "\"sasl.jaas.config\":\"org.apache.kafka.common.security.scram.ScramLoginModule\\u0020required username=\\\"admin\\\"\\u0020password=\\\"kafkapass\\\";\"" +
                    "}")));
    Client client = new Client(createConfiguration(), job);
    try {
        ApplicationId applicationId = client.run();
        System.out.println(applicationId);
    } finally {
        // Always stop the client, even when the submission throws.
        client.stop();
    }
}

借鉴

https://github.com/duhanmin/datax-on-yarn

https://github.com/TianLangStudio/DataXServer

https://github.com/RebornHuan/pandora

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

风卷残尘

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值