Flink -- Failover

 

JobManager Failover

 

LeaderLatch

private synchronized void setLeadership(boolean newValue)
{
    boolean oldValue = hasLeadership.getAndSet(newValue);

    if ( oldValue && !newValue )
    { // lost leadership: was leader before, is not leader anymore
        listeners.forEach(new Function<LeaderLatchListener, Void>()
        {
            @Override
            public Void apply(LeaderLatchListener listener)
            {
                listener.notLeader();
                return null;
            }
        });
    }
    else if ( !oldValue && newValue )
    { // gained leadership: was not leader before, is leader now
        listeners.forEach(new Function<LeaderLatchListener, Void>()
        {
            @Override
            public Void apply(LeaderLatchListener input)
            {
                input.isLeader();
                return null;
            }
        });
    }

    notifyAll();
}
 

ZooKeeperLeaderElectionService

@Override
public void isLeader() {
    synchronized (lock) {
        issuedLeaderSessionID = UUID.randomUUID();

        leaderContender.grantLeadership(issuedLeaderSessionID);
    }
}

@Override
public void notLeader() {
    synchronized (lock) {
        issuedLeaderSessionID = null;
        confirmedLeaderSessionID = null;

        leaderContender.revokeLeadership();
    }
}

As you can see, these callbacks simply delegate to leaderContender.grantLeadership and leaderContender.revokeLeadership respectively.
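
For reference, here is a minimal standalone sketch of how a Curator LeaderLatch is typically wired up with such a listener (the ZooKeeper address and latch path are made-up examples, not Flink's actual configuration):

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.framework.recipes.leader.LeaderLatch;
import org.apache.curator.framework.recipes.leader.LeaderLatchListener;
import org.apache.curator.retry.ExponentialBackoffRetry;

CuratorFramework client = CuratorFrameworkFactory.newClient(
        "localhost:2181", new ExponentialBackoffRetry(1000, 3));
client.start();

LeaderLatch leaderLatch = new LeaderLatch(client, "/example/leaderlatch");
leaderLatch.addListener(new LeaderLatchListener() {
    @Override
    public void isLeader() {
        // invoked via setLeadership(true): this contender just became leader
    }

    @Override
    public void notLeader() {
        // invoked via setLeadership(false): leadership was just lost
    }
});
leaderLatch.start(); // joins the election; only one latch on the path is leader at a time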

 

The JobManager implements the LeaderContender interface.

revokeLeadership

val newFuturesToComplete = cancelAndClearEverything(
  new Exception("JobManager is no longer the leader."))

 

The key step in cancelAndClearEverything is suspending the ExecutionGraph: execution stops, but the job is not deleted, so another JobManager can still re-submit it.

/**
 * The SUSPENDED state is a local terminal state which stops the execution of the job but does
 * not remove the job from the HA job store so that it can be recovered by another JobManager.
 */
private def cancelAndClearEverything(cause: Throwable)
  : Seq[Future[Unit]] = {
  val futures = for ((jobID, (eg, jobInfo)) <- currentJobs) yield {
    future {
      eg.suspend(cause) // suspend the ExecutionGraph

      if (jobInfo.listeningBehaviour != ListeningBehaviour.DETACHED) {
        jobInfo.client ! decorateMessage(
          Failure(new JobExecutionException(jobID, "All jobs are cancelled and cleared.", cause)))
      }
    }(context.dispatcher)
  }

  currentJobs.clear()

  futures.toSeq
}

 

grantLeadership

context.system.scheduler.scheduleOnce(
  jobRecoveryTimeout,
  self,
  decorateMessage(RecoverAllJobs))(
  context.dispatcher)

The main work is to recover all jobs, via the RecoverAllJobs message:

case RecoverAllJobs =>
  future {
    try {
      // The ActorRef, which is part of the submitted job graph can only be
      // de-serialized in the scope of an actor system.
      akka.serialization.JavaSerializer.currentSystem.withValue(
        context.system.asInstanceOf[ExtendedActorSystem]) {

        log.info(s"Attempting to recover all jobs.")

        // read all submitted jobs from the submittedJobGraphs store, i.e. from ZooKeeper
        val jobGraphs = submittedJobGraphs.recoverJobGraphs().asScala

        if (!leaderElectionService.hasLeadership()) {
          // we've lost leadership. mission: abort.
          log.warn(s"Lost leadership during recovery. Aborting recovery of ${jobGraphs.size} " +
            s"jobs.")
        } else {
          log.info(s"Re-submitting ${jobGraphs.size} job graphs.")

          jobGraphs.foreach {
            submittedJobGraph =>
              self ! decorateMessage(RecoverSubmittedJob(submittedJobGraph)) // recover each job
          }
        }
      }
    } catch {
      case t: Throwable => log.error("Fatal error: Failed to recover jobs.", t)
    }
  }(context.dispatcher)

 

To recover a single job:

case RecoverSubmittedJob(submittedJobGraph) =>
  if (!currentJobs.contains(submittedJobGraph.getJobId)) {
    submitJob(
      submittedJobGraph.getJobGraph(),
      submittedJobGraph.getJobInfo(),
      isRecovery = true)
  }
  else {
    log.info(s"Ignoring job recovery for ${submittedJobGraph.getJobId}, " +
      s"because it is already submitted.")
  }

This is essentially a re-submission of the job; note the flag isRecovery = true.

When submitting a job with isRecovery = true, the following step is performed; for the subsequent details, see the Checkpoint article.

if (isRecovery) {
    executionGraph.restoreLatestCheckpointedState()
}
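
Of course, restoreLatestCheckpointedState only has something to restore if checkpointing was enabled for the job in the first place. A minimal user-code sketch (the interval is an arbitrary example value):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(5000); // trigger a checkpoint every 5 seconds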

 

TaskManager Failover

Inside the JobManager, a dead TaskManager is detected via Akka death watch:

/**
  * Handler to be executed when a task manager terminates.
  * (Akka Deathwatch or notification from ResourceManager)
  *
  * @param taskManager The ActorRef of the taskManager
  */
private def handleTaskManagerTerminated(taskManager: ActorRef): Unit = {
  if (instanceManager.isRegistered(taskManager)) {
    log.info(s"Task manager ${taskManager.path} terminated.")

    instanceManager.unregisterTaskManager(taskManager, true)
    context.unwatch(taskManager)
  }
}
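
The death watch itself is standard Akka: the JobManager calls context.watch on the TaskManager's ActorRef, and Akka then delivers a Terminated message when that actor dies. A minimal sketch of the pattern (the actor here is illustrative, not Flink's JobManager):

import akka.actor.AbstractActor;
import akka.actor.ActorRef;
import akka.actor.Terminated;

public class WatcherActor extends AbstractActor {
    @Override
    public Receive createReceive() {
        return receiveBuilder()
                // start watching a remote actor, e.g. a registered TaskManager
                .match(ActorRef.class, ref -> getContext().watch(ref))
                // delivered by Akka when a watched actor terminates
                .match(Terminated.class, t -> handleTerminated(t.getActor()))
                .build();
    }

    private void handleTerminated(ActorRef dead) {
        // analogous to handleTaskManagerTerminated above
    }
}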

instanceManager.unregisterTaskManager:

/**
* Unregisters the TaskManager with the given {@link ActorRef}. Unregistering means to mark
* the given instance as dead and notify {@link InstanceListener} about the dead instance.
*
* @param instanceID TaskManager which is about to be marked dead.
* @param terminated whether the TaskManager actually terminated (it is then remembered as a dead host)
*/
public void unregisterTaskManager(ActorRef instanceID, boolean terminated) {
    Instance instance = registeredHostsByConnection.get(instanceID);
    
    if (instance != null){
        ActorRef host = instance.getActorGateway().actor();
        
        registeredHostsByConnection.remove(host);
        registeredHostsById.remove(instance.getId());
        registeredHostsByResource.remove(instance.getResourceId());
        
        if (terminated) {
            deadHosts.add(instance.getActorGateway().actor());
        }
        
        instance.markDead();
        
        totalNumberOfAliveTaskSlots -= instance.getTotalNumberOfSlots();
        
        notifyDeadInstance(instance);
    }
}

 

instance.markDead()

public void markDead() {

    // create a copy of the slots to avoid concurrent modification exceptions
    List<Slot> slots;

    synchronized (instanceLock) {
        if (isDead) {
            return;
        }
        isDead = true;

        // no more notifications for the slot releasing
        this.slotAvailabilityListener = null;

        slots = new ArrayList<Slot>(allocatedSlots);

        allocatedSlots.clear();
        availableSlots.clear();
    }

    /*
     * releaseSlot must not own the instanceLock in order to avoid dead locks where a slot
     * owning the assignment group lock wants to give itself back to the instance which requires
     * the instance lock
     */
    for (Slot slot : slots) {
        slot.releaseSlot();
    }
}

 

SimpleSlot.releaseSlot

@Override
public void releaseSlot() {

    if (!isCanceled()) {

        // kill all tasks currently running in this slot
        Execution exec = this.executedTask;
        if (exec != null && !exec.isFinished()) {
            exec.fail(new Exception(
                    "The slot in which the task was executed has been released. Probably loss of TaskManager "
                            + getInstance()));
        }

        // release directly (if we are directly allocated),
        // otherwise release through the parent shared slot
        if (getParent() == null) {
            // we have to give back the slot to the owning instance
            if (markCancelled()) {
                getInstance().returnAllocatedSlot(this);
            }
        } else {
            // we have to ask our parent to dispose us
            getParent().releaseChild(this);
        }
    }
}

 

Execution.fail

public void fail(Throwable t) {
    processFail(t, false);
}

 

Execution.processFail

First, the Execution's state is set to FAILED:

transitionState(current, FAILED, t)

private boolean transitionState(ExecutionState currentState, ExecutionState targetState, Throwable error) {

    if (STATE_UPDATER.compareAndSet(this, currentState, targetState)) {
        markTimestamp(targetState);

        try {
            vertex.notifyStateTransition(attemptId, targetState, error);
        }
        catch (Throwable t) {
            LOG.error("Error while notifying execution graph of execution state transition.", t);
        }
        return true;
    } else {
        return false;
    }
}
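
STATE_UPDATER performs a lock-free compare-and-swap on the Execution's volatile state field, so that when several failure/cancel paths race, exactly one transition wins. A standalone sketch of the same pattern (class and states are made up for illustration):

import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

class StateHolder {
    enum State { CREATED, RUNNING, FAILED }

    volatile State state = State.CREATED;

    private static final AtomicReferenceFieldUpdater<StateHolder, State> STATE_UPDATER =
            AtomicReferenceFieldUpdater.newUpdater(StateHolder.class, State.class, "state");

    boolean transitionState(State expected, State target) {
        // succeeds only if no concurrent transition got there first
        return STATE_UPDATER.compareAndSet(this, expected, target);
    }
}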

After the state is set, notifyStateTransition is invoked, which in turn calls notifyExecutionChange on the ExecutionGraph:

getExecutionGraph().notifyExecutionChange(getJobvertexId(), subTaskIndex, executionId, newState, error);

void notifyExecutionChange(JobVertexID vertexId, int subtask, ExecutionAttemptID executionID,
                           ExecutionState newExecutionState, Throwable error)
{
    ExecutionJobVertex vertex = getJobVertex(vertexId);

    if (executionListenerActors.size() > 0) {
        String message = error == null ? null : ExceptionUtils.stringifyException(error);
        ExecutionGraphMessages.ExecutionStateChanged actorMessage =
                new ExecutionGraphMessages.ExecutionStateChanged(jobID, vertexId, vertex.getJobVertex().getName(),
                                                                 vertex.getParallelism(), subtask,
                                                                 executionID, newExecutionState,
                                                                 System.currentTimeMillis(), message);

        for (ActorGateway listener : executionListenerActors) {
            listener.tell(actorMessage);
        }
    }

    // see what this means for us. currently, the first FAILED state means -> FAILED
    if (newExecutionState == ExecutionState.FAILED) {
        fail(error);
    }
}

The main purpose is to send the ExecutionGraphMessages.ExecutionStateChanged message to all registered listeners.

The listeners are registered in the JobManager when the job is submitted:

     if (jobInfo.listeningBehaviour == ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES) {
          // the sender wants to be notified about state changes
          val gateway = new AkkaActorGateway(jobInfo.client, leaderSessionID.orNull)

          executionGraph.registerExecutionListener(gateway)
          executionGraph.registerJobStatusListener(gateway)
      }

On the client side, the JobClientActor simply logs and prints these messages:
if (message instanceof ExecutionGraphMessages.ExecutionStateChanged) {
    logAndPrintMessage((ExecutionGraphMessages.ExecutionStateChanged) message);
} else if (message instanceof ExecutionGraphMessages.JobStatusChanged) {
    logAndPrintMessage((ExecutionGraphMessages.JobStatusChanged) message);
}

 

Note that when newExecutionState == ExecutionState.FAILED, ExecutionGraph.fail is called. As the comment says, the first FAILED execution means the whole job fails.

public void fail(Throwable t) {
    while (true) {
        JobStatus current = state;
        // stay in these states
        if (current == JobStatus.FAILING ||
            current == JobStatus.SUSPENDED ||
            current.isGloballyTerminalState()) {
            return;
        } else if (current == JobStatus.RESTARTING && transitionState(current, JobStatus.FAILED, t)) {
            synchronized (progressLock) {
                postRunCleanup();
                progressLock.notifyAll();
                return;
            }
        } else if (transitionState(current, JobStatus.FAILING, t)) { // set the job status to JobStatus.FAILING
            this.failureCause = t;

            if (!verticesInCreationOrder.isEmpty()) {
                // cancel all. what is failed will not cancel but stay failed
                for (ExecutionJobVertex ejv : verticesInCreationOrder) {
                    ejv.cancel();
                }
            } else {
                // set the state of the job to failed
                transitionState(JobStatus.FAILING, JobStatus.FAILED, t);
            }

            return;
        }
    }
}

As you can see, the job status is set directly to FAILING, and ExecutionJobVertex.cancel is invoked on every vertex.

 

Next, the execution is deregistered from the ExecutionGraph:

vertex.getExecutionGraph().deregisterExecution(this);

Execution contained = currentExecutions.remove(exec.getAttemptId());

 

Finally, it calls:

vertex.executionFailed(t);

void executionFailed(Throwable t) {
    jobVertex.vertexFailed(subTaskIndex, t);
}

 

ExecutionJobVertex

void vertexFailed(int subtask, Throwable error) {
    subtaskInFinalState(subtask);
}

private void subtaskInFinalState(int subtask) {
    synchronized (stateMonitor) {
        if (!finishedSubtasks[subtask]) {
            finishedSubtasks[subtask] = true;

            if (numSubtasksInFinalState+1 == parallelism) { // check whether all subtasks of this vertex have finished

                // call finalizeOnMaster hook
                try {
                    getJobVertex().finalizeOnMaster(getGraph().getUserClassLoader());
                }
                catch (Throwable t) {
                    getGraph().fail(t);
                }

                numSubtasksInFinalState++;

                // we are in our final state
                stateMonitor.notifyAll();

                // tell the graph
                graph.jobVertexInFinalState();
            } else {
                numSubtasksInFinalState++;
            }
        }
    }
}

graph.jobVertexInFinalState()

void jobVertexInFinalState() {
    numFinishedJobVertices++;

    if (numFinishedJobVertices == verticesInCreationOrder.size()) { // check whether all job vertices have reached a final state

        // we are done, transition to the final state
        JobStatus current;
        while (true) {
            current = this.state;

            if (current == JobStatus.RUNNING) {
                if (transitionState(current, JobStatus.FINISHED)) {
                    postRunCleanup();
                    break;
                }
            }
            else if (current == JobStatus.CANCELLING) {
                if (transitionState(current, JobStatus.CANCELED)) {
                    postRunCleanup();
                    break;
                }
            }
            else if (current == JobStatus.FAILING) {
                boolean allowRestart = !(failureCause instanceof SuppressRestartsException);

                if (allowRestart && restartStrategy.canRestart() && transitionState(current, JobStatus.RESTARTING)) {
                    restartStrategy.restart(this);
                    break;
                } else if ((!allowRestart || !restartStrategy.canRestart()) && transitionState(current, JobStatus.FAILED, failureCause)) {
                    postRunCleanup();
                    break;
                }
            }
            else if (current == JobStatus.SUSPENDED) {
                // we've already cleaned up when entering the SUSPENDED state
                break;
            }
            else if (current.isGloballyTerminalState()) {
                LOG.warn("Job has entered globally terminal state without waiting for all " +
                    "job vertices to reach final state.");
                break;
            }
            else {
                fail(new Exception("ExecutionGraph went into final state from state " + current));
                break;
            }
        }
        // done transitioning the state

        // also, notify waiters
        progressLock.notifyAll();
    }
}

If the job status is JobStatus.FAILING and the restart conditions are met, the state transitions via transitionState(current, JobStatus.RESTARTING), followed by:

restartStrategy.restart(this);
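
The restart strategy is configurable; for example, user code can set a fixed-delay strategy like this (a sketch with arbitrary numbers; the same can also be configured via flink-conf.yaml):

import java.util.concurrent.TimeUnit;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// restart at most 3 times, waiting 10 seconds between attempts
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, Time.of(10, TimeUnit.SECONDS)));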

Whatever strategy is configured, it eventually ends up calling executionGraph.restart():

executionGraph.restart();

public void restart() {
    try {
        synchronized (progressLock) {
            JobStatus current = state;

            if (current == JobStatus.CANCELED) {
                LOG.info("Canceled job during restart. Aborting restart.");
                return;
            } else if (current == JobStatus.FAILED) {
                LOG.info("Failed job during restart. Aborting restart.");
                return;
            } else if (current == JobStatus.SUSPENDED) {
                LOG.info("Suspended job during restart. Aborting restart.");
                return;
            } else if (current != JobStatus.RESTARTING) {
                throw new IllegalStateException("Can only restart job from state restarting.");
            }

            if (scheduler == null) {
                throw new IllegalStateException("The execution graph has not been scheduled before - scheduler is null.");
            }

            this.currentExecutions.clear();

            Collection<CoLocationGroup> colGroups = new HashSet<>();

            for (ExecutionJobVertex jv : this.verticesInCreationOrder) {

                CoLocationGroup cgroup = jv.getCoLocationGroup();
                if (cgroup != null && !colGroups.contains(cgroup)) {
                    cgroup.resetConstraints();
                    colGroups.add(cgroup);
                }

                jv.resetForNewExecution();
            }

            for (int i = 0; i < stateTimestamps.length; i++) {
                if (i != JobStatus.RESTARTING.ordinal()) {
                    // Only clear the non restarting state in order to preserve when the job was
                    // restarted. This is needed for the restarting time gauge
                    stateTimestamps[i] = 0;
                }
            }
            numFinishedJobVertices = 0;
            transitionState(JobStatus.RESTARTING, JobStatus.CREATED);

            // if we have checkpointed state, reload it into the executions
            if (checkpointCoordinator != null) {
                boolean restored = checkpointCoordinator
                        .restoreLatestCheckpointedState(getAllVertices(), false, false); // reload the latest checkpoint and its state

                // TODO(uce) Temporary work around to restore initial state on
                // failure during recovery. Will be superseded by FLINK-3397.
                if (!restored && savepointCoordinator != null) {
                    String savepointPath = savepointCoordinator.getSavepointRestorePath();
                    if (savepointPath != null) {
                        savepointCoordinator.restoreSavepoint(getAllVertices(), savepointPath);
                    }
                }
            }
        }

        scheduleForExecution(scheduler); // put the ExecutionGraph back into scheduling, i.e. re-submit it
    }
    catch (Throwable t) {
        fail(t);
    }
}