2021SC@SDUSC
package org.apache.storm.daemon.worker;
About the Worker
The worker-data function builds a map of shared data on which many of the worker's other functions depend; in the Java implementation this shared state lives in WorkerState.
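The fields that loadWorker touches later in this post give a feel for what this shared state holds; the excerpt below is only an illustrative reconstruction (field types are paraphrased from memory, not copied from the source):

// Illustrative excerpt, not the complete class: WorkerState fields referenced by loadWorker below.
public class WorkerState {
    final Map<String, Object> conf;                 // daemon configuration
    final IStormClusterState stormClusterState;     // cluster state (ZooKeeper or Pacemaker)
    final Map<Integer, JCQueue> localReceiveQueues; // taskId -> local receive queue
    StormTimer heartbeatTimer;                      // worker heartbeat to the supervisor
    StormTimer executorHeartbeatTimer;              // executor heartbeats
    StormTimer refreshConnectionsTimer;             // refresh connections to other workers
    StormTimer refreshCredentialsTimer;             // poll for credential changes
    StormTimer checkForUpdatedBlobsTimer;           // poll for updated blobs
    StormTimer refreshLoadTimer;                    // load-aware messaging refresh
    StormTimer resetLogLevelsTimer;                 // reset dynamic log levels
    StormTimer refreshActiveTimer;                  // refresh the topology's active/inactive state
    // ... plus transfer queues, the topology context, executor lists, and so on
}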
Timers in the worker
Each timer is backed by its own Java thread; the worker uses these timers to keep its heartbeat alive and to pick up metadata updates (assignments, credentials, blobs, log levels, and so on).
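A minimal sketch of the timer mechanism, assuming the StormTimer constructor and scheduleRecurring signature used by WorkerState (written from memory, so treat the exact signatures as assumptions):

import org.apache.storm.StormTimer;

public class TimerSketch {
    public static void main(String[] args) {
        // One StormTimer wraps one dedicated thread; the handler runs if a scheduled task throws.
        // The real worker halts the whole process here, since a dead timer means heartbeats stop.
        StormTimer heartbeatTimer = new StormTimer("heartbeat-timer",
            (thread, error) -> System.exit(20));
        // Run the task immediately, then every 1 second; the worker reads the real interval
        // from Config.WORKER_HEARTBEAT_FREQUENCY_SECS.
        heartbeatTimer.scheduleRecurring(0, 1, () -> System.out.println("heartbeat"));
    }
}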
Worker heartbeat
The do-heartbeat function (doHeartBeat in the Java implementation) generates the worker's heartbeat, which is written to the local file system; the supervisor reads these heartbeats to judge the worker's state and then decides whether the worker needs to be restarted.
The worker-state method creates a LocalState object and calls its put method to store the worker heartbeat on the local file system, under the path STORM-LOCAL-DIR/workers/{worker-id}/heartbeats; the files in this heartbeats directory are named after the current timestamp.
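For reference, doHeartBeat roughly follows the sketch below, assuming the LocalState / LSWorkerHeartbeat API of current Storm releases (constructor arguments are paraphrased from memory and should be checked against the source):

// Hedged sketch of the heartbeat path: build an LSWorkerHeartbeat and hand it to LocalState,
// which persists it under the worker's local heartbeat directory.
import java.io.IOException;
import java.util.stream.Collectors;
import org.apache.storm.generated.ExecutorInfo;
import org.apache.storm.generated.LSWorkerHeartbeat;
import org.apache.storm.utils.ConfigUtils;
import org.apache.storm.utils.LocalState;
import org.apache.storm.utils.Time;

public void doHeartBeat() throws IOException {
    LocalState state = ConfigUtils.workerState(workerState.conf, workerState.workerId);
    LSWorkerHeartbeat hb = new LSWorkerHeartbeat(
        Time.currentTimeSecs(),                        // heartbeat timestamp
        workerState.topologyId,
        workerState.localExecutors.stream()            // executor id ranges running in this worker
            .map(e -> new ExecutorInfo(e.get(0).intValue(), e.get(1).intValue()))
            .collect(Collectors.toList()),
        workerState.port);
    state.setWorkerHeartBeat(hb);                      // written under STORM-LOCAL-DIR/workers/{worker-id}/heartbeats
    state.cleanup(60);                                 // keep the directory from filling up if the supervisor is down
}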
After the worker starts, it reads the assignment for its topology from the ZooKeeper path /assignments/{topology}.
From the assignment information it obtains the Map<List<Long>, NodeInfo> executorToNodePort mapping, and then creates its executors via Executor.mkExecutor.
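A minimal sketch of reading that assignment through IStormClusterState; the thrift getter name on Assignment is written from memory and should be treated as an assumption:

// Hedged sketch: fetch the topology's Assignment and pull out the executor -> node/port map.
import java.util.List;
import java.util.Map;
import org.apache.storm.generated.Assignment;
import org.apache.storm.generated.NodeInfo;

Assignment assignment = stormClusterState.assignmentInfo(topologyId, null);  // null = no watch callback
Map<List<Long>, NodeInfo> executorToNodePort = assignment.get_executor_node_port();
// Each key is an executor id range [startTask, endTask]; the NodeInfo value identifies the
// supervisor node and port (i.e. the worker) that the executor has been assigned to.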
Maintaining ZMQ connections in the worker
While the process is running, Storm uses ZMQ to send and receive messages, transferring them end to end. Based on the topology definition and the tasks assigned to it, a worker works out which tasks will receive the messages it emits; from this assignment information the worker can determine the host and port of each target task.
As can be seen, the worker keeps its connections reliable through two mechanisms. The first is registering a watcher callback in ZooKeeper; this alone is not entirely reliable, because if the connection to ZooKeeper is lost, the registered watcher callback stops firing. The second is using a timer to execute the same refresh function periodically, as sketched below.
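Put together, the two mechanisms look roughly like this; the callback-registering overload of assignmentInfo is an assumption, while the timer call mirrors the one that appears later in loadWorker:

// Hedged sketch: the same refresh routine is driven both by a ZooKeeper watch and by a timer,
// so a lost watch only delays a refresh until the next timer tick instead of losing it forever.
// 1. Watch-based: passing a callback re-registers the watch on every assignment change.
stormClusterState.assignmentInfo(topologyId, () -> workerState.refreshConnections());
// 2. Timer-based safety net: refreshConnections also runs every TASK_REFRESH_POLL_SECS seconds.
workerState.refreshConnectionsTimer.scheduleRecurring(0,
    (Integer) conf.get(Config.TASK_REFRESH_POLL_SECS),
    workerState::refreshConnections);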
Worker
public static void main(String[] args) throws Exception {
Preconditions.checkArgument(args.length == 5, "Illegal number of arguments. Expected: 5, Actual: " + args.length);
String stormId = args[0];
String assignmentId = args[1];
String supervisorPort = args[2];
String portStr = args[3];
String workerId = args[4];
Map<String, Object> conf = ConfigUtils.readStormConfig();
Utils.setupWorkerUncaughtExceptionHandler();
StormCommon.validateDistributedMode(conf);
int supervisorPortInt = Integer.parseInt(supervisorPort);
Worker worker = new Worker(conf, null, stormId, assignmentId, supervisorPortInt, Integer.parseInt(portStr), workerId);
//Add shutdown hooks before starting any other threads to avoid possible race condition
//between invoking shutdown hooks and registering shutdown hooks. See STORM-3658.
int workerShutdownSleepSecs = ObjectReader.getInt(conf.get(Config.SUPERVISOR_WORKER_SHUTDOWN_SLEEP_SECS));
LOG.info("Adding shutdown hook with kill in {} secs", workerShutdownSleepSecs);
Utils.addShutdownHookWithDelayedForceKill(worker::shutdown, workerShutdownSleepSecs);
worker.start();
}
The main method creates a Worker instance and then calls start().
Worker.start
public void start() throws Exception {
LOG.info("Launching worker for {} on {}:{} with id {} and conf {}", topologyId, assignmentId, port, workerId,
ConfigUtils.maskPasswords(conf));
// because in local mode, its not a separate
// process. supervisor will register it in this case
// if ConfigUtils.isLocalMode(conf) returns false then it is in distributed mode.
if (!ConfigUtils.isLocalMode(conf)) {
// Distributed mode
SysOutOverSLF4J.sendSystemOutAndErrToSLF4J();
String pid = Utils.processPid();
FileUtils.touch(new File(ConfigUtils.workerPidPath(conf, workerId, pid)));
FileUtils.writeStringToFile(new File(ConfigUtils.workerArtifactsPidPath(conf, topologyId, port)), pid,
Charset.forName("UTF-8"));
}
ClusterStateContext csContext = new ClusterStateContext(DaemonType.WORKER, topologyConf);
IStateStorage stateStorage = ClusterUtils.mkStateStorage(conf, topologyConf, csContext);
IStormClusterState stormClusterState = ClusterUtils.mkStormClusterState(stateStorage, null, csContext);
metricRegistry.start(topologyConf, port);
SharedMetricRegistries.add(WORKER_METRICS_REGISTRY, metricRegistry.getRegistry());
Credentials initialCredentials = stormClusterState.credentials(topologyId, null);
Map<String, String> initCreds = new HashMap<>();
if (initialCredentials != null) {
initCreds.putAll(initialCredentials.get_creds());
}
autoCreds = ClientAuthUtils.getAutoCredentials(topologyConf);
subject = ClientAuthUtils.populateSubject(null, autoCreds, initCreds);
Subject.doAs(subject, (PrivilegedExceptionAction<Object>)
() -> loadWorker(stateStorage, stormClusterState, initCreds, initialCredentials)
);
}
Besides setting up the state storage, the metrics registry, and the initial credentials, the main thing start() does is call loadWorker, which runs as a privileged action under the populated Subject.
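Subject.doAs is the standard JAAS way to run code under a given security identity; a minimal self-contained example of the pattern (in the worker the privileged action is the loadWorker call above):

import java.security.PrivilegedExceptionAction;
import javax.security.auth.Subject;

public class DoAsSketch {
    public static void main(String[] args) throws Exception {
        // In the worker the Subject is populated from auto-credentials; here it is empty.
        Subject subject = new Subject();
        // The lambda runs with the Subject bound to the current access-control context.
        String result = Subject.doAs(subject,
            (PrivilegedExceptionAction<String>) () -> "privileged action ran");
        System.out.println(result);
    }
}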
Worker.loadWorker
private Object loadWorker(IStateStorage stateStorage, IStormClusterState stormClusterState,
Map<String, String> initCreds, Credentials initialCredentials)
throws Exception {
workerState =
new WorkerState(conf, context, topologyId, assignmentId, supervisorIfaceSupplier, port, workerId,
topologyConf, stateStorage, stormClusterState,
autoCreds, metricRegistry, initialCredentials);
this.heatbeatMeter = metricRegistry.meter("doHeartbeat-calls", workerState.getWorkerTopologyContext(),
Constants.SYSTEM_COMPONENT_ID, (int) Constants.SYSTEM_TASK_ID);
// Heartbeat here so that worker process dies if this fails
// it's important that worker heartbeat to supervisor ASAP so that supervisor knows
// that worker is running and moves on
doHeartBeat();
executorsAtom = new AtomicReference<>(null);
// launch heartbeat threads immediately so that slow-loading tasks don't cause the worker to timeout
// to the supervisor
workerState.heartbeatTimer
.scheduleRecurring(0, (Integer) conf.get(Config.WORKER_HEARTBEAT_FREQUENCY_SECS), () -> {
try {
doHeartBeat();
} catch (IOException e) {
throw new RuntimeException(e);
}
});
Integer execHeartBeatFreqSecs = workerState.stormClusterState.isPacemakerStateStore()
? (Integer) conf.get(Config.TASK_HEARTBEAT_FREQUENCY_SECS)
: (Integer) conf.get(Config.EXECUTOR_METRICS_FREQUENCY_SECS);
workerState.executorHeartbeatTimer
.scheduleRecurring(0, execHeartBeatFreqSecs,
Worker.this::doExecutorHeartbeats);
workerState.refreshConnections();
workerState.activateWorkerWhenAllConnectionsReady();
workerState.refreshStormActive(null);
workerState.runWorkerStartHooks();
List<Executor> execs = new ArrayList<>();
for (List<Long> e : workerState.getLocalExecutors()) {
if (ConfigUtils.isLocalMode(conf)) {
Executor executor = LocalExecutor.mkExecutor(workerState, e, initCreds);
execs.add(executor);
for (int i = 0; i < executor.getTaskIds().size(); ++i) {
workerState.localReceiveQueues.put(executor.getTaskIds().get(i), executor.getReceiveQueue());
}
} else {
Executor executor = Executor.mkExecutor(workerState, e, initCreds);
for (int i = 0; i < executor.getTaskIds().size(); ++i) {
workerState.localReceiveQueues.put(executor.getTaskIds().get(i), executor.getReceiveQueue());
}
execs.add(executor);
}
}
List<IRunningExecutor> newExecutors = new ArrayList<IRunningExecutor>();
for (Executor executor : execs) {
newExecutors.add(executor.execute());
}
executorsAtom.set(newExecutors);
// This thread will send out messages destined for remote tasks (on other workers)
// If there are no remote outbound tasks, don't start the thread.
if (workerState.hasRemoteOutboundTasks()) {
transferThread = workerState.makeTransferThread();
transferThread.setName("Worker-Transfer");
}
establishLogSettingCallback();
final int credCheckMaxAllowed = 10;
final int[] credCheckErrCnt = new int[1]; // consecutive-error-count
workerState.refreshCredentialsTimer.scheduleRecurring(0,
(Integer) conf.get(Config.TASK_CREDENTIALS_POLL_SECS), () -> {
try {
checkCredentialsChanged();
credCheckErrCnt[0] = 0;
} catch (Exception ex) {
credCheckErrCnt[0]++;
if (credCheckErrCnt[0] <= credCheckMaxAllowed) {
LOG.warn("Ignoring {} of {} consecutive exceptions when checking for credential change",
credCheckErrCnt[0], credCheckMaxAllowed, ex);
} else {
LOG.error("Received {} consecutive exceptions, {} tolerated, when checking for credential change",
credCheckErrCnt[0], credCheckMaxAllowed, ex);
throw ex;
}
}
});
workerState.checkForUpdatedBlobsTimer.scheduleRecurring(0,
(Integer) conf.getOrDefault(Config.WORKER_BLOB_UPDATE_POLL_INTERVAL_SECS, 10),
() -> {
try {
LOG.debug("Checking if blobs have updated");
updateBlobUpdates();
} catch (IOException e) {
// IOException from reading the version files to be ignored
LOG.error(e.getStackTrace().toString());
}
}
);
// The jitter allows the clients to get the data at different times, and avoids thundering herd
if (!(Boolean) topologyConf.get(Config.TOPOLOGY_DISABLE_LOADAWARE_MESSAGING)) {
workerState.refreshLoadTimer.scheduleRecurringWithJitter(0, 1, 500, Worker.this::doRefreshLoad);
}
workerState.refreshConnectionsTimer.scheduleRecurring(0,
(Integer) conf.get(Config.TASK_REFRESH_POLL_SECS),
workerState::refreshConnections);
workerState.resetLogLevelsTimer.scheduleRecurring(0,
(Integer) conf.get(Config.WORKER_LOG_LEVEL_RESET_POLL_SECS),
logConfigManager::resetLogLevels);
workerState.refreshActiveTimer.scheduleRecurring(0, (Integer) conf.get(Config.TASK_REFRESH_POLL_SECS),
workerState::refreshStormActive);
setupFlushTupleTimer(topologyConf, newExecutors);
setupBackPressureCheckTimer(topologyConf);
LOG.info("Worker has topology config {}", ConfigUtils.maskPasswords(topologyConf));
LOG.info("Worker {} for storm {} on {}:{} has finished loading", workerId, topologyId, assignmentId, port);
return this;
}
Here workerState.getLocalExecutors() returns the set of executor ids (each a List<Long>). Executor.mkExecutor (or LocalExecutor.mkExecutor in local mode) creates an Executor for each id, execute() wraps each one as an ExecutorShutdown (an IRunningExecutor), and the resulting list is stored in the AtomicReference<List<IRunningExecutor>> executorsAtom.
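Other parts of the worker later read this reference atomically; a hedged sketch of the consumption pattern (the per-executor method names mentioned in the comments are from memory and may differ):

// Hedged sketch: readers take the currently published executor list in a single atomic read,
// so heartbeat and credential loops never observe a half-built list during startup.
List<IRunningExecutor> running = executorsAtom.get();
if (running != null) {
    for (IRunningExecutor executor : running) {
        // e.g. aggregate executor.renderStats() into the executor heartbeat,
        // or notify executor.credentialsChanged(...) when new credentials arrive
    }
}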