1. Preface
ZooKeeper version: 3.6
The previous article covered zk's underlying thread-communication model, which lays some groundwork for this one on how zk handles requests. Jumping straight into the server-side request flow is easy to get lost in, because zk keeps its internal modules almost completely decoupled and has them talk through a lot of queues; after the previous article you should at least have a rough picture of how the client and server use those underlying queues.
ZooKeeper's request handling will be covered in three stages: the preparation phase before startup, the negotiation phase before requests, and the request-processing phase.
2. ZooKeeper's pre-startup preparation and negotiation phases
The client talks to the server through an org.apache.zookeeper.ZooKeeper object. Here is a simple example:
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

public class TestMain {
    public static void main(String[] args) throws Exception {
        ZooKeeper zooKeeper = new ZooKeeper("127.0.0.1:2181", 1000, (event) -> {
            System.out.println("connect success");
        });
        zooKeeper.create("/test", "test".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    }
}
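One caveat about the sample: the ZooKeeper constructor returns before the session is actually established, so the create call can race the connect and fail with ConnectionLoss. Production code typically blocks on the connect event first; a minimal sketch:

import java.util.concurrent.CountDownLatch;

import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;

public class BlockingConnect {
    public static void main(String[] args) throws Exception {
        CountDownLatch connected = new CountDownLatch(1);
        ZooKeeper zk = new ZooKeeper("127.0.0.1:2181", 15000, event -> {
            // EventThread delivers SyncConnected once the negotiation described below finishes
            if (event.getState() == Watcher.Event.KeeperState.SyncConnected) {
                connected.countDown();
            }
        });
        connected.await(); // don't issue requests until the session exists
        System.out.println("session id: 0x" + Long.toHexString(zk.getSessionId()));
        zk.close();
    }
}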
1) The client side
The preparation and negotiation phases both happen while the ZooKeeper object is being created. The key code:
public ZooKeeper(
String connectString,
int sessionTimeout,
Watcher watcher,
boolean canBeReadOnly,
HostProvider aHostProvider,
ZKClientConfig clientConfig) throws IOException {
// create the watcher manager
watchManager = defaultWatchManager();
watchManager.defaultWatcher = watcher;
ConnectStringParser connectStringParser = new ConnectStringParser(connectString);
hostProvider = aHostProvider;
// create the connection object and initialize SendThread, EventThread, timeouts, etc.
cnxn = createConnection(
connectStringParser.getChrootPath(),
hostProvider,
sessionTimeout,
this,
watchManager,
getClientCnxnSocket(),
canBeReadOnly);
// connect
cnxn.start();
}
This code does three main things:
1) It builds the watcher manager. A quick note on the watcher managers on the two ends: the watcher logic lives mostly on the client; the server's job boils down to binding the relevant node to the connection. The client-side manager receives events from the server and, if the server fails and the connection drops, re-registers the watchers with the server. The watcher mechanism and its implementation will be covered in detail later.
2) It parses the connect string, wrapping the supplied addresses into an object, since one client can hold the addresses of several nodes in the cluster (see the sketch after this list); it then creates the object that manages the connection.
3) The real processing of this whole phase happens in the final start() call, which launches SendThread and EventThread.
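To make point 2 concrete, here is a tiny sketch of ConnectStringParser splitting hosts from the chroot (the printed values are what I'd expect, not verified output):

import org.apache.zookeeper.client.ConnectStringParser;

public class ParseDemo {
    public static void main(String[] args) {
        ConnectStringParser parser = new ConnectStringParser("10.0.0.1:2181,10.0.0.2:2181/myapp");
        // every path the client later uses is silently prefixed with the chroot
        System.out.println(parser.getChrootPath());      // expected: /myapp
        // this host list feeds the HostProvider that SendThread round-robins over
        System.out.println(parser.getServerAddresses()); // expected: both host:port pairs
    }
}

Back to point 3: SendThread's run() loop is where all the connection management happens.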
public void run() {
// these lines just record timestamps, mainly groundwork for sending pings
clientCnxnSocket.introduce(this, sessionId, outgoingQueue);
clientCnxnSocket.updateNow();
clientCnxnSocket.updateLastSendAndHeard();
int to;
long lastPingRwServer = Time.currentElapsedTime();
final int MAX_SEND_PING_INTERVAL = 10000; //10 seconds
InetSocketAddress serverAddress = null;
while (state.isAlive()) {
try {
// this block handles the case where we're not (or no longer) connected
if (!clientCnxnSocket.isConnected()) {
// don't re-establish connection if we are closing
if (closing) {
break;
}
// if a read/write server was recently found via ping, connect to it;
// otherwise round-robin through the address list supplied at construction
if (rwServerAddress != null) {
serverAddress = rwServerAddress;
rwServerAddress = null;
} else {
serverAddress = hostProvider.next(1000);
}
//the actual connect logic
startConnect(serverAddress);
clientCnxnSocket.updateLastSendAndHeard();
}
// logic that runs once the connection is established,
// mainly SASL/security-related handling
if (state.isConnected()) {
// determine whether we need to send an AuthFailed event.
if (zooKeeperSaslClient != null) {
boolean sendAuthEvent = false;
if (zooKeeperSaslClient.getSaslState() == ZooKeeperSaslClient.SaslState.INITIAL) {
try {
zooKeeperSaslClient.initialize(ClientCnxn.this);
} catch (SaslException e) {
LOG.error("SASL authentication with Zookeeper Quorum member failed.", e);
state = States.AUTH_FAILED;
sendAuthEvent = true;
}
}
KeeperState authState = zooKeeperSaslClient.getKeeperState();
if (authState != null) {
if (authState == KeeperState.AuthFailed) {
// An authentication error occurred during authentication with the Zookeeper Server.
state = States.AUTH_FAILED;
sendAuthEvent = true;
} else {
if (authState == KeeperState.SaslAuthenticated) {
sendAuthEvent = true;
}
}
}
if (sendAuthEvent) {
eventThread.queueEvent(new WatchedEvent(Watcher.Event.EventType.None, authState, null));
if (state == States.AUTH_FAILED) {
eventThread.queueEventOfDeath();
}
}
}
to = readTimeout - clientCnxnSocket.getIdleRecv();
} else {
// not yet connected: compute the time left before the connect timeout
to = connectTimeout - clientCnxnSocket.getIdleRecv();
}
if (to <= 0) {
String warnInfo = String.format(
"Client session timed out, have not heard from server in %dms for session id 0x%s",
clientCnxnSocket.getIdleRecv(),
Long.toHexString(sessionId));
LOG.warn(warnInfo);
throw new SessionTimeoutException(warnInfo);
}
// once connected, work out when the next ping is due
// and send one if it's time
if (state.isConnected()) {
//1000(1 second) is to prevent race condition missing to send the second ping
//also make sure not to send too many pings when readTimeout is small
int timeToNextPing = readTimeout / 2
- clientCnxnSocket.getIdleSend()
- ((clientCnxnSocket.getIdleSend() > 1000) ? 1000 : 0);
//send a ping request either time is due or no packet sent out within MAX_SEND_PING_INTERVAL
if (timeToNextPing <= 0 || clientCnxnSocket.getIdleSend() > MAX_SEND_PING_INTERVAL) {
// send the ping and update the last-send timestamp
sendPing();
clientCnxnSocket.updateLastSend();
} else {
if (timeToNextPing < to) {
to = timeToNextPing;
}
}
}
// read-only connections get their own heartbeat handling here
// If we are in read-only mode, seek for read/write server
if (state == States.CONNECTEDREADONLY) {
long now = Time.currentElapsedTime();
int idlePingRwServer = (int) (now - lastPingRwServer);
if (idlePingRwServer >= pingRwTimeout) {
lastPingRwServer = now;
idlePingRwServer = 0;
pingRwTimeout = Math.min(2 * pingRwTimeout, maxPingRwTimeout);
pingRwServer();
}
to = Math.min(to, pingRwTimeout - idlePingRwServer);
}
// hand off the actual transport work (reads and writes)
clientCnxnSocket.doTransport(to, pendingQueue, ClientCnxn.this);
} catch (Throwable e) {
if (closing) {
// closing so this is expected
LOG.warn(
"An exception was thrown while closing send thread for session 0x{}.",
Long.toHexString(getSessionId()),
e);
break;
} else {
LOG.warn(
    "Session 0x{} for server {}, Closing socket connection. "
        + "Attempting reconnect except it is a SessionExpiredException.",
Long.toHexString(getSessionId()),
serverAddress,
e);
// At this point, there might still be new packets appended to outgoingQueue.
// they will be handled in next connection or cleared up if closed.
cleanAndNotifyState();
}
}
}
synchronized (state) {
// When it comes to this point, it guarantees that later queued
// packet to outgoingQueue will be notified of death.
// clear the unanswered and unsent requests from the send queues
cleanup();
}
clientCnxnSocket.close();
// broadcast the disconnected and/or closed events
if (state.isAlive()) {
eventThread.queueEvent(new WatchedEvent(Event.EventType.None, Event.KeeperState.Disconnected, null));
}
eventThread.queueEvent(new WatchedEvent(Event.EventType.None, Event.KeeperState.Closed, null));
ZooTrace.logTraceMessage(
LOG,
ZooTrace.getTextTraceLevel(),
"SendThread exited loop for session: 0x" + Long.toHexString(getSessionId()));
}
This code handles everything connection-related: the differing treatment of read-only versus read/write connections, reconnecting after failures, and the heartbeat mechanism. It is also the watershed between connection negotiation and sending real requests: negotiation starts at startConnect(serverAddress), while node operations are sent through clientCnxnSocket.doTransport(to, pendingQueue, ClientCnxn.this).
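As background for the `to` computations in the loop above: both readTimeout and connectTimeout are derived from the session timeout when ClientCnxn is constructed. A simplified sketch of that derivation (illustrative values, not the actual ClientCnxn code):

public class TimeoutDemo {
    public static void main(String[] args) {
        int sessionTimeout = 30000;                        // what the application asked for
        int serverCount = 3;                               // hostProvider.size()
        int connectTimeout = sessionTimeout / serverCount; // budget for a single connect attempt
        int readTimeout = sessionTimeout * 2 / 3;          // silence tolerated once connected
        // SendThread schedules the next ping at roughly readTimeout / 2 of idle-send time
        System.out.printf("connectTimeout=%d readTimeout=%d nextPing~%d%n",
                connectTimeout, readTimeout, readTimeout / 2);
    }
}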
Now for the connection-negotiation logic (a few trivial pieces are skipped; read them yourself if you're curious):
void primeConnection() throws IOException {
LOG.info(
"Socket connection established, initiating session, client: {}, server: {}",
clientCnxnSocket.getLocalSocketAddress(),
clientCnxnSocket.getRemoteSocketAddress());
isFirstConnect = false;
// a first connection has no sessionId; a non-zero sessionId only appears when reconnecting after losing the server
long sessId = (seenRwServerBefore) ? sessionId : 0;
// build the negotiation request
ConnectRequest conReq = new ConnectRequest(0, lastZxid, sessionTimeout, sessId, sessionPasswd);
// We add backwards since we are pushing into the front
// Only send if there's a pending watch
// TODO: here we have the only remaining use of zooKeeper in
// this class. It's to be eliminated!
// when reconnecting after losing the server, the watchers registered earlier must be re-registered on the server
if (!clientConfig.getBoolean(ZKClientConfig.DISABLE_AUTO_WATCH_RESET)) {
// ... a large chunk of watcher-batching logic omitted
Record record;
int opcode;
// watchers used to be one-shot only; persistent watchers came later, hence the two request variants
if (persistentWatchesBatch.isEmpty() && persistentRecursiveWatchesBatch.isEmpty()) {
// maintain compatibility with older servers - if no persistent/recursive watchers
// are used, use the old version of SetWatches
record = new SetWatches(setWatchesLastZxid, dataWatchesBatch, existWatchesBatch, childWatchesBatch);
opcode = OpCode.setWatches;
} else {
record = new SetWatches2(setWatchesLastZxid, dataWatchesBatch, existWatchesBatch,
childWatchesBatch, persistentWatchesBatch, persistentRecursiveWatchesBatch);
opcode = OpCode.setWatches2;
}
// build the set-watches request
RequestHeader header = new RequestHeader(ClientCnxn.SET_WATCHES_XID, opcode);
Packet packet = new Packet(header, new ReplyHeader(), record, null, null);
// onto the send queue
outgoingQueue.addFirst(packet);
} // closes the batching loop in the omitted watcher logic
} // closes the "are there any watches to reset" check (also omitted)
}
// build the auth packets
for (AuthData id : authInfo) {
outgoingQueue.addFirst(
new Packet(
new RequestHeader(ClientCnxn.AUTHPACKET_XID, OpCode.auth),
null,
new AuthPacket(0, id.scheme, id.data),
null,
null));
}
// build the negotiation packet
// everything uses addFirst, but there is still an order: connect request > auth > watch reset
outgoingQueue.addFirst(new Packet(null, null, conReq, null, null, readOnly));
// register read/write interest, i.e. mark the socket ready to send and receive
clientCnxnSocket.connectionPrimed();
LOG.debug("Session establishment request sent on {}", clientCnxnSocket.getRemoteSocketAddress());
}
In short, this code builds the packets: the watch-reset request, the auth packets, and the connect (negotiation) request. Once they are built, this phase is essentially done; the rest is up to SendThread. Note the ordering trick: all three use addFirst, so the later a packet is queued, the earlier it is sent.
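A tiny stand-alone illustration of that addFirst ordering, using the same deque type ClientCnxn uses for outgoingQueue:

import java.util.concurrent.LinkedBlockingDeque;

public class AddFirstDemo {
    public static void main(String[] args) {
        LinkedBlockingDeque<String> outgoing = new LinkedBlockingDeque<>();
        // queued in the order primeConnection queues them...
        outgoing.addFirst("set-watches");
        outgoing.addFirst("auth");
        outgoing.addFirst("connect");
        // ...but drained from the head, so the connect request goes out first
        System.out.println(outgoing); // [connect, auth, set-watches]
    }
}

Now let's see how SendThread processes the queue, going straight into doTransport: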
void doTransport(
int waitTimeOut,
Queue<Packet> pendingQueue,
ClientCnxn cnxn) throws IOException, InterruptedException {
selector.select(waitTimeOut);
Set<SelectionKey> selected;
// this does not block: selectedKeys() returns the keys whose events have already fired
// (zk's own comment below makes a related point;
// these few lines reflect zk's focus on performance and reusability)
synchronized (this) {
selected = selector.selectedKeys();
}
// Everything below and until we get back to the select is
// non blocking, so time is effectively a constant. That is
// Why we just have to do this once, here
updateNow();
for (SelectionKey k : selected) {
SocketChannel sc = ((SocketChannel) k.channel());
if ((k.readyOps() & SelectionKey.OP_CONNECT) != 0) {
// if the earlier connect attempt hadn't completed, finish it here
if (sc.finishConnect()) {
updateLastSendAndHeard();
updateSocketAddresses();
sendThread.primeConnection();
}
} else if ((k.readyOps() & (SelectionKey.OP_READ | SelectionKey.OP_WRITE)) != 0) {
// handle the I/O events
doIO(pendingQueue, cnxn);
}
}
// give writes one more pass: if anything in the queue is sendable, re-enable write interest
if (sendThread.getZkState().isConnected()) {
if (findSendablePacket(outgoingQueue, sendThread.tunnelAuthInProgress()) != null) {
enableWrite();
}
}
selected.clear();
}
This is the event handling, and zk's attention to I/O performance shows: instead of blocking indefinitely on select it uses a timed select, then walks only the keys that are actually ready to read or write rather than waiting around. The part we care about is how zk handles the I/O events themselves.
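The enableWrite/disableWrite calls here and in doIO below amount to flipping SelectionKey interest ops. A generic NIO sketch of the pattern (not zk's exact code):

import java.nio.channels.SelectionKey;

public final class InterestOps {
    // register interest in writes only while there is something to send,
    // so select() doesn't spin on an always-writable socket
    static void enableWrite(SelectionKey key) {
        key.interestOps(key.interestOps() | SelectionKey.OP_WRITE);
    }

    static void disableWrite(SelectionKey key) {
        key.interestOps(key.interestOps() & ~SelectionKey.OP_WRITE);
    }
}

With that in mind, here is doIO: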
void doIO(Queue<Packet> pendingQueue, ClientCnxn cnxn) throws InterruptedException, IOException {
SocketChannel sock = (SocketChannel) sockKey.channel();
if (sock == null) {
throw new IOException("Socket is null!");
}
if (sockKey.isReadable()) {
int rc = sock.read(incomingBuffer);
if (rc < 0) {
throw new EndOfStreamException("Unable to read additional data from server sessionid 0x"
+ Long.toHexString(sessionId)
+ ", likely server has closed socket");
}
if (!incomingBuffer.hasRemaining()) {
incomingBuffer.flip();
// zk prefixes every packet with 4 bytes holding the total packet length,
// then allocates a receive buffer of exactly that length
if (incomingBuffer == lenBuffer) {
recvCount.getAndIncrement();
readLength();
} else if (!initialized) {
// if the connection negotiation hasn't finished yet, parse the negotiation response
readConnectResult();
// the negotiation response parsed successfully; the key is currently not
// interested in any events, so register read interest
enableRead();
// if the queue has packets waiting, register write interest as well
if (findSendablePacket(outgoingQueue, sendThread.tunnelAuthInProgress()) != null) {
// Since SASL authentication has completed (if client is configured to do so),
// outgoing packets waiting in the outgoingQueue can now be sent.
enableWrite();
}
lenBuffer.clear();
incomingBuffer = lenBuffer;
updateLastHeard();
initialized = true;
} else {
//this branch handles responses to ordinary (non-negotiation) requests
sendThread.readResponse(incomingBuffer);
lenBuffer.clear();
incomingBuffer = lenBuffer;
updateLastHeard();
}
}
}
if (sockKey.isWritable()) {
Packet p = findSendablePacket(outgoingQueue, sendThread.tunnelAuthInProgress());
// packets are handled differently depending on whether they have been serialized yet
if (p != null) {
updateLastSend();
// If we already started writing p, p.bb will already exist
// if the packet hasn't been serialized yet, fill in the required protocol fields
// such as the xid, then build the byte buffer (length prefix included)
if (p.bb == null) {
if ((p.requestHeader != null)
&& (p.requestHeader.getType() != OpCode.ping)
&& (p.requestHeader.getType() != OpCode.auth)) {
p.requestHeader.setXid(cnxn.getXid());
}
p.createBB();
}
// createBB serializes the request into bytes
sock.write(p.bb);
// unlike the branch above, packets that expect a reply are tracked in a queue after sending;
// when the response arrives it is paired with its request object and handed back up;
// pendingQueue holds the packets still awaiting a response
if (!p.bb.hasRemaining()) {
sentCount.getAndIncrement();
outgoingQueue.removeFirstOccurrence(p);
if (p.requestHeader != null
&& p.requestHeader.getType() != OpCode.ping
&& p.requestHeader.getType() != OpCode.auth) {
synchronized (pendingQueue) {
pendingQueue.add(p);
}
}
}
}
// once the queue is empty the write interest is cancelled, which is why write
// interest is only registered when there is something to send; a small performance win
if (outgoingQueue.isEmpty()) {
// No more packets to send: turn off write interest flag.
// Will be turned on later by a later call to enableWrite(),
// from within ZooKeeperSaslClient (if client is configured
// to attempt SASL authentication), or in either doIO() or
// in doTransport() if not.
disableWrite();
} else if (!initialized && p != null && !p.bb.hasRemaining()) {
// On initial connection, write the complete connect request
// packet, but then disable further writes until after
// receiving a successful connection response. If the
// session is expired, then the server sends the expiration
// response and immediately closes its end of the socket. If
// the client is simultaneously writing on its end, then the
// TCP stack may choose to abort with RST, in which case the
// client would never receive the session expired event. See
// http://docs.oracle.com/javase/6/docs/technotes/guides/net/articles/connection_release.html
disableWrite();
} else {
// Just in case
enableWrite();
}
}
}
This is zk's client-side read/write handling. On the read side we will mainly look at the negotiation handling, readConnectResult; readResponse deals with responses to zk node requests and will be covered later.
On the write side, note that zk again splits the work in two: ping/auth packets are filled in and sent off, while ordinary requests are additionally parked in a waiting queue until their response comes back and is handed up to the caller.
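To make the pendingQueue mechanics concrete: readResponse pairs each reply with the packet at the head of pendingQueue by xid, relying on TCP ordering plus the single send thread. A simplified sketch with a hypothetical Packet stand-in (not the real ClientCnxn.Packet):

import java.util.ArrayDeque;
import java.util.Queue;

public class PendingDemo {
    // hypothetical stand-in for ClientCnxn.Packet: just an xid and a completion flag
    static class Packet {
        final int xid;
        boolean done;
        Packet(int xid) { this.xid = xid; }
    }

    static void onResponse(Queue<Packet> pendingQueue, int replyXid) {
        Packet head = pendingQueue.poll(); // replies arrive in send order
        if (head == null || head.xid != replyXid) {
            throw new IllegalStateException("xid out of order: " + replyXid);
        }
        head.done = true; // the real code fills in the reply and wakes the waiter
    }

    public static void main(String[] args) {
        Queue<Packet> pending = new ArrayDeque<>();
        pending.add(new Packet(1));
        pending.add(new Packet(2));
        onResponse(pending, 1);
        onResponse(pending, 2);
        System.out.println("both replies matched in order");
    }
}

Now let's see what parsing the server's negotiation response actually does: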
void readConnectResult() throws IOException {
if (LOG.isTraceEnabled()) {
StringBuilder buf = new StringBuilder("0x[");
for (byte b : incomingBuffer.array()) {
buf.append(Integer.toHexString(b)).append(",");
}
buf.append("]");
if (LOG.isTraceEnabled()) {
LOG.trace("readConnectResult {} {}", incomingBuffer.remaining(), buf.toString());
}
}
ByteBufferInputStream bbis = new ByteBufferInputStream(incomingBuffer);
BinaryInputArchive bbia = BinaryInputArchive.getArchive(bbis);
ConnectResponse conRsp = new ConnectResponse();
// deserialize the response into a ConnectResponse object
conRsp.deserialize(bbia, "connect");
// read "is read-only" flag
boolean isRO = false;
try {
isRO = bbia.readBool("readOnly");
} catch (IOException e) {
// this is ok -- just a packet from an old server which
// doesn't contain readOnly field
LOG.warn("Connected to an old server; r-o mode will be unavailable");
}
this.sessionId = conRsp.getSessionId();
// apply the parameters (timeout etc.) returned by the server;
// until now the client used its configured timeout, from here on the server's value wins
sendThread.onConnected(conRsp.getTimeOut(), this.sessionId, conRsp.getPasswd(), isRO);
}
Some code is skipped above; the main logic sits in this method, and really in its last line: onConnected applies the session timeout, sessionId, and password returned by the server. With that, the client's preparation and negotiation phase is complete.
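As an aside, the negotiation payload is plain jute. A sketch of serializing a fresh ConnectRequest the same way primeConnection's packet eventually is (field values are illustrative):

import java.io.ByteArrayOutputStream;
import org.apache.jute.BinaryOutputArchive;
import org.apache.zookeeper.proto.ConnectRequest;

public class WireDemo {
    public static void main(String[] args) throws Exception {
        // fields: protocolVersion, lastZxidSeen, timeOut, sessionId (0 = new session), passwd
        ConnectRequest req = new ConnectRequest(0, 0L, 30000, 0L, new byte[16]);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        req.serialize(BinaryOutputArchive.getArchive(out), "connect");
        // on the wire this is preceded by the 4-byte length and followed by the readOnly bool
        System.out.println("connect payload bytes: " + out.size());
    }
}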
2) The server side
The server side is considerably simpler. The previous article analyzed the server's logic in depth, so the request handling here can be covered relatively briefly; I will skip the I/O plumbing discussed there and get straight to the point.
public void run() {
try {
// Check if stopped while request was on queue
if (stopped) {
workRequest.cleanup();
return;
}
workRequest.doWork();
} catch (Exception e) {
LOG.warn("Unexpected exception", e);
workRequest.cleanup();
}
}
public void doWork() throws InterruptedException {
if (!key.isValid()) {
selectorThread.cleanupSelectionKey(key);
return;
}
if (key.isReadable() || key.isWritable()) {
cnxn.doIO(key);
// Check if we shutdown or doIO() closed this connection
if (stopped) {
cnxn.close(ServerCnxn.DisconnectReason.SERVER_SHUTDOWN);
return;
}
if (!key.isValid()) {
selectorThread.cleanupSelectionKey(key);
return;
}
touchCnxn(cnxn);
}
// Mark this connection as once again ready for selection
cnxn.enableSelectable();
// Push an update request on the queue to resume selecting
// on the current set of interest ops, which may have changed
// as a result of the I/O operations we just performed.
if (!selectorThread.addInterestOpsUpdateRequest(key)) {
cnxn.close(ServerCnxn.DisconnectReason.CONNECTION_MODE_CHANGED);
}
}
The code above doesn't do much on its own; it is here to recall the previous article's point that a ready request gets wrapped into a work object and thrown onto the thread pool. Now for doIO:
void doIO(SelectionKey k) throws InterruptedException {
try {
if (!isSocketOpen()) {
LOG.warn("trying to do i/o on a null socket for session: 0x{}", Long.toHexString(sessionId));
return;
}
if (k.isReadable()) {
//read from the socket; the first 4 bytes of every packet announce its length
int rc = sock.read(incomingBuffer);
if (rc < 0) {
handleFailedRead();
}
if (incomingBuffer.remaining() == 0) {
boolean isPayload;
if (incomingBuffer == lenBuffer) { // start of next request
incomingBuffer.flip();
// build the receive buffer according to the announced length
isPayload = readLength(k);
incomingBuffer.clear();
} else {
// continuation
isPayload = true;
}
if (isPayload) { // not the case for 4letterword
readPayload();
} else {
// four letter words take care
// need not do anything else
return;
}
}
}
if (k.isWritable()) {
// handle pending writes
handleWrite(k);
if (!initialized && !getReadInterest() && !getWriteInterest()) {
throw new CloseRequestException("responded to info probe", DisconnectReason.INFO_PROBE);
}
}
} catch (CancelledKeyException e) {
LOG.warn("CancelledKeyException causing close of session: 0x{}", Long.toHexString(sessionId));
LOG.debug("CancelledKeyException stack trace", e);
close(DisconnectReason.CANCELLED_KEY_EXCEPTION);
} catch (CloseRequestException e) {
// expecting close to log session closure
close();
} catch (EndOfStreamException e) {
LOG.warn("Unexpected exception", e);
// expecting close to log session closure
close(e.getReason());
} catch (ClientCnxnLimitException e) {
// Common case exception, print at debug level
ServerMetrics.getMetrics().CONNECTION_REJECTED.add(1);
LOG.warn("Closing session 0x{}", Long.toHexString(sessionId), e);
close(DisconnectReason.CLIENT_CNX_LIMIT);
} catch (IOException e) {
LOG.warn("Close of session 0x{}", Long.toHexString(sessionId), e);
close(DisconnectReason.IO_EXCEPTION);
}
}
Having just seen the client side, this is easy to follow: it is essentially a mirror image. Building the receive buffer for reads completes in readPayload, and writes are handled in handleWrite.
private void readPayload() throws IOException, InterruptedException, ClientCnxnLimitException {
if (incomingBuffer.remaining() != 0) { // have we read length bytes?
int rc = sock.read(incomingBuffer); // sock is non-blocking, so ok
if (rc < 0) {
handleFailedRead();
}
}
if (incomingBuffer.remaining() == 0) { // have we read length bytes?
incomingBuffer.flip();
//track the received byte count, for metrics
packetReceived(4 + incomingBuffer.remaining());
if (!initialized) {
// first packet on this connection: negotiate the sessionId and session timeout with the client
readConnectRequest();
} else {
readRequest();
}
lenBuffer.clear();
incomingBuffer = lenBuffer;
}
}
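Before the connect-specific handling, note the framing both ends share: every packet carries a 4-byte big-endian length prefix, which is what lenBuffer/readLength implement on each side. A generic sketch of reading one such frame (assuming a blocking channel; not zk's exact code):

import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;

public final class Framing {
    // read the 4-byte length prefix, then a payload of exactly that size
    static ByteBuffer readFrame(SocketChannel sock) throws Exception {
        ByteBuffer len = ByteBuffer.allocate(4);
        while (len.hasRemaining() && sock.read(len) >= 0) { /* keep filling */ }
        len.flip();
        ByteBuffer payload = ByteBuffer.allocate(len.getInt()); // plays the role of incomingBuffer
        while (payload.hasRemaining() && sock.read(payload) >= 0) { /* keep filling */ }
        payload.flip();
        return payload;
    }
}

For an uninitialized connection, readConnectRequest hands the payload on to ZooKeeperServer.processConnectRequest: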
public void processConnectRequest(ServerCnxn cnxn, ByteBuffer incomingBuffer)
throws IOException, ClientCnxnLimitException {
BinaryInputArchive bia = BinaryInputArchive.getArchive(new ByteBufferInputStream(incomingBuffer));
// this handles the client's very first request to the server
ConnectRequest connReq = new ConnectRequest();
connReq.deserialize(bia, "connect");
LOG.debug(
"Session establishment request from client {} client's lastZxid is 0x{}",
cnxn.getRemoteSocketAddress(),
Long.toHexString(connReq.getLastZxidSeen()));
long sessionId = connReq.getSessionId();
int tokensNeeded = 1;
// this maintains a weight for the load each session puts on the whole service
if (connThrottle.isConnectionWeightEnabled()) {
if (sessionId == 0) {
if (localSessionEnabled) {
tokensNeeded = connThrottle.getRequiredTokensForLocal();
} else {
tokensNeeded = connThrottle.getRequiredTokensForGlobal();
}
} else {
tokensNeeded = connThrottle.getRequiredTokensForRenew();
}
}
// throttling check: decide whether the connection may be established
if (!connThrottle.checkLimit(tokensNeeded)) {
throw new ClientCnxnLimitException();
}
ServerMetrics.getMetrics().CONNECTION_TOKEN_DEFICIT.add(connThrottle.getDeficit());
ServerMetrics.getMetrics().CONNECTION_REQUEST_COUNT.add(1);
boolean readOnly = false;
try {
readOnly = bia.readBool("readOnly");
cnxn.isOldClient = false;
} catch (IOException e) {
// this is ok -- just a packet from an old client which
// doesn't contain readOnly field
LOG.warn(
"Connection request from old client {}; will be dropped if server is in r-o mode",
cnxn.getRemoteSocketAddress());
}
if (!readOnly && this instanceof ReadOnlyZooKeeperServer) {
String msg = "Refusing session request for not-read-only client " + cnxn.getRemoteSocketAddress();
LOG.info(msg);
throw new CloseRequestException(msg, ServerCnxn.DisconnectReason.NOT_READ_ONLY_CLIENT);
}
if (connReq.getLastZxidSeen() > zkDb.dataTree.lastProcessedZxid) {
String msg = "Refusing session request for client "
+ cnxn.getRemoteSocketAddress()
+ " as it has seen zxid 0x"
+ Long.toHexString(connReq.getLastZxidSeen())
+ " our last zxid is 0x"
+ Long.toHexString(getZKDatabase().getDataTreeLastProcessedZxid())
+ " client must try another server";
LOG.info(msg);
throw new CloseRequestException(msg, ServerCnxn.DisconnectReason.CLIENT_ZXID_AHEAD);
}
int sessionTimeout = connReq.getTimeOut();
byte[] passwd = connReq.getPasswd();
// negotiate the session timeout: clamp the client's requested value into [min, max]
int minSessionTimeout = getMinSessionTimeout();
if (sessionTimeout < minSessionTimeout) {
sessionTimeout = minSessionTimeout;
}
int maxSessionTimeout = getMaxSessionTimeout();
if (sessionTimeout > maxSessionTimeout) {
sessionTimeout = maxSessionTimeout;
}
cnxn.setSessionTimeout(sessionTimeout);
// We don't want to receive any packets until we are sure that the
// session is setup
// don't accept packets until the session setup is complete
cnxn.disableRecv();
// two cases: a sessionId of 0 from the client means this is a first-time connection
if (sessionId == 0) {
// allocate the next sessionId and create the session
long id = createSession(cnxn, passwd, sessionTimeout);
LOG.debug(
"Client attempting to establish new session: session = 0x{}, zxid = 0x{}, timeout = {}, address = {}",
Long.toHexString(id),
Long.toHexString(connReq.getLastZxidSeen()),
connReq.getTimeOut(),
cnxn.getRemoteSocketAddress());
} else {
//a non-zero sessionId means there was a previous session, so this is a reconnect
long clientSessionId = connReq.getSessionId();
LOG.debug(
"Client attempting to renew session: session = 0x{}, zxid = 0x{}, timeout = {}, address = {}",
Long.toHexString(clientSessionId),
Long.toHexString(connReq.getLastZxidSeen()),
connReq.getTimeOut(),
cnxn.getRemoteSocketAddress());
if (serverCnxnFactory != null) {
serverCnxnFactory.closeSession(sessionId, ServerCnxn.DisconnectReason.CLIENT_RECONNECT);
}
if (secureServerCnxnFactory != null) {
secureServerCnxnFactory.closeSession(sessionId, ServerCnxn.DisconnectReason.CLIENT_RECONNECT);
}
cnxn.setSessionId(sessionId);
// revalidate and store the session, and respond to the client
reopenSession(cnxn, sessionId, passwd, sessionTimeout);
ServerMetrics.getMetrics().CONNECTION_REVALIDATE_COUNT.add(1);
}
}
The first of the two snippets above (readPayload) effectively routes requests by type: one path for normal node requests, one for negotiation requests.
The second (processConnectRequest) mainly deals with the session, the timeout, and read-only handling:
For the session, think of it for now as: if there is none (sessionId is 0), create one; if there is, bind it and renew it. Sessions get a detailed write-up later.
For the timeout, the server takes the client's requested value and clamps it between its configured minimum and maximum (see the sketch below).
For readonly, it checks the type of the zk server container (ReadOnlyZooKeeperServer).
In the end reopenSession (or createSession) sends the result out; internally there is again a queue: the logic builds the response and drops it into outgoingBuffers.
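A worked sketch of that timeout clamping. The min/max shown are the usual defaults of 2x and 20x tickTime with tickTime = 2000, i.e. 4000 and 40000; treat the numbers as illustrative:

public class TimeoutNegotiation {
    // the server clamps the client's requested timeout into [min, max]
    static int negotiate(int requested, int min, int max) {
        return Math.max(min, Math.min(max, requested));
    }

    public static void main(String[] args) {
        System.out.println(negotiate(1000, 4000, 40000));  // 4000:  too small, raised to min
        System.out.println(negotiate(30000, 4000, 40000)); // 30000: in range, kept as-is
        System.out.println(negotiate(90000, 4000, 40000)); // 40000: too large, capped at max
    }
}

This is also why the 1000 ms timeout in the opening TestMain example would, on a default server, actually become 4000 ms.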
Finally, the send logic:
void handleWrite(SelectionKey k) throws IOException {
if (outgoingBuffers.isEmpty()) {
return;
}
/*
* This is going to reset the buffer position to 0 and the
* limit to the size of the buffer, so that we can fill it
* with data from the non-direct buffers that we need to
* send.
*/
// grab the direct (off-heap) buffer; 64k by default
ByteBuffer directBuffer = NIOServerCnxnFactory.getDirectBuffer();
//a null here means direct buffers are not used
if (directBuffer == null) {
ByteBuffer[] bufferList = new ByteBuffer[outgoingBuffers.size()];
// Use gathered write call. This updates the positions of the
// byte buffers to reflect the bytes that were written out.
// write everything out in one gathering write
sock.write(outgoingBuffers.toArray(bufferList));
// Remove the buffers that we have sent
ByteBuffer bb;
while ((bb = outgoingBuffers.peek()) != null) {
if (bb == ServerCnxnFactory.closeConn) {
throw new CloseRequestException("close requested", DisconnectReason.CLIENT_CLOSED_CONNECTION);
}
// count sent packets using the packetSentinel marker buffer
if (bb == packetSentinel) {
packetSent();
}
if (bb.remaining() > 0) {
break;
}
outgoingBuffers.remove();
}
} else {
directBuffer.clear();
// fill the direct buffer with the outgoing content; if there isn't room, slice buffers until it is full
for (ByteBuffer b : outgoingBuffers) {
if (directBuffer.remaining() < b.remaining()) {
/*
* When we call put later, if the directBuffer is too
* small to hold everything, nothing will be copied,
* so we've got to slice the buffer if it's too big.
*/
// slice the packet down to the space remaining
b = (ByteBuffer) b.slice().limit(directBuffer.remaining());
}
/*
* put() is going to modify the positions of both
* buffers, but we don't want to change the position of
* the source buffers (we'll do that after the send, if
* needed), so we save and reset the position after the
* copy
*/
int p = b.position();
directBuffer.put(b);
b.position(p);
if (directBuffer.remaining() == 0) {
break;
}
}
/*
* Do the flip: limit becomes position, position gets set to
* 0. This sets us up for the write.
*/
directBuffer.flip();
int sent = sock.write(directBuffer);
ByteBuffer bb;
// Remove the buffers that we have sent
while ((bb = outgoingBuffers.peek()) != null) {
if (bb == ServerCnxnFactory.closeConn) {
throw new CloseRequestException("close requested", DisconnectReason.CLIENT_CLOSED_CONNECTION);
}
if (bb == packetSentinel) {
packetSent();
}
// this buffer was only partially sent: advance its position and finish it next time
if (sent < bb.remaining()) {
/*
* We only partially sent this buffer, so we update
* the position and exit the loop.
*/
bb.position(bb.position() + sent);
break;
}
/* We've sent the whole buffer, so drop the buffer */
// this buffer was sent completely: deduct its size from sent and drop it
sent -= bb.remaining();
outgoingBuffers.remove();
}
}
}
The one real difference from the client side is that the server batches its responses; the comments are quite thorough and worth a careful read. With that, the pre-request phase is fully analyzed.
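One last aside: the directBuffer == null branch relies on NIO's gathering write, which pushes several buffers out in a single call. A minimal generic sketch:

import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;

public final class GatheringWrite {
    // send every queued response in one gathering write, the way handleWrite's
    // non-direct-buffer branch does with outgoingBuffers
    static long flush(SocketChannel sock, ByteBuffer... responses) throws Exception {
        return sock.write(responses); // advances each buffer's position as bytes go out
    }
}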
This article walked through the pre-request processing and some of the client/server interaction details; the next one covers how zk handles node requests.