本文详细分析一下zookeeper的数据存储。
ZKDatabase
维护ZooKeeper服务器的内存数据库,包括session、dataTree和committedLog数据;它在从磁盘读取事务日志和快照之后完成启动。
关键字段
// Tree of data nodes
protected DataTree dataTree;
// NOTE(review): presumably maps session id -> session timeout; confirm against callers
protected ConcurrentHashMap<Long, Integer> sessionsWithTimeouts;
protected FileTxnSnapLog snapLog; // used to operate on the underlying data files
// zxid of the first and of the last entry in committedLog
protected long minCommittedLog, maxCommittedLog;
// Maximum capacity of committedLog, 500 by default
public int commitLogCount;
// Holds the most recently committed requests; can be used for fast follower synchronization
protected Queue<Proposal> committedLog = new ArrayDeque<>();
// Its write lock is held by addCommittedProposal(Request) while committedLog is modified
protected ReentrantReadWriteLock logLock = new ReentrantReadWriteLock();
// Set to true by loadDataBase, fastForwardDataBase and deserializeSnapshot once they have run
private volatile boolean initialized = false;
// Counter of txn log entries
private AtomicInteger txnCount = new AtomicInteger(0);
构造方法
/**
 * Builds the in-memory database on top of the given log/snapshot accessor.
 *
 * @param snapLog handle used to operate on the underlying data files
 */
public ZKDatabase(FileTxnSnapLog snapLog) {
    this.dataTree = createDataTree();
    this.sessionsWithTimeouts = new ConcurrentHashMap<>();
    this.snapLog = snapLog;
    // (omitted in this excerpt) snapshotSizeFactor is initialized here, default 0.33
    // (omitted in this excerpt) commitLogCount is initialized here, default 500
}
/**
 * Factory method for the data tree used by this database.
 *
 * @return a newly constructed {@link DataTree}
 */
public DataTree createDataTree() {
    final DataTree tree = new DataTree();
    return tree;
}
创建DataTree对象:创建/zookeeper/quota、/zookeeper/config节点,创建dataWatches和childWatches对象(使用WatchManager实现类)。
主要方法
// Returns the committedLog collection
public synchronized Collection<Proposal> getCommittedLog();
// Returns the value of dataTree.lastProcessedZxid
public long getDataTreeLastProcessedZxid();
// Returns the dataTree.getSessions() collection
public Collection<Long> getSessions();
// Returns the size of sessionsWithTimeouts
public long getSessionCount();
// Loads dataTree from disk and loads the txnLog into committedLog
public long loadDataBase() throws IOException;
// Loads the txnLog from disk into committedLog
public long fastForwardDataBase() throws IOException;
// Adds to committedLog by calling addCommittedProposal(Request)
private void addCommittedProposal(TxnHeader hdr, Record txn, TxnDigest digest);
// Adds an entry to committedLog
public void addCommittedProposal(Request request);
// Loads Proposals from the txnLog
public Iterator<Proposal> getProposalsFromTxnLog(long startZxid, long sizeLimit);
// Delegates to dataTree.removeCnxn(cnxn)
public void removeCnxn(ServerCnxn cnxn);
// Delegates to dataTree.killSession(sessionId, zxid)
public void killSession(long sessionId, long zxid);
// Delegates to dataTree.dumpEphemerals(pwriter)
public void dumpEphemerals(PrintWriter pwriter);
// Delegates to dataTree.getEphemerals()
public Map<Long, Set<String>> getEphemerals();
// Delegates to dataTree.getNodeCount()
public int getNodeCount();
// Delegates to dataTree.getEphemerals(sessionId)
public Set<String> getEphemerals(long sessionId);
// Assigns dataTree.lastProcessedZxid
public void setlastProcessedZxid(long zxid);
// Delegates to dataTree.processTxn(hdr, txn, digest)
public ProcessTxnResult processTxn(TxnHeader hdr, Record txn, TxnDigest digest);
// Delegates to dataTree.statNode(path, serverCnxn)
public Stat statNode(String path, ServerCnxn serverCnxn) throws KeeperException.NoNodeException;
// Delegates to dataTree.getNode(path)
public DataNode getNode(String path);
// Delegates to dataTree.getData(path, stat, watcher)
public byte[] getData(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException;
// Implemented via dataTree.setWatches
public void setWatches(long relativeZxid, List<String> dataWatches,
List<String> existWatches, List<String> childWatches,
List<String> persistentWatches, List<String> persistentRecursiveWatches,
Watcher watcher);
// Delegates to dataTree.addWatch(basePath, watcher, mode)
public void addWatch(String basePath, Watcher watcher, int mode);
// Delegates to dataTree.getChildren(path, stat, watcher)
public List<String> getChildren(
String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException;
// Delegates to dataTree.getAllChildrenNumber(path)
public int getAllChildrenNumber(String path) throws KeeperException.NoNodeException;
// Truncate the ZKDatabase to the specified zxid
public boolean truncateLog(long zxid) throws IOException;
// Deserialize a snapshot from an input archive
public void deserializeSnapshot(InputArchive ia) throws IOException;
// Deserialize a snapshot that contains FileHeader from an input archive
// It is used by the admin restore command
public void deserializeSnapshot(final InputArchive ia, final CheckedInputStream is) throws IOException;
// Serialize the snapshot
public void serializeSnapshot(OutputArchive oa) throws IOException, InterruptedException;
// Saves the request via snapLog.append(si) and increments txnCount
public boolean append(Request si) throws IOException;
// Rolls the underlying txnLog via snapLog.rollLog()
public void rollLog() throws IOException;
// Commits the underlying txnLog via snapLog.commit()
public void commit() throws IOException;
// Initializes the /zookeeper/config data (already covered in the cluster start-up discussion)
public synchronized void initConfigInZKDatabase(QuorumVerifier qv);
// Delegates to dataTree.containsWatcher(path, type, watcher)
public boolean containsWatcher(String path, WatcherType type, Watcher watcher);
// Delegates to dataTree.removeWatch(path, type, watcher)
public boolean removeWatch(String path, WatcherType type, Watcher watcher);
loadDataBase方法
从磁盘加载dataTree并把txnLog加载到committedLog中:
// Restores the database through snapLog.restore, marks it initialized,
// and returns the zxid that restore reported.
public long loadDataBase() throws IOException {
// Start timestamp; only consumed by the code omitted from this excerpt
long startTime = Time.currentElapsedTime();
// 1. Load dataTree from the snapshot
// 2. Use fastForwardFromEdits to load dataTree and committedLog from the txnLog
long zxid = snapLog.restore(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);
initialized = true;
// (remaining code omitted)
return zxid;
}
fastForwardDataBase方法
从txnLog加载dataTree和committedlog集:
/**
 * Fast-forwards dataTree and the committed log from the txn log, then marks the
 * database as initialized.
 *
 * @return the zxid reported by {@code snapLog.fastForwardFromEdits}
 * @throws IOException if reading the txn log fails
 */
public long fastForwardDataBase() throws IOException {
    // commitProposalPlaybackListener calls addCommittedProposal to fill committedLog
    final long lastZxid =
        snapLog.fastForwardFromEdits(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);
    initialized = true;
    return lastZxid;
}
addCommittedProposal方法
/**
 * Wraps a txn in a {@code Request} and appends it to the committed log.
 *
 * @param hdr    header of the txn; supplies cxid, type and zxid for the request
 * @param txn    the txn record
 * @param digest digest attached to the request
 */
private void addCommittedProposal(TxnHeader hdr, Record txn, TxnDigest digest) {
    final Request replayed =
        new Request(0, hdr.getCxid(), hdr.getType(), hdr, txn, hdr.getZxid());
    replayed.setTxnDigest(digest);
    addCommittedProposal(replayed);
}
/**
 * Appends a request to the in-memory committed log under the write lock.
 *
 * <p>When the log already holds more than {@code commitLogCount} entries, the oldest
 * entry is dropped first. {@code minCommittedLog} and {@code maxCommittedLog} are kept
 * equal to the zxid of the first and last entry of the log.
 *
 * @param request the request to record; its serialized data becomes the payload of
 *                the proposal packet
 */
public void addCommittedProposal(Request request) {
    WriteLock wl = logLock.writeLock();
    // Acquire before entering try: if lock() fails, finally must not unlock a lock we do not hold.
    wl.lock();
    try {
        if (committedLog.size() > commitLogCount) {
            committedLog.remove();
            // The queue may now be empty (e.g. commitLogCount == 0); in that case the
            // isEmpty() branch below resets both bounds instead of dereferencing null.
            Proposal oldest = committedLog.peek();
            if (oldest != null) {
                minCommittedLog = oldest.packet.getZxid();
            }
        }
        if (committedLog.isEmpty()) {
            minCommittedLog = request.zxid;
            maxCommittedLog = request.zxid;
        }
        byte[] data = request.getSerializeData();
        QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid, data, null);
        Proposal p = new Proposal();
        p.packet = pp;
        p.request = request;
        committedLog.add(p);
        maxCommittedLog = p.packet.getZxid();
    } finally {
        wl.unlock();
    }
}
getProposalsFromTxnLog方法
从txnlog获取Proposal,只填充packet字段:
/**
 * Returns an iterator over proposals read from the txn log, starting at
 * {@code startZxid}.
 *
 * <p>An empty iterator is returned when {@code sizeLimit} is negative, when the first
 * txn found has a zxid greater than {@code startZxid}, when {@code sizeLimit} is
 * positive and the storage size exceeds it, or when reading the log fails.
 *
 * @param startZxid zxid the iteration must start from
 * @param sizeLimit negative: always return the empty iterator; zero: no size check;
 *                  positive: maximum storage size accepted
 * @return an iterator over the txn log, or {@code TxnLogProposalIterator.EMPTY_ITERATOR}
 */
public Iterator<Proposal> getProposalsFromTxnLog(long startZxid, long sizeLimit) {
    if (sizeLimit < 0) {
        return TxnLogProposalIterator.EMPTY_ITERATOR;
    }
    TxnIterator itr = null;
    try {
        // Read from the txn log files; implemented underneath by FileTxnIterator
        // reading the file stream.
        itr = snapLog.readTxnLog(startZxid, false);
        // If we cannot guarantee that this is strictly the starting txn
        // after a given zxid, we should fail.
        if ((itr.getHeader() != null) && (itr.getHeader().getZxid() > startZxid)) {
            itr.close();
            return TxnLogProposalIterator.EMPTY_ITERATOR;
        }
        if (sizeLimit > 0) {
            long txnSize = itr.getStorageSize();
            if (txnSize > sizeLimit) {
                itr.close();
                return TxnLogProposalIterator.EMPTY_ITERATOR;
            }
        }
    } catch (IOException e) {
        // itr is still null when readTxnLog itself failed, and close() may throw again;
        // neither may escape, because this method falls back to the empty iterator.
        try {
            if (itr != null) {
                itr.close();
            }
        } catch (IOException ignored) {
            // Best-effort close: the read already failed and the empty iterator is returned.
        }
        return TxnLogProposalIterator.EMPTY_ITERATOR;
    }
    return new TxnLogProposalIterator(itr);
}
truncateLog方法
把txnlog数据truncate到指定的zxid位置,然后重新加载DataTree数据:
/**
 * Clears the database, truncates the txn log to the given zxid and, if the
 * truncation succeeded, reloads the database from disk.
 *
 * @param zxid the zxid to truncate the log to
 * @return {@code true} if the log was truncated and the database reloaded,
 *         {@code false} if {@code snapLog.truncateLog} reported failure
 * @throws IOException if truncating or reloading fails
 */
public boolean truncateLog(long zxid) throws IOException {
    clear();
    // Reload only after a successful truncation.
    if (snapLog.truncateLog(zxid)) {
        loadDataBase();
        return true;
    }
    return false;
}
deserializeSnapshot方法
/**
 * Clears the database and deserializes a snapshot from the input archive into the
 * data tree and the session map, then marks the database as initialized.
 *
 * @param ia archive to read the snapshot from
 * @throws IOException if deserialization fails
 */
public void deserializeSnapshot(InputArchive ia) throws IOException {
    clear();
    final DataTree tree = getDataTree();
    SerializeUtils.deserializeSnapshot(tree, ia, getSessionWithTimeOuts());
    initialized = true;
}
/**
 * Clears the database and deserializes a snapshot from the input archive, calling
 * {@code SnapStream.checkSealIntegrity} after each section that was read, then marks
 * the database as initialized.
 *
 * @param ia archive to read the snapshot from
 * @param is checked stream passed to the integrity checks
 * @throws IOException if deserialization or an integrity check fails
 */
public void deserializeSnapshot(final InputArchive ia, final CheckedInputStream is) throws IOException {
    clear();

    // Data tree and sessions come first.
    final DataTree tree = getDataTree();
    FileSnap.deserialize(tree, getSessionWithTimeOuts(), ia);
    SnapStream.checkSealIntegrity(is, ia);

    // Digest section: integrity is checked only when deserializeZxidDigest returns true.
    final boolean digestRead = tree.deserializeZxidDigest(ia, 0);
    if (digestRead) {
        SnapStream.checkSealIntegrity(is, ia);
    }

    // lastProcessedZxid section: same rule as for the digest.
    final boolean zxidRead = tree.deserializeLastProcessedZxid(ia);
    if (zxidRead) {
        SnapStream.checkSealIntegrity(is, ia);
    }

    // Compare digests to detect inconsistency, when the snapshot carried one.
    if (tree.getDigestFromLoadedSnapshot() != null) {
        tree.compareSnapshotDigests(tree.lastProcessedZxid);
    }

    initialized = true;
}
serializeSnapshot方法
/**
 * Serializes the data tree and the session map as a snapshot into the output archive.
 *
 * @param oa archive to write the snapshot to
 * @throws IOException          if writing fails
 * @throws InterruptedException propagated from {@code SerializeUtils.serializeSnapshot}
 */
public void serializeSnapshot(OutputArchive oa) throws IOException, InterruptedException {
    final DataTree tree = getDataTree();
    SerializeUtils.serializeSnapshot(tree, oa, getSessionWithTimeOuts());
}
DataTree
维护树状结构,没有任何网络或客户端连接代码,因此可以以独立的方式进行测试。
维护两个并行的数据结构:一个是从完整路径映射到DataNode的哈希表,另一个是由DataNode组成的树。对路径的所有访问都通过哈希表进行,只有在序列化到磁盘时才会遍历这棵树。
关键字段
// This map provides a fast lookup to the data nodes
// (per the accompanying text, it maps a full path to its DataNode)
private final NodeHashMap nodes;
// Watch managers: one for data watches, one for child watches
private IWatchManager dataWatches;
private IWatchManager childWatches;
// cached total size of paths and data for all DataNodes
private final AtomicLong nodeDataSize = new AtomicLong(0);
// This hashtable lists the paths of the ephemeral nodes of a session
// NOTE(review): the Long key is presumably the session id; confirm
private final Map<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<>();
// This set contains the paths of all container nodes
private final Set<String> containers = Collections.newSetFromMap(new ConcurrentHashMap<>());
// This set contains the paths of all ttl nodes
private final Set<String> ttls = Collections.newSetFromMap(new ConcurrentHashMap<>());
// This is a pointer to the root of the DataTree
private DataNode root = new DataNode(new byte[0], -1L, new StatPersisted());
// create a /zookeeper filesystem that is the proc filesystem of zookeeper
private final DataNode procDataNode = new DataNode(new byte[0], -1L,