zookeeper源码(05)数据存储

本文详细分析一下zookeeper的数据存储。

ZKDatabase

维护ZooKeeper服务器的内存数据库,包括session、dataTree和committedLog数据;该数据库在从磁盘读取事务日志和快照之后启动。

关键字段

// Data node tree
protected DataTree dataTree;
protected ConcurrentHashMap<Long, Integer> sessionsWithTimeouts;
protected FileTxnSnapLog snapLog; // used to operate on the underlying data files
// zxid of the first and of the last entry in committedLog
protected long minCommittedLog, maxCommittedLog;
// Maximum capacity of committedLog, 500 by default
public int commitLogCount;
// Holds the most recently committed requests; can be used for fast follower synchronization
protected Queue<Proposal> committedLog = new ArrayDeque<>();

// Its write lock is taken in addCommittedProposal(Request) while committedLog is updated
protected ReentrantReadWriteLock logLock = new ReentrantReadWriteLock();
// Set to true by loadDataBase, fastForwardDataBase and deserializeSnapshot once data is loaded
private volatile boolean initialized = false;

// txnlog counter
private AtomicInteger txnCount = new AtomicInteger(0);

构造方法

/**
 * Builds the in-memory database on top of the given snapshot / txn-log helper.
 *
 * @param snapLog helper used to operate on the underlying data files
 */
public ZKDatabase(FileTxnSnapLog snapLog) {
    // The tree is still created first, so an overriding createDataTree()
    // observes exactly the same object state as before.
    this.dataTree = createDataTree();
    this.snapLog = snapLog;
    this.sessionsWithTimeouts = new ConcurrentHashMap<>();

    // (omitted in this excerpt) initialise snapshotSizeFactor, default 0.33
    // (omitted in this excerpt) initialise commitLogCount, default 500
}

/**
 * Factory hook for the data tree used by this database.
 *
 * @return a freshly constructed {@code DataTree}
 */
public DataTree createDataTree() {
    final DataTree freshTree = new DataTree();
    return freshTree;
}

创建DataTree对象:创建/zookeeper/quota、/zookeeper/config节点,创建dataWatches和childWatches对象(使用WatchManager实现类)。

主要方法

// Returns the committedLog collection
public synchronized Collection<Proposal> getCommittedLog();
// Returns the value of dataTree.lastProcessedZxid
public long getDataTreeLastProcessedZxid();
// Returns the dataTree.getSessions() collection
public Collection<Long> getSessions();
// Returns the size of sessionsWithTimeouts
public long getSessionCount();
// Loads dataTree from disk and loads the txnLog into committedLog
public long loadDataBase() throws IOException;
// Replays the txnLog from disk into dataTree and committedLog
public long fastForwardDataBase() throws IOException;
// Adds to committedLog by delegating to addCommittedProposal(Request)
private void addCommittedProposal(TxnHeader hdr, Record txn, TxnDigest digest);
// Adds an entry to committedLog
public void addCommittedProposal(Request request);
// Loads Proposals from the txnLog
public Iterator<Proposal> getProposalsFromTxnLog(long startZxid, long sizeLimit);
// Delegates to dataTree.removeCnxn(cnxn)
public void removeCnxn(ServerCnxn cnxn);
// Delegates to dataTree.killSession(sessionId, zxid)
public void killSession(long sessionId, long zxid);
// Delegates to dataTree.dumpEphemerals(pwriter)
public void dumpEphemerals(PrintWriter pwriter);
// Delegates to dataTree.getEphemerals()
public Map<Long, Set<String>> getEphemerals();
// Delegates to dataTree.getNodeCount()
public int getNodeCount();
// Delegates to dataTree.getEphemerals(sessionId)
public Set<String> getEphemerals(long sessionId);
// Assigns dataTree.lastProcessedZxid
public void setlastProcessedZxid(long zxid);
// Delegates to dataTree.processTxn(hdr, txn, digest)
public ProcessTxnResult processTxn(TxnHeader hdr, Record txn, TxnDigest digest);
// Delegates to dataTree.statNode(path, serverCnxn)
public Stat statNode(String path, ServerCnxn serverCnxn) throws KeeperException.NoNodeException;
// Delegates to dataTree.getNode(path)
public DataNode getNode(String path);
// Delegates to dataTree.getData(path, stat, watcher)
public byte[] getData(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException;
// Implemented via dataTree.setWatches
public void setWatches(long relativeZxid, List<String> dataWatches,
                       List<String> existWatches, List<String> childWatches,
                       List<String> persistentWatches, List<String> persistentRecursiveWatches,
                       Watcher watcher);
// Delegates to dataTree.addWatch(basePath, watcher, mode)
public void addWatch(String basePath, Watcher watcher, int mode);
// Delegates to dataTree.getChildren(path, stat, watcher)
public List<String> getChildren(
    String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException;
// Delegates to dataTree.getAllChildrenNumber(path)
public int getAllChildrenNumber(String path) throws KeeperException.NoNodeException;
// Truncate the ZKDatabase to the specified zxid
public boolean truncateLog(long zxid) throws IOException;
// Deserialize a snapshot from an input archive
public void deserializeSnapshot(InputArchive ia) throws IOException;
// Deserialize a snapshot that contains FileHeader from an input archive
// It is used by the admin restore command
public void deserializeSnapshot(final InputArchive ia, final CheckedInputStream is) throws IOException;
// Serialize the snapshot
public void serializeSnapshot(OutputArchive oa) throws IOException, InterruptedException;
// Persists the request via snapLog.append(si) and increments txnCount
public boolean append(Request si) throws IOException;
// Rolls the underlying txnLog via snapLog.rollLog()
public void rollLog() throws IOException;
// Commits the underlying txnLog via snapLog.commit()
public void commit() throws IOException;
// Initializes the /zookeeper/config data (already covered in the cluster-startup part of this series)
public synchronized void initConfigInZKDatabase(QuorumVerifier qv);
// Delegates to dataTree.containsWatcher(path, type, watcher)
public boolean containsWatcher(String path, WatcherType type, Watcher watcher);
// Delegates to dataTree.removeWatch(path, type, watcher)
public boolean removeWatch(String path, WatcherType type, Watcher watcher);

loadDataBase方法

从磁盘加载dataTree并把txnLog加载到committedLog中:

/**
 * Restores the in-memory database from disk and marks it as initialized.
 *
 * @return the zxid reported by {@code snapLog.restore(...)}
 * @throws IOException if restoring from disk fails
 */
public long loadDataBase() throws IOException {
    final long startTime = Time.currentElapsedTime();

    // snapLog.restore(...) works in two steps:
    //   1. load dataTree from the snapshot
    //   2. use fastForwardFromEdits to replay the txnLog into dataTree and committedLog
    final long restoredZxid =
        snapLog.restore(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);

    // Only reached when the restore completed without throwing.
    initialized = true;
    // (remaining statements omitted in this excerpt)
    return restoredZxid;
}

fastForwardDataBase方法

从txnLog回放事务,更新dataTree并填充committedLog:

/**
 * Replays the txnLog into the data tree and marks the database as initialized.
 *
 * @return the zxid reported by {@code snapLog.fastForwardFromEdits(...)}
 * @throws IOException if reading the txnLog fails
 */
public long fastForwardDataBase() throws IOException {
    // While replaying, commitProposalPlaybackListener calls addCommittedProposal,
    // which is how committedLog gets populated.
    final long replayedZxid =
        snapLog.fastForwardFromEdits(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);

    // Only reached when the replay completed without throwing.
    initialized = true;
    return replayedZxid;
}

addCommittedProposal方法

/**
 * Wraps a replayed transaction in a {@code Request} and records it in committedLog.
 *
 * @param hdr    transaction header; supplies cxid, type and zxid of the request
 * @param txn    transaction record
 * @param digest digest attached to the request before it is recorded
 */
private void addCommittedProposal(TxnHeader hdr, Record txn, TxnDigest digest) {
    final Request replayed = new Request(
        0,
        hdr.getCxid(),
        hdr.getType(),
        hdr,
        txn,
        hdr.getZxid());
    replayed.setTxnDigest(digest);
    addCommittedProposal(replayed);
}

/**
 * Appends a committed request to committedLog, evicting the oldest entry when the
 * log already holds more than {@code commitLogCount} entries, and keeps
 * {@code minCommittedLog} / {@code maxCommittedLog} in step with the queue.
 *
 * @param request the committed request to record
 */
public void addCommittedProposal(Request request) {
    WriteLock wl = logLock.writeLock();
    // Acquire the lock BEFORE entering try: if lock() itself failed, the finally
    // block would otherwise call unlock() on a lock this thread does not hold and
    // replace the real failure with an IllegalMonitorStateException.
    wl.lock();
    try {
        if (committedLog.size() > commitLogCount) {
            committedLog.remove();
            // The queue can be empty here when commitLogCount is 0; in that case
            // the isEmpty() branch below resets both bounds.
            Proposal oldest = committedLog.peek();
            if (oldest != null) {
                minCommittedLog = oldest.packet.getZxid();
            }
        }
        if (committedLog.isEmpty()) {
            minCommittedLog = request.zxid;
            maxCommittedLog = request.zxid;
        }
        byte[] data = request.getSerializeData();
        QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid, data, null);
        Proposal p = new Proposal();
        p.packet = pp;
        p.request = request;
        committedLog.add(p);
        maxCommittedLog = p.packet.getZxid();
    } finally {
        wl.unlock();
    }
}

getProposalsFromTxnLog方法

从txnlog获取Proposal,只填充packet字段:

/**
 * Returns an iterator over proposals read from the txnLog, starting at {@code startZxid}.
 *
 * <p>An empty iterator is returned when {@code sizeLimit} is negative, when the first
 * transaction found has a zxid greater than {@code startZxid}, when {@code sizeLimit}
 * is positive and the storage size of the log exceeds it, or when reading the log
 * fails with an {@code IOException}. A {@code sizeLimit} of 0 skips the size check.
 *
 * @param startZxid zxid the returned proposals must start from
 * @param sizeLimit maximum storage size accepted; negative disables the lookup,
 *                  0 means no size check
 * @return an iterator over the proposals, or {@code TxnLogProposalIterator.EMPTY_ITERATOR}
 */
public Iterator<Proposal> getProposalsFromTxnLog(long startZxid, long sizeLimit) {
    if (sizeLimit < 0) {
        return TxnLogProposalIterator.EMPTY_ITERATOR;
    }

    TxnIterator itr = null;
    try {
        // Read from the txnLog files; underneath this is implemented by
        // FileTxnIterator reading the file stream.
        itr = snapLog.readTxnLog(startZxid, false);

        // If we cannot guarantee that this is strictly the starting txn
        // after a given zxid, we should fail.
        if ((itr.getHeader() != null) && (itr.getHeader().getZxid() > startZxid)) {
            itr.close();
            return TxnLogProposalIterator.EMPTY_ITERATOR;
        }

        if (sizeLimit > 0) {
            long txnSize = itr.getStorageSize();
            if (txnSize > sizeLimit) {
                itr.close();
                return TxnLogProposalIterator.EMPTY_ITERATOR;
            }
        }
    } catch (IOException e) {
        // itr is still null when readTxnLog itself threw, so guard the close.
        if (itr != null) {
            try {
                itr.close();
            } catch (IOException ignored) {
                // Best-effort cleanup: the caller gets an empty iterator either way.
            }
        }
        return TxnLogProposalIterator.EMPTY_ITERATOR;
    }
    return new TxnLogProposalIterator(itr);
}

truncateLog方法

把txnlog数据truncate到指定的zxid位置,然后重新加载DataTree数据:

/**
 * Truncates the txnLog to the given zxid and, on success, reloads the database.
 *
 * @param zxid the zxid to truncate the log to
 * @return {@code true} if the log was truncated and the database reloaded,
 *         {@code false} if {@code snapLog.truncateLog(zxid)} reported failure
 * @throws IOException if truncating or reloading fails
 */
public boolean truncateLog(long zxid) throws IOException {
    // In-memory state is dropped up front, before the log is touched.
    clear();

    final boolean logTruncated = snapLog.truncateLog(zxid);
    if (logTruncated) {
        loadDataBase();
    }
    return logTruncated;
}

deserializeSnapshot方法

/**
 * Clears the current state and rebuilds it from a snapshot read from {@code ia}.
 *
 * @param ia the archive to read the snapshot from
 * @throws IOException if deserialization fails
 */
public void deserializeSnapshot(InputArchive ia) throws IOException {
    clear();

    final DataTree targetTree = getDataTree();
    SerializeUtils.deserializeSnapshot(targetTree, ia, getSessionWithTimeOuts());

    // Only reached when deserialization completed without throwing.
    initialized = true;
}

/**
 * Clears the current state and rebuilds it from a snapshot stream, checking the
 * seal integrity after each section that was read.
 *
 * The order of the reads below matters: data tree and sessions first, then the
 * optional zxid digest, then the optional lastProcessedZxid.
 *
 * @param ia archive the snapshot sections are deserialized from
 * @param is checked stream handed to SnapStream.checkSealIntegrity together with ia
 * @throws IOException if deserialization or an integrity check fails
 */
public void deserializeSnapshot(final InputArchive ia, final CheckedInputStream is) throws IOException {
   
    clear();

    // deserialize data tree
    final DataTree dataTree = getDataTree();
    FileSnap.deserialize(dataTree, getSessionWithTimeOuts(), ia);
    SnapStream.checkSealIntegrity(is, ia);

    // deserialize digest and check integrity
    // (the seal is only checked when deserializeZxidDigest returns true)
    if (dataTree.deserializeZxidDigest(ia, 0)) {
   
        SnapStream.checkSealIntegrity(is, ia);
    }

    // deserialize lastProcessedZxid and check integrity
    // (the seal is only checked when deserializeLastProcessedZxid returns true)
    if (dataTree.deserializeLastProcessedZxid(ia)) {
   
        SnapStream.checkSealIntegrity(is, ia);
    }

    // compare the digest to find inconsistency
    if (dataTree.getDigestFromLoadedSnapshot() != null) {
   
        dataTree.compareSnapshotDigests(dataTree.lastProcessedZxid);
    }

    // Only reached when every step above completed without throwing.
    initialized = true;
}

serializeSnapshot方法

/**
 * Writes the data tree and the session/timeout table to {@code oa} as a snapshot.
 *
 * @param oa the archive the snapshot is written to
 * @throws IOException          if serialization fails
 * @throws InterruptedException declared by the underlying serializer
 */
public void serializeSnapshot(OutputArchive oa) throws IOException, InterruptedException {
    final DataTree sourceTree = getDataTree();
    SerializeUtils.serializeSnapshot(sourceTree, oa, getSessionWithTimeOuts());
}

DataTree

维护树状结构,没有任何网络或客户端连接代码,因此可以以独立的方式进行测试。

维护两个并行的数据结构:一个从完整路径映射到DataNodes的哈希表和一个DataNodes树,对路径的所有访问都是通过哈希表进行的,只有在序列化到磁盘时才遍历DataNodes树。

关键字段

// This map provides a fast lookup to the data nodes
private final NodeHashMap nodes;
// Watch managers for data watches and child watches
private IWatchManager dataWatches;
private IWatchManager childWatches;
// Cached total size of paths and data for all DataNodes
private final AtomicLong nodeDataSize = new AtomicLong(0);
// Lists, per session, the paths of that session's ephemeral nodes
private final Map<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<>();
// This set contains the paths of all container nodes
private final Set<String> containers = Collections.newSetFromMap(new ConcurrentHashMap<>());
// This set contains the paths of all ttl nodes
private final Set<String> ttls = Collections.newSetFromMap(new ConcurrentHashMap<>());
// This is a pointer to the root of the DataTree
private DataNode root = new DataNode(new byte[0], -1L, new StatPersisted());
// create a /zookeeper filesystem that is the proc filesystem of zookeeper
private final DataNode procDataNode = new DataNode(new byte[0], -1L, 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值