0. Reading path
clusterProcessPacket -> clusterProcessGossipSection -> clusterCron
1. serverCron calls clusterCron
serverCron{
...
if (server.cluster_enabled) clusterCron();
...
}
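Note: in recent Redis versions (6.x) this call sits inside a run_with_period(100) block in server.c, so clusterCron fires roughly every 100 ms regardless of the configured hz. A paraphrased sketch, not a verbatim copy of the source:

/* Inside serverCron() (server.c, Redis 6.x, paraphrased):
 * run_with_period(100) executes its body about once every 100 ms. */
run_with_period(100) {
    if (server.cluster_enabled) clusterCron();
}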
clusterCron performs the following operations:
(1) send MEET messages to other nodes so that they join the cluster;
(2) every second, pick one random node and send it a PING message;
(3) if a node has not responded to our PING within the timeout (set by the
cluster-node-timeout config option), mark it as pfail;
(4) check whether a master/slave failover is needed, and perform it if so;
(5) check whether replica migration is needed, and perform it if so.
Notes:
a. For step (1): when the CLUSTER MEET ip port command runs on cluster node A, it
adds the node B identified by "ip:port" to the cluster. The command itself only
stores B's "ip:port" inside A; it is clusterCron that later establishes A's
connection to B and sends the MEET packet.
b. For step (3): a node in a Redis Cluster has two failure states. The first is
pfail (possible failure): when node A receives no response to its PING from node B
within the configured timeout, A marks B as pfail. The second is fail: once a
majority of masters agree that B is pfail, B is marked as fail. Only a node in the
fail state triggers a master/slave failover; the promotion logic is sketched below.
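The pfail-to-fail promotion happens in markNodeAsFailingIfNeeded() (cluster.c),
invoked while processing the gossip sections of incoming packets. The following is
a lightly trimmed sketch of the Redis 6.x logic, not a verbatim copy:

/* Sketch of markNodeAsFailingIfNeeded() (cluster.c, Redis 6.x), trimmed. */
void markNodeAsFailingIfNeeded(clusterNode *node) {
    int failures;
    /* Majority of slot-holding masters. */
    int needed_quorum = (server.cluster->size / 2) + 1;

    if (!nodeTimedOut(node)) return; /* We can still reach it ourselves. */
    if (nodeFailed(node)) return;    /* Already in FAIL state. */

    failures = clusterNodeFailureReportsCount(node);
    /* If myself is a master, count our own PFAIL vote as well. */
    if (nodeIsMaster(myself)) failures++;
    if (failures < needed_quorum) return; /* No majority agreement yet. */

    /* Flip PFAIL into FAIL and remember when it happened. */
    node->flags &= ~CLUSTER_NODE_PFAIL;
    node->flags |= CLUSTER_NODE_FAIL;
    node->fail_time = mstime();

    /* Masters broadcast a FAIL message so that every reachable node
     * flags this node as FAIL too. */
    if (nodeIsMaster(myself)) clusterSendFail(node->name);
}

Both states are visible in the output of redis-cli CLUSTER NODES: a pfail node
carries the fail? flag, a fail node carries fail.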
2. The main flow of clusterCron
/* -----------------------------------------------------------------------------
* CLUSTER cron job
* -------------------------------------------------------------------------- */
/* This is executed 10 times every second */
void clusterCron(void) {
dictIterator *di;
dictEntry *de;
int update_state = 0;
int orphaned_masters; /* How many masters there are without ok slaves. */
int max_slaves; /* Max number of ok slaves for a single master. */
int this_slaves; /* Number of ok slaves for our master (if we are slave). */
mstime_t min_pong = 0, now = mstime();
clusterNode *min_pong_node = NULL;
static unsigned long long iteration = 0;
mstime_t handshake_timeout;
iteration++; /* Number of times this function was called so far. */
/* We want to take myself->ip in sync with the cluster-announce-ip option.
* The option can be set at runtime via CONFIG SET, so we periodically check
* if the option changed to reflect this into myself->ip. */
{
static char *prev_ip = NULL;
char *curr_ip = server.cluster_announce_ip;
int changed = 0;
if (prev_ip == NULL && curr_ip != NULL) changed = 1;
else if (prev_ip != NULL && curr_ip == NULL) changed = 1;
else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1;
if (changed) {
if (prev_ip) zfree(prev_ip);
prev_ip = curr_ip;
if (curr_ip) {
/* We always take a copy of the previous IP address, by
* duplicating the string. This way later we can check if
* the address really changed. */
prev_ip = zstrdup(prev_ip);
strncpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN);
myself->ip[NET_IP_STR_LEN-1] = '\0';
} else {
myself->ip[0] = '\0'; /* Force autodetection. */
}
}
}
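/* Illustrative, not in the source: after an admin runs
 * CONFIG SET cluster-announce-ip 10.0.0.5 (hypothetical address), the next
 * clusterCron tick notices the change here, copies "10.0.0.5" into
 * myself->ip, and the new address is advertised in subsequent outgoing
 * packet headers. */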
/* The handshake timeout is the time after which a handshake node that was
* not turned into a normal node is removed from the nodes. Usually it is
* just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use
* the value of 1 second. */
handshake_timeout = server.cluster_node_timeout;
if (handshake_timeout < 1000) handshake_timeout = 1000;
/* Update myself flags. */
clusterUpdateMyselfFlags();
/* Check if we have disconnected nodes and re-establish the connection.
* Also update a few stats while we are here, that can be used to make
* better decisions in other part of the code. */
/* Get a safe iterator over all cluster nodes. */
di = dictGetSafeIterator(server.cluster->nodes);
/* Reset stats_pfail_nodes (the count of nodes in pfail state) to 0. */
server.cluster->stats_pfail_nodes = 0;
/* Walk all cluster nodes; re-establish the link for any disconnected node. */
while((de = dictNext(di)) != NULL) {
/* The node stored in this dict entry. */
clusterNode *node = dictGetVal(de);
/* Not interested in reconnecting the link with myself or nodes
* for which we have no address. */
if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) continue;
/* Count nodes flagged CLUSTER_NODE_PFAIL (suspected offline). */
if (node->flags & CLUSTER_NODE_PFAIL)
server.cluster->stats_pfail_nodes++;
/* A Node in HANDSHAKE state has a limited lifespan equal to the
* configured node timeout. */
/* The node is still in handshake state and the handshake has outlived
handshake_timeout: drop it from the cluster and move on. */
if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) {
clusterDelNode(node);
continue;
}
/* The node has no link object yet. */
if (node->link == NULL) {
/* Create a cluster link for this node. */
clusterLink *link = createClusterLink(node);
/* Create a TLS or a plain TCP connection depending on tls-cluster. */
link->conn = server.tls_cluster ? connCreateTLS() : connCreateSocket();
/* Store link as the private data of the connection. */
connSetPrivateData(link->conn, link);
/* Connect to the node's cluster bus port. */
if (connConnect(link->conn, node->ip, node->cport, NET_FIRST_BIND_ADDR,
clusterLinkConnectHandler) == -1) {
/* We got a synchronous error from connect before
* clusterSendPing() had a chance to be called.
* If node->ping_sent is zero, failure detection can't work,
* so we claim we actually sent a ping now (that will
* be really sent as soon as the link is obtained). */
if (node->ping_sent == 0) node->ping_sent = mstime();
serverLog(LL_DEBUG, "Unable to connect to "
"Cluster Node [%s]:%d -> %s", node->ip,
node->cport, server.neterr);
/* Free the link and move on to the next node. */
freeClusterLink(link);
continue;
}
/* Attach the link to the node. */
node->link = link;
}
}
/* Release the safe iterator. */
dictReleaseIterator(di);
/* Ping some random node 1 time every 10 iterations, so that we usually ping
* one random node every second. */
if (!(iteration % 10)) {
int j;
/* Check a few random nodes and ping the one with the oldest
* pong_received time. */
/* pong_received is the time at which we last received a PONG from the node. */
for (j = 0; j < 5; j++) {
/* Pick one random node. */
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't ping nodes disconnected or with a ping currently active. */
if (this->link == NULL || this->ping_sent != 0) continue;
/* Skip myself and nodes still in handshake. */
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
/* Track the sampled node with the smallest pong_received, i.e. the one
whose last PONG reply is the oldest. */
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
/* If we found a candidate, PING the node whose PONG reply is the most
overdue, to check whether it is still reachable. */
if (min_pong_node) {
serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
/* Iterate nodes to check if we need to flag something as failing.
* This loop is also responsible to:
* 1) Check if there are orphaned masters (masters without non failing
* slaves).
* 2) Count the max number of non failing slaves for a single master.
* 3) Count the number of slaves for our master, if we are a slave. */
/*
Iterate over all nodes to check whether any of them must be flagged as failing.
This loop also:
(1) counts orphaned masters (masters whose working slaves are all gone), in
orphaned_masters;
(2) computes the maximum number of working slaves for a single master, in
max_slaves;
(3) if myself is a slave, counts the working slaves of my own master, in
this_slaves.
*/
orphaned_masters = 0;
max_slaves = 0;
this_slaves = 0;
di = dictGetSafeIterator(server.cluster->nodes);
while((de = dictNext(di)) != NULL) {
/* The node examined in this iteration. */
clusterNode *node = dictGetVal(de);
now = mstime(); /* Use an updated time at every iteration. */
/* Skip myself, nodes with no known address (NOADDR), and nodes in handshake. */
if (node->flags &
(CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE))
continue;
/* Orphaned master check, useful only if the current instance
* is a slave that may migrate to another master. */
/* Orphaned-master bookkeeping: only meaningful when myself is a slave that
might migrate to another master. The branch below runs when node is a
non-failing master. */
if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) {
/* Count the node's working (non-failing) slaves. */
int okslaves = clusterCountNonFailingSlaves(node);
/* A master is orphaned if it is serving a non-zero number of
* slots, have no working slaves, but used to have at least one
* slave, or failed over a master that used to have slaves. */
/* node serves at least one slot, has no working slaves, and carries the
MIGRATE_TO flag (it used to have slaves, or failed over a master that did). */
if (okslaves == 0 && node->numslots > 0 &&
node->flags & CLUSTER_NODE_MIGRATE_TO)
{
/* One more orphaned master. */
orphaned_masters++;
}
/* Track the maximum number of working slaves seen for a single master. */
if (okslaves > max_slaves) max_slaves = okslaves;
/* If myself is a slave of this very master, record its working-slave count. */
if (nodeIsSlave(myself) && myself->slaveof == node)
this_slaves = okslaves;
}
/* If we are not receiving any data for more than half the cluster
* timeout, reconnect the link: maybe there is a connection
* issue even if the node is alive. */
/* Compute how long we have been waiting for a PONG (ping_delay) and how long
since we last received any data at all (data_delay). */
mstime_t ping_delay = now - node->ping_sent;
mstime_t data_delay = now - node->data_received;
/* Conditions for recycling the link:
- node->link exists (we are connected);
- the link is older than cluster_node_timeout (it was not just reconnected);
- a PING is in flight (ping_sent != 0) whose PONG has not arrived yet
(pong_received < ping_sent);
- we have been waiting for the PONG for more than cluster_node_timeout/2;
- and we have seen no traffic at all for more than cluster_node_timeout/2.
*/
if (node->link && /* is connected */
now - node->link->ctime >
server.cluster_node_timeout && /* was not already reconnected */
node->ping_sent && /* we already sent a ping */
node->pong_received < node->ping_sent && /* still waiting pong */
/* and we are waiting for the pong more than timeout/2 */
ping_delay > server.cluster_node_timeout/2 &&
/* and in such interval we are not seeing any traffic at all. */
data_delay > server.cluster_node_timeout/2)
{
/* Disconnect the link, it will be reconnected automatically. */
/* Free the link; it will be re-established on a later cycle. */
freeClusterLink(node->link);
}
/* If we have currently no active ping in this instance, and the
* received PONG is older than half the cluster timeout, send
* a new ping now, to ensure all the nodes are pinged without
* a too big delay. */
/* No PING is in flight and the last PONG is older than half the timeout:
send a fresh PING now. */
if (node->link &&
node->ping_sent == 0 &&
(now - node->pong_received) > server.cluster_node_timeout/2)
{
/* Send a PING to this node. */
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
/* If we are a master and one of the slaves requested a manual
* failover, ping it continuously. */
/* If myself is a master and node is the slave that requested a manual
failover, keep pinging it as long as its link is up. mf_end is zero when no
manual failover is in progress; otherwise it holds the failover deadline. */
if (server.cluster->mf_end &&
nodeIsMaster(myself) &&
server.cluster->mf_slave == node &&
node->link)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
/* Check only if we have an active ping for this instance. */
/* The checks below require an outstanding PING; skip if none was sent. */
if (node->ping_sent == 0) continue;
/* Check if this node looks unreachable.
* Note that if we already received the PONG, then node->ping_sent
* is zero, so can't reach this code at all, so we don't risk of
* checking for a PONG delay if we didn't sent the PING.
*
* We also consider every incoming data as proof of liveness, since
* our cluster bus link is also used for data: under heavy data
* load pong delays are possible. */
/* Use the smaller of ping_delay and data_delay as the node's effective delay. */
mstime_t node_delay = (ping_delay < data_delay) ? ping_delay :
data_delay;
/* The node's delay exceeds the configured cluster-node-timeout. */
if (node_delay > server.cluster_node_timeout) {
/* Timeout reached. Set the node as possibly failing if it is
* not already in this state. */
if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
node->name);
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
}
}
}
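/* Illustrative timing, not in the source: with the default
 * cluster-node-timeout of 15000 ms, a mute link is torn down (and later
 * reconnected) once both ping_delay and data_delay exceed 7500 ms, and a
 * node is flagged PFAIL once min(ping_delay, data_delay) exceeds 15000 ms. */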
dictReleaseIterator(di);
/* If we are a slave node but the replication is still turned off,
* enable it if we know the address of our master and it appears to
* be up. */
/*
myself->slaveof is the pointer to our master node.
If myself is a slave, replication has not been configured yet
(server.masterhost == NULL), we know who our master is
(myself->slaveof != NULL), and we know its address, then point
replication at the master's IP and port.
*/
if (nodeIsSlave(myself) &&
server.masterhost == NULL &&
myself->slaveof &&
nodeHasAddr(myself->slaveof))
{
replicationSetMaster(myself->slaveof->ip, myself->slaveof->port);
}
/* Abort a manual failover if the timeout is reached. */
manualFailoverCheckTimeout();
/* myself is a slave. */
if (nodeIsSlave(myself)) {
/* Update the manual failover state. */
clusterHandleManualFailover();
/* Unless a module disabled failover for this node
(CLUSTER_MODULE_FLAG_NO_FAILOVER), run the automatic/manual slave
failover logic. */
if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
clusterHandleSlaveFailover();
/* If there are orphaned slaves, and we are a slave among the masters
* with the max number of non-failing slaves, consider migrating to
* the orphaned masters. Note that it does not make sense to try
* a migration if there is no master with at least *two* working
* slaves. */
if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves)
clusterHandleSlaveMigration(max_slaves);
}
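/* Illustrative, not in the source shown here: clusterHandleSlaveMigration()
 * also honors the cluster-migration-barrier config (default 1): a replica
 * migrates to an orphaned master only if its current master would still be
 * left with at least that many working replicas. */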
/* If any node flags changed above, or the cluster is currently in the FAIL
state, recompute and update the cluster state. */
if (update_state || server.cluster->state == CLUSTER_FAIL)
clusterUpdateState();
}
3. Key functions called by clusterCron
3.1 clusterSendPing
/* Send a PING or PONG packet to the specified node, making sure to add enough
 * gossip information. */
void clusterSendPing(clusterLink *link, int type) {
unsigned char *buf;
clusterMsg *hdr;
int gossipcount = 0; /* Number of gossip sections added so far. */
int wanted; /* Number of gossip sections we want to append if possible. */
int totlen; /* Total packet length. */
/* freshnodes is the max number of nodes we can hope to append at all:
* nodes available minus two (ourself and the node we are sending the
* message to). However practically there may be less valid nodes since
* nodes in handshake state, disconnected, are not considered. */
int freshnodes = dictSize(server.cluster->nodes)-2;
/* How many gossip sections we want to add? 1/10 of the number of nodes
* and anyway at least 3. Why 1/10?
*
* If we have N masters, with N/10 entries, and we consider that in
* node_timeout we exchange with each other node at least 4 packets
* (we ping in the worst case in node_timeout/2 time, and we also
* receive two pings from the host), we have a total of 8 packets
* in the node_timeout*2 failure reports validity time. So we have
* that, for a single PFAIL node, we can expect to receive the following
* number of failure reports (in the specified window of time):
*
* PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS:
*
* PROB = probability of being featured in a single gossip entry,
* which is 1 / NUM_OF_NODES.
* ENTRIES = 10.
* TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS.
*
* If we assume we have just masters (so num of nodes and num of masters
* is the same), with 1/10 we always get over the majority, and specifically
* 80% of the number of nodes, to account for many masters failing at the
* same time.
*
* Since we have non-voting slaves that lower the probability of an entry
* to feature our node, we set the number of entries per packet as
* 10% of the total nodes we have. */
wanted = floor(dictSize(server.cluster->nodes)/10);
if (wanted < 3) wanted = 3;
if (wanted > freshnodes) wanted = freshnodes;
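/* Worked example (illustrative, not in the source): with 50 nodes,
 * wanted = floor(50/10) = 5 gossip entries per packet. Plugging into the
 * formula above: (1/50) * 5 * (2*4*50) = 40 expected failure reports in
 * the validity window, i.e. 80% of the nodes, comfortably above the
 * majority needed to promote PFAIL to FAIL. */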
/* Include all the nodes in PFAIL state, so that failure reports are
* faster to propagate to go from PFAIL to FAIL state. */
int pfail_wanted = server.cluster->stats_pfail_nodes;
/* Compute the maximum totlen to allocate our buffer. We'll fix the totlen
* later according to the number of gossip sections we really were able
* to put inside the packet. */
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
totlen += (sizeof(clusterMsgDataGossip)*(wanted+pfail_wanted));
/* Note: clusterBuildMessageHdr() expects the buffer to be always at least
* sizeof(clusterMsg) or more. */
if (totlen < (int)sizeof(clusterMsg)) totlen = sizeof(clusterMsg);
buf = zcalloc(totlen);
hdr = (clusterMsg*) buf;
/* Populate the header. */
if (link->node && type == CLUSTERMSG_TYPE_PING)
link->node->ping_sent = mstime();
clusterBuildMessageHdr(hdr,type);
/* Populate the gossip fields */
int maxiterations = wanted*3;
while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
dictEntry *de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't include this node: the whole packet header is about us
* already, so we just gossip about other nodes. */
if (this == myself) continue;
/* PFAIL nodes will be added later. */
if (this->flags & CLUSTER_NODE_PFAIL) continue;
/* In the gossip section don't include:
* 1) Nodes in HANDSHAKE state.
* 2) Nodes with the NOADDR flag set.
* 3) Disconnected nodes if they don't have configured slots.
*/
if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) ||
(this->link == NULL && this->numslots == 0))
{
freshnodes--; /* Technically not correct, but saves CPU. */
continue;
}
/* Do not add a node we already have. */
if (clusterNodeIsInGossipSection(hdr,gossipcount,this)) continue;
/* Add it */
clusterSetGossipEntry(hdr,gossipcount,this);
freshnodes--;
gossipcount++;
}
/* If there are PFAIL nodes, add them at the end. */
if (pfail_wanted) {
dictIterator *di;
dictEntry *de;
di = dictGetSafeIterator(server.cluster->nodes);
while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
clusterNode *node = dictGetVal(de);
if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
if (node->flags & CLUSTER_NODE_NOADDR) continue;
if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
clusterSetGossipEntry(hdr,gossipcount,node);
freshnodes--;
gossipcount++;
/* We take the count of the slots we allocated, since the
* PFAIL stats may not match perfectly with the current number
* of PFAIL nodes. */
pfail_wanted--;
}
dictReleaseIterator(di);
}
/* Ready to send... fix the totlen field and queue the message in the
* output buffer. */
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
hdr->count = htons(gossipcount);
hdr->totlen = htonl(totlen);
clusterSendMessage(link,buf,totlen);
zfree(buf);
}
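For completeness, here is roughly what the clusterSetGossipEntry() helper used
above looks like in cluster.c (Redis 6.x); treat it as a sketch rather than an
exact copy. It fills the i-th gossip slot of the outgoing header with another
node's identity, addresses, timestamps and flags:

/* Sketch of clusterSetGossipEntry() (cluster.c, Redis 6.x). */
void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) {
    clusterMsgDataGossip *gossip = &(hdr->data.ping.gossip[i]);
    memcpy(gossip->nodename, n->name, CLUSTER_NAMELEN);
    gossip->ping_sent = htonl(n->ping_sent / 1000);        /* ms -> s */
    gossip->pong_received = htonl(n->pong_received / 1000);
    memcpy(gossip->ip, n->ip, sizeof(n->ip));
    gossip->port = htons(n->port);
    gossip->cport = htons(n->cport);
    gossip->flags = htons(n->flags);
    gossip->notused1 = 0;
}

On the receiving side, clusterProcessGossipSection() (see the reading path in
section 0) walks these entries to learn about new nodes and to collect failure
reports for nodes flagged PFAIL/FAIL.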