redis源码分析[04]-网络通信-anet.c

本文深入探讨了网络编程中的关键技巧,包括错误处理、设置socket选项、心跳检测、Nagle算法控制、缓冲区大小调整、超时设置、地址解析、连接建立、监听配置、读写操作优化等,适用于TCP和Unix域套接字。
/* Format an error message into the caller-supplied buffer 'err'.
 * Nothing happens when 'err' is NULL. The output is bounded by
 * ANET_ERR_LEN bytes (terminator included), so long messages are
 * truncated rather than overflowing the buffer. */
static void anetSetError(char *err, const char *fmt, ...)
{
    if (err != NULL) {
        va_list args;

        va_start(args, fmt);
        vsnprintf(err, ANET_ERR_LEN, fmt, args);
        va_end(args);
    }
}
/* Switch the blocking mode of descriptor 'fd'.
 * A non-zero 'non_block' sets O_NONBLOCK, zero clears it.
 * Returns ANET_OK on success; on failure returns ANET_ERR and
 * describes the problem in 'err'. */
int anetSetBlock(char *err, int fd, int non_block) {
    /* Read the current file status flags so that only O_NONBLOCK is
     * touched and every other flag is preserved. */
    int flags = fcntl(fd, F_GETFL);

    if (flags == -1) {
        anetSetError(err, "fcntl(F_GETFL): %s", strerror(errno));
        return ANET_ERR;
    }

    flags = non_block ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK);

    /* Write the updated flag set back to the descriptor. */
    if (fcntl(fd, F_SETFL, flags) != -1)
        return ANET_OK;

    anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s", strerror(errno));
    return ANET_ERR;
}
/* Put 'fd' into non-blocking mode. Returns whatever anetSetBlock() returns. */
int anetNonBlock(char *err, int fd) {
    const int non_block = 1;

    return anetSetBlock(err, fd, non_block);
}
/* Put 'fd' into blocking mode. Returns whatever anetSetBlock() returns. */
int anetBlock(char *err, int fd) {
    const int non_block = 0;

    return anetSetBlock(err, fd, non_block);
}

/* Enable TCP keepalive on 'fd'.
 *
 * SO_KEEPALIVE is always switched on. On Linux the probe timing is tuned
 * from 'interval' (seconds) as well: the first probe is sent after
 * 'interval' seconds of idleness, further probes follow every interval/3
 * seconds (at least 1), and 3 probes are attempted.
 *
 * Returns ANET_OK on success, or ANET_ERR with 'err' filled in. */
int anetKeepAlive(char *err, int fd, int interval)
{
    int optval = 1;

    /* Turn the keepalive mechanism on. */
    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, sizeof(optval)) == -1) {
        anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno));
        return ANET_ERR;
    }

#ifdef __linux__
    /* Default settings are more or less garbage, with the keepalive time
     * set to 7200 by default on Linux. Modify settings to make the feature
     * actually useful. */

    /* Idle time before the first probe: 'interval' seconds. */
    optval = interval;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &optval, sizeof(optval)) < 0) {
        anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno));
        return ANET_ERR;
    }

    /* Gap between probes: interval/3 seconds, never less than one. */
    optval = (interval/3 == 0) ? 1 : interval/3;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &optval, sizeof(optval)) < 0) {
        anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno));
        return ANET_ERR;
    }

    /* Number of probes attempted. */
    optval = 3;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &optval, sizeof(optval)) < 0) {
        anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno));
        return ANET_ERR;
    }
#else
    ((void) interval); /* Avoid unused var warning for non Linux systems. */
#endif

    return ANET_OK;
}
/* Set the TCP_NODELAY option of 'fd' to 'val' (non-zero disables Nagle's
 * algorithm, zero enables it).
 *
 * Nagle's algorithm allows at most one unacknowledged small segment per
 * connection: no further small segment is sent until that one is ACKed.
 *
 * Returns ANET_OK on success, or ANET_ERR with 'err' filled in. */
static int anetSetTcpNoDelay(char *err, int fd, int val)
{
    if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != -1)
        return ANET_OK;

    anetSetError(err, "setsockopt TCP_NODELAY: %s", strerror(errno));
    return ANET_ERR;
}

/* Set TCP_NODELAY on 'fd' (Nagle's algorithm off). */
int anetEnableTcpNoDelay(char *err, int fd)
{
    const int nodelay = 1;

    return anetSetTcpNoDelay(err, fd, nodelay);
}

/* Clear TCP_NODELAY on 'fd' (Nagle's algorithm on). */
int anetDisableTcpNoDelay(char *err, int fd)
{
    const int nodelay = 0;

    return anetSetTcpNoDelay(err, fd, nodelay);
}


/* Request a send buffer of 'buffsize' bytes for 'fd' via SO_SNDBUF.
 * Returns ANET_OK on success, or ANET_ERR with 'err' filled in. */
int anetSetSendBuffer(char *err, int fd, int buffsize)
{
    if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buffsize, sizeof(buffsize)) != -1)
        return ANET_OK;

    anetSetError(err, "setsockopt SO_SNDBUF: %s", strerror(errno));
    return ANET_ERR;
}

/* Enable SO_KEEPALIVE on 'fd' without touching any timing parameter.
 * Returns ANET_OK on success, or ANET_ERR with 'err' filled in. */
int anetTcpKeepAlive(char *err, int fd)
{
    int enabled = 1;

    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &enabled, sizeof(enabled)) != -1)
        return ANET_OK;

    anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno));
    return ANET_ERR;
}

/* Apply a send timeout of 'ms' milliseconds to 'fd' through the
 * SO_SNDTIMEO socket option; passing zero disables the timeout.
 * Returns ANET_OK on success, or ANET_ERR with 'err' filled in. */
int anetSendTimeout(char *err, int fd, long long ms) {
    struct timeval timeout;

    /* Split the millisecond value into whole seconds + microseconds. */
    timeout.tv_sec = ms/1000;
    timeout.tv_usec = (ms%1000)*1000;

    if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)) != -1)
        return ANET_OK;

    anetSetError(err, "setsockopt SO_SNDTIMEO: %s", strerror(errno));
    return ANET_ERR;
}

/* Worker shared by anetResolve() and anetResolveIP().
 *
 * Resolves 'host' with getaddrinfo() and stores the textual form of the
 * first returned address in 'ipbuf' (capacity 'ipbuf_len').
 *
 * When 'flags' contains ANET_IP_ONLY, AI_NUMERICHOST is requested, so only
 * strings that already are IPv4 / IPv6 addresses are accepted; in that mode
 * the function acts as a validator / normalizer.
 *
 * Returns ANET_OK on success. When getaddrinfo() fails, returns ANET_ERR
 * and stores the gai_strerror() text in 'err'. */
int anetGenericResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len,
                       int flags)
{
    struct addrinfo hints, *res;
    int gai_rc;

    memset(&hints,0,sizeof(hints));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;  /* specify socktype to avoid dups */
    if (flags & ANET_IP_ONLY) hints.ai_flags = AI_NUMERICHOST;

    gai_rc = getaddrinfo(host, NULL, &hints, &res);
    if (gai_rc != 0) {
        anetSetError(err, "%s", gai_strerror(gai_rc));
        return ANET_ERR;
    }

    /* Only the first entry of the result list is used. Anything that is
     * not AF_INET is treated as an IPv6 address. */
    if (res->ai_family != AF_INET) {
        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)res->ai_addr;
        inet_ntop(AF_INET6, &(sin6->sin6_addr), ipbuf, ipbuf_len);
    } else {
        struct sockaddr_in *sin = (struct sockaddr_in *)res->ai_addr;
        inet_ntop(AF_INET, &(sin->sin_addr), ipbuf, ipbuf_len);
    }

    freeaddrinfo(res);
    return ANET_OK;
}

/* Resolve 'host' into 'ipbuf' by calling anetGenericResolve() with
 * ANET_NONE as flags. */
int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) {
    const int flags = ANET_NONE;

    return anetGenericResolve(err, host, ipbuf, ipbuf_len, flags);
}

/* Validate / normalize 'host' into 'ipbuf' by calling anetGenericResolve()
 * with ANET_IP_ONLY as flags. */
int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len) {
    const int flags = ANET_IP_ONLY;

    return anetGenericResolve(err, host, ipbuf, ipbuf_len, flags);
}

/* Enable SO_REUSEADDR on 'fd'. The descriptor is NOT closed on failure.
 * Returns ANET_OK on success, or ANET_ERR with 'err' filled in. */
static int anetSetReuseAddr(char *err, int fd) {
    int enabled = 1;

    /* Make sure connection-intensive things like the redis benchmark
     * will be able to close/open sockets a zillion of times */
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enabled, sizeof(enabled)) != -1)
        return ANET_OK;

    anetSetError(err, "setsockopt SO_REUSEADDR: %s", strerror(errno));
    return ANET_ERR;
}

/* Create a SOCK_STREAM socket in 'domain' with SO_REUSEADDR enabled.
 * Returns the new descriptor, or ANET_ERR with 'err' filled in; the
 * socket is closed again if the option cannot be set. */
static int anetCreateSocket(char *err, int domain) {
    int sock = socket(domain, SOCK_STREAM, 0);

    if (sock == -1) {
        anetSetError(err, "creating socket: %s", strerror(errno));
        return ANET_ERR;
    }

    /* Make sure connection-intensive things like the redis benchmark
     * will be able to close/open sockets a zillion of times */
    if (anetSetReuseAddr(err,sock) != ANET_ERR)
        return sock;

    close(sock);
    return ANET_ERR;
}

/* Flags accepted by the generic connect helpers. */
#define ANET_CONNECT_NONE 0
#define ANET_CONNECT_NONBLOCK 1
#define ANET_CONNECT_BE_BINDING 2 /* Best effort binding. */

/* Open a TCP connection to 'addr':'port'.
 *
 * The destination is resolved with getaddrinfo() and the candidates are
 * tried in order. For each candidate a socket is created, SO_REUSEADDR is
 * set, the socket is optionally made non-blocking (ANET_CONNECT_NONBLOCK),
 * optionally bound to 'source_addr' (when not NULL), and then connected.
 *
 * With ANET_CONNECT_NONBLOCK a connect() that fails with EINPROGRESS is
 * treated as success and the still-connecting socket is returned.
 *
 * With ANET_CONNECT_BE_BINDING, if no socket could be obtained while a
 * 'source_addr' was given, the whole operation is retried once without a
 * source address.
 *
 * Returns the socket descriptor on success, ANET_ERR otherwise (with the
 * reason stored in 'err' on most failure paths). */
static int anetTcpGenericConnect(char *err, char *addr, int port,
                                 char *source_addr, int flags)
{
    int s = ANET_ERR, rv;
    char portstr[6];  /* strlen("65535") + 1; */
    struct addrinfo hints, *servinfo, *bservinfo, *p, *b;

    snprintf(portstr,sizeof(portstr),"%d",port);
    memset(&hints,0,sizeof(hints));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;

    if ((rv = getaddrinfo(addr,portstr,&hints,&servinfo)) != 0) {
        anetSetError(err, "%s", gai_strerror(rv));
        return ANET_ERR;
    }
    for (p = servinfo; p != NULL; p = p->ai_next) {
        /* Try to create the socket and to connect it.
         * If we fail in the socket() call, or on connect(), we retry with
         * the next entry in servinfo. */
        if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
            continue;
        /* Option / mode failures abort the whole attempt; the 'error'
         * label closes the socket created above. */
        if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
        if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK)
            goto error;
        if (source_addr) {
            int bound = 0;
            /* Using getaddrinfo saves us from self-determining IPv4 vs IPv6 */
            if ((rv = getaddrinfo(source_addr, NULL, &hints, &bservinfo)) != 0)
            {
                anetSetError(err, "%s", gai_strerror(rv));
                goto error;
            }
            /* Bind to the first source address candidate that works. */
            for (b = bservinfo; b != NULL; b = b->ai_next) {
                if (bind(s,b->ai_addr,b->ai_addrlen) != -1) {
                    bound = 1;
                    break;
                }
            }
            freeaddrinfo(bservinfo);
            if (!bound) {
                /* NOTE(review): errno is read after freeaddrinfo(); this
                 * assumes freeaddrinfo() left the bind() errno intact. */
                anetSetError(err, "bind: %s", strerror(errno));
                goto error;
            }
        }
        if (connect(s,p->ai_addr,p->ai_addrlen) == -1) {
            /* If the socket is non-blocking, it is ok for connect() to
             * return an EINPROGRESS error here. */
            if (errno == EINPROGRESS && flags & ANET_CONNECT_NONBLOCK)
                goto end;
            /* Real connect failure: drop this socket, try the next entry. */
            close(s);
            s = ANET_ERR;
            continue;
        }

        /* If we ended an iteration of the for loop without errors, we
         * have a connected socket. Let's return to the caller. */
        goto end;
    }
    /* The loop ran out of candidates: every socket() or connect() failed.
     * The message says "creating socket" in both cases and reports the
     * errno left by the last failing call. */
    if (p == NULL)
        anetSetError(err, "creating socket: %s", strerror(errno));

error:
    if (s != ANET_ERR) {
        close(s);
        s = ANET_ERR;
    }

end:
    freeaddrinfo(servinfo);

    /* Handle best effort binding: if a binding address was used, but it is
     * not possible to create a socket, try again without a binding address. */
    if (s == ANET_ERR && source_addr && (flags & ANET_CONNECT_BE_BINDING)) {
        return anetTcpGenericConnect(err,addr,port,NULL,flags);
    } else {
        return s;
    }
}

/* TCP connect to 'addr':'port' with ANET_CONNECT_NONE flags and no source
 * address. Returns the socket or ANET_ERR. */
int anetTcpConnect(char *err, char *addr, int port)
{
    char *no_source = NULL;

    return anetTcpGenericConnect(err, addr, port, no_source, ANET_CONNECT_NONE);
}

/* TCP connect to 'addr':'port' with ANET_CONNECT_NONBLOCK set and no
 * source address. Returns the socket or ANET_ERR. */
int anetTcpNonBlockConnect(char *err, char *addr, int port)
{
    char *no_source = NULL;

    return anetTcpGenericConnect(err, addr, port, no_source,
                                 ANET_CONNECT_NONBLOCK);
}

/* TCP connect to 'addr':'port' with ANET_CONNECT_NONBLOCK set, passing
 * 'source_addr' as the local address to bind to. Returns the socket or
 * ANET_ERR. */
int anetTcpNonBlockBindConnect(char *err, char *addr, int port,
                               char *source_addr)
{
    const int flags = ANET_CONNECT_NONBLOCK;

    return anetTcpGenericConnect(err, addr, port, source_addr, flags);
}

/* Same as anetTcpNonBlockBindConnect() but also passes
 * ANET_CONNECT_BE_BINDING, so a failed attempt is retried without
 * 'source_addr'. Returns the socket or ANET_ERR. */
int anetTcpNonBlockBestEffortBindConnect(char *err, char *addr, int port,
                                         char *source_addr)
{
    const int flags = ANET_CONNECT_NONBLOCK|ANET_CONNECT_BE_BINDING;

    return anetTcpGenericConnect(err, addr, port, source_addr, flags);
}

/* Connect to the Unix domain socket at 'path'.
 *
 * When 'flags' contains ANET_CONNECT_NONBLOCK the socket is switched to
 * non-blocking mode before connecting, and a connect() that fails with
 * EINPROGRESS is treated as success.
 *
 * 'path' is truncated to fit sun_path (always NUL-terminated).
 *
 * Returns the socket descriptor on success. On failure returns ANET_ERR
 * with 'err' filled in, and the socket is closed on every error path. */
int anetUnixGenericConnect(char *err, char *path, int flags)
{
    int s;
    struct sockaddr_un sa;

    if ((s = anetCreateSocket(err,AF_LOCAL)) == ANET_ERR)
        return ANET_ERR;

    /* Zero the whole address first: strncpy() below copies at most
     * sizeof(sun_path)-1 bytes and does not terminate a truncated copy,
     * so the pre-zeroed last byte is what guarantees NUL termination.
     * It also avoids handing uninitialized stack bytes to connect(). */
    memset(&sa,0,sizeof(sa));
    sa.sun_family = AF_LOCAL;
    strncpy(sa.sun_path,path,sizeof(sa.sun_path)-1);
    if (flags & ANET_CONNECT_NONBLOCK) {
        if (anetNonBlock(err,s) != ANET_OK) {
            /* Do not leak the descriptor when the mode switch fails. */
            close(s);
            return ANET_ERR;
        }
    }
    if (connect(s,(struct sockaddr*)&sa,sizeof(sa)) == -1) {
        /* A non-blocking connect still in progress is not an error. */
        if (errno == EINPROGRESS &&
            flags & ANET_CONNECT_NONBLOCK)
            return s;

        anetSetError(err, "connect: %s", strerror(errno));
        close(s);
        return ANET_ERR;
    }
    return s;
}

/* Connect to the Unix socket at 'path' with ANET_CONNECT_NONE flags.
 * Returns the socket or ANET_ERR. */
int anetUnixConnect(char *err, char *path)
{
    const int flags = ANET_CONNECT_NONE;

    return anetUnixGenericConnect(err, path, flags);
}

/* Connect to the Unix socket at 'path' with ANET_CONNECT_NONBLOCK set.
 * Returns the socket or ANET_ERR. */
int anetUnixNonBlockConnect(char *err, char *path)
{
    const int flags = ANET_CONNECT_NONBLOCK;

    return anetUnixGenericConnect(err, path, flags);
}

/* read(2) wrapper that keeps reading until 'count' bytes have been stored
 * in 'buf'.
 *
 * Returns the number of bytes read: 'count' normally, fewer if EOF is hit
 * first. Returns -1 as soon as read() reports an error (bytes already
 * read are then not reported). */
int anetRead(int fd, char *buf, int count)
{
    ssize_t done = 0;

    while (done != count) {
        ssize_t chunk = read(fd, buf + done, count - done);

        if (chunk == -1) return -1;
        if (chunk == 0) break;      /* EOF: report what we have so far. */
        done += chunk;
    }
    return done;
}

/* write(2) wrapper that keeps writing until 'count' bytes from 'buf' have
 * been sent.
 *
 * Returns the number of bytes written: 'count' normally, fewer if write()
 * returns 0 first. Returns -1 as soon as write() reports an error (bytes
 * already written are then not reported). */
int anetWrite(int fd, char *buf, int count)
{
    ssize_t done = 0;

    while (done != count) {
        ssize_t chunk = write(fd, buf + done, count - done);

        if (chunk == -1) return -1;
        if (chunk == 0) break;      /* No progress: report what was sent. */
        done += chunk;
    }
    return done;
}

/* Bind socket 's' to address 'sa' (length 'len') and start listening with
 * the given 'backlog'.
 *
 * Returns ANET_OK on success. On failure the socket is CLOSED, 'err' is
 * filled in and ANET_ERR is returned. */
static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len, int backlog) {
    const char *failed_call = NULL;

    if (bind(s,sa,len) == -1) {
        anetSetError(err, "bind: %s", strerror(errno));
        failed_call = "bind";
    } else if (listen(s, backlog) == -1) {
        anetSetError(err, "listen: %s", strerror(errno));
        failed_call = "listen";
    }

    if (failed_call == NULL)
        return ANET_OK;

    /* The caller must not reuse 's' after a failure. */
    close(s);
    return ANET_ERR;
}

/* Set IPV6_V6ONLY on socket 's'.
 *
 * Returns ANET_OK on success. On failure the socket is CLOSED, 'err' is
 * filled in and ANET_ERR is returned. */
static int anetV6Only(char *err, int s) {
    int enabled = 1;

    if (setsockopt(s,IPPROTO_IPV6,IPV6_V6ONLY,&enabled,sizeof(enabled)) != -1)
        return ANET_OK;

    anetSetError(err, "setsockopt: %s", strerror(errno));
    close(s);
    return ANET_ERR;
}

/* Create a listening TCP socket on 'port' for address family 'af'
 * (AF_INET or AF_INET6), bound to 'bindaddr' or, when 'bindaddr' is NULL,
 * to the wildcard address (AI_PASSIVE).
 *
 * The candidates returned by getaddrinfo() are walked until socket()
 * succeeds for one of them; that socket is then configured (IPV6_V6ONLY
 * for AF_INET6, SO_REUSEADDR), bound and put in listening state. A
 * failure in any of those steps aborts the call; later candidates are
 * only tried when socket() itself fails.
 *
 * Returns the listening descriptor, or ANET_ERR with 'err' filled in.
 * No descriptor is leaked on any failure path. */
static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog)
{
    int s = -1, rv;
    char _port[6];  /* strlen("65535") + 1 */
    struct addrinfo hints, *servinfo, *p;

    snprintf(_port,sizeof(_port),"%d",port);
    memset(&hints,0,sizeof(hints));
    hints.ai_family = af;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_PASSIVE;    /* No effect if bindaddr != NULL */

    if ((rv = getaddrinfo(bindaddr,_port,&hints,&servinfo)) != 0) {
        anetSetError(err, "%s", gai_strerror(rv));
        return ANET_ERR;
    }
    for (p = servinfo; p != NULL; p = p->ai_next) {
        if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
            continue;

        /* anetV6Only() closes the socket itself when it fails. */
        if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error;
        /* anetSetReuseAddr() leaves the socket open on failure, so it has
         * to be closed here or the descriptor would leak. */
        if (anetSetReuseAddr(err,s) == ANET_ERR) {
            close(s);
            goto error;
        }
        /* anetListen() closes the socket itself when it fails. */
        if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog) == ANET_ERR) goto error;
        goto end;
    }
    /* Reaching this point means the list was exhausted without socket()
     * ever succeeding (every other outcome leaves the loop via goto). */
    anetSetError(err, "unable to bind socket");

error:
    s = ANET_ERR;
end:
    freeaddrinfo(servinfo);
    return s;
}

/* Create a listening TCP socket by calling _anetTcpServer() with the
 * AF_INET address family. Returns the socket or ANET_ERR. */
int anetTcpServer(char *err, int port, char *bindaddr, int backlog)
{
    const int family = AF_INET;

    return _anetTcpServer(err, port, bindaddr, family, backlog);
}

/* Create a listening TCP socket by calling _anetTcpServer() with the
 * AF_INET6 address family. Returns the socket or ANET_ERR. */
int anetTcp6Server(char *err, int port, char *bindaddr, int backlog)
{
    const int family = AF_INET6;

    return _anetTcpServer(err, port, bindaddr, family, backlog);
}

/* Create a listening Unix domain socket at 'path'.
 *
 * 'path' is truncated to fit sun_path. When 'perm' is non-zero the socket
 * file is chmod()ed to it after a successful listen; the chmod() result is
 * not checked.
 *
 * Returns the listening descriptor, or ANET_ERR with 'err' filled in
 * (anetListen() closes the socket when it fails). */
int anetUnixServer(char *err, char *path, mode_t perm, int backlog)
{
    struct sockaddr_un addr;
    int sock = anetCreateSocket(err,AF_LOCAL);

    if (sock == ANET_ERR)
        return ANET_ERR;

    memset(&addr,0,sizeof(addr));
    addr.sun_family = AF_LOCAL;
    strncpy(addr.sun_path,path,sizeof(addr.sun_path)-1);

    if (anetListen(err,sock,(struct sockaddr*)&addr,sizeof(addr),backlog) == ANET_ERR)
        return ANET_ERR;

    if (perm != 0)
        chmod(addr.sun_path, perm);

    return sock;
}

/* accept(2) wrapper that transparently retries when the call is
 * interrupted (EINTR).
 *
 * Returns the accepted descriptor, or ANET_ERR with 'err' filled in for
 * any other failure. */
static int anetGenericAccept(char *err, int s, struct sockaddr *sa, socklen_t *len) {
    int client;

    do {
        client = accept(s,sa,len);
    } while (client == -1 && errno == EINTR);

    if (client == -1) {
        anetSetError(err, "accept: %s", strerror(errno));
        return ANET_ERR;
    }
    return client;
}

/* Accept a connection on listening socket 's'.
 *
 * When 'ip' is not NULL it receives the peer address in text form
 * (capacity 'ip_len'); when 'port' is not NULL it receives the peer port
 * in host byte order. Any peer family other than AF_INET is decoded as
 * IPv6.
 *
 * Returns the accepted descriptor, or ANET_ERR with 'err' filled in. */
int anetTcpAccept(char *err, int s, char *ip, size_t ip_len, int *port) {
    struct sockaddr_storage peer;
    socklen_t peer_len = sizeof(peer);
    int fd = anetGenericAccept(err,s,(struct sockaddr*)&peer,&peer_len);

    if (fd == -1)
        return ANET_ERR;

    if (peer.ss_family != AF_INET) {
        struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&peer;
        if (ip) inet_ntop(AF_INET6,(void*)&(in6->sin6_addr),ip,ip_len);
        if (port) *port = ntohs(in6->sin6_port);
    } else {
        struct sockaddr_in *in4 = (struct sockaddr_in *)&peer;
        if (ip) inet_ntop(AF_INET,(void*)&(in4->sin_addr),ip,ip_len);
        if (port) *port = ntohs(in4->sin_port);
    }
    return fd;
}

/* Accept a connection on Unix domain listening socket 's'. The peer
 * address is fetched but discarded.
 * Returns the accepted descriptor, or ANET_ERR with 'err' filled in. */
int anetUnixAccept(char *err, int s) {
    struct sockaddr_un peer;
    socklen_t peer_len = sizeof(peer);
    int fd = anetGenericAccept(err,s,(struct sockaddr*)&peer,&peer_len);

    return (fd == -1) ? ANET_ERR : fd;
}

/* Describe the peer of connected socket 'fd'.
 *
 * When 'ip' is not NULL it receives (capacity 'ip_len'):
 *   - the textual address for AF_INET / AF_INET6 peers,
 *   - the fixed string "/unixsocket" for AF_UNIX peers (truncated if the
 *     buffer is too small, but always NUL-terminated).
 * When 'port' is not NULL it receives the peer port in host byte order
 * (0 for AF_UNIX).
 *
 * Returns 0 on success. Returns -1 when getpeername() fails, 'ip_len' is 0
 * or the address family is not one of the above; in that case 'ip' is set
 * to "?" (or to an empty string when only one byte is available) and
 * '*port' to 0. */
int anetPeerToString(int fd, char *ip, size_t ip_len, int *port) {
    struct sockaddr_storage sa;
    socklen_t salen = sizeof(sa);

    if (getpeername(fd,(struct sockaddr*)&sa,&salen) == -1) goto error;
    if (ip_len == 0) goto error;

    if (sa.ss_family == AF_INET) {
        struct sockaddr_in *s = (struct sockaddr_in *)&sa;
        if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len);
        if (port) *port = ntohs(s->sin_port);
    } else if (sa.ss_family == AF_INET6) {
        struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa;
        if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len);
        if (port) *port = ntohs(s->sin6_port);
    } else if (sa.ss_family == AF_UNIX) {
        /* snprintf() rather than strncpy(): strncpy() would leave 'ip'
         * without a terminator whenever ip_len <= strlen("/unixsocket"). */
        if (ip) snprintf(ip,ip_len,"%s","/unixsocket");
        if (port) *port = 0;
    } else {
        goto error;
    }
    return 0;

error:
    if (ip) {
        if (ip_len >= 2) {
            ip[0] = '?';
            ip[1] = '\0';
        } else if (ip_len == 1) {
            ip[0] = '\0';
        }
    }
    if (port) *port = 0;
    return -1;
}

/* Render an IP / port pair in the canonical "ip:port" form used to display
 * addresses within Redis. An 'ip' containing ':' is taken to be IPv6 and is
 * wrapped in brackets: "[ip]:port".
 *
 * Output goes to 'buf' (capacity 'buf_len'); the return value is the one
 * of snprintf(). */
int anetFormatAddr(char *buf, size_t buf_len, char *ip, int port) {
    int looks_ipv6 = (strchr(ip,':') != NULL);

    if (looks_ipv6)
        return snprintf(buf,buf_len,"[%s]:%d",ip,port);
    return snprintf(buf,buf_len,"%s:%d",ip,port);
}

/* Format the peer address of socket 'fd' into 'buf' (capacity 'buf_len')
 * using anetFormatAddr(). The result of anetPeerToString() is not checked;
 * whatever it stored in the local ip / port is formatted as-is.
 * Returns the value returned by anetFormatAddr(). */
int anetFormatPeer(int fd, char *buf, size_t buf_len) {
    int peer_port;
    char peer_ip[INET6_ADDRSTRLEN];

    anetPeerToString(fd, peer_ip, sizeof(peer_ip), &peer_port);
    return anetFormatAddr(buf, buf_len, peer_ip, peer_port);
}

/* Describe the local address of socket 'fd'.
 *
 * When 'ip' is not NULL it receives the textual address (capacity
 * 'ip_len'); when 'port' is not NULL it receives the local port in host
 * byte order. Any family other than AF_INET is decoded as IPv6.
 *
 * Returns 0 on success. When getsockname() fails returns -1, sets '*port'
 * to 0 and 'ip' to "?" (or to an empty string when only one byte is
 * available); a NULL 'ip' or a zero 'ip_len' is left untouched. */
int anetSockName(int fd, char *ip, size_t ip_len, int *port) {
    struct sockaddr_storage sa;
    socklen_t salen = sizeof(sa);

    if (getsockname(fd,(struct sockaddr*)&sa,&salen) == -1) {
        if (port) *port = 0;
        /* 'ip' is optional and may be shorter than two bytes: never write
         * through a NULL pointer or past the end of the buffer. */
        if (ip) {
            if (ip_len >= 2) {
                ip[0] = '?';
                ip[1] = '\0';
            } else if (ip_len == 1) {
                ip[0] = '\0';
            }
        }
        return -1;
    }
    if (sa.ss_family == AF_INET) {
        struct sockaddr_in *s = (struct sockaddr_in *)&sa;
        if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len);
        if (port) *port = ntohs(s->sin_port);
    } else {
        struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa;
        if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len);
        if (port) *port = ntohs(s->sin6_port);
    }
    return 0;
}

/* Format the local address of socket 'fd' into 'fmt' (capacity 'fmt_len')
 * using anetFormatAddr(). The result of anetSockName() is not checked;
 * whatever it stored in the local ip / port is formatted as-is.
 * Returns the value returned by anetFormatAddr(). */
int anetFormatSock(int fd, char *fmt, size_t fmt_len) {
    int local_port;
    char local_ip[INET6_ADDRSTRLEN];

    anetSockName(fd, local_ip, sizeof(local_ip), &local_port);
    return anetFormatAddr(fmt, fmt_len, local_ip, local_port);
}

 

<think> 我们遇到了一个新的错误信息:`Node redis-cluster-1.redis-cluster.middleware.svc.cluster.local:6380 replied with error: ERR Invalid node address specified: redis-cluster-0.redis-cluster.middleware.svc.cluster.local:6380`。这个错误发生在Redis集群初始化过程中,当我们尝试使用`redis-cli --cluster create`命令创建集群时。让我们分析可能的原因和解决方案。 ### 错误分析 1. **DNS解析问题**:节点名称(如`redis-cluster-0.redis-cluster.middleware.svc.cluster.local`)可能无法被正确解析为IP地址。每个节点在加入集群时都需要能够解析其他节点的域名。 2. **节点未准备好**:在执行初始化命令时,某些节点可能还未完全启动,导致无法识别其他节点地址。 3. **Redis版本兼容性问题**:不同节点可能运行不同版本的Redis,导致协议不兼容。 4. **网络策略限制**:Kubernetes网络策略可能阻止了节点之间的通信,特别是跨节点的通信。 5. **节点配置错误**:Redis节点的配置文件(如`cluster-enabled`)可能未正确设置为集群模式,导致节点拒绝加入集群。 ### 解决方案 #### 1. 验证DNS解析 在初始化Job的容器中,添加DNS解析检查命令,确保每个节点域名都能解析为正确的IP地址。修改Job的command部分: ```yaml command: - /bin/bash - -c - | set -e echo "开始Redis集群初始化..." # 验证节点DNS解析 for i in {0..5}; do echo "解析节点 redis-cluster-$i:" nslookup redis-cluster-$i.redis-cluster.middleware.svc.cluster.local done # 生成节点列表 NODES="" for i in {0..5}; do NODES="${NODES} redis-cluster-$i.redis-cluster.middleware.svc.cluster.local:6380" done # ... 其余部分保持不变 ``` #### 2. 确保节点已启动 在尝试初始化集群之前,等待所有节点准备好。我们可以使用循环检查每个节点是否响应`PING`命令。 ```yaml command: - /bin/bash - -c - | set -e echo "开始Redis集群初始化..." # 等待所有节点启动 for i in {0..5}; do until redis-cli -h redis-cluster-$i.redis-cluster.middleware.svc.cluster.local -p 6380 ping; do echo "节点 redis-cluster-$i 未就绪,等待10秒..." sleep 10 done echo "节点 redis-cluster-$i 已就绪." done # 生成节点列表 # ... (同上) ``` #### 3. 检查Redis集群配置文件 确保每个Redis节点都正确配置了集群模式。在Redis的配置文件中,必须包含: ```conf cluster-enabled yes cluster-node-timeout 15000 cluster-config-file nodes.conf ``` #### 4. 使用IP地址代替域名 如果DNS解析问题持续存在,可以考虑使用Pod的IP地址来初始化集群。但注意,在Kubernetes StatefulSet中,Pod重启后IP可能会变化,因此不推荐长期使用。但作为初始化Job的临时方案,可以尝试: ```yaml command: - /bin/bash - -c - | set -e echo "开始Redis集群初始化..." 
# 获取当前Pod的IP(仅用于调试,不推荐) # 注意:集群节点应该使用稳定的网络标识(如域名),所以这只是一个备选方案 # 等待节点并获取IP nodes=() for i in {0..5}; do until ip=$(dig +short redis-cluster-$i.redis-cluster.middleware.svc.cluster.local); do echo "等待解析 redis-cluster-$i ..." sleep 5 done echo "节点 redis-cluster-$i 的IP: $ip" nodes+=("$ip:6380") done # 初始化集群 echo "初始化节点: ${nodes[*]}" redis-cli --cluster create "${nodes[@]}" --cluster-replicas 1 --cluster-yes # ... 其余部分 ``` #### 5. 检查网络策略 确保在`middleware`命名空间中,Redis Pod之间允许通信。检查NetworkPolicy是否允许6380端口的TCP通信。 ### 完整修复后的Job配置示例 ```yaml apiVersion: batch/v1 kind: Job metadata: name: redis-cluster-init namespace: middleware spec: backoffLimit: 2 template: spec: restartPolicy: OnFailure containers: - name: cluster-init image: 192.168.3.183:8001/library/redis:6.2.14 command: - /bin/bash - -c - | set -ex echo "===== 开始Redis集群初始化 =====" # 等待所有节点就绪(逐个检查) for i in {0..5}; do node_addr="redis-cluster-$i.redis-cluster.middleware.svc.cluster.local:6380" echo "等待节点 $node_addr 就绪..." until redis-cli -h redis-cluster-$i.redis-cluster.middleware.svc.cluster.local -p 6380 ping; do sleep 10 done echo "节点 $node_addr 已响应." done # 生成节点列表(使用数组避免空格问题) nodes=() for i in {0..5}; do nodes+=("redis-cluster-$i.redis-cluster.middleware.svc.cluster.local:6380") done # 初始化集群 echo "创建集群: ${nodes[*]}" redis-cli --cluster create "${nodes[@]}" \ --cluster-replicas 1 \ --cluster-yes # 验证集群状态 echo "验证集群状态..." redis-cli --cluster check ${nodes[0]} echo "===== Redis集群初始化成功! =====" ``` ### 关键点说明 - **`set -ex`**:开启详细执行日志,出错自动退出。 - **节点等待循环**:确保所有节点启动完成后再初始化。 - **数组存储节点地址**:避免字符串拼接造成空格问题。 - **`--cluster-yes`**:自动确认集群配置变更。 ### 相关问题 1. Kubernetes中StatefulSet的DNS解析机制是什么? 2. Redis集群初始化阶段如何避免节点地址解析失败? 3. 如何诊断Kubernetes中的网络策略问题? 4. Redis集群模式下节点间通信需要哪些端口? 5. 如何为Redis集群配置持久化存储以避免数据丢失?[^1] </think> ### Redis集群初始化错误分析:无效节点地址 在Redis集群初始化job中遇到`ERR Invalid node address specified`错误的主要原因如下: #### 错误原因分析 1. 
**DNS解析问题**(核心原因): - Kubernetes集群内DNS解析未完成时尝试创建集群[^1] - 域名`redis-cluster-X.redis-cluster.middleware.svc.cluster.local`尚未被集群DNS系统正确注册 - Job容器尝试连接尚未完全就绪的Redis节点 2. **节点启动顺序问题**: - StatefulSet中的Redis节点未完全启动就开始初始化 - 各节点启动速度不一致导致部分节点无法识别其他节点 3. **网络策略限制**: - Kubernetes网络策略可能阻止了节点间的通信 - Redis集群需要的端口(6380)未完全开放 4. **Redis配置问题**: - `cluster-enabled`配置未正确设置 - 节点未以集群模式启动 --- ### 完整解决方案 #### 修复后的Job配置(redis-cluster-job.yaml) ```yaml apiVersion: batch/v1 kind: Job metadata: name: redis-cluster-init namespace: middleware spec: backoffLimit: 3 # 增加重试次数 template: spec: restartPolicy: OnFailure containers: - name: cluster-init image: 192.168.3.183:8001/library/redis:6.2.14 command: - /bin/bash - -c - | set -ex # 启用详细日志和出错退出 echo "===== Redis集群初始化开始 =====" # 1. 确保DNS解析就绪 echo "验证DNS解析..." for i in {0..5}; do until nslookup redis-cluster-$i.redis-cluster.middleware.svc.cluster.local >/dev/null 2>&1; do echo "等待 redis-cluster-$i DNS解析..." sleep 10 done done # 2. 等待所有Redis节点就绪 echo "等待节点启动..." for i in {0..5}; do until redis-cli -h redis-cluster-$i.redis-cluster.middleware.svc.cluster.local -p 6380 ping >/dev/null 2>&1; do echo "节点 $i 未就绪,等待..." sleep 15 done done sleep 20 # 额外缓冲时间 # 3. 使用IP地址代替域名(备选方案) echo "获取节点IP地址..." nodes=() for i in {0..5}; do ip=$(getent hosts redis-cluster-$i.redis-cluster.middleware.svc.cluster.local | awk '{ print $1 }') nodes+=("$ip:6380") done # 4. 创建集群(双保险方案) echo "初始化集群..." if ! redis-cli --cluster create "${nodes[@]}" \ --cluster-replicas 1 \ --cluster-yes; then # 回退到域名方式(某些环境IP可能变化) echo "使用IP方式失败,尝试域名方式..." 
redis-cli --cluster create \ redis-cluster-0.redis-cluster.middleware.svc.cluster.local:6380 \ redis-cluster-1.redis-cluster.middleware.svc.cluster.local:6380 \ redis-cluster-2.redis-cluster.middleware.svc.cluster.local:6380 \ redis-cluster-3.redis-cluster.middleware.svc.cluster.local:6380 \ redis-cluster-4.redis-cluster.middleware.svc.cluster.local:6380 \ redis-cluster-5.redis-cluster.middleware.svc.cluster.local:6380 \ --cluster-replicas 1 \ --cluster-yes fi # 5. 最终验证 echo "验证集群状态..." redis-cli --cluster check redis-cluster-0.redis-cluster.middleware.svc.cluster.local:6380 redis-cli -h redis-cluster-0 -p 6380 cluster info | grep -E 'cluster_state|cluster_slots_covered' echo "===== Redis集群初始化成功! =====" ``` #### 关键修复点说明 1. **DNS等待机制**: ```bash until nslookup redis-cluster-$i...; do sleep 10; done ``` 确保所有节点域名可解析[^1] 2. **节点就绪检查**: ```bash until redis-cli -h redis-cluster-$i ping; do sleep 15; done ``` 确认Redis进程已启动并响应 3. **双保险连接方式**: - 优先使用IP地址连接(避免DNS未刷新) - 域名方式作为备选方案(防止IP变化) 4. **延长等待时间**: `sleep 20` 确保所有节点完成握手和配置同步 --- ### 验证成功指标 1. **集群状态**: ```bash cluster_state:ok ``` 2. **槽位覆盖**: ```bash cluster_slots_covered:16384 ``` 3. **节点角色**: ```bash 3 master(s) | 3 slave(s) ``` --- ### 相关问题 1. Kubernetes StatefulSet中如何确保DNS解析顺序? 2. Redis集群初始化时需要开放哪些网络端口?[^1] 3. 如何诊断Kubernetes中的DNS解析问题? 4. Redis集群节点无法互通的常见网络策略配置有哪些? 5. 为什么Redis集群需要精确分配16384个槽位?[^1]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值