TCP cookie代码走读_without syncookies last quarter of * backlog is f-CSDN博客

本文深入解析TCP SYN Cookie的工作原理，包括如何计算和验证SYN Cookie，以及在网络优化中可能遇到的问题。当半连接队列满或受到SYN flood攻击时，SYN Cookie用于保护服务器，避免资源耗尽。SYN Cookie通过序列号计算包含连接信息，如源和目的地址、端口、时间戳等，并在ACK包中校验和提取MSS。文章还探讨了SYN Cookie的不足，如丢失其他TCP选项信息，以及可能的优化策略。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

今天带大家走读TCP cookie的相关代码，让大家了解cookie的原理。

收到SYN包，计算COOKIE值

关键代码路径：

tcp_v4_rcv

tcp_v4_do_rcv

tcp_rcv_state_process

icsk->icsk_af_ops->conn_request = tcp_v4_conn_request

tcp_conn_request

进入主处理函数：

/*
 * tcp_conn_request - process an incoming connection request on socket sk.
 *
 * Decides whether the reply must be a syncookie, allocates a request sock,
 * parses the TCP options carried by skb, chooses the initial sequence
 * number and sends the SYN-ACK through af_ops->send_synack().
 *
 * rsk_ops: request sock operations (slab name and family are read here).
 * af_ops:  address-family callbacks (route_req, init_seq, init_ts_off,
 *          send_synack) and the mss_clamp used for option parsing.
 * sk:      the socket the request arrived on.
 * skb:     the received segment.
 *
 * Every path returns 0; dropped requests are accounted via tcp_listendrop().
 */
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb)
{
        struct tcp_fastopen_cookie foc = { .len = -1 };
        __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
        struct tcp_options_received tmp_opt;
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct sock *fastopen_sk = NULL;
        struct request_sock *req;
        bool want_cookie = false;
        struct dst_entry *dst;
        struct flowi fl;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        /* Cookie mode is considered only when no ISN was inherited
         * (isn == 0) and either sysctl_tcp_syncookies is 2 or the request
         * queue is full. If tcp_syn_flood_action() refuses, the request
         * is dropped.
         */
        if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
             inet_csk_reqsk_queue_is_full(sk)) && !isn) {
                want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
                if (!want_cookie)
                        goto drop;
        }

        /* Accept queue full: count a listen overflow and drop. */
        if (sk_acceptq_is_full(sk)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
                goto drop;
        }

        req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
        if (!req)
                goto drop;

        req->syncookie = want_cookie;
        tcp_rsk(req)->af_specific = af_ops;
        tcp_rsk(req)->ts_off = 0;
#if IS_ENABLED(CONFIG_MPTCP)
        tcp_rsk(req)->is_mptcp = 0;
#endif

        /* Parse the options of the incoming segment into tmp_opt. The Fast
         * Open cookie is not collected in cookie mode (NULL is passed).
         */
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = af_ops->mss_clamp;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
                          want_cookie ? NULL : &foc);

        /* In cookie mode without a timestamp option the parsed options
         * are discarded again.
         */
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        if (IS_ENABLED(CONFIG_SMC) && want_cookie)
                tmp_opt.smc_ok = 0;

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb, sk);
        inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

        /* Note: tcp_v6_init_req() might override ir_iif for link locals */
        inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

        dst = af_ops->route_req(sk, skb, &fl, req);
        if (!dst)
                goto drop_and_free;

        if (tmp_opt.tstamp_ok)
                tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
  
          if (!want_cookie && !isn) {
                /* Kill the following clause, if you dislike this way. */
                if (!net->ipv4.sysctl_tcp_syncookies &&
                    (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                     (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
                    !tcp_peer_is_proven(req, dst)) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
                                    rsk_ops->family);
                        goto drop_and_release;
                }

                /* Regular (non-cookie) initial sequence number. */
                isn = af_ops->init_seq(skb);
        }

        tcp_ecn_create_request(req, skb, sk, dst);

        /* Cookie mode: the ISN is produced by cookie_init_sequence(), which
         * receives req->mss by pointer and may therefore rewrite it. ECN is
         * switched off when no timestamp option is in use.
         */
        if (want_cookie) {
                isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
                if (!tmp_opt.tstamp_ok)
                        inet_rsk(req)->ecn_ok = 0;
        }

        tcp_rsk(req)->snt_isn = isn;
        tcp_rsk(req)->txhash = net_tx_rndhash();
        tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
        tcp_openreq_init_rwin(req, sk, dst);
        sk_rx_queue_set(req_to_sk(req), skb);
        if (!want_cookie) {
                tcp_reqsk_record_syn(sk, req, skb);
                fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
        }
        if (fastopen_sk) {
                af_ops->send_synack(fastopen_sk, dst, &fl, req,
                                    &foc, TCP_SYNACK_FASTOPEN, skb);
                /* Add the child socket directly into the accept queue */
                if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
                        reqsk_fastopen_remove(fastopen_sk, req, false);
                        bh_unlock_sock(fastopen_sk);
                        sock_put(fastopen_sk);
                        goto drop_and_free;
                }
                sk->sk_data_ready(sk);
                bh_unlock_sock(fastopen_sk);
                sock_put(fastopen_sk);
        } else {
                tcp_rsk(req)->tfo_listener = false;
                /* Only non-cookie requests are added to the request hash. */
                if (!want_cookie)
                        inet_csk_reqsk_queue_hash_add(sk, req,
                                tcp_timeout_init((struct sock *)req));
                af_ops->send_synack(sk, dst, &fl, req, &foc,
                                    !want_cookie ? TCP_SYNACK_NORMAL :
                                                   TCP_SYNACK_COOKIE,
                                    skb);
                /* Cookie mode: the request is freed as soon as the SYN-ACK
                 * has been handed to send_synack(); it was never hashed.
                 */
                if (want_cookie) {
                        reqsk_free(req);
                        return 0;
                }
        }
        reqsk_put(req);
        return 0;

        /* Cleanup labels fall through: release the route, free the
         * request, then account the drop.
         */
drop_and_release:
        dst_release(dst);
drop_and_free:
        __reqsk_free(req);
drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_conn_request);

关键代码详解：

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
             inet_csk_reqsk_queue_is_full(sk)) && !isn) {
                want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
                if (!want_cookie)
                        goto drop;
        }

net->ipv4.sysctl_tcp_syncookies == 2 表示强制开启cookie功能，正常服务器配置为 1。

inet_csk_reqsk_queue_is_full(sk) 判断半连接队列是否已满，如果满了开启cookie功能。

半连接队列满了，但是没有开启cookie，直接drop掉syn报文。

        if (sk_acceptq_is_full(sk)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
                goto drop;
        }

accept队列满了，这时候也直接drop掉。accept队列的大小是在listen系统调用时配置的，如果不配置使用内核默认配置。

        if (want_cookie) {
                isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
                if (!tmp_opt.tstamp_ok)
                        inet_rsk(req)->ecn_ok = 0;
        }

计算synack报文的isn(seq序列号)，cookie_init_sequence最终会进入cookie_v4_init_sequence函数。请注意cookie_init_sequence的第四个参数&req->mss(对应cookie_v4_init_sequence的mssp参数)指向的是从syn报文中解析到的对端协商的MSS，实现如下：

/*
 * Build the syncookie sequence number for the segment in skb.
 * The IP and TCP headers are looked up from skb and handed, together
 * with mssp, to __cookie_v4_init_sequence().
 */
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
        return __cookie_v4_init_sequence(ip_hdr(skb), tcp_hdr(skb), mssp);
}

static __u16 const msstab[] = {
        536,
        1300,
        1440,   /* 1440, 1452: PPPoE */
        1460,
};

/*
 * Produce a syncookie for the connection described by iph/th.
 *
 * On entry *mssp holds the MSS to encode. It is rounded down to the
 * nearest msstab[] entry (or to msstab[0] if it is below every entry),
 * the rounded value is written back through mssp, and the index of that
 * entry is folded into the returned cookie.
 */
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
                              u16 *mssp)
{
        const __u16 wanted_mss = *mssp;
        int idx = ARRAY_SIZE(msstab) - 1;

        /* Walk from the largest entry downwards; index 0 is the floor. */
        while (idx > 0 && wanted_mss < msstab[idx])
                idx--;

        *mssp = msstab[idx];

        return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
                                     th->source, th->dest,
                                     ntohl(th->seq), idx);
}

因为seq只有32bit不能完整的保留mss，内核只能用最小的空间，表达最合理的MSS值。所以内核定义了一个msstab的数组，数组的序号对应了MSS值，且数组的序号将被编码进seq序号中，具体放置在什么位置后文分析。

选择最合适的MSS也是一个技术活，不能大于对端的mss，且需要尽可能逼近对端的mss，所以内核的实现也很简单，不讲解了，在上面函数的代码中。

/*
 * Compute the secure sequence number used as a syncookie.
 *
 * The result is the sum of four terms:
 *   - a hash of the address/port tuple (cookie_hash(..., 0, 0)),
 *   - sseq, the peer's sequence number,
 *   - count shifted left by COOKIEBITS,
 *   - (a second, count-dependent hash + data) masked with COOKIEMASK.
 *
 * count comes from tcp_cookie_time(). The small "data" value (the caller
 * in this file passes the MSS table index) is added into the second hash
 * so that it can be recovered later.
 */
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
                                   __be16 dport, __u32 sseq, __u32 data)
{
        const u32 count = tcp_cookie_time();
        const u32 tuple_hash = cookie_hash(saddr, daddr, sport, dport, 0, 0);
        const u32 tagged_hash =
                cookie_hash(saddr, daddr, sport, dport, count, 1) + data;

        return tuple_hash + sseq + (count << COOKIEBITS) +
               (tagged_hash & COOKIEMASK);
}

下面是生成synack的seq序列号了(可以自行了解syncookie的基本原理)，__cookie_v4_init_sequence函数中选定的mss在msstab中的索引(mssind)作为参数data传入了secure_tcp_syn_cookie中，syncookie由四部分信息合成：

四元组计算得到的(cookie_hash(saddr, daddr, sport, dport, 0, 0))值，暂且表达为a；
客户端的seq值(sseq)，暂且表达为b；
当前计算的时间的u32值((count << COOKIEBITS))，低8位移动到了高8位，低24位清零，暂且表达为c;
自由空间值((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)，自由空间值暂且表达为d，自由空间值又由两部分组成，分别是由四元组和count计算得到的hash值和data值(表达为e)，两者相加后只保留了低24位。
c 高8位和 d 低24位组合暂且表达为f；

cookie_hash函数是利用了四元组计算hash值，具体细节自己看代码了，不理解可查RFC。

tcp_cookie_time()函数利用jiffies计算时间，函数输出的结果换算成分钟了。也即count表示开机到现在持续的分钟数。count 低8位移动到高24-31位，count最大表示256分钟了。

((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)，整体只保留了低24位，其中data(MSS索引，取值0~3)是通过加法叠加到hash值上的，并不是固定占据某几个比特位，校验时需要减去同样的hash值才能还原。(count << COOKIEBITS) 保留的是count的低8位，放在结果的高8位。

可以尝试想象，如果SYNACK的响应包被收到了，那么a 和 b 是明确的，这时候可以提取c的高8位，检查时间是否过期(syncookie要求synack在特定时间内必须回应，超时失效)，提取c的低24位，在低24位中再次利用四元组计算得到MSS。假设收到的ack报文序号为seq，应答序号为seq_ack，那么计算还原思路应该是：seq_ack - 1 = a + b + f，其中a,b都是已知变量，f = seq_ack - 1 - a - b，提取f的高8位和当前的时间比较，提取f的低24位, f(低24位) = 利用四元组计算hash值 + MSS索引值，最终推导出了MSS的索引，提取了MSS值。

收到ACK包，合法校验和MSS提取

收到cookie机制下的第三次握手ACK报文，关键代码路径：

tcp_v4_rcv

tcp_v4_do_rcv (在TCP_LISTEN状态接收到ACK)

tcp_v4_cookie_check

cookie_v4_check

关键代码走读：

/*
 * Validate an ACK against the syncookie scheme.
 *
 * cookie is the candidate cookie value; the peer's original sequence
 * number is reconstructed as th->seq - 1. Returns the MSS decoded from
 * the cookie, or 0 when the decoded index falls outside msstab[] (which
 * is how check_tcp_syn_cookie() reports an invalid cookie).
 */
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
                      u32 cookie)
{
        const __u32 peer_isn = ntohl(th->seq) - 1;
        const __u32 idx = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
                                               th->source, th->dest, peer_isn);

        if (idx >= ARRAY_SIZE(msstab))
                return 0;

        return msstab[idx];
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);

其中cookie是server接收到SYN报文时计算的syn cookie值，实际上等于ack_seq - 1。

mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; 很明显是计算到了mss数组的索引，提取对应的MSS。


/*
 * Recover the small "data" value that was folded into a syncookie.
 *
 * The tuple hash and sseq are subtracted first. What remains carries the
 * generation count in the bits above COOKIEBITS and (hash + data) in the
 * bits covered by COOKIEMASK.
 *
 * The cookie's count must be less than MAX_SYNCOOKIE_AGE periods behind
 * the current tcp_cookie_time(); otherwise (__u32)-1 is returned.
 *
 * A forged cookie yields an out-of-range value, so the caller must
 * range-check the result.
 */
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
                                  __be16 sport, __be16 dport, __u32 sseq)
{
        const u32 now = tcp_cookie_time();
        u32 age;
        u32 tagged_hash;

        /* Peel off the tuple hash and the peer's sequence number. */
        cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;

        /* Distance between the current count and the one in the cookie,
         * compared only over the bits the cookie can hold.
         */
        age = (now - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
        if (age >= MAX_SYNCOOKIE_AGE)
                return (__u32)-1;

        /* Re-create the hash with the count used at generation time;
         * subtracting it leaves only the data value.
         */
        tagged_hash = cookie_hash(saddr, daddr, sport, dport, now - age, 1);

        return (cookie - tagged_hash) & COOKIEMASK;
}

count = tcp_cookie_time(); 获取到当前的时间。

cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; 计算得到了上面的f，

        /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
        diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
        if (diff >= MAX_SYNCOOKIE_AGE)
                return (__u32)-1;

计算应答是否在合理的时间范围内，其中 MAX_SYNCOOKIE_AGE 为 2，也即要求在2分钟之内必须应答，否则直接丢弃。上面的一堆位运算和移位运算显然是在提取f高8位值。

显然还需要提取MSS的索引，MSS索引编排在f的低24位中。

        return (cookie -
                cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
                & COOKIEMASK;   /* Leaving the data behind */

上面讲过了 f 的低24位 = (四元组计算的hash值 + mss 索引) 的低24位值，所以上面代码先利用四元组计算到hash值，减去后得到了mss索引。

讲到这里syncookie的基本内容也讲完了。

网络优化相关的思考

如果你是做网络优化的，尤其是在CDN行业从事网络优化工作者，你一定发现了TCP syncookie的不足，因为server只编排保存了MSS信息(MSS都是偏小的)，其它的协商信息全部丢失，丢失了ws，sack等关键信息，这些信息很大程度上决定了TCP传输的效率，很难想象最大窗口为65535，不支持sack的TCP连接在丢包的情况下，究竟面临的是什么灾难。

所以，你一定在想有没有办法让server编排保存ws、sack等信息呢，肯定是有的，但是需要付出一定的代价。编排MSS引入了碰撞冲突，同样编排ws和sack也会引入冲突，并且还会放大冲突，细节不讲。