今天带大家走读TCP cookie的相关代码,让大家了解cookie的原理。
收到SYN包,计算COOKIE值
关键代码路径:
tcp_v4_rcv
tcp_v4_do_rcv
tcp_rcv_state_process
icsk->icsk_af_ops->conn_request = tcp_v4_conn_request
tcp_conn_request
进入主处理函数:
/*
 * tcp_conn_request - handle an incoming SYN on a listening socket.
 * @rsk_ops: request sock operations (slab_name and family are read here)
 * @af_ops: address-family callbacks used below (mss_clamp, route_req,
 *          init_ts_off, init_seq, send_synack)
 * @sk: the socket the SYN arrived on
 * @skb: the received SYN segment
 *
 * Allocates a request sock, parses the SYN's TCP options, selects the
 * initial sequence number (a syncookie when want_cookie is set) and sends
 * the SYN-ACK. Every exit path returns 0; dropped requests are accounted
 * via tcp_listendrop().
 */
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
/* len = -1 marks the fastopen cookie as "not present" until parsed. */
struct tcp_fastopen_cookie foc = { .len = -1 };
/* Non-zero only when the SYN carries an ISN taken from a TW bucket. */
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
bool want_cookie = false;
struct dst_entry *dst;
struct flowi fl;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
/*
 * Syncookie decision: taken when tcp_syncookies == 2 or the request
 * queue is full, and the SYN did not come with a TW-derived isn.
 * If tcp_syn_flood_action() refuses cookies, the SYN is dropped.
 */
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
/* A full accept queue is counted as a listen overflow and dropped. */
if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
req->syncookie = want_cookie;
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
#if IS_ENABLED(CONFIG_MPTCP)
tcp_rsk(req)->is_mptcp = 0;
#endif
/* Parse the SYN options into a scratch structure. */
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss = tp->rx_opt.user_mss;
/* In cookie mode no fastopen cookie is collected (NULL is passed). */
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
/* Cookie mode without a timestamp option: discard all parsed options. */
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
if (IS_ENABLED(CONFIG_SMC) && want_cookie)
tmp_opt.smc_ok = 0;
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
/* From here on a failure must also free req (drop_and_free). */
dst = af_ops->route_req(sk, skb, &fl, req);
if (!dst)
goto drop_and_free;
if (tmp_opt.tstamp_ok)
tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
/* Regular (non-cookie) path with no TW-derived isn. */
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
if (!net->ipv4.sysctl_tcp_syncookies &&
(net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(net->ipv4.sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
rsk_ops->family);
goto drop_and_release;
}
isn = af_ops->init_seq(skb);
}
tcp_ecn_create_request(req, skb, sk, dst);
/*
 * Cookie path: the ISN is the syncookie itself. req->mss is passed by
 * pointer so the cookie code can read and update it. ECN is turned
 * off when the timestamp option is not in use.
 */
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
}
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
/* Recording the SYN and trying fastopen happen only without cookies. */
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
&foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
/* Queueing failed: undo the fastopen child, then free req. */
reqsk_fastopen_remove(fastopen_sk, req, false);
bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
goto drop_and_free;
}
sk->sk_data_ready(sk);
bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
/* Only non-cookie requests are hashed into the request queue. */
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req,
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE,
skb);
/*
 * In cookie mode the request was never hashed (see the check
 * above), so it is freed right after the SYN-ACK is sent.
 */
if (want_cookie) {
reqsk_free(req);
return 0;
}
}
reqsk_put(req);
return 0;
/* Error unwinding: each label falls through to the ones below it. */
drop_and_release:
dst_release(dst);
drop_and_free:
__reqsk_free(req);
drop:
tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
关键代码详解:
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
net->ipv4.sysctl_tcp_syncookies == 2 表示强制开启cookie功能,正常服务器配置为 1。
inet_csk_reqsk_queue_is_full(sk) 判断半连接队列是否已满,如果满了开启cookie功能。
半连接队列满了,但是没有开启cookie,直接drop掉syn报文。
if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
accept队列满了,这时候也直接drop掉。accept队列的大小由listen系统调用的backlog参数配置,并且不会超过内核参数net.core.somaxconn的限制。
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
}
计算synack报文的isn(seq序列号),最终进入cookie_v4_init_sequence函数。请注意cookie_init_sequence的第四个参数&req->mss指向从syn报文中解析到的对端通告的MSS,实现如下:
/*
 * IPv4 entry point for syncookie generation: fetch the IP and TCP headers
 * of the SYN in skb and pass them, together with mssp, to
 * __cookie_v4_init_sequence(), whose result is returned unchanged.
 */
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
	return __cookie_v4_init_sequence(ip_hdr(skb), tcp_hdr(skb), mssp);
}
/*
 * MSS values that a syncookie can represent. The cookie carries the array
 * index, not the MSS itself; the lookup code scans from the last entry
 * towards the first, so entries are kept in ascending order.
 */
static __u16 const msstab[] = {
536,
1300,
1440, /* 1440, 1452: PPPoE */
1460,
};
/*
 * Build a syncookie for the SYN described by iph/th.
 *
 * On entry *mssp holds the MSS to encode. The largest msstab[] entry that
 * does not exceed it is selected (entry 0 when none qualifies), *mssp is
 * overwritten with that table value, and the table index is handed to
 * secure_tcp_syn_cookie() as the data to embed in the cookie.
 */
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
			      u16 *mssp)
{
	const __u16 wanted = *mssp;
	int idx = ARRAY_SIZE(msstab) - 1;

	/* Walk down from the largest entry; index 0 is the fallback. */
	while (idx && wanted < msstab[idx])
		idx--;

	*mssp = msstab[idx];

	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
				     th->source, th->dest, ntohl(th->seq),
				     idx);
}
因为seq只有32bit,不能完整地保留mss,内核只能用最小的空间表达最合理的MSS值。所以内核定义了一个msstab的数组,数组的序号对应了MSS值,且数组的序号将被编码进seq序号中,具体放置在什么位置后文分析。
选择最合适的MSS也是一个技术活,不能大于对端的mss(对端mss小于536时例外,此时取数组第0项536),且需要尽可能逼近对端的mss,所以内核的实现也很简单,不讲解了,在上面函数的代码中。
/*
 * Compute the syncookie (the sequence number sent in the SYN-ACK).
 *
 * The result is the 32-bit sum of four terms:
 *   1. cookie_hash(saddr, daddr, sport, dport, 0, 0)
 *   2. sseq, the peer's sequence number
 *   3. count << COOKIEBITS, where count is tcp_cookie_time()
 *   4. (cookie_hash(saddr, daddr, sport, dport, count, 1) + data),
 *      masked with COOKIEMASK
 *
 * "data" is the small value (the MSS table index) smuggled into the
 * masked second hash so that it can be recovered later.
 */
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
				   __be16 dport, __u32 sseq, __u32 data)
{
	u32 count = tcp_cookie_time();
	u32 masked_part;
	u32 cookie;

	cookie = cookie_hash(saddr, daddr, sport, dport, 0, 0);
	cookie += sseq;
	cookie += count << COOKIEBITS;

	masked_part = cookie_hash(saddr, daddr, sport, dport, count, 1) + data;
	masked_part &= COOKIEMASK;

	return cookie + masked_part;
}
下面是生成synack的seq序列号了(可以自行了解syncookie的基本原理),__cookie_v4_init_sequence函数中选定的MSS索引(mssind)作为参数data传入了secure_tcp_syn_cookie中,syncookie由四部分信息合成:
- 四元组计算得到的(cookie_hash(saddr, daddr, sport, dport, 0, 0))值,暂且表达为a;
- 客户端的seq值(sseq),暂且表达为b;
- 当前计算的时间的u32值((count << COOKIEBITS)),count的低8位移动到了高8位,低24位清零,暂且表达为c;
- 自由空间值((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK),自由空间值暂且表达为d,自由空间值又由两部分组成,分别是四元组和count计算得到的hash值以及data值(表达为e),计算只使用了两者之和的低24位。
- c 高8位 和 d 低24位组合暂且表达为f;
cookie_hash函数是利用了四元组计算hash值,具体细节自己看代码了,不理解可查RFC。
tcp_cookie_time()函数利用jiffies计算时间,函数输出的结果换算成分钟了。也即count表示开机到现在持续的分钟数。count 低8位移动到高24-31位,所以cookie中的count最大只能表示256分钟。
((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK) 整体只保留了低24位,其中data(即MSS索引,取值0~3)是以加法叠加在hash值上的,而不是单独占据固定的比特位。(count << COOKIEBITS) 保留的是高8位。
可以尝试想象,如果客户端针对SYNACK回应的ACK报文被收到了,那么a 和 b 是明确的,这时候可以提取f的高8位,检查时间是否过期(syncookie要求ACK在特定时间内必须回应,超时失效),再提取f的低24位,在低24位中再次利用四元组计算hash值,从而得到MSS索引。假设收到的ack报文序号为seq,确认序号为seq_ack,那么计算还原思路应该是:seq_ack - 1 = a + b + f,其中a,b都是已知变量(b = seq - 1),f = seq_ack - 1 - a - b,提取f的高8位和当前的时间比较,提取f的低24位,f(低24位) = (利用四元组和count计算的hash值 + MSS索引值) 的低24位,最终推导出MSS的索引,提取了MSS值。
收到ACK包,合法性校验和MSS提取
收到cookie机制下三次握手的第三个报文(ACK),关键代码路径:
tcp_v4_rcv
tcp_v4_do_rcv (在TCP_LISTEN状态接收到ACK)
tcp_v4_cookie_check
cookie_v4_check
关键代码走读:
/*
 * Validate the syncookie echoed back by the peer.
 *
 * "cookie" is the candidate cookie value and th->seq - 1 is used as the
 * peer's original sequence number. Returns the MSS from msstab[] that the
 * cookie encodes, or 0 when the decoded index is out of range (i.e. the
 * cookie is not valid).
 */
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
		      u32 cookie)
{
	__u32 peer_isn = ntohl(th->seq) - 1;
	__u32 idx;

	idx = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
				   th->source, th->dest, peer_isn);
	if (idx >= ARRAY_SIZE(msstab))
		return 0;

	return msstab[idx];
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);
其中cookie是server接收到SYN报文时计算的syn cookie值,实际上等于ack_seq - 1。
mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; 很明显是计算到了mss数组的索引,提取对应的MSS。
/*
 * Recover the small "data" value that was embedded in a syncookie.
 *
 * The first hash and the peer's sequence number are subtracted from the
 * cookie; the top bits that remain are compared with the current
 * tcp_cookie_time() to obtain the cookie's age. If the age is
 * MAX_SYNCOOKIE_AGE or more, (__u32)-1 is returned. Otherwise the second
 * hash, computed with the count in effect when the cookie was generated,
 * is subtracted and the result masked with COOKIEMASK.
 *
 * For a bad cookie the returned data is out of range; the caller must
 * check for that.
 */
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
				  __be16 sport, __be16 dport, __u32 sseq)
{
	u32 now = tcp_cookie_time();
	u32 age;

	/* Peel off the first hash and the peer's sequence number. */
	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;

	/* Top bits hold the generation count; compare modulo their width. */
	age = (now - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
	if (age >= MAX_SYNCOOKIE_AGE)
		return (__u32)-1;

	/* Remove the second hash; what is left in the low bits is the data. */
	cookie -= cookie_hash(saddr, daddr, sport, dport, now - age, 1);

	return cookie & COOKIEMASK;
}
count = tcp_cookie_time(); 获取到当前的时间。
cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; 计算得到了上面的f,
/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
if (diff >= MAX_SYNCOOKIE_AGE)
return (__u32)-1;
计算应答是否在合理的时间范围内,其中 MAX_SYNCOOKIE_AGE 为 2,也即要求在2分钟之内必须应答,否则直接丢弃。上面的一堆位运算和移位运算显然是在提取f高8位值,并与当前时间count求差。
显然还需要提取MSS的索引,MSS索引编排在f的低24位中。
return (cookie -
cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
& COOKIEMASK; /* Leaving the data behind */
上面讲过了 f的低24位 = (四元组计算的hash值 + mss 索引) 的低24位值,所以上面代码先利用四元组计算到hash值,减去后再取低24位就得到了mss索引。
讲到这里syncookie的基本内容也讲完了。
网络优化相关的思考
如果你是做网络优化的,尤其是在CDN行业从事网络优化工作者,你一定发现了TCP syncookie的不足:当SYN报文没有携带时间戳选项时(见tcp_conn_request中对tcp_clear_options的调用),server只编排保存了MSS信息(MSS都是偏小的),其它的协商信息全部丢失,丢失了ws,sack等关键信息,这些信息很大程度上决定了TCP传输的效率,很难想象最大窗口为65535,不支持sack的TCP连接在丢包的情况下,究竟面临的是什么灾难。
所以,你一定在想有没有办法让server编排保存ws、sack等信息呢,肯定是有的,但是需要付出一定的代价。编排MSS引入了碰撞冲突,同样编排ws和sack也会引入冲突,并且还会放大冲突,细节不讲。