TCP窗口的选择

TCP窗口的选择
-
1) TCP连接可看成端与端之间全双工的数据管道,
每一端既是发送端又是接收端并且可以同时发送和接收数据流.
输入数据流被发送端分割成连续的TCP片段, 再被接收端拼合成连续的输出数据流.
由于TCP片段经过IP网络传输是有延迟的, 发送端发送TCP片段时采用了流水线策略.
发送端并不要求接收端对每一个TCP片段作出应答, 而是连续地发送TCP片段,
直到超过一定的限度. 接收端接收到TCP片段后, 以一定的策略作出响应,
将已经成功拼合的数据流的终止序列号发送给对端. 发送端跟踪接收端的响应,
使未响应的已发送数据不超过当前的发送窗口长度. 可见,
发送端数据的持续流动是靠接收端生成的响应来异步驱动的, 合理的选择发送窗口尺寸,
可以补偿响应延迟, 使得数据能够稳定地流动, 消除等待状态, 充分利用网络带宽.
-
2) 发送窗口尺寸不是由发送端而是由接收端选择.
连接端点之间通过TCP包window字段动态交换各自的接收窗口尺寸,
使得接收端的接收窗口等于发送端的发送窗口. 发送端每生成一个TCP包,
都要重新计算window字段, 通告自己的接收窗口.
接收窗口尺寸的选取与对接收端MSS的估测值(ack.rcv_mss)和当前可用的接收缓存总量有关,
接收缓存按一定的比例划为可用接收窗口,
比例系数可由sysctl_tcp_adv_win_scale系统配置设定. 计算出的窗口尺寸是MSS的整数倍,
如果接收缓存比较宽裕, 且当前接收窗口不大于可用窗口、与可用窗口相差不足1个rcv_mss时,
保持当前接收窗口尺寸不变, 不变的接收窗口可以提高接收端的处理效率.
当可用窗口不足1个MSS时, 产生零窗口, 意味着本端(作为接收端)暂时拒绝接收数据. 如果系统内存比较紧张,
接收钳位窗口(rcv_ssthresh)被钳制为不超过4个通告钳位段长(advmss).
-
3) 连接成功后数据传送前的接收窗口称为初始接收窗口,
它的值由套接字接收缓存总量和通告钳位段长(advmss)算出, 一般情况下, 当MSS不超过1460时,
初始窗口不超过4个MSS, MSS大于1460不超过4380时, 初始窗口不超过3个MSS, MSS大于4380时,
初始窗口不超过2个MSS. 接收钳位窗口(rcv_ssthresh)初值为初始接收窗口.
-
4) 在连接握手时, 可以用窗口标度选项(TCPOPT_WINDOW)按指数倍扩展16位window所表示的尺寸,
指数最大为14, 最大允许约1G的窗口. 发送端生成window字段所用的标度为tp->rcv_wscale,
接收端提取window字段所用的标度为tp->snd_wscale.


-

/* Cap applied to the unscaled initial receive window offered by
 * tcp_select_initial_window(); per the note there, larger initial
 * offers break some buggy TCP stacks.
 */
#define MAX_TCP_WINDOW 32767

/* Tunable read by tcp_select_initial_window(): used as a right-shift
 * count on the offered space when deciding whether to back the window
 * scale off by one; a value of 0 disables that adjustment.
 */
int sysctl_tcp_app_win = 31;

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results through the output pointers.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 *
 * space:        receive space the caller intends to offer
 * mss:          segment size used to quantize the offer (must be >= 1)
 * rcv_wnd:      out - initial receive window to advertise
 * window_clamp: in/out - upper bound on the window; 0 means "not set"
 * wscale_ok:    non-zero if window scaling may be used
 * rcv_wscale:   out - window scale shift chosen (0..14)
 */
static inline void tcp_select_initial_window(int space, __u32 mss,
					     __u32 *rcv_wnd,
					     __u32 *window_clamp,
					     int wscale_ok,
					     __u8 *rcv_wscale)
{
	/* If no clamp set the clamp to the max possible scaled window
	 * (65535 << 14, i.e. roughly 1GB after rescaling).
	 */
	if (*window_clamp == 0)
		(*window_clamp) = (65535<<14);
	space = min(*window_clamp,space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space/mss)*mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. We try to be nice.
	 * If we are not window scaling, then this truncates
	 * our initial window offering to 32k. There should also
	 * be a sysctl option to stop being nice.
	 */
	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Find the smallest shift that fits space into 16 bits.
		 * See RFC1323 for an explanation of the limit to 14.
		 */
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
		/* Back the scale off by one when, after subtracting the
		 * larger of (space >> sysctl_tcp_app_win) and the scaled mss,
		 * the remainder is below half of the 16-bit range.
		 * NOTE(review): presumably this trades range for finer
		 * window granularity - confirm against the callers.
		 */
		if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
		    space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2)
			(*rcv_wscale)--;
	}

	/* Set initial window to value enough for senders,
	 * following RFC2414. Senders, not following this RFC,
	 * will be satisfied with 2.
	 */
	if (mss > (1<<*rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460*3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd*mss)
			*rcv_wnd = init_cwnd*mss;
	}
	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp);
}

/* Chose a new window to advertise, update state in tcp_opt for the
* socket, and return result with RFC1323 scaling applied. The return
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
static __inline__ u16 tcp_select_window(struct sock *sk) 数据传输时window字段的选择
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
u32 cur_win = tcp_receive_window(tp); 取当前接收窗口内剩余有效接收空间
u32 new_win = __tcp_select_window(sk); 根据套接字剩余接收缓存选择的接收窗口

/* Never shrink the offered window */
if(new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
* we will not be able to advertise a zero
* window in time. --DaveM
*
* Relax Will Robinson.
*/
new_win = cur_win; 取大者
}
tp->rcv_wnd = new_win; 刷新接收窗口尺寸
tp->rcv_wup = tp->rcv_nxt; 刷新接收窗口的起始位置

/* RFC1323 scaling applied */
new_win >>= tp->rcv_wscale; 重新标度

#ifdef TCP_FORMAL_WINDOW
if (new_win == 0) {
/* If we advertise zero window, disable fast path. */
tp->pred_flags = 0;
} else if (cur_win == 0 && tp->pred_flags == 0 &&
skb_queue_len(&tp->out_of_order_queue) == 0 &&
!tp->urg_data) {
/* If we open zero window, enable fast path.
Without this it will be open by the first data packet,
it is too late to merge checksumming to copy.
*/
tcp_fast_path_on(tp);
}
#endif

return new_win;
}

/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer push more data
 * than the offered window.
 *
 * Returns the space left between rcv_nxt and the right edge
 * (rcv_wup + rcv_wnd) of the advertised window, never less than 0.
 */
static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
{
	/* Signed so that data received past the right edge shows up as < 0. */
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	if (win < 0)
		win = 0;
	return (u32) win;
}
/* This function returns the amount that we can raise the
* usable window based on the following constraints
*
* 1. The window can never be shrunk once it is offered (RFC 793)
* 2. We limit memory per socket
*
* RFC 1122:
* "the suggested [SWS] avoidance algorithm for the receiver is to keep
* RECV.NEXT + RCV.WIN fixed until:
* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
*
* i.e. don't raise the right edge of the window until you can raise
* it at least MSS bytes.
*
* Unfortunately, the recommended algorithm breaks header prediction,
* since header prediction assumes th->window stays fixed.
*
* Strictly speaking, keeping th->window fixed violates the receiver
* side SWS prevention criteria. The problem is that under this rule
* a stream of single byte packets will cause the right side of the
* window to always advance by a single byte.
*
* Of course, if the sender implements sender side SWS prevention
* then this will not be a problem.
*
* BSD seems to make the following compromise:
*
* If the free space is less than the 1/4 of the maximum
* space available and the free space is less than 1/2 mss,
* then set the window to 0.
* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
* Otherwise, just prevent the window from shrinking
* and from being larger than the largest representable value.
*
* This prevents incremental opening of the window in the regime
* where TCP is limited by the speed of the reader side taking
* data out of the TCP receive queue. It does nothing about
* those cases where the window is constrained on the sender side
* because the pipeline is full.
*
* BSD also seems to "accidentally" limit itself to windows that are a
* multiple of MSS, at least until the free space gets quite small.
* This would appear to be a side effect of the mbuf implementation.
* Combining these two algorithms results in the observed behavior
* of having a fixed window size at almost all times.
*
* Below we obtain similar behavior by forcing the offered window to
* a multiple of the mss when it is feasible to do so.
*
* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
* Regular options like TIMESTAMP are taken into account.
*/
u32 __tcp_select_window(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* MSS for the peer's data. Previous verions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
unsigned int mss = tp->ack.rcv_mss; 取对端MSS的估测值
int free_space;
u32 window;

/* Sometimes free_space can be < 0. */
free_space = tcp_space(sk); 取套接字剩余接收缓存允许的窗口
if (tp->window_clamp < mss)
mss = tp->window_clamp; 不超过发送端最大接收窗口

if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
如果可用窗口小于半个极限窗口
tp->ack.quick = 0; 脱离立即响应状态

if (tcp_memory_pressure) 如果处于内存紧张状态
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);
使钳位窗口不超过4个初始通告钳位段长

if (free_space < ((int)mss)) 如果可用窗口小于对端MSS
return 0; 返回零窗口
}

if (free_space > tp->rcv_ssthresh)
free_space = tp->rcv_ssthresh; 钳位可用窗口

/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
* If our current window offering is within 1 mss of the
* free space we just keep it. This prevents the divide
* and multiply from happening most of the time.
* We also don't do any window rounding when the free space
* is too small.
*/
window = tp->rcv_wnd; 取当前接收窗口
if ((((int) window) <= (free_space - ((int) mss))) ||
如果当前接收窗口小于可用窗口1个MSS 或者
(((int) window) > free_space)) 当前窗口大于可用窗口
window = (((unsigned int) free_space)/mss)*mss;
将可用窗口按照对端MSS取整后设为当前窗口

return window;
}
/* Receive window corresponding to the part of the socket receive
 * buffer (rcvbuf) not yet accounted in rmem_alloc.
 * Note: caller must be prepared to deal with negative returns.
 */
static inline int tcp_space(struct sock *sk)
{
	int unused_buf = sk->rcvbuf - atomic_read(&sk->rmem_alloc);

	return tcp_win_from_space(unused_buf);
}

/* Ratio between receive-buffer space and the window derived from it;
 * see tcp_win_from_space() for how the sign selects the formula.
 */
int sysctl_tcp_adv_win_scale = 2;

/* Convert an amount of receive-buffer space into the share of it that
 * is handed out as receive window.
 *
 * sysctl_tcp_adv_win_scale <= 0: window = space >> (-scale)
 * sysctl_tcp_adv_win_scale >  0: window = space - (space >> scale)
 */
static inline int tcp_win_from_space(int space)
{
	return sysctl_tcp_adv_win_scale<=0 ?
		(space>>(-sysctl_tcp_adv_win_scale)) :		/* power-of-two fraction */
		space - (space>>sysctl_tcp_adv_win_scale);	/* all but a power-of-two fraction */
}
static inline int tcp_full_space( struct sock *sk) 取套接字全部接收缓存对应的窗口
{
return tcp_win_from_space(sk->rcvbuf);
}

/* Check that window update is acceptable.
 * The function assumes that snd_una<=ack<=snd_next.
 *
 * The update is accepted when any of the following holds:
 *  - ack is after snd_una (the ACK acknowledges new data);
 *  - ack_seq is after snd_wl1 (the segment is newer than the one
 *    recorded at the last window update);
 *  - ack_seq equals snd_wl1 and nwin is larger than the current
 *    send window.
 */
static __inline__ int
tcp_may_update_window(struct tcp_opt *tp, u32 ack, u32 ack_seq, u32 nwin)
{
	return (after(ack, tp->snd_una) ||
		after(ack_seq, tp->snd_wl1) ||
		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
}
/* Remember, in snd_wl1, the sequence number of the segment used for
 * the latest send-window update. The ack argument is accepted but
 * not stored.
 */
static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq)
{
	(void)ack;

	tp->snd_wl1 = seq;
}
/* Turn the fast path on, passing the current send window shifted
 * down by snd_wscale to __tcp_fast_path_on().
 */
static __inline__ void tcp_fast_path_on(struct tcp_opt *tp)
{
	__tcp_fast_path_on(tp,
			   (tp->snd_wnd >> tp->snd_wscale));
}
/* Build the TCP header prediction word and store it in tp->pred_flags:
 * the header length shifted left by 26, the ACK flag, and the given
 * window value, combined in host order and converted with htonl().
 */
static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd)
{
	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
			       ntohl(TCP_FLAG_ACK) |
			       snd_wnd);
}
/* Update our send window.
 *
 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
 *
 * Refreshes the send window from the window field of an incoming ACK
 * (skb), then advances snd_una to ack.
 *
 * Returns FLAG_WIN_UPDATE if the window update was accepted by
 * tcp_may_update_window(), otherwise 0.
 */
static int tcp_ack_update_window(struct sock *sk, struct tcp_opt *tp,
				 struct sk_buff *skb, u32 ack, u32 ack_seq)
{
	int flag = 0;
	/* Advertised window, scaled back up by snd_wscale. */
	u32 nwin = ntohs(skb->h.th->window) << tp->snd_wscale;

	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
		flag |= FLAG_WIN_UPDATE;
		/* Record the sequence number of the segment carrying
		 * this window update.
		 */
		tcp_update_wl(tp, ack, ack_seq);

		if (tp->snd_wnd != nwin) {
			tp->snd_wnd = nwin;

			/* Note, it is the only place, where
			 * fast path is recovered for sending TCP.
			 *
			 * Re-enable it only when no out-of-order segments
			 * are queued and no urgent data is pending.
			 */
			if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
#ifdef TCP_FORMAL_WINDOW
			    tcp_receive_window(tp) &&
#endif
			    !tp->urg_data)
				tcp_fast_path_on(tp);

			if (nwin > tp->max_window) {
				/* Track the largest window seen and
				 * recompute the MSS accordingly.
				 */
				tp->max_window = nwin;
				tcp_sync_mss(sk, tp->pmtu_cookie);
			}
		}
	}

	/* Advance the left edge of the send window. */
	tp->snd_una = ack;

#ifdef TCP_DEBUG
	if (before(tp->snd_una + tp->snd_wnd, tp->snd_nxt)) {
		/* Complain only if the shrink is at least one unit of the
		 * window scale (smaller differences are below the
		 * resolution of the scaled window field).
		 */
		if (tp->snd_nxt-(tp->snd_una + tp->snd_wnd) >= (1<<tp->snd_wscale)
		    && net_ratelimit())
			printk(KERN_DEBUG "TCP: peer %u.%u.%u.%u:%u/%u shrinks window %u:%u:%u. Bad, what else can I say?\n",
			       NIPQUAD(sk->daddr), htons(sk->dport), sk->num,
			       tp->snd_una, tp->snd_wnd, tp->snd_nxt);
	}
#endif

	return flag;
}

Edited by lucian_yao on 07/18/01 04:28
PM.

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值