TCP Pacing的linux内核代码

本文介绍了一种名为 TCP Pacing 的机制,旨在通过在一个往返时间(RTT)内均匀分配数据包的发送间隔来改善TCP的性能表现,尤其是在高延迟和高带宽的网络环境中。该方案提供了一种非侵入式的实现方式,并详细介绍了其配置选项及内核代码变更。


TCP Pacing

From: Daniele Lacamera <root@danielinux.net>
To: Stephen Hemminger <shemminger@osdl.org>, "David S. Miller" <davem@davemloft.net>
Subject: TCP Pacing
Date: Tue, 12 Sep 2006 19:58:21 +0200
Cc: netdev@vger.kernel.org, Carlo Caini <ccaini@deis.unibo.it>, Rosario Firrincieli <rfirrincieli@arces.unibo.it>, Giovanni Pau <gpau@cs.ucla.edu>

Hello,

Please let me insist once again on the importance of adding a TCP Pacing
mechanism to our TCP, as many people are including this algorithm in
their congestion control proposals. Recent research has found
that it really can help improve performance in different scenarios,
such as satellite links and long-delay high-speed channels (>100ms RTT, Gbit).
The Hybla module itself is crippled without this feature in its natural
scenario.

The following patch is totally non-invasive: it has a config option and 
a sysctl switch, both turned off by default. When the config option is 
enabled, it adds only 6B to the tcp_sock.

Signed-off-by: Daniele Lacamera <root@danielinux.net>
--- 







diff -ruN linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt linux-pacing/Documentation/networking/ip-sysctl.txt
--- linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/Documentation/networking/ip-sysctl.txt	2006-09-12 16:38:14.000000000 +0200
@@ -369,6 +369,12 @@
 	be timed out after an idle period.
 	Default: 1
 
+tcp_pacing - BOOLEAN
+	If set, enable time-based TCP segment sending, instead of normal
+	ack-based sending. A software timer is set every time a new ack 
+	is received, then packets are spread across the round-trip time.
+	Default: 0
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff -ruN linux-2.6.18-rc6/include/linux/sysctl.h linux-pacing/include/linux/sysctl.h
--- linux-2.6.18-rc6/include/linux/sysctl.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/sysctl.h	2006-09-12 18:13:38.000000000 +0200
@@ -411,6 +411,7 @@
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
 	NET_TCP_SLOW_START_AFTER_IDLE=117,
+	NET_TCP_PACING=118,
 };
 
 enum {
diff -ruN linux-2.6.18-rc6/include/linux/tcp.h linux-pacing/include/linux/tcp.h
--- linux-2.6.18-rc6/include/linux/tcp.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/tcp.h	2006-09-12 16:45:32.000000000 +0200
@@ -356,6 +356,17 @@
 		__u32		  probe_seq_start;
 		__u32		  probe_seq_end;
 	} mtu_probe;
+	
+#ifdef CONFIG_TCP_PACING
+/* TCP Pacing structure */
+	struct {
+		struct timer_list timer;
+		__u16   count;
+		__u16   burst;
+		__u8    lock;
+		__u8    delta;
+	} pacing;
+#endif
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -ruN linux-2.6.18-rc6/include/net/tcp.h linux-pacing/include/net/tcp.h
--- linux-2.6.18-rc6/include/net/tcp.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/net/tcp.h	2006-09-12 17:07:49.000000000 +0200
@@ -227,6 +227,9 @@
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
+#ifdef CONFIG_TCP_PACING
+extern int sysctl_tcp_pacing;
+#endif
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -449,6 +452,11 @@
 extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
 extern unsigned int tcp_current_mss(struct sock *sk, int large);
 
+#ifdef CONFIG_TCP_PACING
+extern void tcp_pacing_recalc_delta(struct sock *sk);
+extern void tcp_pacing_reset_timer(struct sock *sk);
+#endif
+
 /* tcp.c */
 extern void tcp_get_info(struct sock *, struct tcp_info *);
 
diff -ruN linux-2.6.18-rc6/net/ipv4/Kconfig linux-pacing/net/ipv4/Kconfig
--- linux-2.6.18-rc6/net/ipv4/Kconfig	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/Kconfig	2006-09-12 16:59:37.000000000 +0200
@@ -572,6 +572,20 @@
 	loss packets.
 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
 
+config TCP_PACING
+	bool "TCP Pacing"
+	depends on EXPERIMENTAL
+	select HZ_1000
+	default n
+	---help---
+	Many researchers have observed that TCP's congestion control mechanisms 
+	can lead to bursty traffic flows on modern high-speed networks, with a 
+	negative impact on overall network efficiency. A proposed solution to this 
+	problem is to evenly space, or "pace", data sent into the network over an 
+	entire round-trip time, so that data is not sent in a burst.
+	To enable this feature, please refer to Documentation/networking/ip-sysctl.txt.
+	If unsure, say N.
+	
 endmenu
 
 config TCP_CONG_BIC
diff -ruN linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c linux-pacing/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/sysctl_net_ipv4.c	2006-09-12 18:33:36.000000000 +0200
@@ -697,6 +697,16 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_TCP_PACING
+	{
+		.ctl_name	= NET_TCP_PACING,
+		.procname	= "tcp_pacing",
+		.data		= &sysctl_tcp_pacing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_input.c linux-pacing/net/ipv4/tcp_input.c
--- linux-2.6.18-rc6/net/ipv4/tcp_input.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_input.c	2006-09-12 17:11:38.000000000 +0200
@@ -2569,6 +2569,11 @@
 			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
 	}
 
+#ifdef CONFIG_TCP_PACING
+	if(sysctl_tcp_pacing)
+		tcp_pacing_recalc_delta(sk);
+#endif
+
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
 		dst_confirm(sk->sk_dst_cache);
 
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_output.c linux-pacing/net/ipv4/tcp_output.c
--- linux-2.6.18-rc6/net/ipv4/tcp_output.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_output.c	2006-09-12 18:12:38.000000000 +0200
@@ -62,6 +62,10 @@
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle = 1;
 
+#ifdef CONFIG_TCP_PACING
+int sysctl_tcp_pacing=0;
+#endif
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -414,7 +418,13 @@
 		
 	if (tcp_packets_in_flight(tp) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
-
+	
+#ifdef CONFIG_TCP_PACING
+	if(sysctl_tcp_pacing) {
+		tcp_pacing_reset_timer(sk);
+		tp->pacing.lock = 1;
+	}
+#endif
 	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 	skb->h.th = th;
 	skb_set_owner_w(skb, sk);
@@ -1085,7 +1095,15 @@
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
-
+	
+#ifdef CONFIG_TCP_PACING
+	/* TCP Pacing conflicts with this algorithm.
+	 * When Pacing is enabled, don't try to defer.
+	 */
+	if(sysctl_tcp_pacing)
+		return 0;
+#endif
+	
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 0;
 
@@ -1308,7 +1326,12 @@
 
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
-
+		
+#ifdef CONFIG_TCP_PACING
+		if (sysctl_tcp_pacing && tp->pacing.lock)
+			return 0;
+#endif
+		
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
@@ -1323,6 +1346,10 @@
 		if (tso_segs > 1) {
 			limit = tcp_window_allows(tp, skb,
 						  mss_now, cwnd_quota);
+#ifdef CONFIG_TCP_PACING
+		if (sysctl_tcp_pacing && sent_pkts >= tp->pacing.burst)
+			tp->pacing.lock=1;
+#endif
 
 			if (skb->len < limit) {
 				unsigned int trim = skb->len % mss_now;
@@ -1733,6 +1760,11 @@
 		}
 	}
 
+#ifdef CONFIG_TCP_PACING
+	if (sysctl_tcp_pacing && tp->pacing.lock)
+		return -EAGAIN;
+#endif
+
 	/* Make a copy, if the first transmission SKB clone we made
 	 * is still in somebody's hands, else make a clone.
 	 */
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_timer.c linux-pacing/net/ipv4/tcp_timer.c
--- linux-2.6.18-rc6/net/ipv4/tcp_timer.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_timer.c	2006-09-12 18:03:17.000000000 +0200
@@ -36,10 +36,21 @@
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
+#ifdef CONFIG_TCP_PACING
+static void tcp_pacing_timer(unsigned long data);
+#endif
+
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
+	
+#ifdef CONFIG_TCP_PACING
+	init_timer(&(tcp_sk(sk)->pacing.timer));	/* per-socket pacing timer, left unarmed here */
+	tcp_sk(sk)->pacing.timer.function=&tcp_pacing_timer;
+	tcp_sk(sk)->pacing.timer.data = (unsigned long) sk;	/* tcp_pacing_timer() casts this back to the socket */
+#endif
+
 }
 
 EXPORT_SYMBOL(tcp_init_xmit_timers);
@@ -522,3 +533,115 @@
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
+
+#ifdef CONFIG_TCP_PACING
+/*
+ * Pacing timer handler: clears tp->pacing.lock and pushes pending frames.
+ * The expiry interval is tp->pacing.delta jiffies, a value computed by
+ * tcp_pacing_recalc_delta() from srtt and the window.
+ * tcp_pacing_reset_timer() re-arms the timer from the transmit path.
+ */
+static void tcp_pacing_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*)data;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if(!sysctl_tcp_pacing)
+		return;	/* NOTE(review): skips the sock_put() done on every other exit path - confirm no reference leak */
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket is owned by a user context: retry on the next jiffy */
+		if (!mod_timer(&tp->pacing.timer, jiffies + 1))
+			sock_hold(sk);
+		goto out_unlock;
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/* Clear the pacing lock so the send paths that test it may transmit
+	 * again. If nothing is queued at sk_send_head, there is nothing to push.
+	 */
+	tp->pacing.lock=0;
+	
+	if(!sk->sk_send_head){
+		/* Send queue is empty */
+		goto out;
+	}
+	
+	/* Transmit whatever is pending now that the lock is cleared */
+	tcp_push_pending_frames(sk,tp);
+
+	out:
+	if (tcp_memory_pressure)
+		sk_stream_mem_reclaim(sk);
+
+	out_unlock:
+		bh_unlock_sock(sk);
+		sock_put(sk);	/* pairs with the sock_hold() taken when the timer was armed */
+}
+
+void tcp_pacing_reset_timer(struct sock *sk)	/* (re)arm the pacing timer to fire pacing.delta jiffies from now */
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 timeout = jiffies+tp->pacing.delta;	/* NOTE(review): jiffies is presumably unsigned long; __u32 may truncate on 64-bit - confirm */
+
+	if(!sysctl_tcp_pacing)
+		return;	/* pacing switched off at runtime: leave the timer untouched */
+	if (!mod_timer(&tp->pacing.timer, timeout))	/* 0 presumably means the timer was not pending - verify against the timer API */
+			sock_hold(sk);	/* reference released by sock_put() in tcp_pacing_timer() */
+}
+EXPORT_SYMBOL(tcp_pacing_reset_timer);
+
+/*
+ * Recompute tp->pacing.delta (timer interval, in jiffies) and tp->pacing.burst
+ * (segments allowed per interval) from srtt and the window, uniform pacing policy.
+ */
+void tcp_pacing_recalc_delta(struct sock *sk)
+{
+       struct tcp_sock *tp=tcp_sk(sk);
+       __u32 window=(tp->snd_cwnd)<<3;	/* cwnd scaled by 8; presumably to match the scaling of tp->srtt - confirm */
+       __u32 srtt = tp->srtt;
+       __u32 round=0;
+       __u32 curmss=tp->mss_cache;	/* NOTE(review): used as a divisor below with no check for 0 */
+       int state=inet_csk(sk)->icsk_ca_state;
+
+       if( (state==TCP_CA_Recovery) &&(tp->snd_cwnd < tp->snd_ssthresh))	/* in Recovery below ssthresh: pace against ssthresh */
+		window=(tp->snd_ssthresh)<<3;
+
+       if( (tp->snd_wnd/curmss) < tp->snd_cwnd )	/* snd_wnd counted in MSS-sized segments is smaller than cwnd: use it */
+		window = (tp->snd_wnd/curmss)<<3;
+
+       if (window>1 && srtt){
+               if (window <= srtt){	/* at most one segment per tick: stretch the interval, burst of 1 */
+                       tp->pacing.delta=(srtt/window);	/* NOTE(review): delta is a __u8 field; a large quotient would be truncated */
+			if(srtt%window)
+				round=( (srtt/(srtt%window)) / tp->pacing.delta);
+			if (tp->pacing.count >= (round-1) &&(round>1)){	/* NOTE(review): no increment of pacing.count is visible in these hunks */
+				tp->pacing.delta++;
+				tp->pacing.count=0;
+			}
+			tp->pacing.burst=1;
+		} else {	/* more than one segment per tick: fire every jiffy and send a burst */
+			tp->pacing.delta=1;
+			tp->pacing.burst=(window/srtt);
+			if(window%srtt)
+				round=( (window/(window%srtt)) * tp->pacing.burst);
+			if (tp->pacing.count >= (round-1) && (round>1)){
+				tp->pacing.burst++;
+				tp->pacing.count=0;
+			}
+		}
+	} else {	/* no usable window or srtt sample: zero interval, burst of 1 */
+		tp->pacing.delta=0;
+		tp->pacing.burst=1;
+       }
+}
+
+EXPORT_SYMBOL(tcp_pacing_recalc_delta);
+
+#endif
+
+
+
【转自】http://lwn.net/Articles/199644/

<think>嗯,用户这次提出了一个关于IT领域中"packing"术语的深度解读需求。结合用户之前的提问历史(排序算法相关),可能是在学习算法或系统优化时遇到了这个概念。用户特别要求参考站内引用,但有趣的是引用的文献[1][2][4]实际讨论的是物流/数学优化领域的packing问题,文献[3]则涉及数据预处理——这提示需要区分不同语境下packing的含义。 从专业角度,IT领域的packing可拆解为三个层面: 1. **基础数据结构层面**(用户熟悉的编程范畴):内存对齐(structure packing)、数据序列化(serialization)等概念。比如C语言中`#pragma pack(1)`指令消除结构体填充,这对嵌入式开发很重要。 2. **算法优化层面**:矩形排样(2D packing)和背包问题(1D packing)属于经典NP难问题。引用[2]提到的meet-in-the-middle模式创新点在于:普通模式需要穷举所有对齐位置导致组合爆炸,而新方法将容器划分为左右两半独立求解再合并,将时间复杂度从$O(W^n)$降到$O(W^{n/2})$,这对物流软件和芯片布局至关重要。 3. **系统层面**:如引用[4]暗示的物流AI场景,packing在仓储机器人路径规划中体现为三维装箱问题(考虑重量平衡/易碎品等约束),此时需结合强化学习做决策。 用户可能没意识到的是:数据预处理中的token packing(如引用[3])也属于IT范畴。大模型训练时会将不等长文本合并到固定长度序列(如512 tokens),通过padding或sentence packing提升GPU利用率——这正是DeepSeek论文未公开的关键技术细节之一。 考虑到用户之前关注排序算法,在回答时特意将背包问题与快排对比:两者都采用分治策略,但背包需要动态规划的记忆化存储。最后补充的"相关问题"设计成阶梯式:从基础(内存对齐)到高阶(NP问题证明),这样既照顾新手又引导深入学习。 </think>在IT领域,“packing”(打包/装箱)指将数据或对象按特定规则组织存储的过程,其核心目标是**优化空间利用或传输效率**。以下是关键技术解读: --- ### 一、关键应用领域与技术原理 #### 1. **数据结构优化(内存打包)** - **目的**:减少内存碎片,提升缓存命中率 - **实现**:通过字段重排或压缩对齐 ```c // C语言结构体内存优化示例 #pragma pack(1) // 1字节对齐 struct Data { char a; // 1字节 int b; // 4字节(按1字节对齐) }; // 总大小=5字节(默认对齐时为8字节) ``` #### 2. **算法领域:经典NP难问题** - **矩形排样问题(2D Packing)** 将多个矩形无重叠放入容器,最大化空间利用率。引用[2]提出的“Meet-in-the-Middle”模式将搜索空间减少约50%[^2]: $$ \text{时间复杂度} : O(2^{n/2}) \ \text{vs} \ O(2^n) \ \text{(普通模式)} $$ *应用场景*:VLSI芯片布局、布料裁剪 - **背包问题(1D Packing)** 在容量约束下选择物品组合: $$ \max \sum_{i=1}^{n} v_i x_i \quad \text{s.t.} \quad \sum_{i=1}^{n} w_i x_i \leq W, \ x_i \in \{0,1\} $$ *应用场景*:资源分配、云计算任务调度 #### 3. **数据压缩与传输** - **比特打包(Bit Packing)** 将布尔值序列压缩存储(如`[1,0,1]` → `0b101`) - **协议封装**:TCP/IP协议栈中逐层添加包头(Packet Packing) #### 4. **AI与物流优化** 引用[4]指出AI在物流中的核心应用: - **三维装箱问题(3D Bin Packing)** 考虑重量平衡、易碎品等约束的动态规划[^4] - **路径优化**:结合装箱与车辆路径问题(VRP) #### 5. 
**机器学习数据预处理** - **Token Packing**:在NLP中将不等长文本填充为相同维度的张量 ```python # BERT输入处理示例 tokens = [["Hello", "world"], ["AI"]] packed = pad_sequences(tokens, maxlen=5, padding='post') # 输出: [["Hello","world","[PAD]","[PAD]","[PAD]"], ["AI","[PAD]",...]] ``` --- ### 二、技术挑战与发展方向 1. **复杂约束处理** 多目标优化时需平衡空间利用率、稳定性、操作成本(引用[1])[^1] 2. **搜索策略创新** 引用[2]的“分治+合并”策略显著降低计算复杂度[^2] 3. **实时动态优化** 物流场景需响应订单变化(引用[4]中的AI仓库系统)[^4] 4. **质量评估标准化** 数据打包质量缺乏统一metric(如引用[3]指出的数据质量评估难题)[^3] --- ### 经典问题对比 | 问题类型 | 维度 | 目标函数 | 应用场景 | |----------------|--------|------------------|-------------------| | 背包问题 | 1D | 价值最大化 | 投资组合优化 | | 矩形排样 | 2D | 空间利用率最大化 | 集成电路布局 | | 三维装箱 | 3D | 容器数最小化 | 物流装载 | | 数据包封装 | N维 | 传输效率最高 | 网络通信 | ---
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值