Analysis of the Linux Network Data Reception Path

This article walks through how the Linux kernel moves a received network packet from the NIC driver up to the IP layer, covering the key functions netif_rx_ni, enqueue_to_backlog, and net_rx_action, and how packets flow between the various queues along the way.

In the Linux kernel, when a driver receives data in process context it calls netif_rx_ni() (a thin wrapper around netif_rx()) to pass the packet up toward the IP layer. The function mainly links the packet onto the per-CPU input_pkt_queue and then kicks off a round of softirq processing:

```c
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	preempt_disable();
	err = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
```
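In interrupt context a driver calls netif_rx() directly rather than netif_rx_ni(). Below is a minimal sketch of how a simple non-NAPI driver might hand a frame to the stack; my_hw_frame_len() and my_hw_copy_frame() are hypothetical device helpers, while netdev_alloc_skb(), skb_put(), eth_type_trans(), and netif_rx() are the real kernel APIs involved.

```c
/* Hypothetical RX interrupt handler for a simple non-NAPI driver: copy the
 * frame out of the device, wrap it in an sk_buff, and hand it to the stack.
 */
static irqreturn_t my_nic_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	unsigned int len = my_hw_frame_len(dev);	/* hypothetical register read */
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb) {
		dev->stats.rx_dropped++;
		return IRQ_HANDLED;
	}
	skb_reserve(skb, NET_IP_ALIGN);			/* keep the IP header aligned */
	my_hw_copy_frame(dev, skb_put(skb, len));	/* hypothetical copy from the NIC */

	skb->protocol = eth_type_trans(skb, dev);	/* sets skb->dev and skb->protocol */
	netif_rx(skb);					/* -> enqueue_to_backlog(), raises softirq */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
	return IRQ_HANDLED;
}
```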

1. The netif_rx function

```c
int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* ... */
	{
		unsigned int qtail;

		/* Call enqueue_to_backlog() directly */
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
```

```c
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;

	/* Get the per-CPU softnet_data for this CPU */
	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	/* As long as input_pkt_queue has not exceeded netdev_max_backlog the
	 * skb is accepted; if the queue is already non-empty, the skb is
	 * simply appended to input_pkt_queue.
	 */
	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* input_pkt_queue was empty: raise a softirq and link
			 * sd->backlog onto the poll_list (unless RPS defers
			 * this to an IPI on another CPU).
			 */
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;	/* now append the skb to input_pkt_queue */
	}

	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	/* Queue is over the netdev_max_backlog limit: drop the packet */
	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
```
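The queues referenced above all live in the per-CPU struct softnet_data. The following is a simplified excerpt showing only the fields used in this article; the members are trimmed and their exact layout varies by kernel version (see include/linux/netdevice.h for the full definition).

```c
/* Simplified excerpt of struct softnet_data: only the fields referenced in
 * this article are shown; the real definition has many more members.
 */
struct softnet_data {
	struct list_head	poll_list;	/* napi_struct entries walked by net_rx_action() */
	struct sk_buff_head	process_queue;	/* packets currently consumed by process_backlog() */
	struct sk_buff_head	input_pkt_queue;/* packets queued by enqueue_to_backlog() */
	struct napi_struct	backlog;	/* "virtual" NAPI instance for non-NAPI drivers */
	unsigned int		dropped;	/* packets dropped when the backlog overflows */
	unsigned int		processed;	/* packets handled by __netif_receive_skb_core() */
};
```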

 

2. The net_rx_action function

```c
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;	/* total packet budget for one softirq run (default 300) */
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* Bail out if the budget is used up or more than 2 jiffies have elapsed */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);
		weight = n->weight;

		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			/* Call the NAPI poll callback. Drivers register their
			 * own; the backlog device used by non-NAPI drivers
			 * points to the default process_backlog().
			 */
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* The poll consumed its full weight, so the device may still
		 * have packets pending.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* Rotate it to the tail of poll_list so others get a turn */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	net_rps_action_and_irq_enable(sd);
	return;

softnet_break:
	/* Budget or time limit exhausted: count the squeeze and re-raise
	 * NET_RX_SOFTIRQ so the remaining backlog is handled in a later run.
	 */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
```
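The n->poll callback above is whatever poll function the driver registered with netif_napi_add(); for non-NAPI drivers it is process_backlog, shown next. As a hedged sketch of the driver side (struct my_priv and the my_hw_* helpers are hypothetical; napi_schedule(), napi_gro_receive(), napi_complete(), netif_napi_add(), and napi_enable() are the standard APIs):

```c
/* Hypothetical NAPI driver skeleton: the IRQ handler only schedules the
 * napi_struct; the real receive work happens in the poll callback that
 * net_rx_action() invokes with a weight/budget.
 */
struct my_priv {				/* hypothetical per-device state */
	struct napi_struct napi;
	/* ... device registers, RX ring, ... */
};

static irqreturn_t my_nic_irq(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;

	my_hw_disable_rx_irq(priv);		/* hypothetical: mask RX interrupts */
	napi_schedule(&priv->napi);		/* link napi onto sd->poll_list, raise softirq */
	return IRQ_HANDLED;
}

static int my_nic_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = my_hw_fetch_frame(priv);	/* hypothetical */

		if (!skb)
			break;
		napi_gro_receive(napi, skb);	/* feed the stack (GRO-aware) */
		work_done++;
	}

	if (work_done < budget) {
		/* Ring drained before the budget ran out: leave polled mode. */
		napi_complete(napi);
		my_hw_enable_rx_irq(priv);	/* hypothetical: unmask RX interrupts */
	}
	/* Returning work_done == budget keeps us on poll_list (see net_rx_action). */
	return work_done;
}

/* At probe time the driver registers its poll callback and weight: */
static void my_nic_setup_napi(struct net_device *netdev, struct my_priv *priv)
{
	netif_napi_add(netdev, &priv->napi, my_nic_poll, 64);
	napi_enable(&priv->napi);
}
```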

```c
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* process_queue is empty on the first pass through the loop */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			/* Hand the packet up to the protocol layers (e.g. IP) */
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			/* Splice everything from input_pkt_queue onto process_queue */
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/* Nothing left pending for this backlog napi: take it
			 * off poll_list and clear its state.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
```

3. The __netif_receive_skb function

```c
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/* pfmemalloc skbs are processed with PF_MEMALLOC set so that
		 * memory reserves can be used.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		/* Common case: call __netif_receive_skb_core() directly */
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}
```

```c
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	orig_dev = skb->dev;

	/* Reset the network header and recompute the mac header length */
	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;

	/* First deliver the skb to the ptype_all taps; packet sniffers such as
	 * tcpdump register their handlers here.
	 */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	/* ptype_base holds the per-protocol handlers; protocol layers register
	 * them with dev_add_pack(), e.g. ip_rcv for IP and arp_rcv for ARP.
	 */
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	return ret;
}
```
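As the comments above note, the handlers walked in ptype_base are installed with dev_add_pack(). The following is a hedged sketch of a toy kernel module that registers a receive handler for an experimental EtherType; MY_PROTO_ETHERTYPE and my_proto_rcv are invented names, but the registration mechanism is the same one ip_rcv and arp_rcv use.

```c
/* Toy example of dev_add_pack(): register a receive handler so that
 * __netif_receive_skb_core() will deliver matching frames to my_proto_rcv()
 * via the ptype_base hash table.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define MY_PROTO_ETHERTYPE 0x88b5	/* local/experimental EtherType (assumed for the example) */

static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
			struct packet_type *pt, struct net_device *orig_dev)
{
	pr_info("my_proto: %u byte frame on %s\n", skb->len, dev->name);
	kfree_skb(skb);			/* consume the packet */
	return NET_RX_SUCCESS;
}

static struct packet_type my_proto_pt = {
	.type = cpu_to_be16(MY_PROTO_ETHERTYPE),
	.dev  = NULL,			/* match frames from any device */
	.func = my_proto_rcv,
};

static int __init my_proto_init(void)
{
	dev_add_pack(&my_proto_pt);	/* hashes the handler into ptype_base[] */
	return 0;
}

static void __exit my_proto_exit(void)
{
	dev_remove_pack(&my_proto_pt);
}

module_init(my_proto_init);
module_exit(my_proto_exit);
MODULE_LICENSE("GPL");
```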

The network receive path on a Linux system is a complex but efficient process involving the cooperation of several kernel subsystems. From the hardware up to the application, a packet passes through multiple stages: the hardware interrupt, the softirq, protocol-stack parsing, and the socket receive queue. The following is a broader look at this flow.

### The overall receive path

When the network interface controller (NIC) receives a packet, it first writes the data via DMA (direct memory access) into pre-allocated receive buffers (ultimately wrapped in an sk_buff), then raises a hardware interrupt to tell the CPU that data has arrived. After servicing the interrupt, the CPU schedules a softirq handler to complete the rest of the packet processing.

In softirq context the packet runs through the network protocol stack: the Ethernet frame is parsed, the IP layer makes its routing decision, and the transport layer (TCP/UDP) matches the destination port. The packet finally lands in the receive queue of the target socket, where it waits for a user-space application to read it with a system call such as `recvfrom` or `read`.

### Key mechanisms on the receive path

#### 1. Hardware interrupts and NAPI (New API)

Modern NICs generally support NAPI, which reduces the performance cost of taking a hardware interrupt per packet. Under high traffic, NAPI switches to polling mode and processes many packets within a single softirq pass, improving throughput and lowering CPU usage.

#### 2. sk_buff (socket buffer)

The sk_buff is the core data structure the Linux kernel uses to manage network packets. Besides the packet data itself, it records per-packet metadata such as the protocol type, header offsets, and checksum state. Every packet entering the protocol stack is wrapped in an sk_buff instance.

#### 3. Protocol stack processing

Once a packet enters the stack, the link layer (e.g. Ethernet) parses and strips the frame header and determines the upper-layer protocol (IPv4/IPv6). The IP layer then performs the route lookup and TTL check and, based on the protocol field, hands the data to the appropriate transport-layer handler (TCP or UDP).

For TCP packets, the kernel checks connection state, sequence numbers, window size, and so on before queuing the data on the receive queue. UDP packets are matched to the target socket directly by port number and placed on its receive queue.

#### 4. The socket receive queue

Every socket maintains a receive queue that buffers packets waiting to be read by the user process. If the queue is full, the kernel may drop packets or trigger flow control. Applications drain the queue with system calls such as `recvfrom`, `read`, or `poll`/`select`.

### Example: receiving data in user space

The following simple UDP receiver shows how a user-space program reads network data through a socket:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main() {
    int sockfd;
    struct sockaddr_in servaddr, cliaddr;
    char buffer[1024];
    socklen_t len = sizeof(cliaddr);

    // Create a UDP socket
    sockfd = socket(AF_INET, SOCK_DGRAM, 0);
    if (sockfd < 0) {
        perror("socket creation failed");
        exit(EXIT_FAILURE);
    }

    // Fill in the local address
    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_addr.s_addr = INADDR_ANY;
    servaddr.sin_port = htons(8888);

    // Bind the socket
    if (bind(sockfd, (const struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
        perror("bind failed");
        close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Receive one datagram (leave room for the terminating NUL)
    ssize_t n = recvfrom(sockfd, buffer, sizeof(buffer) - 1, 0,
                         (struct sockaddr *)&cliaddr, &len);
    if (n > 0) {
        buffer[n] = '\0';
        printf("Received message: %s\n", buffer);
    }

    close(sockfd);
    return 0;
}
```

### Performance optimizations on the receive path

- **RPS (Receive Packet Steering)**: spreads packet processing across CPUs in software, improving parallelism on multi-core systems (a configuration sketch follows at the end of this article).
- **RSS (Receive Side Scaling)**: implemented by the NIC hardware, distributes packets across multiple hardware queues for further parallelism.
- **XPS (Transmit Packet Steering)**: optimizes the transmit path by mapping each CPU to the most appropriate TX queue.
- **GRO (Generic Receive Offload)**: merges several packets into one larger one before stack processing, reducing per-packet overhead and raising throughput.

Through these mechanisms and optimizations the Linux kernel keeps the network receive path both efficient and scalable[^2].
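As a hedged illustration of enabling RPS: the per-queue rps_cpus file under /sys/class/net/ is the standard knob, usually written from a shell or init script; the interface name eth0 and the CPU mask used below are assumptions for the example.

```c
/* Minimal sketch: enable RPS on receive queue 0 of a (hypothetical) eth0 by
 * writing a CPU bitmask to its rps_cpus sysfs file. Equivalent to:
 *   echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus
 */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus"; /* assumed NIC name */
    const char *mask = "f";   /* hex bitmask: steer packets to CPUs 0-3 */

    FILE *f = fopen(path, "w");
    if (!f) {
        perror("fopen rps_cpus");
        return EXIT_FAILURE;
    }
    if (fprintf(f, "%s\n", mask) < 0)
        perror("write rps_cpus");
    fclose(f);
    return EXIT_SUCCESS;
}
```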