Analysis of the Linux Network Data Reception Path

This article walks through how the Linux kernel moves a received network packet from the NIC driver up to the IP layer, covering the key functions netif_rx_ni, enqueue_to_backlog, and net_rx_action, and how packets flow between the various queues along the way.

In the Linux kernel, when a driver receives data in process context it calls netif_rx_ni() (a thin wrapper around netif_rx()) to pass the packet up toward the IP layer. The function mainly links the packet onto the per-CPU input_pkt_queue and then kicks off a round of softirq processing:

```c
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	preempt_disable();
	err = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
```
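In interrupt context a driver calls netif_rx() directly rather than netif_rx_ni(). Below is a minimal sketch of how a simple non-NAPI driver might hand a frame to the stack; my_hw_frame_len() and my_hw_copy_frame() are hypothetical device helpers, while netdev_alloc_skb(), skb_put(), eth_type_trans(), and netif_rx() are the real kernel APIs involved.

```c
/* Hypothetical RX interrupt handler for a simple non-NAPI driver: copy the
 * frame out of the device, wrap it in an sk_buff, and hand it to the stack.
 */
static irqreturn_t my_nic_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	unsigned int len = my_hw_frame_len(dev);	/* hypothetical register read */
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb) {
		dev->stats.rx_dropped++;
		return IRQ_HANDLED;
	}
	skb_reserve(skb, NET_IP_ALIGN);			/* keep the IP header aligned */
	my_hw_copy_frame(dev, skb_put(skb, len));	/* hypothetical copy from the NIC */

	skb->protocol = eth_type_trans(skb, dev);	/* sets skb->dev and skb->protocol */
	netif_rx(skb);					/* -> enqueue_to_backlog(), raises softirq */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
	return IRQ_HANDLED;
}
```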

1. The netif_rx function

```c
int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* ... */
	{
		unsigned int qtail;

		/* Call enqueue_to_backlog() directly */
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
```

```c
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;

	/* Get the per-CPU softnet_data for this CPU */
	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	/* As long as input_pkt_queue has not exceeded netdev_max_backlog the
	 * skb is accepted; if the queue is already non-empty, the skb is
	 * simply appended to input_pkt_queue.
	 */
	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* input_pkt_queue was empty: raise a softirq and link
			 * sd->backlog onto the poll_list (unless RPS defers
			 * this to an IPI on another CPU).
			 */
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;	/* now append the skb to input_pkt_queue */
	}

	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	/* Queue is over the netdev_max_backlog limit: drop the packet */
	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
```
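The queues referenced above all live in the per-CPU struct softnet_data. The following is a simplified excerpt showing only the fields used in this article; the members are trimmed and their exact layout varies by kernel version (see include/linux/netdevice.h for the full definition).

```c
/* Simplified excerpt of struct softnet_data: only the fields referenced in
 * this article are shown; the real definition has many more members.
 */
struct softnet_data {
	struct list_head	poll_list;	/* napi_struct entries walked by net_rx_action() */
	struct sk_buff_head	process_queue;	/* packets currently consumed by process_backlog() */
	struct sk_buff_head	input_pkt_queue;/* packets queued by enqueue_to_backlog() */
	struct napi_struct	backlog;	/* "virtual" NAPI instance for non-NAPI drivers */
	unsigned int		dropped;	/* packets dropped when the backlog overflows */
	unsigned int		processed;	/* packets handled by __netif_receive_skb_core() */
};
```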

 

2. The net_rx_action function

```c
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;	/* total packet budget for one softirq run (default 300) */
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* Bail out if the budget is used up or more than 2 jiffies have elapsed */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);
		weight = n->weight;

		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			/* Call the NAPI poll callback. Drivers register their
			 * own; the backlog device used by non-NAPI drivers
			 * points to the default process_backlog().
			 */
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* The poll consumed its full weight, so the device may still
		 * have packets pending.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* Rotate it to the tail of poll_list so others get a turn */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	net_rps_action_and_irq_enable(sd);
	return;

softnet_break:
	/* Budget or time limit exhausted: count the squeeze and re-raise
	 * NET_RX_SOFTIRQ so the remaining backlog is handled in a later run.
	 */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
```
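The n->poll callback above is whatever poll function the driver registered with netif_napi_add(); for non-NAPI drivers it is process_backlog, shown next. As a hedged sketch of the driver side (struct my_priv and the my_hw_* helpers are hypothetical; napi_schedule(), napi_gro_receive(), napi_complete(), netif_napi_add(), and napi_enable() are the standard APIs):

```c
/* Hypothetical NAPI driver skeleton: the IRQ handler only schedules the
 * napi_struct; the real receive work happens in the poll callback that
 * net_rx_action() invokes with a weight/budget.
 */
struct my_priv {				/* hypothetical per-device state */
	struct napi_struct napi;
	/* ... device registers, RX ring, ... */
};

static irqreturn_t my_nic_irq(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;

	my_hw_disable_rx_irq(priv);		/* hypothetical: mask RX interrupts */
	napi_schedule(&priv->napi);		/* link napi onto sd->poll_list, raise softirq */
	return IRQ_HANDLED;
}

static int my_nic_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = my_hw_fetch_frame(priv);	/* hypothetical */

		if (!skb)
			break;
		napi_gro_receive(napi, skb);	/* feed the stack (GRO-aware) */
		work_done++;
	}

	if (work_done < budget) {
		/* Ring drained before the budget ran out: leave polled mode. */
		napi_complete(napi);
		my_hw_enable_rx_irq(priv);	/* hypothetical: unmask RX interrupts */
	}
	/* Returning work_done == budget keeps us on poll_list (see net_rx_action). */
	return work_done;
}

/* At probe time the driver registers its poll callback and weight: */
static void my_nic_setup_napi(struct net_device *netdev, struct my_priv *priv)
{
	netif_napi_add(netdev, &priv->napi, my_nic_poll, 64);
	napi_enable(&priv->napi);
}
```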

```c
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* process_queue is empty on the first pass through the loop */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			/* Hand the packet up to the protocol layers (e.g. IP) */
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			/* Splice everything from input_pkt_queue onto process_queue */
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/* Nothing left pending for this backlog napi: take it
			 * off poll_list and clear its state.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
```

3. The __netif_receive_skb function

```c
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/* pfmemalloc skbs are processed with PF_MEMALLOC set so that
		 * memory reserves can be used.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		/* Common case: call __netif_receive_skb_core() directly */
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}
```

```c
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	orig_dev = skb->dev;

	/* Reset the network header and recompute the mac header length */
	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;

	/* First deliver the skb to the ptype_all taps; packet sniffers such as
	 * tcpdump register their handlers here.
	 */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	/* ptype_base holds the per-protocol handlers; protocol layers register
	 * them with dev_add_pack(), e.g. ip_rcv for IP and arp_rcv for ARP.
	 */
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	return ret;
}
```
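As the comments above note, the handlers walked in ptype_base are installed with dev_add_pack(). The following is a hedged sketch of a toy kernel module that registers a receive handler for an experimental EtherType; MY_PROTO_ETHERTYPE and my_proto_rcv are invented names, but the registration mechanism is the same one ip_rcv and arp_rcv use.

```c
/* Toy example of dev_add_pack(): register a receive handler so that
 * __netif_receive_skb_core() will deliver matching frames to my_proto_rcv()
 * via the ptype_base hash table.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define MY_PROTO_ETHERTYPE 0x88b5	/* local/experimental EtherType (assumed for the example) */

static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
			struct packet_type *pt, struct net_device *orig_dev)
{
	pr_info("my_proto: %u byte frame on %s\n", skb->len, dev->name);
	kfree_skb(skb);			/* consume the packet */
	return NET_RX_SUCCESS;
}

static struct packet_type my_proto_pt = {
	.type = cpu_to_be16(MY_PROTO_ETHERTYPE),
	.dev  = NULL,			/* match frames from any device */
	.func = my_proto_rcv,
};

static int __init my_proto_init(void)
{
	dev_add_pack(&my_proto_pt);	/* hashes the handler into ptype_base[] */
	return 0;
}

static void __exit my_proto_exit(void)
{
	dev_remove_pack(&my_proto_pt);
}

module_init(my_proto_init);
module_exit(my_proto_exit);
MODULE_LICENSE("GPL");
```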

The network receive path on a Linux system is a complex but efficient process involving the cooperation of several kernel subsystems. From the hardware up to the application, a packet passes through multiple stages: the hardware interrupt, the softirq, protocol-stack parsing, and the socket receive queue. The following is a broader look at this flow.

### The overall receive path

When the network interface controller (NIC) receives a packet, it first writes the data via DMA (direct memory access) into pre-allocated receive buffers (ultimately wrapped in an sk_buff), then raises a hardware interrupt to tell the CPU that data has arrived. After servicing the interrupt, the CPU schedules a softirq handler to complete the rest of the packet processing.

In softirq context the packet runs through the network protocol stack: the Ethernet frame is parsed, the IP layer makes its routing decision, and the transport layer (TCP/UDP) matches the destination port. The packet finally lands in the receive queue of the target socket, where it waits for a user-space application to read it with a system call such as `recvfrom` or `read`.

### Key mechanisms on the receive path

#### 1. Hardware interrupts and NAPI (New API)

Modern NICs generally support NAPI, which reduces the performance cost of taking a hardware interrupt per packet. Under high traffic, NAPI switches to polling mode and processes many packets within a single softirq pass, improving throughput and lowering CPU usage.

#### 2. sk_buff (socket buffer)

The sk_buff is the core data structure the Linux kernel uses to manage network packets. Besides the packet data itself, it records per-packet metadata such as the protocol type, header offsets, and checksum state. Every packet entering the protocol stack is wrapped in an sk_buff instance.

#### 3. Protocol stack processing

Once a packet enters the stack, the link layer (e.g. Ethernet) parses and strips the frame header and determines the upper-layer protocol (IPv4/IPv6). The IP layer then performs the route lookup and TTL check and, based on the protocol field, hands the data to the appropriate transport-layer handler (TCP or UDP).

For TCP packets, the kernel checks connection state, sequence numbers, window size, and so on before queuing the data on the receive queue. UDP packets are matched to the target socket directly by port number and placed on its receive queue.

#### 4. The socket receive queue

Every socket maintains a receive queue that buffers packets waiting to be read by the user process. If the queue is full, the kernel may drop packets or trigger flow control. Applications drain the queue with system calls such as `recvfrom`, `read`, or `poll`/`select`.

### Example: receiving data in user space

The following simple UDP receiver shows how a user-space program reads network data through a socket:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main() {
    int sockfd;
    struct sockaddr_in servaddr, cliaddr;
    char buffer[1024];
    socklen_t len = sizeof(cliaddr);

    // Create a UDP socket
    sockfd = socket(AF_INET, SOCK_DGRAM, 0);
    if (sockfd < 0) {
        perror("socket creation failed");
        exit(EXIT_FAILURE);
    }

    // Fill in the local address
    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_addr.s_addr = INADDR_ANY;
    servaddr.sin_port = htons(8888);

    // Bind the socket
    if (bind(sockfd, (const struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
        perror("bind failed");
        close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Receive one datagram (leave room for the terminating NUL)
    ssize_t n = recvfrom(sockfd, buffer, sizeof(buffer) - 1, 0,
                         (struct sockaddr *)&cliaddr, &len);
    if (n > 0) {
        buffer[n] = '\0';
        printf("Received message: %s\n", buffer);
    }

    close(sockfd);
    return 0;
}
```

### Performance optimizations on the receive path

- **RPS (Receive Packet Steering)**: spreads packet processing across CPUs in software, improving parallelism on multi-core systems (a configuration sketch follows at the end of this article).
- **RSS (Receive Side Scaling)**: implemented by the NIC hardware, distributes packets across multiple hardware queues for further parallelism.
- **XPS (Transmit Packet Steering)**: optimizes the transmit path by mapping each CPU to the most appropriate TX queue.
- **GRO (Generic Receive Offload)**: merges several packets into one larger one before stack processing, reducing per-packet overhead and raising throughput.

Through these mechanisms and optimizations the Linux kernel keeps the network receive path both efficient and scalable[^2].
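As a hedged illustration of enabling RPS: the per-queue rps_cpus file under /sys/class/net/ is the standard knob, usually written from a shell or init script; the interface name eth0 and the CPU mask used below are assumptions for the example.

```c
/* Minimal sketch: enable RPS on receive queue 0 of a (hypothetical) eth0 by
 * writing a CPU bitmask to its rps_cpus sysfs file. Equivalent to:
 *   echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus
 */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus"; /* assumed NIC name */
    const char *mask = "f";   /* hex bitmask: steer packets to CPUs 0-3 */

    FILE *f = fopen(path, "w");
    if (!f) {
        perror("fopen rps_cpus");
        return EXIT_FAILURE;
    }
    if (fprintf(f, "%s\n", mask) < 0)
        perror("write rps_cpus");
    fclose(f);
    return EXIT_SUCCESS;
}
```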