napi 流程记录
1、初始化一个napi_struct 结构。此时 NAPI的状态为 NAPI_STATE_SCHED状态。。
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
weight);
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);//初始化为NAPI_STATE_SCHED状态
napi_hash_add(napi);
}
2、使能一个napi_struct, napi_enable. 此时NAPI的状态全被清除
static inline void napi_enable(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
clear_bit(NAPI_STATE_NPSVC, &n->state);
}
3、网卡硬件中断产生, 执行中断处理函数,并调用
bool napi_schedule_prep(struct napi_struct *n);和 napi_schedule。
napi_schedule_prep 检测NAPI状态是否是NAPI_STATE_SCHED或NAPI_STATE_DISABLE状态,是的话就返回,不是则设置NAPI 为NAPI_STATE_SCHED状态,然后执行napi_schedule,将NAPI挂到队列里面,触发一个软件中断。
//网卡接收中断处理函数
irqreturn xxx_intr (int irq, xxx *data)
{
if (likely(napi_schedule_prep(&data->napi_rx))) {
__napi_schedule(&data->napi_rx);
} else {
//clear irq
}
return IRQ_HANDLED;
}
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
//将NAPI加入软件中断的poll_list链表里面
list_add_tail(&napi->poll_list, &sd->poll_list);
//发出一个软件中断
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
4、处理软件中断, 此时状态仍为 NAPI_STATE_SCHED状态
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
for (;;) { //轮训NAPI 队列
struct napi_struct *n;
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
goto out;
break;
}
//取出一个NAPI
n = list_first_entry(&list, struct napi_struct, poll_list);
//执行poll操作
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
out:
__kfree_skb_flush();
}
//将NAPI节点从队列里面删除,再执行驱动自己注册的收包回调
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
void *have;
int work, weight;
//先从队列里面删除
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);//调用我们驱动里面注册的回调
trace_napi_poll(n, work, weight);
}
WARN_ON_ONCE(work > weight);
if (likely(work < weight))
goto out_unlock;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
goto out_unlock;
}
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(n, HZ >= 1000);
}
gro_normal_list(n);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
goto out_unlock;
}
list_add_tail(&n->poll_list, repoll);
out_unlock:
netpoll_poll_unlock(have);
return work;
}
4、执行驱动收报回调,最后掉用napi_complete,将NAPI NAPIF_STATE_SCHED 状态清除.
bool napi_complete(struct napi_struct *n, int work_done)
{
unsigned long flags, val, new;
/*
* 1) Don't let napi dequeue from the cpu poll list
* just in case its running on a different cpu.
* 2) If we are busy polling, do nothing here, we have
* the guarantee we will be called later.
*/
if (unlikely(n->state & (NAPIF_STATE_NPSVC |
NAPIF_STATE_IN_BUSY_POLL)))
return false;
if (n->gro_bitmask) {
unsigned long timeout = 0;
if (work_done)
timeout = n->dev->gro_flush_timeout;
/* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in
* the GRO layer
*/
napi_gro_flush(n, !!timeout);
if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
}
gro_normal_list(n);
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
do {
val = READ_ONCE(n->state);
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
* This C code was suggested by Alexander Duyck to help gcc.
*/
new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
NAPIF_STATE_SCHED;
} while (cmpxchg(&n->state, val, new) != val);
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
return false;
}
return true;
}