linux 内核poll/select/epoll实现剖析

最新推荐文章于 2022-10-13 17:07:00 发布

原创

最新推荐文章于 2022-10-13 17:07:00 发布 · 1.3k 阅读

5 ·

CC 4.0 BY-SA版权

文章标签：

#epoll #数据结构与算法

本文深入剖析了Linux内核中的poll、select和epoll的实现原理。从f_ops.poll和wait_queue开始，详细讲解了poll_table结构、关键函数以及epoll的复杂性，包括递归死循环和唤醒风暴的检测机制。通过对epoll_ctl、epoll_wait等关键函数的分析，展示了epoll在性能和可扩展性上的优势。最后，对比了poll、select和epoll在处理大量文件描述符时的效率差异。

f_ops.poll和wait_queue

poll/select/epoll的实现都是基于文件提供的poll方法(f_op->poll)，
该方法利用poll_table提供的_qproc方法向文件内部事件掩码_key对应的的一个或多个等待队列(wait_queue_head_t)上添加包含唤醒函数(wait_queue_t.func)的节点(wait_queue_t)，并检查文件当前就绪的状态返回给poll的调用者(依赖于文件的实现)。
当文件的状态发生改变时(例如网络数据包到达)，文件就会遍历事件对应的等待队列并调用回调函数(wait_queue_t.func)唤醒等待线程。

通常的file.f_ops.poll实现及相关结构体如下

struct file {
    const struct file_operations	*f_op;
    spinlock_t			f_lock;
    // 文件内部实现细节
    void			   *private_data;
#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct list_head	f_ep_links;
    struct list_head	f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
    // 其他细节....
};

// 文件操作
struct file_operations {
    // 文件提供给poll/select/epoll
    // 获取文件当前状态, 以及就绪通知接口函数
    unsigned int (*poll) (struct file *, struct poll_table_struct *);
    // 其他方法read/write 等... ...
};

// 通常的file.f_ops.poll 方法的实现
unsigned int file_f_op_poll (struct file *filp, struct poll_table_struct *wait)
{
    unsigned int mask = 0;
    wait_queue_head_t * wait_queue;

    //1. 根据事件掩码wait->key_和文件实现filep->private_data 取得事件掩码对应的一个或多个wait queue head
    some_code();

    // 2. 调用poll_wait 向获得的wait queue head 添加节点
    poll_wait(filp, wait_queue, wait);

    // 3. 取得当前就绪状态保存到mask
    some_code();

    return mask;
}

// select/poll/epoll 向文件注册就绪后回调节点的接口结构
typedef struct poll_table_struct {
    // 向wait_queue_head 添加回调节点(wait_queue_t)的接口函数
    poll_queue_proc _qproc;
    // 关注的事件掩码, 文件的实现利用此掩码将等待队列传递给_qproc
    unsigned long   _key;
} poll_table;
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);


// 通用的poll_wait 函数, 文件的f_ops->poll 通常会调用此函数
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (p && p->_qproc && wait_address) {
        // 调用_qproc 在wait_address 上添加节点和回调函数
        // 调用 poll_table_struct 上的函数指针向wait_address添加节点, 并设置节点的func
        // (如果是select或poll 则是 __pollwait, 如果是 epoll 则是 ep_ptable_queue_proc),
        p->_qproc(filp, wait_address, p);
    }
}


// wait_queue 头节点
typedef struct __wait_queue_head wait_queue_head_t;
struct __wait_queue_head {
    spinlock_t lock;
    struct list_head task_list;
};

// wait_queue 节点
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
    unsigned int flags;
#define WQ_FLAG_EXCLUSIVE	0x01
    void *private;
    wait_queue_func_t func;
    struct list_head task_list;
};
typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);


// 当文件的状态发生改变时, 文件会调用此函数，此函数通过调用wait_queue_t.func通知poll的调用者
// 其中key是文件当前的事件掩码
void __wake_up(wait_queue_head_t *q, unsigned int mode,
               int nr_exclusive, void *key)
{
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;
    // 遍历并调用func 唤醒, 通常func会唤醒调用poll的线程
    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) {
            break;
        }
    }
}

poll 和 select

poll和select的实现基本上是一致的，只是传递参数有所不同，他们的基本流程如下：

1. 复制用户数据到内核空间

2. 估计超时时间

3. 遍历每个文件并调用f_op->poll 取得文件当前就绪状态，如果前面遍历的文件都没有就绪，向文件插入wait_queue节点

4. 遍历完成后检查状态：

a). 如果已经有就绪的文件转到5；

b). 如果有信号产生，重启poll或select（转到 1或3）；

c). 否则挂起进程等待超时或唤醒，超时或被唤醒后再次遍历所有文件取得每个文件的就绪状态

5. 将所有文件的就绪状态复制到用户空间

6. 清理申请的资源

关键结构体

下面是poll/select共用的结构体及其相关功能:

poll_wqueues 是 select/poll 对poll_table接口的具体化实现,其中的table, inline_index和inline_entries都是为了管理内存。
poll_table_entry 与一个文件相关联，用于管理插入到文件的wait_queue节点。

// select/poll 对poll_table的具体化实现
struct poll_wqueues {
    poll_table pt;
    struct poll_table_page *table;     // 如果inline_entries 空间不足, 从poll_table_page 中分配
    struct task_struct *polling_task;  // 调用poll 或select 的进程
    int triggered;                     // 已触发标记
    int error;
    int inline_index;                  // 下一个要分配的inline_entrie 索引
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];//
};
// 帮助管理select/poll  申请的内存
struct poll_table_page {
    struct poll_table_page  * next;       // 下一个 page
    struct poll_table_entry * entry;      // 指向第一个entries
    struct poll_table_entry entries[0];
};
// 与一个正在poll /select 的文件相关联,
struct poll_table_entry {
    struct file *filp;               // 在poll/select中的文件
    unsigned long key;
    wait_queue_t wait;               // 插入到wait_queue_head_t 的节点
    wait_queue_head_t *wait_address; // 文件上的wait_queue_head_t 地址
};

公共函数

下面是poll/select公用的一些函数，这些函数实现了poll和select的核心功能。

poll_initwait 用于初始化poll_wqueues，

__pollwait 实现了向文件中添加回调节点的逻辑，

pollwake 当文件状态发生改变时，由文件调用，用来唤醒线程，

poll_get_entry，free_poll_entry，poll_freewait用来申请释放poll_table_entry 占用的内存，并负责释放文件上的wait_queue节点。

// poll_wqueues 的初始化:
// 初始化 poll_wqueues , __pollwait会在文件就绪时被调用
void poll_initwait(struct poll_wqueues *pwq)
{
    // 初始化poll_table, 相当于调用基类的构造函数
    init_poll_funcptr(&pwq->pt, __pollwait);
    /*
     * static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
     * {
     *     pt->_qproc = qproc;
     *     pt->_key   = ~0UL;
     * }
     */
    pwq->polling_task = current;
    pwq->triggered = 0;
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}


// wait_queue设置函数
// poll/select 向文件wait_queue中添加节点的方法
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p)
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry) {
        return;
    }
    get_file(filp); //put_file() in free_poll_entry()
    entry->filp = filp;
    entry->wait_address = wait_address; // 等待队列头
    entry->key = p->key;
    // 设置回调为 pollwake
    init_waitqueue_func_entry(&entry->wait, pollwake);
    entry->wait.private = pwq;
    // 添加到等待队列
    add_wait_queue(wait_address, &entry->wait);
}

// 在等待队列(wait_queue_t)上回调函数(func)
// 文件就绪后被调用，唤醒调用进程，其中key是文件提供的当前状态掩码
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry;
    // 取得文件对应的poll_table_entry
    entry = container_of(wait, struct poll_table_entry, wait);
    // 过滤不关注的事件
    if (key && !((unsigned long)key & entry->key)) {
        return 0;
    }
    // 唤醒
    return __pollwake(wait, mode, sync, key);
}
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    // 将调用进程 pwq->polling_task 关联到 dummy_wait
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
    smp_wmb();
    pwq->triggered = 1;// 标记为已触发
    // 唤醒调用进程
    return default_wake_function(&dummy_wait, mode, sync, key);
}

// 默认的唤醒函数,poll/select 设置的回调函数会调用此函数唤醒
// 直接唤醒等待队列上的线程,即将线程移到运行队列(rq)
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                          void *key)
{
    // 这个函数比较复杂, 这里就不具体分析了
    return try_to_wake_up(curr->private, mode, wake_flags);
}

poll，select对poll_table_entry的申请和释放采用的是类似内存池的管理方式，先使用预分配的空间，预分配的空间不足时，分配一个内存页，使用内存页上的空间。

// 分配或使用已先前申请的 poll_table_entry,
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) {
    struct poll_table_page *table = p->table;

    if (p->inline_index < N_INLINE_POLL_ENTRIES) {
        return p->inline_entries + p->inline_index++;
    }

    if (!table || POLL_TABLE_FULL(table)) {
        struct poll_table_page *new_table;
        new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!new_table) {
            p->error = -ENOMEM;
            return NULL;
        }
        new_table->entry = new_table->entries;
        new_table->next = table;
        p->table = new_table;
        table = new_table;
    }
    return table->entry++;
}

// 清理poll_wqueues 占用的资源
void poll_freewait(struct poll_wqueues *pwq)
{
    struct poll_table_page * p = pwq->table;
    // 遍历所有已分配的inline poll_table_entry
    int i;
    for (i = 0; i < pwq->inline_index; i++) {
        free_poll_entry(pwq->inline_entries + i);
    }
    // 遍历在poll_table_page上分配的inline poll_table_entry
    // 并释放poll_table_page
    while (p) {
        struct poll_table_entry * entry;
        struct poll_table_page *old;
        entry = p->entry;
        do {
            entry--;
            free_poll_entry(entry);
        } while (entry > p->entries);
        old = p;
        p = p->next;
        free_page((unsigned long) old);
    }
}
static void free_poll_entry(struct poll_table_entry *entry)
{
    // 从等待队列中删除, 释放文件引用计数
    remove_wait_queue(entry->wait_address, &entry->wait);
    fput(entry->filp);
}

poll/select核心结构关系

下图是 poll/select 实现公共部分的关系图，包含了与文件直接的关系，以及函数之间的依赖。

poll的实现

// poll 使用的结构体
struct pollfd {
    int fd;        // 描述符
    short events;  // 关注的事件掩码
    short revents; // 返回的事件掩码
};
// long sys_poll(struct pollfd *ufds, unsigned int nfds, long timeout_msecs)
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                long, timeout_msecs)
{
    struct timespec end_time, *to = NULL;
    int ret;
    if (timeout_msecs >= 0) {
        to = &end_time;
        // 将相对超时时间msec 转化为绝对时间
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                                NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }
    // do sys poll
    ret = do_sys_poll(ufds, nfds, to);
    // do_sys_poll 被信号中断, 重新调用, 对使用者来说 poll 是不会被信号中断的.
    if (ret == -EINTR) {
        struct restart_block *restart_block;
        restart_block = &current_thread_info()->restart_block;
        restart_block->fn = do_restart_poll; // 设置重启的函数
        restart_block->poll.ufds = ufds;
        restart_block->poll.nfds = nfds;
        if (timeout_msecs >= 0) {
            restart_block->poll.tv_sec = end_time.tv_sec;
            restart_block->poll.tv_nsec = end_time.tv_nsec;
            restart_block->poll.has_timeout = 1;
        } else {
            restart_block->poll.has_timeout = 0;
        }
        // ERESTART_RESTARTBLOCK 不会返回给用户进程,
        // 而是会被系统捕获, 然后调用 do_restart_poll,
        ret = -ERESTART_RESTARTBLOCK;
    }
    return ret;
}
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                struct timespec *end_time)
{
    struct poll_wqueues table;
    int err = -EFAULT, fdcount, len, size;
    /* 首先使用栈上的空间，节约内存，加速访问 */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;
    if (nfds > rlimit(RLIMIT_NOFILE)) {
        // 文件描述符数量超过当前进程限制
        return -EINVAL