转载请注明出处:http://blog.youkuaiyun.com/linxdcn/article/details/73028584
1 概述
Linux内核一旦发现进程指定的一个或多个IO条件就绪,它就通知进程。这个能力称为IO复用。
IO复用的函数主要有select、poll和epoll,本文主要介绍epoll函数。
epoll提供了三个函数,epoll_create,epoll_ctl和epoll_wait,epoll_create是创建一个epoll句柄;epoll_ctl是注册要监听的事件类型;epoll_wait则是等待事件的产生。
2 数据结构
/*
 * Core of the epoll implementation: one of these exists in the kernel
 * for every epoll descriptor created by epoll_create().
 */
struct eventpoll {
	/* Protects rdllist and ovflist. */
	spinlock_t lock;
	/* Serializes epoll_ctl() against event delivery paths. */
	struct mutex mtx;
	/* Wait queue: sys_epoll_wait() callers sleep here. */
	wait_queue_head_t wq;
	/* Wait queue used by f_op->poll() when this epoll instance is
	 * itself monitored by another event-notification mechanism. */
	wait_queue_head_t poll_wait;
	/* List of ready epitems that still need to be checked/reported. */
	struct list_head rdllist;
	/* Red-black tree holding one epitem per file added to this epoll. */
	struct rb_root rbr;
	/* Collects items that become ready while events are being
	 * copied to user space. */
	struct epitem *ovflist;
	/* The user that created the eventpoll descriptor */
	struct user_struct *user;
	/* Backing anonymous-inode file of this epoll instance. */
	struct file *file;
	/* Loop-check optimization: avoids repeatedly re-visiting the
	 * same instance during cycle detection. */
	int visited;
	struct list_head visited_list_link;
};
/* One epitem exists for every file added to an epoll instance. */
struct epitem {
/* Node linking this item into eventpoll.rbr (the red-black tree). */
struct rb_node rbn;
/* Node linking this item into eventpoll.rdllist when ready. */
struct list_head rdllink;
/* Next pointer used while queued on eventpoll.ovflist. */
struct epitem *next;
/* Target file info (fd + struct file); the red-black tree key. */
struct epoll_filefd ffd;
/* Number of active wait queue attached to poll operations */
int nwait;
/* List of wait-queue hooks (eppoll_entry) for the current file.
 * A single file may be watched for several kinds of events, and
 * those events may belong to different wait_queues (this depends
 * on the file type's poll implementation), so a list is needed. */
struct list_head pwqlist;
/* Owner of this epitem. */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* Event mask and user data passed in via epoll_ctl(). */
struct epoll_event event;
};
/* Red-black tree key: the watched struct file plus its fd number. */
struct epoll_filefd {
struct file *file;
int fd;
};
/* Associates an epitem with one wait_queue_head of a file.  A file
 * may have multiple pending events, and those events may use
 * different wait queues, so one epitem can own several of these. */
struct eppoll_entry {
/* List struct epitem.pwqlist */
struct list_head llink;
/* The owning epitem. */
struct epitem *base;
/* Node added to the file's wait_queue; carries the wake callback. */
wait_queue_t wait;
/* Head of the file wait_queue this entry is registered on. */
wait_queue_head_t *whead;
};
/* The epoll_event record exchanged with user space. */
struct epoll_event {
__u32 events;
__u64 data;
} EPOLL_PACKED;
3 epoll_create函数
/*
 * epoll_create(size): legacy entry point.  The size argument has been
 * ignored since Linux 2.6.8 and only needs to be positive; the real
 * work is delegated to sys_epoll_create1().
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	return size > 0 ? sys_epoll_create1(0) : -EINVAL;
}
/*
 * epoll_create1(flags): create a new epoll instance, returning a file
 * descriptor that refers to it.  Only EPOLL_CLOEXEC is accepted in
 * flags.  On failure a negative errno is returned.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	struct eventpoll *ep = NULL;
	struct file *epfile;
	int err;
	int fd;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	/* EPOLL_CLOEXEC is the only flag supported here. */
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	/* Allocate the core struct eventpoll. */
	err = ep_alloc(&ep);
	if (err < 0)
		return err;

	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		err = fd;
		goto out_free_ep;
	}

	/* Back the descriptor with an anonymous-inode file. */
	epfile = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				    O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(epfile)) {
		err = PTR_ERR(epfile);
		goto out_free_fd;
	}
	ep->file = epfile;
	fd_install(fd, epfile);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return err;
}
4 epoll_ctl函数
/*
 * epoll_ctl(epfd, op, fd, event): register (ADD), change (MOD) or
 * unregister (DEL) a watch on the target fd inside the epoll
 * instance referred to by epfd.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
if (ep_op_has_event(op) &&
/* 1. Copy the event description from user space (ADD/MOD only). */
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
/* Look up the struct file behind epfd; since epfd is a real fd,
 * the kernel holds a corresponding struct file for it. */
error = -EBADF;
f = fdget(epfd);
if (!f.file)
goto error_return;
/* Look up the target file, i.e. the fd we want to monitor. */
tf = fdget(fd);
if (!tf.file)
goto error_fput;
/* The target file must provide a poll operation. */
error = -EPERM;
if (!tf.file->f_op->poll)
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(&epds);
/* Reject adding the epoll fd to itself, and reject an epfd that
 * is not an epoll handle at all. */
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 * Also, we do not currently supported nested exclusive wakeups.
 */
if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
/* Fetch the internal eventpoll structure. */
ep = f.file->private_data;
/*
 * When we insert an epoll file descriptor, inside another epoll file
 * descriptor, there is the change of creating closed loops, which are
 * better be handled here, than in more critical paths. While we are
 * checking for loops we also determine the list of files reachable
 * and hang them on the tfile_check_list, so we can check that we
 * haven't created too many possible wakeup paths.
 *
 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
 * the epoll file descriptor is attaching directly to a wakeup source,
 * unless the epoll file descriptor is nested. The purpose of taking the
 * 'epmutex' on add is to prevent complex toplogies such as loops and
 * deep wakeup paths from forming in parallel through multiple
 * EPOLL_CTL_ADD operations.
 */
mutex_lock_nested(&ep->mtx, 0);
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
full_check = 1;
mutex_unlock(&ep->mtx);
mutex_lock(&epmutex);
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
clear_tfile_check_list();
goto error_tgt_fput;
}
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
mutex_lock_nested(&ep->mtx, 0);
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
mutex_lock_nested(&tep->mtx, 1);
}
}
}
/* The kernel keeps one epitem per monitored fd; look it up. */
epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
/* Insertion. */
case EPOLL_CTL_ADD:
if (!epi) {
/* ep_find() found nothing: first insertion of this fd,
 * accept it.  POLLERR and POLLHUP are always reported. */
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
clear_tfile_check_list();
break;
/* Removal. */
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
/* Modification. */
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
}
} else
error = -ENOENT;
break;
}
if (tep != NULL)
mutex_unlock(&tep->mtx);
mutex_unlock(&ep->mtx);
error_tgt_fput:
if (full_check)
mutex_unlock(&epmutex);
fdput(tf);
error_fput:
fdput(f);
error_return:
return error;
}
5 epoll_wait函数
/*
 * epoll_wait(epfd, events, maxevents, timeout): wait for events on
 * the epoll instance behind epfd and copy up to maxevents of them
 * into the user-supplied buffer.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	struct eventpoll *ep;
	struct fd f;
	int ret;

	/* Sanity-check the requested event count. */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* The destination buffer must be writable by the caller. */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Pin the struct file behind epfd. */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/* It must really be an epoll file. */
	ret = -EINVAL;
	if (!is_file_epoll(f.file))
		goto out_fput;

	/* Grab the eventpoll instance and perform the actual wait. */
	ep = f.file->private_data;
	ret = ep_poll(ep, events, maxevents, timeout);

out_fput:
	fdput(f);
	return ret;
}
epoll_wait的实际等待工作由内部函数ep_poll完成:
/*
 * ep_poll(): the core of epoll_wait().  Blocks the caller until at
 * least one event is available, the timeout expires, or a signal is
 * pending, then transfers ready events to the user buffer.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
u64 slack = 0;
wait_queue_t wait;
ktime_t expires, *to = NULL;
if (timeout > 0) {
/* Convert the millisecond timeout to an absolute kernel time. */
struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
/* Non-blocking caller: just check the ready list once. */
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
goto check_events;
}
fetch_events:
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
spin_lock_irqsave(&ep->lock, flags);
/* No events available: both rdllist and ovflist are empty. */
if (!ep_events_available(ep)) {
/*
 * Busy poll timed out. Drop NAPI ID for now, we can add
 * it back in when we have moved a socket with a valid NAPI
 * ID onto the ready list.
 */
ep_reset_busy_poll_napi_id(ep);
/* Queue the current task on ep->wq so the wakeup callback
 * can wake it when an event fires. */
init_waitqueue_entry(&wait, current);
__add_wait_queue_exclusive(&ep->wq, &wait);
for (;;) {
/*
 * We don't want to sleep if the ep_poll_callback() sends us
 * a wakeup in between. That's why we set the task state
 * to TASK_INTERRUPTIBLE before doing the checks.
 */
set_current_state(TASK_INTERRUPTIBLE);
if (ep_events_available(ep) || timed_out)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
spin_unlock_irqrestore(&ep->lock, flags);
/* Suspend the current task until a wakeup or the timeout. */
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
__set_current_state(TASK_RUNNING);
}
check_events:
/* Re-check whether events are available before leaving. */
eavail = ep_events_available(ep);
spin_unlock_irqrestore(&ep->lock, flags);
/*
 * Try to transfer events to user space. In case we get 0 events and
 * there's still timeout left over, we go trying again in search of
 * more luck.
 */
if (!res && eavail &&
/* Copy the ready events out to the user buffer. */
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
return res;
}
6 总结
(1)水平触发模式(level-trigger)与边缘触发模式(edge-trigger)
水平触发模式,只要某个socket处于readable/writable状态,无论什么时候进行epoll_wait都会返回该socket。
边缘触发模式,只有某个socket从unreadable变为readable或从unwritable变为writable时,epoll_wait才会返回该socket。
调用epoll_wait时,当有事件就绪,先把就绪事件链表转移到中间链表,然后挨个遍历拷贝到用户空间,并且挨个判断其是否为水平触发,是的话再次插入到就绪链表。
(2)一般情况下,epoll比select/poll高效
select和poll即使只有一个描述符就绪,也要遍历整个集合。如果集合中活跃的描述符很少,遍历过程的开销就会变得很大,而如果集合中大部分的描述符都是活跃的,遍历过程的开销又可以忽略。
epoll的实现中每次只遍历活跃的描述符(如果是水平触发,也会遍历先前活跃的描述符),在活跃描述符较少的情况下就会很有优势。
但是,在代码的分析过程中可以看到epoll的实现过于复杂并且其实现过程中需要同步处理(锁),如果大部分描述符都是活跃的,epoll的效率可能不如select或poll。
(3)epoll的实现中,内核将持久维护加入的描述符,减少了内核和用户复制数据的开销。并且每次只把就绪事件拷贝回用户空间。
转载请注明出处:http://blog.youkuaiyun.com/linxdcn/article/details/73028584