epoll Source Code Analysis

This article analyzes how epoll is implemented in the Linux kernel. By walking through the source of the key functions epoll_create, epoll_ctl, and epoll_wait, it aims to give the reader a thorough understanding of epoll's high-performance I/O multiplexing mechanism.
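
Before descending into the kernel, it helps to keep the user-space view in mind. Below is a minimal sketch (watching stdin, with terse error handling) of the three system calls this article dissects, annotated with the kernel entry point each call reaches:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/epoll.h>

    #define MAX_EVENTS 8

    int main(void)
    {
        /* epoll_create1() -> do_epoll_create(): allocates a struct eventpoll
         * and wraps it in an anonymous inode file. */
        int epfd = epoll_create1(0);
        if (epfd == -1) { perror("epoll_create1"); return 1; }

        /* epoll_ctl(ADD) -> ep_insert(): allocates an epitem for stdin and
         * links it into the eventpoll's red-black tree. */
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) == -1) {
            perror("epoll_ctl");
            return 1;
        }

        /* epoll_wait() -> ep_poll(): sleeps on eventpoll.wq until the ready
         * list is non-empty, then copies ready events back to user space. */
        struct epoll_event events[MAX_EVENTS];
        int n = epoll_wait(epfd, events, MAX_EVENTS, -1);
        for (int i = 0; i < n; i++)
            printf("fd %d ready, events=0x%x\n", events[i].data.fd, events[i].events);

        close(epfd);
        return 0;
    }

The rest of the article follows these calls down into fs/eventpoll.c, starting with its core data structures.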

    struct epoll_filefd {
        struct file *file;
        int fd;
    } __packed;
    
    /*
     * Each file descriptor added to the eventpoll interface will
     * have an entry of this type linked to the "rbr" RB tree.
     * Avoid increasing the size of this struct, there can be many thousands
     * of these on a server and we do not want this to take another cache line.
     */
    struct epitem {
        union {
            /* RB tree node links this structure to the eventpoll RB tree */
        //RB-tree node linking this epitem into the eventpoll's "rbr" tree
            struct rb_node rbn;
            /* Used to free the struct epitem */
            struct rcu_head rcu;
        };
    
        /* List header used to link this structure to the eventpoll ready list */
        //list node: ready epitems are linked into the eventpoll's rdllist
        struct list_head rdllink;
    
        /*
         * Works together with "struct eventpoll"->ovflist in keeping the
         * single linked chain of items.
         */
        //next pointer in the ovflist chain; EP_UNACTIVE_PTR when not chained
        struct epitem *next;
    
        /* The file descriptor information this item refers to */
        //the file descriptor / file pair this epitem watches
        struct epoll_filefd ffd;
    
        /* Number of active wait queue attached to poll operations */
        //number of wait queues this item is attached to (-1 signals a failed hook allocation)
        int nwait;
    
        /* List containing poll wait queues */
        //doubly linked list of wait-queue hooks (eppoll_entry) on the watched file
        struct list_head pwqlist;
    
        /* The "container" of this item */
        //the eventpoll this item belongs to
        struct eventpoll *ep;
    
        /* List header used to link this item to the "struct file" items list */
        //links this item into the watched file's list of epitems
        struct list_head fllink;
    
        /* wakeup_source used when EPOLLWAKEUP is set */
        struct wakeup_source __rcu *ws;
    
        /* The structure that describe the interested events and the source fd */
        //the events this epitem is interested in, passed in from user space via epoll_ctl
        struct epoll_event event;
    };
    
    /*
     * This structure is stored inside the "private_data" member of the file
     * structure and represents the main data structure for the eventpoll
     * interface.
     *
     * Access to it is protected by the lock inside wq.
     */
    struct eventpoll {
        /*
         * This mutex is used to ensure that files are not removed
         * while epoll is using them. This is held during the event
         * collection loop, the file cleanup path, the epoll file exit
         * code and the ctl operations.
         */
        //serializes access to this structure
        struct mutex mtx;
    
        /* Wait queue used by sys_epoll_wait() */
        //this is the wait queue we "sleep" on when calling epoll_wait()...
        wait_queue_head_t wq;
    
        /* Wait queue used by file->poll() */
        //wait queue used when the epoll fd itself is polled via file->poll()
        wait_queue_head_t poll_wait;
    
        /* List of ready file descriptors */
        //a non-empty ready list wakes up epoll_wait()
        struct list_head rdllist;
    
        /* RB tree root used to store monitored fd structs */
        //root of the red-black tree of monitored fds
        struct rb_root_cached rbr;
    
        /*
         * This is a single linked list that chains all the "struct epitem" that
         * happened while transferring ready events to userspace w/out
         * holding ->wq.lock.
         */
        //staging area used while events are being transferred to user space
        struct epitem *ovflist;
    
        /* wakeup_source used when ep_scan_ready_list is running */
        //keeps the system awake while ep_scan_ready_list() drains the ready list
        struct wakeup_source *ws;
    
        /* The user that created the eventpoll descriptor */
        //the user that created this epoll fd (for per-user watch accounting)
        struct user_struct *user;
    
        //the struct file backing this epoll fd
        struct file *file;
    
        /* used to optimize loop detection check */
        //bookkeeping used to optimize the loop-detection checks
        int visited;
        struct list_head visited_list_link;
    
    };
    
    /* Wait structure used by the poll hooks */
    //node structure hooked onto a watched file's wait queue
    struct eppoll_entry {
        /* List header used to link this structure to the "struct epitem" */
        struct list_head llink;
    
        /* The "base" pointer is set to the container "struct epitem" */
        //back-pointer to the owning epitem
        struct epitem *base;
    
        /*
         * Wait queue item that will be linked to the target file wait
         * queue head.
         */
        //the wait queue entry itself; its wakeup callback is ep_poll_callback
        wait_queue_entry_t wait;
    
        /* The wait queue head that linked the "wait" wait queue item */
        //head of the wait queue this entry is linked on
        wait_queue_head_t *whead;
    };
    
    /* Wrapper struct used by poll queueing */
    //wrapper handed to the file's poll() during registration
    struct ep_pqueue {
        //poll table whose queue callback is set to ep_ptable_queue_proc
        poll_table pt;
        //the epitem being registered
        struct epitem *epi;
    };
    
    /* Used by the ep_send_events() function as callback private data */
    //holds the arguments passed to the ep_send_events() transfer callback
    struct ep_send_events_data {
        int maxevents;
        struct epoll_event __user *events;
        int res;
    };
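
Taken together, the structures above form the following object graph (a sketch assembled from the definitions in this file):

    eventpoll (one per epoll fd, hung off file->private_data)
     |-- rbr -----> epitem (one per watched fd, keyed by epoll_filefd)
     |                |-- rdllink --> linked into eventpoll.rdllist when ready
     |                |-- next -----> chained on eventpoll.ovflist during transfers
     |                `-- pwqlist --> eppoll_entry (one per hooked wait queue)
     |                                  |-- wait  : entry whose callback is ep_poll_callback
     |                                  `-- whead : the watched file's wait queue head
     |-- rdllist : ready epitems, consumed by epoll_wait()
     `-- wq      : wait queue where epoll_wait() callers sleep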
    
    
    static inline int ep_events_available(struct eventpoll *ep)
    {
        return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
    }
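
EP_UNACTIVE_PTR is the sentinel that marks the ovflist machinery as idle; it is defined near the top of fs/eventpoll.c:

    /* Stored in ep->ovflist (and in epi->next) to mean "the overflow list
     * is not in use" / "this item is not chained on it". */
    #define EP_UNACTIVE_PTR ((void *) -1L)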
    
    static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
    {
        int pwake = 0;
        unsigned long flags;
        struct epitem *epi = ep_item_from_wait(wait); //get the epitem from its wait queue entry
        struct eventpoll *ep = epi->ep; //and the owning eventpoll from the epitem
        __poll_t pollflags = key_to_poll(key);
        int ewake = 0;
    
        spin_lock_irqsave(&ep->wq.lock, flags);
    
        ep_set_busy_poll_napi_id(epi);
    
        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        //if the mask holds no real poll(2) event (typically after EPOLLONESHOT
        //fired), the descriptor is effectively disabled; nothing to do
        //#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
            goto out_unlock;
    
        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test for "key" != NULL before the event match test.
         */
        if (pollflags && !(pollflags & epi->event.events))
            goto out_unlock;
    
        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        //a transfer to user space is in progress: chain this item on ovflist
        //so it is requeued afterwards instead of being lost
        if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
            if (epi->next == EP_UNACTIVE_PTR) {
                epi->next = ep->ovflist;
                ep->ovflist = epi;
                if (epi->ws) {
                /*
                 * Activate ep->ws since epi->ws may get
                 * deactivated at any time.
                 */
                    __pm_stay_awake(ep->ws);
                }
    
            }
            goto out_unlock;
        }
    
        /* If this file is already in the ready list we exit soon */
        //not on the ready list yet: add this epitem to it
        if (!ep_is_linked(epi)) {
            list_add_tail(&epi->rdllink, &ep->rdllist);
            ep_pm_stay_awake_rcu(epi);
        }
    
        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        //wake up any task sleeping in epoll_wait()
        if (waitqueue_active(&ep->wq)) {
            if ((epi->event.events & EPOLLEXCLUSIVE) &&
                    !(pollflags & POLLFREE)) {
                switch (pollflags & EPOLLINOUT_BITS) {
                case EPOLLIN:
                    if (epi->event.events & EPOLLIN)
                        ewake = 1;
                    break;
                case EPOLLOUT:
                    if (epi->event.events & EPOLLOUT)
                        ewake = 1;
                    break;
                case 0:
                    ewake = 1;
                    break;
                }
            }
            wake_up_locked(&ep->wq);
        }
        //note a wake-up for tasks that poll the epoll fd itself
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    
    out_unlock:
        spin_unlock_irqrestore(&ep->wq.lock, flags);
    
        /* We have to call this outside the lock */
        if (pwake)
            ep_poll_safewake(&ep->poll_wait);
    
        if (!(epi->event.events & EPOLLEXCLUSIVE))
            ewake = 1;
    
        if (pollflags & POLLFREE) {
            /*
             * If we race with ep_remove_wait_queue() it can miss
             * ->whead = NULL and do another remove_wait_queue() after
             * us, so we can't use __remove_wait_queue().
             */
            list_del_init(&wait->entry);
            /*
             * ->whead != NULL protects us from the race with ep_free()
             * or ep_remove(), ep_remove_wait_queue() takes whead->lock
             * held by the caller. Once we nullify it, nothing protects
             * ep/epi or even wait.
             */
            smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
        }
    
        return ewake;
    }
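
Note that ep_poll_callback() is never called by epoll itself: it runs whenever the driver of a watched file wakes the wait queue that ep_ptable_queue_proc() (below) hooked it onto. A hypothetical driver-side data-arrival path would look roughly like this (struct mydev and mydev_data_arrived are illustrative names, not part of the source above):

    /* Hypothetical driver-side sketch. */
    struct mydev {
        wait_queue_head_t read_wq;  /* the whead epoll's entry is queued on */
    };

    static void mydev_data_arrived(struct mydev *dev)
    {
        /* Walks every wait_queue_entry_t queued on read_wq and invokes its
         * ->func callback. For an entry queued by ep_ptable_queue_proc()
         * that callback is ep_poll_callback, and the readiness mask passed
         * here becomes the "key" argument seen above. */
        wake_up_interruptible_poll(&dev->read_wq, EPOLLIN | EPOLLRDNORM);
    }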
    
    /*
     * This is the callback that is used to add our wait queue to the
     * target file wakeup lists.
     */
    static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                     poll_table *pt)
    {
        struct epitem *epi = ep_item_from_epqueue(pt);
        struct eppoll_entry *pwq;
    
        if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
            //set ep_poll_callback as the wake-up function: it runs when the watched file's wait queue is woken
            init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
            pwq->whead = whead;
            pwq->base = epi;
            //link the freshly created entry onto the target wait queue
            if (epi->event.events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(whead, &pwq->wait);
            else
                add_wait_queue(whead, &pwq->wait);
            list_add_tail(&pwq->llink, &epi->pwqlist);
            //how many wait queues this epitem is hooked on; a driver's poll() may
            //call poll_wait() more than once, so this can exceed 1
            epi->nwait++;
        } else {
            /* We have to signal that an error occurred */
            epi->nwait = -1;
        }
    }
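
ep_ptable_queue_proc() is reached through poll_wait(), the helper that every f_op->poll implementation calls. Its definition in include/linux/poll.h is essentially the following, and a driver's poll method therefore hooks epoll in without knowing anything about it (mydev and data_ready below are illustrative):

    /* include/linux/poll.h (simplified): forward to the registered queue
     * proc, which for an epoll registration is ep_ptable_queue_proc. */
    static inline void poll_wait(struct file *filp,
                                 wait_queue_head_t *wait_address, poll_table *p)
    {
        if (p && p->_qproc && wait_address)
            p->_qproc(filp, wait_address, p);
    }

    /* Hypothetical driver poll method. */
    static __poll_t mydev_poll(struct file *file, poll_table *wait)
    {
        struct mydev *dev = file->private_data;
        __poll_t mask = 0;

        poll_wait(file, &dev->read_wq, wait);  /* ep_ptable_queue_proc runs here */
        if (data_ready(dev))                   /* current readiness ...          */
            mask |= EPOLLIN | EPOLLRDNORM;     /* ... returned to ep_item_poll() */
        return mask;
    }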
    
    /*
     * Must be called with "mtx" held.
     */
    static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                         struct file *tfile, int fd, int full_check)
    {
        int error, pwake = 0;
        __poll_t revents;
        long user_watches;
        struct epitem *epi;
        struct ep_pqueue epq;
    
        lockdep_assert_irqs_enabled();
    
        user_watches = atomic_long_read(&ep->user->epoll_watches);
        if (unlikely(user_watches >= max_user_watches))
            return -ENOSPC;
        //allocate an epitem to track the fd being added
        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
            return -ENOMEM;
    
        /* Item initialization follow here ... */
        //initialize the epitem fields
        INIT_LIST_HEAD(&epi->rdllink);
        INIT_LIST_HEAD(&epi->fllink);
        INIT_LIST_HEAD(&epi->pwqlist);
        epi->ep = ep;
        //record the watched fd and its struct file tfile in ffd
        ep_set_ffd(&epi->ffd, tfile, fd);
        epi->event = *event;
        epi->nwait = 0;
        epi->next = EP_UNACTIVE_PTR;
        if (epi->event.events & EPOLLWAKEUP) {
            error = ep_create_wakeup_source(epi);
            if (error)
                goto error_create_wakeup_source;
        } else {
            RCU_INIT_POINTER(epi->ws, NULL);
        }
    
        /* Initialize the poll table using the queue callback */
        //set up the ep_pqueue wrapper
        epq.epi = epi;
        //install ep_ptable_queue_proc as the poll-table queue callback
        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    
        /*
         * Attach the item to the poll hooks and get current event bits.
         * We can safely use the file* here because its usage count has
         * been increased by the caller of this function. Note that after
         * this operation completes, the poll callback can start hitting
         * the new item.
         */
        //poll the target file: via epq.pt this registers ep_poll_callback
        //on its wait queue(s) and returns the currently ready events
        revents = ep_item_poll(epi, &epq.pt, 1);
    
        /*
         * We have to check if something went wrong during the poll wait queue
         * install process. Namely an allocation for a wait queue failed due
         * high memory pressure.
         */
        error = -ENOMEM;
        if (epi->nwait < 0)
            goto error_unregister;
    
        /* Add the current item to the list of active epoll hook for this file */
        //each file keeps a list of the epitems watching it
        spin_lock(&tfile->f_lock);
        list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
        spin_unlock(&tfile->f_lock);
    
        /*
         * Add the current item to the RB tree. All RB tree operations are
         * protected by "mtx", and ep_insert() is called with "mtx" held.
         */
        //insert the epitem into this eventpoll's red-black tree
        ep_rbtree_insert(ep, epi);
    
        /* now check if we've created too many backpaths */
        error = -EINVAL;
        if (full_check && reverse_path_check())
            goto error_remove_epi;
    
        /* We have to drop the new item inside our item list to keep track of it */
        spin_lock_irq(&ep->wq.lock);
    
        /* record NAPI ID of new item if present */
        ep_set_busy_poll_napi_id(epi);
    
        /* If the file is already "ready" we drop it inside the ready list */
        //if the watched fd is already ready, deal with it right now
        if (revents && !ep_is_linked(epi)) {
            //add it to the ready list
            list_add_tail(&epi->rdllink, &ep->rdllist);
            ep_pm_stay_awake(epi);
    
            /* Notify waiting tasks that events are available */
            //wake up epoll_wait() sleepers and file->poll() waiters
            if (waitqueue_active(&ep->wq))
                wake_up_locked(&ep->wq);
            if (waitqueue_active(&ep->poll_wait))
                pwake++;
        }
    
        spin_unlock_irq(&ep->wq.lock);
    
        atomic_long_inc(&ep->user->epoll_watches);
    
        /* We have to call this outside the lock */
        if (pwake)
            ep_poll_safewake(&ep->poll_wait);
    
        return 0;
    error_remove_epi:
        spin_lock(&tfile->f_lock);
        list_del_rcu(&epi->fllink);
        spin_unlock(&tfile->f_lock);
    
        rb_erase_cached(&epi->rbn, &ep->rbr);
    
    error_unregister:
        ep_unregister_pollwait(ep, epi);
    
        /*
         * We need to do this because an event could have arrived on some
         * allocated wait queue. Note that we don't care about the ep->ovflist
         * list, since that is used/cleaned only inside a section bound by "mtx".
         * And ep_insert() is called with "mtx" held.
         */
        spin_lock_irq(&ep->wq.lock);
        if (ep_is_linked(epi))
            list_del_init(&epi->rdllink);
        spin_unlock_irq(&ep->wq.lock);
    
        wakeup_source_unregister(ep_wakeup_source(epi));
    
    error_create_wakeup_source:
        kmem_cache_free(epi_cache, epi);
    
        return error;
    }
    
    /*
     * Modify the interest event mask by dropping an event if the new mask
     * has a match in the current file status. Must be called with "mtx" held.
     */
    static int ep_modify(struct eventpoll *ep, struct epitem *epi,
                         const struct epoll_event *event)
    {
        int pwake = 0;
        poll_table pt;
    
        lockdep_assert_irqs_enabled();
    
        init_poll_funcptr(&pt, NULL);
    
        /*
         * Set the new event interest mask before calling f_op->poll();
         * otherwise we might miss an event that happens between the
         * f_op->poll() call and the new event set registering.
         */
        epi->event.events = event->events; /* need barrier below */
        epi->event.data = event->data; /* protected by mtx */
        if (epi->event.events & EPOLLWAKEUP) {
            if (!ep_has_wakeup_source(epi))
                ep_create_wakeup_source(epi);
        } else if (ep_has_wakeup_source(epi)) {
            ep_destroy_wakeup_source(epi);
        }
    
        /*
         * The following barrier has two effects:
         *
         * 1) Flush epi changes above to other CPUs.  This ensures
         *    we do not miss events from ep_poll_callback if an
         *    event occurs immediately after we call f_op->poll().
         *    We need this because we did not take ep->wq.lock while
         *    changing epi above (but ep_poll_callback does take
         *    ep->wq.lock).
         *
         * 2) We also need to ensure we do not miss _past_ events
         *    when calling f_op->poll().  This barrier also
         *    pairs with the barrier in wq_has_sleeper (see
         *    comments for wq_has_sleeper).
         *
         * This barrier will now guarantee ep_poll_callback or f_op->poll
         * (or both) will notice the readiness of an item.
         */
        smp_mb();
    
        /*
         * Get current event bits. We can safely use the file* here because
         * its usage count has been increased by the caller of this function.
         * If the item is "hot" and it is not registered inside the ready
         * list, push it inside.
         */
        if (ep_item_poll(epi, &pt, 1)) {
            spin_lock_irq(&ep->wq.lock);
            if (!ep_is_linked(epi)) {
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);
    
                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
                    wake_up_locked(&ep->wq);
                if (waitqueue_active(&ep->poll_wait))
                    pwake++;
            }
            spin_unlock_irq(&ep->wq.lock);
        }
    
        /* We have to call this outside the lock */
        if (pwake)
            ep_poll_safewake(&ep->poll_wait);
    
        return 0;
    }
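
ep_modify() is also the kernel side of re-arming a one-shot descriptor: when an EPOLLONESHOT event is delivered, the kernel reduces the item's mask to EP_PRIVATE_BITS (in ep_send_events_proc(), not shown above), so ep_poll_callback() ignores further wake-ups until user space issues EPOLL_CTL_MOD. A user-space re-arm sketch (epfd and fd assumed registered earlier):

    /* Re-arm a one-shot fd after handling its event. Until this call,
     * ep_poll_callback() bails out at the EP_PRIVATE_BITS check above. */
    struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT, .data.fd = fd };
    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev) == -1)
        perror("epoll_ctl(EPOLL_CTL_MOD)");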
    
    static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                       int maxevents, long timeout)
    {
        int res = 0, eavail, timed_out = 0;
        u64 slack = 0;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;
    
        lockdep_assert_irqs_enabled();
    
        if (timeout > 0) {
            struct timespec64 end_time = ep_set_mstimeout(timeout);
    
            slack = select_estimate_accuracy(&end_time);
            to = &expires;
            *to = timespec64_to_ktime(end_time);
        } else if (timeout == 0) {
            /*
             * Avoid the unnecessary trip to the wait queue loop, if the
             * caller specified a non blocking operation.
             */
            timed_out = 1;
            spin_lock_irq(&ep->wq.lock);
            goto check_events;
        }
    
    fetch_events:
    
        if (!ep_events_available(ep))
            ep_busy_loop(ep, timed_out);
    
        spin_lock_irq(&ep->wq.lock);
    
        //no events yet: prepare to sleep
        if (!ep_events_available(ep)) {
            /*
             * Busy poll timed out.  Drop NAPI ID for now, we can add
             * it back in when we have moved a socket with a valid NAPI
             * ID onto the ready list.
             */
            ep_reset_busy_poll_napi_id(ep);
    
            /*
             * We don't have any available event to return to the caller.
             * We need to sleep here, and we will be woken up by
             * ep_poll_callback() when events become available.
             */
            init_waitqueue_entry(&wait, current);
            __add_wait_queue_exclusive(&ep->wq, &wait);
    
            for (;;) {
                /*
                 * We don't want to sleep if the ep_poll_callback() sends us
                 * a wakeup in between. That's why we set the task state
                 * to TASK_INTERRUPTIBLE before doing the checks.
                 */
                //mark the task interruptible before checking, so a wake-up
                //arriving in between is not lost
                set_current_state(TASK_INTERRUPTIBLE);
                /*
                 * Always short-circuit for fatal signals to allow
                 * threads to make a timely exit without the chance of
                 * finding more events available and fetching
                 * repeatedly.
                 */
                if (fatal_signal_pending(current)) {
                    res = -EINTR;
                    break;
                }
                //break if events are available or we timed out
                if (ep_events_available(ep) || timed_out)
                    break;
                //a pending signal also interrupts the wait
                if (signal_pending(current)) {
                    res = -EINTR;
                    break;
                }
                //release the lock and sleep until readiness or timeout
                spin_unlock_irq(&ep->wq.lock);
                if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
                    timed_out = 1;
    
                spin_lock_irq(&ep->wq.lock);
            }
            //remove ourselves from the wait queue again
            __remove_wait_queue(&ep->wq, &wait);
            __set_current_state(TASK_RUNNING);
        }
    check_events:
        /* Is it worth to try to dig for events ? */
        //ep->ovflist stages files that become ready while events are being
        //copied to user space, so either a non-empty ep->rdllist or
        //ep->ovflist != EP_UNACTIVE_PTR may mean some file is ready.
        //The latter covers two cases: ovflist == NULL, i.e. a transfer to
        //user space is in progress and there may or may not be new events;
        //and ovflist != NULL, in which case some file is definitely ready.
        //See ep_send_events().
        eavail = ep_events_available(ep);
    
        spin_unlock_irq(&ep->wq.lock);
    
        /*
         * Try to transfer events to user space. In case we get 0 events and
         * there's still timeout left over, we go trying again in search of
         * more luck.
         */
        //nothing delivered and time left: loop back and try again
        if (!res && eavail &&
                !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
            goto fetch_events;
    
        return res;
    }
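
The timeout handling above matches the epoll_wait(2) contract, summarized here as a usage sketch (epfd and events as in the earlier example):

    /* timeout > 0 : block for at most that many milliseconds (hrtimer path)
     * timeout == 0: never block; ep_poll() jumps straight to check_events
     * timeout < 0 : block indefinitely ("to" stays NULL, no timer is armed) */
    int n = epoll_wait(epfd, events, MAX_EVENTS, 0);  /* non-blocking check */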
    
    
    /*
     * Open an eventpoll file descriptor.
     */
    static int do_epoll_create(int flags)
    {
        int error, fd;
        struct eventpoll *ep = NULL;
        struct file *file;
    
        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    
        if (flags & ~EPOLL_CLOEXEC)
            return -EINVAL;
        /*
         * Create the internal data structure ("struct eventpoll").
         */
        //allocate and initialize the eventpoll
        error = ep_alloc(&ep);
        if (error < 0)
            return error;
        /*
         * Creates all the items needed to setup an eventpoll file. That is,
         * a file structure and a free file descriptor.
         */
        fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
        if (fd < 0) {
            error = fd;
            goto out_free_ep;
        }
    
        //create the anonymous file backing this epoll instance
        file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                                  O_RDWR | (flags & O_CLOEXEC));
        if (IS_ERR(file)) {
            error = PTR_ERR(file);
            goto out_free_fd;
        }
    
        //bind fd and file together and return the fd: this is the epoll descriptor
        ep->file = file;
        fd_install(fd, file);
        return fd;
    
    out_free_fd:
        put_unused_fd(fd);
    out_free_ep:
        ep_free(ep);
        return error;
    }
    
    SYSCALL_DEFINE1(epoll_create, int, size)
    {
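        /* Note: since Linux 2.6.8 the size argument is ignored; it only has
         * to be greater than zero for backward compatibility. */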
        if (size <= 0)
            return -EINVAL;
    
        return do_epoll_create(0);
    }
    
    /*
     * The following function implements the controller interface for
     * the eventpoll file that enables the insertion/removal/change of
     * file descriptors inside the interest set.
     */
    SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                    struct epoll_event __user *, event)
    {
        int error;
        int full_check = 0;
        struct fd f, tf;
        struct eventpoll *ep;
        struct epitem *epi;
        struct epoll_event epds;
        struct eventpoll *tep = NULL;
    
        error = -EFAULT;
        //validate the op and copy the event from user space into epds
        if (ep_op_has_event(op) &&
                copy_from_user(&epds, event, sizeof(struct epoll_event)))
            goto error_return;
    
        error = -EBADF;
        f = fdget(epfd);
        if (!f.file)
            goto error_return;
    
        /* Get the "struct file *" for the target file */
        tf = fdget(fd);
        if (!tf.file)
            goto error_fput;
    
        /* The target file descriptor must support poll */
        error = -EPERM;
        if (!file_can_poll(tf.file))
            goto error_tgt_fput;
    
        /* Check if EPOLLWAKEUP is allowed */
        if (ep_op_has_event(op))
            ep_take_care_of_epollwakeup(&epds);
    
        /*
         * We have to check that the file structure underneath the file descriptor
         * the user passed to us _is_ an eventpoll file. And also we do not permit
         * adding an epoll file descriptor inside itself.
         */
        error = -EINVAL;
        if (f.file == tf.file || !is_file_epoll(f.file))
            goto error_tgt_fput;
    
        /*
         * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently support nested exclusive wakeups.
         */
        if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
            if (op == EPOLL_CTL_MOD)
                goto error_tgt_fput;
            if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
                                        (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
                goto error_tgt_fput;
        }
    
        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        //the eventpoll was stored in private_data by do_epoll_create(); fetch it
        ep = f.file->private_data;
    
        /*
         * When we insert an epoll file descriptor inside another epoll file
         * descriptor, there is the chance of creating closed loops, which are
         * better handled here than in more critical paths. While we are
         * checking for loops we also determine the list of files reachable
         * and hang them on the tfile_check_list, so we can check that we
         * haven't created too many possible wakeup paths.
         *
         * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
         * the epoll file descriptor is attaching directly to a wakeup source,
         * unless the epoll file descriptor is nested. The purpose of taking the
         * 'epmutex' on add is to prevent complex topologies such as loops and
         * deep wakeup paths from forming in parallel through multiple
         * EPOLL_CTL_ADD operations.
         */
        //loop and reachability checks for nested epoll instances
        mutex_lock_nested(&ep->mtx, 0);
        if (op == EPOLL_CTL_ADD) {
            if (!list_empty(&f.file->f_ep_links) ||
                    is_file_epoll(tf.file)) {
                full_check = 1;
                mutex_unlock(&ep->mtx);
                mutex_lock(&epmutex);
                if (is_file_epoll(tf.file)) {
                    error = -ELOOP;
                    if (ep_loop_check(ep, tf.file) != 0) {
                        clear_tfile_check_list();
                        goto error_tgt_fput;
                    }
                } else
                    list_add(&tf.file->f_tfile_llink,
                             &tfile_check_list);
                mutex_lock_nested(&ep->mtx, 0);
                if (is_file_epoll(tf.file)) {
                    tep = tf.file->private_data;
                    mutex_lock_nested(&tep->mtx, 1);
                }
            }
        }
    
        /*
         * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
         * above, we can be sure to be able to use the item looked up by
         * ep_find() till we release the mutex.
         */
        //look up the epitem for (tf.file, fd) in this eventpoll's red-black tree
        epi = ep_find(ep, tf.file, fd);
    
        error = -EINVAL;
        switch (op) {
        case EPOLL_CTL_ADD:
            if (!epi) {
                //not present: insert it; EPOLLERR and EPOLLHUP are always monitored
                epds.events |= EPOLLERR | EPOLLHUP;
                error = ep_insert(ep, &epds, tf.file, fd, full_check);
            } else
                error = -EEXIST;
            if (full_check)
                clear_tfile_check_list();
            break;
        case EPOLL_CTL_DEL:
            if (epi)
                //found: remove it
                error = ep_remove(ep, epi);
            else
                error = -ENOENT;
            break;
        case EPOLL_CTL_MOD:
            if (epi) {
                //found: modify the watched events
                if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                    epds.events |= EPOLLERR | EPOLLHUP;
                    error = ep_modify(ep, epi, &epds);
                }
            } else
                error = -ENOENT;
            break;
        }
        if (tep != NULL)
            mutex_unlock(&tep->mtx);
        mutex_unlock(&ep->mtx);
    
    error_tgt_fput:
        if (full_check)
            mutex_unlock(&epmutex);
    
        fdput(tf);
    error_fput:
        fdput(f);
    error_return:
    
        return error;
    }
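
The EPOLLEXCLUSIVE restrictions enforced above (allowed only at EPOLL_CTL_ADD, not on nested epoll fds) back the usual pattern for avoiding thundering herds when several epoll instances watch the same listening socket, e.g. one per worker process (sketch; listen_fd and worker_epfd are assumed to exist):

    /* With EPOLLEXCLUSIVE (Linux 4.5+), ep_ptable_queue_proc() uses
     * add_wait_queue_exclusive(), so one incoming connection no longer
     * wakes every watching epoll instance. */
    struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE, .data.fd = listen_fd };
    if (epoll_ctl(worker_epfd, EPOLL_CTL_ADD, listen_fd, &ev) == -1)
        perror("epoll_ctl(EPOLL_CTL_ADD)");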
    
    /*
     * Implement the event wait interface for the eventpoll file. It is the kernel
     * part of the user space epoll_wait(2).
     */
    static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                             int maxevents, int timeout)
    {
        int error;
        struct fd f;
        struct eventpoll *ep;
    
        /* The maximum number of events must be greater than zero */
        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
            return -EINVAL;
    
        /* Verify that the area passed by the user is writeable */
        if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
            return -EFAULT;
    
        /* Get the "struct file *" for the eventpoll file */
        f = fdget(epfd);
        if (!f.file)
            return -EBADF;
    
        /*
         * We have to check that the file structure underneath the fd
         * the user passed to us _is_ an eventpoll file.
         */
        error = -EINVAL;
        if (!is_file_epoll(f.file))
            goto error_fput;
    
        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        //fetch the eventpoll stored there by do_epoll_create()
        ep = f.file->private_data;
    
        /* Time to fish for events ... */
        error = ep_poll(ep, events, maxevents, timeout);
    
    error_fput:
        fdput(f);
        return error;
    }
    
    SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                    int, maxevents, int, timeout)
    {
        return do_epoll_wait(epfd, events, maxevents, timeout);
    }
    
    static int __init eventpoll_init(void)
    {
        struct sysinfo si;
    
        si_meminfo(&si);
        /*
         * Allows top 4% of lowmem to be allocated for epoll watches (per user).
         */
        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
        BUG_ON(max_user_watches < 0);
    
        /*
         * Initialize the structure used to perform epoll file descriptor
         * inclusion loops checks.
         */
        ep_nested_calls_init(&poll_loop_ncalls);
    
        /*
         * We can have many thousands of epitems, so prevent this from
         * using an extra cache line on 64-bit (and smaller) CPUs
         */
        BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
    
        /* Allocates slab cache used to allocate "struct epitem" items */
        //the slab allocator (kmem_cache_create) provides the caches for struct epitem and struct eppoll_entry
        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
                                      0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
    
        /* Allocates slab cache used to allocate "struct eppoll_entry" */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
                                      sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    
        return 0;
    }
    fs_initcall(eventpoll_init);
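
To make the max_user_watches formula concrete, here is a back-of-the-envelope calculation. EP_ITEM_COST is defined in fs/eventpoll.c as sizeof(struct epitem) + sizeof(struct eppoll_entry); the byte sizes below are illustrative, not exact:

    /* Assume 4 KiB pages, 2 GiB of lowmem, EP_ITEM_COST ~ 128 + 72 = 200 B:
     *   4% of lowmem     = 2 GiB / 25      ~ 82 MiB
     *   max_user_watches = 82 MiB / 200 B  ~ 430,000 watches per user
     * Beyond that, ep_insert() fails with -ENOSPC. */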