Analysis of the epoll Mechanism Source Code
The epoll API
// epoll_create creates an epoll instance (the kernel allocates resources for this handle in the epoll filesystem).
// Parameter: size — a hint for the number of fds to monitor (ignored since Linux 2.6.8, but must be > 0). Return value: the epoll fd.
int epoll_create(int size);
// Add, delete, or modify a watched fd.
// Parameters: epfd: the epoll fd, op: the operation, fd: the file descriptor to operate on, event: the event of interest.
// Return value: on success, epoll_ctl() returns zero; on error it returns -1 and sets errno appropriately.
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
// Wait for events.
// Parameters: epfd: the epoll fd, events: buffer for the returned events, maxevents: maximum number of events, timeout: timeout in milliseconds.
// Return value: the number of ready fds.
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
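For reference, here is a minimal usage sketch of the three calls above. The function name event_loop, the constant MAX_EVENTS, and the already-created non-blocking listening socket listen_fd are placeholders, not part of the original text, and error handling is omitted:

#include <sys/epoll.h>
#include <unistd.h>

#define MAX_EVENTS 64

/* Minimal event loop around epoll_create/epoll_ctl/epoll_wait. */
void event_loop(int listen_fd)
{
    int epfd = epoll_create(1);            /* the size argument is only a hint, must be > 0 */
    struct epoll_event ev, events[MAX_EVENTS];

    ev.events = EPOLLIN;                   /* level-triggered read readiness */
    ev.data.fd = listen_fd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);

    for (;;) {
        int n = epoll_wait(epfd, events, MAX_EVENTS, -1);  /* block until something is ready */
        for (int i = 0; i < n; i++) {
            if (events[i].data.fd == listen_fd) {
                /* accept() new connections and register them with EPOLL_CTL_ADD ... */
            } else {
                /* read from / write to events[i].data.fd ... */
            }
        }
    }
}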
How epoll Works
epoll is built on two data structures: a red-black tree and a doubly linked list. The red-black tree stores every monitored fd together with its registered events, while the doubly linked list holds the fds whose events are currently ready. epoll_create does the creation work, epoll_ctl only manipulates the red-black tree, and epoll_wait waits for events: if no event is ready and the timeout has not expired, the calling process is put on a wait queue (one per eventpoll instance) and goes to sleep. When it is woken up, either the timeout has expired or events have arrived; if there are events, the entries on the doubly linked list are copied to user space.
While that process sleeps, each monitored fd maintains a wait queue of its own. The entries on that queue do not point to any task; instead, when an event arrives on the fd, the fd's wait queue is woken (wake_up) and the callback registered on it is invoked. That callback is ep_poll_callback, and it is registered at epoll_ctl time by calling the device driver's poll method.
// The key parameter is used as an unsigned long and carries the ready events.
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
This function links the structure corresponding to the fd onto the ready queue (the doubly linked list) and then wakes up the process that was blocked in epoll_wait. epoll_wait then copies the ready list to user space and returns.
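The following is a simplified sketch of ep_poll_callback, condensed from kernel sources of roughly the same era as the excerpts below; the finer locking details, the check that the arriving events actually match epi->event.events, and the ovflist handling are omitted:

static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    unsigned long flags;
    struct epitem *epi = ep_item_from_wait(wait);   /* container_of back to the epitem */
    struct eventpoll *ep = epi->ep;

    spin_lock_irqsave(&ep->lock, flags);

    /* Link this epitem onto the ready list if it is not there already. */
    if (!ep_is_linked(&epi->rdllink))
        list_add_tail(&epi->rdllink, &ep->rdllist);

    /* Wake up the task sleeping in epoll_wait(), which is queued on ep->wq. */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);

    spin_unlock_irqrestore(&ep->lock, flags);
    return 1;
}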
Source Code Analysis
Key Data Structures
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 */
// Every monitored fd has one epitem.
struct epitem {
    /* RB tree node used to link this structure to the eventpoll RB tree */
    struct rb_node rbn;         // red-black tree node
    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;   // node on the ready (doubly linked) list
    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;        // next item on the ovflist chain
    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;    // the monitored file descriptor this item refers to
    /* Number of active wait queue attached to poll operations */
    int nwait;                  // number of active wait queues attached to poll operations
    /* List containing poll wait queues */
    struct list_head pwqlist;   // the monitored fd's poll wait queue entries, which carry the callback
    /* The "container" of this item */
    struct eventpoll *ep;       // the eventpoll instance this item belongs to
    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;
    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source *ws;
    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;   // the events this item is interested in
};
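The ffd member above is just a (struct file pointer, fd) pair; in kernels of this era its definition is essentially the following (listed here for completeness):

struct epoll_filefd {
    struct file *file;  /* the monitored struct file */
    int fd;             /* the user-visible file descriptor number */
};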
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
// Each epoll fd owns one of these; it holds the root of the red-black tree
// and the head of the ready (doubly linked) list.
struct eventpoll {
    /* Protect the access to this structure */
    spinlock_t lock;
    /*
     * This mutex is used to ensure that files are not removed
     * while epoll is using them. This is held during the event
     * collection loop, the file cleanup path, the epoll file exit
     * code and the ctl operations.
     */
    struct mutex mtx;               // ensures the files epoll is using are not removed underneath it
    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq;           // processes blocked in epoll_wait() sleep on this queue
    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;    // used when the epoll fd itself is polled, e.g. nested inside another epoll/poll/select
    /* List of ready file descriptors */
    struct list_head rdllist;       // head of the ready (doubly linked) list
    /* RB tree root used to store monitored fd structs */
    struct rb_root rbr;             // root of the red-black tree
    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     */
    struct epitem *ovflist;
    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;
    /* The user that created the eventpoll descriptor */
    struct user_struct *user;
    struct file *file;
    /* used to optimize loop detection check */
    int visited;
    struct list_head visited_list_link;
};
/* Wait structure used by the poll hooks — a wrapper around a wait queue entry */
struct eppoll_entry {
    /* List header used to link this structure to the "struct epitem" */
    struct list_head llink;
    /* The "base" pointer is set to the container "struct epitem" */
    struct epitem *base;
    /*
     * Wait queue item that will be linked to the target file wait
     * queue head.
     */
    wait_queue_t wait;
    /* The wait queue head that linked the "wait" wait queue item */
    wait_queue_head_t *whead;
};
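This structure is glued to the monitored fd during EPOLL_CTL_ADD: epoll calls the target file's poll method with a callback, ep_ptable_queue_proc, which allocates an eppoll_entry and hooks ep_poll_callback onto the fd's wait queue. A simplified sketch, condensed from kernels of the same era as the excerpts above:

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);  /* container_of back to the epitem */
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        /* Run ep_poll_callback when the monitored fd's wait queue is woken. */
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        /* Hook the entry onto the monitored fd's wait queue... */
        add_wait_queue(whead, &pwq->wait);
        /* ...and remember it on the epitem so it can be unhooked later. */
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* Signal that an error occurred. */
        epi->nwait = -1;
    }
}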
epoll_wait ultimately comes down to the function below, ep_poll. The kernel's English comments are kept as-is.
/**
* ep_poll - Retrieves ready events, and delivers them to the caller supplied
* event buffer.
*
* @ep: Pointer to the eventpoll context.
* @events: Pointer to the userspace buffer where the ready events should be
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
* milliseconds. If the @timeout is zero, the function will not block,
* while if the @timeout is less than zero, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
* Returns: Returns the number of ready events which have been fetched, or an
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
{
    int res = 0, eavail, timed_out = 0;
    unsigned long flags;
    long slack = 0;
    wait_queue_t wait;                    /* wait queue entry for the current task */
    ktime_t expires, *to = NULL;

    /* (timeout-setup code elided) */

fetch_events:
    spin_lock_irqsave(&ep->lock, flags);

    if (!ep_events_available(ep)) {
        /*
         * We don't have any available event to return to the caller.
         * We need to sleep here, and we will be wake up by
         * ep_poll_callback() when events will become available.
         */
        /* Key part: queue ourselves on ep->wq before sleeping. */
        init_waitqueue_entry(&wait, current);       /* wait.private = current, wait.func = default_wake_function */
        __add_wait_queue_exclusive(&ep->wq, &wait); /* add the entry to the ep->wq wait queue */

        for (;;) {
            /*
             * We don't want to sleep if the ep_poll_callback() sends us
             * a wakeup in between. That's why we set the task state
             * to TASK_INTERRUPTIBLE before doing the checks.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            /* Key part: on every iteration, check whether events are ready or the timeout expired. */
            if (ep_events_available(ep) || timed_out)
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }
            spin_unlock_irqrestore(&ep->lock, flags);
            if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
                timed_out = 1;
            spin_lock_irqsave(&ep->lock, flags);
        }
        __remove_wait_queue(&ep->wq, &wait);

        set_current_state(TASK_RUNNING);
    }
check_events:
    /* Is it worth to try to dig for events ? */
    eavail = ep_events_available(ep);

    spin_unlock_irqrestore(&ep->lock, flags);

    /*
     * Try to transfer events to user space. In case we get 0 events and
     * there's still timeout left over, we go trying again in search of
     * more luck.
     */
    if (!res && eavail &&
        !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
        goto fetch_events;

    return res;
}
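ep_send_events hands the ready list to ep_scan_ready_list, whose callback ep_send_events_proc re-polls each file, copies the current events to the user buffer, and then decides whether to put the item back. A condensed, hedged fragment of that decision (wakeup-source handling and surrounding loop omitted) looks roughly like this; it is also where the LT/ET difference discussed below comes from:

/* Inside ep_send_events_proc(), for each epitem taken off the ready list: */
if (__put_user(revents, &uevent->events) ||
    __put_user(epi->event.data, &uevent->data)) {
    /* Copying to user space failed: put the item back and report -EFAULT. */
    list_add(&epi->rdllink, head);
    return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
uevent++;
if (epi->event.events & EPOLLONESHOT)
    epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
    /*
     * Level-triggered: re-add the item to the ready list, so the next
     * epoll_wait() reports it again until the fd is drained.
     * Edge-triggered items are NOT re-added, which is why ET callers
     * must read until EAGAIN.
     */
    list_add_tail(&epi->rdllink, &ep->rdllist);
}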
Useful reference posts:
http://blog.chinaunix.net/uid-28541347-id-4273856.html
http://blog.chinaunix.net/uid-28541347-id-4238524.html
Questions Worth Thinking About
- epoll copies a registered fd from user space to kernel space only once, at epoll_ctl time; each epoll_wait only copies the ready events back from kernel space to user space. (poll, by contrast, copies the whole set of monitored fds from user space to kernel space on every call, and copies them back again on return.)
- Thread safety inside epoll:
  - adding to / removing from the doubly linked ready list ===> spinlock
  - adding to / removing from the red-black tree ===> mutex
  - epoll_wait ===> checking whether the ready list is empty uses the spinlock; copying the data out is done under the mutex
- epoll hangs the calling process on a single wait queue associated with the epoll fd, whereas poll hangs the process on the wait queue of every monitored fd, so that an event on any fd invokes a callback that is simply a wake-up function. In epoll, the callback registered on each monitored fd is ep_poll_callback; the wake-up function sits on the epoll fd's own wait queue and is invoked from inside ep_poll_callback.
- A red-black tree is used because insertion, lookup, and deletion are all O(log n).
- Overall: in most cases the callback ("reflection") approach is more efficient than traversal, but not always. When every socket is active, all of the callbacks fire and contend for the spinlock protecting the doubly linked ready list; since every socket has to be processed anyway, a plain traversal is the simplest and most effective approach. Two examples:
  - An IM server keeps long-lived connections to other servers; there are not many of them (typically 60-70 per machine, e.g. in an ICE-style architecture), but the requests on them are frequent and dense. Waking a callback for each of them is not necessarily better than traversing them with select.
  - A web portal server handles short-lived HTTP connections from browsers; the connection count is large (a busy site easily sees thousands of requests per minute), and at any moment most sockets are idle, waiting to time out. There is no point traversing all of them when only a small fraction is active, so epoll is the better fit here.
- In ET mode, if there were originally 100 bytes of data and only 20 bytes were read, will the arrival of another 200 bytes trigger a new event?
  Answer: yes. Whenever the fd's state changes, an event is delivered in either mode. The difference between LT and ET lies in what happens after the fd's entry on the doubly linked list has been copied to user space: in LT mode the entry is put back on the list, so it keeps triggering until the data has been read out; in ET mode it is taken off the list and not re-added. epoll only looks at whether an entry exists on the doubly linked list, and once epoll_wait returns the entry is gone, so in ET mode you must read all the available data in one go, otherwise the rest may never trigger again.
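To illustrate that last point, a typical ET-mode handler keeps reading until the kernel reports EAGAIN. A sketch (drain_fd, conn_fd, buf, and buflen are illustrative names; the fd is assumed to be non-blocking):

#include <errno.h>
#include <unistd.h>

/* Drain a non-blocking fd in edge-triggered mode: keep reading until
 * EAGAIN/EWOULDBLOCK, otherwise leftover data may never be reported again. */
ssize_t drain_fd(int conn_fd, char *buf, size_t buflen)
{
    ssize_t total = 0;
    for (;;) {
        ssize_t n = read(conn_fd, buf, buflen);
        if (n > 0) {
            total += n;
            /* process the n bytes in buf here ... */
        } else if (n == 0) {
            return total;                     /* peer closed the connection */
        } else if (errno == EAGAIN || errno == EWOULDBLOCK) {
            return total;                     /* fully drained for now */
        } else if (errno == EINTR) {
            continue;                         /* interrupted, retry */
        } else {
            return -1;                        /* real error */
        }
    }
}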