11.2 epoll系统调用

        在完成listen系统调用后,作为TCP Server的进程就可以等待接受连接请求了。当请求到来时,进程需要调用accept系统调用生成一个新的socket,并用之与客户端传输数据。这时进程需要管理的socket有两类:1)等待请求到来并与之建立连接的socket;2)已经与客户端建立的一对一的连接并与之进行数据传输的socket。当这些socket的数量很多时,如何及时获知哪些socket有可读|可写等I/O事件通告到来并对其进行处理,就成了会极大影响TCP Server性能的关键问题。

11.2.1 epoll模型

        Linux epoll是一个高效的I/O事件通告机制。下面说明一下TCP是如何使用epoll完成对I/O事件的监控的。使用epoll的模型示例:

int main(void)
{
    struct epoll_event  ev,events[20] = {};
    int                 fd;
    int                 listenfd;
    int                 sockfd;
    int                 nfds;
    int                 i;
    int                 rfd;
    ssize_t             rlen;
    ssize_t             wlen;

    listenfd = socket(AF_INET, SOCK_STREAM, 0);
    ...
    bind(listenfd, serveraddr, serveraddrlen);
    ...
    listen(listenfd, 10);
    ...

    epfd = epoll_create(256);

    ev.data.fd = listenfd;
    ev.events = EPOLLIN;
    epoll_ctl(epfd, EPOLL_CTL_ADD, listenfd, &ev); //监控listenfd上发生的I/O事件

    while (1) {
        nfds = epoll_wait(epfd, events, 20, -1); //等待事件通告,当没有通告时进程睡眠,不占用CPU;有事件通告时进程被唤醒,然后处理事件
        for(i = 0; i < nfds; ++i) {
            if (events[i].events & EPOLLIN) {
                if ((sockfd = events[i].data.fd) < 0) {
                    continue;
                }

                if (sockfd == listenfd) { //1类socket的可读事件发生,即新的连接请求到来
                    printf("Registered vm has changed!\n");
                    sockfd = accept(listenfd, clientaddr, clientaddrlen); //接受请求,产生新的socket描述符
                    ...
                } else { //2类socket的可读事件发生
                    rlen = read(sockfd, buf, sizeof(buf));
                }
                ...
                ev.data.fd = sockfd;
                ev.events = EPOLLIN|EPOLLET;
                epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
                ...
            } else if(events[i].events & EPOLLOUT) {    //可写事件到来,即告知进程sockfd可以发送数据
                sockfd = events[i].data.fd;
                wlen = write(sockfd, data, data_len);
                if (0 <= wlen && wlen < n) {    //当有数据无法发送完毕时,可以定制可写事件通告,使epoll在得知sockfd可发送数据时通知进程
                    ev.data.fd = sockfd;
                    ev.events = EPOLLOUT|EPOLLET;
                    epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
                }
                ...
            }
            ...
        }
    }

    ...

    return 0;
}

        epoll_create用于产生一个epoll的文件描述符,一个epoll文件描述符对应一个文件描述符集合。

        epoll_ctl用于控制这个集合中的成员(加入、删除、变更定制事件等)。在内核中,epoll_ctl会将新的fd加入到一颗红黑树中加以管理。

        epoll_wait用于等待集合中成员的I/O事件发生;如果所有成员都没有I/O事件,则保持进程的睡眠状态;否则,进程会被唤醒,epoll_wait会返回所有发生的事件的信息。下面重点研究epoll是如何使进程睡眠,在有I/O事件时内核又如何唤醒进程的。

11.2.1 epoll_ctl内核代码

        epoll_ctl内核代码如下:

1788 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1789         struct epoll_event __user *, event)
1790 {
1791     int error;
1792     int did_lock_epmutex = 0;
1793     struct file *file, *tfile;
1794     struct eventpoll *ep;
1795     struct epitem *epi;  
1796     struct epoll_event epds;
1797
1798     error = -EFAULT;     
1799     if (ep_op_has_event(op) &&
1800         copy_from_user(&epds, event, sizeof(struct epoll_event)))
1801         goto error_return;
1802
1803     /* Get the "struct file *" for the eventpoll file */
1804     error = -EBADF;      
1805     file = fget(epfd);   
1806     if (!file)
1807         goto error_return;
1808
1809     /* Get the "struct file *" for the target file */
1810     tfile = fget(fd);    
1811     if (!tfile)
1812         goto error_fput;
1813
1814     /* The target file descriptor must support poll */
1815     error = -EPERM;
1816     if (!tfile->f_op || !tfile->f_op->poll)
1817         goto error_tgt_fput;
1818
1819     /* Check if EPOLLWAKEUP is allowed */
1820     if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
1821         epds.events &= ~EPOLLWAKEUP;   
1822
1823     /*
1824      * We have to check that the file structure underneath the file descriptor
1825      * the user passed to us _is_ an eventpoll file. And also we do not permit
1826      * adding an epoll file descriptor inside itself.
1827      */
1828     error = -EINVAL;
1829     if (file == tfile || !is_file_epoll(file))
1830         goto error_tgt_fput;
1831
1832     /*
1833      * At this point it is safe to assume that the "private_data" contains
1834      * our own data structure.
1835      */
1836     ep = file->private_data;
...
1850     if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1851         mutex_lock(&epmutex);
1852         did_lock_epmutex = 1;
1853     }
1854     if (op == EPOLL_CTL_ADD) {
1855         if (is_file_epoll(tfile)) {
1856             error = -ELOOP;
1857             if (ep_loop_check(ep, tfile) != 0) {
1858                 clear_tfile_check_list();
1859                 goto error_tgt_fput;
1860             }
1861         } else
1862             list_add(&tfile->f_tfile_llink, &tfile_check_list);
1863     }
1864
1865     mutex_lock_nested(&ep->mtx, 0);
1866
1867     /*
1868      * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
1869      * above, we can be sure to be able to use the item looked up by
1870      * ep_find() till we release the mutex.
1871      */
1872     epi = ep_find(ep, tfile, fd);
1873
1874     error = -EINVAL;
1875     switch (op) {
1876     case EPOLL_CTL_ADD:    //添加
1877         if (!epi) {
1878             epds.events |= POLLERR | POLLHUP;
1879             error = ep_insert(ep, &epds, tfile, fd);
1880         } else
1881             error = -EEXIST;
1882         clear_tfile_check_list();
1883         break;
1884     case EPOLL_CTL_DEL:    //删除
1885         if (epi)
1886             error = ep_remove(ep, epi);
1887         else
1888             error = -ENOENT;
1889         bre
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值