在完成listen系统调用后,作为TCP Server的进程就可以等待接受连接请求了。当请求到来时,进程需要调用accept系统调用生成一个新的socket,并用它与客户端传输数据。这时进程需要管理的socket有两类:1)等待连接请求到来并与之建立连接的socket;2)已经与客户端建立了一对一连接、用于与之进行数据传输的socket。当这些socket的数量很多时,如何及时获知哪些socket上有可读、可写等I/O事件通告到来并对其进行处理,就成了会极大影响TCP Server性能的关键问题。
11.2.1 epoll模型
Linux epoll是一个高效的I/O事件通告机制。下面说明一下TCP是如何使用epoll完成对I/O事件的监控的。使用epoll的模型示例:
/*
 * Illustrative skeleton of an epoll-driven TCP server.
 *
 * Lines consisting of "..." mark code elided by the text (address setup,
 * buffer declarations, error handling, and so on); this listing is a model,
 * not a complete program.
 *
 * Two kinds of sockets are monitored through a single epoll instance:
 *   1) the listening socket, which becomes readable when a new connection
 *      request arrives;
 *   2) connected sockets returned by accept(), which are used to exchange
 *      data with one client each.
 */
int main(void)
{
    struct epoll_event ev = {0};
    struct epoll_event events[20] = {0};
    int epfd; /* epoll instance descriptor (was used but never declared) */
    int fd;
    int listenfd;
    int sockfd;
    int nfds;
    int i;
    int rfd;
    ssize_t rlen;
    ssize_t wlen;

    listenfd = socket(AF_INET, SOCK_STREAM, 0);
    ...
    bind(listenfd, serveraddr, serveraddrlen);
    ...
    listen(listenfd, 10);
    ...
    epfd = epoll_create(256);
    ev.data.fd = listenfd;
    ev.events = EPOLLIN;
    /* Monitor I/O events that occur on listenfd. */
    epoll_ctl(epfd, EPOLL_CTL_ADD, listenfd, &ev);
    while (1) {
        /*
         * Wait for event notifications. With no pending notification the
         * process sleeps and consumes no CPU; when one arrives the process
         * is woken up and handles the reported events.
         */
        nfds = epoll_wait(epfd, events, 20, -1);
        for (i = 0; i < nfds; ++i) {
            if (events[i].events & EPOLLIN) {
                if ((sockfd = events[i].data.fd) < 0) {
                    continue;
                }
                if (sockfd == listenfd) {
                    /* Readable event on a type-1 socket: a new connection request arrived. */
                    printf("Registered vm has changed!\n");
                    /* Accept the request, producing a new socket descriptor. */
                    sockfd = accept(listenfd, clientaddr, clientaddrlen);
                    ...
                    /*
                     * Register only the freshly accepted socket. Doing this
                     * for an already-registered connection socket would make
                     * EPOLL_CTL_ADD fail with EEXIST.
                     */
                    if (sockfd >= 0) {
                        ev.data.fd = sockfd;
                        ev.events = EPOLLIN|EPOLLET;
                        epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
                    }
                } else {
                    /* Readable event on a type-2 (connected) socket. */
                    rlen = read(sockfd, buf, sizeof(buf));
                }
                ...
            } else if (events[i].events & EPOLLOUT) {
                /* Writable event: the process is told that sockfd can send data. */
                sockfd = events[i].data.fd;
                wlen = write(sockfd, data, data_len);
                /*
                 * Short write: not all data could be sent. Subscribe to the
                 * writable event so that epoll notifies the process once
                 * sockfd can send again.
                 */
                if (0 <= wlen && wlen < data_len) {
                    ev.data.fd = sockfd;
                    ev.events = EPOLLOUT|EPOLLET;
                    epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
                }
                ...
            }
            ...
        }
    }
    ...
    return 0;
}
epoll_create用于产生一个epoll的文件描述符,一个epoll文件描述符对应一个文件描述符集合。
epoll_ctl用于控制这个集合中的成员(加入、删除、变更定制事件等)。在内核中,epoll_ctl会将新的fd加入到一棵红黑树中加以管理。
epoll_wait用于等待集合中成员的I/O事件发生;如果所有成员都没有I/O事件,则保持进程的睡眠状态;否则,进程会被唤醒,epoll_wait会返回所有发生的事件的信息。下面重点研究epoll是如何使进程睡眠,在有I/O事件时内核又如何唤醒进程的。
11.2.2 epoll_ctl内核代码
epoll_ctl内核代码如下:
1788 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1789 struct epoll_event __user *, event)
1790 {
1791 int error;
1792 int did_lock_epmutex = 0;
1793 struct file *file, *tfile;
1794 struct eventpoll *ep;
1795 struct epitem *epi;
1796 struct epoll_event epds;
1797
1798 error = -EFAULT;
1799 if (ep_op_has_event(op) &&
1800 copy_from_user(&epds, event, sizeof(struct epoll_event)))
1801 goto error_return;
1802
1803 /* Get the "struct file *" for the eventpoll file */
1804 error = -EBADF;
1805 file = fget(epfd);
1806 if (!file)
1807 goto error_return;
1808
1809 /* Get the "struct file *" for the target file */
1810 tfile = fget(fd);
1811 if (!tfile)
1812 goto error_fput;
1813
1814 /* The target file descriptor must support poll */
1815 error = -EPERM;
1816 if (!tfile->f_op || !tfile->f_op->poll)
1817 goto error_tgt_fput;
1818
1819 /* Check if EPOLLWAKEUP is allowed */
1820 if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
1821 epds.events &= ~EPOLLWAKEUP;
1822
1823 /*
1824 * We have to check that the file structure underneath the file descriptor
1825 * the user passed to us _is_ an eventpoll file. And also we do not permit
1826 * adding an epoll file descriptor inside itself.
1827 */
1828 error = -EINVAL;
1829 if (file == tfile || !is_file_epoll(file))
1830 goto error_tgt_fput;
1831
1832 /*
1833 * At this point it is safe to assume that the "private_data" contains
1834 * our own data structure.
1835 */
1836 ep = file->private_data;
...
1850 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1851 mutex_lock(&epmutex);
1852 did_lock_epmutex = 1;
1853 }
1854 if (op == EPOLL_CTL_ADD) {
1855 if (is_file_epoll(tfile)) {
1856 error = -ELOOP;
1857 if (ep_loop_check(ep, tfile) != 0) {
1858 clear_tfile_check_list();
1859 goto error_tgt_fput;
1860 }
1861 } else
1862 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1863 }
1864
1865 mutex_lock_nested(&ep->mtx, 0);
1866
1867 /*
1868 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
1869 * above, we can be sure to be able to use the item looked up by
1870 * ep_find() till we release the mutex.
1871 */
1872 epi = ep_find(ep, tfile, fd);
1873
1874 error = -EINVAL;
1875 switch (op) {
1876 case EPOLL_CTL_ADD: //添加
1877 if (!epi) {
1878 epds.events |= POLLERR | POLLHUP;
1879 error = ep_insert(ep, &epds, tfile, fd);
1880 } else
1881 error = -EEXIST;
1882 clear_tfile_check_list();
1883 break;
1884 case EPOLL_CTL_DEL: //删除
1885 if (epi)
1886 error = ep_remove(ep, epi);
1887 else
1888 error = -ENOENT;
1889 bre