The traditional way to implement a network server is to dedicate one process or thread to each connection. Under large-scale concurrent access this is inefficient in both time and space, so the recommended approach is to handle I/O events in a single thread with a non-blocking I/O loop.
The epoll system calls
The epoll API has been available since Linux 2.6 and consists of three functions; a minimal usage sketch follows the list.
epoll_create() creates an epoll instance.
epoll_ctl() adds, modifies, or removes descriptors on an epoll instance.
epoll_wait() waits for events on the registered descriptors.
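A minimal sketch of how the three calls fit together (listen_fd is assumed to be a listening socket created elsewhere; error handling is omitted for brevity):

#include <sys/epoll.h>

/* Sketch only: listen_fd is assumed to come from the caller. */
static int wait_for_first_event (int listen_fd)
{
  struct epoll_event ev, ready[64];
  int efd = epoll_create1 (0);                     /* create the epoll instance */

  ev.events = EPOLLIN;                             /* watch for readability */
  ev.data.fd = listen_fd;
  epoll_ctl (efd, EPOLL_CTL_ADD, listen_fd, &ev);  /* register the descriptor */

  return epoll_wait (efd, ready, 64, -1);          /* block until events arrive */
}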
A descriptor can be added to an epoll instance in one of two modes, Edge-Triggered and Level-Triggered. The difference between the two:
1. In Level-Triggered mode, epoll_wait() returns when readable data arrives on the descriptor. If you do not read all of the data, you can call epoll_wait() again and it returns immediately with the same event. This is the default mode.
2. In Edge-Triggered mode, the arrival of readable data is announced only once. If you do not read all of the data, the next call to epoll_wait() blocks anyway, because the notification has already been delivered and a new event must arrive before another one is issued (see the registration sketch after this list).
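At registration time the only difference is the EPOLLET flag. A sketch (efd and fd are assumed to come from the caller):

#include <sys/epoll.h>

/* Sketch: register fd with the epoll instance efd in either mode. */
static int register_fd (int efd, int fd, int edge_triggered)
{
  struct epoll_event ev;
  /* Level-Triggered is the default; OR-ing in EPOLLET switches the
     descriptor to Edge-Triggered mode. */
  ev.events = edge_triggered ? (EPOLLIN | EPOLLET) : EPOLLIN;
  ev.data.fd = fd;
  return epoll_ctl (efd, EPOLL_CTL_ADD, fd, &ev);
}

In Edge-Triggered mode the descriptor should also be non-blocking, and every notification must be drained by reading until read() fails with EAGAIN, as the server below does.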
The epoll event structures
typedef union epoll_data
{
  void      *ptr;
  int        fd;
  __uint32_t u32;
  __uint64_t u64;
} epoll_data_t;

struct epoll_event
{
  __uint32_t   events;  /* Epoll events */
  epoll_data_t data;    /* User data variable */
};
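Because data is a union, each registered descriptor carries either the fd itself or a pointer to per-connection state, but not both. A sketch using data.ptr (the conn struct and watch_connection helper are hypothetical, invented for illustration):

#include <stdlib.h>
#include <sys/epoll.h>

/* Hypothetical per-connection context; not part of the epoll API. */
struct conn
{
  int    fd;
  char   buf[512];
  size_t used;
};

static int watch_connection (int efd, int client_fd)
{
  struct conn *c = malloc (sizeof *c);
  if (c == NULL)
    return -1;
  c->fd = client_fd;
  c->used = 0;

  struct epoll_event ev;
  ev.events = EPOLLIN;
  ev.data.ptr = c;   /* epoll_wait() hands this pointer back unchanged */
  return epoll_ctl (efd, EPOLL_CTL_ADD, client_fd, &ev);
}

The server in this article uses the data.fd convention instead.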
The arriving events a server typically has to handle (a dispatch skeleton follows the list):
error events
new connections
incoming data ready to read
outgoing data ready to send
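These cases map onto epoll flags and the identity of the descriptor. A dispatch skeleton (the handle_* functions are hypothetical placeholders; events, n, i, and listen_fd are assumed to exist as in the server below):

for (i = 0; i < n; i++)
{
  if (events[i].events & (EPOLLERR | EPOLLHUP))
    handle_error (events[i].data.fd);        /* error event */
  else if (events[i].data.fd == listen_fd)
    handle_accept (listen_fd);               /* new connection(s) */
  else if (events[i].events & EPOLLIN)
    handle_readable (events[i].data.fd);     /* incoming data ready */
  else if (events[i].events & EPOLLOUT)
    handle_writable (events[i].data.fd);     /* send queued output */
}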
Below is a small TCP server that writes all data sent to it to standard output.
Step 1: write a function that creates and binds a listening TCP socket.
Step 2: set the socket to non-blocking.
Step 3: run the event loop.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/epoll.h>
#include <errno.h>
#define MAXEVENTS 64
static int make_socket_non_blocking (int sfd)
{
int flags, s;
flags = fcntl (sfd, F_GETFL, 0);
if (flags == -1)
{
perror ("fcntl");
return -1;
}
flags |= O_NONBLOCK;
s = fcntl (sfd, F_SETFL, flags);
if (s == -1)
{
perror ("fcntl");
return -1;
}
return 0;
}
static int create_and_bind (char *port)
{
struct addrinfo hints;
struct addrinfo *result, *rp;
int s, sfd;
memset (&hints, 0, sizeof (struct addrinfo));
hints.ai_family = AF_UNSPEC; /* Return IPv4 and IPv6 choices */
hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */
hints.ai_flags = AI_PASSIVE; /* All interfaces */
  s = getaddrinfo (NULL, port, &hints, &result);   /* one port may map to several local addresses (IPv4 and IPv6) */
if (s != 0)
{
fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s));
return -1;
}
  for (rp = result; rp != NULL; rp = rp->ai_next)   /* walk the result list */
{
sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol);
if (sfd == -1)
continue;
s = bind (sfd, rp->ai_addr, rp->ai_addrlen);
if (s == 0)
{
/* We managed to bind successfully! */
break;
}
close (sfd);
}
  if (rp == NULL)
    {
      fprintf (stderr, "Could not bind\n");
      freeaddrinfo (result);   /* avoid leaking the address list */
      return -1;
    }
freeaddrinfo (result);
return sfd;
}
int main (int argc, char *argv[])   /* expects one argument: the port to bind */
{
int sfd, s;
int efd;
struct epoll_event event;
struct epoll_event *events;
if (argc != 2)
{
fprintf (stderr, "Usage: %s [port]\n", argv[0]);
exit (EXIT_FAILURE);
}
sfd = create_and_bind (argv[1]);
if (sfd == -1)
abort ();
s = make_socket_non_blocking (sfd);
if (s == -1)
abort ();
s = listen (sfd, SOMAXCONN);
if (s == -1)
{
perror ("listen");
abort ();
}
  efd = epoll_create1 (0);   /* create the epoll instance */
  if (efd == -1)
    {
      perror ("epoll_create1");
      abort ();
    }
event.data.fd = sfd;
  event.events = EPOLLIN | EPOLLET;   /* readable events, Edge-Triggered mode */
  s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event);   /* register the listening socket */
if (s == -1)
{
perror ("epoll_ctl");
abort ();
}
/* Buffer where events are returned */
events = calloc (MAXEVENTS, sizeof event);
  /* The event loop */
while (1)
{
int n, i;
n = epoll_wait (efd, events, MAXEVENTS, -1);
      /* n is the number of descriptors that are ready */
for (i = 0; i < n; i++)
{
          if ((events[i].events & EPOLLERR) ||   /* error on the descriptor */
              (events[i].events & EPOLLHUP) ||   /* the descriptor was hung up */
              (!(events[i].events & EPOLLIN)))   /* the descriptor is not readable */
{
              /* An error has occurred on this fd, or the socket is not
ready for reading (why were we notified then?) */
fprintf (stderr, "epoll error\n");
close (events[i].data.fd);
continue;
}
          else if (sfd == events[i].data.fd)   /* event on the listening socket: new connection(s) */
{
/* We have a notification on the listening socket, which
means one or more incoming connections. */
while (1)
{
                  struct sockaddr_storage in_addr;   /* large enough for both IPv4 and IPv6 */
socklen_t in_len;
int infd;
char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
in_len = sizeof in_addr;
                  infd = accept (sfd, (struct sockaddr *) &in_addr, &in_len);
if (infd == -1)
{
if ((errno == EAGAIN) ||
(errno == EWOULDBLOCK))
{
/* We have processed all incoming
connections. */
break;
}
else
{
perror ("accept");
break;
}
}
                  s = getnameinfo ((struct sockaddr *) &in_addr, in_len,
hbuf, sizeof hbuf,
sbuf, sizeof sbuf,
NI_NUMERICHOST | NI_NUMERICSERV);
if (s == 0)
{
printf("Accepted connection on descriptor %d "
"(host=%s, port=%s)\n", infd, hbuf, sbuf);
}
/* Make the incoming socket non-blocking and add it to the
list of fds to monitor. */
s = make_socket_non_blocking (infd);
if (s == -1)
abort ();
event.data.fd = infd;
event.events = EPOLLIN | EPOLLET;
s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event);
if (s == -1)
{
perror ("epoll_ctl");
abort ();
}
}
continue;
}
else
{
/* We have data on the fd waiting to be read. Read and
display it. We must read whatever data is available
completely, as we are running in edge-triggered mode
and won't get a notification again for the same
data. */
int done = 0;
while (1)
{
ssize_t count;
char buf[512];
                  count = read (events[i].data.fd, buf, sizeof buf);   /* read from the descriptor into buf */
if (count == -1)
{
/* If errno == EAGAIN, that means we have read all
data. So go back to the main loop. */
if (errno != EAGAIN)
{
perror ("read");
done = 1;
}
break;
}
else if (count == 0)
{
/* End of file. The remote has closed the
connection. */
done = 1;
break;
}
                  /* Write buf to standard output */
s = write (1, buf, count);
if (s == -1)
{
perror ("write");
abort ();
}
}
if (done)
{
printf ("Closed connection on descriptor %d\n",
events[i].data.fd);
/* Closing the descriptor will make epoll remove it
from the set of descriptors which are monitored. */
close (events[i].data.fd);
}
}
}
}
free (events);
close (sfd);
return EXIT_SUCCESS;
}
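To try the server out (file name and port are arbitrary), compile it and connect with nc; anything typed into nc appears on the server's standard output:

gcc -Wall -o epoll_server epoll_server.c
./epoll_server 8080
nc localhost 8080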
How epoll is implemented
Why select is less efficient than epoll: select polls, scanning every watched descriptor on each call, while epoll is notification-driven and reports only the descriptors that became ready.
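For comparison, a select()-based loop has to rebuild its fd_set and rescan every descriptor on each iteration, which is O(n) per call even when only one descriptor is ready. A sketch (fds and nfds are assumed inputs):

#include <sys/select.h>

/* Sketch: fds is an array of nfds watched descriptors, both assumed
   to come from the caller; error handling omitted. */
static void select_once (int *fds, int nfds)
{
  fd_set readfds;
  int maxfd = -1, j;

  FD_ZERO (&readfds);
  for (j = 0; j < nfds; j++)        /* O(n): re-register everything */
    {
      FD_SET (fds[j], &readfds);
      if (fds[j] > maxfd)
        maxfd = fds[j];
    }
  select (maxfd + 1, &readfds, NULL, NULL, NULL);
  for (j = 0; j < nfds; j++)        /* O(n): scan for ready descriptors */
    if (FD_ISSET (fds[j], &readfds))
      {
        /* handle fds[j] here */
      }
}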
epoll registers a file system inside the kernel that stores the monitored file descriptors (socket descriptors, for example).
epoll has its own high-speed cache area in the kernel, which holds every descriptor we want to monitor. The descriptors are organized there as a red-black tree, which supports fast lookup, insertion, and deletion.
Calling epoll_create() sets up this kernel storage for the descriptors to be monitored; each epoll_ctl() call then adds a new file descriptor to the kernel data structure.
The Ready List
epoll maintains a ready list. When an event occurs on a descriptor that epoll monitors, an interrupt is raised, and the kernel places the event on the ready list and signals that an event has arrived. All arrived events therefore accumulate on the ready list, waiting for the user-space process to handle them.
In Level-Triggered mode, as long as the ready list holds pending events, epoll_wait() returns immediately so that you can handle them.
In Edge-Triggered mode, epoll_wait() blocks until a new arrival notification comes in, regardless of whether earlier events still have unconsumed data.
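The difference is observable from user space. In the sketch below, one byte arrives on a socket pair and is deliberately left unread; with the default Level-Triggered mode a second epoll_wait() reports the event again, while with EPOLLET it times out (the probe helper and its structure are illustrative only):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>

/* Illustrative sketch: an unread event is re-reported in
   Level-Triggered mode but not in Edge-Triggered mode.
   Error handling is omitted for brevity. */
static void probe (uint32_t mode)
{
  int sv[2];
  socketpair (AF_UNIX, SOCK_STREAM, 0, sv);

  int efd = epoll_create1 (0);
  struct epoll_event ev = { .events = EPOLLIN | mode, .data.fd = sv[0] };
  epoll_ctl (efd, EPOLL_CTL_ADD, sv[0], &ev);

  send (sv[1], "x", 1, 0);   /* make sv[0] readable, but never read it */

  struct epoll_event out;
  int first  = epoll_wait (efd, &out, 1, 100);   /* reports the event */
  int second = epoll_wait (efd, &out, 1, 100);   /* LT: 1 again; ET: 0 (timeout) */
  printf ("mode=%s first=%d second=%d\n",
          mode ? "edge-triggered" : "level-triggered", first, second);

  close (sv[0]);
  close (sv[1]);
  close (efd);
}

int main (void)
{
  probe (0);         /* Level-Triggered (default) */
  probe (EPOLLET);   /* Edge-Triggered */
  return 0;
}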