注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4
三次握手完成了,下面就该发送数据了。发送数据使用的函数有很多,比如send、sendto、sendmsg、sendmmsg,甚至还有write、writev等,那这些函数都是怎么将数据发送的呢,下面我们就来探个究竟。
send()函数
函数原型
ssize_t send(int sockfd, const void *buf, size_t len, int flags);
其中,
sockfd表示发送数据的socket文件描述符。如果是客户端,即为socket系统调用创建的文件描述符;如果是服务端,即为accept系统调用返回的文件描述符;
buf表示用户数据存储起始地址;
len表示用户数据长度;
flags用于控制send的具体行为,比如是否阻塞,是否路由等。
内核实现
我们看看内核实现,
/*
 * send(2) system call entry point.
 * Forwards to sys_sendto() with a NULL destination address and an address
 * length of 0, and returns its result unchanged.
 */
SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
unsigned int, flags)
{
return sys_sendto(fd, buff, len, flags, NULL, 0);
}
可见send最终调用的是sendto的内核函数,仅仅是做了一层封装,增加了最后两个参数。
sendto()函数
函数原型
ssize_t sendto(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
其中,前面四个参数和send()一致,
dest_addr表示数据发送的目标地址;
addrlen表示dest_addr所指向的目标地址结构的大小,例如IPv4地址为sizeof(struct sockaddr_in),IPv6地址为sizeof(struct sockaddr_in6)。
sendto常用于无连接状态下的数据发送,即UDP报文,如果用于有连接状态,即TCP报文的发送,则dest_addr和addrlen这两个参数将被忽略。可见,
send(sockfd, buf, len, flags);
等效于
sendto(sockfd, buf, len, flags, NULL, 0);
内核实现
/*
 * send(2) system call entry point, shown again next to sendto for comparison.
 * It only supplies the last two sys_sendto() arguments: a NULL address and
 * an address length of 0.
 */
SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
unsigned int, flags)
{
return sys_sendto(fd, buff, len, flags, NULL, 0);
}
/*
 * sendto(2) system call entry point.
 *
 * Describes the single user buffer with a one-element iovec, builds a
 * msghdr around it and passes it to sock_sendmsg().
 *
 * fd:            descriptor used to look up the socket
 * buff, len:     user buffer and its length; len is clamped to INT_MAX
 * flags:         send flags; MSG_DONTWAIT is OR-ed in when the socket's
 *                file has O_NONBLOCK set
 * addr, addr_len: optional destination address; only when addr is non-NULL
 *                is it copied into the kernel and attached to the msghdr
 *
 * Returns the result of sock_sendmsg(), or err as left by a failed socket
 * lookup or a failed move_addr_to_kernel().
 */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
unsigned int, flags, struct sockaddr __user *, addr,
int, addr_len)
{
struct socket *sock;
struct sockaddr_storage address;
int err;
struct msghdr msg;
struct iovec iov;
int fput_needed;
// Clamp the length so it never exceeds INT_MAX
if (len > INT_MAX)
len = INT_MAX;
// As usual, look up the struct socket from fd
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
// Describe the user buffer with an iovec; only the pointer and length are recorded here
iov.iov_base = buff;
iov.iov_len = len;
// Every send-type call ends up packaging the user message in a msghdr
msg.msg_name = NULL;
msg.msg_iov = &iov;//data area
msg.msg_iovlen = 1;//number of data blocks
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
if (addr) {
// A destination address was supplied: copy it into kernel space
err = move_addr_to_kernel(addr, addr_len, &address);
if (err < 0)
goto out_put;
msg.msg_name = (struct sockaddr *)&address;
msg.msg_namelen = addr_len;
}
if (sock->file->f_flags & O_NONBLOCK)//non-blocking call
flags |= MSG_DONTWAIT;
msg.msg_flags = flags;
// Call down into sock_sendmsg
err = sock_sendmsg(sock, &msg, len);
out_put:
// NOTE(review): presumably drops the file reference taken by sockfd_lookup_light when fput_needed is set - confirm
fput_light(sock->file, fput_needed);
out:
return err;
}
sock_sendmsg中构造IO请求,然后进入__sock_sendmsg,
/*
 * Socket-layer entry shared by the send family of system calls.
 *
 * Sets up a kiocb on the stack, attaches a sock_iocb to it through
 * iocb.private and calls __sock_sendmsg(). If that call returns
 * -EIOCBQUEUED, the result of wait_on_sync_kiocb() is returned instead.
 *
 * sock: socket to send on
 * msg:  message header describing the data to send
 * size: passed through to __sock_sendmsg() unchanged
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct kiocb iocb;
struct sock_iocb siocb;
int ret;
// Build an I/O request, represented by a kiocb
init_sync_kiocb(&iocb, NULL);
// Associate the kiocb with the sock_iocb
iocb.private = &siocb;
// Call down into __sock_sendmsg; most send-type functions end up here
ret = __sock_sendmsg(&iocb, sock, msg, size);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&iocb);
return ret;
}
到这里我们先暂停一下,对于类send函数,sock_sendmsg是socket层的一个汇聚点,我们暂且停下脚步,看看其他发送类系统调用是怎么走到这的。
sendmsg()函数
函数原型
ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
其中,
sockfd表示发送数据的socket文件描述符;
msg表示用户发送数据,封装在struct msghdr结构中;
flags用于控制sendmsg的具体行为,和上面send、sendto一样。
消息头结构
和send、sendto不同的是,sendmsg将用户的数据封装在消息头结构struct msghdr中,其结构体定义如下,
/* Message header that sendmsg() uses to describe the data to send. */
struct msghdr {
void *msg_name; /* optional address: destination address */
socklen_t msg_namelen; /* size of address: destination address length */
struct iovec *msg_iov; /* scatter/gather array: array of data blocks */
size_t msg_iovlen; /* # elements in msg_iov: number of data blocks */
void *msg_control; /* ancillary data, see below: control data */
size_t msg_controllen; /* ancillary data buffer len: control data length */
int msg_flags; /* flags on received message */
};
其中分散的数据块数组struct iovec结构体信息如下
/* One element of the scatter/gather array referenced by msghdr.msg_iov. */
struct iovec
{
void __user *iov_base; /* user data (a __user pointer) */
__kernel_size_t iov_len; /* length of the user data */
};
内核实现
/*
 * sendmsg(2) system call entry point.
 * Returns -EINVAL if the caller set MSG_CMSG_COMPAT in flags; otherwise
 * forwards to __sys_sendmsg() and returns its result.
 */
SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags)
{
if (flags & MSG_CMSG_COMPAT)
return -EINVAL;
return __sys_sendmsg(fd, msg, flags);
}
简单的封装后,进入__sys_sendmsg函数,
/*
 * Looks up the socket for fd and sends one message through
 * ___sys_sendmsg(), passing NULL for used_address.
 *
 * Returns the result of ___sys_sendmsg(), or err as left by
 * sockfd_lookup_light() when no socket is found.
 */
long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
{
int fput_needed, err;
struct msghdr msg_sys;
struct socket *sock;
// Same routine as before: get the socket that belongs to fd
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
// Yet another thin wrapper; msg_sys receives the kernel copy of the header
err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
fput_light(sock->file, fput_needed);
out:
return err;
}
内核总是喜欢封装各种各样的函数,现在又来到___sys_sendmsg函数,
/*
 * Core of sendmsg()/sendmmsg(): brings the user's message header, iovec
 * array and control data into the kernel, then sends the message.
 *
 * sock:         socket to send on
 * msg:          user-space message header
 * msg_sys:      kernel-side msghdr filled in from msg
 * flags:        send flags; MSG_DONTWAIT is added for O_NONBLOCK files
 * used_address: NULL for sendmsg; for sendmmsg it holds the destination of
 *               the previous message so a repeated destination can take the
 *               sock_sendmsg_nosec() path
 *
 * The end of the function (the out_freectl / out_freeiov / out labels that
 * the gotos below jump to) is elided in this excerpt.
 */
static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags,
struct used_address *used_address)
{
struct compat_msghdr __user *msg_compat =
(struct compat_msghdr __user *)msg;
struct sockaddr_storage address;
struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
unsigned char ctl[sizeof(struct cmsghdr) + 20]
__attribute__ ((aligned(sizeof(__kernel_size_t))));
/* 20 is size of ipv6_pktinfo */
unsigned char *ctl_buf = ctl;
int err, ctl_len, total_len;
err = -EFAULT;
// Compat path (MSG_CMSG_COMPAT set, e.g. 32-bit callers); not followed in this walkthrough
if (MSG_CMSG_COMPAT & flags) {
if (get_compat_msghdr(msg_sys, msg_compat))
return -EFAULT;
} else {
// Copy the user's message header into the kernel
err = copy_msghdr_from_user(msg_sys, msg);
if (err)
return err;
}
if (msg_sys->msg_iovlen > UIO_FASTIOV) {
err = -EMSGSIZE;
// More than UIO_MAXIOV iovec entries is rejected with -EMSGSIZE
if (msg_sys->msg_iovlen > UIO_MAXIOV)
goto out;
err = -ENOMEM;
// The on-stack array holds only UIO_FASTIOV entries: allocate an array of msg_iovlen struct iovec
iov = kmalloc(msg_sys->msg_iovlen * sizeof(struct iovec), GFP_KERNEL);
if (!iov)
goto out;
}
/* This will also move the address data into kernel space */
if (MSG_CMSG_COMPAT & flags) {
err = verify_compat_iovec(msg_sys, iov, &address, VERIFY_READ);
} else
// NOTE(review): verify_iovec presumably copies the iovec array (descriptors, not payload) into iov - confirm
// Its non-negative return value is used as total_len below
err = verify_iovec(msg_sys, iov, &address, VERIFY_READ);
if (err < 0)
goto out_freeiov;
total_len = err;
err = -ENOBUFS;
if (msg_sys->msg_controllen > INT_MAX)
goto out_freeiov;
ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
err =
cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl));
if (err)
goto out_freeiov;
ctl_buf = msg_sys->msg_control;
ctl_len = msg_sys->msg_controllen;
} else if (ctl_len) {
// The header carries control data: copy it into the kernel as well
// If it does not fit in the on-stack ctl buffer, allocate one; err is still -ENOBUFS if that fails
if (ctl_len > sizeof(ctl)) {
ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
if (ctl_buf == NULL)
goto out_freeiov;
}
err = -EFAULT;
if (copy_from_user(ctl_buf, (void __user __force *)msg_sys->msg_control, ctl_len))
goto out_freectl;
msg_sys->msg_control = ctl_buf;
}
msg_sys->msg_flags = flags;
if (sock->file->f_flags & O_NONBLOCK)//non-blocking call
msg_sys->msg_flags |= MSG_DONTWAIT;
// used_address is NULL when called from sendmsg
// In sendmmsg it is non-NULL: when this message's address and length match the saved ones, sock_sendmsg_nosec is called directly
// Per the article, it differs from sock_sendmsg only in skipping the security_socket_sendmsg check, presumably for performance
if (used_address && msg_sys->msg_name &&
used_address->name_len == msg_sys->msg_namelen &&
!memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) {
err = sock_sendmsg_nosec(sock, msg_sys, total_len);
goto out_freectl;
}
// Here we go: into sock_sendmsg
err = sock_sendmsg(sock, msg_sys, total_len);
// used_address is NULL when called from sendmsg
// In sendmmsg, after a successful send, save this destination so the next message can be compared against it
if (used_address && err >= 0) {
used_address->name_len = msg_sys->msg_namelen;
if (msg_sys->msg_name)
memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len);
}
// (rest of the function is elided in this excerpt)
...
}
和sendto一样,之后就进入sock_sendmsg流程,同样,我们再回过头看看sendmmsg()系统调用。
sendmmsg()函数
函数原型
int sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, unsigned int flags);
其中,
sockfd表示发送数据的socket文件描述符;
msgvec表示用户发送数据,可包含多个消息,指向struct mmsghdr结构数组;
vlen表示struct mmsghdr结构数组长度,即消息个数;
flags用于控制sendmmsg的具体行为,和上面send、sendto一样。
struct mmsghdr结构
/* One element of the message array passed to sendmmsg(). */
struct mmsghdr {
struct msghdr msg_hdr; /* Message header */
unsigned int msg_len; /* Number of bytes transmitted: length of data sent */
};
内核实现
/*
 * sendmmsg(2) system call entry point.
 * Returns -EINVAL if the caller set MSG_CMSG_COMPAT in flags; otherwise
 * forwards to __sys_sendmmsg() and returns its result.
 */
SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
unsigned int, vlen, unsigned int, flags)
{
if (flags & MSG_CMSG_COMPAT)
return -EINVAL;
return __sys_sendmmsg(fd, mmsg, vlen, flags);
}
和sendmsg一样的封装,往下调用__sys_sendmmsg,
/*
 * Sends up to vlen messages from the user array mmsg on the socket for fd,
 * calling ___sys_sendmsg() once per entry.
 *
 * vlen is clamped to UIO_MAXIOV. After each successful send, the return
 * value of ___sys_sendmsg() is written to that entry's msg_len. The loop
 * stops at the first failed send or failed write-back.
 *
 * Returns the number of messages sent if at least one was sent; otherwise
 * err (the socket lookup error, the send error, or the write-back error).
 */
int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
unsigned int flags)
{
int fput_needed, err, datagrams;
struct socket *sock;
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
struct used_address used_address;
if (vlen > UIO_MAXIOV)
vlen = UIO_MAXIOV;
datagrams = 0;
// Our old friend again: look up the socket from fd
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
return err;
// NOTE(review): presumably a sentinel so the name_len comparison in ___sys_sendmsg cannot match for the first message - confirm
used_address.name_len = UINT_MAX;
entry = mmsg;
compat_entry = (struct compat_mmsghdr __user *)mmsg;
err = 0;
// Sending several messages amounts to going through the sendmsg path once per message
while (datagrams < vlen) {
if (MSG_CMSG_COMPAT & flags) {//compat case, not considered here
err = ___sys_sendmsg(sock, (struct msghdr __user *)compat_entry,
&msg_sys, flags, &used_address);
if (err < 0)
break;
err = __put_user(err, &compat_entry->msg_len);
++compat_entry;
} else {
// Same path sendmsg took: ___sys_sendmsg, this time with a non-NULL used_address
err = ___sys_sendmsg(sock, (struct msghdr __user *)entry,
&msg_sys, flags, &used_address);
if (err < 0)
break;
// Write the ___sys_sendmsg return value back to this entry's msg_len
err = put_user(err, &entry->msg_len);
++entry;
}
if (err)
break;
++datagrams;
}
fput_light(sock->file, fput_needed);
/* We only return an error if no datagrams were able to be sent */
if (datagrams != 0)
return datagrams;
return err;
}
这样,send、sendto、sendmsg、sendmmsg这四个兄弟总算走到一起,在sock_sendmsg()共襄盛举。
那write家族又是怎么和他们汇合的呢,请看下回分解。