splice是Linux 2.6内核中新增的零拷贝数据发送函数,主要用于将数据发送到管道或从管道中接收数据。与splice类似的零拷贝发送函数还有sendfile,不同的是sendfile不能发送socket中的数据。所谓零拷贝是指(与传统的read/write模式相比),在数据发送的过程中,不会产生用户态、内核态之间的数据拷贝。在代理模式下,使用经典的read/write方式转发socket数据的流程为:
buf = malloc(len); //首先申请一块长度为len的内存
read(sockfd1, buf, len); //将第一个socket sockfd1中len长度的数据读入buf
write(sockfd2, buf, len); //将buf中的数据通过sockfd2发送出去
这种转发数据的方式会导致数据在内核态的socket缓冲区与用户态的buf之间发生两次拷贝(读一次,写一次)。
为了提高数据转发的效率,于是splice横空出世!splice系统调用的原型为:
ssize_t splice(int fd_in, loff_t *off_in, int fd_out,
loff_t *off_out, size_t len, unsigned int flags);
参数解析:
fd_in:要读入数据的文件描述符
off_in:要读入数据的起始偏移
fd_out:要写入数据的文件描述符
off_out:要写入数据的起始偏移
len:要写入数据的长度
flags:标志位
要特别强调的一点是,fd_in和fd_out中必须有一个是管道的描述符
使用splice转发数据的流程为:
1、调用mkfifo或者pipe创建一个管道
2、splice(sockfd1, NULL, pipe_fd_w, NULL, len, 0) //将sockfd1中的数据移动到管道的写端(socket和管道都不支持偏移,off_in、off_out必须传NULL,否则会返回-ESPIPE或-EINVAL)
3、splice(pipe_fd_r, NULL, sockfd2, NULL, len, 0) //通过管道的读端,将数据通过sockfd2发送出去(同样,两个偏移参数都必须为NULL)
splice的效率为什么会比普通的read/write方法高呢?下面我们来分析一下代码。splice系统调用对应的内核代码为:
1721 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1722 int, fd_out, loff_t __user *, off_out,
1723 size_t, len, unsigned int, flags)
1724 {
1725 struct fd in, out;
1726 long error;
1727
1728 if (unlikely(!len))
1729 return 0;
1730
1731 error = -EBADF;
1732 in = fdget(fd_in);
1733 if (in.file) {
1734 if (in.file->f_mode & FMODE_READ) { //fd_in可读
1735 out = fdget(fd_out);
1736 if (out.file) {
1737 if (out.file->f_mode & FMODE_WRITE) //fd_out可写
1738 error = do_splice(in.file, off_in,
1739 out.file, off_out,
1740 len, flags);
1741 fdput(out);
1742 }
1743 }
1744 fdput(in);
1745 }
1746 return error;
1747 }
do_splice函数:
1324 static long do_splice(struct file *in, loff_t __user *off_in,
1325 struct file *out, loff_t __user *off_out,
1326 size_t len, unsigned int flags)
1327 {
1328 struct pipe_inode_info *ipipe;
1329 struct pipe_inode_info *opipe;
1330 loff_t offset;
1331 long ret;
1332
1333 ipipe = get_pipe_info(in);
1334 opipe = get_pipe_info(out);
1335
1336 if (ipipe && opipe) { //in和out都是管道
...
1350 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1351 }
1352
1353 if (ipipe) { //in是管道
1354 if (off_in)
1355 return -ESPIPE;
1356 if (off_out) {
1357 if (!(out->f_mode & FMODE_PWRITE))
1358 return -EINVAL;
1359 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1360 return -EFAULT;
1361 } else {
1362 offset = out->f_pos;
1363 }
1364
1365 ret = do_splice_from(ipipe, out, &offset, len, flags); //从管道中读数据发送到out
1366
1367 if (!off_out)
1368 out->f_pos = offset;
1369 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1370 ret = -EFAULT;
1371
1372 return ret;
1373 }
1374
1375 if (opipe) { //out是管道
1376 if (off_out)
1377 return -ESPIPE;
1378 if (off_in) {
1379 if (!(in->f_mode & FMODE_PREAD))
1380 return -EINVAL;
1381 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1382 return -EFAULT;
1383 } else {
1384 offset = in->f_pos;
1385 }
1386
1387 ret = do_splice_to(in, &offset, opipe, len, flags); //从in中读取数据放入管道
1388
1389 if (!off_in)
1390 in->f_pos = offset;
1391 else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1392 ret = -EFAULT;
1393
1394 return ret;
1395 }
1396
1397 return -EINVAL;
1398 }
代理模式下in和out必有一个是socket,所以不会走1336行的分支(in和out都是管道);从socket读数据到管道时走1375行的分支,从管道向socket发送数据时走1353行的分支。
首先分析从socket读数据放入管道的流程,这个功能由do_splice_to函数完成:
1127 static long do_splice_to(struct file *in, loff_t *ppos,
1128 struct pipe_inode_info *pipe, size_t len,
1129 unsigned int flags)
1130 {
1131 ssize_t (*splice_read)(struct file *, loff_t *,
1132 struct pipe_inode_info *, size_t, unsigned int);
1133 int ret;
1134
1135 if (unlikely(!(in->f_mode & FMODE_READ)))
1136 return -EBADF;
1137
1138 ret = rw_verify_area(READ, in, ppos, len);
1139 if (unlikely(ret < 0))
1140 return ret;
1141
1142 if (in->f_op && in->f_op->splice_read)//如果文件是socket,则in->f_op->splice_read指向sock_splice_read
1143 splice_read = in->f_op->splice_read;
1144 else
1145 splice_read = default_file_splice_read;
1146
1147 return splice_read(in, ppos, pipe, len, flags);
1148 }
sock_splice_read函数:
871 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
872 struct pipe_inode_info *pipe, size_t len,
873 unsigned int flags)
874 {
875 struct socket *sock = file->private_data;
876
877 if (unlikely(!sock->ops->splice_read))
878 return -EINVAL;
879
880 return sock->ops->splice_read(sock, ppos, pipe, len, flags);//指向tcp_splice_read
881 }
tcp_splice_read函数:
670 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
671 struct pipe_inode_info *pipe, size_t len,
672 unsigned int flags)
673 {
674 struct sock *sk = sock->sk;
675 struct tcp_splice_state tss = {
676 .pipe = pipe,
677 .len = len,
678 .flags = flags,
679 };
680 long timeo;
681 ssize_t spliced;
682 int ret;
683
684 sock_rps_record_flow(sk);
685 /*
686 * We can't seek on a socket input
687 */
688 if (unlikely(*ppos))
689 return -ESPIPE;
690
691 ret = spliced = 0;
692
693 lock_sock(sk);
694
695 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
696 while (tss.len) {
697 ret = __tcp_splice_read(sk, &tss);//将socket缓存中的数据读到管道中
698 if (ret < 0)