4.3 TCP Splice

最新推荐文章于 2023-03-10 12:34:53 发布

原创

最新推荐文章于 2023-03-10 12:34:53 发布 · 2.6k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#linux内核 #tcp

　　splice是linux2.6内核中新增的零拷贝数据发送函数，主要用于将数据发送到管道或从管道中接收数据。与splice类似的零拷贝发送函数还有sendfile，不同的是sendfile不能发送socket中的数据。所谓零拷贝是指（与传统的read/write模式相比），在数据发送的过程中，不会产生用户态、内核态之间的数据拷贝。在代理模式下，使用经典的read/write方式转发socket数据的流程为：

buf = malloc(len);  //首先申请一块长度为len的内存
read(sockfd1, buf, len);  //将第一个socket sockfd1中len长度的数据读入buf
write(sockfd2, buf, len); //将buf中的数据通过sockfd2发送出去

　　这种转发数据的方式会导致数据在内核态的socket缓冲区与用户态的buf之间发生两次拷贝（读一次，写一次）。

　为了提高数据转发的效率，splice横空出世！splice系统调用的原型为：

ssize_t splice(int fd_in, loff_t *off_in, int fd_out,
                      loff_t *off_out, size_t len, unsigned int flags);

　　参数解析：

　　fd_in：要读入数据的文件描述符

　off_in：要读入数据的起始偏移

　fd_out：要写入数据的文件描述符

　off_out：要写入数据的起始偏移

　len：要写入数据的长度

　flags：标志位

　要特别强调的一点是，fd_in和fd_out中至少有一个必须是管道的描述符（两者也可以都是管道）；并且管道一端对应的偏移参数（off_in或off_out）必须为NULL，否则内核会返回-ESPIPE。

　　使用splice转发数据的流程为：

1、调用mkfifo或者pipe创建一个管道

2、splice(sockfd1, NULL, pipe_fd_w, NULL, len, 0) //将sockfd1中的数据移动到管道的写端（socket和管道都不支持偏移，off_in和off_out必须传NULL，否则返回-EINVAL或-ESPIPE）

3、splice(pipe_fd_r, NULL, sockfd2, NULL, len, 0) //通过管道的读端，将数据通过sockfd2发送出去（管道和socket都不支持偏移，off_in和off_out必须传NULL，否则返回-ESPIPE或-EINVAL）

　　splice的效率为什么会比普通的read/write方法高呢？下面我们来分析一下代码。splice系统调用对应的内核代码为：

1721 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1722         int, fd_out, loff_t __user *, off_out,
1723         size_t, len, unsigned int, flags)
1724 {
1725     struct fd in, out;   
1726     long error;
1727 
1728     if (unlikely(!len))  
1729         return 0;        
1730 
1731     error = -EBADF;      
1732     in = fdget(fd_in);
1733     if (in.file) {
1734         if (in.file->f_mode & FMODE_READ) {　//fd_in可读
1735             out = fdget(fd_out);           
1736             if (out.file) {
1737                 if (out.file->f_mode & FMODE_WRITE)　//fd_out可写
1738                     error = do_splice(in.file, off_in,
1739                               out.file, off_out,             
1740                               len, flags);
1741                 fdput(out);
1742             }
1743         }
1744         fdput(in);
1745     }
1746     return error;
1747 }

　　do_splice函数：

1324 static long do_splice(struct file *in, loff_t __user *off_in,
1325               struct file *out, loff_t __user *off_out,
1326               size_t len, unsigned int flags)
1327 {
1328     struct pipe_inode_info *ipipe;
1329     struct pipe_inode_info *opipe;
1330     loff_t offset;
1331     long ret;
1332 
1333     ipipe = get_pipe_info(in);
1334     opipe = get_pipe_info(out);
1335 
1336     if (ipipe && opipe) {　　//in和out都是管道
... 
1350         return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1351     }
1352 
1353     if (ipipe) {　　//in是管道
1354         if (off_in)
1355             return -ESPIPE;
1356         if (off_out) {
1357             if (!(out->f_mode & FMODE_PWRITE))
1358                 return -EINVAL;
1359             if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1360                 return -EFAULT;
1361         } else {
1362             offset = out->f_pos;
1363         }
1364 
1365         ret = do_splice_from(ipipe, out, &offset, len, flags);　//从管道中读数据发送到out
1366 
1367         if (!off_out)
1368             out->f_pos = offset;
1369         else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1370             ret = -EFAULT;
1371 
1372         return ret;
1373     }
1374 
1375     if (opipe) {　　//out是管道
1376         if (off_out)
1377             return -ESPIPE;
1378         if (off_in) {
1379             if (!(in->f_mode & FMODE_PREAD))
1380                 return -EINVAL;
1381             if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1382                 return -EFAULT;
1383         } else {
1384             offset = in->f_pos;
1385         }
1386 
1387         ret = do_splice_to(in, &offset, opipe, len, flags);　//从in中读取数据放入管道
1388 
1389         if (!off_in)
1390             in->f_pos = offset;
1391         else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1392             ret = -EFAULT;
1393 
1394         return ret;
1395     }
1396 
1397     return -EINVAL;
1398 }

　　代理模式下in和out中必有一个是socket，所以不会走1336行（in和out都是管道）的分支：从socket读数据放入管道时走1375行的分支，从管道读数据发送到socket时走1353行的分支。首先分析从socket读数据放入管道的流程，这个功能由do_splice_to函数完成：

1127 static long do_splice_to(struct file *in, loff_t *ppos,
1128              struct pipe_inode_info *pipe, size_t len,
1129              unsigned int flags)
1130 {
1131     ssize_t (*splice_read)(struct file *, loff_t *,
1132                    struct pipe_inode_info *, size_t, unsigned int);
1133     int ret;
1134
1135     if (unlikely(!(in->f_mode & FMODE_READ)))
1136         return -EBADF;   
1137
1138     ret = rw_verify_area(READ, in, ppos, len);
1139     if (unlikely(ret < 0))
1140         return ret;      
1141
1142     if (in->f_op && in->f_op->splice_read)//如果文件是socket，则in->f_op->splice_read指向sock_splice_read
1143         splice_read = in->f_op->splice_read;
1144     else
1145         splice_read = default_file_splice_read;
1146
1147     return splice_read(in, ppos, pipe, len, flags);
1148 }

　　 sock_splice_read函数：

 871 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 872                 struct pipe_inode_info *pipe, size_t len,
 873                 unsigned int flags)
 874 {
 875     struct socket *sock = file->private_data;
 876
 877     if (unlikely(!sock->ops->splice_read))
 878         return -EINVAL;  
 879
 880     return sock->ops->splice_read(sock, ppos, pipe, len, flags);//指向tcp_splice_read
 881 }

　　 tcp_splice_read函数：

 670 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 671             struct pipe_inode_info *pipe, size_t len,
 672             unsigned int flags)
 673 {   
 674     struct sock *sk = sock->sk;
 675     struct tcp_splice_state tss = {
 676         .pipe = pipe,
 677         .len = len,
 678         .flags = flags,
 679     };
 680     long timeo;    
 681     ssize_t spliced;
 682     int ret;
 683     
 684     sock_rps_record_flow(sk);
 685     /*
 686      * We can't seek on a socket input  
 687      */
 688     if (unlikely(*ppos))
 689         return -ESPIPE;
 690
 691     ret = spliced = 0;
 692     
 693     lock_sock(sk);
 694
 695     timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
 696     while (tss.len) {
 697         ret = __tcp_splice_read(sk, &tss);//将socket缓存中的数据读到管道中
 698         if (ret < 0)