最近一直在读内核网络协议栈源码,这里以ipv4/tcp为例对socket相关系统调用的流程做一个简要整理,这些相关系统调用的内部细节虽然各有不同,但其调用流程则基本一致;
调用流程:
(1)系统调用 --> (2)查找socket --> (3)执行socket的对应操作函数 --> (4)执行传输层协议的对应操作函数;
中间核心数据结构为inetsw_array[],位于af_inet.c,以第一个元素type=SOCK_STREAM,protocol=IPPROTO_TCP为例,该类型适用于tcp协议,当创建tcp socket时,其操作socket->ops赋值为&inet_stream_ops,对应的传输控制块操作sock->sk_prot赋值为&tcp_prot;
1 /* Upon startup we insert all the elements in inetsw_array[] into 2 * the linked list inetsw. 3 */ 4 static struct inet_protosw inetsw_array[] = 5 { 6 { 7 .type = SOCK_STREAM, 8 .protocol = IPPROTO_TCP, 9 .prot = &tcp_prot, 10 .ops = &inet_stream_ops, 11 .flags = INET_PROTOSW_PERMANENT | 12 INET_PROTOSW_ICSK, 13 }, 14 15 { 16 .type = SOCK_DGRAM, 17 .protocol = IPPROTO_UDP, 18 .prot = &udp_prot, 19 .ops = &inet_dgram_ops, 20 .flags = INET_PROTOSW_PERMANENT, 21 }, 22 23 { 24 .type = SOCK_DGRAM, 25 .protocol = IPPROTO_ICMP, 26 .prot = &ping_prot, 27 .ops = &inet_sockraw_ops, 28 .flags = INET_PROTOSW_REUSE, 29 }, 30 31 { 32 .type = SOCK_RAW, 33 .protocol = IPPROTO_IP, /* wild card */ 34 .prot = &raw_prot, 35 .ops = &inet_sockraw_ops, 36 .flags = INET_PROTOSW_REUSE, 37 } 38 };
查看inet_stream_ops结构会发现,其中包含了各种socket系统调用的对应的处理函数;
1 const struct proto_ops inet_stream_ops = { 2 .family = PF_INET, 3 .owner = THIS_MODULE, 4 .release = inet_release, 5 .bind = inet_bind, 6 .connect = inet_stream_connect, 7 .socketpair = sock_no_socketpair, 8 .accept = inet_accept, 9 .getname = inet_getname, 10 .poll = tcp_poll, 11 .ioctl = inet_ioctl, 12 .listen = inet_listen, 13 .shutdown = inet_shutdown, 14 .setsockopt = sock_common_setsockopt, 15 .getsockopt = sock_common_getsockopt, 16 .sendmsg = inet_sendmsg, 17 .recvmsg = inet_recvmsg, 18 .mmap = sock_no_mmap, 19 .sendpage = inet_sendpage, 20 .splice_read = tcp_splice_read, 21 .read_sock = tcp_read_sock, 22 .peek_len = tcp_peek_len, 23 #ifdef CONFIG_COMPAT 24 .compat_setsockopt = compat_sock_common_setsockopt, 25 .compat_getsockopt = compat_sock_common_getsockopt, 26 .compat_ioctl = inet_compat_ioctl, 27 #endif 28 };
查看tcp_prot可见,其中实现了传输层tcp对应的各种socket操作;
1 struct proto tcp_prot = { 2 .name = "TCP", 3 .owner = THIS_MODULE, 4 .close = tcp_close, 5 .connect = tcp_v4_connect, 6 .disconnect = tcp_disconnect, 7 .accept = inet_csk_accept, 8 .ioctl = tcp_ioctl, 9 .init = tcp_v4_init_sock, 10 .destroy = tcp_v4_destroy_sock, 11 .shutdown = tcp_shutdown, 12 .setsockopt = tcp_setsockopt, 13 .getsockopt = tcp_getsockopt, 14 .keepalive = tcp_set_keepalive, 15 .recvmsg = tcp_recvmsg, 16 .sendmsg = tcp_sendmsg, 17 .sendpage = tcp_sendpage, 18 .backlog_rcv = tcp_v4_do_rcv, 19 .release_cb = tcp_release_cb, 20 .hash = inet_hash, 21 .unhash = inet_unhash, 22 .get_port = inet_csk_get_port, 23 .enter_memory_pressure = tcp_enter_memory_pressure, 24 .stream_memory_free = tcp_stream_memory_free, 25 .sockets_allocated = &tcp_sockets_allocated, 26 .orphan_count = &tcp_orphan_count, 27 .memory_allocated = &tcp_memory_allocated, 28 .memory_pressure = &tcp_memory_pressure, 29 .sysctl_mem = sysctl_tcp_mem, 30 .sysctl_wmem = sysctl_tcp_wmem, 31 .sysctl_rmem = sysctl_tcp_rmem, 32 .max_header = MAX_TCP_HEADER, 33 .obj_size = sizeof(struct tcp_sock), 34 .slab_flags = SLAB_TYPESAFE_BY_RCU, 35 .twsk_prot = &tcp_timewait_sock_ops, 36 .rsk_prot = &tcp_request_sock_ops, 37 .h.hashinfo = &tcp_hashinfo, 38 .no_autobind = true, 39 #ifdef CONFIG_COMPAT 40 .compat_setsockopt = compat_tcp_setsockopt, 41 .compat_getsockopt = compat_tcp_getsockopt, 42 #endif 43 .diag_destroy = tcp_abort, 44 };
具体实例,以tcp bind系统调用为例,其中红色部分为上面提到的步骤:
1 /* 2 * Bind a name to a socket. Nothing much to do here since it's 3 * the protocol's responsibility to handle the local address. 4 * 5 * We move the socket address to kernel space before we call 6 * the protocol layer (having also checked the address is ok). 7 */ 8 9 SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) 10 { 11 struct socket *sock; 12 struct sockaddr_storage address; 13 int err, fput_needed; 14 15 /* 获取socket ,fput_need标识是否需要减少文件引用计数*/ 16 sock = sockfd_lookup_light(fd, &err, &fput_needed); 17 if (sock) { 18 /* 将用户空间地址复制到内核空间 */ 19 err = move_addr_to_kernel(umyaddr, addrlen, &address); 20 if (err >= 0) { 21 /* 安全模块的bind检查 */ 22 err = security_socket_bind(sock, 23 (struct sockaddr *)&address, 24 addrlen); 25 if (!err) 26 /* 调用socket的bind操作 */ 27 err = sock->ops->bind(sock, 28 (struct sockaddr *) 29 &address, addrlen); 30 } 31 32 /* 根据fput_needed决定是否减少引用计数 */ 33 fput_light(sock->file, fput_needed); 34 } 35 return err; 36 }
上面红色的sock->ops->bind操作实际是调用了inet_stream_ops.bind
1 /* 地址绑定 */ 2 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 3 { 4 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 5 struct sock *sk = sock->sk; 6 struct inet_sock *inet = inet_sk(sk); 7 struct net *net = sock_net(sk); 8 unsigned short snum; 9 int chk_addr_ret; 10 u32 tb_id = RT_TABLE_LOCAL; 11 int err; 12 13 /* If the socket has its own bind function then use it. (RAW) */ 14 /* 15 如果传输控制块有自己的bind操作则调用, 16 目前只有raw实现了自己的bind 17 */ 18 if (sk->sk_prot->bind) { 19 err = sk->sk_prot->bind(sk, uaddr, addr_len); 20 goto out; 21 } 22 23 err = -EINVAL; 24 /* 地址长度错误 */ 25 if (addr_len < sizeof(struct sockaddr_in)) 26 goto out; 27 28 /* 如果不是AF_INET协议族 */ 29 if (addr->sin_family != AF_INET) { 30 /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) 31 * only if s_addr is INADDR_ANY. 32 */ 33 err = -EAFNOSUPPORT; 34 35 /* 接受AF_UNSPEC && s_addr=htonl(INADDR_ANY)的情况 */ 36 if (addr->sin_family != AF_UNSPEC || 37 addr->sin_addr.s_addr != htonl(INADDR_ANY)) 38 goto out; 39 } 40 41 tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; 42 chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); 43 44 /* Not specified by any standard per-se, however it breaks too 45 * many applications when removed. It is unfortunate since 46 * allowing applications to make a non-local bind solves 47 * several problems with systems using dynamic addressing. 48 * (ie. 
your servers still start up even if your ISDN link 49 * is temporarily down) 50 */ 51 err = -EADDRNOTAVAIL; 52 53 /* 合法性检查 */ 54 if (!net->ipv4.sysctl_ip_nonlocal_bind && 55 !(inet->freebind || inet->transparent) && 56 addr->sin_addr.s_addr != htonl(INADDR_ANY) && 57 chk_addr_ret != RTN_LOCAL && 58 chk_addr_ret != RTN_MULTICAST && 59 chk_addr_ret != RTN_BROADCAST) 60 goto out; 61 62 /* 源端口 */ 63 snum = ntohs(addr->sin_port); 64 err = -EACCES; 65 66 /* 绑定特权端口的权限检查 */ 67 if (snum && snum < inet_prot_sock(net) && 68 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 69 goto out; 70 71 /* We keep a pair of addresses. rcv_saddr is the one 72 * used by hash lookups, and saddr is used for transmit. 73 * 74 * In the BSD API these are the same except where it 75 * would be illegal to use them (multicast/broadcast) in 76 * which case the sending device address is used. 77 */ 78 lock_sock(sk); 79 80 /* Check these errors (active socket, double bind). */ 81 err = -EINVAL; 82 83 /* 传输控制块的状态不是CLOSE || 存在本地端口 */ 84 if (sk->sk_state != TCP_CLOSE || inet->inet_num) 85 goto out_release_sock; 86 87 /* 设置源地址rcv_addr用作hash查找,saddr用作传输 */ 88 inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; 89 90 /* 组播或者广播,使用设备地址 */ 91 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) 92 inet->inet_saddr = 0; /* Use device */ 93 94 /* Make sure we are allowed to bind here. 
*/ 95 96 /* 97 端口不为0,或者端口为0允许绑定 98 则使用协议的具体获取端口函数绑定端口 99 */ 100 if ((snum || !inet->bind_address_no_port) && 101 sk->sk_prot->get_port(sk, snum)) { 102 103 /* 绑定失败 */ 104 inet->inet_saddr = inet->inet_rcv_saddr = 0; 105 106 /* 端口在使用中 */ 107 err = -EADDRINUSE; 108 goto out_release_sock; 109 } 110 111 /* 传输控制块已经绑定本地地址或端口标志 */ 112 if (inet->inet_rcv_saddr) 113 sk->sk_userlocks |= SOCK_BINDADDR_LOCK; 114 if (snum) 115 sk->sk_userlocks |= SOCK_BINDPORT_LOCK; 116 117 /* 设置源端口 */ 118 inet->inet_sport = htons(inet->inet_num); 119 120 /* 设置目的地址和端口默认值 */ 121 inet->inet_daddr = 0; 122 inet->inet_dport = 0; 123 124 /* 设置路由默认值 */ 125 sk_dst_reset(sk); 126 err = 0; 127 out_release_sock: 128 release_sock(sk); 129 out: 130 return err; 131 }
上面的sk->sk_prot->bind以及sk->sk_prot->get_port为具体传输层实现的对应操作函数,其中只有raw socket实现了bind操作,我们不关注,而以tcp的get_port操作为例,实际上也就是调用了tcp_prot.get_port,具体tcp实现为inet_csk_get_port;(该函数尚未分析,后续补充)
1 /* Obtain a reference to a local port for the given sock, 2 * if snum is zero it means select any available local port. 3 * We try to allocate an odd port (and leave even ports for connect()) 4 */ 5 int inet_csk_get_port(struct sock *sk, unsigned short snum) 6 { 7 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; 8 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 9 int ret = 1, port = snum; 10 struct inet_bind_hashbucket *head; 11 struct net *net = sock_net(sk); 12 struct inet_bind_bucket *tb = NULL; 13 kuid_t uid = sock_i_uid(sk); 14 15 if (!port) { 16 head = inet_csk_find_open_port(sk, &tb, &port); 17 if (!head) 18 return ret; 19 if (!tb) 20 goto tb_not_found; 21 goto success; 22 } 23 head = &hinfo->bhash[inet_bhashfn(net, port, 24 hinfo->bhash_size)]; 25 spin_lock_bh(&head->lock); 26 inet_bind_bucket_for_each(tb, &head->chain) 27 if (net_eq(ib_net(tb), net) && tb->port == port) 28 goto tb_found; 29 tb_not_found: 30 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 31 net, head, port); 32 if (!tb) 33 goto fail_unlock; 34 tb_found: 35 if (!hlist_empty(&tb->owners)) { 36 if (sk->sk_reuse == SK_FORCE_REUSE) 37 goto success; 38 39 if ((tb->fastreuse > 0 && reuse) || 40 sk_reuseport_match(tb, sk)) 41 goto success; 42 if (inet_csk_bind_conflict(sk, tb, true, true)) 43 goto fail_unlock; 44 } 45 success: 46 if (!hlist_empty(&tb->owners)) { 47 tb->fastreuse = reuse; 48 if (sk->sk_reuseport) { 49 tb->fastreuseport = FASTREUSEPORT_ANY; 50 tb->fastuid = uid; 51 tb->fast_rcv_saddr = sk->sk_rcv_saddr; 52 tb->fast_ipv6_only = ipv6_only_sock(sk); 53 #if IS_ENABLED(CONFIG_IPV6) 54 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 55 #endif 56 } else { 57 tb->fastreuseport = 0; 58 } 59 } else { 60 if (!reuse) 61 tb->fastreuse = 0; 62 if (sk->sk_reuseport) { 63 /* We didn't match or we don't have fastreuseport set on 64 * the tb, but we have sk_reuseport set on this socket 65 * and we know that there are no bind conflicts with 66 * this socket in this tb, so reset 
our tb's reuseport 67 * settings so that any subsequent sockets that match 68 * our current socket will be put on the fast path. 69 * 70 * If we reset we need to set FASTREUSEPORT_STRICT so we 71 * do extra checking for all subsequent sk_reuseport 72 * socks. 73 */ 74 if (!sk_reuseport_match(tb, sk)) { 75 tb->fastreuseport = FASTREUSEPORT_STRICT; 76 tb->fastuid = uid; 77 tb->fast_rcv_saddr = sk->sk_rcv_saddr; 78 tb->fast_ipv6_only = ipv6_only_sock(sk); 79 #if IS_ENABLED(CONFIG_IPV6) 80 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 81 #endif 82 } 83 } else { 84 tb->fastreuseport = 0; 85 } 86 } 87 if (!inet_csk(sk)->icsk_bind_hash) 88 inet_bind_hash(sk, tb, port); 89 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); 90 ret = 0; 91 92 fail_unlock: 93 spin_unlock_bh(&head->lock); 94 return ret; 95 } 96 EXPORT_SYMBOL_GPL(inet_csk_get_port);