socket相关系统调用的调用流程

 

最近一直在读内核网络协议栈源码,这里以ipv4/tcp为例对socket相关系统调用的流程做一个简要整理,这些相关系统调用的内部细节虽然各有不同,但其调用流程则基本一致;

 

调用流程:

(1)系统调用 --> (2)查找socket --> (3)执行socket的对应操作函数  --> (4)执行传输层协议的对应操作函数;

 

中间核心数据结构为inetsw_array[],位于af_inet.c,以第一个元素type=SOCK_STREAM,protocol=IPPROTO_TCP为例,该类型适用于tcp协议,当创建tcp socket时,其操作socket->ops赋值为&inet_stream_ops,对应的传输控制块操作sock->sk_prot赋值为&tcp_prot;

 1 /* Upon startup we insert all the elements in inetsw_array[] into
 2  * the linked list inetsw.
 3  */
 4 static struct inet_protosw inetsw_array[] =
 5 {
 6     {	/* stream sockets over TCP: tcp_prot + inet_stream_ops */
 7         .type =       SOCK_STREAM,
 8         .protocol =   IPPROTO_TCP,
 9         .prot =       &tcp_prot,
10         .ops =        &inet_stream_ops,
11         .flags =      INET_PROTOSW_PERMANENT |
12                   INET_PROTOSW_ICSK,
13     },
14 
15     {	/* datagram sockets over UDP: udp_prot + inet_dgram_ops */
16         .type =       SOCK_DGRAM,
17         .protocol =   IPPROTO_UDP,
18         .prot =       &udp_prot,
19         .ops =        &inet_dgram_ops,
20         .flags =      INET_PROTOSW_PERMANENT,
21        },
22 
23        {	/* datagram sockets over ICMP: ping_prot + inet_sockraw_ops */
24         .type =       SOCK_DGRAM,
25         .protocol =   IPPROTO_ICMP,
26         .prot =       &ping_prot,
27         .ops =        &inet_sockraw_ops,
28         .flags =      INET_PROTOSW_REUSE,
29        },
30 
31        {	/* raw sockets: raw_prot + inet_sockraw_ops */
32            .type =       SOCK_RAW,
33            .protocol =   IPPROTO_IP,    /* wild card */
34            .prot =       &raw_prot,
35            .ops =        &inet_sockraw_ops,
36            .flags =      INET_PROTOSW_REUSE,
37        }
38 };

 

查看inet_stream_ops结构会发现,其中包含了各种socket系统调用对应的处理函数;

 1 const struct proto_ops inet_stream_ops = {	/* socket-layer ops; inetsw_array[] uses this as .ops for SOCK_STREAM/IPPROTO_TCP */
 2     .family           = PF_INET,
 3     .owner           = THIS_MODULE,
 4     .release       = inet_release,
 5     .bind           = inet_bind,	/* reached from the bind syscall through sock->ops->bind */
 6     .connect       = inet_stream_connect,
 7     .socketpair       = sock_no_socketpair,	/* NOTE(review): presumably socketpair() is unsupported here - confirm */
 8     .accept           = inet_accept,
 9     .getname       = inet_getname,
10     .poll           = tcp_poll,
11     .ioctl           = inet_ioctl,
12     .listen           = inet_listen,
13     .shutdown       = inet_shutdown,
14     .setsockopt       = sock_common_setsockopt,
15     .getsockopt       = sock_common_getsockopt,
16     .sendmsg       = inet_sendmsg,
17     .recvmsg       = inet_recvmsg,
18     .mmap           = sock_no_mmap,	/* NOTE(review): presumably mmap() is unsupported here - confirm */
19     .sendpage       = inet_sendpage,
20     .splice_read       = tcp_splice_read,
21     .read_sock       = tcp_read_sock,
22     .peek_len       = tcp_peek_len,
23 #ifdef CONFIG_COMPAT	/* compat handlers are only built when CONFIG_COMPAT is defined */
24     .compat_setsockopt = compat_sock_common_setsockopt,
25     .compat_getsockopt = compat_sock_common_getsockopt,
26     .compat_ioctl       = inet_compat_ioctl,
27 #endif
28 };

 

查看tcp_prot可见,其中实现了传输层tcp对应的各种socket操作;

 1 struct proto tcp_prot = {	/* transport-layer ops for TCP; referenced as .prot of the SOCK_STREAM entry in inetsw_array[] */
 2     .name            = "TCP",
 3     .owner            = THIS_MODULE,
 4     .close            = tcp_close,
 5     .connect        = tcp_v4_connect,
 6     .disconnect        = tcp_disconnect,
 7     .accept            = inet_csk_accept,
 8     .ioctl            = tcp_ioctl,
 9     .init            = tcp_v4_init_sock,
10     .destroy        = tcp_v4_destroy_sock,
11     .shutdown        = tcp_shutdown,
12     .setsockopt        = tcp_setsockopt,
13     .getsockopt        = tcp_getsockopt,
14     .keepalive        = tcp_set_keepalive,
15     .recvmsg        = tcp_recvmsg,
16     .sendmsg        = tcp_sendmsg,
17     .sendpage        = tcp_sendpage,
18     .backlog_rcv        = tcp_v4_do_rcv,
19     .release_cb        = tcp_release_cb,
20     .hash            = inet_hash,
21     .unhash            = inet_unhash,
22     .get_port        = inet_csk_get_port,	/* invoked by inet_bind() as sk->sk_prot->get_port(sk, snum) */
23     .enter_memory_pressure    = tcp_enter_memory_pressure,
24     .stream_memory_free    = tcp_stream_memory_free,
25     .sockets_allocated    = &tcp_sockets_allocated,
26     .orphan_count        = &tcp_orphan_count,
27     .memory_allocated    = &tcp_memory_allocated,
28     .memory_pressure    = &tcp_memory_pressure,
29     .sysctl_mem        = sysctl_tcp_mem,
30     .sysctl_wmem        = sysctl_tcp_wmem,
31     .sysctl_rmem        = sysctl_tcp_rmem,
32     .max_header        = MAX_TCP_HEADER,
33     .obj_size        = sizeof(struct tcp_sock),
34     .slab_flags        = SLAB_TYPESAFE_BY_RCU,
35     .twsk_prot        = &tcp_timewait_sock_ops,
36     .rsk_prot        = &tcp_request_sock_ops,
37     .h.hashinfo        = &tcp_hashinfo,	/* read back in inet_csk_get_port() via sk->sk_prot->h.hashinfo */
38     .no_autobind        = true,
39 #ifdef CONFIG_COMPAT	/* compat handlers are only built when CONFIG_COMPAT is defined */
40     .compat_setsockopt    = compat_tcp_setsockopt,
41     .compat_getsockopt    = compat_tcp_getsockopt,
42 #endif
43     .diag_destroy        = tcp_abort,
44 };	/* no .bind is set, so the sk->sk_prot->bind test in inet_bind() is false for TCP */

 

 

具体实例,以tcp bind系统调用为例,其中红色部分为上面提到的步骤:

 1 /*
 2  *    Bind a name to a socket. Nothing much to do here since it's
 3  *    the protocol's responsibility to handle the local address.
 4  *
 5  *    We move the socket address to kernel space before we call
 6  *    the protocol layer (having also checked the address is ok).
 7  */
 8 
 9 SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
10 {
11     struct socket *sock;
12     struct sockaddr_storage address;
13     int err, fput_needed;
14 
15     /* Look up the socket for fd; fput_needed records whether the file reference count must be dropped afterwards */
16     sock = sockfd_lookup_light(fd, &err, &fput_needed);
17     if (sock) {
18         /* Copy the user-space address into kernel space */
19         err = move_addr_to_kernel(umyaddr, addrlen, &address);
20         if (err >= 0) {
21             /* bind check performed by the security module */
22             err = security_socket_bind(sock,
23                            (struct sockaddr *)&address,
24                            addrlen);
25             if (!err)
26                 /* Call the socket's bind operation (sock->ops->bind) */
27                 err = sock->ops->bind(sock,
28                               (struct sockaddr *)
29                               &address, addrlen);
30         }
31 
32         /* Drop the file reference count if fput_needed says so */
33         fput_light(sock->file, fput_needed);
34     }
35     return err;
36 }

 

上面的sock->ops->bind操作实际是调用了inet_stream_ops.bind,即inet_bind:

  1 /* Bind a local address/port to the socket; the paths visible here return 0 on success or a negative errno */
  2 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  3 {
  4     struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
  5     struct sock *sk = sock->sk;
  6     struct inet_sock *inet = inet_sk(sk);
  7     struct net *net = sock_net(sk);
  8     unsigned short snum;
  9     int chk_addr_ret;
 10     u32 tb_id = RT_TABLE_LOCAL;
 11     int err;
 12 
 13     /* If the socket has its own bind function then use it. (RAW) */
 14     /* 
 15         If the transport control block (sk->sk_prot) provides its own
 16         bind operation, call it; currently only raw implements one
 17     */
 18     if (sk->sk_prot->bind) {
 19         err = sk->sk_prot->bind(sk, uaddr, addr_len);
 20         goto out;
 21     }
 22     
 23     err = -EINVAL;
 24     /* Address length is too short for a sockaddr_in */
 25     if (addr_len < sizeof(struct sockaddr_in))
 26         goto out;
 27 
 28     /* Address family is not AF_INET */
 29     if (addr->sin_family != AF_INET) {
 30         /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
 31          * only if s_addr is INADDR_ANY.
 32          */
 33         err = -EAFNOSUPPORT;
 34 
 35         /* Only AF_UNSPEC together with s_addr == htonl(INADDR_ANY) is accepted */
 36         if (addr->sin_family != AF_UNSPEC ||
 37             addr->sin_addr.s_addr != htonl(INADDR_ANY))
 38             goto out;
 39     }
 40 
 41     tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
 42     chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
 43 
 44     /* Not specified by any standard per-se, however it breaks too
 45      * many applications when removed.  It is unfortunate since
 46      * allowing applications to make a non-local bind solves
 47      * several problems with systems using dynamic addressing.
 48      * (ie. your servers still start up even if your ISDN link
 49      *  is temporarily down)
 50      */
 51     err = -EADDRNOTAVAIL;
 52 
 53     /* Validity check: unless non-local bind is allowed (sysctl, freebind or transparent), the address must be INADDR_ANY, local, multicast or broadcast */
 54     if (!net->ipv4.sysctl_ip_nonlocal_bind &&
 55         !(inet->freebind || inet->transparent) &&
 56         addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
 57         chk_addr_ret != RTN_LOCAL &&
 58         chk_addr_ret != RTN_MULTICAST &&
 59         chk_addr_ret != RTN_BROADCAST)
 60         goto out;
 61 
 62     /* Source port, converted to host byte order */
 63     snum = ntohs(addr->sin_port);
 64     err = -EACCES;
 65 
 66     /* Permission check for binding a privileged port */
 67     if (snum && snum < inet_prot_sock(net) &&
 68         !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 69         goto out;
 70 
 71     /*      We keep a pair of addresses. rcv_saddr is the one
 72      *      used by hash lookups, and saddr is used for transmit.
 73      *
 74      *      In the BSD API these are the same except where it
 75      *      would be illegal to use them (multicast/broadcast) in
 76      *      which case the sending device address is used.
 77      */
 78     lock_sock(sk);
 79 
 80     /* Check these errors (active socket, double bind). */
 81     err = -EINVAL;
 82 
 83     /* The socket state is not TCP_CLOSE, or a local port is already set */
 84     if (sk->sk_state != TCP_CLOSE || inet->inet_num)
 85         goto out_release_sock;
 86 
 87     /* Set the source addresses: inet_rcv_saddr is used for hash lookups, inet_saddr for transmit */
 88     inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 89 
 90     /* Multicast or broadcast: use the device address */
 91     if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
 92         inet->inet_saddr = 0;  /* Use device */
 93 
 94     /* Make sure we are allowed to bind here. */
 95 
 96     /* 
 97         If the port is non-zero, or it is zero and bind_address_no_port is not set,
 98         bind the port through the protocol-specific get_port function
 99     */
100     if ((snum || !inet->bind_address_no_port) &&
101         sk->sk_prot->get_port(sk, snum)) {
102 
103         /* Binding failed: clear the addresses set above */
104         inet->inet_saddr = inet->inet_rcv_saddr = 0;
105 
106         /* The port is in use */
107         err = -EADDRINUSE;
108         goto out_release_sock;
109     }
110 
111     /* Flag on the socket that a local address and/or port has been bound */
112     if (inet->inet_rcv_saddr)
113         sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
114     if (snum)
115         sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
116 
117     /* Set the source port (network byte order) from inet_num */
118     inet->inet_sport = htons(inet->inet_num);
119 
120     /* Reset the destination address and port to their defaults */
121     inet->inet_daddr = 0;
122     inet->inet_dport = 0;
123 
124     /* Reset the socket's route to its default */
125     sk_dst_reset(sk);
126     err = 0;
127 out_release_sock:
128     release_sock(sk);
129 out:
130     return err;
131 }

 

上面的sk->sk_prot->bind以及sk->sk_prot->get_port为具体传输层实现的对应操作函数,其中只有raw socket实现了bind操作,我们不关注,而以tcp的get_port操作为例,实际上也就是调用了tcp_prot.get_port,具体tcp实现为inet_csk_get_port;(该函数尚未分析,后续补充)

 1 /* Obtain a reference to a local port for the given sock,
 2  * if snum is zero it means select any available local port.
 3  * We try to allocate an odd port (and leave even ports for connect())
 4  */
 5 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 6 {
 7     bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 8     struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 9     int ret = 1, port = snum;	/* ret: 1 = failure (default), 0 = success */
10     struct inet_bind_hashbucket *head;
11     struct net *net = sock_net(sk);
12     struct inet_bind_bucket *tb = NULL;
13     kuid_t uid = sock_i_uid(sk);
14 
15     if (!port) {	/* no port requested: let inet_csk_find_open_port() choose one */
16         head = inet_csk_find_open_port(sk, &tb, &port);	/* NOTE(review): presumably returns with head->lock held, since the exit path unlocks it - confirm */
17         if (!head)
18             return ret;
19         if (!tb)
20             goto tb_not_found;
21         goto success;
22     }
23     head = &hinfo->bhash[inet_bhashfn(net, port,
24                       hinfo->bhash_size)];
25     spin_lock_bh(&head->lock);
26     inet_bind_bucket_for_each(tb, &head->chain)	/* look for an existing bucket for this netns + port */
27         if (net_eq(ib_net(tb), net) && tb->port == port)
28             goto tb_found;
29 tb_not_found:
30     tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
31                      net, head, port);
32     if (!tb)
33         goto fail_unlock;
34 tb_found:
35     if (!hlist_empty(&tb->owners)) {	/* port already has owners: apply the reuse rules */
36         if (sk->sk_reuse == SK_FORCE_REUSE)
37             goto success;
38 
39         if ((tb->fastreuse > 0 && reuse) ||
40             sk_reuseport_match(tb, sk))
41             goto success;
42         if (inet_csk_bind_conflict(sk, tb, true, true))
43             goto fail_unlock;
44     }
45 success:
46     if (hlist_empty(&tb->owners)) {	/* bucket has no owners yet: initialise its fast-path reuse state from this socket (the test was inverted before) */
47         tb->fastreuse = reuse;
48         if (sk->sk_reuseport) {
49             tb->fastreuseport = FASTREUSEPORT_ANY;
50             tb->fastuid = uid;
51             tb->fast_rcv_saddr = sk->sk_rcv_saddr;
52             tb->fast_ipv6_only = ipv6_only_sock(sk);
53 #if IS_ENABLED(CONFIG_IPV6)
54             tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
55 #endif
56         } else {
57             tb->fastreuseport = 0;
58         }
59     } else {	/* bucket already has owners: fastreuse may only be cleared, never raised */
60         if (!reuse)
61             tb->fastreuse = 0;
62         if (sk->sk_reuseport) {
63             /* We didn't match or we don't have fastreuseport set on
64              * the tb, but we have sk_reuseport set on this socket
65              * and we know that there are no bind conflicts with
66              * this socket in this tb, so reset our tb's reuseport
67              * settings so that any subsequent sockets that match
68              * our current socket will be put on the fast path.
69              *
70              * If we reset we need to set FASTREUSEPORT_STRICT so we
71              * do extra checking for all subsequent sk_reuseport
72              * socks.
73              */
74             if (!sk_reuseport_match(tb, sk)) {
75                 tb->fastreuseport = FASTREUSEPORT_STRICT;
76                 tb->fastuid = uid;
77                 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
78                 tb->fast_ipv6_only = ipv6_only_sock(sk);
79 #if IS_ENABLED(CONFIG_IPV6)
80                 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
81 #endif
82             }
83         } else {
84             tb->fastreuseport = 0;
85         }
86     }
87     if (!inet_csk(sk)->icsk_bind_hash)	/* attach the socket to the bucket unless it is already bound to one */
88         inet_bind_hash(sk, tb, port);
89     WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
90     ret = 0;
91 
92 fail_unlock:
93     spin_unlock_bh(&head->lock);
94     return ret;
95 }
96 EXPORT_SYMBOL_GPL(inet_csk_get_port);

 

转载于:https://www.cnblogs.com/wanpengcoder/p/7623101.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值