Socket Kernel Source Chapter05 listen

本文解析了listen系统调用的内部实现过程,从sys_listen到inet_listen再到inet_csk_listen_start等关键函数,详细介绍了监听队列初始化、状态设置及端口获取流程。

5 listen

listen()系统调用经由sys_socketcall分发后,调用sys_listen()。

5.1  sys_listen

/*
 * listen(2) entry point.
 *
 * Resolves fd to its struct socket, caps the requested backlog at the
 * sysctl_somaxconn value reached through the socket's sk_net, runs the
 * security hook and then calls the protocol's listen operation.
 *
 * Returns the error reported by the lookup, the security hook or the
 * protocol listen operation; 0 means success.
 */
asmlinkage long sys_listen(int fd, int backlog)
{
        struct socket *sock;
        int err, fput_needed;
        int limit;

        /* When no socket is returned, err is whatever the lookup stored in it. */
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /*
         * Upper bound for the pending-connection queue.  The unsigned cast
         * makes a negative backlog compare as a huge value, so it is
         * clamped to the limit as well.
         */
        limit = sock->sk->sk_net->sysctl_somaxconn;
        if ((unsigned)backlog > limit)
                backlog = limit;

        err = security_socket_listen(sock, backlog);
        if (err == 0) {
                /* Dispatches through struct proto_ops; inet_listen for the sockets covered here. */
                err = sock->ops->listen(sock, backlog);
        }

        fput_light(sock->file, fput_needed);
        return err;
}

 

 

5.2 inet_listen

 

inet_listen先由socket结构取得对应的sock结构(sock->sk),再对其进行状态检查和监听初始化。

int inet_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        unsigned char old_state;
        int err;

        lock_sock(sk);  //加锁

        err = -EINVAL;

        //检查状态字和类型
        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
                goto out;

        old_state = sk->sk_state;
        if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                goto out;

        

        //当状态字不是TCP_LISTEN时,调用inet_csk_listen_start
        if (old_state != TCP_LISTEN) {
                err = inet_csk_listen_start(sk, backlog);
                if (err)
                        goto out;
        }
        sk->sk_max_ack_backlog = backlog; //纪录最大排队数量
        err = 0;

out:
        release_sock(sk);
        return err;
}

 

 

5.3 inet_csk_listen_start

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
        struct inet_sock *inet = inet_sk(sk); //强制类型转换
        struct inet_connection_sock *icsk = inet_csk(sk); //强制类型转换
        int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); //初始化queue,监听队列

        if (rc != 0)
                return rc;

        sk->sk_max_ack_backlog = 0;
        sk->sk_ack_backlog = 0;
        inet_csk_delack_init(sk); //初始化inet_csk(sk)->icsk_ack

        

       //设置状态为LISTEN

        sk->sk_state = TCP_LISTEN;
        if (!sk->sk_prot->get_port(sk, inet->num)) {
                inet->sport = htons(inet->num);

                sk_dst_reset(sk);
                sk->sk_prot->hash(sk);

                return 0;
        }

        sk->sk_state = TCP_CLOSE;  //获取端口失败,关闭
        __reqsk_queue_destroy(&icsk->icsk_accept_queue);
        return -EADDRINUSE;
}

 

5.3.1 inet_csk

强制类型转换:把struct sock指针直接转换为struct inet_connection_sock指针。

/*
 * View a struct sock as the struct inet_connection_sock it is embedded in.
 * This is a plain pointer cast: no checking or adjustment is performed.
 */
static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
        struct inet_connection_sock *icsk = (struct inet_connection_sock *)sk;

        return icsk;
}
/*
 * Per-socket state for connection-oriented INET sockets.
 *
 * inet_csk() obtains this structure by casting a struct sock pointer, which
 * is why the embedded inet_sock must stay the first member.
 *
 * Fields whose use is visible in the listen path:
 *   icsk_accept_queue - initialised by reqsk_queue_alloc() from
 *                       inet_csk_listen_start()
 *   icsk_bind_hash    - bind bucket stored by inet_bind_hash()
 *   icsk_af_ops       - its bind_conflict hook is called by
 *                       inet_csk_get_port()
 *   icsk_ack          - zeroed as a whole by inet_csk_delack_init()
 * The remaining fields are not touched by the code shown alongside this
 * definition; their names suggest timer, retransmission, congestion-control
 * and MTU-probing state (NOTE(review): confirm against the kernel headers).
 */
struct inet_connection_sock {
        /* inet_sock has to be the first member! */
        struct inet_sock          icsk_inet;
        struct request_sock_queue icsk_accept_queue;
        struct inet_bind_bucket   *icsk_bind_hash;
        unsigned long             icsk_timeout;
        struct timer_list         icsk_retransmit_timer;
        struct timer_list         icsk_delack_timer;
        __u32                     icsk_rto;
        __u32                     icsk_pmtu_cookie;
        const struct tcp_congestion_ops *icsk_ca_ops;
        const struct inet_connection_sock_af_ops *icsk_af_ops;
        unsigned int              (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
        __u8                      icsk_ca_state;
        __u8                      icsk_retransmits;
        __u8                      icsk_pending;
        __u8                      icsk_backoff;
        __u8                      icsk_syn_retries;
        __u8                      icsk_probes_out;
        __u16                     icsk_ext_hdr_len;
        /* Delayed-ACK bookkeeping; cleared with memset by inet_csk_delack_init(). */
        struct {
                __u8              pending;       /* ACK is pending                         */
                __u8              quick;         /* Scheduled number of quick acks         */
                __u8              pingpong;      /* The session is interactive             */
                __u8              blocked;       /* Delayed ACK was blocked by socket lock */
                __u32             ato;           /* Predicted tick of soft clock           */
                unsigned long     timeout;       /* Currently scheduled timeout            */
                __u32             lrcvtime;      /* timestamp of last received data packet */
                __u16             last_seg_size; /* Size of last incoming segment          */
                __u16             rcv_mss;       /* MSS used for delayed ACK decisions     */
        } icsk_ack;
        struct {
                int               enabled;

                /* Range of MTUs to search */
                int               search_high;
                int               search_low;

                /* Information on the current probe. */
                int               probe_size;
        } icsk_mtup;
        /* Scratch area of ICSK_CA_PRIV_SIZE bytes (16 u32 words). */
        u32                       icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE       (16 * sizeof(u32))
};

 

5.3.2 reqsk_queue_alloc

初始化queue,监听队列

int reqsk_queue_alloc(struct request_sock_queue *queue,
                      unsigned int nr_table_entries)
{
        size_t lopt_size = sizeof(struct listen_sock);
        struct listen_sock *lopt;

        nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
        nr_table_entries = max_t(u32, nr_table_entries, 8);
        nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
        lopt_size += nr_table_entries * sizeof(struct request_sock *);
        if (lopt_size > PAGE_SIZE)
                lopt = __vmalloc(lopt_size,
                        GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                        PAGE_KERNEL);
        else
                lopt = kzalloc(lopt_size, GFP_KERNEL);
        if (lopt == NULL)
                return -ENOMEM;

        for (lopt->max_qlen_log = 3;
             (1 << lopt->max_qlen_log) < nr_table_entries;
             lopt->max_qlen_log++);

        get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
        rwlock_init(&queue->syn_wait_lock);
        queue->rskq_accept_head = NULL; //监听队列头
        lopt->nr_table_entries = nr_table_entries; //最大个数

        write_lock_bh(&queue->syn_wait_lock);
        queue->listen_opt = lopt;                 
        write_unlock_bh(&queue->syn_wait_lock);

        return 0;
}
 

/*
 * Listen-side table allocated by reqsk_queue_alloc() and attached to a
 * request_sock_queue through its listen_opt pointer.
 */
struct listen_sock {
        /* Set by reqsk_queue_alloc(): smallest value >= 3 with (1 << value) >= nr_table_entries. */
        u8                      max_qlen_log;
        /* 3 bytes hole, try to use */
        /* qlen, qlen_young and clock_hand start at zero (zeroed allocation); not otherwise used in the code shown here. */
        int                     qlen;
        int                     qlen_young;
        int                     clock_hand;
        /* Filled with random bytes by reqsk_queue_alloc(). */
        u32                     hash_rnd;
        /* Number of syn_table slots that were allocated. */
        u32                     nr_table_entries;
        /* Trailing array; reqsk_queue_alloc() allocates nr_table_entries pointers after the header. */
        struct request_sock     *syn_table[0];
};

5.3.3 inet_csk_delack_init

static inline void inet_csk_delack_init(struct sock *sk)
{
        memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
}

 

5.3.4 get_port

根据snum得到需要的端口,如果snum==0,则取任意一个可用的端口

/*
 * Obtain a local port for sk and bind sk to the matching bind bucket.
 *
 * @sk:   socket that needs the port
 * @snum: requested port; 0 means "pick any port from the local port range
 *        that has no bind bucket in this socket's net"
 *
 * Returns 0 on success and 1 on failure (range exhausted, bind conflict,
 * or bucket allocation failure).
 *
 * Runs with bottom halves disabled.  From the point a candidate chain is
 * selected until fail_unlock, head->lock of that chain is held.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
        struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
        struct inet_bind_hashbucket *head;
        struct hlist_node *node;
        struct inet_bind_bucket *tb;
        int ret;
        struct net *net = sk->sk_net;

        local_bh_disable();
        if (!snum) {
                int remaining, rover, low, high;

                inet_get_local_port_range(&low, &high);     // fetch the allowed local port range
                remaining = (high - low) + 1;
                rover = net_random() % remaining + low;     // random starting point inside the range

                /*
                 * Probe at most one port per value in the range, wrapping from
                 * high back to low.  A port is taken as free when its chain
                 * holds no bucket for this net and port.
                 */
                do {
                        head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
                        spin_lock(&head->lock);
                        inet_bind_bucket_for_each(tb, node, &head->chain)
                                if (tb->ib_net == net && tb->port == rover)
                                        goto next;
                        /* No bucket for rover: leave the loop with head->lock still held. */
                        break;
                next:
                        spin_unlock(&head->lock);
                        if (++rover > high)
                                rover = low;
                } while (--remaining > 0);

                /*
                 * remaining <= 0 can only be reached through the "next" path,
                 * which has already dropped the lock, so jump past the unlock.
                 */
                ret = 1;
                if (remaining <= 0)
                        goto fail;

                snum = rover;
        } else {
                /* Explicit port: look for an existing bucket; head->lock stays held. */
                head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
                spin_lock(&head->lock);
                inet_bind_bucket_for_each(tb, node, &head->chain)
                        if (tb->ib_net == net && tb->port == snum)
                                goto tb_found;
        }
        /* No bucket exists for the chosen port. */
        tb = NULL;
        goto tb_not_found;
tb_found:
        if (!hlist_empty(&tb->owners)) {
                /* The bucket already has owners: decide whether sharing is allowed. */
                if (sk->sk_reuse > 1)
                        goto success;
                if (tb->fastreuse > 0 &&
                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
                        goto success;
                } else {
                        /* Fall back to the address-family specific conflict check. */
                        ret = 1;
                        if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
                                goto fail_unlock;
                }
        }
tb_not_found:
        ret = 1;
        /* Create the bucket when none was found; failure to create is an error. */
        if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
                                        net, head, snum)) == NULL)
                goto fail_unlock;
        /*
         * Maintain fastreuse: set for an empty bucket only when sk has
         * sk_reuse and is not listening; cleared on a populated bucket as
         * soon as a socket without sk_reuse, or a listening one, joins.
         */
        if (hlist_empty(&tb->owners)) {
                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
                        tb->fastreuse = 1;
                else
                        tb->fastreuse = 0;
        } else if (tb->fastreuse &&
                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
                tb->fastreuse = 0;
success:
        /* Attach sk to the bucket unless it is already bound to one. */
        if (!inet_csk(sk)->icsk_bind_hash)
                inet_bind_hash(sk, tb, snum); // port obtained: record it on sk
        BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
        ret = 0;

fail_unlock:
        spin_unlock(&head->lock);
fail:
        local_bh_enable();
        return ret;
}

EXPORT_SYMBOL_GPL(inet_csk_get_port);
 

获取可用的端口上下限 

/*
 * Read the configured local port range into *low and *high.
 *
 * The two values are read under sysctl_port_range_lock's sequence counter
 * and re-read until read_seqretry() reports that no retry is needed.
 */
void inet_get_local_port_range(int *low, int *high)
{
        unsigned int snapshot;

        for (;;) {
                snapshot = read_seqbegin(&sysctl_port_range_lock);

                *low = sysctl_local_port_range[0];
                *high = sysctl_local_port_range[1];

                if (!read_seqretry(&sysctl_port_range_lock, snapshot))
                        break;
        }
}

hash函数:用端口号与(bhash_size - 1)按位与,得到该端口在bind哈希表中的桶下标

/*
 * Map a local port to an index into the bind hash table by masking it
 * with (bhash_size - 1).
 * NOTE(review): the mask only yields a uniform index if bhash_size is a
 * power of two - confirm where the table is sized.
 */
static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
{
        const int mask = bhash_size - 1;

        return lport & mask;
}

将获取的端口值,赋值给inet_sk

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
                    const unsigned short snum)
{
        inet_sk(sk)->num = snum;
        sk_add_bind_node(sk, &tb->owners);
        inet_csk(sk)->icsk_bind_hash = tb;
}

 

/*
 * One entry of the bind hash table: a (net, port) pair together with the
 * sockets bound to it.
 */
struct inet_bind_bucket {
        /* Compared against sk->sk_net when inet_csk_get_port() searches a chain. */
        struct net              *ib_net;
        /* Local port this bucket represents. */
        unsigned short          port;
        /* Set to 1 or 0 by inet_csk_get_port() from the binder's sk_reuse and listen state. */
        signed short            fastreuse;
        /* Linkage used by inet_bind_bucket_for_each() to walk a chain. */
        struct hlist_node       node;
        /* Sockets bound to this bucket; inet_bind_hash() adds to it. */
        struct hlist_head       owners;
};

/*
 * Iterate over every struct inet_bind_bucket on a bind-hash chain.
 *
 * @tb:   cursor, a struct inet_bind_bucket * set to each bucket in turn
 * @pos:  scratch struct hlist_node * used by hlist_for_each_entry()
 * @head: struct hlist_head * of the chain to walk
 *
 * The last argument passed to hlist_for_each_entry() is the name of the
 * hlist_node member inside struct inet_bind_bucket ("node").  The macro
 * parameter is deliberately not called "node", so that member name is
 * never replaced by whatever the caller passes as the scratch pointer.
 * The definition is continued onto the second line with a backslash.
 */
#define inet_bind_bucket_for_each(tb, pos, head) \
        hlist_for_each_entry(tb, pos, head, node)

/* One slot of the bind hash table (hashinfo->bhash), indexed by inet_bhashfn(). */
struct inet_bind_hashbucket {
        /* Taken with spin_lock() by inet_csk_get_port() while it walks or modifies the chain. */
        spinlock_t              lock;
        /* List of struct inet_bind_bucket entries that hash to this slot. */
        struct hlist_head       chain;
};

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值