5 listen
listen系统调用在sys_socketcall中被分发到sys_listen()。
5.1 sys_listen
/*
 * sys_listen - entry point of the listen() system call.
 * @fd:      file descriptor that should refer to a socket
 * @backlog: requested queue limit; capped at the per-namespace
 *           sysctl_somaxconn value
 *
 * Returns the result of the protocol's ->listen() operation, the error
 * from the security hook, or the error stored in err by
 * sockfd_lookup_light() when no socket was found.
 */
asmlinkage long sys_listen(int fd, int backlog)
{
	int err, fput_needed;
	int limit;
	/* Resolve the descriptor to its struct socket. */
	struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);

	if (!sock)
		return err;

	/*
	 * The comparison is done on the unsigned value, so a negative
	 * backlog is treated as a very large one and gets capped as well.
	 */
	limit = sock->sk->sk_net->sysctl_somaxconn;
	if ((unsigned)backlog > limit)
		backlog = limit;

	err = security_socket_listen(sock, backlog);
	if (err == 0) {
		/* See struct proto_ops; for inet sockets this is inet_listen. */
		err = sock->ops->listen(sock, backlog);
	}

	fput_light(sock->file, fput_needed);
	return err;
}
5.2 inet_listen
由socket结构取得其对应的sock结构(sock->sk),后续的状态检查与监听设置都在sock结构上进行。
int inet_listen(struct socket *sock, int backlog) lock_sock(sk); //加锁 err = -EINVAL; //检查状态字和类型 old_state = sk->sk_state; //当状态字不是TCP_LISTEN时,调用inet_csk_listen_start out:
{
struct sock *sk = sock->sk;
unsigned char old_state;
int err;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
if (old_state != TCP_LISTEN) {
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
}
sk->sk_max_ack_backlog = backlog; //纪录最大排队数量
err = 0;
release_sock(sk);
return err;
}
5.3 inet_csk_listen_start
/*
 * inet_csk_listen_start - set up the accept queue and switch sk to LISTEN.
 * @sk:               socket being put into the listening state
 * @nr_table_entries: size hint passed on to reqsk_queue_alloc()
 *
 * Returns 0 on success, the error from reqsk_queue_alloc(), or
 * -EADDRINUSE when the protocol's get_port() fails.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	/* Both helpers only reinterpret the sk pointer. */
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int rc;

	/* Allocate and initialise the listen queue. */
	rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
	if (rc)
		return rc;

	sk->sk_max_ack_backlog = 0;
	sk->sk_ack_backlog = 0;
	/* Zero inet_csk(sk)->icsk_ack. */
	inet_csk_delack_init(sk);

	/*
	 * The state is switched to TCP_LISTEN before get_port() is called;
	 * inet_csk_get_port() compares sk_state against TCP_LISTEN.
	 */
	sk->sk_state = TCP_LISTEN;
	if (sk->sk_prot->get_port(sk, inet->num)) {
		/* Could not get the port: roll back state and queue. */
		sk->sk_state = TCP_CLOSE;
		__reqsk_queue_destroy(&icsk->icsk_accept_queue);
		return -EADDRINUSE;
	}

	inet->sport = htons(inet->num);
	sk_dst_reset(sk);
	sk->sk_prot->hash(sk);
	return 0;
}
5.3.1 inet_csk
强制类型转换
/*
 * inet_csk - view a struct sock pointer as a struct inet_connection_sock.
 *
 * This is a plain pointer cast; no checking or conversion is performed.
 */
static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
	struct inet_connection_sock *icsk = (struct inet_connection_sock *)sk;

	return icsk;
}
/*
 * Connection-oriented socket state.
 *
 * inet_csk() obtains a pointer to this structure by casting a
 * struct sock pointer, which is presumably why icsk_inet has to stay
 * the first member (NOTE(review): confirm struct inet_sock itself
 * starts with struct sock).
 *
 * Members touched by the code in this file:
 *   icsk_accept_queue - set up by reqsk_queue_alloc() and torn down by
 *                       __reqsk_queue_destroy() in inet_csk_listen_start()
 *   icsk_bind_hash    - bind bucket stored by inet_bind_hash()
 *   icsk_af_ops       - ->bind_conflict() is called by inet_csk_get_port()
 *   icsk_ack          - zeroed by inet_csk_delack_init()
 */
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
__u32 icsk_rto;
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
__u8 icsk_syn_retries;
__u8 icsk_probes_out;
__u16 icsk_ext_hdr_len;
/* Delayed-ACK bookkeeping; cleared as a whole by inet_csk_delack_init(). */
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
struct {
int enabled;
/* Range of MTUs to search */
int search_high;
int search_low;
/* Information on the current probe. */
int probe_size;
} icsk_mtup;
/* Private area of 16 u32 words; ICSK_CA_PRIV_SIZE below is its size in bytes. */
u32 icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
5.3.2 reqsk_queue_alloc
初始化queue,监听队列
/*
 * reqsk_queue_alloc - allocate and install the listen table of a queue.
 * @queue:            request sock queue to initialise
 * @nr_table_entries: requested number of table slots
 *
 * The request is limited to sysctl_max_syn_backlog, raised to at least 8,
 * then (after adding one) rounded up to a power of two.
 *
 * Returns 0 on success or -ENOMEM when the table cannot be allocated.
 */
int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{
	struct listen_sock *lopt;
	size_t alloc_bytes;

	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	nr_table_entries = max_t(u32, nr_table_entries, 8);
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);

	/* Header plus one request_sock pointer per table slot. */
	alloc_bytes = sizeof(struct listen_sock);
	alloc_bytes += nr_table_entries * sizeof(struct request_sock *);

	/* Both allocation paths hand back zeroed memory. */
	if (alloc_bytes <= PAGE_SIZE)
		lopt = kzalloc(alloc_bytes, GFP_KERNEL);
	else
		lopt = __vmalloc(alloc_bytes,
				 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
				 PAGE_KERNEL);
	if (!lopt)
		return -ENOMEM;

	/* Smallest exponent >= 3 with 2^exponent >= nr_table_entries. */
	lopt->max_qlen_log = 3;
	while ((1 << lopt->max_qlen_log) < nr_table_entries)
		lopt->max_qlen_log++;

	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
	rwlock_init(&queue->syn_wait_lock);
	/* Head of the accept queue starts out empty. */
	queue->rskq_accept_head = NULL;
	/* Remember the table capacity. */
	lopt->nr_table_entries = nr_table_entries;

	/* Publish the table under the queue's write lock. */
	write_lock_bh(&queue->syn_wait_lock);
	queue->listen_opt = lopt;
	write_unlock_bh(&queue->syn_wait_lock);

	return 0;
}
/*
 * Listen table allocated by reqsk_queue_alloc(): a fixed header followed
 * by nr_table_entries pointers to struct request_sock.
 */
struct listen_sock {
/* Set by reqsk_queue_alloc(): smallest value >= 3 with (1 << value) >= nr_table_entries. */
u8 max_qlen_log;
/* 3 bytes hole, try to use */
int qlen;
int qlen_young;
int clock_hand;
/* Filled with random bytes by reqsk_queue_alloc(). */
u32 hash_rnd;
/* Number of slots in syn_table, as computed by reqsk_queue_alloc(). */
u32 nr_table_entries;
/* Trailing array; its storage is allocated together with the header. */
struct request_sock *syn_table[0];
};
5.3.3 inet_csk_delack_init
static inline void inet_csk_delack_init(struct sock *sk)
{
memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
}
5.3.4 get_port
根据snum得到需要的端口,如果snum==0,则取任意一个可用的端口
/*
 * inet_csk_get_port - obtain a local port for sk.
 * @sk:   socket that needs the port
 * @snum: requested port, or 0 to let the function pick one from the
 *        local port range
 *
 * Returns 0 on success and 1 on failure (no free port found, a bind
 * conflict was reported, or no bind bucket could be created).
 * inet_csk_listen_start() turns a non-zero result into -EADDRINUSE.
 *
 * The whole function runs between local_bh_disable() and
 * local_bh_enable(). The spinlock of the selected hash bucket is held
 * from the lookup until the fail_unlock label.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
struct inet_bind_hashbucket *head;
struct hlist_node *node;
struct inet_bind_bucket *tb;
int ret;
struct net *net = sk->sk_net;
local_bh_disable();
if (!snum) {
/* No port requested: search the local port range for an unused one. */
int remaining, rover, low, high;
inet_get_local_port_range(&low, &high); //get the usable local port range
remaining = (high - low) + 1;
rover = net_random() % remaining + low; //start the scan at a pseudo-random port of the range
do {
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
/* A bucket for this namespace and port already exists: try the next port. */
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == rover)
goto next;
/* No bucket uses rover: leave the loop with head->lock still held. */
break;
next:
spin_unlock(&head->lock);
/* Wrap around to the bottom of the range. */
if (++rover > high)
rover = low;
} while (--remaining > 0);
ret = 1;
/* Every candidate was taken; no lock is held on this path. */
if (remaining <= 0)
goto fail;
snum = rover;
} else {
/* Explicit port: look for an existing bucket in its hash chain. */
head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == snum)
goto tb_found;
}
/* No existing bucket for the port: one is created below. */
tb = NULL;
goto tb_not_found;
tb_found:
/* The bucket already has owners: decide whether the port may be shared. */
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse > 1)
goto success;
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
goto success;
} else {
ret = 1;
/* Let the address-family specific hook check for a conflict. */
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
goto fail_unlock;
}
}
tb_not_found:
ret = 1;
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
/*
 * Maintain tb->fastreuse: it is set for an ownerless bucket when sk has
 * sk_reuse and is not listening, and cleared otherwise; on a bucket with
 * owners it is cleared when sk lacks sk_reuse or is listening.
 */
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
else
tb->fastreuse = 0;
} else if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
success:
/* Attach sk to the bucket unless it is already bound to one. */
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum); //port obtained: record it on sk
BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
ret = 0;
fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);
获取可用的端口上下限
/*
 * inet_get_local_port_range - read the local port range limits.
 * @low:  receives sysctl_local_port_range[0]
 * @high: receives sysctl_local_port_range[1]
 *
 * Both values are re-read until read_seqretry() on
 * sysctl_port_range_lock reports that no retry is needed.
 */
void inet_get_local_port_range(int *low, int *high)
{
	unsigned seq;

	for (;;) {
		seq = read_seqbegin(&sysctl_port_range_lock);

		*low = sysctl_local_port_range[0];
		*high = sysctl_local_port_range[1];

		if (!read_seqretry(&sysctl_port_range_lock, seq))
			break;
	}
}
bind哈希函数:根据本地端口号lport计算其在bhash表中的桶下标。
/*
 * inet_bhashfn - map a local port to an index of the bind hash table.
 * @lport:      local port number
 * @bhash_size: number of buckets; the result is lport masked with
 *              (bhash_size - 1)
 */
static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
{
	const int mask = bhash_size - 1;

	return lport & mask;
}
将获取的端口值,赋值给inet_sk
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
inet_sk(sk)->num = snum;
sk_add_bind_node(sk, &tb->owners);
inet_csk(sk)->icsk_bind_hash = tb;
}
/*
 * One bound local port. inet_csk_get_port() matches buckets on
 * (ib_net, port).
 */
struct inet_bind_bucket {
/* Network namespace, compared against sk->sk_net during lookup. */
struct net *ib_net;
/* Local port number, compared against the requested port during lookup. */
unsigned short port;
/* Reuse flag maintained by inet_csk_get_port(); it sets 1 or 0 and tests for > 0. */
signed short fastreuse;
/* Member name handed to hlist_for_each_entry() by inet_bind_bucket_for_each(). */
struct hlist_node node;
/* Sockets bound to this port; inet_bind_hash() adds sk here. */
struct hlist_head owners;
};
/*
 * Iterate over every struct inet_bind_bucket on the hlist @head.
 * @tb:   struct inet_bind_bucket * used as the loop cursor
 * @pos:  struct hlist_node * used as the list cursor
 * @head: hlist head of the chain to walk
 *
 * The last argument of hlist_for_each_entry() is the name of the
 * hlist_node member inside struct inet_bind_bucket ("node"). The cursor
 * parameter is called "pos" so that it does not capture that member name.
 */
#define inet_bind_bucket_for_each(tb, pos, head) \
	hlist_for_each_entry(tb, pos, head, node)
/* One slot of the bhash table indexed by inet_bhashfn(). */
struct inet_bind_hashbucket {
/* Taken by inet_csk_get_port() while it walks or modifies the chain. */
spinlock_t lock;
/* List of inet_bind_bucket entries, walked with inet_bind_bucket_for_each(). */
struct hlist_head chain;
};
本文解析了listen系统调用的内部实现过程,从sys_listen到inet_listen再到inet_csk_listen_start等关键函数,详细介绍了监听队列初始化、状态设置及端口获取流程。
8046

被折叠的 条评论
为什么被折叠?



