Reposted from: http://www.cppblog.com/qinqing1984/archive/2015/05/03/210522.html
Socket Operations
The read(v)/write(v) system calls are one way for user space to read from and write to a socket. To see how they are forwarded through the VFS to a particular protocol's implementation, let's trace the execution of read (write works the same way), assuming the file descriptor refers to an IPv4 TCP socket. We start with sys_read, defined in fs/read_write.c:
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    struct file *file;
    ssize_t ret = -EBADF;
    int fput_needed;

    file = fget_light(fd, &fput_needed);
    if (file) {
        loff_t pos = file_pos_read(file);
        ret = vfs_read(file, buf, count, &pos);
        file_pos_write(file, pos);
        fput_light(file, fput_needed);
    }

    return ret;
}
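To see this path end to end from user space, here is a minimal, self-contained sketch (illustrative only, error handling omitted) that builds a loopback IPv4 TCP pair and then reads from it with plain read(), the very entry point traced above:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int srv = socket(AF_INET, SOCK_STREAM, 0);        /* IPv4 TCP */
    struct sockaddr_in addr = { .sin_family = AF_INET };
    socklen_t alen = sizeof(addr);

    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = 0;                                /* let the kernel pick a free port */
    bind(srv, (struct sockaddr *)&addr, sizeof(addr));
    listen(srv, 1);
    getsockname(srv, (struct sockaddr *)&addr, &alen); /* learn the chosen port */

    int cli = socket(AF_INET, SOCK_STREAM, 0);
    connect(cli, (struct sockaddr *)&addr, sizeof(addr));
    int conn = accept(srv, NULL, NULL);

    write(conn, "hello", 5);                          /* sys_write -> ... -> tcp_sendmsg */

    char buf[16];
    ssize_t n = read(cli, buf, sizeof(buf));          /* sys_read -> ... -> tcp_recvmsg */
    printf("read %zd bytes: %.*s\n", n, (int)n, buf);

    close(cli);
    close(conn);
    close(srv);
    return 0;
}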

sys_read looks up the struct file for the descriptor and hands the request to vfs_read, also in fs/read_write.c:
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;
    /* ... */
    ret = rw_verify_area(READ, file, pos, count);
    if (ret >= 0) {
        count = ret;
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);
        else
            ret = do_sync_read(file, buf, count, pos);
        /* ... */
    }

    return ret;
}

For a socket, the file's f_op is socket_file_ops (net/socket.c), which provides no synchronous read method, only aio_read = sock_aio_read; vfs_read therefore takes the do_sync_read branch, which drives the asynchronous entry point synchronously:
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;

    for (;;) {
        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
        if (ret != -EIOCBRETRY)
            break;
        wait_on_retry_sync_kiocb(&kiocb);
    }

    if (-EIOCBQUEUED == ret)
        ret = wait_on_sync_kiocb(&kiocb);
    *ppos = kiocb.ki_pos;
    return ret;
}

Tracing sock_aio_read through the socket layer shows that it ends up calling sock->ops->recvmsg. Because this socket is of IPv4 TCP type, sock->ops was set to inet_stream_ops during socket creation; it is defined in net/ipv4/af_inet.c:
const struct proto_ops inet_stream_ops = {
    .family  = PF_INET,
    /* ... */
    .release = inet_release,
    /* ... */
    .recvmsg = sock_common_recvmsg,
    /* ... */
};

The .recvmsg member points to sock_common_recvmsg, defined in net/core/sock.c:
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
                        struct msghdr *msg, size_t size, int flags)
{
    struct sock *sk = sock->sk;
    int addr_len = 0;
    int err;

    err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
                               flags & ~MSG_DONTWAIT, &addr_len);
    if (err >= 0)
        msg->msg_namelen = addr_len;
    return err;
}
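Note how the MSG_DONTWAIT flag is split off and passed as the nonblocking argument to sk->sk_prot->recvmsg. From user space that looks like this (a small sketch; fd is assumed to be a connected TCP socket):

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>

/* fd is assumed to be a connected TCP socket */
void try_nonblocking_read(int fd)
{
    char buf[256];
    ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);

    if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
        printf("no data queued; recvmsg returned without sleeping\n");
}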

For an IPv4 TCP socket, sk->sk_prot was set to tcp_prot (net/ipv4/tcp_ipv4.c) during creation, so the request finally reaches tcp_recvmsg:
struct proto tcp_prot = {
    .name    = "TCP",
    /* ... */
    .close   = tcp_close,
    /* ... */
    .recvmsg = tcp_recvmsg,
    /* ... */
};
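To make the layers of indirection concrete, here is a toy user-space model of the dispatch chain just traced: VFS read, then the address family's proto_ops, then the protocol's proto. All names are invented for illustration; this is not kernel code:

#include <stdio.h>
#include <string.h>

struct toy_sock;

struct toy_proto {                       /* like struct proto (tcp_prot) */
    long (*recvmsg)(struct toy_sock *sk, char *buf, long len);
};

struct toy_proto_ops {                   /* like struct proto_ops (inet_stream_ops) */
    long (*recvmsg)(struct toy_sock *sk, char *buf, long len);
};

struct toy_sock {
    const struct toy_proto_ops *ops;     /* socket->ops  */
    const struct toy_proto *prot;        /* sk->sk_prot  */
    const char *data;                    /* pretend receive queue */
};

/* "tcp_recvmsg": the protocol finally copies data to the caller */
static long toy_tcp_recvmsg(struct toy_sock *sk, char *buf, long len)
{
    long n = (long)strlen(sk->data);
    if (n > len)
        n = len;
    memcpy(buf, sk->data, n);
    return n;
}

/* "sock_common_recvmsg": the family layer forwards to the protocol */
static long toy_common_recvmsg(struct toy_sock *sk, char *buf, long len)
{
    return sk->prot->recvmsg(sk, buf, len);
}

static const struct toy_proto     toy_tcp_prot        = { .recvmsg = toy_tcp_recvmsg };
static const struct toy_proto_ops toy_inet_stream_ops = { .recvmsg = toy_common_recvmsg };

/* "vfs_read -> sock_aio_read": the VFS layer forwards to the family */
static long toy_vfs_read(struct toy_sock *sk, char *buf, long len)
{
    return sk->ops->recvmsg(sk, buf, len);
}

int main(void)
{
    struct toy_sock sk = { &toy_inet_stream_ops, &toy_tcp_prot, "payload" };
    char buf[16];
    long n = toy_vfs_read(&sk, buf, sizeof(buf));
    printf("%.*s\n", (int)n, buf);
    return 0;
}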

Socket Destruction
Socket operations can go through either the file I/O calls or the Berkeley sockets API, but destruction is different: the close system call is the only way for user space to destroy a socket. It is defined in fs/open.c:
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    struct file *filp;
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    int retval;

    spin_lock(&files->file_lock);
    fdt = files_fdtable(files);
    /* ... */
    filp = fdt->fd[fd];
    /* ... */
    rcu_assign_pointer(fdt->fd[fd], NULL);
    FD_CLR(fd, fdt->close_on_exec);
    __put_unused_fd(files, fd);
    spin_unlock(&files->file_lock);
    retval = filp_close(filp, files);
    /* ... */
}
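One point worth stressing from the paragraph above: shutdown() only terminates communication on the connection; the socket object itself goes away only via close (or process exit, which closes descriptors implicitly). A small sketch, assuming fd refers to a connected TCP socket:

#include <sys/socket.h>
#include <unistd.h>

/* fd is assumed to refer to a connected TCP socket */
void dispose(int fd)
{
    shutdown(fd, SHUT_RDWR); /* ends communication; the socket object still exists */
    close(fd);               /* drops the descriptor; destruction starts here */
}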

sys_close detaches the struct file from the descriptor table and then passes it to filp_close:
int filp_close(struct file *filp, fl_owner_t id)
{
    int retval = 0;

    if (!file_count(filp)) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    if (filp->f_op && filp->f_op->flush)
        retval = filp->f_op->flush(filp, id);

    dnotify_flush(filp, id);
    locks_remove_posix(filp, id);
    fput(filp);
    return retval;
}

filp_close flushes pending work and then drops the file reference via fput, defined in fs/file_table.c:
void fput(struct file *file)
{
    if (atomic_long_dec_and_test(&file->f_count))
        __fput(file);
}
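Because fput only destroys the file once f_count drops to zero, a socket shared by several descriptors survives until the last one is closed. A sketch using dup() (illustrative only):

#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd  = socket(AF_INET, SOCK_STREAM, 0);
    int fd2 = dup(fd);   /* both descriptors share one struct file; f_count is 2 */

    close(fd);           /* f_count 2 -> 1: the socket stays alive, fd2 still works */
    close(fd2);          /* f_count 1 -> 0: __fput runs, reaching sock_release */
    return 0;
}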

When the count reaches zero, __fput releases the file's resources and calls file->f_op->release, which for sockets is sock_close, a thin wrapper around sock_release (net/socket.c):
void sock_release(struct socket *sock)
{
    if (sock->ops) {
        struct module *owner = sock->ops->owner;

        sock->ops->release(sock);
        sock->ops = NULL;
        module_put(owner);
    }

    if (sock->fasync_list)
        printk(KERN_ERR "sock_release: fasync list not empty!\n");

    percpu_sub(sockets_in_use, 1);
    if (!sock->file) {
        iput(SOCK_INODE(sock));
        return;
    }
    sock->file = NULL;
}


As described in the initialization section of the previous article, the sockfs filesystem is mounted at startup, yet there is in fact no way to unmount it. Because the TCP/IP stack and sockfs are compiled statically into the kernel rather than built as a module, no unload path is needed: the sockfs pseudo-filesystem stays mounted from boot to shutdown.
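Although sockfs is never visible as a mount point, its presence can be observed from user space: fstatfs() on a socket descriptor reports the sockfs magic number as f_type. A small sketch (SOCKFS_MAGIC is 0x534F434B, declared in the kernel's magic.h):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/vfs.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct statfs st;

    if (fstatfs(fd, &st) == 0)    /* the socket's inode lives on sockfs */
        printf("f_type = 0x%lx (sockfs is 0x534f434b)\n", (unsigned long)st.f_type);

    close(fd);
    return 0;
}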