read
include/linux/fs.h
struct file 结构体中有如下成员:
const struct file_operations *f_op;
/*
 * struct file as quoted in this note. sys_read() obtains a pointer to one
 * from a file descriptor via fget_light(). The read path excerpted below
 * touches f_mode, f_op, f_path, f_mapping and f_ra.
 */
struct file {
/*
* fu_list becomes invalid after file_free is called and queued via
* fu_rcuhead for RCU freeing
*/
union {
struct list_head fu_list;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path; /* vfs_read() passes f_path.dentry to fsnotify_access() */
#define f_dentry f_path.dentry
#define f_vfsmnt f_path.mnt
/* Table of function pointers; vfs_read() dispatches through f_op->read / f_op->aio_read. */
const struct file_operations *f_op;
spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode; /* vfs_read() returns -EBADF unless FMODE_READ is set here */
/* NOTE(review): presumably the offset that file_pos_read()/file_pos_write() load and store in sys_read; helpers not shown, confirm. */
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra; /* do_generic_file_read() takes &filp->f_ra */
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
#endif /* #ifdef CONFIG_EPOLL */
/* do_generic_file_read() reads pages through f_mapping->a_ops->readpage(). */
struct address_space *f_mapping;
#ifdef CONFIG_DEBUG_WRITECOUNT
unsigned long f_mnt_write_state;
#endif
};
在 read 系统调用的过程中,会用到 f_op:
/*
 * read system call entry point (excerpt; the leading number on each line is
 * the line number carried over from the quoted source file).
 *
 * Looks up the struct file for fd with fget_light(), fetches the position via
 * file_pos_read(), calls vfs_read(), stores the position back with
 * file_pos_write() and releases the file with fput_light().
 * Returns vfs_read()'s result, or -EBADF when fget_light() returns NULL.
 */
389 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
390 {
391 struct file *file;
392 ssize_t ret = -EBADF;
393 int fput_needed;
394
395 file = fget_light(fd, &fput_needed);
396 if (file) {
/* vfs_read() works on a local copy of the position, written back afterwards. */
397 loff_t pos = file_pos_read(file);
398 ret = vfs_read(file, buf, count, &pos);
399 trace_fs_read(fd, buf, count, ret);
400 file_pos_write(file, pos);
401 fput_light(file, fput_needed);
402 }
403
404 return ret;
405 }
/*
 * vfs_read - filesystem-independent read; dispatches through file->f_op.
 *
 * Returns:
 *   -EBADF   if the file was not opened with FMODE_READ;
 *   -EINVAL  if f_op is NULL or provides neither read nor aio_read;
 *   -EFAULT  if access_ok(VERIFY_WRITE, buf, count) fails;
 *   a negative value from rw_verify_area() unchanged;
 *   otherwise the result of f_op->read(), or of do_sync_read() when only
 *   aio_read is provided.
 */
293 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
294 {
295 ssize_t ret;
296
297 if (!(file->f_mode & FMODE_READ))
298 return -EBADF;
299 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
300 return -EINVAL;
301 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
302 return -EFAULT;
303
304 ret = rw_verify_area(READ, file, pos, count);
305 if (ret >= 0) {
/* A non-negative rw_verify_area() result replaces the requested count. */
306 count = ret;
307 if (file->f_op->read)
308 ret = file->f_op->read(file, buf, count, pos);
309 else
310 ret = do_sync_read(file, buf, count, pos);
/* Notification and add_rchar() accounting only when at least one byte was read. */
311 if (ret > 0) {
312 fsnotify_access(file->f_path.dentry);
313 add_rchar(current, ret);
314 }
/* inc_syscr() runs for every call that got past rw_verify_area(), even if the read failed. */
315 inc_syscr(current);
316 }
317
318 return ret;
319 }
file_operations 结构体中除 owner 外都是函数指针,是 VFS 定义的统一接口;具体的文件系统通过给 file_operations 的各个函数指针赋值来完成注册。
/*
 * struct file_operations: apart from owner, every member is a function
 * pointer. The read path in this note uses two of them: vfs_read() calls
 * ->read when it is set, and do_sync_read() calls ->aio_read.
 */
1493 struct file_operations {
1494 struct module *owner;
1495 loff_t (*llseek) (struct file *, loff_t, int);
1496 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1497 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1498 ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1499 ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1500 int (*readdir) (struct file *, void *, filldir_t);
1501 unsigned int (*poll) (struct file *, struct poll_table_struct *);
1502 int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
1503 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1504 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1505 int (*mmap) (struct file *, struct vm_area_struct *);
1506 int (*open) (struct inode *, struct file *);
1507 int (*flush) (struct file *, fl_owner_t id);
1508 int (*release) (struct inode *, struct file *);
1509 int (*fsync) (struct file *, struct dentry *, int datasync);
1510 int (*aio_fsync) (struct kiocb *, int datasync);
1511 int (*fasync) (int, struct file *, int);
1512 int (*lock) (struct file *, int, struct file_lock *);
1513 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
1514 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1515 int (*check_flags)(int);
1516 int (*flock) (struct file *, int, struct file_lock *);
1517 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
1518 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
1519 int (*setlease)(struct file *, long, struct file_lock **);
1520 };
ubifs 对于 file_operations 的赋值:
/*
 * UBIFS's file_operations table. Because .read is set, vfs_read() calls
 * do_sync_read() through f_op->read; do_sync_read() in turn calls
 * f_op->aio_read, which this table sets to generic_file_aio_read.
 */
1575 const struct file_operations ubifs_file_operations = {
1576 .llseek = generic_file_llseek,
1577 .read = do_sync_read,
1578 .write = do_sync_write,
1579 .aio_read = generic_file_aio_read,
1580 .aio_write = ubifs_aio_write,
1581 .mmap = ubifs_file_mmap,
1582 .fsync = ubifs_fsync,
1583 .unlocked_ioctl = ubifs_ioctl,
1584 .splice_read = generic_file_splice_read,
1585 .splice_write = generic_file_splice_write,
1586 #ifdef CONFIG_COMPAT
1587 .compat_ioctl = ubifs_compat_ioctl,
1588 #endif
1589 };
/*
 * do_sync_read - perform a read by calling the file's aio_read method and
 * waiting for it to finish.
 *
 * Wraps buf/len in a single-element iovec and an on-stack kiocb positioned at
 * *ppos, then calls filp->f_op->aio_read(). On return, *ppos is updated from
 * kiocb.ki_pos and the aio_read (or wait_on_sync_kiocb) result is returned.
 */
267 ssize_t
do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
268 {
269 struct iovec iov = { .iov_base = buf, .iov_len = len };
270 struct kiocb kiocb;
271 ssize_t ret;
272
273 init_sync_kiocb(&kiocb, filp);
274 kiocb.ki_pos = *ppos;
275 kiocb.ki_left = len;
276 kiocb.ki_nbytes = len;
277
/* Call aio_read again for as long as it returns -EIOCBRETRY, waiting between attempts. */
278 for (;;) {
279 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
280 if (ret != -EIOCBRETRY)
281 break;
282 wait_on_retry_sync_kiocb(&kiocb);
283 }
284
/* -EIOCBQUEUED: the final result comes from wait_on_sync_kiocb() instead. */
285 if (-EIOCBQUEUED == ret)
286 ret = wait_on_sync_kiocb(&kiocb);
287 *ppos = kiocb.ki_pos;
288 return ret;
289 }
/*
 * Same UBIFS table as quoted earlier in this note, repeated at this point of
 * the trace: the f_op->aio_read call made by do_sync_read() resolves to the
 * .aio_read entry below, generic_file_aio_read.
 */
1575 const struct file_operations
ubifs_file_operations = {
1576 .llseek = generic_file_llseek,
1577 .read = do_sync_read,
1578 .write = do_sync_write,
1579 .aio_read = generic_file_aio_read,
1580 .aio_write = ubifs_aio_write,
1581 .mmap = ubifs_file_mmap,
1582 .fsync = ubifs_fsync,
1583 .unlocked_ioctl = ubifs_ioctl,
1584 .splice_read = generic_file_splice_read,
1585 .splice_write = generic_file_splice_write,
1586 #ifdef CONFIG_COMPAT
1587 .compat_ioctl = ubifs_compat_ioctl,
1588 #endif
1589 };
1281 ssize_t
1282 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1283 unsigned long nr_segs, loff_t pos)
1284 {
1285 struct file *filp = iocb->ki_filp;
1286 ssize_t retval;
1287 unsigned long seg;
1288 size_t count;
1289 loff_t *ppos = &iocb->ki_pos;
1290
1291 count = 0;
1292 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
... ...
1323 for (seg = 0; seg < nr_segs; seg++) {
1324 read_descriptor_t desc;
1325
1326 desc.written = 0;
1327 desc.arg.buf = iov[seg].iov_base;
1328 desc.count = iov[seg].iov_len;
1329 if (desc.count == 0)
1330 continue;
1331 desc.error = 0;
1332 do_generic_file_read(filp, ppos, &desc, file_read_actor);
977 static void do_generic_file_read(struct file *filp, loff_t *ppos,
978 read_descriptor_t *desc, read_actor_t actor)
979 {
980 struct address_space *mapping = filp->f_mapping;
981 struct inode *inode = mapping->host;
982 struct file_ra_state *ra = &filp->f_ra;
... ...
1116 readpage:
1117 /*
1118 * A previous I/O error may have been due to temporary
1119 * failures, eg. multipath errors.
1120 * PG_error will be set again if readpage fails.
1121 */
1122 ClearPageError(page);
1123 /* Start the actual read. The read will unlock the page. */
1124 error = mapping->a_ops->readpage(filp, page);
/*
 * UBIFS's address_space_operations table. When a file's mapping uses this
 * table, the mapping->a_ops->readpage(filp, page) call in
 * do_generic_file_read() resolves to ubifs_readpage.
 */
1547 const struct address_space_operations
ubifs_file_address_operations = {
1548 .readpage = ubifs_readpage,
1549 .writepage = ubifs_writepage,
1550 .write_begin = ubifs_write_begin,
1551 .write_end = ubifs_write_end,
1552 .invalidatepage = ubifs_invalidatepage,
1553 .set_page_dirty = ubifs_set_page_dirty,
1554 .releasepage = ubifs_releasepage,
1555 };
系统调用 read -> VFS -> ubifs 这个阶段就总结完了。接下来就是通过 ubifs 的 readpage 接口(ubifs_readpage)及其以下的接口去读 page 数据了。
这个阶段在《深入 Linux 内核架构》一书 P455~P459 总结得很好,建议反复细读这几页,并对照上面的代码反复阅读,VFS 的精华都在这里了。
open
/*
 * open system call entry point (excerpt; leading numbers are line numbers
 * from the quoted source file).
 *
 * Adds O_LARGEFILE to flags when force_o_largefile() is true, then delegates
 * to do_sys_open() with AT_FDCWD as the directory fd and returns its result.
 */
1071 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
1072 {
1073 long ret;
1074
1075 if (force_o_largefile())
1076 flags |= O_LARGEFILE;
1077
1078 ret = do_sys_open(AT_FDCWD, filename, flags, mode);
1079 /* avoid REGPARM breakage on x86: */
1080 asmlinkage_protect(3, ret, filename, flags, mode);
1081 return ret;
1082 }
/*
 * do_sys_open - common implementation behind the open system call.
 *
 * Steps: getname() on the user-space filename, reserve a descriptor with
 * get_unused_fd_flags(), open the file with do_filp_open(), then bind the
 * resulting struct file to the descriptor with fd_install().
 *
 * Returns the new descriptor, or a negative error: PTR_ERR of getname()'s
 * result, the negative value from get_unused_fd_flags(), or PTR_ERR of
 * do_filp_open()'s result.
 */
1048 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
1049 {
1050 char *tmp = getname(filename);
/* Pre-load fd with the error code; it is only meaningful if tmp is an error pointer. */
1051 int fd = PTR_ERR(tmp);
1052
1053 if (!IS_ERR(tmp)) {
1054 fd = get_unused_fd_flags(flags);
1055 if (fd >= 0) {
1056 struct file *f =
do_filp_open(dfd, tmp, flags, mode, 0);
1057 if (IS_ERR(f)) {
/* Open failed: give the reserved descriptor back and report the error. */
1058 put_unused_fd(fd);
1059 fd = PTR_ERR(f);
1060 } else {
1061 fsnotify_open(f->f_path.dentry);
1062 fd_install(fd, f);
1063 }
/* Called on both paths, so fd here may already hold the error code. */
1064 trace_fs_open(fd, tmp);
1065 }
1066 putname(tmp);
1067 }
1068 return fd;
1069 }
1761 struct file *do_filp_open(int dfd, const char *pathname,
1762 int open_flag, int mode, int acc_mode)
1763 {
1764 struct file *filp;
1765 struct nameidata nd;
1766 int error;
1767 struct path path;
1768 int count = 0;
1769 int flag = open_to_namei_flags(open_flag);
1770 int force_reval = 0;
1771
1772 if (!(open_flag & O_CREAT))
1773 mode = 0;
... ...
1872 holder = path;
1873 nd.flags &= ~LOOKUP_PARENT;
1874 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1875 if (inode->i_op->put_link)
1876 inode->i_op->put_link(holder.dentry, &nd, cookie);
1877 path_put(&holder);
1612 static struct file *do_last(struct nameidata *nd, struct path *path,
1613 int open_flag, int acc_mode,
1614 int mode, const char *pathname)
1615 {
1616 struct dentry *dir = nd->path.dentry;
1617 struct file *filp;
1618 int error = -EISDIR;
1685 /* Negative dentry, just create the file */
1686 if (!path->dentry->d_inode) {
1687 /*
1688 * This write is needed to ensure that a
1689 * ro->rw transition does not occur between
1690 * the time when the file is created and when
1691 * a permanent write count is taken through
1692 * the 'struct file' in nameidata_to_filp().
1693 */
1694 error = mnt_want_write(nd->path.mnt);
1695 if (error)
1696 goto exit_mutex_unlock;
1697 error = __open_namei_create(nd, path, open_flag, mode);
1698 if (error) {
1699 mnt_drop_write(nd->path.mnt);
1700 goto exit;
1701 }
1503 static int __open_namei_create(struct nameidata *nd, struct path *path,
1504 int open_flag, int mode)
1505 {
1506 int error;
1507 struct dentry *dir = nd->path.dentry;
1508
1509 if (!IS_POSIXACL(dir->d_inode))
1510 mode &= ~current_umask();
1511 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1512 if (error)
1513 goto out_unlock;
1514 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
/*
 * vfs_create - create a regular file named by dentry inside directory dir by
 * calling the directory inode's ->create method.
 *
 * Returns 0 on success, otherwise: the error from may_create(), -EACCES when
 * dir->i_op->create is not set, the error from security_inode_create(), or
 * the error returned by ->create itself.
 */
1404 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1405 struct nameidata *nd)
1406 {
1407 int error = may_create(dir, dentry);
1408
1409 if (error)
1410 return error;
1411
1412 if (!dir->i_op->create)
1413 return -EACCES; /* shouldn't it be ENOSYS? */
/* Keep only the S_IALLUGO bits of the caller's mode and force the type to regular file. */
1414 mode &= S_IALLUGO;
1415 mode |= S_IFREG;
1416 error = security_inode_create(dir, dentry, mode);
1417 if (error)
1418 return error;
/* Filesystem-specific creation through the directory inode's i_op table. */
1419 error = dir->i_op->create(dir, dentry, mode, nd);
1420 if (!error)
1421 fsnotify_create(dir, dentry);
1422 return error;
1423 }
以 ubifs 为例,inode 的 i_op / i_fop 是在 ubifs_iget 中注册的(这个函数在 mount 的时候会被调用);上面 vfs_create 中的 dir->i_op->create,对 ubifs 的目录 inode 来说就是 ubifs_create:
101 struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
102 {
103 int err;
104 union ubifs_key key;
105 struct ubifs_ino_node *ino;
106 struct ubifs_info *c = sb->s_fs_info;
107 struct inode *inode;
108 struct ubifs_inode *ui;
181 case S_IFDIR:
182 printk(KERN_WARNING "qc_2 \n");
183 inode->i_op = &ubifs_dir_inode_operations;
184 inode->i_fop = &ubifs_dir_operations;
185 if (ui->data_len != 0) {
186 err = 11;
187 goto out_invalid;
188 }
189 break;
/*
 * inode_operations table that ubifs_iget() assigns to inode->i_op in its
 * S_IFDIR case. For such a directory inode, the dir->i_op->create call in
 * vfs_create() resolves to the .create entry below, ubifs_create.
 */
1201 const struct inode_operations ubifs_dir_inode_operations = {
1202 .lookup = ubifs_lookup,
1203 .create = ubifs_create,
1204 .link = ubifs_link,
1205 .symlink = ubifs_symlink,
1206 .unlink = ubifs_unlink,
1207 .mkdir = ubifs_mkdir,
1208 .rmdir = ubifs_rmdir,
1209 .mknod = ubifs_mknod,
1210 .rename = ubifs_rename,
1211 .setattr = ubifs_setattr,
1212 .getattr = ubifs_getattr,
1213 #ifdef CONFIG_UBIFS_FS_XATTR
1214 .setxattr = ubifs_setxattr,
1215 .getxattr = ubifs_getxattr,
1216 .listxattr = ubifs_listxattr,
1217 .removexattr = ubifs_removexattr,
1218 #endif
1219 };