VFS 代码分析(open/read/write)

本文解析了Linux VFS中的文件操作流程,包括read、open等系统调用的实现细节,并以ubifs文件系统为例,展示了具体的操作过程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

read

include/linux/fs.h

file 结构体中有

const struct file_operations    *f_op;

/*
 * struct file - the per-open-file object the VFS passes around.
 *
 * In the code paths quoted in this article an instance is either looked up
 * from a file descriptor (fget_light() in the read syscall) or returned by
 * do_filp_open() and bound to a descriptor with fd_install() in
 * do_sys_open().
 */
struct file {
        /*
         * fu_list becomes invalid after file_free is called and queued via
         * fu_rcuhead for RCU freeing
         */
        union {
                struct list_head        fu_list;
                struct rcu_head         fu_rcuhead;
        } f_u; 
        /*
         * f_path.dentry is what vfs_read() hands to fsnotify_access() and
         * do_sys_open() hands to fsnotify_open().
         */
        struct path             f_path;
/* Shorthand aliases for the two members of f_path. */
#define f_dentry        f_path.dentry
#define f_vfsmnt        f_path.mnt
        /*
         * Operations table for this file. vfs_read() returns -EINVAL when
         * this is NULL or provides neither ->read nor ->aio_read; otherwise
         * it calls ->read if set and falls back to do_sync_read(), which
         * goes through ->aio_read.
         */
        const struct file_operations    *f_op;
        spinlock_t              f_lock;  /* f_ep_links, f_flags, no IRQ */
        atomic_long_t           f_count;
        unsigned int            f_flags;
        /* vfs_read() returns -EBADF unless FMODE_READ is set here. */
        fmode_t                 f_mode;
        /*
         * NOTE(review): presumably the offset that file_pos_read() and
         * file_pos_write() load and store around vfs_read() in the read
         * syscall -- the helper bodies are not shown here; confirm.
         */
        loff_t                  f_pos;
        struct fown_struct      f_owner;
        const struct cred       *f_cred;
        /* do_generic_file_read() takes a pointer to this as its 'ra'. */
        struct file_ra_state    f_ra;

        u64                     f_version;
#ifdef CONFIG_SECURITY
        void                    *f_security;
#endif
        /* needed for tty driver, and maybe others */
        void                    *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct list_head        f_ep_links;
#endif /* #ifdef CONFIG_EPOLL */
        /*
         * do_generic_file_read() starts from this mapping: it takes the
         * inode from mapping->host and reads pages through
         * mapping->a_ops->readpage.
         */
        struct address_space    *f_mapping;
#ifdef CONFIG_DEBUG_WRITECOUNT
        unsigned long f_mnt_write_state;
#endif
};

在 read 系统调用的过程中会用到 f_op:sys_read 调用 vfs_read,vfs_read 再通过 file->f_op->read(没有 read 时退回到 do_sync_read)进入具体文件系统的实现。

389 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
390 {
391         struct file *file;
392         ssize_t ret = -EBADF;
393         int fput_needed;
394
395         file = fget_light(fd, &fput_needed);
396         if (file) {
397                 loff_t pos = file_pos_read(file);
398                 ret = vfs_read(file, buf, count, &pos);
399                 trace_fs_read(fd, buf, count, ret);
400                 file_pos_write(file, pos);
401                 fput_light(file, fput_needed);
402         }
403
404         return ret;
405 }

293 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
294 {
295         ssize_t ret;
296
297         if (!(file->f_mode & FMODE_READ))
298                 return -EBADF;
299         if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
300                 return -EINVAL;
301         if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
302                 return -EFAULT;
303
304         ret = rw_verify_area(READ, file, pos, count);
305         if (ret >= 0) {
306                 count = ret;
307                 if (file->f_op->read)
308                         ret = file->f_op->read(file, buf, count, pos);
309                 else
310                         ret = do_sync_read(file, buf, count, pos);
311                 if (ret > 0) {
312                         fsnotify_access(file->f_path.dentry);
313                         add_rchar(current, ret);
314                 }
315                 inc_syscr(current);
316         }
317
318         return ret;
319 }

file_operations 结构体中除 owner 外都是函数指针,是 VFS 定义的统一接口;具体的文件系统进行注册,也就是给 file_operations 的各个函数指针赋值。

1493 struct file_operations {
1494         struct module *owner;   
1495         loff_t (*llseek) (struct file *, loff_t, int);
1496         ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1497         ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1498         ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1499         ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1500         int (*readdir) (struct file *, void *, filldir_t);
1501         unsigned int (*poll) (struct file *, struct poll_table_struct *);
1502         int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
1503         long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1504         long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1505         int (*mmap) (struct file *, struct vm_area_struct *);
1506         int (*open) (struct inode *, struct file *);
1507         int (*flush) (struct file *, fl_owner_t id);
1508         int (*release) (struct inode *, struct file *);
1509         int (*fsync) (struct file *, struct dentry *, int datasync);
1510         int (*aio_fsync) (struct kiocb *, int datasync);
1511         int (*fasync) (int, struct file *, int); 
1512         int (*lock) (struct file *, int, struct file_lock *);
1513         ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
1514         unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1515         int (*check_flags)(int);
1516         int (*flock) (struct file *, int, struct file_lock *);
1517         ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
1518         ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
1519         int (*setlease)(struct file *, long, struct file_lock **);
1520 };

ubifs 对 file_operations 的赋值(.read 为 do_sync_read,它再通过 .aio_read 调用 generic_file_aio_read):

1575 const struct file_operations ubifs_file_operations = {
1576         .llseek         = generic_file_llseek,
1577         .read           = do_sync_read,
1578         .write          = do_sync_write,
1579         .aio_read       = generic_file_aio_read,
1580         .aio_write      = ubifs_aio_write,
1581         .mmap           = ubifs_file_mmap,
1582         .fsync          = ubifs_fsync,
1583         .unlocked_ioctl = ubifs_ioctl,
1584         .splice_read    = generic_file_splice_read,
1585         .splice_write   = generic_file_splice_write,
1586 #ifdef CONFIG_COMPAT
1587         .compat_ioctl   = ubifs_compat_ioctl,
1588 #endif
1589 };

267 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
268 {
269         struct iovec iov = { .iov_base = buf, .iov_len = len };                                                                                                                                            
270         struct kiocb kiocb;       
271         ssize_t ret;
272
273         init_sync_kiocb(&kiocb, filp);
274         kiocb.ki_pos = *ppos;     
275         kiocb.ki_left = len;      
276         kiocb.ki_nbytes = len;
277
278         for (;;) {
279                 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);                                                                                                                                 
280                 if (ret != -EIOCBRETRY)           
281                         break;            
282                 wait_on_retry_sync_kiocb(&kiocb);
283         }
284    
285         if (-EIOCBQUEUED == ret)          
286                 ret = wait_on_sync_kiocb(&kiocb);
287         *ppos = kiocb.ki_pos;     
288         return ret;  
289 }  

1575 const struct file_operations ubifs_file_operations = {
1576         .llseek         = generic_file_llseek,
1577         .read           = do_sync_read,
1578         .write          = do_sync_write,
1579         .aio_read       = generic_file_aio_read,
1580         .aio_write      = ubifs_aio_write,
1581         .mmap           = ubifs_file_mmap,
1582         .fsync          = ubifs_fsync,
1583         .unlocked_ioctl = ubifs_ioctl,
1584         .splice_read    = generic_file_splice_read,
1585         .splice_write   = generic_file_splice_write,
1586 #ifdef CONFIG_COMPAT
1587         .compat_ioctl   = ubifs_compat_ioctl,
1588 #endif
1589 };   


1281 ssize_t
1282 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1283                 unsigned long nr_segs, loff_t pos)
1284 {
1285         struct file *filp = iocb->ki_filp;
1286         ssize_t retval;
1287         unsigned long seg;
1288         size_t count;
1289         loff_t *ppos = &iocb->ki_pos;
1290
1291         count = 0;
1292         retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
... ...
1323         for (seg = 0; seg < nr_segs; seg++) {
1324                 read_descriptor_t desc;
1325
1326                 desc.written = 0;
1327                 desc.arg.buf = iov[seg].iov_base;
1328                 desc.count = iov[seg].iov_len;
1329                 if (desc.count == 0)
1330                         continue;
1331                 desc.error = 0;
1332                 do_generic_file_read(filp, ppos, &desc, file_read_actor);



 977 static void do_generic_file_read(struct file *filp, loff_t *ppos,
 978                 read_descriptor_t *desc, read_actor_t actor)
 979 {
 980         struct address_space *mapping = filp->f_mapping;
 981         struct inode *inode = mapping->host;
 982         struct file_ra_state *ra = &filp->f_ra;
... ...
1116 readpage:
1117                 /*
1118                  * A previous I/O error may have been due to temporary
1119                  * failures, eg. multipath errors.
1120                  * PG_error will be set again if readpage fails.
1121                  */
1122                 ClearPageError(page);
1123                 /* Start the actual read. The read will unlock the page. */
1124                 error = mapping->a_ops->readpage(filp, page);

1547 const struct address_space_operations ubifs_file_address_operations = {
1548         .readpage       = ubifs_readpage,
1549         .writepage      = ubifs_writepage,
1550         .write_begin    = ubifs_write_begin,
1551         .write_end      = ubifs_write_end,
1552         .invalidatepage = ubifs_invalidatepage,
1553         .set_page_dirty = ubifs_set_page_dirty,
1554         .releasepage    = ubifs_releasepage,
1555 };   

系统调用 read -> VFS -> ubifs,这个阶段就总结完了。接下来就是调用 ubifs_readpage 及其以下的接口去读 page 数据了。

这个阶段在《深入Linux内核架构》一书 P455~P459 总结得很好,建议详细反复看这几页,并反复对照这些代码,VFS 的精华都在这里了。


open

1071 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)                                                                                                                               
1072 {
1073         long ret;   
1074   
1075         if (force_o_largefile())          
1076                 flags |= O_LARGEFILE;                                                                                                                                                                     
1077   
1078         ret = do_sys_open(AT_FDCWD, filename, flags, mode);
1079         /* avoid REGPARM breakage on x86: */
1080         asmlinkage_protect(3, ret, filename, flags, mode);                                                                                                                                                
1081         return ret;
1082 }


1048 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)                                                                                                                               
1049 {
1050         char *tmp = getname(filename);
1051         int fd = PTR_ERR(tmp);                                                                                                                                                                            
1052
1053         if (!IS_ERR(tmp)) {               
1054                 fd = get_unused_fd_flags(flags);
1055                 if (fd >= 0) {                    
1056                         struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);                                                                                                                          
1057                         if (IS_ERR(f)) {                  
1058                                 put_unused_fd(fd);        
1059                                 fd = PTR_ERR(f);  
1060                         } else {                          
1061                                 fsnotify_open(f->f_path.dentry);
1062                                 fd_install(fd, f);
1063                         }
1064                         trace_fs_open(fd, tmp);                                                                                                                                                           
1065                 }   
1066                 putname(tmp);     
1067         }           
1068         return fd;
1069 }


1761 struct file *do_filp_open(int dfd, const char *pathname,
1762                 int open_flag, int mode, int acc_mode)
1763 {
1764         struct file *filp;
1765         struct nameidata nd;
1766         int error;
1767         struct path path;
1768         int count = 0;
1769         int flag = open_to_namei_flags(open_flag);
1770         int force_reval = 0;
1771
1772         if (!(open_flag & O_CREAT))
1773                 mode = 0;

... ...

1872                 holder = path;
1873                 nd.flags &= ~LOOKUP_PARENT;
1874                 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1875                 if (inode->i_op->put_link)
1876                         inode->i_op->put_link(holder.dentry, &nd, cookie);
1877                 path_put(&holder);

1612 static struct file *do_last(struct nameidata *nd, struct path *path,
1613                             int open_flag, int acc_mode,
1614                             int mode, const char *pathname)
1615 {
1616         struct dentry *dir = nd->path.dentry;
1617         struct file *filp;
1618         int error = -EISDIR;

1685         /* Negative dentry, just create the file */
1686         if (!path->dentry->d_inode) {
1687                 /*
1688                  * This write is needed to ensure that a
1689                  * ro->rw transition does not occur between
1690                  * the time when the file is created and when
1691                  * a permanent write count is taken through
1692                  * the 'struct file' in nameidata_to_filp().
1693                  */
1694                 error = mnt_want_write(nd->path.mnt);
1695                 if (error)
1696                         goto exit_mutex_unlock;
1697                 error = __open_namei_create(nd, path, open_flag, mode);
1698                 if (error) {
1699                         mnt_drop_write(nd->path.mnt);
1700                         goto exit;
1701                 }

1503 static int __open_namei_create(struct nameidata *nd, struct path *path,
1504                                 int open_flag, int mode)
1505 {
1506         int error;
1507         struct dentry *dir = nd->path.dentry;
1508
1509         if (!IS_POSIXACL(dir->d_inode))
1510                 mode &= ~current_umask();
1511         error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1512         if (error)
1513                 goto out_unlock;
1514         error = vfs_create(dir->d_inode, path->dentry, mode, nd);

1404 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1405                 struct nameidata *nd)
1406 {
1407         int error = may_create(dir, dentry);
1408
1409         if (error)
1410                 return error;
1411
1412         if (!dir->i_op->create)
1413                 return -EACCES; /* shouldn't it be ENOSYS? */
1414         mode &= S_IALLUGO;
1415         mode |= S_IFREG;
1416         error = security_inode_create(dir, dentry, mode);
1417         if (error)
1418                 return error;
1419         error = dir->i_op->create(dir, dentry, mode, nd);
1420         if (!error)
1421                 fsnotify_create(dir, dentry);
1422         return error;
1423 }



以 ubifs 为例,目录 inode 的 i_op 是在 ubifs_iget 中注册的(这个函数在 mount 的时候会被调用),因此 vfs_create 中的 dir->i_op->create 最终调用的就是 ubifs_create:

 101 struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 102 {
 103         int err;
 104         union ubifs_key key;
 105         struct ubifs_ino_node *ino;
 106         struct ubifs_info *c = sb->s_fs_info;
 107         struct inode *inode;
 108         struct ubifs_inode *ui;

 181         case S_IFDIR:
 182                 printk(KERN_WARNING "qc_2 \n");
 183                 inode->i_op  = &ubifs_dir_inode_operations;
 184                 inode->i_fop = &ubifs_dir_operations;
 185                 if (ui->data_len != 0) {
 186                         err = 11;
 187                         goto out_invalid;
 188                 }
 189                 break;

1201 const struct inode_operations ubifs_dir_inode_operations = {
1202         .lookup      = ubifs_lookup,
1203         .create      = ubifs_create,
1204         .link        = ubifs_link,
1205         .symlink     = ubifs_symlink,
1206         .unlink      = ubifs_unlink,
1207         .mkdir       = ubifs_mkdir,
1208         .rmdir       = ubifs_rmdir,
1209         .mknod       = ubifs_mknod,
1210         .rename      = ubifs_rename,
1211         .setattr     = ubifs_setattr,
1212         .getattr     = ubifs_getattr,
1213 #ifdef CONFIG_UBIFS_FS_XATTR
1214         .setxattr    = ubifs_setxattr,
1215         .getxattr    = ubifs_getxattr,
1216         .listxattr   = ubifs_listxattr,
1217         .removexattr = ubifs_removexattr,
1218 #endif          
1219 };              


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值