从文件系统的角度来看,挂载一个文件系统本质上也是在操作一个文件。每个文件系统对应一个super_block,那么super_block和inode之间是什么关系?下面逐步分析。
从mount挂载命令开始,需要先注册,然后再装载系统。分为如下步骤:
1,注册文件系统
/*
 * register_filesystem - add a filesystem type to the global list.
 *
 * Returns 0 on success, -EBUSY if the type is already linked into the
 * list or a filesystem with the same name is already registered.
 */
int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int err = 0;

	/* Filesystem names must not contain a dot. */
	BUG_ON(strchr(fs->name, '.'));

	/* A non-NULL next link means fs is already on the list. */
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	slot = find_filesystem(fs->name, strlen(fs->name));
	if (!*slot)
		*slot = fs;		/* link into the empty tail slot */
	else
		err = -EBUSY;		/* duplicate name */
	write_unlock(&file_systems_lock);

	return err;
}
/*
 * find_filesystem - locate the list slot for a filesystem name.
 *
 * Walks the singly linked file_systems list and returns the address of
 * the link that either points at the matching entry or at the NULL
 * tail (so the caller can insert there). Caller holds file_systems_lock.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
	struct file_system_type **link = &file_systems;

	while (*link) {
		const char *cur = (*link)->name;

		if (strlen(cur) == len && strncmp(cur, name, len) == 0)
			break;
		link = &(*link)->next;
	}
	return link;
}
/* Head of the singly linked list of all registered filesystem types. */
static struct file_system_type *file_systems;
/*
 * file_system_type - describes one filesystem type (ext4, proc, ...).
 * One instance per registered filesystem; chained through ->next into
 * the global file_systems list.
 */
struct file_system_type {
/* filesystem name, e.g. "ext4"; must not contain '.' */
const char *name;
/* FS_* usage flags */
int fs_flags;
/* read the superblock at mount time and return the root dentry */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
/* release the superblock when the filesystem is unmounted */
void (*kill_sb) (struct super_block *);
/* owning module; meaningful only when loaded as a module */
struct module *owner;
/* next registered type in the singly linked file_systems list */
struct file_system_type * next;
/* head of the list of superblocks of this type (see sb->s_instances) */
struct hlist_head fs_supers;
/* lockdep class keys for locks of superblocks/inodes of this type */
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
};
/*
 * dentry - directory entry: binds one pathname component to its inode.
 * A dentry with d_inode == NULL is a "negative" dentry (name known not
 * to exist).
 */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */
seqcount_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; /* lookup hash list */
struct dentry *d_parent; /* parent directory */
struct qstr d_name;
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
/* Ref lookup also touches following */
unsigned int d_count; /* protected by d_lock */
spinlock_t d_lock; /* per dentry lock */
const struct dentry_operations *d_op;
struct super_block *d_sb; /* The root of the dentry tree */
/* d_sb: superblock of the filesystem this dentry belongs to */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
struct list_head d_lru; /* LRU list */
/*
* d_child and d_rcu can share memory
*/
union {
struct list_head d_child; /* child of parent list */
struct rcu_head d_rcu;
} d_u;
struct list_head d_subdirs; /* our children */
struct list_head d_alias; /* inode alias list */
};
1)注册流程:函数find_filesystem返回一个 file_system_type** 类型的值,传入参数是名称和长度。函数中定义一个二级指针p,初始指向全局指针变量file_systems的地址;file_systems本身是个指针,指向一个file_system_type类型实例,则*p就是当前链表节点,p=&(*p)->next沿系统链表依次查找。用strlen先判断当前节点的名称长度是否与传入的len相同,并且名称内容也相同;如果没有找到,p最终指向链表尾部的NULL指针。该二级指针返回给register_filesystem中的p,若*p非空说明同名文件系统已注册,返回-EBUSY;否则令*p = fs(fs也是个指针),把新文件系统挂到链表尾部。注意整个查找和插入都是在write_lock(&file_systems_lock)的保护下进行的——先加写锁,再查找、插入,最后解锁。
2)file_system_type: name是一个系统名称,fs_flags是使用的标志,owner是一个指向module结构的指针(仅当文件系统以模块形式加载时才有意义)。next是个单链表。fs_supers是表头。mount是读取dentry结构的函数,这里面包括了超级块等成员。
3)dentry是干嘛的?
dentry是目录项,用于描述文件的逻辑属性,目录也是个文件;inode是索引节点,代表物理意义上的文件。
我们去掉该结构中所有的链表和锁,还有rcu等一些暂时无关的东西,那么能保留下来的就只有下面这么一个简化版的dentry:
/*
 * Simplified dentry: the same structure with lists, locks and RCU
 * bookkeeping stripped, keeping only the naming/topology fields.
 */
struct dentry {
struct qstr d_name; /* full (possibly long) name */
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
struct dentry *d_parent; /* parent directory */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
const struct dentry_operations *d_op;
struct list_head d_child; /* child of parent list */
struct super_block *d_sb; /* The root of the dentry tree */
};
d_name保存文件名称,d_iname用于内嵌保存较短的名称——短名直接存放在dentry结构体内部,省去额外的内存分配,两者并不冲突。dentry->d_inode指向相应的inode索引节点。dentry->d_parent和d_child把各个dentry连成树状结构。dentry->d_sb则指向该目录项所属文件系统的超级块。
4)那超级块又是个什么东西?
一个超级块对应着一个文件系统,文件系统的装载操作始于超级块的读取。超级块是文件系统的核心结构,保存了文件系统所有的特征数据。
超级块的获取通过file_system_type对象中的mount函数指针来读取,在内核中跟踪如下:
file_system_type bd_type = {.mount = bd_mount} ----> bd_mount()---->mount_pseudo()---->sget()最终由sget函数调用alloc_super()获取一个超级块并进行初始化。
super_block结构体(简化)
/* Global list head linking every super_block in the system (fs/super.c). */
LIST_HEAD(super_blocks);
/*
 * super_block - in-core representation of one mounted filesystem.
 * Reading the superblock is the first step of mounting; it holds all
 * the characteristic data of the filesystem.
 */
struct super_block {
/* links all superblocks in the system into one list; the list head is
the global variable super_blocks, defined in fs/super.c */
struct list_head s_list; /* Keep this first */
/* device number of the backing block device, e.g. /dev/hda1 is 0x301 */
dev_t s_dev; /* search index; _not_ kdev_t */
unsigned char s_dirt;
/* filesystem block size; these two fields express the same
information in different forms (log2 bits vs. bytes) */
unsigned char s_blocksize_bits;
unsigned long s_blocksize;
/* largest file size this filesystem can handle */
loff_t s_maxbytes; /* Max file size */
/* points at the file_system_type instance carrying the generic
type information for this filesystem */
struct file_system_type *s_type;
/* superblock operation table: generic interfaces whose
implementations are supplied by the low-level filesystem code */
const struct super_operations *s_op;
/* mount flags */
unsigned long s_flags;
/* magic number distinguishing this filesystem type from others */
unsigned long s_magic;
/* dentry of the root directory of this filesystem */
struct dentry *s_root;
/* extended-attribute handlers */
const struct xattr_handler **s_xattr;
/* all inodes of this filesystem */
struct list_head s_inodes; /* all inodes */
/* list of file structures: every open file on this filesystem */
struct list_head s_files;
/* block device holding the filesystem's data */
struct block_device *s_bdev;
/* list node: superblocks of the same filesystem type are chained
together (list head is file_system_type.fs_supers) */
struct hlist_node s_instances;
char s_id[32]; /* Informational name */
/* catch-all pointer to filesystem-private data */
void *s_fs_info; /* Filesystem private info */
/* maximum number of hard links */
unsigned int s_max_links;
fmode_t s_mode;
const struct dentry_operations *s_d_op; /* default d_op for dentries */
};
通过上边超级块的结构,就可以知道这个文件系统的信息,第一个链表负责把所有的超级块连起来。包括了描述符,file_system_type,inode,dentry,root,s_magic(区别于其它文件系统的标识),files等等。如此定义了一个文件系统。
内核中的结构体普遍内嵌list_head成员,这是内核通用链表的设计:链表节点不直接保存整个结构体,而是把一个很小的list_head嵌入结构体中,遍历时只沿着链表指针走,再根据该成员在结构体内的固定偏移(container_of宏)反推出包含它的结构体地址,从而取出完整的super_block。s_list旁的注释"Keep this first"表示把它放在第一个成员位置,此时偏移为0,换算最为直接。其他结构体中的list_head同样位于固定偏移处,这种层层嵌套的链表体现了Linux内核的分层设计思想。
关于超级块的super_operations方法集合,inode部分主要是对inode的操作,也就是文件系统操作文件;write_super将超级块写入存储介质,比如sdcard,put_super将超级块的私有信息从内存中移除,文件系统卸载时会用到这个回调函数。sync_fs将文件系统数据与底层块设备上的数据同步。show_options用于proc文件系统,用以显示文件系统装载的选项。
/*
 * super_operations - methods the VFS invokes on a superblock.
 * The inode entries allocate / write back / evict inodes; put_super
 * releases superblock-private data at unmount; sync_fs flushes
 * filesystem data to the underlying block device; show_options prints
 * the mount options for /proc.
 */
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
/* called at unmount to release private superblock data */
void (*put_super) (struct super_block *);
/* write the superblock back to the storage medium */
void (*write_super) (struct super_block *);
/* synchronize in-memory filesystem data with the block device */
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_fs) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
/* show mount options via /proc */
int (*show_options)(struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
int (*nr_cached_objects)(struct super_block *);
void (*free_cached_objects)(struct super_block *, int);
};
关于super_block,inode,dentry的概念,至此很清晰了,每一个super_block是一个文件系统,下面的文件是通过inode和dentry来联合描述的。
2,装载文件系统
注意点是:装载新文件系统必须执行的任务、装载点的数据结构。
每装载一个文件系统,对应一个vfsmount结构的实例:
/*
 * vfsmount - one instance per mounted filesystem, describing where and
 * how it is mounted.
 */
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
/* root dentry of the mounted filesystem's dentry tree */
struct super_block *mnt_sb; /* pointer to superblock */
/* superblock of the mounted filesystem; a filesystem instance only
comes into being once its superblock has been obtained */
int mnt_flags;
/* per-mount flags (read-only etc.); the MNT_* values are defined
in mount.h */
};
vfsmount和dentry通过path(路径)整合到一起:path.mnt指向所在的挂载实例vfsmount,path.dentry指向该路径对应的目录项。对于挂载点而言,新挂载的文件系统的根节点就是path.mnt->mnt_root。
/*
 * path - pairs a vfsmount with a dentry to identify one point in the
 * mounted filesystem hierarchy.
 */
struct path {
struct vfsmount *mnt; /* mount the dentry lives under */
struct dentry *dentry; /* the directory entry itself */
};
/*
 * nameidata - state of a pathname lookup as it walks the tree.
 */
struct nameidata {
struct path path; /* current position of the walk */
struct qstr last; /* last path component processed */
struct path root; /* root used for this lookup */
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags; /* LOOKUP_* flags */
unsigned seq; /* sequence count used during rcu-walk */
int last_type; /* type of the last component */
unsigned depth; /* current symlink nesting depth */
char *saved_names[MAX_NESTED_LINKS + 1]; /* symlink name stack */
/* Intent data */
union {
struct open_intent open;
} intent;
};
如下做了一个相关的图,左边inode进程相关,右边文件系统相关
这是自己画的,若发现错误,欢迎指正。

装载文件系统的过程可以参考 linux内核sys_mount()分析
mount的系统调用是sys_mount, 在内核中是SYSCALL_DEFINE5.
以下是整个调用过程需要调用的函数:
/*
 * sys_mount - entry point of the mount(2) system call.
 * Copies the user-space arguments (type, mountpoint, device, options)
 * into kernel memory, then hands off to do_mount(). The goto ladder
 * frees whatever was allocated before the failure point, in reverse
 * order of allocation.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
int ret;
char *kernel_type;
char *kernel_dir;
char *kernel_dev;
unsigned long data_page;
/* copy the filesystem type string from user space */
ret = copy_mount_string(type, &kernel_type);
if (ret < 0)
goto out_type;
/* copy the mountpoint path from user space */
kernel_dir = getname(dir_name);
if (IS_ERR(kernel_dir)) {
ret = PTR_ERR(kernel_dir);
goto out_dir;
}
/* copy the device name from user space */
ret = copy_mount_string(dev_name, &kernel_dev);
if (ret < 0)
goto out_dev;
/* copy one page of filesystem-specific option data */
ret = copy_mount_options(data, &data_page);
if (ret < 0)
goto out_data;
ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
(void *) data_page);
free_page(data_page);
out_data:
kfree(kernel_dev);
out_dev:
putname(kernel_dir);
out_dir:
kfree(kernel_type);
out_type:
return ret;
}
/*
 * do_mount - carry out the mount once arguments are in kernel space.
 * Looks up the mountpoint, separates the per-mountpoint MNT_* flags
 * from the MS_* command flags, then dispatches to the matching
 * operation (remount, bind, change-type, move, or new mount).
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
unsigned long flags, void *data_page)
{
struct path path;
int retval = 0;
int mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return -EINVAL;
/* guarantee the option page is NUL-terminated */
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* ... and get the mountpoint */
retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
if (retval)
return retval;
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
if (retval)
goto dput_out;
/* Default to relatime unless overriden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
/* strip the flags already folded into mnt_flags plus kernel-internal ones */
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);
/* dispatch on the requested operation */
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&path, dev_name);
else
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
path_put(&path);
return retval;
}
kern_path ----> do_path_lookup ----> path_lookupat
/*
 * path_lookupat - resolve a pathname into nd->path.
 * Initializes the walk (path_init), walks every intermediate component
 * (link_path_walk), resolves the final component plus any trailing
 * symlinks (lookup_last/follow_link), then finalizes the result with
 * complete_walk.
 */
static int path_lookupat(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
struct file *base = NULL;
struct path path;
int err;
/*
* Path walking is largely split up into 2 different synchronisation
* schemes, rcu-walk and ref-walk (explained in
* Documentation/filesystems/path-lookup.txt). These share much of the
* path walk code, but some things particularly setup, cleanup, and
* following mounts are sufficiently divergent that functions are
* duplicated. Typically there is a function foo(), and its RCU
* analogue, foo_rcu().
*
* -ECHILD is the error number of choice (just to avoid clashes) that
* is returned if some aspect of an rcu-walk fails. Such an error must
* be handled by restarting a traditional ref-walk (which will always
* be able to complete).
*/
err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
if (unlikely(err))
return err;
current->total_link_count = 0;
err = link_path_walk(name, nd);
if (!err && !(flags & LOOKUP_PARENT)) {
err = lookup_last(nd, &path);
/* err > 0 means the final component was a symlink: follow it */
while (err > 0) {
void *cookie;
struct path link = path;
nd->flags |= LOOKUP_PARENT;
err = follow_link(&link, nd, &cookie);
if (!err)
err = lookup_last(nd, &path);
put_link(nd, &link, cookie);
}
}
if (!err)
err = complete_walk(nd);
/* a LOOKUP_DIRECTORY result must actually be a directory */
if (!err && nd->flags & LOOKUP_DIRECTORY) {
if (!nd->inode->i_op->lookup) {
path_put(&nd->path);
err = -ENOTDIR;
}
}
if (base)
fput(base);
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
path_put(&nd->root);
nd->root.mnt = NULL;
}
return err;
}
/*
 * do_new_mount - create a brand-new mount of filesystem `type` and
 * attach it at `path`. Requires CAP_SYS_ADMIN; on attach failure the
 * freshly created mount is dropped again.
 */
static int do_new_mount(struct path *path, char *type, int flags,
int mnt_flags, char *name, void *data)
{
	struct vfsmount *mounted;
	int ret;

	if (!type)
		return -EINVAL;

	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* read the superblock and build the vfsmount */
	mounted = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mounted))
		return PTR_ERR(mounted);

	/* splice it into the mount tree at path */
	ret = do_add_mount(real_mount(mounted), path, mnt_flags);
	if (ret)
		mntput(mounted);
	return ret;
}
/*
 * do_add_mount - add a newly created mount to the namespace at path.
 * Rejects stacking the same filesystem twice on the same mountpoint
 * and mounting on top of a symlink.
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
int err;
/* these flags are managed by the kernel, not settable by the caller */
mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
err = lock_mount(path);
if (err)
return err;
err = -EINVAL;
if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
goto unlock;
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
path->mnt->mnt_root == path->dentry)
goto unlock;
/* a symlink can never serve as a mount root */
err = -EINVAL;
if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
goto unlock;
newmnt->mnt.mnt_flags = mnt_flags;
err = graft_tree(newmnt, path);
unlock:
unlock_mount(path);
return err;
}
static int graft_tree(struct mount *mnt, struct path *path)
{
if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
if (S_ISDIR(path->dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
return -ENOTDIR;
if (d_unlinked(path->dentry))
return -ENOENT;
return attach_recursive_mnt(mnt, path, NULL);
}
/*
 * attach_recursive_mnt - attach source_mnt (and its submounts) beneath
 * the mountpoint described by path. A non-NULL parent_path means the
 * mount is being moved rather than newly attached. For a shared
 * destination the mount is first propagated to the peer group, then
 * the whole tree is committed under vfsmount_lock.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
struct path *path, struct path *parent_path)
{
LIST_HEAD(tree_list);
struct mount *dest_mnt = real_mount(path->mnt);
struct dentry *dest_dentry = path->dentry;
struct mount *child, *p;
int err;
/* shared destination: the new mounts need peer group ids */
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
goto out;
}
/* build copies for every member of the destination's peer group */
err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
if (err)
goto out_cleanup_ids;
br_write_lock(vfsmount_lock);
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
if (parent_path) {
/* moving an existing mount: detach, then re-attach at path */
detach_mnt(source_mnt, parent_path);
attach_mnt(source_mnt, path);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
/* fresh mount: record the mountpoint and commit to the tree */
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
commit_tree(source_mnt);
}
/* commit the propagated copies as well */
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
commit_tree(child);
}
br_write_unlock(vfsmount_lock);
return 0;
out_cleanup_ids:
if (IS_MNT_SHARED(dest_mnt))
cleanup_group_ids(source_mnt, NULL);
out:
return err;
}
/*
 * mnt_set_mountpoint - record where a child mount is attached.
 * Pins the parent mount and the mountpoint dentry, and marks the
 * dentry as mounted (DCACHE_MOUNTED) under its d_lock.
 */
void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
struct mount *child_mnt)
{
mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = dget(dentry); /* pin the mountpoint dentry */
child_mnt->mnt_parent = mnt;
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
}
本文深入探讨Linux系统中文件系统的挂载流程,包括挂载命令的内部机制、超级块的作用、dentry与inode的关系,以及vfsmount结构在挂载过程中的作用。通过分析sys_mount系统调用,揭示了挂载新文件系统所需的关键步骤。
683

被折叠的 条评论
为什么被折叠?



