Linux那些事儿之我是Sysfs(7)dentry与inode

本文深入探讨了文件系统中dentry(目录项)和inode(索引节点)的概念及其作用,详细介绍了它们如何共同构成文件的内存表示,并解释了它们与磁盘文件之间的对应关系。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

我们在进程中要怎样去描述一个文件呢?我们用目录项(dentry)和索引节点(inode)。它们的定义如下:

include/linux/dcache.h

struct dentry {
	/* RCU lookup touched fields */
	unsigned int d_flags;		/* protected by d_lock */
	seqcount_t d_seq;		/* per dentry seqlock */
	struct hlist_bl_node d_hash;	/* lookup hash list */
	struct dentry *d_parent;	/* parent directory */
	struct qstr d_name;
	struct inode *d_inode;		/* Where the name belongs to - NULL is
					 * negative */
	unsigned char d_iname[DNAME_INLINE_LEN];	/* small names */

	/* Ref lookup also touches following */
	struct lockref d_lockref;	/* per-dentry lock and refcount */
	const struct dentry_operations *d_op;
	struct super_block *d_sb;	/* The root of the dentry tree */
	unsigned long d_time;		/* used by d_revalidate */
	void *d_fsdata;			/* fs-specific data */

	struct list_head d_lru;		/* LRU list */
	struct list_head d_child;	/* child of parent list */
	struct list_head d_subdirs;	/* our children */
	/*
	 * d_alias and d_rcu can share memory
	 */
	union {
		struct hlist_node d_alias;	/* inode alias list */
	 	struct rcu_head d_rcu;
	} d_u;
};


include/linux/fs.h

struct inode {
	umode_t			i_mode;
	unsigned short		i_opflags;
	kuid_t			i_uid;
	kgid_t			i_gid;
	unsigned int		i_flags;

#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl	*i_acl;
	struct posix_acl	*i_default_acl;
#endif

	const struct inode_operations	*i_op;
	struct super_block	*i_sb;
	struct address_space	*i_mapping;

#ifdef CONFIG_SECURITY
	void			*i_security;
#endif

	/* Stat data, not accessed from path walking */
	unsigned long		i_ino;
	/*
	 * Filesystems may only read i_nlink directly.  They shall use the
	 * following functions for modification:
	 *
	 *    (set|clear|inc|drop)_nlink
	 *    inode_(inc|dec)_link_count
	 */
	union {
		const unsigned int i_nlink;
		unsigned int __i_nlink;
	};
	dev_t			i_rdev;
	loff_t			i_size;
	struct timespec		i_atime;
	struct timespec		i_mtime;
	struct timespec		i_ctime;
	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
	unsigned short          i_bytes;
	unsigned int		i_blkbits;
	blkcnt_t		i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
	seqcount_t		i_size_seqcount;
#endif

	/* Misc */
	unsigned long		i_state;
	struct mutex		i_mutex;

	unsigned long		dirtied_when;	/* jiffies of first dirtying */

	struct hlist_node	i_hash;
	struct list_head	i_wb_list;	/* backing dev IO list */
	struct list_head	i_lru;		/* inode LRU list */
	struct list_head	i_sb_list;
	union {
		struct hlist_head	i_dentry;
		struct rcu_head		i_rcu;
	};
	u64			i_version;
	atomic_t		i_count;
	atomic_t		i_dio_count;
	atomic_t		i_writecount;
#ifdef CONFIG_IMA
	atomic_t		i_readcount; /* struct files open RO */
#endif
	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
	struct file_lock	*i_flock;
	struct address_space	i_data;
#ifdef CONFIG_QUOTA
	struct dquot		*i_dquot[MAXQUOTAS];
#endif
	struct list_head	i_devices;
	union {
		struct pipe_inode_info	*i_pipe;
		struct block_device	*i_bdev;
		struct cdev		*i_cdev;
	};

	__u32			i_generation;

#ifdef CONFIG_FSNOTIFY
	__u32			i_fsnotify_mask; /* all events this inode cares about */
	struct hlist_head	i_fsnotify_marks;
#endif

	void			*i_private; /* fs or device private pointer */
};

 

        所谓"文件", 就是按一定的形式存储在介质上的信息,所以一个文件其实包含了两方面的信息,一是存储的数据本身,二是有关该文件的组织和管理的信息。在内存中, 每个文件都有一个dentry(目录项)和inode(索引节点)结构,dentry记录着文件名,上级目录等信息,正是它形成了我们所看到的树状结构;而有关该文件的组织和管理的信息主要存放inode里面,它记录着文件在存储介质上的位置与分布。同时dentry->d_inode指向相应的inode结构。dentry与inode是多对一的关系,因为有可能一个文件有好几个文件名(硬链接, hard link, 可以参考这个网页 http://www.ugrad.cs.ubc.ca/~cs219/CourseNotes/Unix/commands-links.html)。

所有的dentry用d_parent和d_child连接起来,就形成了我们熟悉的树状结构。

inode代表的是物理意义上的文件,通过inode可以得到一个数组,这个数组记录了文件内容的位置,如该文件位于硬盘的第3,8,10块,那么这个数组的内容就是3,8,10。其索引节点号inode->i_ino,在同一个文件系统中是唯一的,内核只要根据i_ino,就可以计算出它对应的inode在介质上的位置。就硬盘来说,根据i_ino就可以计算出它对应的inode属于哪个块(block),从而找到相应的inode结构。但仅仅用inode还是无法描述出所有的文件系统,对于某一种特定的文件系统而言,比如ext3,在内存中用ext3_inode_info描述。他是一个包含inode的"容器"。

fs/ext3/ext3.h

struct ext3_inode_info {
	__le32	i_data[15];	/* unconverted */
	__u32	i_flags;
#ifdef EXT3_FRAGMENTS
	__u32	i_faddr;
	__u8	i_frag_no;
	__u8	i_frag_size;
#endif
	ext3_fsblk_t	i_file_acl;
	__u32	i_dir_acl;
	__u32	i_dtime;

	/*
	 * i_block_group is the number of the block group which contains
	 * this file's inode.  Constant across the lifetime of the inode,
	 * it is ued for making block allocation decisions - we try to
	 * place a file's data blocks near its inode block, and new inodes
	 * near to their parent directory's inode.
	 */
	__u32	i_block_group;
	unsigned long	i_state_flags;	/* Dynamic state flags for ext3 */

	/* block reservation info */
	struct ext3_block_alloc_info *i_block_alloc_info;

	__u32	i_dir_start_lookup;
#ifdef CONFIG_EXT3_FS_XATTR
	/*
	 * Extended attributes can be read independently of the main file
	 * data. Taking i_mutex even when reading would cause contention
	 * between readers of EAs and writers of regular file data, so
	 * instead we synchronize on xattr_sem when reading or changing
	 * EAs.
	 */
	struct rw_semaphore xattr_sem;
#endif

	struct list_head i_orphan;	/* unlinked but open inodes */

	/*
	 * i_disksize keeps track of what the inode size is ON DISK, not
	 * in memory.  During truncate, i_size is set to the new size by
	 * the VFS prior to calling ext3_truncate(), but the filesystem won't
	 * set i_disksize to 0 until the truncate is actually under way.
	 *
	 * The intent is that i_disksize always represents the blocks which
	 * are used by this file.  This allows recovery to restart truncate
	 * on orphans if we crash during truncate.  We actually write i_disksize
	 * into the on-disk inode when writing inodes out, instead of i_size.
	 *
	 * The only time when i_disksize and i_size may be different is when
	 * a truncate is in progress.  The only things which change i_disksize
	 * are ext3_get_block (growth) and ext3_truncate (shrinkth).
	 */
	loff_t	i_disksize;

	/* on-disk additional length */
	__u16 i_extra_isize;

	/*
	 * truncate_mutex is for serialising ext3_truncate() against
	 * ext3_getblock().  In the 2.4 ext2 design, great chunks of inode's
	 * data tree are chopped off during truncate. We can't do that in
	 * ext3 because whenever we perform intermediate commits during
	 * truncate, the inode and all the metadata blocks *must* be in a
	 * consistent state which allows truncation of the orphans to restart
	 * during recovery.  Hence we must fix the get_block-vs-truncate race
	 * by other means, so we have truncate_mutex.
	 */
	struct mutex truncate_mutex;

	/*
	 * Transactions that contain inode's metadata needed to complete
	 * fsync and fdatasync, respectively.
	 */
	atomic_t i_sync_tid;
	atomic_t i_datasync_tid;

	struct inode vfs_inode;
};


 

__le32 i_data[15]这个数组就是上一段中所提到的那个数组。

注意,在遥远的2.4的古代,不同文件系统索引节点的内存映像(ext3_inode_info,reiserfs_inode_info,msdos_inode_info ...)都是用一个union内嵌在inode数据结构中的. 但inode作为一种非常基本的数据结构而言,这样搞太大了,不利于快速的分配和回收。但是后来发明了container_of(...)这种方法后,就把union移到了外部,我们可以用类似container_of(inode, struct ext3_inode_info, vfs_inode),从inode出发,得到其的"容器"。

dentry和inode终究都是在内存中的,它们的原始信息必须要有一个载体。否则断电之后岂不是玩完了?且听我慢慢道来。

文件可以分为磁盘文件,设备文件,和特殊文件三种。设备文件暂且不表。

磁盘文件
就磁盘文件而言,dentry和inode的载体在存储介质(磁盘)上。对于像ext3这样的磁盘文件来说,存储介质中的目录项和索引节点载体如下,

fs/ext3/ext3.h

struct ext3_inode {
	__le16	i_mode;		/* File mode */
	__le16	i_uid;		/* Low 16 bits of Owner Uid */
	__le32	i_size;		/* Size in bytes */
	__le32	i_atime;	/* Access time */
	__le32	i_ctime;	/* Creation time */
	__le32	i_mtime;	/* Modification time */
	__le32	i_dtime;	/* Deletion Time */
	__le16	i_gid;		/* Low 16 bits of Group Id */
	__le16	i_links_count;	/* Links count */
	__le32	i_blocks;	/* Blocks count */
	__le32	i_flags;	/* File flags */
	union {
		struct {
			__u32  l_i_reserved1;
		} linux1;
		struct {
			__u32  h_i_translator;
		} hurd1;
		struct {
			__u32  m_i_reserved1;
		} masix1;
	} osd1;				/* OS dependent 1 */
	__le32	i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
	__le32	i_generation;	/* File version (for NFS) */
	__le32	i_file_acl;	/* File ACL */
	__le32	i_dir_acl;	/* Directory ACL */
	__le32	i_faddr;	/* Fragment address */
	union {
		struct {
			__u8	l_i_frag;	/* Fragment number */
			__u8	l_i_fsize;	/* Fragment size */
			__u16	i_pad1;
			__le16	l_i_uid_high;	/* these 2 fields    */
			__le16	l_i_gid_high;	/* were reserved2[0] */
			__u32	l_i_reserved2;
		} linux2;
		struct {
			__u8	h_i_frag;	/* Fragment number */
			__u8	h_i_fsize;	/* Fragment size */
			__u16	h_i_mode_high;
			__u16	h_i_uid_high;
			__u16	h_i_gid_high;
			__u32	h_i_author;
		} hurd2;
		struct {
			__u8	m_i_frag;	/* Fragment number */
			__u8	m_i_fsize;	/* Fragment size */
			__u16	m_pad1;
			__u32	m_i_reserved2[2];
		} masix2;
	} osd2;				/* OS dependent 2 */
	__le16	i_extra_isize;
	__le16	i_pad1;
};

 

fs/ext3/ext3.h

struct ext3_dir_entry_2 {
	__le32	inode;			/* Inode number */
	__le16	rec_len;		/* Directory entry length */
	__u8	name_len;		/* Name length */
	__u8	file_type;
	char	name[EXT3_NAME_LEN];	/* File name */
};


__le32 i_block[EXT3_N_BLOCKS];

i_block数组指示了文件的内容所存放的地点(在硬盘上的位置)。

ext3_inode是放在索引节点区,而ext3_dir_entry_2是以文件内容的形式存放在数据区。我们只要知道了ino,由于ext3_inode大小已知,我们就可以计算出ext3_inode在索引节点区的位置( ino * sizeof(ext3_inode) ),而得到了ext3_inode,我们根据i_block就可以知道这个文件的数据存放的地点。将磁盘上ext3_inode的内容读入到ext3_inode_info中的函数是ext3_read_inode()。以一个有100 block的硬盘为例,一个文件系统的组织布局大致如下图。位图区中的每一位表示每一个相应的对象有没有被使用。

 

特殊文件
特殊文件在内存中有inode和dentry数据结构,但是不一定在存储介质上有"索引节点",它断电之后的确就玩完了,所以不需要什么载体。当从一个特殊文件读时,所读出的数据是由系统内部按一定的规则临时生成的,或从内存中收集,加工出来的。sysfs里面就是典型的特殊文件。它存储的信息都是由系统动态的生成的,它动态的包含了整个机器的硬件资源情况。从sysfs读写就相当于向kobject层次结构提取数据。

还请注意, 我们谈到目录项和索引节点时,有两种含义。一种是在存储介质(硬盘)中的(如ext3_inode),一种是在内存中的,后者是根据前者生成的。内存中的表示就是dentry和inode,它是VFS中的一层,不管什么样的文件系统,最后在内存中描述它的都是dentry和inode结构。我们使用不同的文件系统,就是将它们各自的文件信息都抽象到dentry和inode中去。这样对于高层来说,我们就可以不关心底层的实现,我们使用的都是一系列标准的函数调用。这就是VFS的精髓,实际上就是面向对象。

我们在进程中打开一个文件F,实际上就是要在内存中建立F的dentry,和inode结构,并让它们与进程结构联系来,把VFS中定义的接口给接起来。我们来看一看这个经典的图。这张图之于文件系统,就像每天爱你多一些之于张学友,番茄炒蛋之于复旦南区食堂,刻骨铭心。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值