学习Linux已经有一段时间了,最近看了下mount这个系统调用的一些流程,把它用博客记录下来,方便自己以后查找,也可以给那些有需要的人提供一些帮助。
在用户层或者启动脚本中调用mount函数,把一个设备用相应的文件系统挂载起来之后,我们就可以很方便地访问这个设备中的文件;在内核中,mount的入口函数位于fs/namespace.c:
SYSCALL_DEFINE5(mount,char__user *, dev_name,char__user *, dir_name,
char__user *, type, unsignedlong, flags,void__user *, data)
{
intret;
char*kernel_type;
char*kernel_dir;
char*kernel_dev;
unsignedlongdata_page;
ret = copy_mount_string(type, &kernel_type);//复制数据到内核空间
if(ret
gotoout_type;
kernel_dir = getname(dir_name); //复制数据到内核空间
if(IS_ERR(kernel_dir)) {
ret = PTR_ERR(kernel_dir);
gotoout_dir;
}
ret = copy_mount_string(dev_name, &kernel_dev); //复制数据到内核空间
if(ret
gotoout_dev;
ret = copy_mount_options(data, &data_page);//复制数据到内核空间
if(ret
gotoout_data;
ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
(void*) data_page);
free_page(data_page);
out_data:
kfree(kernel_dev);
out_dev:
putname(kernel_dir);
out_dir:
kfree(kernel_type);
out_type:
returnret;
}
用户空间传递了dev_name、dir_name、type、flags和data五个参数到内核中,由于dev_name、dir_name、type和data四个参数都是指针,都指向用户空间的某区域,所以需要用特定的函数将这些数据从用户层拷贝到内核。
这个函数的主要实现都在do_mount函数中:
longdo_mount(char*dev_name,char*dir_name,char*type_page,
unsignedlongflags,void*data_page)
{
structpath path;
intretval = 0;
intmnt_flags = 0;
/* Discard magic */
if((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
if(!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return-EINVAL;
if(data_page)
((char*)data_page)[PAGE_SIZE - 1] = 0;
/* ... and get the mountpoint */
retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
if(retval)
returnretval;
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
if(retval)
gotodput_out;
/* Default to relatime unless overriden */
if(!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags */
if(flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if(flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if(flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if(flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if(flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
if(flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if(flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);
if(flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
elseif(flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC);
elseif(flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
elseif(flags & MS_MOVE)
retval = do_move_mount(&path, dev_name);
else
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
path_put(&path);
returnretval;
}
前面都是对一些指针和标志的判断,函数kern_path用于根据给定的路径字符串查找出将要挂载到哪个目录,查找成功后会通过path这个指针带回查找的结果,之后用do_new_mount这个函数去进行下一步的挂载。
kern_path函数中只调用了函数do_path_lookup
staticintdo_path_lookup(intdfd,constchar*name,
unsignedintflags,structnameidata *nd)
{
intretval = path_init(dfd, name, flags, nd);
if(!retval)
retval = path_walk(name, nd);
if(unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
nd->path.dentry->d_inode))
audit_inode(name, nd->path.dentry);
if(nd->root.mnt) {
path_put(&nd->root);
nd->root.mnt = NULL;
}
returnretval;
}
分为两部分看:第一部分调用path_init,用于初始化查找的根目录;第二部分在根目录的基础上对所给的字符串目录进行逐级查找。
先看path_init
staticintpath_init(intdfd,constchar*name, unsignedintflags,structnameidata *nd)
{
intretval = 0;
intfput_needed;
structfile *file;
nd->last_type = LAST_ROOT;/* if there are only slashes... */
nd->flags = flags;
nd->depth = 0;
nd->root.mnt = NULL;
if(*name=='/') {
set_root(nd);
nd->path = nd->root;
path_get(&nd->root);
}elseif(dfd == AT_FDCWD) {
structfs_struct *fs = current->fs;
read_lock(&fs->lock);
nd->path = fs->pwd;
path_get(&fs->pwd);
read_unlock(&fs->lock);
}else{
structdentry *dentry;
file = fget_light(dfd, &fput_needed);
retval = -EBADF;
if(!file)
gotoout_fail;
dentry = file->f_path.dentry;
retval = -ENOTDIR;
if(!S_ISDIR(dentry->d_inode->i_mode))
gotofput_fail;
retval = file_permission(file, MAY_EXEC);
if(retval)
gotofput_fail;
nd->path = file->f_path;
path_get(&file->f_path);
fput_light(file, fput_needed);
}
return0;
fput_fail:
fput_light(file, fput_needed);
out_fail:
returnretval;
}
这个函数就是一个if..else if..else语句:如果第一个字符是'/',则说明是绝对路径,从当前进程描述符的fs的root成员中得到根目录作为查找起点;否则如果dfd为AT_FDCWD,就以pwd中保存的当前路径作为查找起点;其余情况则以文件描述符dfd所对应的已打开目录作为查找起点。
再来看path_walk
staticintpath_walk(constchar*name,structnameidata *nd)
{
structpath save = nd->path;
intresult;
current->total_link_count = 0;
/* make sure the stuff we saved doesn't go away */
path_get(&save);
result = link_path_walk(name, nd);
if(result == -ESTALE) {
/* nd->path had been dropped */
current->total_link_count = 0;
nd->path = save;
path_get(&nd->path);
nd->flags |= LOOKUP_REVAL;
result = link_path_walk(name, nd);
}
path_put(&save);
returnresult;
}
path_walk函数的代码中只调用了link_path_walk
staticintlink_path_walk(constchar*name,structnameidata *nd)
{
structpath next;
structinode *inode;
interr;
unsignedintlookup_flags = nd->flags;
while(*name=='/') //去掉开头的/字符
name++;
if(!*name)
gotoreturn_reval;
inode = nd->path.dentry->d_inode;
if(nd->depth)
lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
/* At this point we know we have a real path component. */
for(;;) {
unsignedlonghash;
structqstrthis; //临时保存将要查找的目录
unsignedintc;
nd->flags |= LOOKUP_CONTINUE;
err = exec_permission(inode);
if(err)
break;
this.name = name;
c = *(constunsignedchar*)name;
hash = init_name_hash();
do{
name++;
hash = partial_name_hash(c, hash); //计算hash值
c = *(constunsignedchar*)name;
}while(c && (c !='/'));
this.len = name - (constchar*)this.name;
this.hash = end_name_hash(hash);
/* remove trailing slashes? */
if(!c)
gotolast_component; //跳转去处理最后一级目录
while(*++name =='/');
if(!*name)
gotolast_with_slashes;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if(this.name[0] =='.')switch(this.len) {
default:
break;
case2:
if(this.name[1] !='.')
break;
follow_dotdot(nd);
inode = nd->path.dentry->d_inode; //两个点将当前设置为上一级目录
/* fallthrough */
case1:
continue; //只有一个点不做任何处理
}
/* This does the actual lookups.. */
err = do_lookup(nd, &this, &next); //真正的查找函数
if(err)
break;
err = -ENOENT;
inode = next.dentry->d_inode;
if(!inode)
gotoout_dput;
if(inode->i_op->follow_link) {
err = do_follow_link(&next, nd);
if(err)
gotoreturn_err;
err = -ENOENT;
inode = nd->path.dentry->d_inode;
if(!inode)
break;
}else
path_to_nameidata(&next, nd);
err = -ENOTDIR;
if(!inode->i_op->lookup)
break;
continue;
/* here ends the main loop */
last_with_slashes:
lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
/* Clear LOOKUP_CONTINUE iff it was previously unset */
nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
if(lookup_flags & LOOKUP_PARENT)
gotolookup_parent;
if(this.name[0] =='.')switch(this.len) {
default:
break;
case2:
if(this.name[1] !='.')
break;
follow_dotdot(nd);
inode = nd->path.dentry->d_inode;
/* fallthrough */
case1:
gotoreturn_reval;
}
err = do_lookup(nd, &this, &next);
if(err)
break;
inode = next.dentry->d_inode;
if(follow_on_final(inode, lookup_flags)) {
err = do_follow_link(&next, nd);
if(err)
gotoreturn_err;
inode = nd->path.dentry->d_inode;
}else
path_to_nameidata(&next, nd);
err = -ENOENT;
if(!inode)
break;
if(lookup_flags & LOOKUP_DIRECTORY) {
err = -ENOTDIR;
if(!inode->i_op->lookup)
break;
}
gotoreturn_base;
lookup_parent:
nd->last =this;
nd->last_type = LAST_NORM;
if(this.name[0] !='.')
gotoreturn_base;
if(this.len == 1)
nd->last_type = LAST_DOT;
elseif(this.len == 2 &&this.name[1] =='.')
nd->last_type = LAST_DOTDOT;
else
gotoreturn_base;
return_reval:
/*
* We bypassed the ordinary revalidation routines.
* We may need to check the cached dentry for staleness.
*/
if(nd->path.dentry && nd->path.dentry->d_sb &&
(nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
err = -ESTALE;
/* Note: we do not d_invalidate() */
if(!nd->path.dentry->d_op->d_revalidate(
nd->path.dentry, nd))
break;
}
return_base:
return0;
out_dput:
path_put_conditional(&next, nd);
break;
}
path_put(&nd->path);
return_err:
returnerr;
}
link_path_walk函数先把给的字符串进行拆分,取出每级目录的名字,然后调用do_lookup函数在当前的目录基础上进行查找,直到查完整个字符串。
staticintdo_lookup(structnameidata *nd,structqstr *name,structpath *path)
{
structvfsmount *mnt = nd->path.mnt;
structdentry *dentry, *parent;
structinode *dir;
/*
* See if the low-level filesystem might want
* to use its own hash..
*/
if(nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
interr = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
if(err
returnerr;
}
dentry = __d_lookup(nd->path.dentry, name);
if(!dentry)
gotoneed_lookup;
if(dentry->d_op && dentry->d_op->d_revalidate)
gotoneed_revalidate;
done:
path->mnt = mnt;
path->dentry = dentry;
__follow_mount(path);
return0;
need_lookup:
parent = nd->path.dentry;
dir = parent->d_inode;
mutex_lock(&dir->i_mutex);
/*
* First re-do the cached lookup just in case it was created
* while we waited for the directory semaphore..
*
* FIXME! This could use version numbering or similar to
* avoid unnecessary cache lookups.
*
* The "dcache_lock" is purely to protect the RCU list walker
* from concurrent renames at this point (we mustn't get false
* negatives from the RCU list walk here, unlike the optimistic
* fast walk).
*
* so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
*/
dentry = d_lookup(parent, name);
if(!dentry) {
structdentry *new;
/* Don't create child dentry for a dead directory. */
dentry = ERR_PTR(-ENOENT);
if(IS_DEADDIR(dir))
gotoout_unlock;
new= d_alloc(parent, name);
dentry = ERR_PTR(-ENOMEM);
if(new) {
dentry = dir->i_op->lookup(dir,new, nd);
if(dentry)
dput(new);
else
dentry =new;
}
out_unlock:
mutex_unlock(&dir->i_mutex);
if(IS_ERR(dentry))
gotofail;
gotodone;
}
/*
* Uhhuh! Nasty case: the cache was re-populated while
* we waited on the semaphore. Need to revalidate.
*/
mutex_unlock(&dir->i_mutex);
if(dentry->d_op && dentry->d_op->d_revalidate) {
dentry = do_revalidate(dentry, nd);
if(!dentry)
dentry = ERR_PTR(-ENOENT);
}
if(IS_ERR(dentry))
gotofail;
gotodone;
need_revalidate:
dentry = do_revalidate(dentry, nd);
if(!dentry)
gotoneed_lookup;
if(IS_ERR(dentry))
gotofail;
gotodone;
fail:
returnPTR_ERR(dentry);
}
do_lookup先调用__d_lookup进行查找,如果查找失败,再去调用d_lookup,d_lookup其实内部还是调用__d_lookup函数,只是在这个基础上会使用顺序锁(seqlock)保护起来,以防止重命名造成的同步问题;如果都查找失败就新分配一个dentry,并调用目录inode的lookup操作把它连接起来,函数的最后会调用__follow_mount,用于在当前dentry上查找是否存在挂载点,并用最新的挂载点的dentry和vfsmount对path进行重新赋值。__follow_mount的代码如下:
staticint__follow_mount(structpath *path)
{
intres = 0;
while(d_mountpoint(path->dentry)) {
structvfsmount *mounted = lookup_mnt(path);
if(!mounted)
break;
dput(path->dentry);
if(res)
mntput(path->mnt);
path->mnt = mounted;
path->dentry = dget(mounted->mnt_root);
res = 1;
}
returnres;
}
再看下__d_lookup函数的实现:
structdentry * __d_lookup(structdentry * parent,structqstr * name)
{
unsignedintlen = name->len;
unsignedinthash = name->hash;
constunsignedchar*str = name->name;
structhlist_head *head = d_hash(parent,hash);
structdentry *found = NULL;
structhlist_node *node;
structdentry *dentry;
rcu_read_lock();
hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
structqstr *qstr;
if(dentry->d_name.hash != hash)
continue;
if(dentry->d_parent != parent)
continue;
spin_lock(&dentry->d_lock);
/*
* Recheck the dentry after taking the lock - d_move may have
* changed things. Don't bother checking the hash because we're
* about to compare the whole name anyway.
*/
if(dentry->d_parent != parent)
gotonext;
/* non-existing due to RCU? */
if(d_unhashed(dentry))
gotonext;
/*
* It is safe to compare names since d_move() cannot
* change the qstr (protected by d_lock).
*/
qstr = &dentry->d_name;
if(parent->d_op && parent->d_op->d_compare) {
if(parent->d_op->d_compare(parent, qstr, name))
gotonext;
}else{//如果d_compare函数没有实现就匹配字符串,对没有特殊要求的文件系统都可以匹配字符串即可
if(qstr->len != len)
gotonext;
if(memcmp(qstr->name, str, len))
gotonext;
}
atomic_inc(&dentry->d_count);
found = dentry;
spin_unlock(&dentry->d_lock);
break;
next:
spin_unlock(&dentry->d_lock);
}
rcu_read_unlock();
returnfound;
}
__d_lookup函数会遍历父目录的hash表找出相匹配的子目录。
到这里整个挂载目录的查找就结束了,kern_path完成之后会通过path变量带回挂载点的dentry和父文件系统的vfsmount到do_mount函数中。
do_mount继续调用do_new_mount函数:do_new_mount分为两部分,第一部分是生成挂载所需的超级块等数据结构;第二部分用于将第一部分中生成的结构加到内核中去。
先看第一部分,通过do_kern_mount实现,do_kern_mount又调用了vfs_kern_mount:
structvfsmount *
vfs_kern_mount(structfile_system_type *type,intflags,constchar*name,void*data)
{
structvfsmount *mnt;
char*secdata = NULL;
interror;
if(!type)
returnERR_PTR(-ENODEV);
error = -ENOMEM;
mnt = alloc_vfsmnt(name);
if(!mnt)
gotoout;
if(flags & MS_KERNMOUNT)
mnt->mnt_flags = MNT_INTERNAL;
if(data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
secdata = alloc_secdata();
if(!secdata)
gotoout_mnt;
error = security_sb_copy_data(data, secdata);
if(error)
gotoout_free_secdata;
}
error = type->get_sb(type, flags, name, data, mnt);
if(error
gotoout_free_secdata;
BUG_ON(!mnt->mnt_sb);
WARN_ON(!mnt->mnt_sb->s_bdi);
mnt->mnt_sb->s_flags |= MS_BORN;
error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
if(error)
gotoout_sb;
/*
* filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
* but s_maxbytes was an unsigned long long for many releases. Throw
* this warning for a little while to try and catch filesystems that
* violate this rule. This warning should be either removed or
* converted to a BUG() in 2.6.34.
*/
WARN((mnt->mnt_sb->s_maxbytes s_maxbytes to "
"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
free_secdata(secdata);
returnmnt;
out_sb:
dput(mnt->mnt_root);
deactivate_locked_super(mnt->mnt_sb);
out_free_secdata:
free_secdata(secdata);
out_mnt:
free_vfsmnt(mnt);
out:
returnERR_PTR(error);
}
这部分的重点在于type->get_sb(type, flags, name, data, mnt); 调用特定文件系统的get_sb函数生成超级块对象和挂载点等数据结构。
第二部分的代码为函数do_add_mount
intdo_add_mount(structvfsmount *newmnt,structpath *path,
intmnt_flags,structlist_head *fslist)
{
interr;
mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
down_write(&namespace_sem);
/* Something was mounted here while we slept */
while(d_mountpoint(path->dentry) &&
follow_down(path))
;
err = -EINVAL;
if(!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
gotounlock;
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
if(path->mnt->mnt_sb == newmnt->mnt_sb &&
path->mnt->mnt_root == path->dentry)
gotounlock;
err = -EINVAL;
if(S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
gotounlock;
newmnt->mnt_flags = mnt_flags;
if((err = graft_tree(newmnt, path)))
gotounlock;
if(fslist)/* add to the specified expiration list */
list_add_tail(&newmnt->mnt_expire, fslist);
up_write(&namespace_sem);
return0;
unlock:
up_write(&namespace_sem);
mntput(newmnt);
returnerr;
}
继续调用graft_tree
staticintgraft_tree(structvfsmount *mnt,structpath *path)
{
interr;
if(mnt->mnt_sb->s_flags & MS_NOUSER)
return-EINVAL;
if(S_ISDIR(path->dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt_root->d_inode->i_mode))
return-ENOTDIR;
err = -ENOENT;
mutex_lock(&path->dentry->d_inode->i_mutex);
if(cant_mount(path->dentry))
gotoout_unlock;
if(!d_unlinked(path->dentry))
err =attach_recursive_mnt(mnt, path, NULL);
out_unlock:
mutex_unlock(&path->dentry->d_inode->i_mutex);
returnerr;
}
调用attach_recursive_mnt
staticintattach_recursive_mnt(structvfsmount *source_mnt,
structpath *path,structpath *parent_path)
{
LIST_HEAD(tree_list);
structvfsmount *dest_mnt = path->mnt;
structdentry *dest_dentry = path->dentry;
structvfsmount *child, *p;
interr;
if(IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt,true);
if(err)
gotoout;
}
err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
if(err)
gotoout_cleanup_ids;
spin_lock(&vfsmount_lock);
if(IS_MNT_SHARED(dest_mnt)) {
for(p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
if(parent_path) {
detach_mnt(source_mnt, parent_path);
attach_mnt(source_mnt, path);
touch_mnt_namespace(parent_path->mnt->mnt_ns);
}else{
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
commit_tree(source_mnt);
}
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
commit_tree(child);
}
spin_unlock(&vfsmount_lock);
return0;
out_cleanup_ids:
if(IS_MNT_SHARED(dest_mnt))
cleanup_group_ids(source_mnt, NULL);
out:
returnerr;
}
调用mnt_set_mountpoint
voidmnt_set_mountpoint(structvfsmount *mnt,structdentry *dentry,
structvfsmount *child_mnt)
{
child_mnt->mnt_parent = mntget(mnt);//设置父文件系统
child_mnt->mnt_mountpoint = dget(dentry);//设置挂载点目录项
dentry->d_mounted++;//挂载计数加1
}
到这里整个mount的流程就分析完毕了,mount的流程可以分为以下几个步骤:
一、查找给定挂载目录中的目录项结构和挂载点;
二、通过设备节点和文件系统类型生成新挂载文件系统的超级块等结构;
三、将二中生成的结构连接到一中查找到的路径中