linux mount 函数实现,Linux之mount流程分析-CSDN博客

学习Linux已经有一段时间了，最近看了下mount这个系统调用的一些流程，把它用博客记录下来，方便自己以后查找，也可以给那些有需要的人提供一些帮助。

当在用户层或者启动脚本中调用mount函数，把一个设备用相应的文件系统挂载起来之后，我们就可以很方便地访问这个设备中的文件；在内核中，mount的入口函数位于fs/namespace.c：

SYSCALL_DEFINE5(mount,char__user *, dev_name,char__user *, dir_name,

char__user *, type, unsignedlong, flags,void__user *, data)

{

intret;

char*kernel_type;

char*kernel_dir;

char*kernel_dev;

unsignedlongdata_page;

ret = copy_mount_string(type, &kernel_type);//复制数据到内核空间

if(ret

gotoout_type;

kernel_dir = getname(dir_name); //复制数据到内核空间

if(IS_ERR(kernel_dir)) {

ret = PTR_ERR(kernel_dir);

gotoout_dir;

}

ret = copy_mount_string(dev_name, &kernel_dev); //复制数据到内核空间

if(ret

gotoout_dev;

ret = copy_mount_options(data, &data_page);//复制数据到内核空间

if(ret

gotoout_data;

ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,

(void*) data_page);

free_page(data_page);

out_data:

kfree(kernel_dev);

out_dev:

putname(kernel_dir);

out_dir:

kfree(kernel_type);

out_type:

returnret;

}

用户空间传递了dev_name、dir_name、type、flags和data五个参数到内核中，由于dev_name、dir_name、type和data四个参数都是指针，都指向用户空间的某区域，所以需要用特定的函数将这些数据从用户层拷贝到内核。

这个函数的主要实现都在do_mount函数中：

longdo_mount(char*dev_name,char*dir_name,char*type_page,

unsignedlongflags,void*data_page)

{

structpath path;

intretval = 0;

intmnt_flags = 0;

/* Discard magic */

if((flags & MS_MGC_MSK) == MS_MGC_VAL)

flags &= ~MS_MGC_MSK;

/* Basic sanity checks */

if(!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))

return-EINVAL;

if(data_page)

((char*)data_page)[PAGE_SIZE - 1] = 0;

/* ... and get the mountpoint */

retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);

if(retval)

returnretval;

retval = security_sb_mount(dev_name, &path,

type_page, flags, data_page);

if(retval)

gotodput_out;

/* Default to relatime unless overriden */

if(!(flags & MS_NOATIME))

mnt_flags |= MNT_RELATIME;

/* Separate the per-mountpoint flags */

if(flags & MS_NOSUID)

mnt_flags |= MNT_NOSUID;

if(flags & MS_NODEV)

mnt_flags |= MNT_NODEV;

if(flags & MS_NOEXEC)

mnt_flags |= MNT_NOEXEC;

if(flags & MS_NOATIME)

mnt_flags |= MNT_NOATIME;

if(flags & MS_NODIRATIME)

mnt_flags |= MNT_NODIRATIME;

if(flags & MS_STRICTATIME)

mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);

if(flags & MS_RDONLY)

mnt_flags |= MNT_READONLY;

MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |

MS_STRICTATIME);

if(flags & MS_REMOUNT)

retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,

data_page);

elseif(flags & MS_BIND)

retval = do_loopback(&path, dev_name, flags & MS_REC);

elseif(flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))

retval = do_change_type(&path, flags);

elseif(flags & MS_MOVE)

retval = do_move_mount(&path, dev_name);

else

retval = do_new_mount(&path, type_page, flags, mnt_flags,

dev_name, data_page);

dput_out:

path_put(&path);

returnretval;

}

前面都是对一些指针和标志的检查，函数kern_path用于根据给定的字符串查找出将要挂载在哪个目录中，查找成功会通过path这个指针带回查找的结果，之后（对于普通的新挂载）用do_new_mount这个函数去进行下一步的挂载。

kern_path函数中只调用了函数do_path_lookup

staticintdo_path_lookup(intdfd,constchar*name,

unsignedintflags,structnameidata *nd)

{

intretval = path_init(dfd, name, flags, nd);

if(!retval)

retval = path_walk(name, nd);

if(unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&

nd->path.dentry->d_inode))

audit_inode(name, nd->path.dentry);

if(nd->root.mnt) {

path_put(&nd->root);

nd->root.mnt = NULL;

}

returnretval;

}

分为两部分看：第一部分调用path_init，用于初始化查找的根目录；第二部分在根目录的基础上对所给的字符串目录进行逐级查找。

先看path_init

staticintpath_init(intdfd,constchar*name, unsignedintflags,structnameidata *nd)

{

intretval = 0;

intfput_needed;

structfile *file;

nd->last_type = LAST_ROOT;/* if there are only slashes... */

nd->flags = flags;

nd->depth = 0;

nd->root.mnt = NULL;

if(*name=='/') {

set_root(nd);

nd->path = nd->root;

path_get(&nd->root);

}elseif(dfd == AT_FDCWD) {

structfs_struct *fs = current->fs;

read_lock(&fs->lock);

nd->path = fs->pwd;

path_get(&fs->pwd);

read_unlock(&fs->lock);

}else{

structdentry *dentry;

file = fget_light(dfd, &fput_needed);

retval = -EBADF;

if(!file)

gotoout_fail;

dentry = file->f_path.dentry;

retval = -ENOTDIR;

if(!S_ISDIR(dentry->d_inode->i_mode))

gotofput_fail;

retval = file_permission(file, MAY_EXEC);

if(retval)

gotofput_fail;

nd->path = file->f_path;

path_get(&file->f_path);

fput_light(file, fput_needed);

}

return0;

fput_fail:

fput_light(file, fput_needed);

out_fail:

returnretval;

}

这个函数就是一个if...else if...else语句：如果第一个字符是'/'，则说明是绝对路径，从当前进程描述符的fs的root成员中得到根目录；否则如果dfd为AT_FDCWD，就把pwd中保存的当前路径作为查找的起点；其余情况则以dfd对应的已打开目录作为起点（要求它是目录并且具有执行权限）。

再来看path_walk

staticintpath_walk(constchar*name,structnameidata *nd)

{

structpath save = nd->path;

intresult;

current->total_link_count = 0;

/* make sure the stuff we saved doesn't go away */

path_get(&save);

result = link_path_walk(name, nd);

if(result == -ESTALE) {

/* nd->path had been dropped */

current->total_link_count = 0;

nd->path = save;

path_get(&nd->path);

nd->flags |= LOOKUP_REVAL;

result = link_path_walk(name, nd);

}

path_put(&save);

returnresult;

}

path_walk函数主要调用link_path_walk；如果返回-ESTALE，则加上LOOKUP_REVAL标志后从保存的起点再重试一次：

staticintlink_path_walk(constchar*name,structnameidata *nd)

{

structpath next;

structinode *inode;

interr;

unsignedintlookup_flags = nd->flags;

while(*name=='/') //去掉开头的/字符

name++;

if(!*name)

gotoreturn_reval;

inode = nd->path.dentry->d_inode;

if(nd->depth)

lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

/* At this point we know we have a real path component. */

for(;;) {

unsignedlonghash;

structqstrthis; //临时保存将要查找的目录

unsignedintc;

nd->flags |= LOOKUP_CONTINUE;

err = exec_permission(inode);

if(err)

break;

this.name = name;

c = *(constunsignedchar*)name;

hash = init_name_hash();

do{

name++;

hash = partial_name_hash(c, hash); //计算hash值

c = *(constunsignedchar*)name;

}while(c && (c !='/'));

this.len = name - (constchar*)this.name;

this.hash = end_name_hash(hash);

/* remove trailing slashes? */

if(!c)

gotolast_component; //跳转去处理最后一级目录

while(*++name =='/');

if(!*name)

gotolast_with_slashes;

* "." and ".." are special - ".." especially so because it has

* to be able to know about the current root directory and

* parent relationships.

if(this.name[0] =='.')switch(this.len) {

default:

break;

case2:

if(this.name[1] !='.')

break;

follow_dotdot(nd);

inode = nd->path.dentry->d_inode; //两个点将当前设置为上一级目录

/* fallthrough */

case1:

continue; //只有一个点不做任何处理

}

/* This does the actual lookups.. */

err = do_lookup(nd, &this, &next); //真正的查找函数

if(err)

break;

err = -ENOENT;

inode = next.dentry->d_inode;

if(!inode)

gotoout_dput;

if(inode->i_op->follow_link) {

err = do_follow_link(&next, nd);

if(err)

gotoreturn_err;

err = -ENOENT;

inode = nd->path.dentry->d_inode;

if(!inode)

break;

}else

path_to_nameidata(&next, nd);

err = -ENOTDIR;

if(!inode->i_op->lookup)

break;

continue;

/* here ends the main loop */

last_with_slashes:

lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

last_component:

/* Clear LOOKUP_CONTINUE iff it was previously unset */

nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;

if(lookup_flags & LOOKUP_PARENT)

gotolookup_parent;

if(this.name[0] =='.')switch(this.len) {

default:

break;

case2:

if(this.name[1] !='.')

break;

follow_dotdot(nd);

inode = nd->path.dentry->d_inode;

/* fallthrough */

case1:

gotoreturn_reval;

}

err = do_lookup(nd, &this, &next);

if(err)

break;

inode = next.dentry->d_inode;

if(follow_on_final(inode, lookup_flags)) {

err = do_follow_link(&next, nd);

if(err)

gotoreturn_err;

inode = nd->path.dentry->d_inode;

}else

path_to_nameidata(&next, nd);

err = -ENOENT;

if(!inode)

break;

if(lookup_flags & LOOKUP_DIRECTORY) {

err = -ENOTDIR;

if(!inode->i_op->lookup)

break;

}

gotoreturn_base;

lookup_parent:

nd->last =this;

nd->last_type = LAST_NORM;

if(this.name[0] !='.')

gotoreturn_base;

if(this.len == 1)

nd->last_type = LAST_DOT;

elseif(this.len == 2 &&this.name[1] =='.')

nd->last_type = LAST_DOTDOT;

else

gotoreturn_base;

return_reval:

* We bypassed the ordinary revalidation routines.

* We may need to check the cached dentry for staleness.

if(nd->path.dentry && nd->path.dentry->d_sb &&

(nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {

err = -ESTALE;

/* Note: we do not d_invalidate() */

if(!nd->path.dentry->d_op->d_revalidate(

nd->path.dentry, nd))

break;

}

return_base:

return0;

out_dput:

path_put_conditional(&next, nd);

break;

}

path_put(&nd->path);

return_err:

returnerr;

}

link_path_walk函数先把给定的字符串进行拆分，取出每级目录的名字，然后调用do_lookup函数在当前目录的基础上进行查找，直到查完整个字符串。

staticintdo_lookup(structnameidata *nd,structqstr *name,structpath *path)

{

structvfsmount *mnt = nd->path.mnt;

structdentry *dentry, *parent;

structinode *dir;

* See if the low-level filesystem might want

* to use its own hash..

if(nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {

interr = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);

if(err

returnerr;

}

dentry = __d_lookup(nd->path.dentry, name);

if(!dentry)

gotoneed_lookup;

if(dentry->d_op && dentry->d_op->d_revalidate)

gotoneed_revalidate;

done:

path->mnt = mnt;

path->dentry = dentry;

__follow_mount(path);

return0;

need_lookup:

parent = nd->path.dentry;

dir = parent->d_inode;

mutex_lock(&dir->i_mutex);

* First re-do the cached lookup just in case it was created

* while we waited for the directory semaphore..

* FIXME! This could use version numbering or similar to

* avoid unnecessary cache lookups.

* The "dcache_lock" is purely to protect the RCU list walker

* from concurrent renames at this point (we mustn't get false

* negatives from the RCU list walk here, unlike the optimistic

* fast walk).

* so doing d_lookup() (with seqlock), instead of lockfree __d_lookup

dentry = d_lookup(parent, name);

if(!dentry) {

structdentry *new;

/* Don't create child dentry for a dead directory. */

dentry = ERR_PTR(-ENOENT);

if(IS_DEADDIR(dir))

gotoout_unlock;

new= d_alloc(parent, name);

dentry = ERR_PTR(-ENOMEM);

if(new) {

dentry = dir->i_op->lookup(dir,new, nd);

if(dentry)

dput(new);

else

dentry =new;

}

out_unlock:

mutex_unlock(&dir->i_mutex);

if(IS_ERR(dentry))

gotofail;

gotodone;

}

* Uhhuh! Nasty case: the cache was re-populated while

* we waited on the semaphore. Need to revalidate.

mutex_unlock(&dir->i_mutex);

if(dentry->d_op && dentry->d_op->d_revalidate) {

dentry = do_revalidate(dentry, nd);

if(!dentry)

dentry = ERR_PTR(-ENOENT);

}

if(IS_ERR(dentry))

gotofail;

gotodone;

need_revalidate:

dentry = do_revalidate(dentry, nd);

if(!dentry)

gotoneed_lookup;

if(IS_ERR(dentry))

gotofail;

gotodone;

fail:

returnPTR_ERR(dentry);

}

do_lookup先调用__d_lookup进行查找，如果查找失败，再在持有目录i_mutex的情况下调用d_lookup，d_lookup其实内部还是调用__d_lookup函数，只是在这个基础上会使用顺序锁（seqlock）保护起来，以防止重命名造成的同步问题；如果都查找失败，就新分配一个dentry并调用具体文件系统的lookup函数去查找。函数的最后会调用__follow_mount，用于在当前dentry上查找是否存在挂载点，并用最新的挂载点的dentry和vfsmount对path进行重新赋值。__follow_mount的代码如下：

staticint__follow_mount(structpath *path)

{

intres = 0;

while(d_mountpoint(path->dentry)) {

structvfsmount *mounted = lookup_mnt(path);

if(!mounted)

break;

dput(path->dentry);

if(res)

mntput(path->mnt);

path->mnt = mounted;

path->dentry = dget(mounted->mnt_root);

res = 1;

}

returnres;

}

再看下__d_lookup函数的实现：

structdentry * __d_lookup(structdentry * parent,structqstr * name)

{

unsignedintlen = name->len;

unsignedinthash = name->hash;

constunsignedchar*str = name->name;

structhlist_head *head = d_hash(parent,hash);

structdentry *found = NULL;

structhlist_node *node;

structdentry *dentry;

rcu_read_lock();

hlist_for_each_entry_rcu(dentry, node, head, d_hash) {

structqstr *qstr;

if(dentry->d_name.hash != hash)

continue;

if(dentry->d_parent != parent)

continue;

spin_lock(&dentry->d_lock);

* Recheck the dentry after taking the lock - d_move may have

* changed things. Don't bother checking the hash because we're

* about to compare the whole name anyway.

if(dentry->d_parent != parent)

gotonext;

/* non-existing due to RCU? */

if(d_unhashed(dentry))

gotonext;

* It is safe to compare names since d_move() cannot

* change the qstr (protected by d_lock).

qstr = &dentry->d_name;

if(parent->d_op && parent->d_op->d_compare) {

if(parent->d_op->d_compare(parent, qstr, name))

gotonext;

}else{//如果d_compare函数没有实现就匹配字符串，对没有特殊要求的文件系统都可以匹配字符串即可

if(qstr->len != len)

gotonext;

if(memcmp(qstr->name, str, len))

gotonext;

}

atomic_inc(&dentry->d_count);

found = dentry;

spin_unlock(&dentry->d_lock);

break;

spin_unlock(&dentry->d_lock);

}

rcu_read_unlock();

returnfound;

}

__d_lookup函数会遍历父目录的hash表找出相匹配的子目录。

到这里整个挂载目录的查找就结束了，kern_path完成之后会通过path变量带回挂载点的dentry和父文件系统的vfsmount到do_mount函数中。

do_mount继续调用do_new_mount函数：do_new_mount分为两部分，第一部分是生成挂载所需的超级块等结构；第二部分用于将第一部分生成的结构加到内核中去。

先看第一部分，通过do_kern_mount实现，do_kern_mount又调用了vfs_kern_mount：

structvfsmount *

vfs_kern_mount(structfile_system_type *type,intflags,constchar*name,void*data)

{

structvfsmount *mnt;

char*secdata = NULL;

interror;

if(!type)

returnERR_PTR(-ENODEV);

error = -ENOMEM;

mnt = alloc_vfsmnt(name);

if(!mnt)

gotoout;

if(flags & MS_KERNMOUNT)

mnt->mnt_flags = MNT_INTERNAL;

if(data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {

secdata = alloc_secdata();

if(!secdata)

gotoout_mnt;

error = security_sb_copy_data(data, secdata);

if(error)

gotoout_free_secdata;

}

error = type->get_sb(type, flags, name, data, mnt);

if(error

gotoout_free_secdata;

BUG_ON(!mnt->mnt_sb);

WARN_ON(!mnt->mnt_sb->s_bdi);

mnt->mnt_sb->s_flags |= MS_BORN;

error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);

if(error)

gotoout_sb;

* filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE

* but s_maxbytes was an unsigned long long for many releases. Throw

* this warning for a little while to try and catch filesystems that

* violate this rule. This warning should be either removed or

* converted to a BUG() in 2.6.34.

WARN((mnt->mnt_sb->s_maxbytes s_maxbytes to "

"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);

mnt->mnt_mountpoint = mnt->mnt_root;

mnt->mnt_parent = mnt;

up_write(&mnt->mnt_sb->s_umount);

free_secdata(secdata);

returnmnt;

out_sb:

dput(mnt->mnt_root);

deactivate_locked_super(mnt->mnt_sb);

out_free_secdata:

free_secdata(secdata);

out_mnt:

free_vfsmnt(mnt);

out:

returnERR_PTR(error);

}

这部分的重点在于type->get_sb(type, flags, name, data, mnt); 调用特定文件系统的get_sb函数生成超级块对象和挂载点等数据结构。

第二部分的代码为函数do_add_mount

intdo_add_mount(structvfsmount *newmnt,structpath *path,

intmnt_flags,structlist_head *fslist)

{

interr;

mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

down_write(&namespace_sem);

/* Something was mounted here while we slept */

while(d_mountpoint(path->dentry) &&

follow_down(path))

;

err = -EINVAL;

if(!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))

gotounlock;

/* Refuse the same filesystem on the same mount point */

err = -EBUSY;

if(path->mnt->mnt_sb == newmnt->mnt_sb &&

path->mnt->mnt_root == path->dentry)

gotounlock;

err = -EINVAL;

if(S_ISLNK(newmnt->mnt_root->d_inode->i_mode))

gotounlock;

newmnt->mnt_flags = mnt_flags;

if((err = graft_tree(newmnt, path)))

gotounlock;

if(fslist)/* add to the specified expiration list */

list_add_tail(&newmnt->mnt_expire, fslist);

up_write(&namespace_sem);

return0;

unlock:

up_write(&namespace_sem);

mntput(newmnt);

returnerr;

}

继续调用graft_tree

staticintgraft_tree(structvfsmount *mnt,structpath *path)

{

interr;

if(mnt->mnt_sb->s_flags & MS_NOUSER)

return-EINVAL;

if(S_ISDIR(path->dentry->d_inode->i_mode) !=

S_ISDIR(mnt->mnt_root->d_inode->i_mode))

return-ENOTDIR;

err = -ENOENT;

mutex_lock(&path->dentry->d_inode->i_mutex);

if(cant_mount(path->dentry))

gotoout_unlock;

if(!d_unlinked(path->dentry))

err =attach_recursive_mnt(mnt, path, NULL);

out_unlock:

mutex_unlock(&path->dentry->d_inode->i_mutex);

returnerr;

}

调用attach_recursive_mnt

staticintattach_recursive_mnt(structvfsmount *source_mnt,

structpath *path,structpath *parent_path)

{

LIST_HEAD(tree_list);

structvfsmount *dest_mnt = path->mnt;

structdentry *dest_dentry = path->dentry;

structvfsmount *child, *p;

interr;

if(IS_MNT_SHARED(dest_mnt)) {

err = invent_group_ids(source_mnt,true);

if(err)

gotoout;

}

err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);

if(err)

gotoout_cleanup_ids;

spin_lock(&vfsmount_lock);

if(IS_MNT_SHARED(dest_mnt)) {

for(p = source_mnt; p; p = next_mnt(p, source_mnt))

set_mnt_shared(p);

}

if(parent_path) {

detach_mnt(source_mnt, parent_path);

attach_mnt(source_mnt, path);

touch_mnt_namespace(parent_path->mnt->mnt_ns);

}else{

mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);

commit_tree(source_mnt);

}

list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {

list_del_init(&child->mnt_hash);

commit_tree(child);

}

spin_unlock(&vfsmount_lock);

return0;

out_cleanup_ids:

if(IS_MNT_SHARED(dest_mnt))

cleanup_group_ids(source_mnt, NULL);

out:

returnerr;

}

调用mnt_set_mountpoint

voidmnt_set_mountpoint(structvfsmount *mnt,structdentry *dentry,

structvfsmount *child_mnt)

{

child_mnt->mnt_parent = mntget(mnt);//设置父文件系统

child_mnt->mnt_mountpoint = dget(dentry);//设置挂载点目录项

dentry->d_mounted++;//挂载计数加1

}

到这里整个mount的流程就分析完毕了，mount的流程可以分为以下几个步骤：

一、查找给定挂载目录中的目录项结构和挂载点；

二、通过设备节点和文件系统类型生成新挂载文件系统的超级块等结构；

三、将二中生成的结构连接到一中查找到的路径中