fuse用户态、内核态通信机制分析

本文详细解析了FUSE(Filesystem in Userspace)如何实现内核与用户态之间的通信。通过分析FUSE的运行流程、关键数据结构及核心函数,深入探讨了FUSE如何处理文件系统操作请求,以及用户态与内核态如何交换数据。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >


关于fuse用户态文件系统的文章有很多,比如http://my.debugman.net/program/fuse-180.html,就写得很全面。但关于fuse用户态、内核态通信的文章还比较少,我现在发现的一篇是http://blog.chinaunix.net/uid-20687780-id-313603.html,主要讲解了用户态、内核态的通信协议。

这里主要分析一下fuse的内核态用户态通信机制。fuse的主要运行流程如下图所示:

当用户态程序执行了POSIX的文件系统操作,经过glibc,变换为系统调用传递给vfs,vfs再将其传给FUSE的内核模块,FUSE的内核模块根据系统调用的类型,将请求发送到用户态的FUSE进程,并等待用户态进程的应答。FUSE内核模块在收到应答后,将应答发送给vfs,最终把运行结果呈现给用户态程序。

那FUSE是如何让用户态与内核态通信的呢?这个在源代码中可以看得比较清楚。

首先,在内核代码fs/fuse/dev.c中,

/* Define a misc device for fuse */
static struct miscdevice fuse_miscdevice = {
    .minor = FUSE_MINOR,
    .name  = "fuse",                  /* the generated misc device node will appear at /dev/fuse */
    .fops = &fuse_dev_operations,
};

/*
 * Module init for the FUSE device: create the slab cache for requests
 * and register the /dev/fuse misc device (misc major number is 10).
 * Returns 0 on success or a negative errno.
 */
int __init fuse_dev_init(void)
{
	int err;

	/* Slab cache from which struct fuse_req objects are allocated. */
	fuse_req_cachep = kmem_cache_create("fuse_request",
					    sizeof(struct fuse_req),
					    0, 0, NULL);
	if (!fuse_req_cachep)
		return -ENOMEM;

	/* Register as a misc device; the node auto-appears at /dev/fuse. */
	err = misc_register(&fuse_miscdevice);
	if (err) {
		kmem_cache_destroy(fuse_req_cachep);
		return err;
	}

	return 0;
}
通过调用fuse_dev_init函数,将会生成一个misc设备(类似字符设备,但主设备号为10,并且会在/dev/目录下根据设备名自动生成设备文件),即/dev/fuse。用户态代码再通过open打开这个设备文件,并且通过如下函数,注册向fuse内核态通信的函数:
struct fuse_chan *fuse_kern_chan_new(int fd)
{
	struct fuse_chan_ops op = {
		.receive = fuse_kern_chan_receive,
		.send = fuse_kern_chan_send,
		.destroy = fuse_kern_chan_destroy,
	};
	size_t bufsize = getpagesize() + 0x1000;
	bufsize = bufsize < MIN_BUFSIZE ? MIN_BUFSIZE : bufsize;
	return fuse_chan_new(&op, fd, bufsize, NULL);
}
fuse_kern_chan_receive函数,通过res = read(fuse_chan_fd(ch), buf, size);从/dev/fuse中读取内核发来的请求,再通过fuse_kern_chan_send函数中的ssize_t res = writev(fuse_chan_fd(ch), iov, count);将数据发送到内核模块。

再回到内核模块,还是fs/fuse/dev.c文件中,FUSE通过为/dev/fuse设备文件注册以下操作回调来支持用户态对其的读写操作:

const struct file_operations fuse_dev_operations = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,    /* seeking is not supported */
	.read		= do_sync_read,   /* generic synchronous read wrapper */
	.aio_read	= fuse_dev_read,   /* FUSE's asynchronous read, serving userspace reads */
	.write		= do_sync_write,  /* generic synchronous write wrapper */
	.aio_write	= fuse_dev_write,  /* FUSE's asynchronous write, serving userspace writes */
	.poll		= fuse_dev_poll,   /* check whether there is activity on the file; if not, sleep until there is */
	.release	= fuse_dev_release,   /* called when userspace close()s the fd for this device file */
	.fasync		= fuse_dev_fasync,    /* enable/disable signal-driven I/O event notification */
};
其中,do_sync_read中,调用了ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos),同样do_sync_write函数中,也调用了ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos),所以FUSE不必单独实现同步读写函数。

在FUSE内核中,存在一个fuse_conn结构体,为用户态、内核态通信服务,其结构为:

/**
 * A Fuse connection.
 *
 * This structure is created, when the filesystem is mounted, and is
 * destroyed, when the client device is closed and the filesystem is
 * unmounted.
 */
struct fuse_conn {
	/** Lock protecting accesses to members of this structure */
	spinlock_t lock;

	/** Mutex protecting against directory alias creation */
	struct mutex inst_mutex;

	/** Refcount: reference count on this structure */
	atomic_t count;

	/** The user id for this mount */
	uid_t user_id;

	/** The group id for this mount */
	gid_t group_id;

	/** The fuse mount flags for this mount */
	unsigned flags;

	/** Maximum read size */
	unsigned max_read;

	/** Maximum write size */
	unsigned max_write;

	/** Readers of the connection are waiting on this (wait queue for read requests) */
	wait_queue_head_t waitq;

	/** The list of pending requests (queued, not yet read by userspace) */
	struct list_head pending;

	/** The list of requests being processed */
	struct list_head processing;

	/** The list of requests under I/O */
	struct list_head io;

	/** The next unique kernel file handle */
	u64 khctr;

	/** rbtree of fuse_files waiting for poll events indexed by ph */
	struct rb_root polled_files;

	/** Maximum number of outstanding background requests */
	unsigned max_background;

	/** Number of background requests at which congestion starts */
	unsigned congestion_threshold;

	/** Number of requests currently in the background */
	unsigned num_background;

	/** Number of background requests currently queued for userspace */
	unsigned active_background;

	/** The list of background requests set aside for later queuing */
	struct list_head bg_queue;

	/** Pending interrupts (interrupt request queue) */
	struct list_head interrupts;

	/** Flag indicating if connection is blocked.  This will be
	    the case before the INIT reply is received, and if there
	    are too many outstanding background requests */
	int blocked;

	/** waitq for blocked connection */
	wait_queue_head_t blocked_waitq;

	/** waitq for reserved requests */
	wait_queue_head_t reserved_req_waitq;

	/** The next unique request id */
	u64 reqctr;

	/** Connection established, cleared on umount, connection
	    abort and device release */
	unsigned connected;

	/** Connection failed (version mismatch).  Cannot race with
	    setting other bitfields since it is only set once in INIT
	    reply, before any other request, and never cleared */
	unsigned conn_error:1;

	/** Connection successful.  Only set in INIT */
	unsigned conn_init:1;

	/** Do readpages asynchronously?  Only set in INIT */
	unsigned async_read:1;

	/** Do not send separate SETATTR request before open(O_TRUNC)  */
	unsigned atomic_o_trunc:1;

	/** Filesystem supports NFS exporting.  Only set in INIT */
	unsigned export_support:1;

	/** Set if bdi is valid */
	unsigned bdi_initialized:1;

	/*
	 * The following bitfields are only for optimization purposes
	 * and hence races in setting them will not cause malfunction
	 */

	/** Is fsync not implemented by fs? */
	unsigned no_fsync:1;

	/** Is fsyncdir not implemented by fs? */
	unsigned no_fsyncdir:1;

	/** Is flush not implemented by fs? */
	unsigned no_flush:1;

	/** Is setxattr not implemented by fs? */
	unsigned no_setxattr:1;

	/** Is getxattr not implemented by fs? */
	unsigned no_getxattr:1;

	/** Is listxattr not implemented by fs? */
	unsigned no_listxattr:1;

	/** Is removexattr not implemented by fs? */
	unsigned no_removexattr:1;

	/** Are file locking primitives not implemented by fs? */
	unsigned no_lock:1;

	/** Is access not implemented by fs? */
	unsigned no_access:1;

	/** Is create not implemented by fs? */
	unsigned no_create:1;

	/** Is interrupt not implemented by fs? */
	unsigned no_interrupt:1;

	/** Is bmap not implemented by fs? */
	unsigned no_bmap:1;

	/** Is poll not implemented by fs? */
	unsigned no_poll:1;

	/** Do multi-page cached writes */
	unsigned big_writes:1;

	/** Don't apply umask to creation modes */
	unsigned dont_mask:1;

	/** The number of requests waiting for completion */
	atomic_t num_waiting;

	/** Negotiated minor version */
	unsigned minor;

	/** Backing dev info */
	struct backing_dev_info bdi;

	/** Entry on the fuse_conn_list */
	struct list_head entry;

	/** Device ID from super block */
	dev_t dev;

	/** Dentries in the control filesystem */
	struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];

	/** number of dentries used in the above array */
	int ctl_ndents;

	/** O_ASYNC requests */
	struct fasync_struct *fasync;

	/** Key for lock owner ID scrambling */
	u32 scramble_key[4];

	/** Reserved request for the DESTROY message */
	struct fuse_req *destroy_req;

	/** Version counter for attribute changes */
	u64 attr_version;

	/** Called on final put */
	void (*release)(struct fuse_conn *);

	/** Super block for this connection. */
	struct super_block *sb;

	/** Read/write semaphore to hold when accessing sb. */
	struct rw_semaphore killsb;
};


fuse_conn结构体的指针将会保存在file->private_data中,每次内核态向用户态发送请求时都会用到fuse_conn结构体。fuse_dev_read函数的处理流程主要如下:

static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
			      unsigned long nr_segs, loff_t pos)
{
        struct fuse_in *in; /* the input part of the request that userspace will read */
	//other variable definitions omitted in this excerpt
	struct fuse_conn *fc = fuse_get_conn(file);   /* get the pointer to the fuse_conn structure */
	if (!fc)
		return -EPERM;

 restart:
	spin_lock(&fc->lock);
	err = -EAGAIN;
	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
	    !request_pending(fc))  //non-blocking mode: if no request is waiting on the queues, return immediately
		goto err_unlock;

	request_wait(fc);        //block until a request from the kernel side arrives
        ......
	if (!list_empty(&fc->interrupts)) {  //if there are interrupt requests to deliver, send those first
		req = list_entry(fc->interrupts.next, struct fuse_req,
				 intr_entry);
		return fuse_read_interrupt(fc, req, iov, nr_segs);
	}

	req = list_entry(fc->pending.next, struct fuse_req, list);  //take the next request to send from the pending queue
	req->state = FUSE_REQ_READING;
	list_move(&req->list, &fc->io);  //move the request onto the in-flight I/O queue

	in = &req->in;
	reqsize = in->h.len;
	/* If request is too large, reply with an error and restart the read */
	........
	
	spin_unlock(&fc->lock);
	fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);  //prepare to copy the request to userspace
	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));   //copy the request header to userspace
	if (!err)
		err = fuse_copy_args(&cs, in->numargs, in->argpages,
				     (struct fuse_arg *) in->args, 0);  //copy the request body to userspace; if the packet has several arguments, loop until all are copied
	fuse_copy_finish(&cs);  //finish the copy and release memory
	spin_lock(&fc->lock);
	req->locked = 0;
	//error checking on the send path, omitted in this excerpt
	....
	if (!req->isreply)   //no reply is expected, so the request ends here
		request_end(fc, req);
	else {
		req->state = FUSE_REQ_SENT;  //userspace must return a result for this request
		list_move_tail(&req->list, &fc->processing);  //so move it to the processing queue, to be completed by fuse_dev_write
		if (req->interrupted)
			queue_interrupt(fc, req);
		spin_unlock(&fc->lock);
	}
	return reqsize;

 err_unlock:
	spin_unlock(&fc->lock);
	return err;
}


 其中fuse_in结构体如下所示:

/** The request input */
struct fuse_in {
	/** The request header */
	struct fuse_in_header h;
    
	/** True if the data for the last argument is in req->pages */
	unsigned argpages:1;

	/** Number of arguments carried by this request */
	unsigned numargs;

	/** Array of arguments */
	struct fuse_in_arg args[3];
};

此结构体中,包含另外两个结构体:

struct fuse_in_header {
	__u32	len;      //total length of the packet
	__u32	opcode;  //operation code, identifies the type of operation
	__u64	unique;   //unique id of this packet
	__u64	nodeid;   //id of the file node being operated on, similar to an ino
	__u32	uid;
	__u32	gid;
	__u32	pid;
	__u32	padding;  //unused; pads the header (NOTE: the original article guessed "suspended state?", but the name indicates plain struct padding)
};

/** One input argument of a request */
struct fuse_in_arg {
	unsigned size;    //length of the argument
	const void *value;  //pointer to the argument data
};






评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值