[05/13] fuse: Add a uring config ioctl and ring destruction

Message ID 20230321011047.3425786-6-bschubert@ddn.com (mailing list archive)
State Mainlined, archived
Series: fuse uring communication

Commit Message

Bernd Schubert March 21, 2023, 1:10 a.m. UTC
Ring data are created with an ioctl; destruction goes via
fuse_abort_conn(). A new module parameter (enable_uring) is added;
for now uring defaults to disabled.
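
Since the parameter is writable (0644), it can also be toggled at
runtime through sysfs, besides being set at module load time
(modprobe fuse enable_uring=1):

    echo 1 > /sys/module/fuse/parameters/enable_uring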

This also adds the remaining fuse ring data structures.
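
For illustration, a daemon would configure one queue per core with
this ioctl roughly as in the sketch below (assumptions: the uapi
definitions of struct fuse_uring_cfg, FUSE_DEV_IOC_URING and
FUSE_URING_IOCTL_CMD_QUEUE_CFG come from other patches of this
series; dev_fd is the /dev/fuse fd, numa_node_of_cpu() is libnuma,
and nr_cores/fg_depth/bg_depth/arg_len are daemon-chosen values):

    /* sketch only; needs <sys/ioctl.h>, <err.h> and libnuma's <numa.h> */
    struct fuse_uring_cfg cfg = {
            .cmd = FUSE_URING_IOCTL_CMD_QUEUE_CFG,
            .nr_queues = nr_cores,      /* one queue per core */
            .fg_queue_depth = fg_depth,
            .bg_queue_depth = bg_depth,
            .req_arg_len = arg_len,     /* >= FUSE_RING_MIN_IN_OUT_ARG_SIZE */
    };
    unsigned int qid;

    /* the first queue configuration also sets up the connection's ring */
    for (qid = 0; qid < nr_cores; qid++) {
            cfg.qid = qid;
            cfg.numa_node_id = numa_node_of_cpu(qid);
            if (ioctl(dev_fd, FUSE_DEV_IOC_URING, &cfg) != 0)
                    err(1, "fuse ring qid=%u config failed", qid);
    }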

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
cc: Miklos Szeredi <miklos@szeredi.hu>
cc: linux-fsdevel@vger.kernel.org
cc: Amir Goldstein <amir73il@gmail.com>
cc: fuse-devel@lists.sourceforge.net
---
 fs/fuse/Makefile      |   2 +-
 fs/fuse/dev.c         |  21 +++
 fs/fuse/dev_uring.c   | 330 ++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dev_uring_i.h |  20 +++
 fs/fuse/fuse_i.h      | 178 +++++++++++++++++++++++
 fs/fuse/inode.c       |   7 +
 6 files changed, 557 insertions(+), 1 deletion(-)
 create mode 100644 fs/fuse/dev_uring.c
 create mode 100644 fs/fuse/dev_uring_i.h
Patch

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 0c48b35c058d..634d47477393 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -7,7 +7,7 @@  obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
 obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
 
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o dev_uring.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
 
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e0669b8e4618..07323b041377 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -8,6 +8,7 @@ 
 
 #include "fuse_i.h"
 #include "fuse_dev_i.h"
+#include "dev_uring_i.h"
 
 #include <linux/init.h>
 #include <linux/module.h>
@@ -2171,6 +2172,11 @@  void fuse_abort_conn(struct fuse_conn *fc)
 		spin_unlock(&fc->lock);
 
 		fuse_dev_end_requests(&to_end);
+
+		mutex_lock(&fc->ring.start_stop_lock);
+		if (fc->ring.configured && !fc->ring.queues_stopped)
+			fuse_uring_end_requests(fc);
+		mutex_unlock(&fc->ring.start_stop_lock);
 	} else {
 		spin_unlock(&fc->lock);
 	}
@@ -2247,6 +2253,7 @@  static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 	int res;
 	int oldfd;
 	struct fuse_dev *fud = NULL;
+	struct fuse_uring_cfg ring_conf;
 
 	switch (cmd) {
 	case FUSE_DEV_IOC_CLONE:
@@ -2271,6 +2278,20 @@  static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 				fput(old);
 			}
 		}
+		break;
+	case FUSE_DEV_IOC_URING:
+		/* XXX fud ensures fc->ring.start_stop_lock is initialized? */
+		fud = fuse_get_dev(file);
+		if (fud) {
+			res = copy_from_user(&ring_conf, (void __user *)arg,
+					     sizeof(ring_conf));
+			if (res == 0)
+				res = fuse_uring_ioctl(file, &ring_conf);
+			else
+				res = -EFAULT;
+		} else {
+			pr_info("%s: Did not get fud\n", __func__);
+			res = -ENODEV;
+		}
+
 		break;
 	default:
 		res = -ENOTTY;
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
new file mode 100644
index 000000000000..12fd21526b2b
--- /dev/null
+++ b/fs/fuse/dev_uring.c
@@ -0,0 +1,330 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+ */
+
+#include "fuse_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/sched/signal.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
+#include <linux/sched.h>
+#include <linux/io_uring.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/topology.h>
+
+static bool __read_mostly enable_uring;
+module_param(enable_uring, bool, 0644);
+MODULE_PARM_DESC(enable_uring,
+	"Enable userspace communication through io-uring.");
+
+static struct fuse_ring_queue *
+fuse_uring_get_queue(struct fuse_conn *fc, int qid)
+{
+	char *ptr = (char *)fc->ring.queues;
+
+	if (unlikely(qid >= fc->ring.nr_queues)) {
+		WARN_ON(1);
+		qid = 0;
+	}
+
+	return (struct fuse_ring_queue *)(ptr + qid * fc->ring.queue_size);
+}
+
+/* Abort all queued requests on the given ring queue */
+static void fuse_uring_end_queue_requests(struct fuse_ring_queue *queue)
+{
+	spin_lock(&queue->lock);
+	queue->aborted = 1;
+	fuse_dev_end_requests(&queue->fg_queue);
+	fuse_dev_end_requests(&queue->bg_queue);
+	spin_unlock(&queue->lock);
+}
+
+void fuse_uring_end_requests(struct fuse_conn *fc)
+{
+	int qid;
+
+	for (qid = 0; qid < fc->ring.nr_queues; qid++) {
+		struct fuse_ring_queue *queue =
+			fuse_uring_get_queue(fc, qid);
+
+		if (!queue->configured)
+			continue;
+
+		fuse_uring_end_queue_requests(queue);
+	}
+}
+
+/*
+ * XXX: use __vmalloc_node_range() (would need to be exported) or add a
+ * new exported helper vm_alloc_user_node()
+ */
+static char *fuse_uring_alloc_queue_buf(int size, int node)
+{
+	char *buf;
+
+	if (size <= 0) {
+		pr_info("Invalid queue buf size: %d.\n", size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	buf = vmalloc_node_user(size, node);
+	return buf ? buf : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Ring setup for this connection
+ */
+static int fuse_uring_conn_cfg(struct fuse_conn *fc,
+			       struct fuse_uring_cfg *cfg)
+__must_hold(&fc->ring.start_stop_lock)
+{
+	size_t queue_sz;
+
+	if (cfg->nr_queues == 0) {
+		pr_info("zero number of queues is invalid.\n");
+		return -EINVAL;
+	}
+
+	if (cfg->nr_queues > 1 &&
+	    cfg->nr_queues != num_present_cpus()) {
+		pr_info("nr-queues (%d) does not match nr-cores (%d).\n",
+			cfg->nr_queues, num_present_cpus());
+		return -EINVAL;
+	}
+
+	if (cfg->qid >= cfg->nr_queues) {
+		pr_info("qid (%d) exceeds number of queues (%d)\n",
+			cfg->qid, cfg->nr_queues);
+		return -EINVAL;
+	}
+
+	if (cfg->req_arg_len < FUSE_RING_MIN_IN_OUT_ARG_SIZE) {
+		pr_info("Per req buffer size too small (%d), min: %d\n",
+			cfg->req_arg_len, FUSE_RING_MIN_IN_OUT_ARG_SIZE);
+		return -EINVAL;
+	}
+
+	if (unlikely(fc->ring.queues)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	fc->ring.daemon = current;
+	get_task_struct(fc->ring.daemon);
+
+	fc->ring.nr_queues = cfg->nr_queues;
+	fc->ring.per_core_queue = cfg->nr_queues > 1;
+
+	fc->ring.max_fg = cfg->fg_queue_depth;
+	fc->ring.max_bg = cfg->bg_queue_depth;
+	fc->ring.queue_depth = cfg->fg_queue_depth + cfg->bg_queue_depth;
+
+	fc->ring.req_arg_len = cfg->req_arg_len;
+	fc->ring.req_buf_sz =
+		round_up(sizeof(struct fuse_ring_req) + fc->ring.req_arg_len,
+			 PAGE_SIZE);
+
+	/* it is verified during mmap that kernel and userspace use the
+	 * same buffer size
+	 */
+	fc->ring.queue_buf_size = fc->ring.req_buf_sz * fc->ring.queue_depth;
+
+	queue_sz = sizeof(*fc->ring.queues) +
+			fc->ring.queue_depth * sizeof(struct fuse_ring_ent);
+	fc->ring.queues = kcalloc(cfg->nr_queues, queue_sz, GFP_KERNEL);
+	if (!fc->ring.queues)
+		return -ENOMEM;
+	fc->ring.queue_size = queue_sz;
+
+	fc->ring.queue_refs = 0;
+
+	return 0;
+}
+
+static int fuse_uring_queue_cfg(struct fuse_conn *fc, unsigned int qid,
+				unsigned int node_id)
+__must_hold(&fc->ring.start_stop_lock)
+{
+	int tag;
+	struct fuse_ring_queue *queue;
+	char *buf;
+
+	if (qid >= fc->ring.nr_queues) {
+		pr_info("fuse ring queue config: qid=%u >= nr-queues=%zu\n",
+			qid, fc->ring.nr_queues);
+		return -EINVAL;
+	}
+	queue = fuse_uring_get_queue(fc, qid);
+
+	if (queue->configured) {
+		pr_info("fuse ring qid=%u already configured!\n", qid);
+		return -EALREADY;
+	}
+
+	queue->qid = qid;
+	queue->fc = fc;
+	queue->req_fg = 0;
+	bitmap_zero(queue->req_avail_map, fc->ring.queue_depth);
+	spin_lock_init(&queue->lock);
+	INIT_LIST_HEAD(&queue->fg_queue);
+	INIT_LIST_HEAD(&queue->bg_queue);
+
+	buf = fuse_uring_alloc_queue_buf(fc->ring.queue_buf_size, node_id);
+	queue->queue_req_buf = buf;
+	if (IS_ERR(queue->queue_req_buf)) {
+		int err = PTR_ERR(queue->queue_req_buf);
+
+		queue->queue_req_buf = NULL;
+		return err;
+	}
+
+	for (tag = 0; tag < fc->ring.queue_depth; tag++) {
+		struct fuse_ring_ent *ent = &queue->ring_ent[tag];
+
+		ent->queue = queue;
+		ent->tag = tag;
+		ent->fuse_req = NULL;
+		ent->rreq = (struct fuse_ring_req *)buf;
+
+		pr_devel("initialize qid=%d tag=%d queue=%p req=%p",
+			 qid, tag, queue, ent);
+
+		ent->rreq->flags = 0;
+
+		ent->state = FRRS_INIT;
+		ent->need_cmd_done = 0;
+		ent->need_req_end = 0;
+		fc->ring.queue_refs++;
+		buf += fc->ring.req_buf_sz;
+	}
+
+	queue->configured = 1;
+	queue->aborted = 0;
+	fc->ring.nr_queues_ioctl_init++;
+	if (fc->ring.nr_queues_ioctl_init == fc->ring.nr_queues) {
+		fc->ring.configured = 1;
+		pr_devel("fc=%p nr-queues=%zu depth=%zu ioctl ready\n",
+			fc, fc->ring.nr_queues, fc->ring.queue_depth);
+	}
+
+	return 0;
+}
+
+/*
+ * Configure the queue for the given qid. The first call will also
+ * initialize the ring for this connection.
+ */
+static int fuse_uring_cfg(struct fuse_conn *fc, unsigned int qid,
+			  struct fuse_uring_cfg *cfg)
+{
+	int rc;
+
+	/* The lock serializes the setup, so that user space may send the
+	 * configuration ioctls for all queues in parallel
+	 */
+	mutex_lock(&fc->ring.start_stop_lock);
+
+	if (fc->ring.configured) {
+		rc = -EALREADY;
+		goto unlock;
+	}
+
+	if (fc->ring.daemon == NULL) {
+		rc = fuse_uring_conn_cfg(fc, cfg);
+		if (rc != 0)
+			goto unlock;
+	}
+
+	rc = fuse_uring_queue_cfg(fc, qid, cfg->numa_node_id);
+
+unlock:
+	mutex_unlock(&fc->ring.start_stop_lock);
+
+	return rc;
+}
+
+int fuse_uring_ioctl(struct file *file, struct fuse_uring_cfg *cfg)
+{
+	struct fuse_dev *fud = fuse_get_dev(file);
+	struct fuse_conn *fc;
+
+	if (fud == NULL)
+		return -ENODEV;
+
+	fc = fud->fc;
+
+	pr_devel("%s fc=%p flags=%x cmd=%d qid=%d nq=%d fg=%d bg=%d\n",
+		 __func__, fc, cfg->flags, cfg->cmd, cfg->qid, cfg->nr_queues,
+		 cfg->fg_queue_depth, cfg->bg_queue_depth);
+
+	switch (cfg->cmd) {
+	case FUSE_URING_IOCTL_CMD_QUEUE_CFG:
+		return fuse_uring_cfg(fc, cfg->qid, cfg);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Finalize the ring destruction when queue ref counters are zero.
+ */
+void fuse_uring_ring_destruct(struct fuse_conn *fc)
+{
+	unsigned int qid;
+
+	if (READ_ONCE(fc->ring.queue_refs) != 0) {
+		pr_info("fc=%p refs=%d configured=%d",
+			fc, fc->ring.queue_refs, fc->ring.configured);
+		WARN_ON(1);
+		return;
+	}
+
+	put_task_struct(fc->ring.daemon);
+	fc->ring.daemon = NULL;
+
+	for (qid = 0; qid < fc->ring.nr_queues; qid++) {
+		int tag;
+		struct fuse_ring_queue *queue = fuse_uring_get_queue(fc, qid);
+
+		if (!queue->configured)
+			continue;
+
+		for (tag = 0; tag < fc->ring.queue_depth; tag++) {
+			struct fuse_ring_ent *ent = &queue->ring_ent[tag];
+
+			if (ent->need_cmd_done) {
+				pr_warn("fc=%p qid=%d tag=%d cmd not done\n",
+					fc, qid, tag);
+				io_uring_cmd_done(ent->cmd, -ENOTCONN, 0);
+				ent->need_cmd_done = 0;
+			}
+		}
+
+		vfree(queue->queue_req_buf);
+	}
+
+	kfree(fc->ring.queues);
+	fc->ring.queues = NULL;
+	fc->ring.nr_queues_ioctl_init = 0;
+	fc->ring.queue_depth = 0;
+	fc->ring.nr_queues = 0;
+}
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
new file mode 100644
index 000000000000..4ab440ee00f2
--- /dev/null
+++ b/fs/fuse/dev_uring_i.h
@@ -0,0 +1,20 @@ 
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+ */
+
+#ifndef _FS_FUSE_DEV_URING_I_H
+#define _FS_FUSE_DEV_URING_I_H
+
+#include "fuse_i.h"
+
+void fuse_uring_end_requests(struct fuse_conn *fc);
+void fuse_uring_ring_destruct(struct fuse_conn *fc);
+int fuse_uring_ioctl(struct file *file, struct fuse_uring_cfg *cfg);
+#endif
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 46797a171a84..634d90084690 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -529,6 +529,177 @@  struct fuse_sync_bucket {
 	struct rcu_head rcu;
 };
 
+enum fuse_ring_req_state {
+	FRRS_INVALID = 0,
+
+	/* request is basically initialized */
+	FRRS_INIT = 1u << 0,
+
+	/* request is committed from user space and waits for a new fuse req */
+	FRRS_FUSE_FETCH_COMMIT = 1u << 1,
+
+	/* the ring request waits for a new fuse request */
+	FRRS_FUSE_WAIT = 1u << 2,
+
+	/* the ring req got assigned a fuse req */
+	FRRS_FUSE_REQ = 1u << 3,
+
+	/* request is in or on the way to user space */
+	FRRS_USERSPACE = 1u << 4,
+
+	/* request is in the process of being freed */
+	FRRS_FREEING = 1u << 5,
+
+	/* fuse_req_end was already done */
+	FRRS_FUSE_REQ_END = 1u << 6,
+
+	/* an error in the uring cmd receiving function, the request will
+	 * then go back to user space
+	 */
+	FRRS_CMD_ERR = 1u << 7,
+
+	/* request is released */
+	FRRS_FREED = 1u << 8,
+};
+
+/* A fuse ring entry, part of the ring queue */
+struct fuse_ring_ent {
+	/* pointer to the kernel request buffer; the userspace side has
+	 * direct access to it through the mmapped buffer
+	 */
+	struct fuse_ring_req *rreq;
+
+	int tag;
+
+	struct fuse_ring_queue *queue;
+
+	/* state the request is currently in */
+	u64 state;
+
+	unsigned int need_cmd_done:1;
+	unsigned int need_req_end:1;
+
+	struct fuse_req *fuse_req; /* when a list request is handled */
+
+	struct io_uring_cmd *cmd;
+};
+
+/* matches IORING_MAX_ENTRIES */
+#define FUSE_URING_MAX_QUEUE_DEPTH 32768
+
+struct fuse_ring_queue {
+	unsigned long flags;
+
+	struct fuse_conn *fc;
+
+	int qid;
+
+	/* This bitmap tracks which entries in the fuse_ring_ent array
+	 * are available.
+	 * XXX: Is there a way to make this dynamic?
+	 */
+	DECLARE_BITMAP(req_avail_map, FUSE_URING_MAX_QUEUE_DEPTH);
+
+	/* available number of foreground requests */
+	int req_fg;
+
+	/* available number of background requests */
+	int req_bg;
+
+	/* queue lock, taken when any value in the queue changes _and_ also
+	 * when a ring entry state changes
+	 */
+	spinlock_t lock;
+
+	/* per queue memory buffer that is divided per request */
+	char *queue_req_buf;
+
+	struct list_head bg_queue;
+	struct list_head fg_queue;
+
+	unsigned int configured:1;
+	unsigned int aborted:1;
+
+	/* size depends on queue depth */
+	struct fuse_ring_ent ring_ent[] ____cacheline_aligned_in_smp;
+};
+
+/*
+ * Describes whether uring is used for communication and holds all the
+ * data needed for uring communication
+ */
+struct fuse_ring {
+
+	/* number of ring queues */
+	size_t nr_queues;
+
+	/* number of entries per queue */
+	size_t queue_depth;
+
+	/* max arg size for a request */
+	size_t req_arg_len;
+
+	/* sizeof(struct fuse_ring_req) + req_arg_len, rounded up to pages */
+	size_t req_buf_sz;
+
+	/* max number of background requests per queue */
+	size_t max_bg;
+
+	/* max number of foreground requests per queue */
+	size_t max_fg;
+
+	/* size of struct fuse_ring_queue + queue-depth * entry-size */
+	size_t queue_size;
+
+	/* buffer size per queue, divided among the queue entries */
+	size_t queue_buf_size;
+
+	/* when zero the queues can be freed on destruction */
+	int queue_refs;
+
+	/* holds the ring queues */
+	struct fuse_ring_queue *queues;
+
+	/* number of queues initialized with the ioctl */
+	int nr_queues_ioctl_init;
+
+	/* number of queues initialized with the uring cmd */
+	atomic_t nr_queues_cmd_init;
+
+	/* one queue per core or a single queue only? */
+	unsigned int per_core_queue:1;
+
+	/* userspace sent a stop ioctl */
+	unsigned int stop_requested:1;
+
+	/* Is the ring completely ioctl configured? */
+	unsigned int configured:1;
+
+	/* Is the ring ready to take requests? */
+	unsigned int ready:1;
+
+	/* used on shutdown */
+	unsigned int queues_stopped:1;
+
+	/* userspace process */
+	struct task_struct *daemon;
+
+	struct mutex start_stop_lock;
+
+	/* userspace has a special thread that exists only to wait in the
+	 * kernel for ring stop, in order to release uring
+	 */
+	wait_queue_head_t stop_waitq;
+
+	/* The daemon might get killed and uring then needs to be released
+	 * without getting a umount notification; this delayed work exists
+	 * to release uring even without a process being held in the
+	 * stop_waitq
+	 */
+	struct delayed_work stop_monitor;
+};
+
 /**
  * A Fuse connection.
  *
@@ -836,6 +1007,13 @@  struct fuse_conn {
 
 	/* New writepages go into this bucket */
 	struct fuse_sync_bucket __rcu *curr_bucket;
+
+	/*
+	 * XXX Move to struct fuse_dev?
+	 * XXX Allocate dynamically?
+	 */
+	/* uring connection information */
+	struct fuse_ring ring;
 };
 
 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index de9b9ec5ce81..3f765e65a7b0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@ 
 */
 
 #include "fuse_i.h"
+#include "dev_uring_i.h"
 
 #include <linux/pagemap.h>
 #include <linux/slab.h>
@@ -855,6 +856,9 @@  void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
 	fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
 
+	mutex_init(&fc->ring.start_stop_lock);
+	fc->ring.daemon = NULL;
+
 	INIT_LIST_HEAD(&fc->mounts);
 	list_add(&fm->fc_entry, &fc->mounts);
 	fm->fc = fc;
@@ -1785,6 +1789,9 @@  void fuse_conn_destroy(struct fuse_mount *fm)
 		fuse_ctl_remove_conn(fc);
 		mutex_unlock(&fuse_mutex);
 	}
+
+	if (fc->ring.daemon != NULL)
+		fuse_uring_ring_destruct(fc);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_destroy);