@@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH
to be performed directly on a backing file.
If you want to allow passthrough operations, answer Y.
+
+config FUSE_IO_URING
+ bool "FUSE communication over io-uring"
+ default y
+ depends on FUSE_FS
+ depends on IO_URING
+ help
+ This allows sending FUSE requests over the io-uring interface and
+ also adds request core affinity.
+
+ If you want to allow FUSE server/client communication through io-uring,
+ answer Y.
@@ -11,5 +11,6 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
virtiofs-y := virtio_fs.o
@@ -6,6 +6,7 @@
See the file COPYING.
*/
+#include "dev_uring_i.h"
#include "fuse_i.h"
#include "fuse_dev_i.h"
@@ -2398,6 +2399,9 @@ const struct file_operations fuse_dev_operations = {
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
+#ifdef CONFIG_FUSE_IO_URING
+ .uring_cmd = fuse_uring_cmd,
+#endif
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
new file mode 100644
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#include <linux/fs.h>
+
+#include "fuse_i.h"
+#include "dev_uring_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/io_uring/cmd.h>
+
+#ifdef CONFIG_FUSE_IO_URING
+static bool __read_mostly enable_uring;
+module_param(enable_uring, bool, 0644);
+MODULE_PARM_DESC(enable_uring,
+ "Enable uring userspace communication through uring.");
+#endif
+
+static int fuse_ring_ent_unset_userspace(struct fuse_ring_ent *ent)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE))
+ return -EIO;
+
+ ent->state = FRRS_COMMIT;
+ list_move(&ent->list, &queue->ent_intermediate_queue);
+
+ return 0;
+}
+
+void fuse_uring_destruct(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ int qid;
+
+ if (!ring)
+ return;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+
+ if (!queue)
+ continue;
+
+ WARN_ON(!list_empty(&queue->ent_avail_queue));
+ WARN_ON(!list_empty(&queue->ent_intermediate_queue));
+
+ kfree(queue);
+ ring->queues[qid] = NULL;
+ }
+
+ kfree(ring->queues);
+ kfree(ring);
+ fc->ring = NULL;
+}
+
+/*
+ * Basic ring setup for this connection
+ */
+static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = NULL;
+ size_t nr_queues = num_possible_cpus();
+ struct fuse_ring *res = NULL;
+
+ ring = kzalloc(sizeof(*ring), GFP_KERNEL_ACCOUNT);
+ if (!ring)
+ return NULL;
+
+ ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
+ GFP_KERNEL_ACCOUNT);
+ if (!ring->queues)
+ goto out_err;
+
+ spin_lock(&fc->lock);
+ if (fc->ring) {
+ /* race, another thread created the ring in the meantime */
+ spin_unlock(&fc->lock);
+ res = fc->ring;
+ goto out_err;
+ }
+
+ fc->ring = ring;
+ ring->nr_queues = nr_queues;
+ ring->fc = fc;
+
+ spin_unlock(&fc->lock);
+ return ring;
+
+out_err:
+ if (ring)
+ kfree(ring->queues);
+ kfree(ring);
+ return res;
+}
+
+static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
+ int qid)
+{
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_ring_queue *queue;
+
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
+ if (!queue)
+ return NULL;
+ spin_lock(&fc->lock);
+ if (ring->queues[qid]) {
+ spin_unlock(&fc->lock);
+ kfree(queue);
+ return ring->queues[qid];
+ }
+ ring->queues[qid] = queue;
+
+ queue->qid = qid;
+ queue->ring = ring;
+ spin_lock_init(&queue->lock);
+
+ INIT_LIST_HEAD(&queue->ent_avail_queue);
+ INIT_LIST_HEAD(&queue->ent_intermediate_queue);
+
+ spin_unlock(&fc->lock);
+
+ return queue;
+}
+
+/*
+ * Put a ring entry on hold; it is not used for now.
+ */
+static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
+ struct fuse_ring_queue *queue)
+ __must_hold(&queue->lock)
+{
+ struct fuse_ring *ring = queue->ring;
+
+ lockdep_assert_held(&queue->lock);
+
+ /* reset the entry and make it available for the next request */
+ pr_devel("%s ring=%p qid=%d state=%d\n", __func__, ring,
+ ring_ent->queue->qid, ring_ent->state);
+
+ if (WARN_ON(ring_ent->state != FRRS_COMMIT)) {
+ pr_warn("%s qid=%d state=%d\n", __func__, ring_ent->queue->qid,
+ ring_ent->state);
+ return;
+ }
+
+ list_move(&ring_ent->list, &queue->ent_avail_queue);
+
+ ring_ent->state = FRRS_WAIT;
+}
+
+/*
+ * FUSE_URING_REQ_FETCH command handling
+ */
+static void _fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ring_ent->queue;
+
+ spin_lock(&queue->lock);
+ fuse_uring_ent_avail(ring_ent, queue);
+ ring_ent->cmd = cmd;
+ spin_unlock(&queue->lock);
+}
+
+static int fuse_uring_fetch(struct io_uring_cmd *cmd, unsigned int issue_flags,
+ struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ring_ent;
+ int err;
+
+#if 0
+ /* Does not work as sending over io-uring is async */
+ err = -ETXTBSY;
+ if (fc->initialized) {
+ pr_info_ratelimited(
+ "Received FUSE_URING_REQ_FETCH after connection is initialized\n");
+ return err;
+ }
+#endif
+
+ err = -ENOMEM;
+ if (!ring) {
+ ring = fuse_uring_create(fc);
+ if (!ring)
+ return err;
+ }
+
+ if (cmd_req->qid >= ring->nr_queues)
+ return -EINVAL;
+
+ queue = ring->queues[cmd_req->qid];
+ if (!queue) {
+ queue = fuse_uring_create_queue(ring, cmd_req->qid);
+ if (!queue)
+ return err;
+ }
+
+ /*
+ * The queue created above does not need to be destructed on entry
+ * errors below; that is handled at ring destruction time.
+ */
+
+ ring_ent = kzalloc(sizeof(*ring_ent), GFP_KERNEL_ACCOUNT);
+ if (!ring_ent)
+ return err;
+
+ ring_ent->queue = queue;
+ ring_ent->cmd = cmd;
+ ring_ent->rreq = (struct fuse_ring_req __user *)cmd_req->buf_ptr;
+ ring_ent->max_arg_len = cmd_req->buf_len - sizeof(*ring_ent->rreq);
+ INIT_LIST_HEAD(&ring_ent->list);
+
+ spin_lock(&queue->lock);
+
+ /*
+ * FUSE_URING_REQ_FETCH is an initialization exception, needs
+ * state override
+ */
+ ring_ent->state = FRRS_USERSPACE;
+ err = fuse_ring_ent_unset_userspace(ring_ent);
+ spin_unlock(&queue->lock);
+ if (WARN_ON_ONCE(err != 0))
+ goto err;
+
+ _fuse_uring_fetch(ring_ent, cmd, issue_flags);
+
+ return 0;
+err:
+ list_del_init(&ring_ent->list);
+ kfree(ring_ent);
+ return err;
+}
+
+/**
+ * Entry function from io_uring to handle the given passthrough command
+ * (op code IORING_OP_URING_CMD)
+ */
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ u32 cmd_op = cmd->cmd_op;
+ int err = 0;
+
+ pr_devel("%s:%d received: cmd op %d\n", __func__, __LINE__, cmd_op);
+
+ /* Disabled for now, especially as teardown is not implemented yet */
+ err = -EOPNOTSUPP;
+ pr_info_ratelimited("fuse-io-uring is not enabled yet\n");
+ goto out;
+
+ err = -EOPNOTSUPP;
+ if (!enable_uring) {
+ pr_info_ratelimited("uring is disabled\n");
+ goto out;
+ }
+
+ err = -ENOTCONN;
+ fud = fuse_get_dev(cmd->file);
+ if (!fud) {
+ pr_info_ratelimited("No fuse device found\n");
+ goto out;
+ }
+ fc = fud->fc;
+
+ if (fc->aborted)
+ goto out;
+
+ switch (cmd_op) {
+ case FUSE_URING_REQ_FETCH:
+ err = fuse_uring_fetch(cmd, issue_flags, fc);
+ break;
+ default:
+ err = -EINVAL;
+ pr_devel("Unknown uring command %d", cmd_op);
+ goto out;
+ }
+out:
+ pr_devel("uring cmd op=%d, qid=%d ID=%llu ret=%d\n", cmd_op,
+ cmd_req->qid, cmd_req->commit_id, err);
+
+ if (err < 0)
+ io_uring_cmd_done(cmd, err, 0, issue_flags);
+
+ return -EIOCBQUEUED;
+}
new file mode 100644
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#ifndef _FS_FUSE_DEV_URING_I_H
+#define _FS_FUSE_DEV_URING_I_H
+
+#include "fuse_i.h"
+
+#ifdef CONFIG_FUSE_IO_URING
+
+enum fuse_ring_req_state {
+
+ /* The ring entry was received from userspace and is being processed */
+ FRRS_COMMIT,
+
+ /* The ring request waits for a new fuse request */
+ FRRS_WAIT,
+
+ /* request is in or on the way to user space */
+ FRRS_USERSPACE,
+};
+
+/** A fuse ring entry, part of the ring queue */
+struct fuse_ring_ent {
+ /* userspace buffer */
+ struct fuse_ring_req __user *rreq;
+
+ /* the ring queue that owns the request */
+ struct fuse_ring_queue *queue;
+
+ struct io_uring_cmd *cmd;
+
+ struct list_head list;
+
+ /*
+ * state the request is currently in
+ * (enum fuse_ring_req_state)
+ */
+ unsigned int state;
+
+ /* struct fuse_ring_req::in_out_arg size */
+ size_t max_arg_len;
+};
+
+struct fuse_ring_queue {
+ /*
+ * back pointer to the main fuse uring structure that holds this
+ * queue
+ */
+ struct fuse_ring *ring;
+
+ /* queue id, typically also corresponds to the cpu core */
+ unsigned int qid;
+
+ /*
+ * queue lock, taken when any value in the queue changes _and_ also
+ * when a ring entry state changes.
+ */
+ spinlock_t lock;
+
+ /* available ring entries (struct fuse_ring_ent) */
+ struct list_head ent_avail_queue;
+
+ /*
+ * entries in the process of being committed or in the process
+ * to be send to userspace
+ */
+ struct list_head ent_intermediate_queue;
+};
+
+/**
+ * Describes whether io-uring is used for communication and holds all
+ * the data needed for io-uring communication
+ */
+struct fuse_ring {
+ /* back pointer */
+ struct fuse_conn *fc;
+
+ /* number of ring queues */
+ size_t nr_queues;
+
+ struct fuse_ring_queue **queues;
+};
+
+void fuse_uring_destruct(struct fuse_conn *fc);
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+
+#else /* CONFIG_FUSE_IO_URING */
+
+struct fuse_ring;
+
+static inline void fuse_uring_create(struct fuse_conn *fc)
+{
+}
+
+static inline void fuse_uring_destruct(struct fuse_conn *fc)
+{
+}
+
+#endif /* CONFIG_FUSE_IO_URING */
+
+#endif /* _FS_FUSE_DEV_URING_I_H */
@@ -8,6 +8,7 @@
#include <linux/types.h>
+
/* Ordinary requests have even IDs, while interrupts IDs are odd */
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)
@@ -917,6 +917,11 @@ struct fuse_conn {
/** IDR for backing files ids */
struct idr backing_files_map;
#endif
+
+#ifdef CONFIG_FUSE_IO_URING
+ /** uring connection information */
+ struct fuse_ring *ring;
+#endif
};
/*
@@ -7,6 +7,7 @@
*/
#include "fuse_i.h"
+#include "dev_uring_i.h"
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -947,6 +948,8 @@ static void delayed_release(struct rcu_head *p)
{
struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+ fuse_uring_destruct(fc);
+
put_user_ns(fc->user_ns);
fc->release(fc);
}
@@ -1186,4 +1186,74 @@ struct fuse_supp_groups {
uint32_t groups[];
};
+/**
+ * Size of the ring buffer header
+ */
+#define FUSE_RING_HEADER_BUF_SIZE 4096
+#define FUSE_RING_MIN_IN_OUT_ARG_SIZE 4096
+
+/*
+ * The request is a background request. The daemon side is free to use
+ * this information to handle foreground/background CQEs with different
+ * priorities.
+ */
+#define FUSE_RING_REQ_FLAG_ASYNC (1ull << 0)
+
+/**
+ * This structure is mapped onto the userspace buffer registered with
+ * FUSE_URING_REQ_FETCH (fuse_uring_cmd_req::buf_ptr)
+ */
+struct fuse_ring_req {
+ union {
+ /* The first 4K are command data */
+ char ring_header[FUSE_RING_HEADER_BUF_SIZE];
+
+ struct {
+ uint64_t flags;
+
+ uint32_t in_out_arg_len;
+ uint32_t padding;
+
+ /* kernel fills in, reads out */
+ union {
+ struct fuse_in_header in;
+ struct fuse_out_header out;
+ };
+ };
+ };
+
+ char in_out_arg[];
+};
+
+/**
+ * sqe commands to the kernel
+ */
+enum fuse_uring_cmd {
+ FUSE_URING_REQ_INVALID = 0,
+
+ /* submit sqe to kernel to get a request */
+ FUSE_URING_REQ_FETCH = 1,
+
+ /* commit result and fetch next request */
+ FUSE_URING_REQ_COMMIT_AND_FETCH = 2,
+};
+
+/**
+ * In the 80B command area of the SQE.
+ */
+struct fuse_uring_cmd_req {
+ /* User buffer */
+ uint64_t buf_ptr;
+
+ /* entry identifier */
+ uint64_t commit_id;
+
+ /* queue the command is for (queue index) */
+ uint16_t qid;
+ uint8_t padding[6];
+
+ /* length of the user buffer */
+ uint32_t buf_len;
+
+ uint32_t flags;
+};
+
#endif /* _LINUX_FUSE_H */
This adds basic support for ring SQEs (with opcode=IORING_OP_URING_CMD).
For now only FUSE_URING_REQ_FETCH is handled to register queue entries.

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
---
 fs/fuse/Kconfig           |  12 ++
 fs/fuse/Makefile          |   1 +
 fs/fuse/dev.c             |   4 +
 fs/fuse/dev_uring.c       | 297 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dev_uring_i.h     | 105 ++++++++++++++++
 fs/fuse/fuse_dev_i.h      |   1 +
 fs/fuse/fuse_i.h          |   5 +
 fs/fuse/inode.c           |   3 +
 include/uapi/linux/fuse.h |  70 +++++++++++
 9 files changed, 498 insertions(+)
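
For reference, below is a minimal userspace sketch of how a FUSE server
could register one ring entry with FUSE_URING_REQ_FETCH through the UAPI
added above. This is not part of the patch: it assumes liburing, a ring
created with IORING_SETUP_SQE128 (so the SQE carries the 80-byte command
area), and a linux/fuse.h that already contains the additions from this
patch; the helper name fetch_one_ring_entry() and its parameters are made
up for illustration. Error handling, the per-core queue setup and the
later commit-and-fetch path are omitted.

#include <errno.h>
#include <liburing.h>
#include <linux/fuse.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int fetch_one_ring_entry(struct io_uring *ring, int fuse_dev_fd,
				unsigned int qid, size_t max_arg_len)
{
	size_t buf_len = sizeof(struct fuse_ring_req) + max_arg_len;
	struct fuse_uring_cmd_req *cmd_req;
	struct fuse_ring_req *rreq;
	struct io_uring_sqe *sqe;

	/* Buffer the kernel later fills with request header and payload */
	if (posix_memalign((void **)&rreq, 4096, buf_len))
		return -ENOMEM;
	memset(rreq, 0, buf_len);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;

	/* Passthrough command (IORING_OP_URING_CMD) against /dev/fuse */
	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fuse_dev_fd, NULL, 0, 0);
	sqe->cmd_op = FUSE_URING_REQ_FETCH;

	/* fuse_uring_cmd_req lives in the 80-byte SQE128 command area */
	cmd_req = (struct fuse_uring_cmd_req *)sqe->cmd;
	memset(cmd_req, 0, sizeof(*cmd_req));
	cmd_req->qid = qid;
	cmd_req->buf_ptr = (uint64_t)(uintptr_t)rreq;
	cmd_req->buf_len = buf_len;

	/* The CQE is only expected once the kernel assigns a request */
	return io_uring_submit(ring);
}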