Message ID | 20240529-fuse-uring-for-6-9-rfc2-out-v1-7-d149476b1d65@ddn.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | fuse: fuse-over-io-uring | expand |
On Wed, May 29, 2024 at 08:00:42PM +0200, Bernd Schubert wrote: > Signed-off-by: Bernd Schubert <bschubert@ddn.com> > --- > fs/fuse/dev.c | 3 ++ > fs/fuse/dev_uring.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/dev_uring_i.h | 22 +++++++++ > include/uapi/linux/fuse.h | 3 ++ > 4 files changed, 142 insertions(+) > > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c > index bc77413932cf..349c1d16b0df 100644 > --- a/fs/fuse/dev.c > +++ b/fs/fuse/dev.c > @@ -2470,6 +2470,9 @@ const struct file_operations fuse_dev_operations = { > .fasync = fuse_dev_fasync, > .unlocked_ioctl = fuse_dev_ioctl, > .compat_ioctl = compat_ptr_ioctl, > +#if IS_ENABLED(CONFIG_FUSE_IO_URING) I'm loathe to use #if IS_ENABLED() when we can use #ifdef CONFIG_FUSE_IO_URING which is more standard across the kernel. > + .mmap = fuse_uring_mmap, > +#endif > }; > EXPORT_SYMBOL_GPL(fuse_dev_operations); > > diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c > index 702a994cf192..9491bdaa5716 100644 > --- a/fs/fuse/dev_uring.c > +++ b/fs/fuse/dev_uring.c > @@ -120,3 +120,117 @@ void fuse_uring_ring_destruct(struct fuse_ring *ring) > > mutex_destroy(&ring->start_stop_lock); > } > + > +static inline int fuse_uring_current_nodeid(void) > +{ > + int cpu; > + const struct cpumask *proc_mask = current->cpus_ptr; > + > + cpu = cpumask_first(proc_mask); > + > + return cpu_to_node(cpu); You don't need this, just use numa_node_id(); > +} > + > +static char *fuse_uring_alloc_queue_buf(int size, int node) > +{ > + char *buf; > + > + if (size <= 0) { > + pr_info("Invalid queue buf size: %d.\n", size); > + return ERR_PTR(-EINVAL); > + } > + > + buf = vmalloc_node_user(size, node); > + return buf ? buf : ERR_PTR(-ENOMEM); > +} This is excessive, we base size off of ring->queue_buf_size, or the fuse_uring_mmap() size we get from the vma, which I don't think can ever be 0 or negative. I think we just validate that ->queue_buf_size is always correct, and if we're really worried about it in fuse_uring_mmap we validate that sz is correct there, and then we just use vmalloc_node_user() directly instead of having this helper. > + > +/** > + * fuse uring mmap, per ring qeuue. > + * Userpsace maps a kernel allocated ring/queue buffer. For numa awareness, > + * userspace needs to run the do the mapping from a core bound thread. > + */ > +int > +fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma) I'm not seeing anywhere else in the fuse code that has this style, I'd prefer we keep it consistent with the rest of the kernel and have int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma) additionally you're using the docstyle strings without the actual docstyle formatting, which is pissing off my git hooks that run checkpatch. Not a big deal, but if you're going to provide docstyle comments then please do it formatted properly, or just do a normal /* * fuse uring mmap.... */ > +{ > + struct fuse_dev *fud = fuse_get_dev(filp); > + struct fuse_conn *fc; > + struct fuse_ring *ring; > + size_t sz = vma->vm_end - vma->vm_start; > + int ret; > + struct fuse_uring_mbuf *new_node = NULL; > + void *buf = NULL; > + int nodeid; > + > + if (vma->vm_pgoff << PAGE_SHIFT != FUSE_URING_MMAP_OFF) { > + pr_debug("Invalid offset, expected %llu got %lu\n", > + FUSE_URING_MMAP_OFF, vma->vm_pgoff << PAGE_SHIFT); > + return -EINVAL; > + } > + > + if (!fud) > + return -ENODEV; > + fc = fud->fc; > + ring = fc->ring; > + if (!ring) > + return -ENODEV; > + > + nodeid = ring->numa_aware ? fuse_uring_current_nodeid() : NUMA_NO_NODE; nodeid = ring->numa_awayre ? numa_node_id() : NUMA_NO_NODE; > + > + /* check if uring is configured and if the requested size matches */ > + if (ring->nr_queues == 0 || ring->queue_depth == 0) { > + ret = -EINVAL; > + goto out; > + } > + > + if (sz != ring->queue_buf_size) { > + ret = -EINVAL; > + pr_devel("mmap size mismatch, expected %zu got %zu\n", > + ring->queue_buf_size, sz); > + goto out; > + } > + > + if (current->nr_cpus_allowed != 1 && ring->numa_aware) { > + ret = -EINVAL; > + pr_debug( > + "Numa awareness, but thread has more than allowed cpu.\n"); > + goto out; > + } > + > + buf = fuse_uring_alloc_queue_buf(ring->queue_buf_size, nodeid); > + if (IS_ERR(buf)) { > + ret = PTR_ERR(buf); > + goto out; > + } All of the above you can just return ret, you don't have to jump to out. > + > + new_node = kmalloc(sizeof(*new_node), GFP_USER); > + if (unlikely(new_node == NULL)) { > + ret = -ENOMEM; > + goto out; Here I would just if (unlikely(new_node == NULL)) { vfree(buf); return -ENOMEM; } > + } > + > + ret = remap_vmalloc_range(vma, buf, 0); > + if (ret) > + goto out; And since this is the only place we can fail with both things allocated I'd just if (ret) { vfree(buf); kfree(new_node); return ret; } and then drop the bit below where you free the buffers if there's an error. > + > + mutex_lock(&ring->start_stop_lock); > + /* > + * In this function we do not know the queue the buffer belongs to. > + * Later server side will pass the mmaped address, the kernel address > + * will be found through the map. > + */ > + new_node->kbuf = buf; > + new_node->ubuf = (void *)vma->vm_start; > + rb_add(&new_node->rb_node, &ring->mem_buf_map, > + fuse_uring_rb_tree_buf_less); > + mutex_unlock(&ring->start_stop_lock); > +out: > + if (ret) { > + kfree(new_node); > + vfree(buf); > + } > + > + pr_devel("%s: pid %d addr: %p sz: %zu ret: %d\n", __func__, > + current->pid, (char *)vma->vm_start, sz, ret); > + > + return ret; > +} > diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h > index 58ab4671deff..c455ae0e729a 100644 > --- a/fs/fuse/dev_uring_i.h > +++ b/fs/fuse/dev_uring_i.h > @@ -181,6 +181,7 @@ struct fuse_ring { > > void fuse_uring_abort_end_requests(struct fuse_ring *ring); > int fuse_uring_conn_cfg(struct fuse_ring *ring, struct fuse_ring_config *rcfg); > +int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma); > int fuse_uring_queue_cfg(struct fuse_ring *ring, > struct fuse_ring_queue_config *qcfg); > void fuse_uring_ring_destruct(struct fuse_ring *ring); > @@ -208,6 +209,27 @@ static inline void fuse_uring_conn_destruct(struct fuse_conn *fc) > kfree(ring); > } > > +static inline int fuse_uring_rb_tree_buf_cmp(const void *key, > + const struct rb_node *node) > +{ > + const struct fuse_uring_mbuf *entry = > + rb_entry(node, struct fuse_uring_mbuf, rb_node); > + > + if (key == entry->ubuf) > + return 0; > + > + return (unsigned long)key < (unsigned long)entry->ubuf ? -1 : 1; > +} > + > +static inline bool fuse_uring_rb_tree_buf_less(struct rb_node *node1, > + const struct rb_node *node2) > +{ > + const struct fuse_uring_mbuf *entry1 = > + rb_entry(node1, struct fuse_uring_mbuf, rb_node); > + > + return fuse_uring_rb_tree_buf_cmp(entry1->ubuf, node2) < 0; > +} > + These are only used in dev_uring.c, just put them in there instead of the header file. Thanks, Josef
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index bc77413932cf..349c1d16b0df 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2470,6 +2470,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#if IS_ENABLED(CONFIG_FUSE_IO_URING) + .mmap = fuse_uring_mmap, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 702a994cf192..9491bdaa5716 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -120,3 +120,117 @@ void fuse_uring_ring_destruct(struct fuse_ring *ring) mutex_destroy(&ring->start_stop_lock); } + +static inline int fuse_uring_current_nodeid(void) +{ + int cpu; + const struct cpumask *proc_mask = current->cpus_ptr; + + cpu = cpumask_first(proc_mask); + + return cpu_to_node(cpu); +} + +static char *fuse_uring_alloc_queue_buf(int size, int node) +{ + char *buf; + + if (size <= 0) { + pr_info("Invalid queue buf size: %d.\n", size); + return ERR_PTR(-EINVAL); + } + + buf = vmalloc_node_user(size, node); + return buf ? buf : ERR_PTR(-ENOMEM); +} + +/** + * fuse uring mmap, per ring qeuue. + * Userpsace maps a kernel allocated ring/queue buffer. For numa awareness, + * userspace needs to run the do the mapping from a core bound thread. + */ +int +fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct fuse_dev *fud = fuse_get_dev(filp); + struct fuse_conn *fc; + struct fuse_ring *ring; + size_t sz = vma->vm_end - vma->vm_start; + int ret; + struct fuse_uring_mbuf *new_node = NULL; + void *buf = NULL; + int nodeid; + + if (vma->vm_pgoff << PAGE_SHIFT != FUSE_URING_MMAP_OFF) { + pr_debug("Invalid offset, expected %llu got %lu\n", + FUSE_URING_MMAP_OFF, vma->vm_pgoff << PAGE_SHIFT); + return -EINVAL; + } + + if (!fud) + return -ENODEV; + fc = fud->fc; + ring = fc->ring; + if (!ring) + return -ENODEV; + + nodeid = ring->numa_aware ? fuse_uring_current_nodeid() : NUMA_NO_NODE; + + /* check if uring is configured and if the requested size matches */ + if (ring->nr_queues == 0 || ring->queue_depth == 0) { + ret = -EINVAL; + goto out; + } + + if (sz != ring->queue_buf_size) { + ret = -EINVAL; + pr_devel("mmap size mismatch, expected %zu got %zu\n", + ring->queue_buf_size, sz); + goto out; + } + + if (current->nr_cpus_allowed != 1 && ring->numa_aware) { + ret = -EINVAL; + pr_debug( + "Numa awareness, but thread has more than allowed cpu.\n"); + goto out; + } + + buf = fuse_uring_alloc_queue_buf(ring->queue_buf_size, nodeid); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out; + } + + new_node = kmalloc(sizeof(*new_node), GFP_USER); + if (unlikely(new_node == NULL)) { + ret = -ENOMEM; + goto out; + } + + ret = remap_vmalloc_range(vma, buf, 0); + if (ret) + goto out; + + mutex_lock(&ring->start_stop_lock); + /* + * In this function we do not know the queue the buffer belongs to. + * Later server side will pass the mmaped address, the kernel address + * will be found through the map. + */ + new_node->kbuf = buf; + new_node->ubuf = (void *)vma->vm_start; + rb_add(&new_node->rb_node, &ring->mem_buf_map, + fuse_uring_rb_tree_buf_less); + mutex_unlock(&ring->start_stop_lock); +out: + if (ret) { + kfree(new_node); + vfree(buf); + } + + pr_devel("%s: pid %d addr: %p sz: %zu ret: %d\n", __func__, + current->pid, (char *)vma->vm_start, sz, ret); + + return ret; +} diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 58ab4671deff..c455ae0e729a 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -181,6 +181,7 @@ struct fuse_ring { void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_conn_cfg(struct fuse_ring *ring, struct fuse_ring_config *rcfg); +int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma); int fuse_uring_queue_cfg(struct fuse_ring *ring, struct fuse_ring_queue_config *qcfg); void fuse_uring_ring_destruct(struct fuse_ring *ring); @@ -208,6 +209,27 @@ static inline void fuse_uring_conn_destruct(struct fuse_conn *fc) kfree(ring); } +static inline int fuse_uring_rb_tree_buf_cmp(const void *key, + const struct rb_node *node) +{ + const struct fuse_uring_mbuf *entry = + rb_entry(node, struct fuse_uring_mbuf, rb_node); + + if (key == entry->ubuf) + return 0; + + return (unsigned long)key < (unsigned long)entry->ubuf ? -1 : 1; +} + +static inline bool fuse_uring_rb_tree_buf_less(struct rb_node *node1, + const struct rb_node *node2) +{ + const struct fuse_uring_mbuf *entry1 = + rb_entry(node1, struct fuse_uring_mbuf, rb_node); + + return fuse_uring_rb_tree_buf_cmp(entry1->ubuf, node2) < 0; +} + static inline struct fuse_ring_queue * fuse_uring_get_queue(struct fuse_ring *ring, int qid) { diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 0449640f2501..00d0154ec2da 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1259,4 +1259,7 @@ struct fuse_supp_groups { #define FUSE_RING_HEADER_BUF_SIZE 4096 #define FUSE_RING_MIN_IN_OUT_ARG_SIZE 4096 +/* The offset parameter is used to identify the request type */ +#define FUSE_URING_MMAP_OFF 0xf8000000ULL + #endif /* _LINUX_FUSE_H */
Signed-off-by: Bernd Schubert <bschubert@ddn.com> --- fs/fuse/dev.c | 3 ++ fs/fuse/dev_uring.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 22 +++++++++ include/uapi/linux/fuse.h | 3 ++ 4 files changed, 142 insertions(+)