
[RFC,v3,17/17] fuse: {uring} Pin the user buffer

Message ID 20240901-b4-fuse-uring-rfcv3-without-mmap-v3-17-9207f7391444@ddn.com (mailing list archive)
State New
Series fuse: fuse-over-io-uring

Commit Message

Bernd Schubert Sept. 1, 2024, 1:37 p.m. UTC
This is to allow copying into the buffer from the application without
the need to copy in ring context (and with that, without the need for
the ring task to be active in kernel space).
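
For reference, the buffer layout this patch sets up (page indexes per
the FUSE_RING_HEADER_PG/FUSE_RING_PAYLOAD_PG defines added below):

	page 0  (FUSE_RING_HEADER_PG):  struct fuse_ring_req header,
	                                accessed via kmap_local_page()
	page 1+ (FUSE_RING_PAYLOAD_PG): payload pages used for the
	                                request argument copies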

It is also absolutely needed for now to avoid this teardown issue:

[ 1525.905504] KASAN: null-ptr-deref in range [0x00000000000001a0-0x00000000000001a7]
[ 1525.910431] CPU: 15 PID: 183 Comm: kworker/15:1 Tainted: G           O       6.10.0+ #48
[ 1525.916449] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 1525.922470] Workqueue: events io_fallback_req_func
[ 1525.925840] RIP: 0010:__lock_acquire+0x74/0x7b80
[ 1525.929010] Code: 89 bc 24 80 00 00 00 0f 85 1c 5f 00 00 83 3d 6e 80 b0 02 00 0f 84 1d 12 00 00 83 3d 65 c7 67 02 00 74 27 48 89 f8 48 c1 e8 03 <42> 80 3c 30 00 74 0d e8 50 44 42 00 48 8b bc 24 80 00 00 00 48 c7
[ 1525.942211] RSP: 0018:ffff88810b2af490 EFLAGS: 00010002
[ 1525.945672] RAX: 0000000000000034 RBX: 0000000000000000 RCX: 0000000000000001
[ 1525.950421] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000001a0
[ 1525.955200] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[ 1525.959979] R10: dffffc0000000000 R11: fffffbfff07b1cbe R12: 0000000000000000
[ 1525.964252] R13: 0000000000000001 R14: dffffc0000000000 R15: 0000000000000001
[ 1525.968225] FS:  0000000000000000(0000) GS:ffff88875b200000(0000) knlGS:0000000000000000
[ 1525.973932] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1525.976694] CR2: 00005555b6a381f0 CR3: 000000012f5f1000 CR4: 00000000000006f0
[ 1525.980030] Call Trace:
[ 1525.981371]  <TASK>
[ 1525.982567]  ? __die_body+0x66/0xb0
[ 1525.984376]  ? die_addr+0xc1/0x100
[ 1525.986111]  ? exc_general_protection+0x1c6/0x330
[ 1525.988401]  ? asm_exc_general_protection+0x22/0x30
[ 1525.990864]  ? __lock_acquire+0x74/0x7b80
[ 1525.992901]  ? mark_lock+0x9f/0x360
[ 1525.994635]  ? __lock_acquire+0x1420/0x7b80
[ 1525.996629]  ? attach_entity_load_avg+0x47d/0x550
[ 1525.998765]  ? hlock_conflict+0x5a/0x1f0
[ 1526.000515]  ? __bfs+0x2dc/0x5a0
[ 1526.001993]  lock_acquire+0x1fb/0x3d0
[ 1526.004727]  ? gup_fast_fallback+0x13f/0x1d80
[ 1526.006586]  ? gup_fast_fallback+0x13f/0x1d80
[ 1526.008412]  gup_fast_fallback+0x158/0x1d80
[ 1526.010170]  ? gup_fast_fallback+0x13f/0x1d80
[ 1526.011999]  ? __lock_acquire+0x2b07/0x7b80
[ 1526.013793]  __iov_iter_get_pages_alloc+0x36e/0x980
[ 1526.015876]  ? do_raw_spin_unlock+0x5a/0x8a0
[ 1526.017734]  iov_iter_get_pages2+0x56/0x70
[ 1526.019491]  fuse_copy_fill+0x48e/0x980 [fuse]
[ 1526.021400]  fuse_copy_args+0x174/0x6a0 [fuse]
[ 1526.023199]  fuse_uring_prepare_send+0x319/0x6c0 [fuse]
[ 1526.025178]  fuse_uring_send_req_in_task+0x42/0x100 [fuse]
[ 1526.027163]  io_fallback_req_func+0xb4/0x170
[ 1526.028737]  ? process_scheduled_works+0x75b/0x1160
[ 1526.030445]  process_scheduled_works+0x85c/0x1160
[ 1526.032073]  worker_thread+0x8ba/0xce0
[ 1526.033388]  kthread+0x23e/0x2b0
[ 1526.035404]  ? pr_cont_work_flush+0x290/0x290
[ 1526.036958]  ? kthread_blkcg+0xa0/0xa0
[ 1526.038321]  ret_from_fork+0x30/0x60
[ 1526.039600]  ? kthread_blkcg+0xa0/0xa0
[ 1526.040942]  ret_from_fork_asm+0x11/0x20
[ 1526.042353]  </TASK>

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
---
 fs/fuse/dev.c         |   9 +++
 fs/fuse/dev_uring.c   | 186 ++++++++++++++++++++++++++++++++------------------
 fs/fuse/dev_uring_i.h |  15 ++--
 fs/fuse/fuse_dev_i.h  |   2 +
 4 files changed, 143 insertions(+), 69 deletions(-)

Comments

Jens Axboe Sept. 4, 2024, 3:47 p.m. UTC | #1
On 9/1/24 7:37 AM, Bernd Schubert wrote:
> This is to allow copying into the buffer from the application
> without the need to copy in ring context (and with that,
> the need that the ring task is active in kernel space).
> 
> Also absolutely needed for now to avoid this teardown issue

I'm fine with using these helpers, but they are absolutely not needed
to avoid that teardown issue - well, they may help because the buffer
is already mapped, but it's really the fault of your handler attempting
to map in user pages when it's invoked from teardown/fallback
task_work. If invoked and the ring is dying or not in the right task
(as per the patch from Pavel), then just clean up and return
-ECANCELED.
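
Roughly something like this at the top of the task_work handler - an
untested sketch, assuming the flag from Pavel's patch is visible there;
the exact cleanup calls are assumptions based on the helpers in this
series:

static void fuse_uring_send_req_in_task(struct io_uring_cmd *cmd,
					unsigned int issue_flags)
{
	struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
	struct fuse_ring_ent *ring_ent = pdu->ring_ent;

	if (issue_flags & IO_URING_F_TASK_DEAD) {
		/* teardown/fallback context - must not map user pages */
		fuse_uring_req_end(ring_ent, true, -ECANCELED);
		io_uring_cmd_done(cmd, -ECANCELED, 0, issue_flags);
		return;
	}

	/* ... normal send path: fuse_uring_prepare_send() etc. ... */
}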

> +/*
> + * Copy from memmap.c, should be exported
> + */
> +static void io_pages_free(struct page ***pages, int npages)
> +{
> +	struct page **page_array = *pages;
> +
> +	if (!page_array)
> +		return;
> +
> +	unpin_user_pages(page_array, npages);
> +	kvfree(page_array);
> +	*pages = NULL;
> +}

I noticed this and the mapping helper had been copied before seeing the
comments - just export them from memmap.c and use those rather than
copying them into the code. Add that as a prep patch.
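
A sketch of such a prep patch (untested; whether the helpers are
currently static and where the declarations should live are
assumptions, and fuse being buildable as a module would also need the
export):

--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@
-static void io_pages_free(struct page ***pages, int npages)
+void io_pages_free(struct page ***pages, int npages)
 {
 	...
 }
+EXPORT_SYMBOL_GPL(io_pages_free);

with the same treatment for io_pin_pages(), plus matching declarations
in a header that fs/fuse/ can include.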

> @@ -417,6 +437,7 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
>  		goto seterr;
>  	}
>  
> +	/* FIXME copied from dev.c, check what 512 means  */
>  	if (oh->error <= -512 || oh->error > 0) {
>  		err = -EINVAL;
>  		goto seterr;

-512 is -ERESTARTSYS
Bernd Schubert Sept. 4, 2024, 4:08 p.m. UTC | #2
Hi Jens,

thanks for your help.

On 9/4/24 17:47, Jens Axboe wrote:
> On 9/1/24 7:37 AM, Bernd Schubert wrote:
>> This is to allow copying into the buffer from the application
>> without the need to copy in ring context (and with that,
>> the need that the ring task is active in kernel space).
>>
>> Also absolutely needed for now to avoid this teardown issue
> 
> I'm fine using these helpers, but they are absolutely not needed to
> avoid that teardown issue - well they may help because it's already
> mapped, but it's really the fault of your handler from attempting to map
> in user pages from when it's teardown/fallback task_work. If invoked and
> the ring is dying or not in the right task (as per the patch from
> Pavel), then just cleanup and return -ECANCELED.

As I had posted on Friday/Saturday, it didn't work. I had added a
debug pr_info into Pavel's patch; somehow it didn't trigger on
PF_EXITING, and I haven't debugged it further yet as I was working on
the pin anyway. And since Monday I have been occupied with other
work...

For this series it is needed to avoid kernel crashes. If we can fix
patches 15 and 16, all the better. Although we will still need it later
on as an optimization.



> 
>> +/*
>> + * Copy from memmap.c, should be exported
>> + */
>> +static void io_pages_free(struct page ***pages, int npages)
>> +{
>> +	struct page **page_array = *pages;
>> +
>> +	if (!page_array)
>> +		return;
>> +
>> +	unpin_user_pages(page_array, npages);
>> +	kvfree(page_array);
>> +	*pages = NULL;
>> +}
> 
> I noticed this and the mapping helper being copied before seeing the
> comments - just export them from memmap.c and use those rather than
> copying in the code. Add that as a prep patch.

No issue with doing that either. The hard part is then getting it
through different branches. I had removed the big optimization of
__wake_up_on_current_cpu from this series, because it needs another
export.


> 
>> @@ -417,6 +437,7 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
>>  		goto seterr;
>>  	}
>>  
>> +	/* FIXME copied from dev.c, check what 512 means  */
>>  	if (oh->error <= -512 || oh->error > 0) {
>>  		err = -EINVAL;
>>  		goto seterr;
> 
> -512 is -ERESTARTSYS
> 

Ah, thank you! I'm going to add a separate patch for dev.c; as I wrote,
this was just copy-and-paste.

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 592d0d96a106..779b23fa01c2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2028,7 +2028,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
        }
 
        err = -EINVAL;
-       if (oh.error <= -512 || oh.error > 0)
+       if (oh.error <= -ERESTARTSYS || oh.error > 0)
                goto copy_finish;
 
        spin_lock(&fpq->lock);


Thanks,
Bernd
Jens Axboe Sept. 4, 2024, 4:16 p.m. UTC | #3
On 9/4/24 10:08 AM, Bernd Schubert wrote:
> Hi Jens,
> 
> thanks for your help.
> 
> On 9/4/24 17:47, Jens Axboe wrote:
>> On 9/1/24 7:37 AM, Bernd Schubert wrote:
>>> This is to allow copying into the buffer from the application
>>> without the need to copy in ring context (and with that,
>>> the need that the ring task is active in kernel space).
>>>
>>> Also absolutely needed for now to avoid this teardown issue
>>
>> I'm fine using these helpers, but they are absolutely not needed to
>> avoid that teardown issue - well they may help because it's already
>> mapped, but it's really the fault of your handler from attempting to map
>> in user pages from when it's teardown/fallback task_work. If invoked and
>> the ring is dying or not in the right task (as per the patch from
>> Pavel), then just cleanup and return -ECANCELED.
> 
> As I had posted on Friday/Saturday, it didn't work. I had added a 
> debug pr_info into Pavel's patch, somehow it didn't trigger on PF_EXITING 
> and I didn't further debug it yet as I was working on the pin anyway.
> And since Monday occupied with other work...

Then there's something wrong with that patch, as it definitely should
work. How did you reproduce the teardown crash? I'll take a look here.

That said, it may indeed be the better approach to pin upfront. I just
want to make sure it's not done as a bug fix for something that should
not be happening.

> For this series it is needed to avoid kernel crashes. If we can fix 
> patch 15 and 16, the better. Although we will still later on need it as
> optimization.

Yeah exactly, didn't see this before typing the above :-)

>>> +/*
>>> + * Copy from memmap.c, should be exported
>>> + */
>>> +static void io_pages_free(struct page ***pages, int npages)
>>> +{
>>> +	struct page **page_array = *pages;
>>> +
>>> +	if (!page_array)
>>> +		return;
>>> +
>>> +	unpin_user_pages(page_array, npages);
>>> +	kvfree(page_array);
>>> +	*pages = NULL;
>>> +}
>>
>> I noticed this and the mapping helper being copied before seeing the
>> comments - just export them from memmap.c and use those rather than
>> copying in the code. Add that as a prep patch.
> 
> No issue to do that either. The hard part is then to get it through
> different branches. I had removed the big optimization of 
> __wake_up_on_current_cpu in this series, because it needs another
> export.

It's not that hard, just split it out in the next patch and I'll be
happy to ack/review it so it can go in with the other patches rather
than needing to go in separately.
Jens Axboe Sept. 4, 2024, 6:59 p.m. UTC | #4
On 9/1/24 7:37 AM, Bernd Schubert wrote:
> @@ -465,53 +486,41 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
>  
>  static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
>  				     struct fuse_req *req,
> -				     struct fuse_ring_ent *ent)
> +				     struct fuse_ring_ent *ent,
> +				     struct fuse_ring_req *rreq)
>  {
> -	struct fuse_ring_req __user *rreq = ent->rreq;
>  	struct fuse_copy_state cs;
>  	struct fuse_args *args = req->args;
>  	struct iov_iter iter;
> -	int err;
> -	int res_arg_len;
> +	int res_arg_len, err;
>  
> -	err = copy_from_user(&res_arg_len, &rreq->in_out_arg_len,
> -			     sizeof(res_arg_len));
> -	if (err)
> -		return err;
> -
> -	err = import_ubuf(ITER_SOURCE, (void __user *)&rreq->in_out_arg,
> -			  ent->max_arg_len, &iter);
> -	if (err)
> -		return err;
> +	res_arg_len = rreq->in_out_arg_len;
>  
>  	fuse_copy_init(&cs, 0, &iter);
>  	cs.is_uring = 1;
> +	cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
>  	cs.req = req;
>  
> -	return fuse_copy_out_args(&cs, args, res_arg_len);
> +	err = fuse_copy_out_args(&cs, args, res_arg_len);
> +
> +	return err;
>  }

This last assignment, and 'err' in general, can go away after this
patch.
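
I.e. the tail of fuse_uring_copy_from_ring() would then just be
(sketch):

	fuse_copy_init(&cs, 0, &iter);
	cs.is_uring = 1;
	cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
	cs.req = req;

	return fuse_copy_out_args(&cs, args, res_arg_len);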
Bernd Schubert Sept. 4, 2024, 7:25 p.m. UTC | #5
On 9/4/24 18:16, Jens Axboe wrote:
> On 9/4/24 10:08 AM, Bernd Schubert wrote:
>> Hi Jens,
>>
>> thanks for your help.
>>
>> On 9/4/24 17:47, Jens Axboe wrote:
>>> On 9/1/24 7:37 AM, Bernd Schubert wrote:
>>>> This is to allow copying into the buffer from the application
>>>> without the need to copy in ring context (and with that,
>>>> the need that the ring task is active in kernel space).
>>>>
>>>> Also absolutely needed for now to avoid this teardown issue
>>>
>>> I'm fine using these helpers, but they are absolutely not needed to
>>> avoid that teardown issue - well they may help because it's already
>>> mapped, but it's really the fault of your handler from attempting to map
>>> in user pages from when it's teardown/fallback task_work. If invoked and
>>> the ring is dying or not in the right task (as per the patch from
>>> Pavel), then just cleanup and return -ECANCELED.
>>
>> As I had posted on Friday/Saturday, it didn't work. I had added a 
>> debug pr_info into Pavel's patch, somehow it didn't trigger on PF_EXITING 
>> and I didn't further debug it yet as I was working on the pin anyway.
>> And since Monday occupied with other work...
> 
> Then there's something wrong with that patch, as it definitely should
> work. How did you reproduce the teardown crash? I'll take a look here.

Thank you! In this specific case:

1) Run passthrough_hp with --debug-fuse

2) dd if=/dev/zero of=/scratch/test/testfile bs=1M count=1

Then on the console that has the passthrough_hp output and runs slow
with my ASAN/etc kernel: ctrl-z and kill -9 %
I guess a pkill -9 passthrough_hp should also work.


But I can investigate myself later on what the issue with PF_EXITING
is, just not today and maybe not tomorrow either.

> 
> That said, it may indeed be the better approach to pin upfront. I just
> want to make sure it's not done as a bug fix for something that should
> not be happening.
> 
>> For this series it is needed to avoid kernel crashes. If we can fix 
>> patch 15 and 16, the better. Although we will still later on need it as
>> optimization.
> 
> Yeah exactly, didn't see this before typing the above :-)
> 
>>>> +/*
>>>> + * Copy from memmap.c, should be exported
>>>> + */
>>>> +static void io_pages_free(struct page ***pages, int npages)
>>>> +{
>>>> +	struct page **page_array = *pages;
>>>> +
>>>> +	if (!page_array)
>>>> +		return;
>>>> +
>>>> +	unpin_user_pages(page_array, npages);
>>>> +	kvfree(page_array);
>>>> +	*pages = NULL;
>>>> +}
>>>
>>> I noticed this and the mapping helper being copied before seeing the
>>> comments - just export them from memmap.c and use those rather than
>>> copying in the code. Add that as a prep patch.
>>
>> No issue to do that either. The hard part is then to get it through
>> different branches. I had removed the big optimization of 
>> __wake_up_on_current_cpu in this series, because it needs another
>> export.
> 
> It's not that hard, just split it out in the next patch and I'll be
> happy to ack/review it so it can go in with the other patches rather
> than needing to go in separately.

Great, thank you very much, will do!


Thanks,
Bernd
Jens Axboe Sept. 4, 2024, 7:40 p.m. UTC | #6
On 9/4/24 1:25 PM, Bernd Schubert wrote:
> 
> 
> On 9/4/24 18:16, Jens Axboe wrote:
>> On 9/4/24 10:08 AM, Bernd Schubert wrote:
>>> Hi Jens,
>>>
>>> thanks for your help.
>>>
>>> On 9/4/24 17:47, Jens Axboe wrote:
>>>> On 9/1/24 7:37 AM, Bernd Schubert wrote:
>>>>> This is to allow copying into the buffer from the application
>>>>> without the need to copy in ring context (and with that,
>>>>> the need that the ring task is active in kernel space).
>>>>>
>>>>> Also absolutely needed for now to avoid this teardown issue
>>>>
>>>> I'm fine using these helpers, but they are absolutely not needed to
>>>> avoid that teardown issue - well they may help because it's already
>>>> mapped, but it's really the fault of your handler from attempting to map
>>>> in user pages from when it's teardown/fallback task_work. If invoked and
>>>> the ring is dying or not in the right task (as per the patch from
>>>> Pavel), then just cleanup and return -ECANCELED.
>>>
>>> As I had posted on Friday/Saturday, it didn't work. I had added a 
>>> debug pr_info into Pavel's patch, somehow it didn't trigger on PF_EXITING 
>>> and I didn't further debug it yet as I was working on the pin anyway.
>>> And since Monday occupied with other work...
>>
>> Then there's something wrong with that patch, as it definitely should
>> work. How did you reproduce the teardown crash? I'll take a look here.
> 
> Thank you! In this specific case
> 
> 1) Run passthrough_hp with --debug-fuse
> 
> 2) dd if=/dev/zero of=/scratch/test/testfile bs=1M count=1
> 
> Then on the console that has passthrough_hp output and runs slow with my
> ASAN/etc kernel: ctrl-z and kill -9 %
> I guess a pkill -9 passthrough_hp should also work

Eerily similar to what I tried, but I managed to get it to trigger.
What's in there should work, but I think checking for task != current
is better and not race-prone like PF_EXITING is. So maybe try the
incremental below.

diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 55bdcb4b63b3..fa5a0f724a84 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -121,7 +121,8 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	unsigned flags = IO_URING_F_COMPLETE_DEFER;
 
-	if (req->task->flags & PF_EXITING)
+	/* Different task should only happen if the original is going away */
+	if (req->task != current)
 		flags |= IO_URING_F_TASK_DEAD;
 
 	/* task_work executor checks the deffered list completion */
Bernd Schubert Sept. 5, 2024, 9:04 p.m. UTC | #7
On 9/4/24 21:40, Jens Axboe wrote:
> On 9/4/24 1:25 PM, Bernd Schubert wrote:
>>
>>
>> On 9/4/24 18:16, Jens Axboe wrote:
>>> On 9/4/24 10:08 AM, Bernd Schubert wrote:
>>>> Hi Jens,
>>>>
>>>> thanks for your help.
>>>>
>>>> On 9/4/24 17:47, Jens Axboe wrote:
>>>>> On 9/1/24 7:37 AM, Bernd Schubert wrote:
>>>>>> This is to allow copying into the buffer from the application
>>>>>> without the need to copy in ring context (and with that,
>>>>>> the need that the ring task is active in kernel space).
>>>>>>
>>>>>> Also absolutely needed for now to avoid this teardown issue
>>>>>
>>>>> I'm fine using these helpers, but they are absolutely not needed to
>>>>> avoid that teardown issue - well they may help because it's already
>>>>> mapped, but it's really the fault of your handler from attempting to map
>>>>> in user pages from when it's teardown/fallback task_work. If invoked and
>>>>> the ring is dying or not in the right task (as per the patch from
>>>>> Pavel), then just cleanup and return -ECANCELED.
>>>>
>>>> As I had posted on Friday/Saturday, it didn't work. I had added a 
>>>> debug pr_info into Pavel's patch, somehow it didn't trigger on PF_EXITING 
>>>> and I didn't further debug it yet as I was working on the pin anyway.
>>>> And since Monday occupied with other work...
>>>
>>> Then there's something wrong with that patch, as it definitely should
>>> work. How did you reproduce the teardown crash? I'll take a look here.
>>
>> Thank you! In this specific case
>>
>> 1) Run passthrough_hp with --debug-fuse
>>
>> 2) dd if=/dev/zero of=/scratch/test/testfile bs=1M count=1
>>
>> Then on the console that has passthrough_hp output and runs slow with my
>> ASAN/etc kernel: ctrl-z and kill -9 %
>> I guess a pkill -9 passthrough_hp should also work
> 
> Eerily similar to what I tried, but I managed to get it to trigger.
> Should work what's in there, but I think checking for task != current is
> better and not race prone like PF_EXITING is. So maybe? Try with the
> below incremental.
> 
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index 55bdcb4b63b3..fa5a0f724a84 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -121,7 +121,8 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
>  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
>  	unsigned flags = IO_URING_F_COMPLETE_DEFER;
>  
> -	if (req->task->flags & PF_EXITING)
> +	/* Different task should only happen if the original is going away */
> +	if (req->task != current)
>  		flags |= IO_URING_F_TASK_DEAD;
>  
>  	/* task_work executor checks the deffered list completion */
> 

Thanks, just tested: this version works fine!
My user of that (patch 16/17) left the fuse ring entry in a bad state -
fixed in my v4 branch.

Thanks,
Bernd

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9f0f2120b1fa..492bb95fde4e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -769,6 +769,15 @@  static int fuse_copy_fill(struct fuse_copy_state *cs)
 			cs->pipebufs++;
 			cs->nr_segs++;
 		}
+	} else if (cs->is_uring) {
+		cs->pg = cs->ring.pages[cs->ring.page_idx++];
+		/*
+		 * not strictly needed, just to avoid a uring exception in
+		 * fuse_copy_finish
+		 */
+		get_page(cs->pg);
+		cs->len = PAGE_SIZE;
+		cs->offset = 0;
 	} else {
 		size_t off;
 		err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index a65c5d08fce1..4cc0facaaae3 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -29,6 +29,9 @@ 
 #include <linux/topology.h>
 #include <linux/io_uring/cmd.h>
 
+#define FUSE_RING_HEADER_PG 0
+#define FUSE_RING_PAYLOAD_PG 1
+
 struct fuse_uring_cmd_pdu {
 	struct fuse_ring_ent *ring_ent;
 };
@@ -250,6 +253,21 @@  static void fuse_uring_stop_fuse_req_end(struct fuse_ring_ent *ent)
 	fuse_request_end(req);
 }
 
+/*
+ * Copy from memmap.c, should be exported
+ */
+static void io_pages_free(struct page ***pages, int npages)
+{
+	struct page **page_array = *pages;
+
+	if (!page_array)
+		return;
+
+	unpin_user_pages(page_array, npages);
+	kvfree(page_array);
+	*pages = NULL;
+}
+
 /*
  * Release a request/entry on connection tear down
  */
@@ -275,6 +293,8 @@  static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent,
 	if (ent->fuse_req)
 		fuse_uring_stop_fuse_req_end(ent);
 
+	io_pages_free(&ent->user_pages, ent->nr_user_pages);
+
 	ent->state = FRRS_FREED;
 }
 
@@ -417,6 +437,7 @@  static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
 		goto seterr;
 	}
 
+	/* FIXME copied from dev.c, check what 512 means  */
 	if (oh->error <= -512 || oh->error > 0) {
 		err = -EINVAL;
 		goto seterr;
@@ -465,53 +486,41 @@  static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
 
 static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
 				     struct fuse_req *req,
-				     struct fuse_ring_ent *ent)
+				     struct fuse_ring_ent *ent,
+				     struct fuse_ring_req *rreq)
 {
-	struct fuse_ring_req __user *rreq = ent->rreq;
 	struct fuse_copy_state cs;
 	struct fuse_args *args = req->args;
 	struct iov_iter iter;
-	int err;
-	int res_arg_len;
+	int res_arg_len, err;
 
-	err = copy_from_user(&res_arg_len, &rreq->in_out_arg_len,
-			     sizeof(res_arg_len));
-	if (err)
-		return err;
-
-	err = import_ubuf(ITER_SOURCE, (void __user *)&rreq->in_out_arg,
-			  ent->max_arg_len, &iter);
-	if (err)
-		return err;
+	res_arg_len = rreq->in_out_arg_len;
 
 	fuse_copy_init(&cs, 0, &iter);
 	cs.is_uring = 1;
+	cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
 	cs.req = req;
 
-	return fuse_copy_out_args(&cs, args, res_arg_len);
+	err = fuse_copy_out_args(&cs, args, res_arg_len);
+
+	return err;
 }
 
- /*
-  * Copy data from the req to the ring buffer
-  */
+/*
+ * Copy data from the req to the ring buffer
+ */
 static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
-				   struct fuse_ring_ent *ent)
+				   struct fuse_ring_ent *ent,
+				   struct fuse_ring_req *rreq)
 {
-	struct fuse_ring_req __user *rreq = ent->rreq;
 	struct fuse_copy_state cs;
 	struct fuse_args *args = req->args;
-	int err, res;
+	int err;
 	struct iov_iter iter;
 
-	err = import_ubuf(ITER_DEST, (void __user *)&rreq->in_out_arg,
-			  ent->max_arg_len, &iter);
-	if (err) {
-		pr_info("Import user buffer failed\n");
-		return err;
-	}
-
 	fuse_copy_init(&cs, 1, &iter);
 	cs.is_uring = 1;
+	cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
 	cs.req = req;
 	err = fuse_copy_args(&cs, args->in_numargs, args->in_pages,
 			     (struct fuse_arg *)args->in_args, 0);
@@ -520,10 +529,7 @@  static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
 		return err;
 	}
 
-	BUILD_BUG_ON((sizeof(rreq->in_out_arg_len) != sizeof(cs.ring.offset)));
-	res = copy_to_user(&rreq->in_out_arg_len, &cs.ring.offset,
-			   sizeof(rreq->in_out_arg_len));
-	err = res > 0 ? -EFAULT : res;
+	rreq->in_out_arg_len = cs.ring.offset;
 
 	return err;
 }
@@ -531,11 +537,11 @@  static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
 static int
 fuse_uring_prepare_send(struct fuse_ring_ent *ring_ent)
 {
-	struct fuse_ring_req *rreq = ring_ent->rreq;
+	struct fuse_ring_req *rreq = NULL;
 	struct fuse_ring_queue *queue = ring_ent->queue;
 	struct fuse_ring *ring = queue->ring;
 	struct fuse_req *req = ring_ent->fuse_req;
-	int err = 0, res;
+	int err = 0;
 
 	if (WARN_ON(ring_ent->state != FRRS_FUSE_REQ)) {
 		pr_err("qid=%d tag=%d ring-req=%p buf_req=%p invalid state %d on send\n",
@@ -551,25 +557,27 @@  fuse_uring_prepare_send(struct fuse_ring_ent *ring_ent)
 		 __func__, queue->qid, ring_ent->tag, ring_ent->state,
 		 req->in.h.opcode, req->in.h.unique);
 
+	rreq = kmap_local_page(ring_ent->user_pages[FUSE_RING_HEADER_PG]);
+
 	/* copy the request */
-	err = fuse_uring_copy_to_ring(ring, req, ring_ent);
+	err = fuse_uring_copy_to_ring(ring, req, ring_ent, rreq);
 	if (unlikely(err)) {
 		pr_info("Copy to ring failed: %d\n", err);
 		goto err;
 	}
 
 	/* copy fuse_in_header */
-	res = copy_to_user(&rreq->in, &req->in.h, sizeof(rreq->in));
-	err = res > 0 ? -EFAULT : res;
-	if (err)
-		goto err;
+	rreq->in = req->in.h;
 
+	err = 0;
 	set_bit(FR_SENT, &req->flags);
-	return 0;
-
+out:
+	if (rreq)
+		kunmap_local(rreq);
+	return err;
 err:
 	fuse_uring_req_end(ring_ent, true, err);
-	return err;
+	goto out;
 }
 
 /*
@@ -682,16 +690,13 @@  static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
 {
 	struct fuse_ring *ring = ring_ent->queue->ring;
 	struct fuse_conn *fc = ring->fc;
-	struct fuse_ring_req *rreq = ring_ent->rreq;
+	struct fuse_ring_req *rreq;
 	struct fuse_req *req = ring_ent->fuse_req;
 	ssize_t err = 0;
 	bool set_err = false;
 
-	err = copy_from_user(&req->out.h, &rreq->out, sizeof(req->out.h));
-	if (err) {
-		req->out.h.error = err;
-		goto out;
-	}
+	rreq = kmap_local_page(ring_ent->user_pages[FUSE_RING_HEADER_PG]);
+	req->out.h = rreq->out;
 
 	err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
 	if (err) {
@@ -701,7 +706,8 @@  static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
 		goto out;
 	}
 
-	err = fuse_uring_copy_from_ring(ring, req, ring_ent);
+	err = fuse_uring_copy_from_ring(ring, req, ring_ent, rreq);
+	kunmap_local(rreq);
 	if (err)
 		set_err = true;
 
@@ -830,6 +836,46 @@  __must_hold(ring_ent->queue->lock)
 	return 0;
 }
 
+/*
+ * Copy from memmap.c, should be exported there
+ */
+static struct page **io_pin_pages(unsigned long uaddr, unsigned long len,
+				  int *npages)
+{
+	unsigned long start, end, nr_pages;
+	struct page **pages;
+	int ret;
+
+	end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	start = uaddr >> PAGE_SHIFT;
+	nr_pages = end - start;
+	if (WARN_ON_ONCE(!nr_pages))
+		return ERR_PTR(-EINVAL);
+
+	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+					pages);
+	/* success, mapped all pages */
+	if (ret == nr_pages) {
+		*npages = nr_pages;
+		return pages;
+	}
+
+	/* partial map, or didn't map anything */
+	if (ret >= 0) {
+		/* if we did partial map, release any pages we did get */
+		if (ret)
+			unpin_user_pages(pages, ret);
+		ret = -EFAULT;
+	}
+	kvfree(pages);
+	return ERR_PTR(ret);
+}
+
+
 /* FUSE_URING_REQ_FETCH handler */
 static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
 			    struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -837,39 +883,48 @@  static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
 {
 	struct fuse_ring *ring = ring_ent->queue->ring;
 	struct fuse_ring_queue *queue = ring_ent->queue;
-	int ret;
+	int err;
 
 	/* No other bit must be set here */
-	ret = -EINVAL;
+	err = -EINVAL;
 	if (ring_ent->state != FRRS_INIT)
-		goto err;
+		goto err_unlock;
 
 	/*
 	 * FUSE_URING_REQ_FETCH is an initialization exception, needs
 	 * state override
 	 */
 	ring_ent->state = FRRS_USERSPACE;
-	ret = fuse_ring_ring_ent_unset_userspace(ring_ent);
-	if (ret != 0) {
-		pr_info_ratelimited(
-			"qid=%d tag=%d register req state %d expected %d",
-			queue->qid, ring_ent->tag, ring_ent->state,
-			FRRS_INIT);
+	fuse_ring_ring_ent_unset_userspace(ring_ent);
+
+	err = _fuse_uring_fetch(ring_ent, cmd, issue_flags);
+	if (err)
+		goto err_unlock;
+
+	spin_unlock(&queue->lock);
+
+	/* must not hold the queue->lock */
+	ring_ent->user_pages = io_pin_pages(ring_ent->user_buf,
+					    ring_ent->user_buf_len,
+					    &ring_ent->nr_user_pages);
+	if (IS_ERR(ring_ent->user_pages)) {
+		err = PTR_ERR(ring_ent->user_pages);
+		pr_info("qid=%d ent=%d pin-res=%d\n",
+			queue->qid, ring_ent->tag, err);
 		goto err;
 	}
 
-	ret = _fuse_uring_fetch(ring_ent, cmd, issue_flags);
-	if (ret)
-		goto err;
-
 	/*
 	 * The ring entry is registered now and needs to be handled
 	 * for shutdown.
 	 */
 	atomic_inc(&ring->queue_refs);
-err:
+	return 0;
+
+err_unlock:
 	spin_unlock(&queue->lock);
-	return ret;
+err:
+	return err;
 }
 
 /**
@@ -920,7 +975,9 @@  int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	if (unlikely(fc->aborted || queue->stopped))
 		goto err_unlock;
 
-	ring_ent->rreq = (void __user *)cmd_req->buf_ptr;
+	ring_ent->user_buf = cmd_req->buf_ptr;
+	ring_ent->user_buf_len = cmd_req->buf_len;
+
 	ring_ent->max_arg_len = cmd_req->buf_len -
 				offsetof(struct fuse_ring_req, in_out_arg);
 	ret = -EINVAL;
@@ -930,7 +987,6 @@  int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		goto err_unlock;
 	}
 
-	ring_ent->rreq = (void __user *)cmd_req->buf_ptr;
 	ring_ent->max_arg_len = cmd_req->buf_len -
 				offsetof(struct fuse_ring_req, in_out_arg);
 	if (cmd_req->buf_len < ring->req_buf_sz) {
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index f1247ee57dc4..2e43b2e9bcf2 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -60,10 +60,17 @@  struct fuse_ring_ent {
 	/* fuse_req assigned to the ring entry */
 	struct fuse_req *fuse_req;
 
-	/*
-	 * buffer provided by fuse server
-	 */
-	struct fuse_ring_req __user *rreq;
+	/* buffer provided by fuse server */
+	unsigned long __user user_buf;
+
+	/* length of user_buf */
+	size_t user_buf_len;
+
+	/* mapped user_buf pages */
+	struct page **user_pages;
+
+	/* number of user pages */
+	int nr_user_pages;
 
 	/* struct fuse_ring_req::in_out_arg size*/
 	size_t max_arg_len;
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 0fbb4f28261c..63e0e5dcb9f4 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -32,6 +32,8 @@  struct fuse_copy_state {
 	struct {
 		/* overall offset with the user buffer */
 		unsigned int offset;
+		struct page **pages;
+		int page_idx;
 	} ring;
 };