
[6/6] fuse: Verify userspace asks to requeue interrupt that we really sent

Message ID 154149666097.17764.12092615786683141764.stgit@localhost.localdomain (mailing list archive)
State New, archived
Series fuse: Interrupt-related optimizations and improvements

Commit Message

Kirill Tkhai Nov. 6, 2018, 9:31 a.m. UTC
When queue_interrupt() is called from fuse_dev_do_write(),
the call comes directly from userspace. Userspace may pass
any request id, even the id of a request we have not
interrupted (or even of a background request). This patch
adds a sanity check to keep the kernel safe against that.
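
To illustrate the userspace side of this path: a requeue happens when
the daemon answers the FUSE_INTERRUPT request with -EAGAIN, and nothing
in the current code forces it to pass back an id the kernel actually
interrupted. A minimal, hypothetical daemon-side sketch (fuse_fd and
intr_unique are assumed names: the open /dev/fuse fd and the unique id
taken from the FUSE_INTERRUPT request's own header):

   /* Hypothetical sketch, not part of this series. */
   #include <errno.h>
   #include <stdint.h>
   #include <unistd.h>
   #include <linux/fuse.h>

   static int requeue_interrupt(int fuse_fd, uint64_t intr_unique)
   {
           struct fuse_out_header oh = {
                   .len    = sizeof(oh),
                   .error  = -EAGAIN,     /* "queue this interrupt again" */
                   .unique = intr_unique, /* an arbitrary id can be passed here */
           };

           return write(fuse_fd, &oh, sizeof(oh)) == sizeof(oh) ? 0 : -errno;
   }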

The problem is that a real interrupt may be queued and
requeued by two tasks on two cpus. In this case, the
requeuer has no guarantee that it sees the FR_INTERRUPTED
bit on its cpu (since we know nothing about the way
userspace manages requests between its threads and
whether it uses smp barriers).

To eliminate this problem, the queuer writes the
FR_INTERRUPTED bit again under fiq->waitq.lock, which
guarantees the requeuer will see the bit when it checks it.

I tried to find a solution that does not affect performance
and does not require taking more locks. This is the reason
the solution below is worse:

   request_wait_answer()
   {
     ...
  +  spin_lock(&fiq->waitq.lock);
     set_bit(FR_INTERRUPTED, &req->flags);
  +  spin_unlock(&fiq->waitq.lock);
     ...
   }

Also, extending fuse_dev_do_read() with the fix does not look
like a better idea, since it is already a big function:

   fuse_dev_do_read()
   {
     ...
     if (test_bit(FR_INTERRUPTED, &req->flags)) {
  +      /* Comment */
  +      barrier();
  +      set_bit(FR_INTERRUPTED, &req->flags);
         queue_interrupt(fiq, req);
     }
     ...
   }

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/fuse/dev.c |   26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

Comments

Miklos Szeredi Nov. 7, 2018, 1:55 p.m. UTC | #1
On Tue, Nov 6, 2018 at 10:31 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> When queue_interrupt() is called from fuse_dev_do_write(),
> the call comes directly from userspace. Userspace may pass
> any request id, even the id of a request we have not
> interrupted (or even of a background request). This patch
> adds a sanity check to keep the kernel safe against that.

Okay, I understand this far.

> The problem is that a real interrupt may be queued and
> requeued by two tasks on two cpus. In this case, the
> requeuer has no guarantee that it sees the FR_INTERRUPTED
> bit on its cpu (since we know nothing about the way
> userspace manages requests between its threads and
> whether it uses smp barriers).

This sounds like BS.  There's an explicit smp_mb__after_atomic()
after the set_bit(FR_INTERRUPTED, ...), which means FR_INTERRUPTED
is going to be visible on all CPUs after this; no need to fool
around with setting FR_INTERRUPTED again, etc...



>
> To eliminate this problem, the queuer writes the
> FR_INTERRUPTED bit again under fiq->waitq.lock, which
> guarantees the requeuer will see the bit when it checks it.
>
> I tried to find a solution that does not affect performance
> and does not require taking more locks. This is the reason
> the solution below is worse:
>
>    request_wait_answer()
>    {
>      ...
>   +  spin_lock(&fiq->waitq.lock);
>      set_bit(FR_INTERRUPTED, &req->flags);
>   +  spin_unlock(&fiq->waitq.lock);
>      ...
>    }
>
> Also, extending fuse_dev_do_read() with the fix does not look
> like a better idea, since it is already a big function:
>
>    fuse_dev_do_read()
>    {
>      ...
>      if (test_bit(FR_INTERRUPTED, &req->flags)) {
>   +      /* Comment */
>   +      barrier();
>   +      set_bit(FR_INTERRUPTED, &req->flags);
>          queue_interrupt(fiq, req);
>      }
>      ...
>    }
>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  fs/fuse/dev.c |   26 +++++++++++++++++++++-----
>  1 file changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 315d395d5c02..3bfc5ed61c9a 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -475,13 +475,27 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
>         fuse_put_request(fc, req);
>  }
>
> -static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
> +static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>  {
>         bool kill = false;
>
>         if (test_bit(FR_FINISHED, &req->flags))
> -               return;
> +               return 0;
>         spin_lock(&fiq->waitq.lock);
> +       /* Check that we've sent a request to interrupt this req */
> +       if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
> +               spin_unlock(&fiq->waitq.lock);
> +               return -EINVAL;
> +       }
> +       /*
> +        * An interrupt may be queued from fuse_dev_do_read(), and
> +        * later requeued on another cpu by fuse_dev_do_write().
> +        * To make the FR_INTERRUPTED bit visible to the requeuer
> +        * (under fiq->waitq.lock) we write it once again.
> +        */
> +       barrier();
> +       __set_bit(FR_INTERRUPTED, &req->flags);
> +
>         if (list_empty(&req->intr_entry)) {
>                 list_add_tail(&req->intr_entry, &fiq->interrupts);
>                 /*
> @@ -492,7 +506,7 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>                 if (test_bit(FR_FINISHED, &req->flags)) {
>                         list_del_init(&req->intr_entry);
>                         spin_unlock(&fiq->waitq.lock);
> -                       return;
> +                       return 0;
>                 }
>                 wake_up_locked(&fiq->waitq);
>                 kill = true;
> @@ -500,6 +514,7 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>         spin_unlock(&fiq->waitq.lock);
>         if (kill)
>                 kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
> +       return (int)kill;
>  }
>
>  static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
> @@ -1959,8 +1974,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
>                         nbytes = -EINVAL;
>                 else if (oh.error == -ENOSYS)
>                         fc->no_interrupt = 1;
> -               else if (oh.error == -EAGAIN)
> -                       queue_interrupt(&fc->iq, req);
> +               else if (oh.error == -EAGAIN &&
> +                        queue_interrupt(&fc->iq, req) < 0)
> +                       nbytes = -EINVAL;
>
>                 fuse_put_request(fc, req);
>                 fuse_copy_finish(cs);
>
Kirill Tkhai Nov. 7, 2018, 2:25 p.m. UTC | #2
On 07.11.2018 16:55, Miklos Szeredi wrote:
> On Tue, Nov 6, 2018 at 10:31 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>> When queue_interrupt() is called from fuse_dev_do_write(),
>> the call comes directly from userspace. Userspace may pass
>> any request id, even the id of a request we have not
>> interrupted (or even of a background request). This patch
>> adds a sanity check to keep the kernel safe against that.
> 
> Okay, I understand this far.
> 
>> The problem is that a real interrupt may be queued and
>> requeued by two tasks on two cpus. In this case, the
>> requeuer has no guarantee that it sees the FR_INTERRUPTED
>> bit on its cpu (since we know nothing about the way
>> userspace manages requests between its threads and
>> whether it uses smp barriers).
> 
> This sounds like BS.  There's an explicit smp_mb__after_atomic()
> after the set_bit(FR_INTERRUPTED, ...), which means FR_INTERRUPTED
> is going to be visible on all CPUs after this; no need to fool
> around with setting FR_INTERRUPTED again, etc...

Hm, but how does it make the bit visible on all CPUs?

The problem is that an smp_mb_xxx() barrier on one cpu only
makes sense when paired with an appropriate barrier on the
second cpu. There is no guarantee of visibility if the second
cpu does not have a barrier:

  CPU 1                  CPU2                        CPU3                       CPU4                        CPU5

  set FR_INTERRUPTED     set FR_SENT                                            
  <smp mb>               <smp mb>                          
  test FR_SENT (== 0)    test FR_INTERRUPTED (==1)
                         list_add[&req->intr_entry]  read[req by intr_entry]
                                                     <place to insert a barrier>
                                                     goto userspace
                                                     write in userspace buffer
                                                                                read from userspace buffer  
                                                                                write to userspace buffer
                                                                                                             read from userspace buffer
                                                                                                             enter kernel
                                                                                                             <place to insert a barrier>
                                                                                                             test FR_INTERRUPTED <- Not visible

The sequence:

set_bit(FR_INTERRUPTED, ...)
smp_mb__after_atomic();
test_bit(FR_SENT, &req->flags)

just guarantees the expected order on CPU2, which uses <smp mb>,
but gives CPU5 no guarantee at all.


This 5-CPU picture is a scheme illustrating one possible way
userspace may manage interrupts. The <place to insert a barrier>
tags mark places where we do not have barriers yet but could
introduce them. Yet even if we introduce them, there is no way
such barriers can help against CPU4. That is why we have to set
the FR_INTERRUPTED bit again, making it visible under the
spinlock on CPU5.
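
To make the pairing argument concrete, here is a standalone sketch
(my own litmus test, in the same herd7 dialect as the one further
down this thread): with smp_mb() on both sides the store-buffering
outcome is forbidden, while dropping either barrier makes it
reachable.

C SB+paired-mb

{}

P0(int *x, int *y)
{
	int r0;

	WRITE_ONCE(*x, 1);
	smp_mb();	/* pairs with the smp_mb() in P1 */
	r0 = READ_ONCE(*y);
}

P1(int *x, int *y)
{
	int r0;

	WRITE_ONCE(*y, 1);
	smp_mb();	/* drop this and the exists clause becomes reachable */
	r0 = READ_ONCE(*x);
}

exists (0:r0=0 /\ 1:r0=0)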

Thanks,
Kirill

>>
>> To eliminate this problem, the queuer writes the
>> FR_INTERRUPTED bit again under fiq->waitq.lock, which
>> guarantees the requeuer will see the bit when it checks it.
>>
>> I tried to find a solution that does not affect performance
>> and does not require taking more locks. This is the reason
>> the solution below is worse:
>>
>>    request_wait_answer()
>>    {
>>      ...
>>   +  spin_lock(&fiq->waitq.lock);
>>      set_bit(FR_INTERRUPTED, &req->flags);
>>   +  spin_unlock(&fiq->waitq.lock);
>>      ...
>>    }
>>
>> Also, extending fuse_dev_do_read() with the fix does not look
>> like a better idea, since it is already a big function:
>>
>>    fuse_dev_do_read()
>>    {
>>      ...
>>      if (test_bit(FR_INTERRUPTED, &req->flags)) {
>>   +      /* Comment */
>>   +      barrier();
>>   +      set_bit(FR_INTERRUPTED, &req->flags);
>>          queue_interrupt(fiq, req);
>>      }
>>      ...
>>    }
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  fs/fuse/dev.c |   26 +++++++++++++++++++++-----
>>  1 file changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
>> index 315d395d5c02..3bfc5ed61c9a 100644
>> --- a/fs/fuse/dev.c
>> +++ b/fs/fuse/dev.c
>> @@ -475,13 +475,27 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
>>         fuse_put_request(fc, req);
>>  }
>>
>> -static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>> +static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>>  {
>>         bool kill = false;
>>
>>         if (test_bit(FR_FINISHED, &req->flags))
>> -               return;
>> +               return 0;
>>         spin_lock(&fiq->waitq.lock);
>> +       /* Check that we've sent a request to interrupt this req */
>> +       if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
>> +               spin_unlock(&fiq->waitq.lock);
>> +               return -EINVAL;
>> +       }
>> +       /*
>> +        * An interrupt may be queued from fuse_dev_do_read(), and
>> +        * later requeued on another cpu by fuse_dev_do_write().
>> +        * To make the FR_INTERRUPTED bit visible to the requeuer
>> +        * (under fiq->waitq.lock) we write it once again.
>> +        */
>> +       barrier();
>> +       __set_bit(FR_INTERRUPTED, &req->flags);
>> +
>>         if (list_empty(&req->intr_entry)) {
>>                 list_add_tail(&req->intr_entry, &fiq->interrupts);
>>                 /*
>> @@ -492,7 +506,7 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>>                 if (test_bit(FR_FINISHED, &req->flags)) {
>>                         list_del_init(&req->intr_entry);
>>                         spin_unlock(&fiq->waitq.lock);
>> -                       return;
>> +                       return 0;
>>                 }
>>                 wake_up_locked(&fiq->waitq);
>>                 kill = true;
>> @@ -500,6 +514,7 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>>         spin_unlock(&fiq->waitq.lock);
>>         if (kill)
>>                 kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
>> +       return (int)kill;
>>  }
>>
>>  static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
>> @@ -1959,8 +1974,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
>>                         nbytes = -EINVAL;
>>                 else if (oh.error == -ENOSYS)
>>                         fc->no_interrupt = 1;
>> -               else if (oh.error == -EAGAIN)
>> -                       queue_interrupt(&fc->iq, req);
>> +               else if (oh.error == -EAGAIN &&
>> +                        queue_interrupt(&fc->iq, req) < 0)
>> +                       nbytes = -EINVAL;
>>
>>                 fuse_put_request(fc, req);
>>                 fuse_copy_finish(cs);
>>
Miklos Szeredi Nov. 7, 2018, 2:45 p.m. UTC | #3
On Wed, Nov 7, 2018 at 3:25 PM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> On 07.11.2018 16:55, Miklos Szeredi wrote:
>> On Tue, Nov 6, 2018 at 10:31 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>> When queue_interrupt() is called from fuse_dev_do_write(),
>>> the call comes directly from userspace. Userspace may pass
>>> any request id, even the id of a request we have not
>>> interrupted (or even of a background request). This patch
>>> adds a sanity check to keep the kernel safe against that.
>>
>> Okay, I understand this far.
>>
>>> The problem is that a real interrupt may be queued and
>>> requeued by two tasks on two cpus. In this case, the
>>> requeuer has no guarantee that it sees the FR_INTERRUPTED
>>> bit on its cpu (since we know nothing about the way
>>> userspace manages requests between its threads and
>>> whether it uses smp barriers).
>>
>> This sounds like BS.  There's an explicit smp_mb__after_atomic()
>> after the set_bit(FR_INTERRUPTED, ...), which means FR_INTERRUPTED
>> is going to be visible on all CPUs after this; no need to fool
>> around with setting FR_INTERRUPTED again, etc...
>
> Hm, but how does it make the bit visible on all CPUs?
>
> The problem is that an smp_mb_xxx() barrier on one cpu only
> makes sense when paired with an appropriate barrier on the
> second cpu. There is no guarantee of visibility if the second
> cpu does not have a barrier:
>
>   CPU 1                  CPU2                        CPU3                       CPU4                        CPU5
>
>   set FR_INTERRUPTED     set FR_SENT
>   <smp mb>               <smp mb>
>   test FR_SENT (== 0)    test FR_INTERRUPTED (==1)
>                          list_add[&req->intr_entry]  read[req by intr_entry]
>                                                      <place to insert a barrier>
>                                                      goto userspace
>                                                      write in userspace buffer
>                                                                                 read from userspace buffer
>                                                                                 write to userspace buffer
>                                                                                                              read from userspace buffer
>                                                                                                              enter kernel
>                                                                                                              <place to insert a barrier>
>                                                                                                              test FR_INTERRUPTED <- Not visible
>
> The sequence:
>
> set_bit(FR_INTERRUPTED, ...)
> smp_mb__after_atomic();
> test_bit(FR_SENT, &req->flags)
>
> just guarantees the expected order on CPU2, which uses <smp mb>,
> but gives CPU5 no guarantee at all.

What you are missing is that there are other things that synchronize
memory access besides the memory barrier.  In your example the
ordering will be guaranteed by the fiq->waitq.lock in
queue_interrupt() on CPU2: the set_bit() cannot move past the one-way
barrier provided by the spin_unlock().
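
A minimal sketch of that claim (my own, mirroring the kernel
memory-model's MP+polocks pattern; the names are illustrative):
a plain store made before spin_unlock() must be observed by any
cpu that subsequently takes the same lock.

C MP+unlock-lock

{}

P0(int *flag, int *linked, spinlock_t *mylock)
{
	WRITE_ONCE(*flag, 1);	/* stands for set_bit(FR_INTERRUPTED) */
	spin_lock(mylock);
	WRITE_ONCE(*linked, 1);	/* stands for the list_add() */
	spin_unlock(mylock);	/* release: the *flag store cannot move past this */
}

P1(int *flag, int *linked, spinlock_t *mylock)
{
	int r0;
	int r1;

	spin_lock(mylock);
	r0 = READ_ONCE(*linked);
	r1 = READ_ONCE(*flag);
	spin_unlock(mylock);
}

exists (1:r0=1 /\ 1:r1=0)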

Thanks,
Miklos
Kirill Tkhai Nov. 7, 2018, 4:40 p.m. UTC | #4
On 07.11.2018 17:45, Miklos Szeredi wrote:
> On Wed, Nov 7, 2018 at 3:25 PM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>> On 07.11.2018 16:55, Miklos Szeredi wrote:
>>> On Tue, Nov 6, 2018 at 10:31 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>>> When queue_interrupt() is called from fuse_dev_do_write(),
>>>> the call comes directly from userspace. Userspace may pass
>>>> any request id, even the id of a request we have not
>>>> interrupted (or even of a background request). This patch
>>>> adds a sanity check to keep the kernel safe against that.
>>>
>>> Okay, I understand this far.
>>>
>>>> The problem is that a real interrupt may be queued and
>>>> requeued by two tasks on two cpus. In this case, the
>>>> requeuer has no guarantee that it sees the FR_INTERRUPTED
>>>> bit on its cpu (since we know nothing about the way
>>>> userspace manages requests between its threads and
>>>> whether it uses smp barriers).
>>>
>>> This sounds like BS.  There's an explicit smp_mb__after_atomic()
>>> after the set_bit(FR_INTERRUPTED, ...), which means FR_INTERRUPTED
>>> is going to be visible on all CPUs after this; no need to fool
>>> around with setting FR_INTERRUPTED again, etc...
>>
>> Hm, but how does it make the bit visible on all CPUs?
>>
>> The problem is that an smp_mb_xxx() barrier on one cpu only
>> makes sense when paired with an appropriate barrier on the
>> second cpu. There is no guarantee of visibility if the second
>> cpu does not have a barrier:
>>
>>   CPU 1                  CPU2                        CPU3                       CPU4                        CPU5
>>
>>   set FR_INTERRUPTED     set FR_SENT
>>   <smp mb>               <smp mb>
>>   test FR_SENT (== 0)    test FR_INTERRUPTED (==1)
>>                          list_add[&req->intr_entry]  read[req by intr_entry]
>>                                                      <place to insert a barrier>
>>                                                      goto userspace
>>                                                      write in userspace buffer
>>                                                                                 read from userspace buffer
>>                                                                                 write to userspace buffer
>>                                                                                                              read from userspace buffer
>>                                                                                                              enter kernel
>>                                                                                                              <place to insert a barrier>
>>                                                                                                              test FR_INTERRUPTED <- Not visible
>>
>> The sequence:
>>
>> set_bit(FR_INTERRUPTED, ...)
>> smp_mb__after_atomic();
>> test_bit(FR_SENT, &req->flags)
>>
>> just guarantees the expected order on CPU2, which uses <smp mb>,
>> but gives CPU5 no guarantee at all.
> 
> What you are missing is that there are other things that synchronize
> memory access besides the memory barrier.  In your example the
> ordering will be guaranteed by the fiq->waitq.lock in
> queue_interrupt() on CPU2: the set_bit() cannot move past the one-way
> barrier provided by the spin_unlock().

I thought RELEASE only applied to memory operations made on the
same cpu. Strangely, the memory-model test below says that is not
true. Ok, I'll change the patch; thanks for pointing this out.


===== tools/memory-model/litmus-tests/test.litmus =====
C SB+test

{}

P0(atomic_t *flags)
{
	int r0;

	atomic_add(1, flags);
	smp_mb__after_atomic();
	r0 = (atomic_read(flags) & 2);
}

P1(atomic_t *flags, int *linked)
{
	int r0;

	atomic_add(2, flags);
	smp_mb__after_atomic();
	r0 = (atomic_read(flags) & 1);

	if (r0) {
		spin_lock(mylock);
		*linked = 1;
		spin_unlock(mylock);
	}
}

P2(atomic_t *flags, int *linked)
{
	int r0;

	spin_lock(mylock);
	if (*linked) {
		r0 = atomic_read(flags);
	} else
		r0 = 0;
	spin_unlock(mylock);
}

exists (0:r0=0 /\ 1:r0=1 /\ 2:r0=2)

===================================

kirill@pro:~/linux-next/tools/memory-model$ ~/.opam/system/bin/herd7 -conf linux-kernel.cfg litmus-tests/test.litmus 
Test SB+test Allowed
States 5
0:r0=0; 1:r0=1; 2:r0=0;
0:r0=0; 1:r0=1; 2:r0=3;
0:r0=2; 1:r0=0; 2:r0=0;
0:r0=2; 1:r0=1; 2:r0=0;
0:r0=2; 1:r0=1; 2:r0=3;
No
Witnesses
Positive: 0 Negative: 7
Condition exists (0:r0=0 /\ 1:r0=1 /\ 2:r0=2)
Observation SB+test Never 0 7
Time SB+test 0.09
Hash=0213fd54f80c511af2326e1bd120a96b
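
(Reading the herd7 output: "Positive: 0 Negative: 7" and
"Observation ... Never 0 7" mean the exists clause was never reached
in any of the 7 executions explored: once P2 takes mylock and sees
*linked set, it also sees the flags bit that P1 had observed before
its unlock. This confirms the release-barrier point above.)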

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 315d395d5c02..3bfc5ed61c9a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -475,13 +475,27 @@  static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_put_request(fc, req);
 }
 
-static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	bool kill = false;
 
 	if (test_bit(FR_FINISHED, &req->flags))
-		return;
+		return 0;
 	spin_lock(&fiq->waitq.lock);
+	/* Check that we've sent a request to interrupt this req */
+	if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
+		spin_unlock(&fiq->waitq.lock);
+		return -EINVAL;
+	}
+	/*
+	 * An interrupt may be queued from fuse_dev_do_read(), and
+	 * later requeued on another cpu by fuse_dev_do_write().
+	 * To make the FR_INTERRUPTED bit visible to the requeuer
+	 * (under fiq->waitq.lock) we write it once again.
+	 */
+	barrier();
+	__set_bit(FR_INTERRUPTED, &req->flags);
+
 	if (list_empty(&req->intr_entry)) {
 		list_add_tail(&req->intr_entry, &fiq->interrupts);
 		/*
@@ -492,7 +506,7 @@  static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 		if (test_bit(FR_FINISHED, &req->flags)) {
 			list_del_init(&req->intr_entry);
 			spin_unlock(&fiq->waitq.lock);
-			return;
+			return 0;
 		}
 		wake_up_locked(&fiq->waitq);
 		kill = true;
@@ -500,6 +514,7 @@  static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 	spin_unlock(&fiq->waitq.lock);
 	if (kill)
 		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+	return (int)kill;
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
@@ -1959,8 +1974,9 @@  static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 			nbytes = -EINVAL;
 		else if (oh.error == -ENOSYS)
 			fc->no_interrupt = 1;
-		else if (oh.error == -EAGAIN)
-			queue_interrupt(&fc->iq, req);
+		else if (oh.error == -EAGAIN &&
+			 queue_interrupt(&fc->iq, req) < 0)
+			nbytes = -EINVAL;
 
 		fuse_put_request(fc, req);
 		fuse_copy_finish(cs);