diff mbox series

block: ensure the memory order between bi_private and bi_status

Message ID 20210701113537.582120-1-houtao1@huawei.com (mailing list archive)
State New, archived
Headers show
Series block: ensure the memory order between bi_private and bi_status | expand

Commit Message

Hou Tao July 1, 2021, 11:35 a.m. UTC
When running stress test on null_blk under linux-4.19.y, the following
warning is reported:

  percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3) after switching to atomic

The cause is that css_put() is invoked twice on the same bio as shown below:

CPU 1:                         CPU 2:

// IO completion kworker       // IO submit thread
                               __blkdev_direct_IO_simple
                                 submit_bio

bio_endio
  bio_uninit(bio)
    css_put(bi_css)
    bi_css = NULL
                               set_current_state(TASK_UNINTERRUPTIBLE)
  bio->bi_end_io
    blkdev_bio_end_io_simple
      bio->bi_private = NULL
                               // bi_private is NULL
                               READ_ONCE(bio->bi_private)
        wake_up_process
          smp_mb__after_spinlock

                               bio_uninit(bio)
                                 // read bi_css as non-NULL
                                 // so call css_put() again
                                 css_put(bi_css)

Because there are no memory barriers between the reading and the writing of
bi_private and bi_css, reading bi_private as NULL cannot guarantee that
bi_css will also be NULL on a weak-memory-model host (e.g., ARM64).

For the latest kernel source, css_put() has been removed from bio_uninit(),
but the memory-order problem still exists, because the order between
bio->bi_private and {bi_status|bi_blkg} is also assumed in
__blkdev_direct_IO_simple(). It is reproducible that
__blkdev_direct_IO_simple() may read bi_status as 0 even if
bi_status is set as an errno in req_bio_endio().

In __blkdev_direct_IO(), the memory order between dio->waiter and
dio->bio.bi_status is not guaranteed either. So far it has not been
reproducible there, maybe because dio->waiter and dio->bio.bi_status are
in the same cache-line. But it is better to guarantee the memory order
anyway.

Fix it by using smp_load_acquire() & smp_store_release() to guarantee
the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.

Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 fs/block_dev.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

Comments

Hou Tao July 7, 2021, 6:29 a.m. UTC | #1
ping ?

On 7/1/2021 7:35 PM, Hou Tao wrote:
> When running stress test on null_blk under linux-4.19.y, the following
> warning is reported:
>
>   percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3) after switching to atomic
>
> The cause is that css_put() is invoked twice on the same bio as shown below:
>
> CPU 1:                         CPU 2:
>
> // IO completion kworker       // IO submit thread
>                                __blkdev_direct_IO_simple
>                                  submit_bio
>
> bio_endio
>   bio_uninit(bio)
>     css_put(bi_css)
>     bi_css = NULL
>                                set_current_state(TASK_UNINTERRUPTIBLE)
>   bio->bi_end_io
>     blkdev_bio_end_io_simple
>       bio->bi_private = NULL
>                                // bi_private is NULL
>                                READ_ONCE(bio->bi_private)
>         wake_up_process
>           smp_mb__after_spinlock
>
>                                bio_unint(bio)
>                                  // read bi_css as no-NULL
>                                  // so call css_put() again
>                                  css_put(bi_css)
>
> Because there is no memory barriers between the reading and the writing of
> bi_private and bi_css, so reading bi_private as NULL can not guarantee
> bi_css will also be NULL on weak-memory model host (e.g, ARM64).
>
> For the latest kernel source, css_put() has been removed from bio_unint(),
> but the memory-order problem still exists, because the order between
> bio->bi_private and {bi_status|bi_blkg} is also assumed in
> __blkdev_direct_IO_simple(). It is reproducible that
> __blkdev_direct_IO_simple() may read bi_status as 0 event if
> bi_status is set as an errno in req_bio_endio().
>
> In __blkdev_direct_IO(), the memory order between dio->waiter and
> dio->bio.bi_status is not guaranteed neither. Until now it is unable to
> reproduce it, maybe because dio->waiter and dio->bio.bi_status are
> in the same cache-line. But it is better to add guarantee for memory
> order.
>
> Fixing it by using smp_load_acquire() & smp_store_release() to guarantee
> the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.
>
> Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  fs/block_dev.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index eb34f5c357cf..a602c6315b0b 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -224,7 +224,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>  {
>  	struct task_struct *waiter = bio->bi_private;
>  
> -	WRITE_ONCE(bio->bi_private, NULL);
> +	/*
> +	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
> +	 * to ensure the order between bi_private and bi_xxx
> +	 */
> +	smp_store_release(&bio->bi_private, NULL);
>  	blk_wake_io_task(waiter);
>  }
>  
> @@ -283,7 +287,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
>  	qc = submit_bio(&bio);
>  	for (;;) {
>  		set_current_state(TASK_UNINTERRUPTIBLE);
> -		if (!READ_ONCE(bio.bi_private))
> +		/* Refer to comments in blkdev_bio_end_io_simple() */
> +		if (!smp_load_acquire(&bio.bi_private))
>  			break;
>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
>  		    !blk_poll(bdev_get_queue(bdev), qc, true))
> @@ -353,7 +358,12 @@ static void blkdev_bio_end_io(struct bio *bio)
>  		} else {
>  			struct task_struct *waiter = dio->waiter;
>  
> -			WRITE_ONCE(dio->waiter, NULL);
> +			/*
> +			 * Paired with smp_load_acquire() in
> +			 * __blkdev_direct_IO() to ensure the order between
> +			 * dio->waiter and bio->bi_xxx
> +			 */
> +			smp_store_release(&dio->waiter, NULL);
>  			blk_wake_io_task(waiter);
>  		}
>  	}
> @@ -478,7 +488,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	for (;;) {
>  		set_current_state(TASK_UNINTERRUPTIBLE);
> -		if (!READ_ONCE(dio->waiter))
> +		/* Refer to comments in blkdev_bio_end_io */
> +		if (!smp_load_acquire(&dio->waiter))
>  			break;
>  
>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
Hou Tao July 13, 2021, 1:14 a.m. UTC | #2
ping ?

On 7/7/2021 2:29 PM, Hou Tao wrote:
> ping ?
>
> On 7/1/2021 7:35 PM, Hou Tao wrote:
>> When running stress test on null_blk under linux-4.19.y, the following
>> warning is reported:
>>
>>   percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3) after switching to atomic
>>
>> The cause is that css_put() is invoked twice on the same bio as shown below:
>>
>> CPU 1:                         CPU 2:
>>
>> // IO completion kworker       // IO submit thread
>>                                __blkdev_direct_IO_simple
>>                                  submit_bio
>>
>> bio_endio
>>   bio_uninit(bio)
>>     css_put(bi_css)
>>     bi_css = NULL
>>                                set_current_state(TASK_UNINTERRUPTIBLE)
>>   bio->bi_end_io
>>     blkdev_bio_end_io_simple
>>       bio->bi_private = NULL
>>                                // bi_private is NULL
>>                                READ_ONCE(bio->bi_private)
>>         wake_up_process
>>           smp_mb__after_spinlock
>>
>>                                bio_unint(bio)
>>                                  // read bi_css as no-NULL
>>                                  // so call css_put() again
>>                                  css_put(bi_css)
>>
>> Because there is no memory barriers between the reading and the writing of
>> bi_private and bi_css, so reading bi_private as NULL can not guarantee
>> bi_css will also be NULL on weak-memory model host (e.g, ARM64).
>>
>> For the latest kernel source, css_put() has been removed from bio_unint(),
>> but the memory-order problem still exists, because the order between
>> bio->bi_private and {bi_status|bi_blkg} is also assumed in
>> __blkdev_direct_IO_simple(). It is reproducible that
>> __blkdev_direct_IO_simple() may read bi_status as 0 event if
>> bi_status is set as an errno in req_bio_endio().
>>
>> In __blkdev_direct_IO(), the memory order between dio->waiter and
>> dio->bio.bi_status is not guaranteed neither. Until now it is unable to
>> reproduce it, maybe because dio->waiter and dio->bio.bi_status are
>> in the same cache-line. But it is better to add guarantee for memory
>> order.
>>
>> Fixing it by using smp_load_acquire() & smp_store_release() to guarantee
>> the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.
>>
>> Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
>>  fs/block_dev.c | 19 +++++++++++++++----
>>  1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index eb34f5c357cf..a602c6315b0b 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -224,7 +224,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>>  {
>>  	struct task_struct *waiter = bio->bi_private;
>>  
>> -	WRITE_ONCE(bio->bi_private, NULL);
>> +	/*
>> +	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
>> +	 * to ensure the order between bi_private and bi_xxx
>> +	 */
>> +	smp_store_release(&bio->bi_private, NULL);
>>  	blk_wake_io_task(waiter);
>>  }
>>  
>> @@ -283,7 +287,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
>>  	qc = submit_bio(&bio);
>>  	for (;;) {
>>  		set_current_state(TASK_UNINTERRUPTIBLE);
>> -		if (!READ_ONCE(bio.bi_private))
>> +		/* Refer to comments in blkdev_bio_end_io_simple() */
>> +		if (!smp_load_acquire(&bio.bi_private))
>>  			break;
>>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
>>  		    !blk_poll(bdev_get_queue(bdev), qc, true))
>> @@ -353,7 +358,12 @@ static void blkdev_bio_end_io(struct bio *bio)
>>  		} else {
>>  			struct task_struct *waiter = dio->waiter;
>>  
>> -			WRITE_ONCE(dio->waiter, NULL);
>> +			/*
>> +			 * Paired with smp_load_acquire() in
>> +			 * __blkdev_direct_IO() to ensure the order between
>> +			 * dio->waiter and bio->bi_xxx
>> +			 */
>> +			smp_store_release(&dio->waiter, NULL);
>>  			blk_wake_io_task(waiter);
>>  		}
>>  	}
>> @@ -478,7 +488,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>>  
>>  	for (;;) {
>>  		set_current_state(TASK_UNINTERRUPTIBLE);
>> -		if (!READ_ONCE(dio->waiter))
>> +		/* Refer to comments in blkdev_bio_end_io */
>> +		if (!smp_load_acquire(&dio->waiter))
>>  			break;
>>  
>>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
> .
Christoph Hellwig July 15, 2021, 7:01 a.m. UTC | #3
On Thu, Jul 01, 2021 at 07:35:37PM +0800, Hou Tao wrote:
> When running stress test on null_blk under linux-4.19.y, the following
> warning is reported:
> 
>   percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3) after switching to atomic
> 
> The cause is that css_put() is invoked twice on the same bio as shown below:
> 
> CPU 1:                         CPU 2:
> 
> // IO completion kworker       // IO submit thread
>                                __blkdev_direct_IO_simple
>                                  submit_bio
> 
> bio_endio
>   bio_uninit(bio)
>     css_put(bi_css)
>     bi_css = NULL
>                                set_current_state(TASK_UNINTERRUPTIBLE)
>   bio->bi_end_io
>     blkdev_bio_end_io_simple
>       bio->bi_private = NULL
>                                // bi_private is NULL
>                                READ_ONCE(bio->bi_private)
>         wake_up_process
>           smp_mb__after_spinlock
> 
>                                bio_unint(bio)
>                                  // read bi_css as no-NULL
>                                  // so call css_put() again
>                                  css_put(bi_css)
> 
> Because there is no memory barriers between the reading and the writing of
> bi_private and bi_css, so reading bi_private as NULL can not guarantee
> bi_css will also be NULL on weak-memory model host (e.g, ARM64).
> 
> For the latest kernel source, css_put() has been removed from bio_unint(),
> but the memory-order problem still exists, because the order between
> bio->bi_private and {bi_status|bi_blkg} is also assumed in
> __blkdev_direct_IO_simple(). It is reproducible that
> __blkdev_direct_IO_simple() may read bi_status as 0 event if
> bi_status is set as an errno in req_bio_endio().
> 
> In __blkdev_direct_IO(), the memory order between dio->waiter and
> dio->bio.bi_status is not guaranteed neither. Until now it is unable to
> reproduce it, maybe because dio->waiter and dio->bio.bi_status are
> in the same cache-line. But it is better to add guarantee for memory
> order.
> 
> Fixing it by using smp_load_acquire() & smp_store_release() to guarantee
> the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.
> 
> Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")

This obviously does not look broken, but smp_load_acquire /
smp_store_release is way beyond my paygrade.  Adding some CCs.

> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  fs/block_dev.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index eb34f5c357cf..a602c6315b0b 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -224,7 +224,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>  {
>  	struct task_struct *waiter = bio->bi_private;
>  
> -	WRITE_ONCE(bio->bi_private, NULL);
> +	/*
> +	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
> +	 * to ensure the order between bi_private and bi_xxx
> +	 */
> +	smp_store_release(&bio->bi_private, NULL);
>  	blk_wake_io_task(waiter);
>  }
>  
> @@ -283,7 +287,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
>  	qc = submit_bio(&bio);
>  	for (;;) {
>  		set_current_state(TASK_UNINTERRUPTIBLE);
> -		if (!READ_ONCE(bio.bi_private))
> +		/* Refer to comments in blkdev_bio_end_io_simple() */
> +		if (!smp_load_acquire(&bio.bi_private))
>  			break;
>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
>  		    !blk_poll(bdev_get_queue(bdev), qc, true))
> @@ -353,7 +358,12 @@ static void blkdev_bio_end_io(struct bio *bio)
>  		} else {
>  			struct task_struct *waiter = dio->waiter;
>  
> -			WRITE_ONCE(dio->waiter, NULL);
> +			/*
> +			 * Paired with smp_load_acquire() in
> +			 * __blkdev_direct_IO() to ensure the order between
> +			 * dio->waiter and bio->bi_xxx
> +			 */
> +			smp_store_release(&dio->waiter, NULL);
>  			blk_wake_io_task(waiter);
>  		}
>  	}
> @@ -478,7 +488,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	for (;;) {
>  		set_current_state(TASK_UNINTERRUPTIBLE);
> -		if (!READ_ONCE(dio->waiter))
> +		/* Refer to comments in blkdev_bio_end_io */
> +		if (!smp_load_acquire(&dio->waiter))
>  			break;
>  
>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
> -- 
> 2.29.2
---end quoted text---
Peter Zijlstra July 15, 2021, 8:13 a.m. UTC | #4
On Thu, Jul 15, 2021 at 09:01:48AM +0200, Christoph Hellwig wrote:
> On Thu, Jul 01, 2021 at 07:35:37PM +0800, Hou Tao wrote:
> > When running stress test on null_blk under linux-4.19.y, the following
> > warning is reported:
> > 
> >   percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3) after switching to atomic
> > 
> > The cause is that css_put() is invoked twice on the same bio as shown below:
> > 
> > CPU 1:                         CPU 2:
> > 
> > // IO completion kworker       // IO submit thread
> >                                __blkdev_direct_IO_simple
> >                                  submit_bio
> > 
> > bio_endio
> >   bio_uninit(bio)
> >     css_put(bi_css)
> >     bi_css = NULL
> >                                set_current_state(TASK_UNINTERRUPTIBLE)
> >   bio->bi_end_io
> >     blkdev_bio_end_io_simple
> >       bio->bi_private = NULL
> >                                // bi_private is NULL
> >                                READ_ONCE(bio->bi_private)
> >         wake_up_process
> >           smp_mb__after_spinlock
> > 
> >                                bio_unint(bio)
> >                                  // read bi_css as no-NULL
> >                                  // so call css_put() again
> >                                  css_put(bi_css)
> > 
> > Because there is no memory barriers between the reading and the writing of
> > bi_private and bi_css, so reading bi_private as NULL can not guarantee
> > bi_css will also be NULL on weak-memory model host (e.g, ARM64).
> > 
> > For the latest kernel source, css_put() has been removed from bio_unint(),
> > but the memory-order problem still exists, because the order between
> > bio->bi_private and {bi_status|bi_blkg} is also assumed in
> > __blkdev_direct_IO_simple(). It is reproducible that
> > __blkdev_direct_IO_simple() may read bi_status as 0 event if
> > bi_status is set as an errno in req_bio_endio().
> > 
> > In __blkdev_direct_IO(), the memory order between dio->waiter and
> > dio->bio.bi_status is not guaranteed neither. Until now it is unable to
> > reproduce it, maybe because dio->waiter and dio->bio.bi_status are
> > in the same cache-line. But it is better to add guarantee for memory
> > order.

Cachelines don't guarantee anything, you can get partial forwards.

> > Fixing it by using smp_load_acquire() & smp_store_release() to guarantee
> > the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.
> > 
> > Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")
> 
> This obviously does not look broken, but smp_load_acquire /
> smp_store_release is way beyond my paygrade.  Adding some CCs.

This block stuff is a bit beyond me, lets see if we can make sense of
it.

> > Signed-off-by: Hou Tao <houtao1@huawei.com>
> > ---
> >  fs/block_dev.c | 19 +++++++++++++++----
> >  1 file changed, 15 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/block_dev.c b/fs/block_dev.c
> > index eb34f5c357cf..a602c6315b0b 100644
> > --- a/fs/block_dev.c
> > +++ b/fs/block_dev.c
> > @@ -224,7 +224,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
> >  {
> >  	struct task_struct *waiter = bio->bi_private;
> >  
> > -	WRITE_ONCE(bio->bi_private, NULL);
> > +	/*
> > +	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
> > +	 * to ensure the order between bi_private and bi_xxx
> > +	 */

This comment doesn't help me; where are the other stores? Presumably
somewhere before this is called, but how does one go about finding them?

The Changelog seems to suggest you only care about bi_css, not bi_xxx in
general. In specific you can only care about stores that happen before
this; is all of bi_xxx written before here? If not, you have to be more
specific.

Also, this being a clear, this very much isn't the typical publish
pattern.

On top of all that, smp_wmb() would be sufficient here and would be
cheaper on some platforms (ARM comes to mind).

> > +	smp_store_release(&bio->bi_private, NULL);
> >  	blk_wake_io_task(waiter);
> >  }
> >  
> > @@ -283,7 +287,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
> >  	qc = submit_bio(&bio);
> >  	for (;;) {
> >  		set_current_state(TASK_UNINTERRUPTIBLE);
> > -		if (!READ_ONCE(bio.bi_private))
> > +		/* Refer to comments in blkdev_bio_end_io_simple() */
> > +		if (!smp_load_acquire(&bio.bi_private))
> >  			break;
> >  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
> >  		    !blk_poll(bdev_get_queue(bdev), qc, true))

That comment there doesn't help me find any relevant later loads and is
thus again inadequate.

Here the purpose seems to be to ensure the bi_css load happens after the
bi_private load, and this again is cheaper done using smp_rmb().

Also, the implication seems to be -- but is not spelled out anywhere --
that if bi_private is !NULL, it is stable.

> > @@ -353,7 +358,12 @@ static void blkdev_bio_end_io(struct bio *bio)
> >  		} else {
> >  			struct task_struct *waiter = dio->waiter;
> >  
> > -			WRITE_ONCE(dio->waiter, NULL);
> > +			/*
> > +			 * Paired with smp_load_acquire() in
> > +			 * __blkdev_direct_IO() to ensure the order between
> > +			 * dio->waiter and bio->bi_xxx
> > +			 */
> > +			smp_store_release(&dio->waiter, NULL);
> >  			blk_wake_io_task(waiter);
> >  		}
> >  	}
> > @@ -478,7 +488,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
> >  
> >  	for (;;) {
> >  		set_current_state(TASK_UNINTERRUPTIBLE);
> > -		if (!READ_ONCE(dio->waiter))
> > +		/* Refer to comments in blkdev_bio_end_io */
> > +		if (!smp_load_acquire(&dio->waiter))
> >  			break;
> >  
> >  		if (!(iocb->ki_flags & IOCB_HIPRI) ||

Idem for these...
Hou Tao July 16, 2021, 9:02 a.m. UTC | #5
Hi Peter,

On 7/15/2021 4:13 PM, Peter Zijlstra wrote:
> On Thu, Jul 15, 2021 at 09:01:48AM +0200, Christoph Hellwig wrote:
>> On Thu, Jul 01, 2021 at 07:35:37PM +0800, Hou Tao wrote:
>>> When running stress test on null_blk under linux-4.19.y, the following
>>> warning is reported:
>>>
>>>   percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3) after switching to atomic
>>>
snip
>>> In __blkdev_direct_IO(), the memory order between dio->waiter and
>>> dio->bio.bi_status is not guaranteed neither. Until now it is unable to
>>> reproduce it, maybe because dio->waiter and dio->bio.bi_status are
>>> in the same cache-line. But it is better to add guarantee for memory
>>> order.
> Cachelines don't guarantee anything, you can get partial forwards.

Could you please point me to a reference? I cannot find

anything about memory ordering by searching for "partial forwards".

>
>>> Fixing it by using smp_load_acquire() & smp_store_release() to guarantee
>>> the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.
>>>
>>> Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")
>> This obviously does not look broken, but smp_load_acquire /
>> smp_store_release is way beyond my paygrade.  Adding some CCs.
> This block stuff is a bit beyond me, lets see if we can make sense of
> it.
>
>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>> ---
>>>  fs/block_dev.c | 19 +++++++++++++++----
>>>  1 file changed, 15 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>> index eb34f5c357cf..a602c6315b0b 100644
>>> --- a/fs/block_dev.c
>>> +++ b/fs/block_dev.c
>>> @@ -224,7 +224,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>>>  {
>>>  	struct task_struct *waiter = bio->bi_private;
>>>  
>>> -	WRITE_ONCE(bio->bi_private, NULL);
>>> +	/*
>>> +	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
>>> +	 * to ensure the order between bi_private and bi_xxx
>>> +	 */
> This comment doesn't help me; where are the other stores? Presumably
> somewhere before this is called, but how does one go about finding them?

Yes, the change log is vague and it will be corrected. The other stores

happen in req_bio_endio() and its callees when the request completes.

> The Changelog seems to suggest you only care about bi_css, not bi_xxx in
> general. In specific you can only care about stores that happen before
> this; is all of bi_xxx written before here? If not, you have to be more
> specific.

Actually we care about all bi_xxx which are written in req_bio_endio, and all writes

to bi_xxx happen before blkdev_bio_end_io_simple(). Here I just try to

use bi_status as one example.

> Also, this being a clear, this very much isn't the typical publish
> pattern.
>
> On top of all that, smp_wmb() would be sufficient here and would be
> cheaper on some platforms (ARM comes to mind).
Thanks for your knowledge, I will use smp_wmb() instead of smp_store_release().
>
>>> +	smp_store_release(&bio->bi_private, NULL);
>>>  	blk_wake_io_task(waiter);
>>>  }
>>>  
>>> @@ -283,7 +287,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
>>>  	qc = submit_bio(&bio);
>>>  	for (;;) {
>>>  		set_current_state(TASK_UNINTERRUPTIBLE);
>>> -		if (!READ_ONCE(bio.bi_private))
>>> +		/* Refer to comments in blkdev_bio_end_io_simple() */
>>> +		if (!smp_load_acquire(&bio.bi_private))
>>>  			break;
>>>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
>>>  		    !blk_poll(bdev_get_queue(bdev), qc, true))
> That comment there doesn't help me find any relevant later loads and is
> thus again inadequate.
>
> Here the purpose seems to be to ensure the bi_css load happens after the
> bi_private load, and this again is cheaper done using smp_rmb().
Yes and thanks again.
>
> Also, the implication seems to be -- but is not spelled out anywhere --
> that if bi_private is !NULL, it is stable.

What is the meaning of "it is stable" ? Do you mean if bi_private is NULL,

the values of bi_xxx should be ensured ?

>>> @@ -353,7 +358,12 @@ static void blkdev_bio_end_io(struct bio *bio)
>>>  		} else {
>>>  			struct task_struct *waiter = dio->waiter;
>>>  
>>> -			WRITE_ONCE(dio->waiter, NULL);
>>> +			/*
>>> +			 * Paired with smp_load_acquire() in
>>> +			 * __blkdev_direct_IO() to ensure the order between
>>> +			 * dio->waiter and bio->bi_xxx
>>> +			 */
>>> +			smp_store_release(&dio->waiter, NULL);
>>>  			blk_wake_io_task(waiter);
>>>  		}
>>>  	}
>>> @@ -478,7 +488,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>>>  
>>>  	for (;;) {
>>>  		set_current_state(TASK_UNINTERRUPTIBLE);
>>> -		if (!READ_ONCE(dio->waiter))
>>> +		/* Refer to comments in blkdev_bio_end_io */
>>> +		if (!smp_load_acquire(&dio->waiter))
>>>  			break;
>>>  
>>>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
> Idem for these...
> .

Thanks

Regards,

Tao
Peter Zijlstra July 16, 2021, 10:19 a.m. UTC | #6
On Fri, Jul 16, 2021 at 05:02:33PM +0800, Hou Tao wrote:

> > Cachelines don't guarantee anything, you can get partial forwards.
> 
> Could you please point me to any reference ? I can not google
> 
> any memory order things by using "partial forwards".

I'm not sure I have references, but there are CPUs that can do, for
example, store forwarding at a granularity below cachelines (ie at
register size).

In such a case a CPU might observe the stored value before it is
committed to memory.


> >>> @@ -224,7 +224,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
> >>>  {
> >>>  	struct task_struct *waiter = bio->bi_private;
> >>>  
> >>> -	WRITE_ONCE(bio->bi_private, NULL);
> >>> +	/*
> >>> +	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
> >>> +	 * to ensure the order between bi_private and bi_xxx
> >>> +	 */
> > This comment doesn't help me; where are the other stores? Presumably
> > somewhere before this is called, but how does one go about finding them?
> 
> Yes, the change log is vague and it will be corrected. The other stores
> 
> happen in req_bio_endio() and its callees when the request completes.

Aaah, right. So initially I was wondering if it would make sense to put
the barrier there, but having looked at this a little longer, this
really seems to be about these two DIO methods.

> > The Changelog seems to suggest you only care about bi_css, not bi_xxx in
> > general. In specific you can only care about stores that happen before
> > this; is all of bi_xxx written before here? If not, you have to be more
> > specific.
> 
> Actually we care about all bi_xxx which are written in req_bio_endio,
> and all writes to bi_xxx happen before blkdev_bio_end_io_simple().
> Here I just try to use bi_status as one example.

I see req_bio_endio() change bi_status, bi_flags and bi_iter, but afaict
there's more bi_ fields.

> >>> @@ -283,7 +287,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
> >>>  	qc = submit_bio(&bio);
> >>>  	for (;;) {
> >>>  		set_current_state(TASK_UNINTERRUPTIBLE);
> >>> -		if (!READ_ONCE(bio.bi_private))
> >>> +		/* Refer to comments in blkdev_bio_end_io_simple() */
> >>> +		if (!smp_load_acquire(&bio.bi_private))
> >>>  			break;
> >>>  		if (!(iocb->ki_flags & IOCB_HIPRI) ||
> >>>  		    !blk_poll(bdev_get_queue(bdev), qc, true))
> > That comment there doesn't help me find any relevant later loads and is
> > thus again inadequate.
> >
> > Here the purpose seems to be to ensure the bi_css load happens after the
> > bi_private load, and this again is cheaper done using smp_rmb().
> Yes and thanks again.
> >
> > Also, the implication seems to be -- but is not spelled out anywhere --
> > that if bi_private is !NULL, it is stable.
> 
> What is the meaning of "it is stable" ? Do you mean if bi_private is NULL,
> the values of bi_xxx should be ensured ?

With stable I mean that if it is !NULL the value is always the same.

I've read more code and this is indeed the case, specifically, here
bi_private seems to be 'current' and will only be changed to NULL.
Paul E. McKenney July 19, 2021, 6:09 p.m. UTC | #7
On Fri, Jul 16, 2021 at 12:19:29PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 16, 2021 at 05:02:33PM +0800, Hou Tao wrote:
> 
> > > Cachelines don't guarantee anything, you can get partial forwards.
> > 
> > Could you please point me to any reference ? I can not google
> > 
> > any memory order things by using "partial forwards".
> 
> I'm not sure I have references, but there are CPUs that can do, for
> example, store forwarding at a granularity below cachelines (ie at
> register size).
> 
> In such a case a CPU might observe the stored value before it is
> committed to memory.

There have been examples of systems with multiple hardware threads
per core, but where the hardware threads share a store buffer.
In this case, the other threads in the core might see a store before
it is committed to a cache line.

As you might imagine, the hardware implementation of memory barriers
in such a system is tricky.  But not impossible.

							Thanx, Paul
Paul E. McKenney July 19, 2021, 6:16 p.m. UTC | #8
On Thu, Jul 15, 2021 at 09:01:48AM +0200, Christoph Hellwig wrote:
> On Thu, Jul 01, 2021 at 07:35:37PM +0800, Hou Tao wrote:

[ . . . ]

> > Fixing it by using smp_load_acquire() & smp_store_release() to guarantee
> > the order between {bio->bi_private|dio->waiter} and {bi_status|bi_blkg}.
> > 
> > Fixes: 189ce2b9dcc3 ("block: fast-path for small and simple direct I/O requests")
> 
> This obviously does not look broken, but smp_load_acquire /
> smp_store_release is way beyond my paygrade.  Adding some CCs.

Huh.

I think that it was back in 2006 when I first told Linus that my goal was
to make memory ordering routine.  I clearly have not yet achieved that
goal, even given a lot of help from a lot of people over a lot of years.

Oh well, what is life without an ongoing challenge?  ;-)

							Thanx, Paul
diff mbox series

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index eb34f5c357cf..a602c6315b0b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -224,7 +224,11 @@  static void blkdev_bio_end_io_simple(struct bio *bio)
 {
 	struct task_struct *waiter = bio->bi_private;
 
-	WRITE_ONCE(bio->bi_private, NULL);
+	/*
+	 * Paired with smp_load_acquire in __blkdev_direct_IO_simple()
+	 * to ensure the order between bi_private and bi_xxx
+	 */
+	smp_store_release(&bio->bi_private, NULL);
 	blk_wake_io_task(waiter);
 }
 
@@ -283,7 +287,8 @@  __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	qc = submit_bio(&bio);
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!READ_ONCE(bio.bi_private))
+		/* Refer to comments in blkdev_bio_end_io_simple() */
+		if (!smp_load_acquire(&bio.bi_private))
 			break;
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
 		    !blk_poll(bdev_get_queue(bdev), qc, true))
@@ -353,7 +358,12 @@  static void blkdev_bio_end_io(struct bio *bio)
 		} else {
 			struct task_struct *waiter = dio->waiter;
 
-			WRITE_ONCE(dio->waiter, NULL);
+			/*
+			 * Paired with smp_load_acquire() in
+			 * __blkdev_direct_IO() to ensure the order between
+			 * dio->waiter and bio->bi_xxx
+			 */
+			smp_store_release(&dio->waiter, NULL);
 			blk_wake_io_task(waiter);
 		}
 	}
@@ -478,7 +488,8 @@  static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!READ_ONCE(dio->waiter))
+		/* Refer to comments in blkdev_bio_end_io */
+		if (!smp_load_acquire(&dio->waiter))
 			break;
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||