
[v4,1/2] introduce test_bit_acquire and use it in wait_on_bit

Message ID alpine.LRH.2.02.2208010640260.22006@file01.intranet.prod.int.rdu2.redhat.com (mailing list archive)
State New, archived
Series [v4,1/2] introduce test_bit_acquire and use it in wait_on_bit

Commit Message

Mikulas Patocka Aug. 1, 2022, 10:42 a.m. UTC
wait_on_bit tests the bit without any memory barriers; consequently, on
architectures with weak memory ordering, the code that follows wait_on_bit
may be reordered before the bit test. When code tests for some event using
wait_on_bit and then performs a load, the load may be unexpectedly
reordered before wait_on_bit and return data that existed before the event
occurred.

Such bugs exist in fs/buffer.c:__wait_on_buffer,
drivers/md/dm-bufio.c:new_read,
drivers/media/usb/dvb-usb-v2/dvb_usb_core.c:dvb_usb_start_feed,
drivers/bluetooth/btusb.c:btusb_mtk_hci_wmt_sync
and perhaps in other places.
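
As an illustration, the problematic pattern looks roughly like this (a
hedged sketch modeled on fs/buffer.c; the helper is hypothetical, not code
copied from the tree):

static void example_reader(struct buffer_head *bh, void *dst)
{
	/* Wait for the buffer to be unlocked (BH_Lock cleared). */
	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);

	/*
	 * Without acquire semantics in the bit test above, this plain load
	 * may be reordered before the test of BH_Lock on a weakly ordered
	 * CPU and observe buffer contents from before the unlock.
	 */
	memcpy(dst, bh->b_data, bh->b_size);
}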

We fix this class of bugs by adding a new function test_bit_acquire that
reads the bit and provides acquire memory ordering semantics.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org

---
 arch/s390/include/asm/bitops.h                       |   10 ++++++++++
 arch/x86/include/asm/bitops.h                        |    7 ++++++-
 include/asm-generic/bitops/instrumented-non-atomic.h |   11 +++++++++++
 include/asm-generic/bitops/non-atomic.h              |   13 +++++++++++++
 include/linux/wait_bit.h                             |    8 ++++----
 kernel/sched/wait_bit.c                              |    6 +++---
 6 files changed, 47 insertions(+), 8 deletions(-)

Comments

Will Deacon Aug. 1, 2022, 3:54 p.m. UTC | #1
On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> wait_on_bit tests the bit without any memory barriers, consequently the
> code that follows wait_on_bit may be moved before testing the bit on
> architectures with weak memory ordering. When the code tests for some
> event using wait_on_bit and then performs a load operation, the load may
> be unexpectedly moved before wait_on_bit and it may return data that
> existed before the event occurred.
> 
> Such bugs exist in fs/buffer.c:__wait_on_buffer,
> drivers/md/dm-bufio.c:new_read,
> drivers/media/usb/dvb-usb-v2/dvb_usb_core.c:dvb_usb_start_feed,
> drivers/bluetooth/btusb.c:btusb_mtk_hci_wmt_sync
> and perhaps in other places.
> 
> We fix this class of bugs by adding a new function test_bit_acquire that
> reads the bit and provides acquire memory ordering semantics.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Cc: stable@vger.kernel.org
> 
> ---
>  arch/s390/include/asm/bitops.h                       |   10 ++++++++++
>  arch/x86/include/asm/bitops.h                        |    7 ++++++-
>  include/asm-generic/bitops/instrumented-non-atomic.h |   11 +++++++++++
>  include/asm-generic/bitops/non-atomic.h              |   13 +++++++++++++
>  include/linux/wait_bit.h                             |    8 ++++----
>  kernel/sched/wait_bit.c                              |    6 +++---
>  6 files changed, 47 insertions(+), 8 deletions(-)
> 
> Index: linux-2.6/arch/x86/include/asm/bitops.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
>  
>  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
>  {
> -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
>  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +	barrier();
> +	return r;

Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
volatile and we don't need a barrier() like this in the definition of
READ_ONCE(), for example.
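
For reference, the generic READ_ONCE() is essentially just a volatile load
with no trailing compiler barrier -- roughly (simplified from
include/asm-generic/rwonce.h):

#define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))

#define READ_ONCE(x)						\
({								\
	compiletime_assert_rwonce_type(x);			\
	__READ_ONCE(x);						\
})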

> Index: linux-2.6/include/linux/wait_bit.h
> ===================================================================
> --- linux-2.6.orig/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> +++ linux-2.6/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> @@ -71,7 +71,7 @@ static inline int
>  wait_on_bit(unsigned long *word, int bit, unsigned mode)
>  {
>  	might_sleep();
> -	if (!test_bit(bit, word))
> +	if (!test_bit_acquire(bit, word))
>  		return 0;
>  	return out_of_line_wait_on_bit(word, bit,
>  				       bit_wait,

Yet another approach here would be to leave test_bit as-is and add a call to
smp_acquire__after_ctrl_dep() since that exists already -- I don't have
strong opinions about it, but it saves you having to add another stub to
x86.
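
That alternative would look something like this in wait_on_bit() -- a
sketch only, not a tested patch:

static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
	might_sleep();
	if (!test_bit(bit, word)) {
		/* Turn the control dependency on the bit test into ACQUIRE. */
		smp_acquire__after_ctrl_dep();
		return 0;
	}
	return out_of_line_wait_on_bit(word, bit, bit_wait, mode);
}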

Will
Mikulas Patocka Aug. 1, 2022, 4:12 p.m. UTC | #2
On Mon, 1 Aug 2022, Will Deacon wrote:

> On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> 
> > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > ===================================================================
> > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> >  
> >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> >  {
> > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > +	barrier();
> > +	return r;
> 
> Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> volatile and we don't need a barrier() like this in the definition of
> READ_ONCE(), for example.

gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
accesses around volatile accesses.

The purpose of the compiler barrier is to make sure that the non-volatile 
accesses that follow test_bit are not reordered by the compiler before the 
volatile access to addr.
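
A contrived sketch of the movement the compiler is allowed to perform
without the barrier() (plain C, not kernel code):

int consumer(const volatile unsigned long *flag, const int *data)
{
	unsigned long f = *flag;	/* volatile load of the flag word */
	/*
	 * Plain load: the compiler may hoist it above the volatile load,
	 * because plain accesses are not ordered against volatile ones.
	 * A barrier() between the two loads forbids that movement.
	 */
	int d = *data;
	return f ? d : 0;
}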

> > Index: linux-2.6/include/linux/wait_bit.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> > +++ linux-2.6/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> > @@ -71,7 +71,7 @@ static inline int
> >  wait_on_bit(unsigned long *word, int bit, unsigned mode)
> >  {
> >  	might_sleep();
> > -	if (!test_bit(bit, word))
> > +	if (!test_bit_acquire(bit, word))
> >  		return 0;
> >  	return out_of_line_wait_on_bit(word, bit,
> >  				       bit_wait,
> 
> Yet another approach here would be to leave test_bit as-is and add a call to
> smp_acquire__after_ctrl_dep() since that exists already -- I don't have
> strong opinions about it, but it saves you having to add another stub to
> x86.

It would be the same as my previous patch with smp_rmb() that Linus didn't 
like. But I think smp_rmb (or smp_acquire__after_ctrl_dep) would be 
correct here.

> Will

Mikulas
Boqun Feng Aug. 1, 2022, 6:17 p.m. UTC | #3
On Mon, Aug 01, 2022 at 12:12:47PM -0400, Mikulas Patocka wrote:
> 
> 
> On Mon, 1 Aug 2022, Will Deacon wrote:
> 
> > On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> > 
> > > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > > ===================================================================
> > > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> > >  
> > >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> > >  {
> > > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> > >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > +	barrier();
> > > +	return r;
> > 
> > Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> > volatile and we don't need a barrier() like this in the definition of
> > READ_ONCE(), for example.
> 
> gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
> accesses around volatile accesses.
> 
> The purpose of the compiler barrier is to make sure that the non-volatile 
> accesses that follow test_bit are not reordered by the compiler before the 
> volatile access to addr.
> 

Better to have a constant_test_bit_acquire()? I don't think all
test_bit() call sites need the ordering?

Regards,
Boqun

> > > Index: linux-2.6/include/linux/wait_bit.h
> > > ===================================================================
> > > --- linux-2.6.orig/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> > > +++ linux-2.6/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> > > @@ -71,7 +71,7 @@ static inline int
> > >  wait_on_bit(unsigned long *word, int bit, unsigned mode)
> > >  {
> > >  	might_sleep();
> > > -	if (!test_bit(bit, word))
> > > +	if (!test_bit_acquire(bit, word))
> > >  		return 0;
> > >  	return out_of_line_wait_on_bit(word, bit,
> > >  				       bit_wait,
> > 
> > Yet another approach here would be to leave test_bit as-is and add a call to
> > smp_acquire__after_ctrl_dep() since that exists already -- I don't have
> > strong opinions about it, but it saves you having to add another stub to
> > x86.
> 
> It would be the same as my previous patch with smp_rmb() that Linus didn't 
> like. But I think smp_rmb (or smp_acquire__after_ctrl_dep) would be 
> correct here.
> 
> > Will
> 
> Mikulas
>
David Laight Aug. 2, 2022, 8 a.m. UTC | #4
From: Boqun Feng
> Sent: 01 August 2022 19:17
> 
> On Mon, Aug 01, 2022 at 12:12:47PM -0400, Mikulas Patocka wrote:
> >
> >
> > On Mon, 1 Aug 2022, Will Deacon wrote:
> >
> > > On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> > >
> > > > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > > > ===================================================================
> > > > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> > > >
> > > >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> > > >  {
> > > > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > > +	barrier();
> > > > +	return r;
> > >
> > > Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> > > volatile and we don't need a barrier() like this in the definition of
> > > READ_ONCE(), for example.
> >
> > gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
> > accesses around volatile accesses.
> >
> > The purpose of the compiler barrier is to make sure that the non-volatile
> > accesses that follow test_bit are not reordered by the compiler before the
> > volatile access to addr.
> >
> 
> Better to have a constant_test_bit_acquire()? I don't think all
> test_bit() call sites need the ordering?

It is also unlikely that the compiler will 'usefully' move a read
across the test_bit() call - which is likely to be in a conditional.
So barrier() is unlikely to significantly affect the generated code.

Indeed, perhaps test_bit() should always enforce read ordering
even on weakly ordered cpus?
It is used with set_bit() and clear_bit() which are expensive
locked operations - so a slightly more expensive test_bit()
probably doesn't matter.

Remember these aren't functions to replace &= and |=.
(In spite of some code paths.)

	David

Will Deacon Aug. 2, 2022, 8:40 a.m. UTC | #5
On Mon, Aug 01, 2022 at 12:12:47PM -0400, Mikulas Patocka wrote:
> On Mon, 1 Aug 2022, Will Deacon wrote:
> > On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> > 
> > > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > > ===================================================================
> > > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> > >  
> > >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> > >  {
> > > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> > >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > +	barrier();
> > > +	return r;
> > 
> > Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> > volatile and we don't need a barrier() like this in the definition of
> > READ_ONCE(), for example.
> 
> gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
> accesses around volatile accesses.
> 
> The purpose of the compiler barrier is to make sure that the non-volatile 
> accesses that follow test_bit are not reordered by the compiler before the 
> volatile access to addr.

If we need these accesses to be ordered reliably, then we need a CPU barrier
and that will additionally prevent the compiler reordering. So I still don't
think we need the barrier() here.

> > > Index: linux-2.6/include/linux/wait_bit.h
> > > ===================================================================
> > > --- linux-2.6.orig/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> > > +++ linux-2.6/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
> > > @@ -71,7 +71,7 @@ static inline int
> > >  wait_on_bit(unsigned long *word, int bit, unsigned mode)
> > >  {
> > >  	might_sleep();
> > > -	if (!test_bit(bit, word))
> > > +	if (!test_bit_acquire(bit, word))
> > >  		return 0;
> > >  	return out_of_line_wait_on_bit(word, bit,
> > >  				       bit_wait,
> > 
> > Yet another approach here would be to leave test_bit as-is and add a call to
> > smp_acquire__after_ctrl_dep() since that exists already -- I don't have
> > strong opinions about it, but it saves you having to add another stub to
> > x86.
> 
> It would be the same as my previous patch with smp_rmb() that Linus didn't 
> like. But I think smp_rmb (or smp_acquire__after_ctrl_dep) would be 
> correct here.

Right, I saw Linus' objection to smp_rmb() and I'm not sure where
smp_acquire__after_ctrl_dep() fits in with his line of reasoning. On the one
hand, it's talking about acquire ordering, but on the other, it's ugly as
sin :)

Will
Mikulas Patocka Aug. 2, 2022, 11:38 a.m. UTC | #6
On Tue, 2 Aug 2022, Will Deacon wrote:

> On Mon, Aug 01, 2022 at 12:12:47PM -0400, Mikulas Patocka wrote:
> > On Mon, 1 Aug 2022, Will Deacon wrote:
> > > On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> > > 
> > > > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > > > ===================================================================
> > > > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> > > >  
> > > >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> > > >  {
> > > > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > > +	barrier();
> > > > +	return r;
> > > 
> > > Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> > > volatile and we don't need a barrier() like this in the definition of
> > > READ_ONCE(), for example.
> > 
> > gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
> > accesses around volatile accesses.
> > 
> > The purpose of the compiler barrier is to make sure that the non-volatile 
> > accesses that follow test_bit are not reordered by the compiler before the 
> > volatile access to addr.
> 
> If we need these accesses to be ordered reliably, then we need a CPU barrier
> and that will additionally prevent the compiler reordering. So I still don't
> think we need the barrier() here.

This is x86-specific code. x86 has strong memory ordering, so we only care 
about compiler reordering.

We could use smp_rmb() (or smp_load_acquire()) instead of barrier() here, 
but smp_rmb() and smp_load_acquire() on x86 is identical to barrier() 
anyway.
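
Roughly, paraphrasing arch/x86/include/asm/barrier.h:

#define __smp_rmb()	barrier()

#define __smp_load_acquire(p)						\
({									\
	typeof(*p) ___p1 = READ_ONCE(*p);				\
	compiletime_assert_atomic_type(*p);				\
	barrier();							\
	___p1;								\
})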

Mikulas
Will Deacon Aug. 2, 2022, 1:36 p.m. UTC | #7
On Tue, Aug 02, 2022 at 07:38:17AM -0400, Mikulas Patocka wrote:
> 
> 
> On Tue, 2 Aug 2022, Will Deacon wrote:
> 
> > On Mon, Aug 01, 2022 at 12:12:47PM -0400, Mikulas Patocka wrote:
> > > On Mon, 1 Aug 2022, Will Deacon wrote:
> > > > On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> > > > 
> > > > > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > > > > ===================================================================
> > > > > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> > > > >  
> > > > >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> > > > >  {
> > > > > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > > > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > > >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > > > +	barrier();
> > > > > +	return r;
> > > > 
> > > > Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> > > > volatile and we don't need a barrier() like this in the definition of
> > > > READ_ONCE(), for example.
> > > 
> > > gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
> > > accesses around volatile accesses.
> > > 
> > > The purpose of the compiler barrier is to make sure that the non-volatile 
> > > accesses that follow test_bit are not reordered by the compiler before the 
> > > volatile access to addr.
> > 
> > If we need these accesses to be ordered reliably, then we need a CPU barrier
> > and that will additionally prevent the compiler reordering. So I still don't
> > think we need the barrier() here.
> 
> This is x86-specific code. x86 has strong memory ordering, so we only care 
> about compiler reordering.

Indeed, but what I'm trying to say is that the _caller_ would have a memory
barrier in this case, and so there's no need for one in here. test_bit() does
not have ordering semantics.

Will
Mikulas Patocka Aug. 2, 2022, 3:57 p.m. UTC | #8
On Tue, 2 Aug 2022, Will Deacon wrote:

> On Tue, Aug 02, 2022 at 07:38:17AM -0400, Mikulas Patocka wrote:
> > 
> > 
> > On Tue, 2 Aug 2022, Will Deacon wrote:
> > 
> > > On Mon, Aug 01, 2022 at 12:12:47PM -0400, Mikulas Patocka wrote:
> > > > On Mon, 1 Aug 2022, Will Deacon wrote:
> > > > > On Mon, Aug 01, 2022 at 06:42:15AM -0400, Mikulas Patocka wrote:
> > > > > 
> > > > > > Index: linux-2.6/arch/x86/include/asm/bitops.h
> > > > > > ===================================================================
> > > > > > --- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > > > +++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
> > > > > > @@ -203,8 +203,10 @@ arch_test_and_change_bit(long nr, volati
> > > > > >  
> > > > > >  static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
> > > > > >  {
> > > > > > -	return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > > > > +	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > > > >  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > > > > +	barrier();
> > > > > > +	return r;
> > > > > 
> > > > > Hmm, I find it a bit weird to have a barrier() here given that 'addr' is
> > > > > volatile and we don't need a barrier() like this in the definition of
> > > > > READ_ONCE(), for example.
> > > > 
> > > > gcc doesn't reorder two volatile accesses, but it can reorder non-volatile
> > > > accesses around volatile accesses.
> > > > 
> > > > The purpose of the compiler barrier is to make sure that the non-volatile 
> > > > accesses that follow test_bit are not reordered by the compiler before the 
> > > > volatile access to addr.
> > > 
> > > If we need these accesses to be ordered reliably, then we need a CPU barrier
> > > and that will additionally prevent the compiler reordering. So I still don't
> > > think we need the barrier() here.
> > 
> > This is x86-specific code. x86 has strong memory ordering, so we only care 
> > about compiler reordering.
> 
> Indeed, but what I'm trying to say is that the _caller_ would have a memory
> barrier in this case, and so there's no need for one in here. test_bit() does
> not have ordering semantics.
> 
> Will

But constant_test_bit() is also used for test_bit_acquire(), and for 
test_bit_acquire(), the barrier is needed. Without the barrier, it doesn't 
have acquire semantics, because the compiler (not CPU) can move the 
following non-volatile accesses before the volatile access to "addr[nr >> 
_BITOPS_LONG_SHIFT]".

See this piece of code in arch/x86/include/asm/bitops.h in the patch:
+#define arch_test_bit_acquire(nr, addr)                \
+	arch_test_bit(nr, addr)

We could split constant_test_bit() into two functions, constant_test_bit() 
and constant_test_bit_acquire(), and put the barrier only in 
constant_test_bit_acquire(). But I chose not to, because code duplication 
is bad and the overhead of the compiler barrier is zero.
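
For completeness, the split variant would look roughly like this (a sketch
only, not part of the posted patch):

static __always_inline bool constant_test_bit_acquire(long nr,
		const volatile unsigned long *addr)
{
	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
	barrier();
	return r;
}

#define arch_test_bit_acquire(nr, addr)			\
	(__builtin_constant_p((nr))			\
	 ? constant_test_bit_acquire((nr), (addr))	\
	 : variable_test_bit((nr), (addr)))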

Mikulas

Patch

Index: linux-2.6/arch/x86/include/asm/bitops.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
+++ linux-2.6/arch/x86/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
@@ -203,8 +203,10 @@  arch_test_and_change_bit(long nr, volati
 
 static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
 {
-	return ((1UL << (nr & (BITS_PER_LONG-1))) &
+	bool r = ((1UL << (nr & (BITS_PER_LONG-1))) &
 		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
+	barrier();
+	return r;
 }
 
 static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
@@ -224,6 +226,9 @@  static __always_inline bool variable_tes
 	 ? constant_test_bit((nr), (addr))	\
 	 : variable_test_bit((nr), (addr)))
 
+#define arch_test_bit_acquire(nr, addr)		\
+	arch_test_bit(nr, addr)
+
 /**
  * __ffs - find first set bit in word
  * @word: The word to search
Index: linux-2.6/include/asm-generic/bitops/instrumented-non-atomic.h
===================================================================
--- linux-2.6.orig/include/asm-generic/bitops/instrumented-non-atomic.h	2022-08-01 12:27:43.000000000 +0200
+++ linux-2.6/include/asm-generic/bitops/instrumented-non-atomic.h	2022-08-01 12:28:33.000000000 +0200
@@ -135,4 +135,15 @@  static __always_inline bool test_bit(lon
 	return arch_test_bit(nr, addr);
 }
 
+/**
+ * test_bit_acquire - Determine whether a bit is set with acquire semantics
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+	instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+	return arch_test_bit_acquire(nr, addr);
+}
+
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
Index: linux-2.6/include/asm-generic/bitops/non-atomic.h
===================================================================
--- linux-2.6.orig/include/asm-generic/bitops/non-atomic.h	2022-08-01 12:27:43.000000000 +0200
+++ linux-2.6/include/asm-generic/bitops/non-atomic.h	2022-08-01 12:27:43.000000000 +0200
@@ -119,4 +119,17 @@  arch_test_bit(unsigned int nr, const vol
 }
 #define test_bit arch_test_bit
 
+/**
+ * arch_test_bit_acquire - Determine whether a bit is set with acquire semantics
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline int
+arch_test_bit_acquire(unsigned int nr, const volatile unsigned long *addr)
+{
+	unsigned long val = smp_load_acquire(&addr[BIT_WORD(nr)]);
+	return 1UL & (val >> (nr & (BITS_PER_LONG-1)));
+}
+#define test_bit_acquire arch_test_bit_acquire
+
 #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
Index: linux-2.6/arch/s390/include/asm/bitops.h
===================================================================
--- linux-2.6.orig/arch/s390/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
+++ linux-2.6/arch/s390/include/asm/bitops.h	2022-08-01 12:27:43.000000000 +0200
@@ -184,6 +184,16 @@  static inline bool arch_test_bit(unsigne
 	return *addr & mask;
 }
 
+static inline bool arch_test_bit_acquire(unsigned long nr,
+					 const volatile unsigned long *ptr)
+{
+	const volatile unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long val = smp_load_acquire(addr);
+	unsigned long mask = __bitops_mask(nr);
+
+	return val & mask;
+}
+
 static inline bool arch_test_and_set_bit_lock(unsigned long nr,
 					      volatile unsigned long *ptr)
 {
Index: linux-2.6/include/linux/wait_bit.h
===================================================================
--- linux-2.6.orig/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
+++ linux-2.6/include/linux/wait_bit.h	2022-08-01 12:27:43.000000000 +0200
@@ -71,7 +71,7 @@  static inline int
 wait_on_bit(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit,
 				       bit_wait,
@@ -96,7 +96,7 @@  static inline int
 wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit,
 				       bit_wait_io,
@@ -123,7 +123,7 @@  wait_on_bit_timeout(unsigned long *word,
 		    unsigned long timeout)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit_timeout(word, bit,
 					       bit_wait_timeout,
@@ -151,7 +151,7 @@  wait_on_bit_action(unsigned long *word,
 		   unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit, action, mode);
 }
Index: linux-2.6/kernel/sched/wait_bit.c
===================================================================
--- linux-2.6.orig/kernel/sched/wait_bit.c	2022-08-01 12:27:43.000000000 +0200
+++ linux-2.6/kernel/sched/wait_bit.c	2022-08-01 12:27:43.000000000 +0200
@@ -25,7 +25,7 @@  int wake_bit_function(struct wait_queue_
 
 	if (wait_bit->key.flags != key->flags ||
 			wait_bit->key.bit_nr != key->bit_nr ||
-			test_bit(key->bit_nr, key->flags))
+			test_bit_acquire(key->bit_nr, key->flags))
 		return 0;
 
 	return autoremove_wake_function(wq_entry, mode, sync, key);
@@ -45,9 +45,9 @@  __wait_on_bit(struct wait_queue_head *wq
 
 	do {
 		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
-		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+		if (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags))
 			ret = (*action)(&wbq_entry->key, mode);
-	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+	} while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
 
 	finish_wait(wq_head, &wbq_entry->wq_entry);