
[1/2] virtio-ccw: fix virtio_set_ind_atomic

Message ID 20200616045035.51641-2-pasic@linux.ibm.com (mailing list archive)
State New, archived
Series: two atomic_cmpxchg() related fixes

Commit Message

Halil Pasic June 16, 2020, 4:50 a.m. UTC
The atomic_cmpxchg() loop is broken because we occasionally end up with
old and _old having different values (a legit compiler can generate code
that accessed *ind_addr again to pick up a value for _old instead of
using the value of old that was already fetched according to the
rules of the abstract machine). This means the underlying CS instruction
may use a different old (_old) than the one we intended to use if
atomic_cmpxchg() performed the xchg part.

Let us use volatile to force the rules of the abstract machine for
accesses to *ind_addr. Let us also rewrite the loop so, we that the
new old is used to compute the new desired value if the xchg part
is not performed.

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reported-by: Andre Wild <Andre.Wild1@ibm.com>
Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
---
 hw/s390x/virtio-ccw.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

Comments

Christian Borntraeger June 16, 2020, 5:58 a.m. UTC | #1
On 16.06.20 06:50, Halil Pasic wrote:
> The atomic_cmpxchg() loop is broken because we occasionally end up with
> old and _old having different values (a legit compiler can generate code
> that accessed *ind_addr again to pick up a value for _old instead of
> using the value of old that was already fetched according to the
> rules of the abstract machine). This means the underlying CS instruction
> may use a different old (_old) than the one we intended to use if
> atomic_cmpxchg() performed the xchg part.
> 
> Let us use volatile to force the rules of the abstract machine for
> accesses to *ind_addr. Let us also rewrite the loop so, we that the
> new old is used to compute the new desired value if the xchg part
> is not performed.
> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
> ---
>  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> index c1f4bb1d33..3c988a000b 100644
> --- a/hw/s390x/virtio-ccw.c
> +++ b/hw/s390x/virtio-ccw.c
> @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
>  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>                                       uint8_t to_be_set)
>  {
> -    uint8_t ind_old, ind_new;
> +    uint8_t expected, actual;
>      hwaddr len = 1;
> -    uint8_t *ind_addr;
> +    /* avoid  multiple fetches */
> +    uint8_t volatile *ind_addr;
>  
>      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
>      if (!ind_addr) {
> @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>                       __func__, sch->cssid, sch->ssid, sch->schid);
>          return -1;
>      }
> +    actual = *ind_addr;
>      do {
> -        ind_old = *ind_addr;

to make things easier to understand. Adding a barrier in here also fixes the issue.
Reasoning follows below:

> -        ind_new = ind_old | to_be_set;

with an analysis from Andreas (cc)

 #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
     typeof_strip_qual(*ptr) _old = (old);                               \
     (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
                               __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
     _old;                                                               \
 })
 
ind_old is copied into _old in the macro. Instead of doing the copy from the
register the compiler reloads the value from memory. The result is that _old
and ind_old end up having different values. _old in r1 with the bits set
already and ind_old in r10 with the bits cleared. _old gets updated by CS
and matches ind_old afterwards - both with the bits being 0. So the !=
compare is false and the loop is left without having set any bits.
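
A minimal sketch of this failure mode (hypothetical names, not the QEMU
source; it only shows code the compiler is allowed to emit when *ind_addr
is read through a plain pointer):

#include <stdint.h>

/*
 * Possible expansion of the original loop: the macro-internal copy
 * "_old = (old)" may be satisfied by a second load from memory instead
 * of reusing the value already held in ind_old.
 */
static uint8_t set_ind_sketch(uint8_t *ind_addr, uint8_t to_be_set)
{
    uint8_t ind_old, ind_new, _old;

    do {
        ind_old = *ind_addr;                 /* fetch #1 */
        ind_new = ind_old | to_be_set;
        _old = *ind_addr;                    /* fetch #2: legal, may differ */
        (void)__atomic_compare_exchange_n(ind_addr, &_old, ind_new, false,
                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        /*
         * The CS used fetch #2 as its expected value; on failure _old now
         * holds the current memory contents.  The loop condition compares
         * against fetch #1 (ind_old), so the loop can exit without
         * to_be_set ever having been stored.
         */
    } while (_old != ind_old);

    return ind_old;
}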


Paolo (to),
I am asking myself if it would be safer to add a barrier or something like
this in the macros in include/qemu/atomic.h. 




> -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
> -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
> -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
> +        expected = actual;
> +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
> +    } while (actual != expected);
> +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
> +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
>  
> -    return ind_old;
> +    return actual;
>  }
>  
>  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
>
Cornelia Huck June 16, 2020, 6:33 a.m. UTC | #2
On Tue, 16 Jun 2020 07:58:53 +0200
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> On 16.06.20 06:50, Halil Pasic wrote:
> > The atomic_cmpxchg() loop is broken because we occasionally end up with
> > old and _old having different values (a legit compiler can generate code
> > that accessed *ind_addr again to pick up a value for _old instead of
> > using the value of old that was already fetched according to the
> > rules of the abstract machine). This means the underlying CS instruction
> > may use a different old (_old) than the one we intended to use if
> > atomic_cmpxchg() performed the xchg part.
> > 
> > Let us use volatile to force the rules of the abstract machine for
> > accesses to *ind_addr. Let us also rewrite the loop so, we that the
> > new old is used to compute the new desired value if the xchg part
> > is not performed.
> > 
> > Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> > Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> > Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
> > ---
> >  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
> >  1 file changed, 10 insertions(+), 8 deletions(-)
> > 
> > diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> > index c1f4bb1d33..3c988a000b 100644
> > --- a/hw/s390x/virtio-ccw.c
> > +++ b/hw/s390x/virtio-ccw.c
> > @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
> >  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >                                       uint8_t to_be_set)
> >  {
> > -    uint8_t ind_old, ind_new;
> > +    uint8_t expected, actual;
> >      hwaddr len = 1;
> > -    uint8_t *ind_addr;
> > +    /* avoid  multiple fetches */
> > +    uint8_t volatile *ind_addr;
> >  
> >      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
> >      if (!ind_addr) {
> > @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >                       __func__, sch->cssid, sch->ssid, sch->schid);
> >          return -1;
> >      }
> > +    actual = *ind_addr;
> >      do {
> > -        ind_old = *ind_addr;  
> 
> to make things easier to understand. Adding a barrier in here also fixes the issue.
> Reasoning follows below:
> 
> > -        ind_new = ind_old | to_be_set;  
> 
> with an analysis from Andreas (cc)
> 
>  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
>      typeof_strip_qual(*ptr) _old = (old);                               \
>      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
>                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
>      _old;                                                               \
>  })
>  
> ind_old is copied into _old in the macro. Instead of doing the copy from the
> register the compiler reloads the value from memory. The result is that _old
> and ind_old end up having different values. _old in r1 with the bits set
> already and ind_old in r10 with the bits cleared. _old gets updated by CS
> and matches ind_old afterwards - both with the bits being 0. So the !=
> compare is false and the loop is left without having set any bits.
> 
> 
> Paolo (to),
> I am asking myself if it would be safer to add a barrier or something like
> this in the macros in include/qemu/atomic.h. 

I'm also wondering whether this has been seen on other architectures as
well? There are also some callers in non-s390x code, and dealing with
this in common code would catch them as well.

> 
> 
> 
> 
> > -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
> > -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
> > -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
> > +        expected = actual;
> > +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
> > +    } while (actual != expected);
> > +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
> > +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
> >  
> > -    return ind_old;
> > +    return actual;
> >  }
> >  
> >  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
> >   
> 
>
Christian Borntraeger June 16, 2020, 6:45 a.m. UTC | #3
On 16.06.20 08:33, Cornelia Huck wrote:
> On Tue, 16 Jun 2020 07:58:53 +0200
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> 
>> On 16.06.20 06:50, Halil Pasic wrote:
>>> The atomic_cmpxchg() loop is broken because we occasionally end up with
>>> old and _old having different values (a legit compiler can generate code
>>> that accessed *ind_addr again to pick up a value for _old instead of
>>> using the value of old that was already fetched according to the
>>> rules of the abstract machine). This means the underlying CS instruction
>>> may use a different old (_old) than the one we intended to use if
>>> atomic_cmpxchg() performed the xchg part.
>>>
>>> Let us use volatile to force the rules of the abstract machine for
>>> accesses to *ind_addr. Let us also rewrite the loop so, we that the
>>> new old is used to compute the new desired value if the xchg part
>>> is not performed.
>>>
>>> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
>>> Reported-by: Andre Wild <Andre.Wild1@ibm.com>
>>> Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
>>> ---
>>>  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
>>>  1 file changed, 10 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
>>> index c1f4bb1d33..3c988a000b 100644
>>> --- a/hw/s390x/virtio-ccw.c
>>> +++ b/hw/s390x/virtio-ccw.c
>>> @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
>>>  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>>>                                       uint8_t to_be_set)
>>>  {
>>> -    uint8_t ind_old, ind_new;
>>> +    uint8_t expected, actual;
>>>      hwaddr len = 1;
>>> -    uint8_t *ind_addr;
>>> +    /* avoid  multiple fetches */
>>> +    uint8_t volatile *ind_addr;
>>>  
>>>      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
>>>      if (!ind_addr) {
>>> @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>>>                       __func__, sch->cssid, sch->ssid, sch->schid);
>>>          return -1;
>>>      }
>>> +    actual = *ind_addr;
>>>      do {
>>> -        ind_old = *ind_addr;  
>>
>> to make things easier to understand. Adding a barrier in here also fixes the issue.
>> Reasoning follows below:
>>
>>> -        ind_new = ind_old | to_be_set;  
>>
>> with an analysis from Andreas (cc)
>>
>>  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
>>      typeof_strip_qual(*ptr) _old = (old);                               \
>>      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
>>                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
>>      _old;                                                               \
>>  })
>>  
>> ind_old is copied into _old in the macro. Instead of doing the copy from the
>> register the compiler reloads the value from memory. The result is that _old
>> and ind_old end up having different values. _old in r1 with the bits set
>> already and ind_old in r10 with the bits cleared. _old gets updated by CS
>> and matches ind_old afterwards - both with the bits being 0. So the !=
>> compare is false and the loop is left without having set any bits.
>>
>>
>> Paolo (to),
>> I am asking myself if it would be safer to add a barrier or something like
>> this in the macros in include/qemu/atomic.h. 

Having said this, I think that the refactoring from Halil (to re-use actual) 
also makes sense independent of the fix. 
> 
> I'm also wondering whether this has been seen on other architectures as
> well? There are also some callers in non-s390x code, and dealing with
> this in common code would catch them as well.
> 
>>
>>
>>
>>
>>> -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
>>> -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
>>> -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
>>> +        expected = actual;
>>> +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
>>> +    } while (actual != expected);
>>> +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
>>> +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
>>>  
>>> -    return ind_old;
>>> +    return actual;
>>>  }
>>>  
>>>  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
>>>   
>>
>>
>
Halil Pasic June 16, 2020, 9:31 a.m. UTC | #4
On Tue, 16 Jun 2020 07:58:53 +0200
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> 
> 
> On 16.06.20 06:50, Halil Pasic wrote:
> > The atomic_cmpxchg() loop is broken because we occasionally end up with
> > old and _old having different values (a legit compiler can generate code
> > that accessed *ind_addr again to pick up a value for _old instead of
> > using the value of old that was already fetched according to the
> > rules of the abstract machine). This means the underlying CS instruction
> > may use a different old (_old) than the one we intended to use if
> > atomic_cmpxchg() performed the xchg part.
> > 
> > Let us use volatile to force the rules of the abstract machine for
> > accesses to *ind_addr. Let us also rewrite the loop so, we that the
> > new old is used to compute the new desired value if the xchg part
> > is not performed.
> > 
> > Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> > Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> > Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
> > ---
> >  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
> >  1 file changed, 10 insertions(+), 8 deletions(-)
> > 
> > diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> > index c1f4bb1d33..3c988a000b 100644
> > --- a/hw/s390x/virtio-ccw.c
> > +++ b/hw/s390x/virtio-ccw.c
> > @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
> >  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >                                       uint8_t to_be_set)
> >  {
> > -    uint8_t ind_old, ind_new;
> > +    uint8_t expected, actual;
> >      hwaddr len = 1;
> > -    uint8_t *ind_addr;
> > +    /* avoid  multiple fetches */
> > +    uint8_t volatile *ind_addr;
> >  
> >      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
> >      if (!ind_addr) {
> > @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >                       __func__, sch->cssid, sch->ssid, sch->schid);
> >          return -1;
> >      }
> > +    actual = *ind_addr;
> >      do {
> > -        ind_old = *ind_addr;
> 
> to make things easier to understand. Adding a barrier in here also fixes the issue.
> Reasoning follows below:
> 
> > -        ind_new = ind_old | to_be_set;
> 
> with an analysis from Andreas (cc)
> 
>  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
>      typeof_strip_qual(*ptr) _old = (old);                               \
>      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
>                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
>      _old;                                                               \
>  })
>  

There is also the 

#define atomic_cmpxchg(ptr, old, new) __sync_val_compare_and_swap(ptr, old, new)

variant, I guess, when the C11 stuff is not available. I don't know if
that variant is guaranteed to not have problems with multiple loads.



> ind_old is copied into _old in the macro. Instead of doing the copy from the
> register the compiler reloads the value from memory. The result is that _old
> and ind_old end up having different values. _old in r1 with the bits set
> already and ind_old in r10 with the bits cleared. _old gets updated by CS
> and matches ind_old afterwards - both with the bits being 0. So the !=
> compare is false and the loop is left without having set any bits.
> 
> 
> Paolo (to),
> I am asking myself if it would be safer to add a barrier or something like
> this in the macros in include/qemu/atomic.h. 
>

I think accessing the initial value via a volatile pointer and then using
the value loaded by cmpxchg for subsequent iterations is cleaner.

Regards,
Halil

> 
> > -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
> > -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
> > -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
> > +        expected = actual;
> > +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
> > +    } while (actual != expected);
> > +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
> > +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
> >  
> > -    return ind_old;
> > +    return actual;
> >  }
> >  
> >  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
> > 
> 
> 
>
Halil Pasic June 17, 2020, 11:56 p.m. UTC | #5
On Tue, 16 Jun 2020 08:33:33 +0200
Cornelia Huck <cohuck@redhat.com> wrote:

> >  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
> >      typeof_strip_qual(*ptr) _old = (old);                               \
> >      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
> >                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
> >      _old;                                                               \
> >  })
> >  
> > ind_old is copied into _old in the macro. Instead of doing the copy from the
> > register the compiler reloads the value from memory. The result is that _old
> > and ind_old end up having different values. _old in r1 with the bits set
> > already and ind_old in r10 with the bits cleared. _old gets updated by CS
> > and matches ind_old afterwards - both with the bits being 0. So the !=
> > compare is false and the loop is left without having set any bits.
> > 
> > 
> > Paolo (to),
> > I am asking myself if it would be safer to add a barrier or something like
> > this in the macros in include/qemu/atomic.h.   
> 
> I'm also wondering whether this has been seen on other architectures as
> well? There are also some callers in non-s390x code, and dealing with
> this in common code would catch them as well.

Quite a bunch of users use something like old = atomic_read(..), where
atomic_read() is documented in docs/devel/atomics.rst as follows:
- ``atomic_read()`` and ``atomic_set()``; these prevent the compiler from
  optimizing accesses out of existence and creating unsolicited
  accesses, but do not otherwise impose any ordering on loads and
  stores: both the compiler and the processor are free to reorder
  them.

Maybe I should have used that instead of volatile, but my problem was
that I didn't fully understand what atomic_read() does, and if it does
more than we need. I found the documentation just now.

Another bunch uses constants as old, which is also fine. And there is
a third bunch where I don't know what's up, partly because I did not
dig deep enough.
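
For comparison, a sketch of the loop with the initial fetch done via
atomic_read(); the macro is modelled below as a relaxed C11 load, which is
an assumption about what it boils down to for a uint8_t, not a quote of the
QEMU definition:

#include <stdint.h>

/*
 * Same loop shape as in the patch, but the one-time initial fetch goes
 * through a relaxed atomic load (standing in for atomic_read()) rather
 * than through a volatile-qualified pointer.
 */
static uint8_t set_ind_atomic_read_sketch(uint8_t *ind_addr, uint8_t to_be_set)
{
    uint8_t expected, actual;

    actual = __atomic_load_n(ind_addr, __ATOMIC_RELAXED);   /* ~ atomic_read() */
    do {
        expected = actual;
        actual = __sync_val_compare_and_swap(ind_addr, expected,
                                             expected | to_be_set);
    } while (actual != expected);

    return actual;
}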

Regards,
Halil
Cornelia Huck June 19, 2020, 7:14 a.m. UTC | #6
On Tue, 16 Jun 2020 08:45:14 +0200
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> On 16.06.20 08:33, Cornelia Huck wrote:
> > On Tue, 16 Jun 2020 07:58:53 +0200
> > Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> >   
> >> On 16.06.20 06:50, Halil Pasic wrote:  
> >>> The atomic_cmpxchg() loop is broken because we occasionally end up with
> >>> old and _old having different values (a legit compiler can generate code
> >>> that accessed *ind_addr again to pick up a value for _old instead of
> >>> using the value of old that was already fetched according to the
> >>> rules of the abstract machine). This means the underlying CS instruction
> >>> may use a different old (_old) than the one we intended to use if
> >>> atomic_cmpxchg() performed the xchg part.
> >>>
> >>> Let us use volatile to force the rules of the abstract machine for
> >>> accesses to *ind_addr. Let us also rewrite the loop so, we that the
> >>> new old is used to compute the new desired value if the xchg part
> >>> is not performed.
> >>>
> >>> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> >>> Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> >>> Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
> >>> ---
> >>>  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
> >>>  1 file changed, 10 insertions(+), 8 deletions(-)
> >>>
> >>> diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> >>> index c1f4bb1d33..3c988a000b 100644
> >>> --- a/hw/s390x/virtio-ccw.c
> >>> +++ b/hw/s390x/virtio-ccw.c
> >>> @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
> >>>  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >>>                                       uint8_t to_be_set)
> >>>  {
> >>> -    uint8_t ind_old, ind_new;
> >>> +    uint8_t expected, actual;
> >>>      hwaddr len = 1;
> >>> -    uint8_t *ind_addr;
> >>> +    /* avoid  multiple fetches */
> >>> +    uint8_t volatile *ind_addr;
> >>>  
> >>>      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
> >>>      if (!ind_addr) {
> >>> @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >>>                       __func__, sch->cssid, sch->ssid, sch->schid);
> >>>          return -1;
> >>>      }
> >>> +    actual = *ind_addr;
> >>>      do {
> >>> -        ind_old = *ind_addr;    
> >>
> >> to make things easier to understand. Adding a barrier in here also fixes the issue.
> >> Reasoning follows below:
> >>  
> >>> -        ind_new = ind_old | to_be_set;    
> >>
> >> with an analysis from Andreas (cc)
> >>
> >>  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
> >>      typeof_strip_qual(*ptr) _old = (old);                               \
> >>      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
> >>                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
> >>      _old;                                                               \
> >>  })
> >>  
> >> ind_old is copied into _old in the macro. Instead of doing the copy from the
> >> register the compiler reloads the value from memory. The result is that _old
> >> and ind_old end up having different values. _old in r1 with the bits set
> >> already and ind_old in r10 with the bits cleared. _old gets updated by CS
> >> and matches ind_old afterwards - both with the bits being 0. So the !=
> >> compare is false and the loop is left without having set any bits.
> >>
> >>
> >> Paolo (to),
> >> I am asking myself if it would be safer to add a barrier or something like
> >> this in the macros in include/qemu/atomic.h.   
> 
> Having said this, I think that the refactoring from Halil (to re-use actual) 
> also makes sense independent of the fix. 

What about adding a barrier instead, as you suggested?

(Still wondering about other users of atomic_cmpxchg(), though.)
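
For concreteness, a sketch of what the barrier variant could look like.
This is illustrative only: barrier() is assumed to be the usual compiler
barrier, and atomic_cmpxchg() is modelled with the __sync-based fallback
quoted earlier in the thread, not the actual QEMU macros.

#include <stdint.h>

/* stand-ins so the sketch compiles on its own */
#define barrier()                      asm volatile("" ::: "memory")
#define atomic_cmpxchg(ptr, old, new)  __sync_val_compare_and_swap(ptr, old, new)

/*
 * Keep the original loop shape, but prevent the compiler from
 * rematerializing ind_old from *ind_addr after the explicit fetch.
 */
static uint8_t set_ind_with_barrier(uint8_t *ind_addr, uint8_t to_be_set)
{
    uint8_t ind_old, ind_new;

    do {
        ind_old = *ind_addr;
        barrier();                       /* pin the fetched value */
        ind_new = ind_old | to_be_set;
    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);

    return ind_old;
}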
David Hildenbrand June 19, 2020, 7:33 a.m. UTC | #7
On 18.06.20 01:56, Halil Pasic wrote:
> On Tue, 16 Jun 2020 08:33:33 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
>>>  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
>>>      typeof_strip_qual(*ptr) _old = (old);                               \
>>>      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
>>>                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
>>>      _old;                                                               \
>>>  })
>>>  
>>> ind_old is copied into _old in the macro. Instead of doing the copy from the
>>> register the compiler reloads the value from memory. The result is that _old
>>> and ind_old end up having different values. _old in r1 with the bits set
>>> already and ind_old in r10 with the bits cleared. _old gets updated by CS
>>> and matches ind_old afterwards - both with the bits being 0. So the !=
>>> compare is false and the loop is left without having set any bits.
>>>
>>>
>>> Paolo (to),
>>> I am asking myself if it would be safer to add a barrier or something like
>>> this in the macros in include/qemu/atomic.h.   
>>
>> I'm also wondering whether this has been seen on other architectures as
>> well? There are also some callers in non-s390x code, and dealing with
>> this in common code would catch them as well.
> 
> Quite a bunch of users use something like old = atomic_read(..), where
> atomic_read is documented as in docs/devel/atomics.rst:
> - ``atomic_read()`` and ``atomic_set()``; these prevent the compiler from
>   optimizing accesses out of existence and creating unsolicited
>   accesses, but do not otherwise impose any ordering on loads and
>   stores: both the compiler and the processor are free to reorder
>   them.
> 
> Maybe I should have used that instead of volatile, but my problem was
> that I didn't fully understand what atomic_read() does, and if it does
> more than we need. I found the documentation just now.

IIRC, atomic_read() is the right way of doing it, at least in the
kernel. I use such a loop in QEMU in

https://lkml.kernel.org/r/20200610115419.51688-2-david@redhat.com

But reading docs/devel/atomics.rst:"Comparison with Linux kernel
primitives" I do wonder if that is sufficient.

Any experts around?
Halil Pasic June 19, 2020, 8:17 a.m. UTC | #8
On Fri, 19 Jun 2020 09:33:44 +0200
David Hildenbrand <david@redhat.com> wrote:

> On 18.06.20 01:56, Halil Pasic wrote:
> > On Tue, 16 Jun 2020 08:33:33 +0200
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> >>>  #define atomic_cmpxchg__nocheck(ptr, old, new)    ({                    \
> >>>      typeof_strip_qual(*ptr) _old = (old);                               \
> >>>      (void)__atomic_compare_exchange_n(ptr, &_old, new, false,           \
> >>>                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
> >>>      _old;                                                               \
> >>>  })
> >>>  
> >>> ind_old is copied into _old in the macro. Instead of doing the copy from the
> >>> register the compiler reloads the value from memory. The result is that _old
> >>> and ind_old end up having different values. _old in r1 with the bits set
> >>> already and ind_old in r10 with the bits cleared. _old gets updated by CS
> >>> and matches ind_old afterwards - both with the bits being 0. So the !=
> >>> compare is false and the loop is left without having set any bits.
> >>>
> >>>
> >>> Paolo (to),
> >>> I am asking myself if it would be safer to add a barrier or something like
> >>> this in the macros in include/qemu/atomic.h.   
> >>
> >> I'm also wondering whether this has been seen on other architectures as
> >> well? There are also some callers in non-s390x code, and dealing with
> >> this in common code would catch them as well.
> > 
> > Quite a bunch of users use something like old = atomic_read(..), where
> > atomic_read is documented as in docs/devel/atomics.rst:
> > - ``atomic_read()`` and ``atomic_set()``; these prevent the compiler from
> >   optimizing accesses out of existence and creating unsolicited
> >   accesses, but do not otherwise impose any ordering on loads and
> >   stores: both the compiler and the processor are free to reorder
> >   them.
> > 
> > Maybe I should have used that instead of volatile, but my problem was
> > that I didn't fully understand what atomic_read() does, and if it does
> > more than we need. I found the documentation just now.
> 
> IIRC, atomic_read() is the right way of doing it, at least in the
> kernel. 

In the kernel I would use READ_ONCE.

https://elixir.bootlin.com/linux/v5.8-rc1/source/include/asm-generic/atomic.h#L171

In this case we are not manipulating an atomic variable. For uint8_t
that boils down to an access through a volatile pointer. And that is
what I did :).

> I use such a loop in QEMU in
> 
> https://lkml.kernel.org/r/20200610115419.51688-2-david@redhat.com
> 
> But reading docs/devel/atomics.rst:"Comparison with Linux kernel
> primitives" I do wonder if that is sufficient.
> 
> Any experts around?
> 

IMHO what we want here is READ_ONCE, i.e. a volatile access, and not
necessarily an atomic access. But I suppose atomic access implies volatile
access (the C11 standard declares atomic_load as
C atomic_load(volatile A *object)). Because QEMU seems to use atomic_read()
in such situations, and does not have READ_ONCE, atomic_read() would also
do for me.
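
A sketch of that reduction; READ_ONCE_U8 is a made-up helper for
illustration, not a kernel or QEMU definition:

#include <stdint.h>

/* what READ_ONCE boils down to for a one-byte field: a single access
 * through a volatile lvalue that the compiler may neither drop nor repeat */
#define READ_ONCE_U8(p)  (*(volatile uint8_t *)(p))

/* the initial fetch from the patch, written with that helper instead of
 * declaring ind_addr itself as uint8_t volatile * */
static uint8_t fetch_indicator_once(uint8_t *ind_addr)
{
    return READ_ONCE_U8(ind_addr);
}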

Regards,
Halil
Christian Borntraeger July 1, 2020, 1:13 p.m. UTC | #9
On 16.06.20 06:50, Halil Pasic wrote:
> The atomic_cmpxchg() loop is broken because we occasionally end up with
> old and _old having different values (a legit compiler can generate code
> that accessed *ind_addr again to pick up a value for _old instead of
> using the value of old that was already fetched according to the
> rules of the abstract machine). This means the underlying CS instruction
> may use a different old (_old) than the one we intended to use if
> atomic_cmpxchg() performed the xchg part.
> 
> Let us use volatile to force the rules of the abstract machine for
> accesses to *ind_addr. Let us also rewrite the loop so, we that the
> new old is used to compute the new desired value if the xchg part
> is not performed.
> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> index c1f4bb1d33..3c988a000b 100644
> --- a/hw/s390x/virtio-ccw.c
> +++ b/hw/s390x/virtio-ccw.c
> @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
>  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>                                       uint8_t to_be_set)
>  {
> -    uint8_t ind_old, ind_new;
> +    uint8_t expected, actual;
>      hwaddr len = 1;
> -    uint8_t *ind_addr;
> +    /* avoid  multiple fetches */
> +    uint8_t volatile *ind_addr;
>  
>      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
>      if (!ind_addr) {
> @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>                       __func__, sch->cssid, sch->ssid, sch->schid);
>          return -1;
>      }
> +    actual = *ind_addr;
>      do {
> -        ind_old = *ind_addr;
> -        ind_new = ind_old | to_be_set;
> -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
> -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
> -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
> +        expected = actual;
> +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
> +    } while (actual != expected);
> +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
> +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
>  
> -    return ind_old;
> +    return actual;
>  }
>  
>  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
>
Michael S. Tsirkin July 4, 2020, 6:34 p.m. UTC | #10
On Tue, Jun 16, 2020 at 06:50:34AM +0200, Halil Pasic wrote:
> The atomic_cmpxchg() loop is broken because we occasionally end up with
> old and _old having different values (a legit compiler can generate code
> that accessed *ind_addr again to pick up a value for _old instead of
> using the value of old that was already fetched according to the
> rules of the abstract machine). This means the underlying CS instruction
> may use a different old (_old) than the one we intended to use if
> atomic_cmpxchg() performed the xchg part.

And was this ever observed in the field? Or is this a theoretical issue?
commit log should probably say ...

> 
> Let us use volatile to force the rules of the abstract machine for
> accesses to *ind_addr. Let us also rewrite the loop so, we that the

we that -> we know that?

> new old is used to compute the new desired value if the xchg part
> is not performed.
> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
> ---
>  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> index c1f4bb1d33..3c988a000b 100644
> --- a/hw/s390x/virtio-ccw.c
> +++ b/hw/s390x/virtio-ccw.c
> @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
>  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>                                       uint8_t to_be_set)
>  {
> -    uint8_t ind_old, ind_new;
> +    uint8_t expected, actual;
>      hwaddr len = 1;
> -    uint8_t *ind_addr;
> +    /* avoid  multiple fetches */
> +    uint8_t volatile *ind_addr;
>  
>      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
>      if (!ind_addr) {
> @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
>                       __func__, sch->cssid, sch->ssid, sch->schid);
>          return -1;
>      }
> +    actual = *ind_addr;
>      do {
> -        ind_old = *ind_addr;
> -        ind_new = ind_old | to_be_set;
> -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
> -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
> -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
> +        expected = actual;
> +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
> +    } while (actual != expected);
> +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
> +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
>  
> -    return ind_old;
> +    return actual;
>  }

I wonder whether cpuXX APIs should accept volatile pointers, too:
casting away volatile is always suspicious.
But that is a separate issue ...

>  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
> -- 
> 2.17.1
Christian Borntraeger July 6, 2020, 5:44 a.m. UTC | #11
On 04.07.20 20:34, Michael S. Tsirkin wrote:
> On Tue, Jun 16, 2020 at 06:50:34AM +0200, Halil Pasic wrote:
>> The atomic_cmpxchg() loop is broken because we occasionally end up with
>> old and _old having different values (a legit compiler can generate code
>> that accessed *ind_addr again to pick up a value for _old instead of
>> using the value of old that was already fetched according to the
>> rules of the abstract machine). This means the underlying CS instruction
>> may use a different old (_old) than the one we intended to use if
>> atomic_cmpxchg() performed the xchg part.
> 
> And was this ever observed in the field? Or is this a theoretical issue?
> commit log should probably say ...

It was observed in the field when the XML specified qemu instead of vhost.
Halil Pasic July 6, 2020, 11:19 a.m. UTC | #12
On Sat, 4 Jul 2020 14:34:04 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Jun 16, 2020 at 06:50:34AM +0200, Halil Pasic wrote:
> > The atomic_cmpxchg() loop is broken because we occasionally end up with
> > old and _old having different values (a legit compiler can generate code
> > that accessed *ind_addr again to pick up a value for _old instead of
> > using the value of old that was already fetched according to the
> > rules of the abstract machine). This means the underlying CS instruction
> > may use a different old (_old) than the one we intended to use if
> > atomic_cmpxchg() performed the xchg part.
> 
> And was this ever observed in the field? Or is this a theoretical issue?
> commit log should probably say ...
> 

It was observed in the field (Christian already answered). I think the
message already implies this, because the only conditional statement is
about the compiler behavior.

> > 
> > Let us use volatile to force the rules of the abstract machine for
> > accesses to *ind_addr. Let us also rewrite the loop so, we that the
> 
> we that -> we know that?

s/we//

It would be nice to fix this before the patch gets merged.

> 
> > new old is used to compute the new desired value if the xchg part
> > is not performed.
> > 
> > Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> > Reported-by: Andre Wild <Andre.Wild1@ibm.com>
> > Fixes: 7e7494627f ("s390x/virtio-ccw: Adapter interrupt support.")
> > ---
> >  hw/s390x/virtio-ccw.c | 18 ++++++++++--------
> >  1 file changed, 10 insertions(+), 8 deletions(-)
> > 
> > diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> > index c1f4bb1d33..3c988a000b 100644
> > --- a/hw/s390x/virtio-ccw.c
> > +++ b/hw/s390x/virtio-ccw.c
> > @@ -786,9 +786,10 @@ static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
> >  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >                                       uint8_t to_be_set)
> >  {
> > -    uint8_t ind_old, ind_new;
> > +    uint8_t expected, actual;
> >      hwaddr len = 1;
> > -    uint8_t *ind_addr;
> > +    /* avoid  multiple fetches */
> > +    uint8_t volatile *ind_addr;
> >  
> >      ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
> >      if (!ind_addr) {
> > @@ -796,14 +797,15 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
> >                       __func__, sch->cssid, sch->ssid, sch->schid);
> >          return -1;
> >      }
> > +    actual = *ind_addr;
> >      do {
> > -        ind_old = *ind_addr;
> > -        ind_new = ind_old | to_be_set;
> > -    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
> > -    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
> > -    cpu_physical_memory_unmap(ind_addr, len, 1, len);
> > +        expected = actual;
> > +        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
> > +    } while (actual != expected);
> > +    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
> > +    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
> >  
> > -    return ind_old;
> > +    return actual;
> >  }
> 
> I wonder whether cpuXX APIs should accept volatile pointers, too:
> casting away volatile is always suspicious.
> But that is a separate issue ...
> 

Nod.

Thanks for having a look!

> >  static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
> > -- 
> > 2.17.1
>

Patch

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index c1f4bb1d33..3c988a000b 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -786,9 +786,10 @@  static inline VirtioCcwDevice *to_virtio_ccw_dev_fast(DeviceState *d)
 static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
                                      uint8_t to_be_set)
 {
-    uint8_t ind_old, ind_new;
+    uint8_t expected, actual;
     hwaddr len = 1;
-    uint8_t *ind_addr;
+    /* avoid  multiple fetches */
+    uint8_t volatile *ind_addr;
 
     ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
     if (!ind_addr) {
@@ -796,14 +797,15 @@  static uint8_t virtio_set_ind_atomic(SubchDev *sch, uint64_t ind_loc,
                      __func__, sch->cssid, sch->ssid, sch->schid);
         return -1;
     }
+    actual = *ind_addr;
     do {
-        ind_old = *ind_addr;
-        ind_new = ind_old | to_be_set;
-    } while (atomic_cmpxchg(ind_addr, ind_old, ind_new) != ind_old);
-    trace_virtio_ccw_set_ind(ind_loc, ind_old, ind_new);
-    cpu_physical_memory_unmap(ind_addr, len, 1, len);
+        expected = actual;
+        actual = atomic_cmpxchg(ind_addr, expected, expected | to_be_set);
+    } while (actual != expected);
+    trace_virtio_ccw_set_ind(ind_loc, actual, actual | to_be_set);
+    cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
 
-    return ind_old;
+    return actual;
 }
 
 static void virtio_ccw_notify(DeviceState *d, uint16_t vector)