Message ID | 20170203132737.566324209@infradead.org (mailing list archive) |
---|---|
State | New, archived |
On Fri, Feb 03, 2017 at 02:26:02PM +0100, Peter Zijlstra wrote:
> Add a new cmpxchg interface:
>
>   bool try_cmpxchg(u{8,16,32,64} *ptr, u{8,16,32,64} *val, u{8,16,32,64} new);
>
> Where the boolean returns the result of the compare; and thus if the
> exchange happened; and in case of failure, the new value of *ptr is
> returned in *val.
>
> This allows simplification/improvement of loops like:
>
>         for (;;) {
>                 new = val $op $imm;
>                 old = cmpxchg(ptr, val, new);
>                 if (old == val)
>                         break;
>                 val = old;
>         }
>
> into:
>
>         for (;;) {
>                 new = val $op $imm;
>                 if (try_cmpxchg(ptr, &val, new))
>                         break;
>         }
>
> while also generating better code (GCC6 and onwards).
>

But switching to try_cmpxchg() will make @val a memory location, which
could not be put in a register. And this will generate unnecessary
memory accesses on archs having enough registers (PPC, e.g.).

> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> --- a/arch/x86/include/asm/atomic.h
> +++ b/arch/x86/include/asm/atomic.h
> @@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxch
>  	return cmpxchg(&v->counter, old, new);
>  }
>  
> +#define atomic_try_cmpxchg atomic_try_cmpxchg
> +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
> +{
> +	return try_cmpxchg(&v->counter, old, new);
> +}
> +
>  static inline int atomic_xchg(atomic_t *v, int new)
>  {
>  	return xchg(&v->counter, new);
> --- a/arch/x86/include/asm/cmpxchg.h
> +++ b/arch/x86/include/asm/cmpxchg.h
> @@ -153,6 +153,75 @@ extern void __add_wrong_size(void)
>  #define cmpxchg_local(ptr, old, new)				\
>  	__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
>  
> +
> +#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock)	\
> +({								\
> +	bool success;						\
> +	__typeof__(_ptr) _old = (_pold);			\
> +	__typeof__(*(_ptr)) __old = *_old;			\
> +	__typeof__(*(_ptr)) __new = (_new);			\
> +	switch (size) {						\
> +	case __X86_CASE_B:					\
> +	{							\
> +		volatile u8 *__ptr = (volatile u8 *)(_ptr);	\
> +		asm volatile(lock "cmpxchgb %[new], %[ptr]"	\
> +			     CC_SET(z)				\
> +			     : CC_OUT(z) (success),		\
> +			       [ptr] "+m" (*__ptr),		\
> +			       [old] "+a" (__old)		\
> +			     : [new] "q" (__new)		\
> +			     : "memory");			\
> +		break;						\
> +	}							\
> +	case __X86_CASE_W:					\
> +	{							\
> +		volatile u16 *__ptr = (volatile u16 *)(_ptr);	\
> +		asm volatile(lock "cmpxchgw %[new], %[ptr]"	\
> +			     CC_SET(z)				\
> +			     : CC_OUT(z) (success),		\
> +			       [ptr] "+m" (*__ptr),		\
> +			       [old] "+a" (__old)		\
> +			     : [new] "r" (__new)		\
> +			     : "memory");			\
> +		break;						\
> +	}							\
> +	case __X86_CASE_L:					\
> +	{							\
> +		volatile u32 *__ptr = (volatile u32 *)(_ptr);	\
> +		asm volatile(lock "cmpxchgl %[new], %[ptr]"	\
> +			     CC_SET(z)				\
> +			     : CC_OUT(z) (success),		\
> +			       [ptr] "+m" (*__ptr),		\
> +			       [old] "+a" (__old)		\
> +			     : [new] "r" (__new)		\
> +			     : "memory");			\
> +		break;						\
> +	}							\
> +	case __X86_CASE_Q:					\
> +	{							\
> +		volatile u64 *__ptr = (volatile u64 *)(_ptr);	\
> +		asm volatile(lock "cmpxchgq %[new], %[ptr]"	\
> +			     CC_SET(z)				\
> +			     : CC_OUT(z) (success),		\
> +			       [ptr] "+m" (*__ptr),		\
> +			       [old] "+a" (__old)		\
> +			     : [new] "r" (__new)		\
> +			     : "memory");			\
> +		break;						\
> +	}							\
> +	default:						\
> +		__cmpxchg_wrong_size();				\
> +	}							\
> +	*_old = __old;						\
> +	success;						\
> +})
> +
> +#define __try_cmpxchg(ptr, pold, new, size)			\
> +	__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
> +
> +#define try_cmpxchg(ptr, pold, new)				\
> +	__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
> +
>  /*
>   * xadd() adds "inc" to "*ptr" and atomically returns the previous
>   * value of "*ptr".
> --- a/include/linux/atomic.h
> +++ b/include/linux/atomic.h
> @@ -423,6 +423,28 @@
>  #endif
>  #endif /* atomic_cmpxchg_relaxed */
>  
> +#ifndef atomic_try_cmpxchg
> +
> +#define __atomic_try_cmpxchg(type, _p, _po, _n)			\
> +({								\
> +	typeof(_po) __po = (_po);				\
> +	typeof(*(_po)) __o = *__po;				\
> +	bool success = (atomic_cmpxchg##type((_p), __o, (_n)) == __o);	\
> +	*__po = __o;						\

Besides, is this part correct? atomic_cmpxchg_*() wouldn't change the
value of __o, so *__po wouldn't be changed.. IOW, in case of failure,
*ptr wouldn't be updated to a new value.

Maybe this should be:

	bool success;
	*__po = atomic_cmpxchg##type((_p), __o, (_n));
	sucess = (*__po == _o);

, right?

Regards,
Boqun

> +	success;						\
> +})
> +
> +#define atomic_try_cmpxchg(_p, _po, _n)		__atomic_try_cmpxchg(, _p, _po, _n)
> +#define atomic_try_cmpxchg_relaxed(_p, _po, _n)	__atomic_try_cmpxchg(_relaxed, _p, _po, _n)
> +#define atomic_try_cmpxchg_acquire(_p, _po, _n)	__atomic_try_cmpxchg(_acquire, _p, _po, _n)
> +#define atomic_try_cmpxchg_release(_p, _po, _n)	__atomic_try_cmpxchg(_release, _p, _po, _n)
> +
> +#else /* atomic_try_cmpxchg */
> +#define atomic_try_cmpxchg_relaxed	atomic_try_cmpxchg
> +#define atomic_try_cmpxchg_acquire	atomic_try_cmpxchg
> +#define atomic_try_cmpxchg_release	atomic_try_cmpxchg
> +#endif /* atomic_try_cmpxchg */
> +
>  /* cmpxchg_relaxed */
>  #ifndef cmpxchg_relaxed
>  #define cmpxchg_relaxed cmpxchg
>
>
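A quick user-space sketch of the failure mode Boqun describes (the helper names are
invented and the GCC __sync builtin stands in for the kernel's cmpxchg; this is an
illustration, not the kernel macro): with the fallback as posted, *val keeps its
stale value after a failed compare, so the canonical retry loop can never make
progress.

#include <stdbool.h>
#include <stdio.h>

/* Same shape as the posted fallback: on failure, *expected is written
 * back with the stale value instead of the current value of *ptr. */
static bool buggy_try_cmpxchg(int *ptr, int *expected, int new)
{
	int old = *expected;
	bool success = (__sync_val_compare_and_swap(ptr, old, new) == old);

	*expected = old;	/* bug: never picks up the current *ptr */
	return success;
}

/* The shape Boqun suggests: publish whatever cmpxchg returned. */
static bool fixed_try_cmpxchg(int *ptr, int *expected, int new)
{
	int old = *expected;

	*expected = __sync_val_compare_and_swap(ptr, old, new);
	return *expected == old;
}

int main(void)
{
	int v = 5, val = 0;

	buggy_try_cmpxchg(&v, &val, 1);
	printf("buggy: val = %d\n", val);	/* still 0: a retry loop would spin forever */

	val = 0;
	fixed_try_cmpxchg(&v, &val, 1);
	printf("fixed: val = %d\n", val);	/* 5: the next retry can succeed */

	return 0;
}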
On Mon, Feb 06, 2017 at 12:24:28PM +0800, Boqun Feng wrote:
> On Fri, Feb 03, 2017 at 02:26:02PM +0100, Peter Zijlstra wrote:
> > Add a new cmpxchg interface:
> >
> >   bool try_cmpxchg(u{8,16,32,64} *ptr, u{8,16,32,64} *val, u{8,16,32,64} new);
> >
> > Where the boolean returns the result of the compare; and thus if the
> > exchange happened; and in case of failure, the new value of *ptr is
> > returned in *val.
> >
> > This allows simplification/improvement of loops like:
> >
> >         for (;;) {
> >                 new = val $op $imm;
> >                 old = cmpxchg(ptr, val, new);
> >                 if (old == val)
> >                         break;
> >                 val = old;
> >         }
> >
> > into:
> >
> >         for (;;) {
> >                 new = val $op $imm;
> >                 if (try_cmpxchg(ptr, &val, new))
> >                         break;
> >         }
> >
> > while also generating better code (GCC6 and onwards).
> >
>
> But switching to try_cmpxchg() will make @val a memory location, which
> could not be put in a register. And this will generate unnecessary
> memory accesses on archs having enough registers (PPC, e.g.).
>

Hmm.. it turns out that compilers can figure out that @val could fit in
a register (maybe in the escape analysis step), so they don't treat it
as a memory location. This is at least true for GCC 5.4.0 on PPC. So I
think we can rely on this?

> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > --- a/arch/x86/include/asm/atomic.h
> > +++ b/arch/x86/include/asm/atomic.h
> > [...]
> > --- a/arch/x86/include/asm/cmpxchg.h
> > +++ b/arch/x86/include/asm/cmpxchg.h
> > [...]
> > --- a/include/linux/atomic.h
> > +++ b/include/linux/atomic.h
> > @@ -423,6 +423,28 @@
> >  #endif
> >  #endif /* atomic_cmpxchg_relaxed */
> >  
> > +#ifndef atomic_try_cmpxchg
> > +
> > +#define __atomic_try_cmpxchg(type, _p, _po, _n)		\
> > +({								\
> > +	typeof(_po) __po = (_po);				\
> > +	typeof(*(_po)) __o = *__po;				\
> > +	bool success = (atomic_cmpxchg##type((_p), __o, (_n)) == __o);	\
> > +	*__po = __o;						\
>
> Besides, is this part correct? atomic_cmpxchg_*() wouldn't change the
> value of __o, so *__po wouldn't be changed.. IOW, in case of failure,
> *ptr wouldn't be updated to a new value.
>
> Maybe this should be:
>
> 	bool success;
> 	*__po = atomic_cmpxchg##type((_p), __o, (_n));
> 	sucess = (*__po == _o);

typo... should be

	success = (*__po == __o);

Regards,
Boqun

>
> , right?
>
> Regards,
> Boqun
>
> > +	success;						\
> > +})
> > [...]
On Mon, Feb 06, 2017 at 12:24:28PM +0800, Boqun Feng wrote:
> On Fri, Feb 03, 2017 at 02:26:02PM +0100, Peter Zijlstra wrote:
> >
> >         for (;;) {
> >                 new = val $op $imm;
> >                 if (try_cmpxchg(ptr, &val, new))
> >                         break;
> >         }
> >
> > while also generating better code (GCC6 and onwards).
> >
>
> But switching to try_cmpxchg() will make @val a memory location, which
> could not be put in a register. And this will generate unnecessary
> memory accesses on archs having enough registers (PPC, e.g.).

GCC was perfectly capable of making @val a register in the code I was
looking at.

> > +#ifndef atomic_try_cmpxchg
> > +
> > +#define __atomic_try_cmpxchg(type, _p, _po, _n)		\
> > +({								\
> > +	typeof(_po) __po = (_po);				\
> > +	typeof(*(_po)) __o = *__po;				\
> > +	bool success = (atomic_cmpxchg##type((_p), __o, (_n)) == __o);	\
> > +	*__po = __o;						\
>
> Besides, is this part correct? atomic_cmpxchg_*() wouldn't change the
> value of __o, so *__po wouldn't be changed.. IOW, in case of failure,
> *ptr wouldn't be updated to a new value.
>
> Maybe this should be:
>
> 	bool success;
> 	*__po = atomic_cmpxchg##type((_p), __o, (_n));
> 	sucess = (*__po == _o);
>
> , right?

Yes, botched that. Don't think I even compiled it to be honest :/
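For reference, one way the generic fallback could read with that fix folded in (a
sketch following Boqun's suggestion, not necessarily the exact version that was
eventually applied):

#define __atomic_try_cmpxchg(type, _p, _po, _n)			\
({								\
	typeof(_po) __po = (_po);				\
	typeof(*(_po)) __o = *__po;				\
	/* publish cmpxchg's return value, so that on failure	\
	 * the caller's expected value is refreshed with the	\
	 * current *_p */					\
	*__po = atomic_cmpxchg##type((_p), __o, (_n));		\
	(*__po == __o);						\
})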
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxch
 	return cmpxchg(&v->counter, old, new);
 }
 
+#define atomic_try_cmpxchg atomic_try_cmpxchg
+static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+	return try_cmpxchg(&v->counter, old, new);
+}
+
 static inline int atomic_xchg(atomic_t *v, int new)
 {
 	return xchg(&v->counter, new);
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -153,6 +153,75 @@ extern void __add_wrong_size(void)
 #define cmpxchg_local(ptr, old, new)				\
 	__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
 
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock)	\
+({								\
+	bool success;						\
+	__typeof__(_ptr) _old = (_pold);			\
+	__typeof__(*(_ptr)) __old = *_old;			\
+	__typeof__(*(_ptr)) __new = (_new);			\
+	switch (size) {						\
+	case __X86_CASE_B:					\
+	{							\
+		volatile u8 *__ptr = (volatile u8 *)(_ptr);	\
+		asm volatile(lock "cmpxchgb %[new], %[ptr]"	\
+			     CC_SET(z)				\
+			     : CC_OUT(z) (success),		\
+			       [ptr] "+m" (*__ptr),		\
+			       [old] "+a" (__old)		\
+			     : [new] "q" (__new)		\
+			     : "memory");			\
+		break;						\
+	}							\
+	case __X86_CASE_W:					\
+	{							\
+		volatile u16 *__ptr = (volatile u16 *)(_ptr);	\
+		asm volatile(lock "cmpxchgw %[new], %[ptr]"	\
+			     CC_SET(z)				\
+			     : CC_OUT(z) (success),		\
+			       [ptr] "+m" (*__ptr),		\
+			       [old] "+a" (__old)		\
+			     : [new] "r" (__new)		\
+			     : "memory");			\
+		break;						\
+	}							\
+	case __X86_CASE_L:					\
+	{							\
+		volatile u32 *__ptr = (volatile u32 *)(_ptr);	\
+		asm volatile(lock "cmpxchgl %[new], %[ptr]"	\
+			     CC_SET(z)				\
+			     : CC_OUT(z) (success),		\
+			       [ptr] "+m" (*__ptr),		\
+			       [old] "+a" (__old)		\
+			     : [new] "r" (__new)		\
+			     : "memory");			\
+		break;						\
+	}							\
+	case __X86_CASE_Q:					\
+	{							\
+		volatile u64 *__ptr = (volatile u64 *)(_ptr);	\
+		asm volatile(lock "cmpxchgq %[new], %[ptr]"	\
+			     CC_SET(z)				\
+			     : CC_OUT(z) (success),		\
+			       [ptr] "+m" (*__ptr),		\
+			       [old] "+a" (__old)		\
+			     : [new] "r" (__new)		\
+			     : "memory");			\
+		break;						\
+	}							\
+	default:						\
+		__cmpxchg_wrong_size();				\
+	}							\
+	*_old = __old;						\
+	success;						\
+})
+
+#define __try_cmpxchg(ptr, pold, new, size)			\
+	__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define try_cmpxchg(ptr, pold, new)				\
+	__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
 /*
  * xadd() adds "inc" to "*ptr" and atomically returns the previous
  * value of "*ptr".
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -423,6 +423,28 @@
 #endif
 #endif /* atomic_cmpxchg_relaxed */
 
+#ifndef atomic_try_cmpxchg
+
+#define __atomic_try_cmpxchg(type, _p, _po, _n)			\
+({								\
+	typeof(_po) __po = (_po);				\
+	typeof(*(_po)) __o = *__po;				\
+	bool success = (atomic_cmpxchg##type((_p), __o, (_n)) == __o);	\
+	*__po = __o;						\
+	success;						\
+})
+
+#define atomic_try_cmpxchg(_p, _po, _n)		__atomic_try_cmpxchg(, _p, _po, _n)
+#define atomic_try_cmpxchg_relaxed(_p, _po, _n)	__atomic_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic_try_cmpxchg_acquire(_p, _po, _n)	__atomic_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic_try_cmpxchg_release(_p, _po, _n)	__atomic_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic_try_cmpxchg */
+#define atomic_try_cmpxchg_relaxed	atomic_try_cmpxchg
+#define atomic_try_cmpxchg_acquire	atomic_try_cmpxchg
+#define atomic_try_cmpxchg_release	atomic_try_cmpxchg
+#endif /* atomic_try_cmpxchg */
+
 /* cmpxchg_relaxed */
 #ifndef cmpxchg_relaxed
 #define cmpxchg_relaxed cmpxchg
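The x86 asm above leans on GCC 6 flag-output operands: on compilers that support
them, CC_SET(z)/CC_OUT(z) expand so that cmpxchg's ZF result lands directly in a
bool instead of needing a separate sete or compare. A minimal user-space sketch of
the same trick, assuming x86-64 and a compiler with asm flag outputs (an
illustration only, not the kernel macro):

#include <stdbool.h>
#include <stdio.h>

static bool try_cmpxchg_u64(volatile unsigned long long *ptr,
			    unsigned long long *old,
			    unsigned long long new)
{
	bool success;

	/* "=@ccz" asks the compiler to materialize ZF as the bool. */
	asm volatile("lock cmpxchgq %[new], %[ptr]"
		     : "=@ccz" (success),
		       [ptr] "+m" (*ptr),
		       [old] "+a" (*old)
		     : [new] "r" (new)
		     : "memory");
	return success;
}

int main(void)
{
	unsigned long long v = 1, expected = 1;

	if (try_cmpxchg_u64(&v, &expected, 2))
		printf("swapped, v = %llu\n", v);	/* v == 2 */

	expected = 1;					/* stale guess */
	if (!try_cmpxchg_u64(&v, &expected, 3))
		printf("failed, expected now %llu\n", expected); /* refreshed to 2 */

	return 0;
}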
Add a new cmpxchg interface:

  bool try_cmpxchg(u{8,16,32,64} *ptr, u{8,16,32,64} *val, u{8,16,32,64} new);

Where the boolean returns the result of the compare; and thus if the
exchange happened; and in case of failure, the new value of *ptr is
returned in *val.

This allows simplification/improvement of loops like:

	for (;;) {
		new = val $op $imm;
		old = cmpxchg(ptr, val, new);
		if (old == val)
			break;
		val = old;
	}

into:

	for (;;) {
		new = val $op $imm;
		if (try_cmpxchg(ptr, &val, new))
			break;
	}

while also generating better code (GCC6 and onwards).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
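To make the changelog's transformation concrete, here is a before/after sketch of a
hypothetical inc-unless-zero operation. The atomic_t type and the atomic ops are
user-space stand-ins built on GCC __atomic builtins so the example compiles outside
the kernel; it is not a patch to any existing kernel function.

#include <stdbool.h>

typedef struct { int counter; } atomic_t;

static int atomic_read(const atomic_t *v)
{
	return __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
}

/* Returns the value observed in *v; on success *v was changed to new. */
static int atomic_cmpxchg(atomic_t *v, int old, int new)
{
	__atomic_compare_exchange_n(&v->counter, &old, new, false,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return old;	/* on failure, 'old' was refreshed with the current value */
}

/* Returns true on success; on failure *old is refreshed from *v. */
static bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
	return __atomic_compare_exchange_n(&v->counter, old, new, false,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

/* Old style: compare the returned value by hand and feed it back. */
static bool inc_not_zero_cmpxchg(atomic_t *v)
{
	int val = atomic_read(v);

	for (;;) {
		int old;

		if (!val)
			return false;
		old = atomic_cmpxchg(v, val, val + 1);
		if (old == val)
			return true;
		val = old;
	}
}

/* New style: try_cmpxchg() both reports success and refreshes 'val'. */
static bool inc_not_zero_try_cmpxchg(atomic_t *v)
{
	int val = atomic_read(v);

	do {
		if (!val)
			return false;
	} while (!atomic_try_cmpxchg(v, &val, val + 1));

	return true;
}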