
[RFC,12/13] x86: x86 implementation for HARDENED_ATOMIC

Message ID 1475476886-26232-13-git-send-email-elena.reshetova@intel.com (mailing list archive)
State New, archived

Commit Message

Reshetova, Elena Oct. 3, 2016, 6:41 a.m. UTC
This adds x86-specific code to support the
HARDENED_ATOMIC feature. When an overflow is detected
in an atomic_t or atomic_long_t, the counter is
decremented back by one (to keep it at INT_MAX or
LONG_MAX) and the issue is reported using BUG().
The side effect is that a counter cannot wrap in
either legitimate or non-legitimate cases.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 arch/x86/Kconfig                   |   1 +
 arch/x86/include/asm/atomic.h      | 274 +++++++++++++++++++++++++++++++++++--
 arch/x86/include/asm/atomic64_32.h | 157 ++++++++++++++++++++-
 arch/x86/include/asm/atomic64_64.h | 166 +++++++++++++++++++++-
 arch/x86/include/asm/bitops.h      |   8 +-
 arch/x86/include/asm/cmpxchg.h     |  39 ++++++
 arch/x86/include/asm/local.h       |  89 +++++++++++-
 arch/x86/include/asm/preempt.h     |   2 +-
 arch/x86/include/asm/rmwcc.h       |  82 +++++++++--
 arch/x86/include/asm/rwsem.h       |  50 +++++++
 arch/x86/kernel/traps.c            |   6 +
 arch/x86/lib/atomic64_386_32.S     | 135 ++++++++++++++++++
 arch/x86/lib/atomic64_cx8_32.S     |  78 ++++++++++-
 13 files changed, 1042 insertions(+), 45 deletions(-)

Comments

Jann Horn Oct. 3, 2016, 9:47 a.m. UTC | #1
On Mon, Oct 03, 2016 at 09:41:25AM +0300, Elena Reshetova wrote:
> This adds x86-specific code in order to support
> HARDENED_ATOMIC feature. When overflow is detected
> in atomic_t or atomic_long_t types, the counter is
> decremented back by one (to keep it at INT_MAX or
> LONG_MAX) and issue is reported using BUG().
> The side effect is that in both legitimate and
> non-legitimate cases a counter cannot wrap.
> 
> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> Signed-off-by: David Windsor <dwindsor@gmail.com>
> ---
[...]
>  static __always_inline void atomic_add(int i, atomic_t *v)
>  {
> -	asm volatile(LOCK_PREFIX "addl %1,%0"
> +	asm volatile(LOCK_PREFIX "addl %1,%0\n"
> +
> +#ifdef CONFIG_HARDENED_ATOMIC
> +		     "jno 0f\n"
> +		     LOCK_PREFIX "subl %1,%0\n"
> +		     "int $4\n0:\n"
> +		     _ASM_EXTABLE(0b, 0b)
> +#endif
> +
> +		     : "+m" (v->counter)
> +		     : "ir" (i));
> +}

It might make sense to point out in the Kconfig entry
that on X86, this can only be relied on if
kernel.panic_on_oops==1 because otherwise, you can
(depending on the bug, in a worst-case scenario) get
past 0x7fffffff within seconds using multiple racing
processes.
(See https://bugs.chromium.org/p/project-zero/issues/detail?id=856 .)


An additional idea for future development:

One way to work around that would be to interpret the
stored value 2^30 as zero, and interpret other values
accordingly. Like this:

#define SIGNED_ATOMIC_BASE 0x40000000U

static __always_inline int atomic_read(const atomic_t *v)
{
  return READ_ONCE((v)->counter) - SIGNED_ATOMIC_BASE;
}

static __always_inline void atomic_set(atomic_t *v, int i)
{
  WRITE_ONCE(v->counter, i + SIGNED_ATOMIC_BASE);
}

static __always_inline int atomic_add_return(int i, atomic_t *v)
{
  return i + xadd_check_overflow(&v->counter, i) - SIGNED_ATOMIC_BASE;
}

With this change, atomic_t could still be used as a signed integer
with half the range of an int, but its stored value would only
become negative on overflow. Then, the "jno" instruction in the
hardening code could be replaced with "jns" to reliably block
overflows.
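
For illustration, a rough sketch (untested; an assumption of how this
would look, not code from this series) of the hardened atomic_add()
with the bias applied and the check switched from "jno" to "jns":

static __always_inline void atomic_add(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "addl %1,%0\n"

#ifdef CONFIG_HARDENED_ATOMIC
		     /* stored value went negative => counter left its valid (biased) range */
		     "jns 0f\n"
		     LOCK_PREFIX "subl %1,%0\n"
		     "int $4\n0:\n"
		     _ASM_EXTABLE(0b, 0b)
#endif

		     : "+m" (v->counter)
		     : "ir" (i));
}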

The downsides of this approach would be:
 - One extra increment or decrement every time an atomic_t is read
   or written. This should be relatively cheap - it should be
   operating on a register -, but it's still not ideal. atomic_t
   users could perhaps opt out with something like
   atomic_unsigned_t.
 - Implicit atomic_t initialization to zero by zeroing memory
   would stop working. This would probably be the biggest issue
   with this approach.

I think that unfortunately, there are a large number of atomic_t
users that don't explicitly initialize the atomic_t and instead
rely on implicit initialization to zero, and changing that would
cause a lot of code churn - so while this would IMO improve the
mitigation, this series should IMO be merged without it and
instead have a small warning in the Kconfig entry or so.
Dave Hansen Oct. 3, 2016, 7:27 p.m. UTC | #2
On 10/02/2016 11:41 PM, Elena Reshetova wrote:
>  static __always_inline void atomic_add(int i, atomic_t *v)
>  {
> -	asm volatile(LOCK_PREFIX "addl %1,%0"
> +	asm volatile(LOCK_PREFIX "addl %1,%0\n"
> +
> +#ifdef CONFIG_HARDENED_ATOMIC
> +		     "jno 0f\n"
> +		     LOCK_PREFIX "subl %1,%0\n"
> +		     "int $4\n0:\n"
> +		     _ASM_EXTABLE(0b, 0b)
> +#endif
> +
> +		     : "+m" (v->counter)
> +		     : "ir" (i));
> +}

Rather than doing all this assembly and exception stuff, could we just do:

static __always_inline void atomic_add(int i, atomic_t *v)
{
	if (atomic_add_unless(v, a, INT_MAX))
		BUG_ON_OVERFLOW_FOO()...
}

That way, there's also no transient state where somebody can have
observed the overflow before it is fixed up.  Granted, this
cmpxchg-based operation _is_ more expensive than the fast-path locked addl.
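
For reference, a minimal sketch with the placeholders above filled in
(assumptions: BUG_ON_OVERFLOW_FOO() is taken to be a plain BUG(), and
atomic_add_unless() returns non-zero only when it actually performed
the add):

static __always_inline void atomic_add(int i, atomic_t *v)
{
	/* refuse to move the counter once it has saturated at INT_MAX */
	if (!atomic_add_unless(v, i, INT_MAX))
		BUG();
}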
Kees Cook Oct. 3, 2016, 9:29 p.m. UTC | #3
The commit subject doesn't need the second "x86", I think.

On Sun, Oct 2, 2016 at 11:41 PM, Elena Reshetova
<elena.reshetova@intel.com> wrote:
> This adds x86-specific code in order to support
> HARDENED_ATOMIC feature. When overflow is detected
> in atomic_t or atomic_long_t types, the counter is
> decremented back by one (to keep it at INT_MAX or
> LONG_MAX) and issue is reported using BUG().
> The side effect is that in both legitimate and
> non-legitimate cases a counter cannot wrap.
>
> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> Signed-off-by: David Windsor <dwindsor@gmail.com>
> ---
>  arch/x86/Kconfig                   |   1 +
>  arch/x86/include/asm/atomic.h      | 274 +++++++++++++++++++++++++++++++++++--
>  arch/x86/include/asm/atomic64_32.h | 157 ++++++++++++++++++++-
>  arch/x86/include/asm/atomic64_64.h | 166 +++++++++++++++++++++-
>  arch/x86/include/asm/bitops.h      |   8 +-
>  arch/x86/include/asm/cmpxchg.h     |  39 ++++++
>  arch/x86/include/asm/local.h       |  89 +++++++++++-
>  arch/x86/include/asm/preempt.h     |   2 +-
>  arch/x86/include/asm/rmwcc.h       |  82 +++++++++--
>  arch/x86/include/asm/rwsem.h       |  50 +++++++
>  arch/x86/kernel/traps.c            |   6 +
>  arch/x86/lib/atomic64_386_32.S     | 135 ++++++++++++++++++
>  arch/x86/lib/atomic64_cx8_32.S     |  78 ++++++++++-
>  13 files changed, 1042 insertions(+), 45 deletions(-)
> [...]
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index bd4e3d4..ad814ee 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -191,6 +191,12 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
>                         tsk->thread.trap_nr = trapnr;
>                         die(str, regs, error_code);
>                 }
> +
> +#ifdef CONFIG_HARDENED_ATOMIC
> +               if (trapnr == X86_TRAP_OF)
> +                       hardened_atomic_refcount_overflow(regs);
> +#endif

With hardened_atomic_refcount_overflow() defined as a no-op without
HARDENED_ATOMIC, this #ifdef can go away.

> +
>                 return 0;
>         }
>
> [...]
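
For context, a hypothetical sketch of that stub arrangement (the
header placement and signature are assumptions; only the function
name comes from the patch):

#ifdef CONFIG_HARDENED_ATOMIC
void hardened_atomic_refcount_overflow(struct pt_regs *regs);
#else
static inline void hardened_atomic_refcount_overflow(struct pt_regs *regs)
{
}
#endif

/* ...so the hunk in do_trap_no_signal() can simply do: */
	if (trapnr == X86_TRAP_OF)
		hardened_atomic_refcount_overflow(regs);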

-Kees
David Windsor Oct. 3, 2016, 10:49 p.m. UTC | #4
On Mon, Oct 3, 2016 at 3:27 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> On 10/02/2016 11:41 PM, Elena Reshetova wrote:
>>  static __always_inline void atomic_add(int i, atomic_t *v)
>>  {
>> -     asm volatile(LOCK_PREFIX "addl %1,%0"
>> +     asm volatile(LOCK_PREFIX "addl %1,%0\n"
>> +
>> +#ifdef CONFIG_HARDENED_ATOMIC
>> +                  "jno 0f\n"
>> +                  LOCK_PREFIX "subl %1,%0\n"
>> +                  "int $4\n0:\n"
>> +                  _ASM_EXTABLE(0b, 0b)
>> +#endif
>> +
>> +                  : "+m" (v->counter)
>> +                  : "ir" (i));
>> +}
>
> Rather than doing all this assembly and exception stuff, could we just do:
>
> static __always_inline void atomic_add(int i, atomic_t *v)
> {
>         if (atomic_add_unless(v, a, INT_MAX))
>                 BUG_ON_OVERFLOW_FOO()...
> }
>
> That way, there's also no transient state where somebody can have
> observed the overflow before it is fixed up.  Granted, this
> cmpxchg-based operation _is_ more expensive than the fast-path locked addl.

I'm not opposed to this, as this would allow us to eliminate the race
on x86 and this doesn't really change how things work on a fundamental
level here.  The overflow detection mechanism essentially remains the
same: __atomic_add_unless still would use exception dispatching as the
means of signaling that an overflow has occurred:

static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
    int c, old, new;
    c = atomic_read(v);
    for (;;) {
        if (unlikely(c == (u)))
            break;
        asm volatile("addl %2,%0\n"

 #ifdef CONFIG_HARDENED_ATOMIC
            "jno 0f\n"
            "subl %2,%0\n"
            "int $4\n0:\n"
            _ASM_EXTABLE(0b, 0b)
#endif
            : "=r" (new)
            : "0" (c), "ir" (a));
        old = atomic_cmpxchg((v), c, new);
        if (likely(old == c))
            break;
        c = old;
    }
    return c;
}

I'm unsure about the performance implications of cmpxchg, though, so I
agree with Kees: we should subject this feature (and this series as a
whole) to benchmarking.
Reshetova, Elena Oct. 4, 2016, 7:15 a.m. UTC | #5
On Mon, Oct 03, 2016 at 09:41:25AM +0300, Elena Reshetova wrote:
> This adds x86-specific code in order to support HARDENED_ATOMIC 
> feature. When overflow is detected in atomic_t or atomic_long_t types, 
> the counter is decremented back by one (to keep it at INT_MAX or
> LONG_MAX) and issue is reported using BUG().
> The side effect is that in both legitimate and non-legitimate cases a 
> counter cannot wrap.
> 
> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> Signed-off-by: David Windsor <dwindsor@gmail.com>
> ---
[...]
>  static __always_inline void atomic_add(int i, atomic_t *v)  {
> -	asm volatile(LOCK_PREFIX "addl %1,%0"
> +	asm volatile(LOCK_PREFIX "addl %1,%0\n"
> +
> +#ifdef CONFIG_HARDENED_ATOMIC
> +		     "jno 0f\n"
> +		     LOCK_PREFIX "subl %1,%0\n"
> +		     "int $4\n0:\n"
> +		     _ASM_EXTABLE(0b, 0b)
> +#endif
> +
> +		     : "+m" (v->counter)
> +		     : "ir" (i));
> +}

>It might make sense to point out in the Kconfig entry that on X86, this can only be relied on if
>kernel.panic_on_oops==1 because otherwise, you can (depending on the bug, in a worst-case scenario) get past 0x7fffffff within seconds using multiple racing processes.
>(See https://bugs.chromium.org/p/project-zero/issues/detail?id=856 .)

I will reference this discussion if we stick with the current approach. Maybe after performance measurements we can go with the atomic_add_unless version instead and eliminate the issue. 

>An additional idea for future development:

>One way to work around that would be to interpret the stored value 2^30 as zero, and interpret other values accordingly. Like this:

>#define SIGNED_ATOMIC_BASE 0x40000000U

>static __always_inline int atomic_read(const atomic_t *v) {
>  return READ_ONCE((v)->counter) - SIGNED_ATOMIC_BASE; }

>static __always_inline void atomic_set(atomic_t *v, int i) {
>  WRITE_ONCE(v->counter, i + SIGNED_ATOMIC_BASE); }

>static __always_inline int atomic_add_return(int i, atomic_t *v) {
>  return i + xadd_check_overflow(&v->counter, i) - SIGNED_ATOMIC_BASE; }

>With this change, atomic_t could still be used as a signed integer with half the range of an int, but its stored value would only become negative on overflow. Then, the "jno" instruction in the hardening code could be replaced with "jns" to reliably block overflows.

>The downsides of this approach would be:
> - One extra increment or decrement every time an atomic_t is read
>   or written. This should be relatively cheap - it should be
>   operating on a register -, but it's still not ideal. atomic_t
>   users could perhaps opt out with something like
>   atomic_unsigned_t.
> - Implicit atomic_t initialization to zero by zeroing memory
>   would stop working. This would probably be the biggest issue
>   with this approach.

I am not sure the BIAS is a good idea at all. Makes things much more complicated, potentially impacts performance...
Jann Horn Oct. 4, 2016, 12:41 p.m. UTC | #6
On Mon, Oct 03, 2016 at 12:27:01PM -0700, Dave Hansen wrote:
> On 10/02/2016 11:41 PM, Elena Reshetova wrote:
> >  static __always_inline void atomic_add(int i, atomic_t *v)
> >  {
> > -	asm volatile(LOCK_PREFIX "addl %1,%0"
> > +	asm volatile(LOCK_PREFIX "addl %1,%0\n"
> > +
> > +#ifdef CONFIG_HARDENED_ATOMIC
> > +		     "jno 0f\n"
> > +		     LOCK_PREFIX "subl %1,%0\n"
> > +		     "int $4\n0:\n"
> > +		     _ASM_EXTABLE(0b, 0b)
> > +#endif
> > +
> > +		     : "+m" (v->counter)
> > +		     : "ir" (i));
> > +}
> 
> Rather than doing all this assembly and exception stuff, could we just do:
> 
> static __always_inline void atomic_add(int i, atomic_t *v)
> {
> 	if (atomic_add_unless(v, a, INT_MAX))
> 		BUG_ON_OVERFLOW_FOO()...
> }
> 
> That way, there's also no transient state where somebody can have
> observed the overflow before it is fixed up.  Granted, this
> cmpxchg-based operation _is_ more expensive than the fast-path locked addl.

I think we need some numbers, so I copypasted a bunch of kernel code together
so that I can benchmark this stuff in userspace without having a full kernel
implementation of refcounting protection. My code is at the bottom of
the mail - please test this on other CPUs, these are just numbers from my
machine.

The following numbers are from tests on a
"Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz" CPU.

First, I'm testing the single-threaded (uncontended) case:

$ gcc -o atomic_user_test atomic_user_test.c -std=gnu99 -Wall -pthread -ggdb
$ time ./atomic_user_test 1 1 1000000000 # single-threaded, no protection
real	0m9.281s
user	0m9.251s
sys	0m0.004s
$ time ./atomic_user_test 1 2 1000000000 # single-threaded, racy protection
real	0m9.385s
user	0m9.365s
sys	0m0.003s
$ time ./atomic_user_test 1 3 1000000000 # single-threaded, cmpxchg protection
real	0m12.399s
user	0m12.375s
sys	0m0.000s

The cmpxchg protection is something like 30% slower than the racy one. The
cmpxchg protection needs something like 12.4ns per operation, or around 45
cycles per operation. (Well, probably actually a bit less, considering that
the loop also costs some time.) My guess is that this wouldn't be noticeable.

Now, I'm testing the multi-threaded (contended) case, with two threads that
only try to increment the same counter over and over again - so this is a
pretty extreme worst-case microbenchmark:

$ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
real	0m9.550s
user	0m18.988s
sys	0m0.000s
$ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
real	0m9.249s
user	0m18.430s
sys	0m0.004s
$ time ./atomic_user_test 2 3 1000000000 # multi-threaded, cmpxchg protection
real	1m47.331s
user	3m34.390s
sys	0m0.024s

Here, the cmpxchg-protected counter is around 11 times as slow as the
unprotected counter, with around 107ns per average increment. That's around
380 cycles per increment.

I guess we probably don't care all that much about the few extra cycles in
the uncontended case.
So I think the big question now is how important the performance of the
high-contention case is.


My test code, cobbled together from the kernel sources and the
suggested mitigations:

===============================================
#define _GNU_SOURCE
#include <pthread.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <err.h>
#include <unistd.h>
#include <sys/wait.h>
#include <limits.h>

#define LOCK_PREFIX "lock; "

static __always_inline void atomic_inc_raw(int *v)
{
  asm volatile(LOCK_PREFIX "incl %0"
         : "+m" (v));
}

static __always_inline void atomic_inc_racy(int *v)
{
  asm volatile(LOCK_PREFIX "incl %0\n"
    "jno 0f\n"
    LOCK_PREFIX "decl %0\n"
    "call abort\n0:\n"
    : "+m" (v));
}

#define READ_ONCE_32(v) (*(volatile int *)(v))

static __always_inline int cmpxchg(int *v, int old, int new)
{
        volatile unsigned int *ptr = (volatile unsigned int *)(v);
        int ret;
        asm volatile(LOCK_PREFIX "cmpxchgl %2,%1"
                     : "=a" (ret), "+m" (*ptr)
                     : "r" (new), "0" (old)
                     : "memory");
        return ret;
}

static __always_inline int __atomic_add_unless_(int *v, int a, int u)
{
  int c, old;
  c = READ_ONCE_32(v);
  for (;;) {
    if (__builtin_expect(c == (u), 0))
      break;
    old = cmpxchg((v), c, c + (a));
    if (__builtin_expect(old == c, 1))
      break;
    c = old;
  }
  return c;
}

static __always_inline void atomic_inc_cmpxchg(int *v)
{
  if (__atomic_add_unless_(v, 1, INT_MAX) == INT_MAX)
    abort();
}

int mode, type, count;
#define TYPE_RAW 1
#define TYPE_RACY 2
#define TYPE_CMPXCHG 3

int test_atomic;
void *childfn(void *arg) {
  switch (type) {
  case TYPE_RAW:
    for (int i=0; i<count; i++)
      atomic_inc_raw(&test_atomic);
    break;
  case TYPE_RACY:
    for (int i=0; i<count; i++)
      atomic_inc_racy(&test_atomic);
    break;
  case TYPE_CMPXCHG:
    for (int i=0; i<count; i++)
      atomic_inc_cmpxchg(&test_atomic);
    break;
  }
  return NULL;
}

int main(int argc, char **argv) {
  pthread_t thread;

  if (argc != 4)
    errx(1, "bad invocation; want mode, type and count");
  mode = atoi(argv[1]); // 1 == single thread, 2 == two threads
  if (mode != 1 && mode != 2)
    errx(1, "bad mode");
  type = atoi(argv[2]);
  if (type < 1 || type > 3)
    errx(1, "bad type");
  count = atoi(argv[3]);
  if (count <= 0)
    errx(1, "bad count");

  if (mode == 2) {
    if (pthread_create(&thread, NULL, childfn, NULL))
      err(1, "pthread_create");
  }
  childfn(NULL);

  if (mode == 2) {
    if (pthread_join(thread, NULL))
      err(1, "join");
  }

  return 0;
}
===============================================
Jann Horn Oct. 4, 2016, 12:46 p.m. UTC | #7
On Tue, Oct 04, 2016 at 07:15:56AM +0000, Reshetova, Elena wrote:
> >An additional idea for future development:
> 
> >One way to work around that would be to interpret the stored value 2^30 as zero, and interpret other values accordingly. Like this:
> 
> >#define SIGNED_ATOMIC_BASE 0x40000000U
> 
> >static __always_inline int atomic_read(const atomic_t *v) {
> >  return READ_ONCE((v)->counter) - SIGNED_ATOMIC_BASE; }
> 
> >static __always_inline void atomic_set(atomic_t *v, int i) {
> >  WRITE_ONCE(v->counter, i + SIGNED_ATOMIC_BASE); }
> 
> >static __always_inline int atomic_add_return(int i, atomic_t *v) {
> >  return i + xadd_check_overflow(&v->counter, i) - SIGNED_ATOMIC_BASE; }
> 
> >With this change, atomic_t could still be used as a signed integer with half the range of an int, but its stored value would only become negative on overflow. Then, the "jno" instruction in the hardening code could be replaced with "jns" to reliably block overflows.
> 
> >The downsides of this approach would be:
> > - One extra increment or decrement every time an atomic_t is read
> >   or written. This should be relatively cheap - it should be
> >   operating on a register -, but it's still not ideal. atomic_t
> >   users could perhaps opt out with something like
> >   atomic_unsigned_t.
> > - Implicit atomic_t initialization to zero by zeroing memory
> >   would stop working. This would probably be the biggest issue
> >   with this approach.
> 
> I am not sure the BIAS is a good idea at all. Makes things much more complicated, potentially impacts performance...

Yeah, it does make things more complicated. And I just noticed that with the
BIAS, atomic_sub_and_test() would likely have to be implemented with cmpxchg,
so it probably doesn't help much performance-wise.

In summary: Nevermind, it was a stupid idea.
Kees Cook Oct. 4, 2016, 6:51 p.m. UTC | #8
On Tue, Oct 4, 2016 at 5:41 AM, Jann Horn <jann@thejh.net> wrote:
> On Mon, Oct 03, 2016 at 12:27:01PM -0700, Dave Hansen wrote:
>> On 10/02/2016 11:41 PM, Elena Reshetova wrote:
>> >  static __always_inline void atomic_add(int i, atomic_t *v)
>> >  {
>> > -   asm volatile(LOCK_PREFIX "addl %1,%0"
>> > +   asm volatile(LOCK_PREFIX "addl %1,%0\n"
>> > +
>> > +#ifdef CONFIG_HARDENED_ATOMIC
>> > +                "jno 0f\n"
>> > +                LOCK_PREFIX "subl %1,%0\n"
>> > +                "int $4\n0:\n"
>> > +                _ASM_EXTABLE(0b, 0b)
>> > +#endif
>> > +
>> > +                : "+m" (v->counter)
>> > +                : "ir" (i));
>> > +}
>>
>> Rather than doing all this assembly and exception stuff, could we just do:
>>
>> static __always_inline void atomic_add(int i, atomic_t *v)
>> {
>>       if (atomic_add_unless(v, a, INT_MAX))
>>               BUG_ON_OVERFLOW_FOO()...
>> }
>>
>> That way, there's also no transient state where somebody can have
>> observed the overflow before it is fixed up.  Granted, this
>> cmpxchg-based operation _is_ more expensive than the fast-path locked addl.
>
> I think we need some numbers, so I copypasted a bunch of kernel code together
> so that I can benchmark this stuff in userspace without having a full kernel
> implementation of refcounting protection. My code is at the bottom of
> the mail - please test this on other CPUs, these are just numbers from my
> machine.
>
> The following numbers are from tests on a
> "Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz" CPU.
>
> First, I'm testing the single-threaded (uncontended) case:
>
> $ gcc -o atomic_user_test atomic_user_test.c -std=gnu99 -Wall -pthread -ggdb
> $ time ./atomic_user_test 1 1 1000000000 # single-threaded, no protection
> real    0m9.281s
> user    0m9.251s
> sys     0m0.004s
> $ time ./atomic_user_test 1 2 1000000000 # single-threaded, racy protection
> real    0m9.385s
> user    0m9.365s
> sys     0m0.003s
> $ time ./atomic_user_test 1 3 1000000000 # single-threaded, cmpxchg protection
> real    0m12.399s
> user    0m12.375s
> sys     0m0.000s
>
> The cmpxchg protection is something like 30% slower than the racy one. The
> cmpxchg protection needs something like 12.4ns per operation, or around 45
> cycles per operation. (Well, probably actually a bit less, considering that
> the loop also costs some time.) My guess is that this wouldn't be noticeable.
>
> Now, I'm testing the multi-threaded (contended) case, with two threads that
> only try to increment the same counter over and over again - so this is a
> pretty extreme worst-case microbenchmark:
>
> $ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
> real    0m9.550s
> user    0m18.988s
> sys     0m0.000s
> $ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
> real    0m9.249s
> user    0m18.430s
> sys     0m0.004s
> $ time ./atomic_user_test 2 3 1000000000 # multi-threaded, cmpxchg protection
> real    1m47.331s
> user    3m34.390s
> sys     0m0.024s
>
> Here, the cmpxchg-protected counter is around 11 times as slow as the
> unprotected counter, with around 107ns per average increment. That's around
> 380 cycles per increment.
>
> I guess we probably don't care all that much about the few extra cycles in
> the uncontended case.
> So I think the big question now is how important the performance of the
> high-contention case is.

What I find quite exciting about this benchmark is that they're the
absolute worst-case: the process is doing nothing but atomic
operations (which won't be the case for general kernel workloads) and
the fact that the racy protection is basically lost in the noise is
great.

Now, cmpxchg looks bad here, but is, again, in the worst-case
environment. I'll be curious to see kernel workloads with it, though.

As for the "racy" part, I'm not too concerned about it. (I would like
it not to race, but given a choice, I would rather this protection was
enabled by default.) As-is, with two threads fighting for a race, it
can be hard to win the race, and even if there's a success, one will
still Oops, so it's still better than what we have today: no
notification of failure and an exploitable condition. (And, frankly,
the act of racing and _failing_ is both threads Oopsing, so performing
a race would be extremely noisy on systems where they're not already
set to panic on the first Oops...)

Just to make sure I'm not imagining the wrong thing, the race looks like this:

CPU0  CPU1
inc->0
            inc->1
dec->0
Oops
            carry on

-Kees
Jann Horn Oct. 4, 2016, 7:48 p.m. UTC | #9
On Tue, Oct 04, 2016 at 11:51:09AM -0700, Kees Cook wrote:
> On Tue, Oct 4, 2016 at 5:41 AM, Jann Horn <jann@thejh.net> wrote:
> > On Mon, Oct 03, 2016 at 12:27:01PM -0700, Dave Hansen wrote:
> >> On 10/02/2016 11:41 PM, Elena Reshetova wrote:
> >> >  static __always_inline void atomic_add(int i, atomic_t *v)
> >> >  {
> >> > -   asm volatile(LOCK_PREFIX "addl %1,%0"
> >> > +   asm volatile(LOCK_PREFIX "addl %1,%0\n"
> >> > +
> >> > +#ifdef CONFIG_HARDENED_ATOMIC
> >> > +                "jno 0f\n"
> >> > +                LOCK_PREFIX "subl %1,%0\n"
> >> > +                "int $4\n0:\n"
> >> > +                _ASM_EXTABLE(0b, 0b)
> >> > +#endif
> >> > +
> >> > +                : "+m" (v->counter)
> >> > +                : "ir" (i));
> >> > +}
> >>
> >> Rather than doing all this assembly and exception stuff, could we just do:
> >>
> >> static __always_inline void atomic_add(int i, atomic_t *v)
> >> {
> >>       if (atomic_add_unless(v, a, INT_MAX))
> >>               BUG_ON_OVERFLOW_FOO()...
> >> }
> >>
> >> That way, there's also no transient state where somebody can have
> >> observed the overflow before it is fixed up.  Granted, this
> >> cmpxchg-based operation _is_ more expensive than the fast-path locked addl.
> >
> > I think we need some numbers, so I copypasted a bunch of kernel code together
> > so that I can benchmark this stuff in userspace without having a full kernel
> > implementation of refcounting protection. My code is at the bottom of
> > the mail - please test this on other CPUs, these are just numbers from my
> > machine.
> >
> > The following numbers are from tests on a
> > "Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz" CPU.
> >
> > First, I'm testing the single-threaded (uncontended) case:
> >
> > $ gcc -o atomic_user_test atomic_user_test.c -std=gnu99 -Wall -pthread -ggdb
> > $ time ./atomic_user_test 1 1 1000000000 # single-threaded, no protection
> > real    0m9.281s
> > user    0m9.251s
> > sys     0m0.004s
> > $ time ./atomic_user_test 1 2 1000000000 # single-threaded, racy protection
> > real    0m9.385s
> > user    0m9.365s
> > sys     0m0.003s
> > $ time ./atomic_user_test 1 3 1000000000 # single-threaded, cmpxchg protection
> > real    0m12.399s
> > user    0m12.375s
> > sys     0m0.000s
> >
> > The cmpxchg protection is something like 30% slower than the racy one. The
> > cmpxchg protection needs something like 12.4ns per operation, or around 45
> > cycles per operation. (Well, probably actually a bit less, considering that
> > the loop also costs some time.) My guess is that this wouldn't be noticeable.
> >
> > Now, I'm testing the multi-threaded (contended) case, with two threads that
> > only try to increment the same counter over and over again - so this is a
> > pretty extreme worst-case microbenchmark:
> >
> > $ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
> > real    0m9.550s
> > user    0m18.988s
> > sys     0m0.000s
> > $ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
> > real    0m9.249s
> > user    0m18.430s
> > sys     0m0.004s
> > $ time ./atomic_user_test 2 3 1000000000 # multi-threaded, cmpxchg protection
> > real    1m47.331s
> > user    3m34.390s
> > sys     0m0.024s
> >
> > Here, the cmpxchg-protected counter is around 11 times as slow as the
> > unprotected counter, with around 107ns per average increment. That's around
> > 380 cycles per increment.
> >
> > I guess we probably don't care all that much about the few extra cycles in
> > the uncontended case.
> > So I think the big question now is how important the performance of the
> > high-contention case is.
> 
> What I find quite exciting about this benchmark is that they're the
> absolute worst-case: the process is doing nothing but atomic
> operations (which won't be the case for general kernel workloads) and
> the fact that the racy protection is basically lost in the noise is
> great.

Yeah, I agree.


> Now, cmpxchg looks bad here, but is, again, in the worst-case
> environment. I'll be curious to see kernel workloads with it, though.

Me too.

Btw: I just noticed that __fget() uses atomic_long_inc_not_zero(), which
is already implemented with cmpxchg on x86. So every time a multithreaded
process has multiple threads that interact with the same fd a lot, this
is already going to create racing cmpxchg loops today, and nobody seems
to have complained sufficiently loud so far. (And "perf top" says that
indeed, doing pread() in a loop in two threads spends way less time in
fget() if the threads use different fds. I'm not going to give you exact
numbers from my system because I have all kinds of debug crap turned on,
but I'll put my test code at the bottom of this mail if someone wants to
play with it.)

> As for the "racy" part, I'm not too concerned about it. (I would like
> it not to race, but given a choice, I would rather this protection was
> enabled by default.) As-is, with two threads fighting for a race, it
> can be hard to win the race, and even if there's a success, one will
> still Oops, so it's still better than what we have today: no
> notification of failure and an exploitable condition. (And, frankly,
> the act of racing and _failing_ is both threads Oopsing, so performing
> a race would be extremely noisy on systems where they're not already
> set to panic on the first Oops...)
> 
> Just to make sure I'm not imagining the wrong thing, the race looks like this:
> 
> CPU0  CPU1
> inc->0
>             inc->1
> dec->0
> Oops
>             carry on

Yup, exactly.

In my test with an artificial worst-realistic-case bug that I did in
https://bugs.chromium.org/p/project-zero/issues/detail?id=856, it was
possible to get around the racy protection within seconds. Of course,
the normal cost of overflowing a reference counter comes on top of
that, and if you look at the logs while the attack is running, it is
indeed going to be very obvious - but I think that realistically, on
most systems, nobody is actually watching dmesg and looking for
oopses. Either you have panic_on_oops=1 or the attack is successful
and the attacker wipes the logs.


====================
#define _GNU_SOURCE
#include <pthread.h>
#include <err.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int evfd = -1;

void *childfn(void *arg) {
  // uncomment the following three lines to let the threads use separate FDs
#if 0
  int evfd = eventfd(0, 0);
  if (evfd == -1)
    err(1, "eventfd");
#endif
  char c = 'X';
  while (1) {
    pread(evfd, &c, 1, 0); // will fail every time
  }
  return NULL;
}

int main(void) {
  evfd = eventfd(0, 0);
  if (evfd == -1)
    err(1, "eventfd");
  pthread_t thread;
  if (pthread_create(&thread, NULL, childfn, NULL))
    err(1, "pthread_create");
  childfn(NULL);
}
====================
Dave Hansen Oct. 5, 2016, 3:39 p.m. UTC | #10
On 10/04/2016 05:41 AM, Jann Horn wrote:
> $ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
> real	0m9.550s
> user	0m18.988s
> sys	0m0.000s
> $ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
> real	0m9.249s
> user	0m18.430s
> sys	0m0.004s
> $ time ./atomic_user_test 2 3 1000000000 # multi-threaded, cmpxchg protection
> real	1m47.331s
> user	3m34.390s
> sys	0m0.024s

Yikes, that does get a ton worse.  But, I guess it's good to know we
have a few choices between performant and absolutely "correct".

Do you have any explanation for "racy protection" going faster than no
protection?
Jann Horn Oct. 5, 2016, 4:18 p.m. UTC | #11
On Wed, Oct 05, 2016 at 08:39:31AM -0700, Dave Hansen wrote:
> On 10/04/2016 05:41 AM, Jann Horn wrote:
> > $ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
> > real	0m9.550s
> > user	0m18.988s
> > sys	0m0.000s
> > $ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
> > real	0m9.249s
> > user	0m18.430s
> > sys	0m0.004s
> > $ time ./atomic_user_test 2 3 1000000000 # multi-threaded, cmpxchg protection
> > real	1m47.331s
> > user	3m34.390s
> > sys	0m0.024s
> 
> Yikes, that does get a ton worse.

Yeah, but as Kees said, that's an absolute worst case, and while there
might be some performance impact with some very syscall-heavy real-world
usage, I think it's very unlikely to be that bad in practice.

It probably doesn't matter much here, but out of curiosity: Do you know
what makes this so slow? I'm not familiar with details of how processors
work  - and you're at Intel, so maybe you know more about this or can ask
someone who knows? The causes I can imagine are:

1. Pipeline flushes because of branch prediction failures caused by
   more-or-less random cmpxchg retries? Pipeline flushes are pretty
   expensive, right?
2. Repeated back-and-forth bouncing of the cacheline because an increment
   via cmpxchg needs at least two accesses instead of one, and the
   cacheline could be "stolen" by the other thread between the READ_ONCE
   and the cmpxchg.
3. Simply the cost of retrying if the value has changed in the meantime.
4. Maybe if two CPUs try increments at the same time, with exactly the
   same timing, they get stuck in a tiny livelock where every cmpxchg
   fails because the value was just updated by the other core? And then
   something slightly disturbs the timing (interrupt / clock speed
   change / ...), allowing one task to win the race?

> But, I guess it's good to know we
> have a few choices between performant and absolutely "correct".

Hrm. My opinion is that the racy protection is unlikely to help much with
panic_on_oops=0. So IMO, on x86, it's more like a choice between:

 - performant, but pretty useless
 - performant, but technically unreliable and with panic on overflow
 - doing it properly, with a performance hit

> Do you have any explanation for "racy protection" going faster than no
> protection?

My guess is "I'm not measuring well enough and random stuff is going on". 
Re-running these on the same box, I get the following numbers:

$ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
real	0m9.549s
user	0m19.023s
sys	0m0.000s
$ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
real	0m9.586s
user	0m19.154s
sys	0m0.001s

(This might be because I'm using the ondemand governor, because my CPU has
the 4GHz boost thing, because stuff in the background is randomly
interfering... no idea.)
Dave Hansen Oct. 5, 2016, 4:32 p.m. UTC | #12
On 10/05/2016 09:18 AM, Jann Horn wrote:
> 1. Pipeline flushes because of branch prediction failures caused by
>    more-or-less random cmpxchg retries? Pipeline flushes are pretty
>    expensive, right?
> 2. Repeated back-and-forth bouncing of the cacheline because an increment
>    via cmpxchg needs at least two accesses instead of one, and the
>    cacheline could be "stolen" by the other thread between the READ_ONCE
>    and the cmpxchg.
> 3. Simply the cost of retrying if the value has changed in the meantime.
> 4. Maybe if two CPUs try increments at the same time, with exactly the
>    same timing, they get stuck in a tiny livelock where every cmpxchg
>    fails because the value was just updated by the other core? And then
>    something slightly disturbs the timing (interrupt / clock speed
>    change / ...), allowing one task to win the race?

I can speculate about it, but I don't know for sure.  The topdown tool
from pmu-tools is usually a great way to figure out what's causing these
kinds of bottlenecks in the CPU:

	https://github.com/andikleen/pmu-tools

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 402eee4..6c36184 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -79,6 +79,7 @@  config X86
 	select HAVE_AOUT			if X86_32
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_HARDENED_USERCOPY
+	select HAVE_ARCH_HARDENED_ATOMIC
 	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 14635c5..8752ed0 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -27,6 +27,17 @@  static __always_inline int atomic_read(const atomic_t *v)
 }
 
 /**
+ * atomic_read_wrap - read atomic variable
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically reads the value of @v.
+ */
+static __always_inline int atomic_read_wrap(const atomic_wrap_t *v)
+{
+	return ACCESS_ONCE((v)->counter);
+}
+
+/**
  * atomic_set - set atomic variable
  * @v: pointer of type atomic_t
  * @i: required value
@@ -39,6 +50,18 @@  static __always_inline void atomic_set(atomic_t *v, int i)
 }
 
 /**
+ * atomic_set_wrap - set atomic variable
+ * @v: pointer of type atomic_wrap_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static __always_inline void atomic_set_wrap(atomic_wrap_t *v, int i)
+{
+	v->counter = i;
+}
+
+/**
  * atomic_add - add integer to atomic variable
  * @i: integer value to add
  * @v: pointer of type atomic_t
@@ -47,7 +70,29 @@  static __always_inline void atomic_set(atomic_t *v, int i)
  */
 static __always_inline void atomic_add(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "addl %1,%0"
+	asm volatile(LOCK_PREFIX "addl %1,%0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "subl %1,%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
+		     : "+m" (v->counter)
+		     : "ir" (i));
+}
+
+/**
+ * atomic_add_wrap - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically adds @i to @v.
+ */
+static __always_inline void atomic_add_wrap(int i, atomic_wrap_t *v)
+{
+	asm volatile(LOCK_PREFIX "addl %1,%0\n"
 		     : "+m" (v->counter)
 		     : "ir" (i));
 }
@@ -61,7 +106,29 @@  static __always_inline void atomic_add(int i, atomic_t *v)
  */
 static __always_inline void atomic_sub(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "subl %1,%0"
+	asm volatile(LOCK_PREFIX "subl %1,%0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "addl %1,%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
+		     : "+m" (v->counter)
+		     : "ir" (i));
+}
+
+/**
+ * atomic_sub_wrap - subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static __always_inline void atomic_sub_wrap(int i, atomic_wrap_t *v)
+{
+	asm volatile(LOCK_PREFIX "subl %1,%0\n"
 		     : "+m" (v->counter)
 		     : "ir" (i));
 }
@@ -77,7 +144,21 @@  static __always_inline void atomic_sub(int i, atomic_t *v)
  */
 static __always_inline bool atomic_sub_and_test(int i, atomic_t *v)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
+	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", LOCK_PREFIX "addl", v->counter, "er", i, "%0", e);
+}
+
+/**
+ * atomic_sub_and_test_wrap - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static __always_inline bool atomic_sub_and_test_wrap(int i, atomic_wrap_t *v)
+{
+	GEN_BINARY_RMWcc_wrap(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
 }
 
 /**
@@ -88,7 +169,27 @@  static __always_inline bool atomic_sub_and_test(int i, atomic_t *v)
  */
 static __always_inline void atomic_inc(atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "incl %0"
+	asm volatile(LOCK_PREFIX "incl %0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "decl %0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
+		     : "+m" (v->counter));
+}
+
+/**
+ * atomic_inc_wrap - increment atomic variable
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically increments @v by 1.
+ */
+static __always_inline void atomic_inc_wrap(atomic_wrap_t *v)
+{
+	asm volatile(LOCK_PREFIX "incl %0\n"
 		     : "+m" (v->counter));
 }
 
@@ -100,7 +201,27 @@  static __always_inline void atomic_inc(atomic_t *v)
  */
 static __always_inline void atomic_dec(atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "decl %0"
+	asm volatile(LOCK_PREFIX "decl %0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "incl %0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
+		     : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec_wrap - decrement atomic variable
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static __always_inline void atomic_dec_wrap(atomic_wrap_t *v)
+{
+	asm volatile(LOCK_PREFIX "decl %0\n"
 		     : "+m" (v->counter));
 }
 
@@ -114,7 +235,7 @@  static __always_inline void atomic_dec(atomic_t *v)
  */
 static __always_inline bool atomic_dec_and_test(atomic_t *v)
 {
-	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
+	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", LOCK_PREFIX "incl", v->counter, "%0", e);
 }
 
 /**
@@ -127,7 +248,20 @@  static __always_inline bool atomic_dec_and_test(atomic_t *v)
  */
 static __always_inline bool atomic_inc_and_test(atomic_t *v)
 {
-	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
+	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", LOCK_PREFIX "decl", v->counter, "%0", e);
+}
+
+/**
+ * atomic_inc_and_test_wrap - increment and test
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static __always_inline int atomic_inc_and_test_wrap(atomic_wrap_t *v)
+{
+	GEN_UNARY_RMWcc_wrap(LOCK_PREFIX "incl", v->counter, "%0", e);
 }
 
 /**
@@ -141,7 +275,7 @@  static __always_inline bool atomic_inc_and_test(atomic_t *v)
  */
 static __always_inline bool atomic_add_negative(int i, atomic_t *v)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
+	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", LOCK_PREFIX "subl", v->counter, "er", i, "%0", s);
 }
 
 /**
@@ -153,6 +287,18 @@  static __always_inline bool atomic_add_negative(int i, atomic_t *v)
  */
 static __always_inline int atomic_add_return(int i, atomic_t *v)
 {
+	return i + xadd_check_overflow(&v->counter, i);
+}
+
+/**
+ * atomic_add_return_wrap - add integer and return
+ * @i: integer value to add
+ * @v: pointer of type atomic_wrap_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static __always_inline int atomic_add_return_wrap(int i, atomic_wrap_t *v)
+{
 	return i + xadd(&v->counter, i);
 }
 
@@ -169,6 +315,10 @@  static __always_inline int atomic_sub_return(int i, atomic_t *v)
 }
 
 #define atomic_inc_return(v)  (atomic_add_return(1, v))
+static __always_inline int atomic_inc_return_wrap(atomic_wrap_t *v)
+{
+	return atomic_add_return_wrap(1, v);
+}
 #define atomic_dec_return(v)  (atomic_sub_return(1, v))
 
 static __always_inline int atomic_fetch_add(int i, atomic_t *v)
@@ -186,11 +336,21 @@  static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return cmpxchg(&v->counter, old, new);
 }
 
+static __always_inline int atomic_cmpxchg_wrap(atomic_wrap_t *v, int old, int new)
+{
+	return cmpxchg(&v->counter, old, new);
+}
+
 static inline int atomic_xchg(atomic_t *v, int new)
 {
 	return xchg(&v->counter, new);
 }
 
+static inline int atomic_xchg_wrap(atomic_wrap_t *v, int new)
+{
+	return xchg(&v->counter, new);
+}
+
 #define ATOMIC_OP(op)							\
 static inline void atomic_##op(int i, atomic_t *v)			\
 {									\
@@ -236,12 +396,25 @@  ATOMIC_OPS(xor, ^)
  */
 static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
-	int c, old;
+	int c, old, new;
 	c = atomic_read(v);
 	for (;;) {
 		if (unlikely(c == (u)))
 			break;
-		old = atomic_cmpxchg((v), c, c + (a));
+
+		asm volatile("addl %2,%0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+			     "jno 0f\n"
+			     "subl %2,%0\n"
+			     "int $4\n0:\n"
+			     _ASM_EXTABLE(0b, 0b)
+#endif
+
+			     : "=r" (new)
+			     : "0" (c), "ir" (a));
+
+		old = atomic_cmpxchg((v), c, new);
 		if (likely(old == c))
 			break;
 		c = old;
@@ -250,6 +423,87 @@  static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 }
 
 /**
+ * __atomic_add_unless_wrap - add unless the number is already a given value
+ * @v: pointer of type atomic_wrap_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not already @u.
+ * Returns the old value of @v.
+ */
+static __always_inline int __atomic_add_unless_wrap(atomic_wrap_t *v,
+						    int a, int u)
+{
+	int c, old, new;
+	c = atomic_read_wrap(v);
+	for (;;) {
+		if (unlikely(c == (u)))
+			break;
+
+		asm volatile("addl %2,%0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+			     "jno 0f\n"
+			     "subl %2,%0\n"
+			     "int $4\n0:\n"
+			     _ASM_EXTABLE(0b, 0b)
+#endif
+
+			     : "=r" (new)
+			     : "0" (c), "ir" (a));
+
+		old = atomic_cmpxchg_wrap((v), c, new);
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return c;
+}
+
+/**
+ * atomic_inc_not_zero_hint - increment if not null
+ * @v: pointer of type atomic_t
+ * @hint: probable value of the atomic before the increment
+ *
+ * This version of atomic_inc_not_zero() gives a hint of probable
+ * value of the atomic. This helps processor to not read the memory
+ * before doing the atomic read/modify/write cycle, lowering
+ * number of bus transactions on some arches.
+ *
+ * Returns: 0 if increment was not done, 1 otherwise.
+ */
+#define atomic_inc_not_zero_hint atomic_inc_not_zero_hint
+static inline int atomic_inc_not_zero_hint(atomic_t *v, int hint)
+{
+	int val, c = hint, new;
+
+	/* sanity test, should be removed by compiler if hint is a constant */
+	if (!hint)
+		return __atomic_add_unless(v, 1, 0);
+
+	do {
+		asm volatile("incl %0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+			     "jno 0f\n"
+			     "decl %0\n"
+			     "int $4\n0:\n"
+			     _ASM_EXTABLE(0b, 0b)
+#endif
+
+			     : "=r" (new)
+			     : "0" (c));
+
+		val = atomic_cmpxchg((v), c, new);
+		if (val == c)
+			return 1;
+		c = val;
+	} while (c);
+
+	return 0;
+}
+
+/**
  * atomic_inc_short - increment of a short integer
  * @v: pointer to type int
  *
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705..fd781a1 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -11,6 +11,14 @@  typedef struct {
 	u64 __aligned(8) counter;
 } atomic64_t;
 
+#ifdef CONFIG_HARDENED_ATOMIC
+typedef struct {
+	u64 __aligned(8) counter;
+} atomic64_wrap_t;
+#else
+typedef atomic64_t atomic64_wrap_t;
+#endif
+
 #define ATOMIC64_INIT(val)	{ (val) }
 
 #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
@@ -36,21 +44,31 @@  typedef struct {
 	ATOMIC64_DECL_ONE(sym##_386)
 
 ATOMIC64_DECL_ONE(add_386);
+ATOMIC64_DECL_ONE(add_wrap_386);
 ATOMIC64_DECL_ONE(sub_386);
+ATOMIC64_DECL_ONE(sub_wrap_386);
 ATOMIC64_DECL_ONE(inc_386);
+ATOMIC64_DECL_ONE(inc_wrap_386);
 ATOMIC64_DECL_ONE(dec_386);
+ATOMIC64_DECL_ONE(dec_wrap_386);
 #endif
 
 #define alternative_atomic64(f, out, in...) \
 	__alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in)
 
 ATOMIC64_DECL(read);
+ATOMIC64_DECL(read_wrap);
 ATOMIC64_DECL(set);
+ATOMIC64_DECL(set_wrap);
 ATOMIC64_DECL(xchg);
 ATOMIC64_DECL(add_return);
+ATOMIC64_DECL(add_return_wrap);
 ATOMIC64_DECL(sub_return);
+ATOMIC64_DECL(sub_return_wrap);
 ATOMIC64_DECL(inc_return);
+ATOMIC64_DECL(inc_return_wrap);
 ATOMIC64_DECL(dec_return);
+ATOMIC64_DECL(dec_return_wrap);
 ATOMIC64_DECL(dec_if_positive);
 ATOMIC64_DECL(inc_not_zero);
 ATOMIC64_DECL(add_unless);
@@ -76,6 +94,21 @@  static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n
 }
 
 /**
+ * atomic64_cmpxchg_wrap - cmpxchg atomic64 variable
+ * @p: pointer to type atomic64_wrap_t
+ * @o: expected value
+ * @n: new value
+ *
+ * Atomically sets @v to @n if it was equal to @o and returns
+ * the old value.
+ */
+
+static inline long long atomic64_cmpxchg_wrap(atomic64_wrap_t *v, long long o, long long n)
+{
+	return cmpxchg64(&v->counter, o, n);
+}
+
+/**
  * atomic64_xchg - xchg atomic64 variable
  * @v: pointer to type atomic64_t
  * @n: value to assign
@@ -95,6 +128,25 @@  static inline long long atomic64_xchg(atomic64_t *v, long long n)
 }
 
 /**
+ * atomic64_xchg_wrap - xchg atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ * @n: value to assign
+ *
+ * Atomically xchgs the value of @v to @n and returns
+ * the old value.
+ */
+static inline long long atomic64_xchg_wrap(atomic64_wrap_t *v, long long n)
+{
+	long long o;
+	unsigned high = (unsigned)(n >> 32);
+	unsigned low = (unsigned)n;
+	alternative_atomic64(xchg, "=&A" (o),
+			     "S" (v), "b" (low), "c" (high)
+			     : "memory");
+	return o;
+}
+
+/**
  * atomic64_set - set atomic64 variable
  * @v: pointer to type atomic64_t
  * @i: value to assign
@@ -111,6 +163,22 @@  static inline void atomic64_set(atomic64_t *v, long long i)
 }
 
 /**
+ * atomic64_set_wrap - set atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ * @n: value to assign
+ *
+ * Atomically sets the value of @v to @n.
+ */
+static inline void atomic64_set_wrap(atomic64_wrap_t *v, long long i)
+{
+	unsigned high = (unsigned)(i >> 32);
+	unsigned low = (unsigned)i;
+	alternative_atomic64(set, /* no output */,
+			     "S" (v), "b" (low), "c" (high)
+			     : "eax", "edx", "memory");
+}
+
+/**
  * atomic64_read - read atomic64 variable
  * @v: pointer to type atomic64_t
  *
@@ -121,7 +189,20 @@  static inline long long atomic64_read(const atomic64_t *v)
 	long long r;
 	alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
 	return r;
- }
+}
+
+/**
+ * atomic64_read_wrap - read atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically reads the value of @v and returns it.
+ */
+static inline long long atomic64_read_wrap(const atomic64_wrap_t *v)
+{
+	long long r;
+	alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
+	return r;
+}
 
 /**
  * atomic64_add_return - add and return
@@ -138,6 +219,21 @@  static inline long long atomic64_add_return(long long i, atomic64_t *v)
 	return i;
 }
 
+/**
+ * atomic64_add_return_wrap - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically adds @i to @v and returns @i + *@v
+ */
+static inline long long atomic64_add_return_wrap(long long i, atomic64_wrap_t *v)
+{
+	alternative_atomic64(add_return_wrap,
+			     ASM_OUTPUT2("+A" (i), "+c" (v)),
+			     ASM_NO_INPUT_CLOBBER("memory"));
+	return i;
+}
+
 /*
  * Other variants with different arithmetic operators:
  */
@@ -149,6 +245,14 @@  static inline long long atomic64_sub_return(long long i, atomic64_t *v)
 	return i;
 }
 
+static inline long long atomic64_sub_return_wrap(long long i, atomic64_wrap_t *v)
+{
+	alternative_atomic64(sub_return,
+			     ASM_OUTPUT2("+A" (i), "+c" (v)),
+			     ASM_NO_INPUT_CLOBBER("memory"));
+	return i;
+}
+
 static inline long long atomic64_inc_return(atomic64_t *v)
 {
 	long long a;
@@ -157,6 +261,14 @@  static inline long long atomic64_inc_return(atomic64_t *v)
 	return a;
 }
 
+static inline long long atomic64_inc_return_wrap(atomic64_wrap_t *v)
+{
+	long long a;
+	alternative_atomic64(inc_return_wrap, "=&A" (a),
+			     "S" (v) : "memory", "ecx");
+	return a;
+}
+
 static inline long long atomic64_dec_return(atomic64_t *v)
 {
 	long long a;
@@ -181,6 +293,21 @@  static inline long long atomic64_add(long long i, atomic64_t *v)
 }
 
 /**
+ * atomic64_add_wrap - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline long long atomic64_add_wrap(long long i, atomic64_wrap_t *v)
+{
+	__alternative_atomic64(add_wrap, add_return_wrap,
+			       ASM_OUTPUT2("+A" (i), "+c" (v)),
+			       ASM_NO_INPUT_CLOBBER("memory"));
+	return i;
+}
+
+/**
  * atomic64_sub - subtract the atomic64 variable
  * @i: integer value to subtract
  * @v: pointer to type atomic64_t
@@ -209,6 +336,22 @@  static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
 	return atomic64_sub_return(i, v) == 0;
 }
 
+#ifdef CONFIG_HARDENED_ATOMIC
+/**
+ * atomic64_sub_and_test_wrap - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_sub_and_test_wrap(long long i, atomic64_wrap_t *v)
+{
+	return atomic64_sub_return_wrap(i, v) == 0;
+}
+#endif /* CONFIG_HARDENED_ATOMIC */
+
 /**
  * atomic64_inc - increment atomic64 variable
  * @v: pointer to type atomic64_t
@@ -222,6 +365,18 @@  static inline void atomic64_inc(atomic64_t *v)
 }
 
 /**
+ * atomic64_inc_wrap - increment atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic64_inc_wrap(atomic64_wrap_t *v)
+{
+	__alternative_atomic64(inc_wrap, inc_return_wrap, /* no output */,
+			       "S" (v) : "memory", "eax", "ecx", "edx");
+}
+
+/**
  * atomic64_dec - decrement atomic64 variable
  * @v: pointer to type atomic64_t
  *
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 89ed2f6..91892c3 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -22,6 +22,18 @@  static inline long atomic64_read(const atomic64_t *v)
 }
 
 /**
+ * atomic64_read_wrap - read atomic64 variable
+ * @v: pointer of type atomic64_wrap_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+static inline long atomic64_read_wrap(const atomic64_wrap_t *v)
+{
+	return ACCESS_ONCE((v)->counter);
+}
+
+/**
  * atomic64_set - set atomic64 variable
  * @v: pointer to type atomic64_t
  * @i: required value
@@ -34,6 +46,18 @@  static inline void atomic64_set(atomic64_t *v, long i)
 }
 
 /**
+ * atomic64_set_wrap - set atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic64_set_wrap(atomic64_wrap_t *v, long i)
+{
+	v->counter = i;
+}
+
+/**
  * atomic64_add - add integer to atomic64 variable
  * @i: integer value to add
  * @v: pointer to type atomic64_t
@@ -42,6 +66,28 @@  static inline void atomic64_set(atomic64_t *v, long i)
  */
 static __always_inline void atomic64_add(long i, atomic64_t *v)
 {
+	asm volatile(LOCK_PREFIX "addq %1,%0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "subq %1,%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
+		     : "=m" (v->counter)
+		     : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_add_wrap - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically adds @i to @v.
+ */
+static __always_inline void atomic64_add_wrap(long i, atomic64_wrap_t *v)
+{
 	asm volatile(LOCK_PREFIX "addq %1,%0"
 		     : "=m" (v->counter)
 		     : "er" (i), "m" (v->counter));
@@ -56,6 +102,26 @@  static __always_inline void atomic64_add(long i, atomic64_t *v)
  */
 static inline void atomic64_sub(long i, atomic64_t *v)
 {
+	asm volatile(LOCK_PREFIX "subq %1,%0\n"
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "addq %1,%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "=m" (v->counter)
+		     : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub_wrap - subtract the atomic64 variable
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic64_sub_wrap(long i, atomic64_wrap_t *v)
+{
 	asm volatile(LOCK_PREFIX "subq %1,%0"
 		     : "=m" (v->counter)
 		     : "er" (i), "m" (v->counter));
@@ -72,7 +138,21 @@  static inline void atomic64_sub(long i, atomic64_t *v)
  */
 static inline bool atomic64_sub_and_test(long i, atomic64_t *v)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
+	GEN_BINARY_RMWcc(LOCK_PREFIX "subq", LOCK_PREFIX "addq", v->counter, "er", i, "%0", e);
+}
+
+/**
+ * atomic64_sub_and_test_wrap - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline bool atomic64_sub_and_test_wrap(long i, atomic64_wrap_t *v)
+{
+	GEN_BINARY_RMWcc_wrap(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
 }
 
 /**
@@ -83,6 +163,26 @@  static inline bool atomic64_sub_and_test(long i, atomic64_t *v)
  */
 static __always_inline void atomic64_inc(atomic64_t *v)
 {
+	asm volatile(LOCK_PREFIX "incq %0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "decq %0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "=m" (v->counter)
+		     : "m" (v->counter));
+}
+
+/**
+ * atomic64_inc_wrap - increment atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically increments @v by 1.
+ */
+static __always_inline void atomic64_inc_wrap(atomic64_wrap_t *v)
+{
 	asm volatile(LOCK_PREFIX "incq %0"
 		     : "=m" (v->counter)
 		     : "m" (v->counter));
@@ -96,6 +196,26 @@  static __always_inline void atomic64_inc(atomic64_t *v)
  */
 static __always_inline void atomic64_dec(atomic64_t *v)
 {
+	asm volatile(LOCK_PREFIX "decq %0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX "incq %0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "=m" (v->counter)
+		     : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec_wrap - decrement atomic64 variable
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static __always_inline void atomic64_dec_wrap(atomic64_wrap_t *v)
+{
 	asm volatile(LOCK_PREFIX "decq %0"
 		     : "=m" (v->counter)
 		     : "m" (v->counter));
@@ -111,7 +231,7 @@  static __always_inline void atomic64_dec(atomic64_t *v)
  */
 static inline bool atomic64_dec_and_test(atomic64_t *v)
 {
-	GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
+	GEN_UNARY_RMWcc(LOCK_PREFIX "decq", LOCK_PREFIX "incq", v->counter, "%0", e);
 }
 
 /**
@@ -124,7 +244,7 @@  static inline bool atomic64_dec_and_test(atomic64_t *v)
  */
 static inline bool atomic64_inc_and_test(atomic64_t *v)
 {
-	GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
+	GEN_UNARY_RMWcc(LOCK_PREFIX "incq", LOCK_PREFIX "decq", v->counter, "%0", e);
 }
 
 /**
@@ -138,7 +258,7 @@  static inline bool atomic64_inc_and_test(atomic64_t *v)
  */
 static inline bool atomic64_add_negative(long i, atomic64_t *v)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s);
+	GEN_BINARY_RMWcc(LOCK_PREFIX "addq", LOCK_PREFIX "subq", v->counter, "er", i, "%0", s);
 }
 
 /**
@@ -150,6 +270,18 @@  static inline bool atomic64_add_negative(long i, atomic64_t *v)
  */
 static __always_inline long atomic64_add_return(long i, atomic64_t *v)
 {
+	return i + xadd_check_overflow(&v->counter, i);
+}
+
+/**
+ * atomic64_add_return_wrap - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_wrap_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static __always_inline long atomic64_add_return_wrap(long i, atomic64_wrap_t *v)
+{
 	return i + xadd(&v->counter, i);
 }
 
@@ -169,6 +301,10 @@  static inline long atomic64_fetch_sub(long i, atomic64_t *v)
 }
 
 #define atomic64_inc_return(v)  (atomic64_add_return(1, (v)))
+static inline long atomic64_inc_return_wrap(atomic64_wrap_t *v)
+{
+	return atomic64_add_return_wrap(1, v);
+}
 #define atomic64_dec_return(v)  (atomic64_sub_return(1, (v)))
 
 static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
@@ -176,11 +312,21 @@  static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
 	return cmpxchg(&v->counter, old, new);
 }
 
+static inline long atomic64_cmpxchg_wrap(atomic64_wrap_t *v, long old, long new)
+{
+	return cmpxchg(&v->counter, old, new);
+}
+
 static inline long atomic64_xchg(atomic64_t *v, long new)
 {
 	return xchg(&v->counter, new);
 }
 
+static inline long atomic64_xchg_wrap(atomic64_wrap_t *v, long new)
+{
+	return xchg(&v->counter, new);
+}
+
 /**
  * atomic64_add_unless - add unless the number is a given value
  * @v: pointer of type atomic64_t
@@ -192,11 +338,21 @@  static inline long atomic64_xchg(atomic64_t *v, long new)
  */
 static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
 {
-	long c, old;
+	long c, old, new;
 	c = atomic64_read(v);
 	for (;;) {
 		if (unlikely(c == (u)))
 			break;
+		asm volatile("add %2,%0\n"
+#ifdef CONFIG_HARDENED_ATOMIC
+			     "jno 0f\n"
+			     "sub %2,%0\n"
+			     "int $4\n0:\n"
+			     _ASM_EXTABLE(0b, 0b)
+#endif
+			     : "=r" (new)
+			     : "0" (c), "ir" (a));
+
 		old = atomic64_cmpxchg((v), c, c + (a));
 		if (likely(old == c))
 			break;
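
For readers less familiar with this construct, the sketch below shows in
plain userspace C what the add/jno/undo sequence used throughout this
header does. It is illustrative only and not part of the patch: where the
kernel raises the overflow trap with "int $4" and recovers through the
exception table, the sketch merely records a flag for the caller to report.

#include <limits.h>
#include <stdio.h>

typedef struct { long counter; } atomic64_demo_t;

/* Mirrors the hardened atomic64_add(): lock add, and if the add set OF,
 * lock sub the value back (pinning the counter) and flag the overflow. */
static int demo_atomic64_add(long i, atomic64_demo_t *v)
{
	int overflowed;

	asm volatile("movl $0,%1\n\t"
		     "lock addq %2,%0\n\t"
		     "jno 1f\n\t"		/* no overflow: done */
		     "lock subq %2,%0\n\t"	/* undo; counter stays at LONG_MAX */
		     "movl $1,%1\n"		/* remember that we overflowed */
		     "1:"
		     : "+m" (v->counter), "=&r" (overflowed)
		     : "er" (i)
		     : "cc");
	return overflowed;
}

int main(void)
{
	atomic64_demo_t v = { .counter = LONG_MAX };

	if (demo_atomic64_add(1, &v))	/* the kernel would int $4 / BUG() here */
		fprintf(stderr, "overflow caught, counter pinned at %ld\n",
			v.counter);
	return 0;
}
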
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 68557f52..e25eb0d 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -50,7 +50,7 @@ 
  * a mask operation on a byte.
  */
 #define IS_IMMEDIATE(nr)		(__builtin_constant_p(nr))
-#define CONST_MASK_ADDR(nr, addr)	BITOP_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK_ADDR(nr, addr)	BITOP_ADDR((volatile void *)(addr) + ((nr)>>3))
 #define CONST_MASK(nr)			(1 << ((nr) & 7))
 
 /**
@@ -203,7 +203,7 @@  static __always_inline void change_bit(long nr, volatile unsigned long *addr)
  */
 static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c);
+	GEN_BINARY_RMWcc_wrap(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c);
 }
 
 /**
@@ -249,7 +249,7 @@  static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
  */
 static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c);
+	GEN_BINARY_RMWcc_wrap(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c);
 }
 
 /**
@@ -302,7 +302,7 @@  static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
  */
 static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
 {
-	GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c);
+	GEN_BINARY_RMWcc_wrap(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c);
 }
 
 static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 9733361..b83f612 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -13,10 +13,14 @@  extern void __xchg_wrong_size(void)
 	__compiletime_error("Bad argument size for xchg");
 extern void __cmpxchg_wrong_size(void)
 	__compiletime_error("Bad argument size for cmpxchg");
+extern void __xadd_check_overflow_wrong_size(void)
+	__compiletime_error("Bad argument size for xadd_check_overflow");
 extern void __xadd_wrong_size(void)
 	__compiletime_error("Bad argument size for xadd");
 extern void __add_wrong_size(void)
 	__compiletime_error("Bad argument size for add");
+extern void __add_check_overflow_wrong_size(void)
+	__compiletime_error("Bad argument size for add_check_overflow");
 
 /*
  * Constants for operation sizes. On 32-bit, the 64-bit size it set to
@@ -68,6 +72,38 @@  extern void __add_wrong_size(void)
 		__ret;							\
 	})
 
+#ifdef CONFIG_HARDENED_ATOMIC
+#define __xchg_op_check_overflow(ptr, arg, op, lock)			\
+	({								\
+	        __typeof__ (*(ptr)) __ret = (arg);			\
+		switch (sizeof(*(ptr))) {				\
+		case __X86_CASE_L:					\
+			asm volatile (lock #op "l %0, %1\n"		\
+				      "jno 0f\n"			\
+				      "mov %0,%1\n"			\
+				      "int $4\n0:\n"			\
+				      _ASM_EXTABLE(0b, 0b)		\
+				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_Q:					\
+			asm volatile (lock #op "q %q0, %1\n"		\
+				      "jno 0f\n"			\
+				      "mov %0,%1\n"			\
+				      "int $4\n0:\n"			\
+				      _ASM_EXTABLE(0b, 0b)		\
+				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : : "memory", "cc");		\
+			break;						\
+		default:						\
+			__ ## op ## _check_overflow_wrong_size();	\
+		}							\
+		__ret;							\
+	})
+#else
+#define __xchg_op_check_overflow(ptr, arg, op, lock) __xchg_op(ptr, arg, op, lock)
+#endif
+
 /*
  * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
  * Since this is generally used to protect other memory information, we
@@ -166,6 +202,9 @@  extern void __add_wrong_size(void)
 #define xadd_sync(ptr, inc)	__xadd((ptr), (inc), "lock; ")
 #define xadd_local(ptr, inc)	__xadd((ptr), (inc), "")
 
+#define __xadd_check_overflow(ptr, inc, lock)	__xchg_op_check_overflow((ptr), (inc), xadd, lock)
+#define xadd_check_overflow(ptr, inc)		__xadd_check_overflow((ptr), (inc), LOCK_PREFIX)
+
 #define __add(ptr, inc, lock)						\
 	({								\
 	        __typeof__ (*(ptr)) __ret = (inc);			\
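
To make the undo step above easier to follow, here is the __X86_CASE_Q arm
of __xchg_op_check_overflow() hand-expanded for a 64-bit counter. This is a
sketch assuming kernel context (LOCK_PREFIX, _ASM_EXTABLE), not real
preprocessor output: after xadd the register operand holds the old value,
so the "mov %0,%1" on the overflow path writes that old value back and the
counter never wraps.

static __always_inline long xadd_check_overflow_q(long *ptr, long inc)
{
	long ret = inc;

	asm volatile(LOCK_PREFIX "xaddq %q0, %1\n"
		     "jno 0f\n"			/* OF clear: the add stands */
		     "mov %0,%1\n"		/* restore the pre-add value */
		     "int $4\n0:\n"		/* report via the overflow trap */
		     _ASM_EXTABLE(0b, 0b)	/* resume at 0: after the trap */
		     : "+r" (ret), "+m" (*ptr)
		     : : "memory", "cc");
	return ret;			/* the value *ptr held before the add */
}

The *_add_return() helpers built on top of this simply return the increment
plus this old value.
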
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 7511978..46cfaf0 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -10,25 +10,69 @@  typedef struct {
 	atomic_long_t a;
 } local_t;
 
+typedef struct {
+	atomic_long_wrap_t a;
+} local_wrap_t;
+
 #define LOCAL_INIT(i)	{ ATOMIC_LONG_INIT(i) }
 
 #define local_read(l)	atomic_long_read(&(l)->a)
+#define local_read_wrap(l)	atomic_long_read_wrap(&(l)->a)
 #define local_set(l, i)	atomic_long_set(&(l)->a, (i))
+#define local_set_wrap(l, i)	atomic_long_set_wrap(&(l)->a, (i))
 
 static inline void local_inc(local_t *l)
 {
+	asm volatile(_ASM_INC "%0\n"
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     _ASM_DEC "%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "+m" (l->a.counter));
+}
+
+static inline void local_inc_wrap(local_wrap_t *l)
+{
 	asm volatile(_ASM_INC "%0"
 		     : "+m" (l->a.counter));
 }
 
 static inline void local_dec(local_t *l)
 {
+	asm volatile(_ASM_DEC "%0\n"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     _ASM_INC "%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "+m" (l->a.counter));
+}
+
+static inline void local_dec_wrap(local_wrap_t *l)
+{
 	asm volatile(_ASM_DEC "%0"
 		     : "+m" (l->a.counter));
 }
 
 static inline void local_add(long i, local_t *l)
 {
+	asm volatile(_ASM_ADD "%1,%0\n"
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     _ASM_SUB "%1,%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "+m" (l->a.counter)
+		     : "ir" (i));
+}
+
+static inline void local_add_wrap(long i, local_wrap_t *l)
+{
 	asm volatile(_ASM_ADD "%1,%0"
 		     : "+m" (l->a.counter)
 		     : "ir" (i));
@@ -36,6 +80,19 @@  static inline void local_add(long i, local_t *l)
 
 static inline void local_sub(long i, local_t *l)
 {
+	asm volatile(_ASM_SUB "%1,%0\n"
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     _ASM_ADD "%1,%0\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "+m" (l->a.counter)
+		     : "ir" (i));
+}
+
+static inline void local_sub_wrap(long i, local_wrap_t *l)
+{
 	asm volatile(_ASM_SUB "%1,%0"
 		     : "+m" (l->a.counter)
 		     : "ir" (i));
@@ -52,7 +109,7 @@  static inline void local_sub(long i, local_t *l)
  */
 static inline bool local_sub_and_test(long i, local_t *l)
 {
-	GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e);
+	GEN_BINARY_RMWcc(_ASM_SUB, _ASM_ADD, l->a.counter, "er", i, "%0", e);
 }
 
 /**
@@ -65,7 +122,7 @@  static inline bool local_sub_and_test(long i, local_t *l)
  */
 static inline bool local_dec_and_test(local_t *l)
 {
-	GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e);
+	GEN_UNARY_RMWcc(_ASM_DEC, _ASM_INC, l->a.counter, "%0", e);
 }
 
 /**
@@ -78,7 +135,7 @@  static inline bool local_dec_and_test(local_t *l)
  */
 static inline bool local_inc_and_test(local_t *l)
 {
-	GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e);
+	GEN_UNARY_RMWcc(_ASM_INC, _ASM_DEC, l->a.counter, "%0", e);
 }
 
 /**
@@ -92,7 +149,7 @@  static inline bool local_inc_and_test(local_t *l)
  */
 static inline bool local_add_negative(long i, local_t *l)
 {
-	GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s);
+	GEN_BINARY_RMWcc(_ASM_ADD, _ASM_SUB, l->a.counter, "er", i, "%0", s);
 }
 
 /**
@@ -105,6 +162,28 @@  static inline bool local_add_negative(long i, local_t *l)
 static inline long local_add_return(long i, local_t *l)
 {
 	long __i = i;
+	asm volatile(_ASM_XADD "%0, %1\n"
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     _ASM_MOV "%0,%1\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+		     : "+r" (i), "+m" (l->a.counter)
+		     : : "memory");
+	return i + __i;
+}
+
+/**
+ * local_add_return_wrap - add and return
+ * @i: integer value to add
+ * @l: pointer to type local_wrap_t
+ *
+ * Atomically adds @i to @l and returns @i + @l
+ */
+static inline long local_add_return_wrap(long i, local_wrap_t *l)
+{
+	long __i = i;
 	asm volatile(_ASM_XADD "%0, %1;"
 		     : "+r" (i), "+m" (l->a.counter)
 		     : : "memory");
@@ -121,6 +200,8 @@  static inline long local_sub_return(long i, local_t *l)
 
 #define local_cmpxchg(l, o, n) \
 	(cmpxchg_local(&((l)->a.counter), (o), (n)))
+#define local_cmpxchg_wrap(l, o, n) \
+	(cmpxchg_local(&((l)->a.counter), (o), (n)))
 /* Always has a lock prefix */
 #define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
 
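
As a usage illustration (the identifiers below are made up, not from the
patch): a counter that is allowed to wrap, such as a statistics
accumulator, would move to local_wrap_t and the _wrap accessors, while
reference-count-like users keep the checked local_t operations.

#include <asm/local.h>

struct demo_stats {
	local_wrap_t	rx_bytes;	/* legitimately wraps: unchecked */
	local_t		refs;		/* must not wrap: stays protected */
};

static void demo_account_rx(struct demo_stats *s, long bytes)
{
	local_add_wrap(bytes, &s->rx_bytes);	/* plain add, no overflow trap */
	local_inc(&s->refs);			/* checked; traps on overflow */
}
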
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 17f2186..2fa0e84 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -81,7 +81,7 @@  static __always_inline void __preempt_count_sub(int val)
  */
 static __always_inline bool __preempt_count_dec_and_test(void)
 {
-	GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
+	GEN_UNARY_RMWcc("decl", "incl", __preempt_count, __percpu_arg(0), e);
 }
 
 /*
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd30..0375d3f 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -5,28 +5,80 @@ 
 
 /* Use asm goto */
 
-#define __GEN_RMWcc(fullop, var, cc, ...)				\
+#ifdef CONFIG_HARDENED_ATOMIC
+#define __GEN_RMWcc(fullop, fullantiop, var, cc, ...)			\
 do {									\
-	asm_volatile_goto (fullop "; j" #cc " %l[cc_label]"		\
+	asm_volatile_goto (fullop					\
+			";jno 0f\n"					\
+			fullantiop					\
+			";int $4\n0:\n"					\
+			_ASM_EXTABLE(0b, 0b)				\
+			 ";j" #cc " %l[cc_label]"			\
 			: : "m" (var), ## __VA_ARGS__ 			\
 			: "memory" : cc_label);				\
 	return 0;							\
 cc_label:								\
 	return 1;							\
 } while (0)
+#else
+#define __GEN_RMWcc(fullop, fullantiop, var, cc, ...)			\
+do {									\
+	asm_volatile_goto (fullop ";j" #cc " %l[cc_label]"		\
+			: : "m" (var), ## __VA_ARGS__ 			\
+			: "memory" : cc_label);				\
+	return 0;							\
+cc_label:								\
+	return 1;							\
+} while (0)
+#endif
 
-#define GEN_UNARY_RMWcc(op, var, arg0, cc) 				\
-	__GEN_RMWcc(op " " arg0, var, cc)
-
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
-	__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
+#define __GEN_RMWcc_wrap(fullop, var, cc, ...)				\
+do {									\
+	asm_volatile_goto (fullop "; j" #cc " %l[cc_label]"		\
+			: : "m" (var), ## __VA_ARGS__ 			\
+			: "memory" : cc_label);				\
+	return 0;							\
+cc_label:								\
+	return 1;							\
+} while (0)
 
+#define GEN_UNARY_RMWcc(op, antiop, var, arg0, cc) 			\
+	__GEN_RMWcc(op " " arg0, antiop " " arg0, var, cc)
+#define GEN_UNARY_RMWcc_wrap(op, var, arg0, cc) 			\
+	__GEN_RMWcc_wrap(op " " arg0, var, cc)
+#define GEN_BINARY_RMWcc(op, antiop, var, vcon, val, arg0, cc)		\
+	__GEN_RMWcc(op " %1, " arg0, antiop " %1, " arg0, var, cc, vcon (val))
+#define GEN_BINARY_RMWcc_wrap(op, var, vcon, val, arg0, cc)	\
+	__GEN_RMWcc_wrap(op " %1, " arg0, var, cc, vcon (val))
 #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 /* Use flags output or a set instruction */
 
-#define __GEN_RMWcc(fullop, var, cc, ...)				\
+#ifdef CONFIG_HARDENED_ATOMIC
+#define __GEN_RMWcc(fullop, fullantiop, var, cc, ...)			\
 do {									\
+	char c;								\
+	asm volatile (fullop 						\
+			";jno 0f\n"					\
+			fullantiop					\
+			";int $4\n0:\n"					\
+			_ASM_EXTABLE(0b, 0b)				\
+			";" CC_SET(cc)				\
+			: "+m" (var), CC_OUT(cc) (c)			\
+			: __VA_ARGS__ : "memory");			\
+	return c != 0;							\
+} while (0)
+#else
+#define __GEN_RMWcc(fullop, fullantiop, var, cc, ...)			\
+do {									\
+	char c;								\
+	asm volatile (fullop ";" CC_SET(cc)				\
+			: "+m" (var), CC_OUT(cc) (c)			\
+			: __VA_ARGS__ : "memory");			\
+	return c != 0;							\
+} while (0)
+#endif
+
+#define __GEN_RMWcc_wrap(fullop, var, cc, ...)				\
+do {									\
 	bool c;								\
 	asm volatile (fullop ";" CC_SET(cc)				\
 			: "+m" (var), CC_OUT(cc) (c)			\
@@ -34,12 +86,14 @@  do {									\
 	return c;							\
 } while (0)
 
-#define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
-	__GEN_RMWcc(op " " arg0, var, cc)
-
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
-	__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
-
+#define GEN_UNARY_RMWcc(op, antiop, var, arg0, cc)			\
+	__GEN_RMWcc(op " " arg0, antiop " " arg0, var, cc)
+#define GEN_UNARY_RMWcc_wrap(op, var, arg0, cc)			\
+	__GEN_RMWcc_wrap(op " " arg0, var, cc)
+#define GEN_BINARY_RMWcc(op, antiop, var, vcon, val, arg0, cc)		\
+	__GEN_RMWcc(op " %2, " arg0, antiop " %2, " arg0, var, cc, vcon (val))
+#define GEN_BINARY_RMWcc_wrap(op, var, vcon, val, arg0, cc)	\
+	__GEN_RMWcc_wrap(op " %2, " arg0, var, cc, vcon (val))
 #endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 #endif /* _ASM_X86_RMWcc */
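
For review purposes, here is the flag-output branch above hand-expanded for
atomic64_dec_and_test(), which now passes LOCK_PREFIX "incq" as the
anti-op. This is a sketch of what the macro yields with
CONFIG_HARDENED_ATOMIC=y, assuming the usual kernel asm helpers, not
compiler output:

static inline bool atomic64_dec_and_test_expanded(atomic64_t *v)
{
	char c;

	asm volatile(LOCK_PREFIX "decq %0"
		     ";jno 0f\n"
		     LOCK_PREFIX "incq %0"	/* undo the decrement */
		     ";int $4\n0:\n"		/* report the overflow */
		     _ASM_EXTABLE(0b, 0b)
		     ";" CC_SET(e)
		     : "+m" (v->counter), CC_OUT(e) (c)
		     : : "memory");
	return c != 0;
}

The asm goto branch generates the equivalent jump-based form, and the
_wrap variants expand to the original unchecked sequence.
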
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 3d33a71..4d3f8a5 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -64,6 +64,14 @@  static inline void __down_read(struct rw_semaphore *sem)
 {
 	asm volatile("# beginning down_read\n\t"
 		     LOCK_PREFIX _ASM_INC "(%1)\n\t"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX _ASM_DEC "(%1)\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
 		     /* adds 0x00000001 */
 		     "  jns        1f\n"
 		     "  call call_rwsem_down_read_failed\n"
@@ -85,6 +93,14 @@  static inline bool __down_read_trylock(struct rw_semaphore *sem)
 		     "1:\n\t"
 		     "  mov          %1,%2\n\t"
 		     "  add          %3,%2\n\t"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     "sub %3,%2\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
 		     "  jle	     2f\n\t"
 		     LOCK_PREFIX "  cmpxchg  %2,%0\n\t"
 		     "  jnz	     1b\n\t"
@@ -99,12 +115,22 @@  static inline bool __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
+#ifdef CONFIG_HARDENED_ATOMIC
+#define ____down_write_undo \
+		     "jno 0f\n"\
+		     "mov %1,(%2)\n"\
+		     "int $4\n0:\n"\
+		     _ASM_EXTABLE(0b, 0b)
+#else
+#define ____down_write_undo
+#endif
 #define ____down_write(sem, slow_path)			\
 ({							\
 	long tmp;					\
 	struct rw_semaphore* ret;			\
 	asm volatile("# beginning down_write\n\t"	\
 		     LOCK_PREFIX "  xadd      %1,(%3)\n\t"	\
+		     ____down_write_undo		\
 		     /* adds 0xffff0001, returns the old value */ \
 		     "  test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \
 		     /* was the active mask 0 before? */\
@@ -166,6 +192,14 @@  static inline void __up_read(struct rw_semaphore *sem)
 	long tmp;
 	asm volatile("# beginning __up_read\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     "mov %1,(%2)\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
 		     /* subtracts 1, returns the old value */
 		     "  jns        1f\n\t"
 		     "  call call_rwsem_wake\n" /* expects old value in %edx */
@@ -184,6 +218,14 @@  static inline void __up_write(struct rw_semaphore *sem)
 	long tmp;
 	asm volatile("# beginning __up_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     "mov %1,(%2)\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
 		     /* subtracts 0xffff0001, returns the old value */
 		     "  jns        1f\n\t"
 		     "  call call_rwsem_wake\n" /* expects old value in %edx */
@@ -201,6 +243,14 @@  static inline void __downgrade_write(struct rw_semaphore *sem)
 {
 	asm volatile("# beginning __downgrade_write\n\t"
 		     LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		     "jno 0f\n"
+		     LOCK_PREFIX _ASM_SUB "%2,(%1)\n"
+		     "int $4\n0:\n"
+		     _ASM_EXTABLE(0b, 0b)
+#endif
+
 		     /*
 		      * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
 		      *     0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bd4e3d4..ad814ee 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -191,6 +191,12 @@  do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
 		}
+
+#ifdef CONFIG_HARDENED_ATOMIC
+		if (trapnr == X86_TRAP_OF)
+			hardened_atomic_refcount_overflow(regs);
+#endif
+
 		return 0;
 	}
 
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 9b0ca8f..0e8a888 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -45,6 +45,10 @@  BEGIN(read)
 	movl  (v), %eax
 	movl 4(v), %edx
 RET_ENDP
+BEGIN(read_wrap)
+	movl  (v), %eax
+	movl 4(v), %edx
+RET_ENDP
 #undef v
 
 #define v %esi
@@ -52,6 +56,10 @@  BEGIN(set)
 	movl %ebx,  (v)
 	movl %ecx, 4(v)
 RET_ENDP
+BEGIN(set_wrap)
+	movl %ebx,  (v)
+	movl %ecx, 4(v)
+RET_ENDP
 #undef v
 
 #define v  %esi
@@ -67,6 +75,18 @@  RET_ENDP
 BEGIN(add)
 	addl %eax,  (v)
 	adcl %edx, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+	jno 0f
+	subl %eax,  (v)
+	sbbl %edx, 4(v)
+	int $4
+0:
+	_ASM_EXTABLE(0b, 0b)
+#endif
+RET_ENDP
+BEGIN(add_wrap)
+	addl %eax,  (v)
+	adcl %edx, 4(v)
 RET_ENDP
 #undef v
 
@@ -74,6 +94,20 @@  RET_ENDP
 BEGIN(add_return)
 	addl  (v), %eax
 	adcl 4(v), %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
+	movl %eax,  (v)
+	movl %edx, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+2:
+#endif
+RET_ENDP
+BEGIN(add_return_wrap)
+	addl  (v), %eax
+	adcl 4(v), %edx
 	movl %eax,  (v)
 	movl %edx, 4(v)
 RET_ENDP
@@ -83,6 +117,18 @@  RET_ENDP
 BEGIN(sub)
 	subl %eax,  (v)
 	sbbl %edx, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+	jno 0f
+	addl %eax,  (v)
+	adcl %edx, 4(v)
+	int $4
+0:
+	_ASM_EXTABLE(0b, 0b)
+#endif
+RET_ENDP
+BEGIN(sub_wrap)
+	subl %eax,  (v)
+	sbbl %edx, 4(v)
 RET_ENDP
 #undef v
 
@@ -93,6 +139,23 @@  BEGIN(sub_return)
 	sbbl $0, %edx
 	addl  (v), %eax
 	adcl 4(v), %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
+	movl %eax,  (v)
+	movl %edx, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+2:
+#endif
+RET_ENDP
+BEGIN(sub_return_wrap)
+	negl %edx
+	negl %eax
+	sbbl $0, %edx
+	addl  (v), %eax
+	adcl 4(v), %edx
 	movl %eax,  (v)
 	movl %edx, 4(v)
 RET_ENDP
@@ -102,6 +165,19 @@  RET_ENDP
 BEGIN(inc)
 	addl $1,  (v)
 	adcl $0, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+	jno 0f
+	subl $1,  (v)
+	sbbl $0, 4(v)
+	int $4
+0:
+	_ASM_EXTABLE(0b, 0b)
+#endif
+
+RET_ENDP
+BEGIN(inc_wrap)
+	addl $1,  (v)
+	adcl $0, 4(v)
 RET_ENDP
 #undef v
 
@@ -111,6 +187,22 @@  BEGIN(inc_return)
 	movl 4(v), %edx
 	addl $1, %eax
 	adcl $0, %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
+	movl %eax,  (v)
+	movl %edx, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+2:
+#endif
+RET_ENDP
+BEGIN(inc_return_wrap)
+	movl  (v), %eax
+	movl 4(v), %edx
+	addl $1, %eax
+	adcl $0, %edx
 	movl %eax,  (v)
 	movl %edx, 4(v)
 RET_ENDP
@@ -120,6 +212,18 @@  RET_ENDP
 BEGIN(dec)
 	subl $1,  (v)
 	sbbl $0, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+	jno 0f
+	addl $1,  (v)
+	adcl $0, 4(v)
+	int $4
+0:
+	_ASM_EXTABLE(0b, 0b)
+#endif
+RET_ENDP
+BEGIN(dec_wrap)
+	subl $1,  (v)
+	sbbl $0, 4(v)
 RET_ENDP
 #undef v
 
@@ -129,6 +233,22 @@  BEGIN(dec_return)
 	movl 4(v), %edx
 	subl $1, %eax
 	sbbl $0, %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
+	movl %eax,  (v)
+	movl %edx, 4(v)
+#ifdef CONFIG_HARDENED_ATOMIC
+2:
+#endif
+RET_ENDP
+BEGIN(dec_return_wrap)
+	movl  (v), %eax
+	movl 4(v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
 	movl %eax,  (v)
 	movl %edx, 4(v)
 RET_ENDP
@@ -140,6 +260,11 @@  BEGIN(add_unless)
 	adcl %edx, %edi
 	addl  (v), %eax
 	adcl 4(v), %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
 	cmpl %eax, %ecx
 	je 3f
 1:
@@ -165,6 +290,11 @@  BEGIN(inc_not_zero)
 1:
 	addl $1, %eax
 	adcl $0, %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
 	movl %eax,  (v)
 	movl %edx, 4(v)
 	movl $1, %eax
@@ -183,6 +313,11 @@  BEGIN(dec_if_positive)
 	movl 4(v), %edx
 	subl $1, %eax
 	sbbl $0, %edx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 1f)
+#endif
 	js 1f
 	movl %eax,  (v)
 	movl %edx, 4(v)
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index db3ae854..5bd864e 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -22,9 +22,19 @@ 
 
 ENTRY(atomic64_read_cx8)
 	read64 %ecx
+	/* PaX has pax_force_retaddr here;
+	 * do we want something similar? If yes, changes
+	 * have to be made in more places below. */
 	ret
 ENDPROC(atomic64_read_cx8)
 
+ENTRY(atomic64_read_wrap_cx8)
+	read64 %ecx
+/* do we want something like the line below?
+ *	pax_force_retaddr */
+	ret
+ENDPROC(atomic64_read_wrap_cx8)
+
 ENTRY(atomic64_set_cx8)
 1:
 /* we don't need LOCK_PREFIX since aligned 64-bit writes
@@ -35,6 +45,17 @@  ENTRY(atomic64_set_cx8)
 	ret
 ENDPROC(atomic64_set_cx8)
 
+ENTRY(atomic64_set_wrap_cx8)
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+	cmpxchg8b (%esi)
+	jne 1b
+
+	/* pax_force_retaddr */
+	ret
+ENDPROC(atomic64_set_wrap_cx8)
+
 ENTRY(atomic64_xchg_cx8)
 1:
 	LOCK_PREFIX
@@ -44,8 +65,8 @@  ENTRY(atomic64_xchg_cx8)
 	ret
 ENDPROC(atomic64_xchg_cx8)
 
-.macro addsub_return func ins insc
-ENTRY(atomic64_\func\()_return_cx8)
+.macro addsub_return func ins insc wrap=""
+ENTRY(atomic64_\func\()_return\wrap\()_cx8)
 	pushl %ebp
 	pushl %ebx
 	pushl %esi
@@ -61,6 +82,13 @@  ENTRY(atomic64_\func\()_return_cx8)
 	movl %edx, %ecx
 	\ins\()l %esi, %ebx
 	\insc\()l %edi, %ecx
+#ifdef CONFIG_HARDENED_ATOMIC
+.ifb \wrap
+	into
+2:
+	_ASM_EXTABLE(2b, 3f)
+.endif
+#endif
 	LOCK_PREFIX
 	cmpxchg8b (%ebp)
 	jne 1b
@@ -68,19 +96,27 @@  ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
+
+.ifb \wrap
+#ifdef CONFIG_HARDENED_ATOMIC
+3:
+#endif
+.endif
 	popl %edi
 	popl %esi
 	popl %ebx
 	popl %ebp
 	ret
-ENDPROC(atomic64_\func\()_return_cx8)
+ENDPROC(atomic64_\func\()_return\wrap\()_cx8)
 .endm
 
 addsub_return add add adc
 addsub_return sub sub sbb
+addsub_return add add adc _wrap
+addsub_return sub sub sbb _wrap
 
-.macro incdec_return func ins insc
-ENTRY(atomic64_\func\()_return_cx8)
+.macro incdec_return func ins insc wrap=""
+ENTRY(atomic64_\func\()_return\wrap\()_cx8)
 	pushl %ebx
 
 	read64 %esi
@@ -89,6 +125,13 @@  ENTRY(atomic64_\func\()_return_cx8)
 	movl %edx, %ecx
 	\ins\()l $1, %ebx
 	\insc\()l $0, %ecx
+#ifdef CONFIG_HARDENED_ATOMIC
+.ifb \wrap
+	into
+2:
+	_ASM_EXTABLE(2b, 3f)
+.endif
+#endif
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b
@@ -96,13 +139,21 @@  ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
+
+.ifb \wrap
+#ifdef CONFIG_HARDENED_ATOMIC
+3:
+#endif
+.endif
 	popl %ebx
 	ret
-ENDPROC(atomic64_\func\()_return_cx8)
+ENDPROC(atomic64_\func\()_return\wrap\()_cx8)
 .endm
 
 incdec_return inc add adc
 incdec_return dec sub sbb
+incdec_return inc add adc _wrap
+incdec_return dec sub sbb _wrap
 
 ENTRY(atomic64_dec_if_positive_cx8)
 	pushl %ebx
@@ -113,6 +164,11 @@  ENTRY(atomic64_dec_if_positive_cx8)
 	movl %edx, %ecx
 	subl $1, %ebx
 	sbb $0, %ecx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 2f)
+#endif
 	js 2f
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
@@ -144,6 +200,11 @@  ENTRY(atomic64_add_unless_cx8)
 	movl %edx, %ecx
 	addl %ebp, %ebx
 	adcl %edi, %ecx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 3f)
+#endif
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b
@@ -173,6 +234,11 @@  ENTRY(atomic64_inc_not_zero_cx8)
 	xorl %ecx, %ecx
 	addl $1, %ebx
 	adcl %edx, %ecx
+#ifdef CONFIG_HARDENED_ATOMIC
+	into
+1234:
+	_ASM_EXTABLE(1234b, 3f)
+#endif
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b