[bpf-next,v4,07/11] bpf: Add instructions for atomic_[cmp]xchg

Message ID 20201207160734.2345502-8-jackmanb@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series Atomics for eBPF

Checks

Context                         Check    Description
netdev/cover_letter             success
netdev/fixes_present            success
netdev/patch_count              success
netdev/tree_selection           success  Clearly marked for bpf-next
netdev/subject_prefix           success
netdev/source_inline            success  Was 0 now: 0
netdev/verify_signedoff         success
netdev/module_param             success  Was 0 now: 0
netdev/build_32bit              success  Errors and warnings before: 15663 this patch: 15663
netdev/kdoc                     success  Errors and warnings before: 0 this patch: 0
netdev/verify_fixes             success
netdev/checkpatch               warning  CHECK: Lines should not end with a '('; CHECK: No space is necessary after a cast; WARNING: line length of 104 exceeds 80 columns; WARNING: line length of 110 exceeds 80 columns; WARNING: line length of 84 exceeds 80 columns
netdev/build_allmodconfig_warn  success  Errors and warnings before: 15328 this patch: 15328
netdev/header_inline            success
netdev/stable                   success  Stable not CCed

Commit Message

Brendan Jackman Dec. 7, 2020, 4:07 p.m. UTC
This adds two atomic opcodes, both of which include the BPF_FETCH
flag. XCHG without the BPF_FETCH flag would naturally encode
atomic_set. This is not supported because it would be of limited
value to userspace (it doesn't imply any barriers). CMPXCHG without
BPF_FETCH would be an atomic compare-and-write. We don't have such
an operation in the kernel so it isn't provided to BPF either.

There are two significant design decisions made for the CMPXCHG
instruction:

 - This operation fundamentally has 3 operands, but we only have
   two register fields, so the operand we compare against (the
   kernel's API calls it 'old') is hard-coded to be R0. x86 has a
   similar design (and A64 doesn't have this problem).

   A potential alternative might be to encode the other operand's
   register number in the immediate field.

 - The kernel's atomic_cmpxchg returns the old value, while the C11
   userspace APIs return a boolean indicating the comparison
   result. Which should BPF do? A64 returns the old value. x86 returns
   the old value in the hard-coded register (and also sets a
   flag). That means return-old-value is easier to JIT, so that is
   what BPF does: CMPXCHG returns the old value in R0.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 arch/x86/net/bpf_jit_comp.c    |  8 ++++++++
 include/linux/filter.h         | 22 ++++++++++++++++++++++
 include/uapi/linux/bpf.h       |  4 +++-
 kernel/bpf/core.c              | 20 ++++++++++++++++++++
 kernel/bpf/disasm.c            | 15 +++++++++++++++
 kernel/bpf/verifier.c          | 19 +++++++++++++++++--
 tools/include/linux/filter.h   | 22 ++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  4 +++-
 8 files changed, 110 insertions(+), 4 deletions(-)
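
For a concrete feel of the new encodings, here is a minimal sketch (not
part of the patch) that assembles the two instructions with the
BPF_ATOMIC_XCHG / BPF_ATOMIC_CMPXCHG macros this patch adds to
(tools/)include/linux/filter.h; the BPF_MOV64_IMM / BPF_EXIT_INSN macros
and BPF_REG_* numbers are the pre-existing ones from that header and the
UAPI:

	struct bpf_insn prog[] = {
		/* Assumes r2 already holds a pointer to 8-byte-aligned
		 * memory, e.g. a map value.
		 */
		BPF_MOV64_IMM(BPF_REG_0, 0),	/* r0 = expected old value */
		BPF_MOV64_IMM(BPF_REG_1, 1),	/* r1 = new value */
		/* r0 = atomic_cmpxchg((u64 *)(r2 + 0), r0, r1) */
		BPF_ATOMIC_CMPXCHG(BPF_DW, BPF_REG_2, BPF_REG_1, 0),
		/* r1 = atomic_xchg((u64 *)(r2 + 0), r1) */
		BPF_ATOMIC_XCHG(BPF_DW, BPF_REG_2, BPF_REG_1, 0),
		BPF_EXIT_INSN(),
	};

Note how 'old' rides in r0 both on the way in (the comparison value) and
on the way out (the returned value), mirroring the hard-coding described
above.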

Comments

Yonghong Song Dec. 8, 2020, 1:44 a.m. UTC | #1
On 12/7/20 8:07 AM, Brendan Jackman wrote:
> This adds two atomic opcodes, both of which include the BPF_FETCH
> flag. XCHG without the BPF_FETCH flag would naturally encode
> atomic_set. This is not supported because it would be of limited
> value to userspace (it doesn't imply any barriers). CMPXCHG without
> BPF_FETCH would be an atomic compare-and-write. We don't have such
> an operation in the kernel so it isn't provided to BPF either.
> 
> There are two significant design decisions made for the CMPXCHG
> instruction:
> 
>   - To solve the issue that this operation fundamentally has 3
>     operands, but we only have two register fields. Therefore the
>     operand we compare against (the kernel's API calls it 'old') is
>     hard-coded to be R0. x86 has similar design (and A64 doesn't
>     have this problem).
> 
>     A potential alternative might be to encode the other operand's
>     register number in the immediate field.
> 
>   - The kernel's atomic_cmpxchg returns the old value, while the C11
>     userspace APIs return a boolean indicating the comparison
>     result. Which should BPF do? A64 returns the old value. x86 returns
>     the old value in the hard-coded register (and also sets a
>     flag). That means return-old-value is easier to JIT.
> 
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---
>   arch/x86/net/bpf_jit_comp.c    |  8 ++++++++
>   include/linux/filter.h         | 22 ++++++++++++++++++++++
>   include/uapi/linux/bpf.h       |  4 +++-
>   kernel/bpf/core.c              | 20 ++++++++++++++++++++
>   kernel/bpf/disasm.c            | 15 +++++++++++++++
>   kernel/bpf/verifier.c          | 19 +++++++++++++++++--
>   tools/include/linux/filter.h   | 22 ++++++++++++++++++++++
>   tools/include/uapi/linux/bpf.h |  4 +++-
>   8 files changed, 110 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index eea7d8b0bb12..308241187582 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
>   		/* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
>   		EMIT2(0x0F, 0xC1);
>   		break;
> +	case BPF_XCHG:
> +		/* src_reg = atomic_xchg(dst_reg + off, src_reg); */
> +		EMIT1(0x87);
> +		break;
> +	case BPF_CMPXCHG:
> +		/* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
> +		EMIT2(0x0F, 0xB1);
> +		break;
>   	default:
>   		pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
>   		return -EFAULT;
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index b5258bca10d2..e1e1fc946a7c 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
>    *
>    *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
>    *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
> + *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
> + *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
>    */
>   
>   #define BPF_ATOMIC64(OP, DST, SRC, OFF)				\
> @@ -293,6 +295,26 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
>   		.off   = OFF,					\
>   		.imm   = BPF_ADD })
>   
> +/* Atomic exchange, src_reg = atomic_xchg(dst_reg + off, src_reg) */
> +
> +#define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_XCHG  })
> +
> +/* Atomic compare-exchange, r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg) */
> +
> +#define BPF_ATOMIC_CMPXCHG(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_CMPXCHG })

Define BPF_ATOMIC_{XCHG, CMPXCHG} based on BPF_ATOMIC macro?
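
For reference, one possible shape of that consolidation, sketched with a
hypothetical generic helper (the name BPF_ATOMIC_OP is illustrative, it
is not something this v4 patch defines):

	#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
		((struct bpf_insn) {					\
			.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
			.dst_reg = DST,					\
			.src_reg = SRC,					\
			.off   = OFF,					\
			.imm   = OP })

	#define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)	\
		BPF_ATOMIC_OP(SIZE, BPF_XCHG, DST, SRC, OFF)
	#define BPF_ATOMIC_CMPXCHG(SIZE, DST, SRC, OFF)	\
		BPF_ATOMIC_OP(SIZE, BPF_CMPXCHG, DST, SRC, OFF)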

> +
>   /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
>   
>   #define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d5389119291e..b733af50a5b9 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -45,7 +45,9 @@
>   #define BPF_EXIT	0x90	/* function return */
>   
>   /* atomic op type fields (stored in immediate) */
> -#define BPF_FETCH	0x01	/* fetch previous value into src reg */
> +#define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
> +#define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
> +#define BPF_FETCH	0x01	/* not an opcode on its own, used to build others */
>   
>   /* Register numbers */
>   enum {
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 61e93eb7d363..28f960bc2e30 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1630,6 +1630,16 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
>   				(u32) SRC,
>   				(atomic_t *)(unsigned long) (DST + insn->off));
>   			break;
> +		case BPF_XCHG:
> +			SRC = (u32) atomic_xchg(
> +				(atomic_t *)(unsigned long) (DST + insn->off),
> +				(u32) SRC);
> +			break;
> +		case BPF_CMPXCHG:
> +			BPF_R0 = (u32) atomic_cmpxchg(
> +				(atomic_t *)(unsigned long) (DST + insn->off),
> +				(u32) BPF_R0, (u32) SRC);
> +			break;
>   		default:
>   			goto default_label;
>   		}
[...]
John Fastabend Dec. 8, 2020, 6:37 a.m. UTC | #2
Brendan Jackman wrote:
> This adds two atomic opcodes, both of which include the BPF_FETCH
> flag. XCHG without the BPF_FETCH flag would naturally encode
> atomic_set. This is not supported because it would be of limited
> value to userspace (it doesn't imply any barriers). CMPXCHG without
> BPF_FETCH would be an atomic compare-and-write. We don't have such
> an operation in the kernel so it isn't provided to BPF either.
> 
> There are two significant design decisions made for the CMPXCHG
> instruction:
> 
>  - To solve the issue that this operation fundamentally has 3
>    operands, but we only have two register fields. Therefore the
>    operand we compare against (the kernel's API calls it 'old') is
>    hard-coded to be R0. x86 has similar design (and A64 doesn't
>    have this problem).
> 
>    A potential alternative might be to encode the other operand's
>    register number in the immediate field.
> 
>  - The kernel's atomic_cmpxchg returns the old value, while the C11
>    userspace APIs return a boolean indicating the comparison
>    result. Which should BPF do? A64 returns the old value. x86 returns
>    the old value in the hard-coded register (and also sets a
>    flag). That means return-old-value is easier to JIT.

Just a nit as it looks like perhaps we get one more spin here. Would
be nice to be explicit about what the code does here. The above reads
like it could go either way. Just an extra line "So we use ..." would
be enough.

> 
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---

One question below.

>  arch/x86/net/bpf_jit_comp.c    |  8 ++++++++
>  include/linux/filter.h         | 22 ++++++++++++++++++++++
>  include/uapi/linux/bpf.h       |  4 +++-
>  kernel/bpf/core.c              | 20 ++++++++++++++++++++
>  kernel/bpf/disasm.c            | 15 +++++++++++++++
>  kernel/bpf/verifier.c          | 19 +++++++++++++++++--
>  tools/include/linux/filter.h   | 22 ++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  4 +++-
>  8 files changed, 110 insertions(+), 4 deletions(-)
> 

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f8c4e809297d..f5f4460b3e4e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3608,11 +3608,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  
>  static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
>  {
> +	int load_reg;
>  	int err;
>  
>  	switch (insn->imm) {
>  	case BPF_ADD:
>  	case BPF_ADD | BPF_FETCH:
> +	case BPF_XCHG:
> +	case BPF_CMPXCHG:
>  		break;
>  	default:
>  		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
> @@ -3634,6 +3637,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
>  	if (err)
>  		return err;
>  
> +	if (insn->imm == BPF_CMPXCHG) {
> +		/* Check comparison of R0 with memory location */
> +		err = check_reg_arg(env, BPF_REG_0, SRC_OP);
> +		if (err)
> +			return err;
> +	}
> +

I need to think a bit more about it, but do we need to update is_reg64()
at all for these?

>  	if (is_pointer_value(env, insn->src_reg)) {
>  		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
>  		return -EACCES;
> @@ -3664,8 +3674,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
>  	if (!(insn->imm & BPF_FETCH))
>  		return 0;
>  
> -	/* check and record load of old value into src reg  */
> -	err = check_reg_arg(env, insn->src_reg, DST_OP);
> +	if (insn->imm == BPF_CMPXCHG)
> +		load_reg = BPF_REG_0;
> +	else
> +		load_reg = insn->src_reg;
> +
> +	/* check and record load of old value */
> +	err = check_reg_arg(env, load_reg, DST_OP);
>  	if (err)
>  		return err;
John Fastabend Dec. 8, 2020, 6:42 a.m. UTC | #3
Brendan Jackman wrote:
> This adds two atomic opcodes, both of which include the BPF_FETCH
> flag. XCHG without the BPF_FETCH flag would naturally encode
> atomic_set. This is not supported because it would be of limited
> value to userspace (it doesn't imply any barriers). CMPXCHG without
> BPF_FETCH would be an atomic compare-and-write. We don't have such
> an operation in the kernel so it isn't provided to BPF either.
> 
> There are two significant design decisions made for the CMPXCHG
> instruction:
> 
>  - To solve the issue that this operation fundamentally has 3
>    operands, but we only have two register fields. Therefore the
>    operand we compare against (the kernel's API calls it 'old') is
>    hard-coded to be R0. x86 has similar design (and A64 doesn't
>    have this problem).
> 
>    A potential alternative might be to encode the other operand's
>    register number in the immediate field.
> 
>  - The kernel's atomic_cmpxchg returns the old value, while the C11
>    userspace APIs return a boolean indicating the comparison
>    result. Which should BPF do? A64 returns the old value. x86 returns
>    the old value in the hard-coded register (and also sets a
>    flag). That means return-old-value is easier to JIT.
> 
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---

Sorry if this is a dup, client crashed while I sent the previous version
and don't see it on the list.

> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3608,11 +3608,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  
>  static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
>  {
> +	int load_reg;
>  	int err;
>  
>  	switch (insn->imm) {
>  	case BPF_ADD:
>  	case BPF_ADD | BPF_FETCH:
> +	case BPF_XCHG:
> +	case BPF_CMPXCHG:
>  		break;
>  	default:
>  		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
> @@ -3634,6 +3637,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
>  	if (err)
>  		return err;
>  
> +	if (insn->imm == BPF_CMPXCHG) {
> +		/* Check comparison of R0 with memory location */
> +		err = check_reg_arg(env, BPF_REG_0, SRC_OP);
> +		if (err)
> +			return err;
> +	}
> +

Need to think a bit more on this, but do we need to update is_reg64() here
as well?

>  	if (is_pointer_value(env, insn->src_reg)) {
>  		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
>  		return -EACCES;
> @@ -3664,8 +3674,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
>  	if (!(insn->imm & BPF_FETCH))
>  		return 0;
>  
> -	/* check and record load of old value into src reg  */
> -	err = check_reg_arg(env, insn->src_reg, DST_OP);
> +	if (insn->imm == BPF_CMPXCHG)
> +		load_reg = BPF_REG_0;
> +	else
> +		load_reg = insn->src_reg;
> +
> +	/* check and record load of old value */
> +	err = check_reg_arg(env, load_reg, DST_OP);
>  	if (err)
>  		return err;
>  

Thanks,
John
Brendan Jackman Dec. 14, 2020, 3:39 p.m. UTC | #4
Seems I never replied to this, thanks for the reviews!

On Mon, Dec 07, 2020 at 10:37:32PM -0800, John Fastabend wrote:
> Brendan Jackman wrote:
> > This adds two atomic opcodes, both of which include the BPF_FETCH
> > flag. XCHG without the BPF_FETCH flag would naturally encode
> > atomic_set. This is not supported because it would be of limited
> > value to userspace (it doesn't imply any barriers). CMPXCHG without
> > BPF_FETCH would be an atomic compare-and-write. We don't have such
> > an operation in the kernel so it isn't provided to BPF either.
> > 
> > There are two significant design decisions made for the CMPXCHG
> > instruction:
> > 
> >  - To solve the issue that this operation fundamentally has 3
> >    operands, but we only have two register fields. Therefore the
> >    operand we compare against (the kernel's API calls it 'old') is
> >    hard-coded to be R0. x86 has similar design (and A64 doesn't
> >    have this problem).
> > 
> >    A potential alternative might be to encode the other operand's
> >    register number in the immediate field.
> > 
> >  - The kernel's atomic_cmpxchg returns the old value, while the C11
> >    userspace APIs return a boolean indicating the comparison
> >    result. Which should BPF do? A64 returns the old value. x86 returns
> >    the old value in the hard-coded register (and also sets a
> >    flag). That means return-old-value is easier to JIT.
> 
> Just a nit as it looks like perhaps we get one more spin here. Would
> be nice to be explicit about what the code does here. The above reads
> like it could go either way. Just an extra line "So we use ..." would
> be enough.

Ack, adding the note.

> > Signed-off-by: Brendan Jackman <jackmanb@google.com>
> > ---
> 
> One question below.
> 
> >  arch/x86/net/bpf_jit_comp.c    |  8 ++++++++
> >  include/linux/filter.h         | 22 ++++++++++++++++++++++
> >  include/uapi/linux/bpf.h       |  4 +++-
> >  kernel/bpf/core.c              | 20 ++++++++++++++++++++
> >  kernel/bpf/disasm.c            | 15 +++++++++++++++
> >  kernel/bpf/verifier.c          | 19 +++++++++++++++++--
> >  tools/include/linux/filter.h   | 22 ++++++++++++++++++++++
> >  tools/include/uapi/linux/bpf.h |  4 +++-
> >  8 files changed, 110 insertions(+), 4 deletions(-)
> > 
> 
> [...]
> 
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index f8c4e809297d..f5f4460b3e4e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3608,11 +3608,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
> >  
> >  static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
> >  {
> > +	int load_reg;
> >  	int err;
> >  
> >  	switch (insn->imm) {
> >  	case BPF_ADD:
> >  	case BPF_ADD | BPF_FETCH:
> > +	case BPF_XCHG:
> > +	case BPF_CMPXCHG:
> >  		break;
> >  	default:
> >  		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
> > @@ -3634,6 +3637,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
> >  	if (err)
> >  		return err;
> >  
> > +	if (insn->imm == BPF_CMPXCHG) {
> > +		/* Check comparison of R0 with memory location */
> > +		err = check_reg_arg(env, BPF_REG_0, SRC_OP);
> > +		if (err)
> > +			return err;
> > +	}
> > +
> 
> I need to think a bit more about it, but do we need to update is_reg64()
> at all for these?

I don't think so - this all falls into the same
`if (class == BPF_STX)` case as the existing BPF_STX_XADD instruction.

> >  	if (is_pointer_value(env, insn->src_reg)) {
> >  		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
> >  		return -EACCES;
> > @@ -3664,8 +3674,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
> >  	if (!(insn->imm & BPF_FETCH))
> >  		return 0;
> >  
> > -	/* check and record load of old value into src reg  */
> > -	err = check_reg_arg(env, insn->src_reg, DST_OP);
> > +	if (insn->imm == BPF_CMPXCHG)
> > +		load_reg = BPF_REG_0;
> > +	else
> > +		load_reg = insn->src_reg;
> > +
> > +	/* check and record load of old value */
> > +	err = check_reg_arg(env, load_reg, DST_OP);
> >  	if (err)
> >  		return err;

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index eea7d8b0bb12..308241187582 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -815,6 +815,14 @@  static int emit_atomic(u8 **pprog, u8 atomic_op,
 		/* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
 		EMIT2(0x0F, 0xC1);
 		break;
+	case BPF_XCHG:
+		/* src_reg = atomic_xchg(dst_reg + off, src_reg); */
+		EMIT1(0x87);
+		break;
+	case BPF_CMPXCHG:
+		/* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
+		EMIT2(0x0F, 0xB1);
+		break;
 	default:
 		pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
 		return -EFAULT;
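
For orientation, a rough sketch of the native instructions these opcode
bytes select (illustration only, not code from the patch; the lock
prefix and ModRM bytes come from the surrounding emit_atomic() logic
added earlier in the series, and the BPF-to-x86 register mapping is the
JIT's usual one, notably BPF r0 living in rax):

	/*
	 *   BPF_XCHG:     lock xchg    %<src>, off(%<dst>)   ; 0x87
	 *   BPF_CMPXCHG:  lock cmpxchg %<src>, off(%<dst>)   ; 0x0f 0xb1
	 *
	 * cmpxchg compares memory against rax and leaves the old memory
	 * value in rax either way, which is why hard-coding the BPF
	 * comparand/result register to r0 makes this a single
	 * instruction to JIT.
	 */
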
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b5258bca10d2..e1e1fc946a7c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -265,6 +265,8 @@  static inline bool insn_is_zext(const struct bpf_insn *insn)
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
  *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
+ *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
  */
 
 #define BPF_ATOMIC64(OP, DST, SRC, OFF)				\
@@ -293,6 +295,26 @@  static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = BPF_ADD })
 
+/* Atomic exchange, src_reg = atomic_xchg(dst_reg + off, src_reg) */
+
+#define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_XCHG  })
+
+/* Atomic compare-exchange, r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg) */
+
+#define BPF_ATOMIC_CMPXCHG(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_CMPXCHG })
+
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
 #define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d5389119291e..b733af50a5b9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -45,7 +45,9 @@ 
 #define BPF_EXIT	0x90	/* function return */
 
 /* atomic op type fields (stored in immediate) */
-#define BPF_FETCH	0x01	/* fetch previous value into src reg */
+#define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
+#define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
+#define BPF_FETCH	0x01	/* not an opcode on its own, used to build others */
 
 /* Register numbers */
 enum {
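
Spelled out, the resulting immediate values are:

	BPF_XCHG    = 0xe0 | BPF_FETCH = 0xe1
	BPF_CMPXCHG = 0xf0 | BPF_FETCH = 0xf1

0xe0 and 0xf0 are the two values in the BPF_OP field space not already
taken by ALU opcodes, and having the fetch bit set by construction is
what lets the verifier's single BPF_FETCH check below cover both new
instructions.
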
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 61e93eb7d363..28f960bc2e30 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1630,6 +1630,16 @@  static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 				(u32) SRC,
 				(atomic_t *)(unsigned long) (DST + insn->off));
 			break;
+		case BPF_XCHG:
+			SRC = (u32) atomic_xchg(
+				(atomic_t *)(unsigned long) (DST + insn->off),
+				(u32) SRC);
+			break;
+		case BPF_CMPXCHG:
+			BPF_R0 = (u32) atomic_cmpxchg(
+				(atomic_t *)(unsigned long) (DST + insn->off),
+				(u32) BPF_R0, (u32) SRC);
+			break;
 		default:
 			goto default_label;
 		}
@@ -1647,6 +1657,16 @@  static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 				(u64) SRC,
 				(atomic64_t *)(s64) (DST + insn->off));
 			break;
+		case BPF_XCHG:
+			SRC = (u64) atomic64_xchg(
+				(atomic64_t *)(u64) (DST + insn->off),
+				(u64) SRC);
+			break;
+		case BPF_CMPXCHG:
+			BPF_R0 = (u64) atomic64_cmpxchg(
+				(atomic64_t *)(u64) (DST + insn->off),
+				(u64) BPF_R0, (u64) SRC);
+			break;
 		default:
 			goto default_label;
 		}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d2e20f6d0516..ee8d1132767b 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -167,6 +167,21 @@  void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off, insn->src_reg);
+		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+			   insn->imm == BPF_CMPXCHG) {
+			verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n",
+				insn->code,
+				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+			   insn->imm == BPF_XCHG) {
+			verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n",
+				insn->code, insn->src_reg,
+				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off, insn->src_reg);
 		} else {
 			verbose(cbs->private_data, "BUG_%02x\n", insn->code);
 		}
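
Plugging the new cases into those format strings, the verifier log for
a BPF_DW cmpxchg and a BPF_W xchg would read roughly like this (opcode
0xdb is BPF_STX | BPF_DW | BPF_ATOMIC, 0xc3 is BPF_STX | BPF_W |
BPF_ATOMIC; the register numbers are just an example):

	(db) r0 = atomic64_cmpxchg((u64 *)(r1 +0), r0, r2)
	(c3) r2 = atomic_xchg((u32 *)(r1 +0), r2)
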
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f8c4e809297d..f5f4460b3e4e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3608,11 +3608,14 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
 {
+	int load_reg;
 	int err;
 
 	switch (insn->imm) {
 	case BPF_ADD:
 	case BPF_ADD | BPF_FETCH:
+	case BPF_XCHG:
+	case BPF_CMPXCHG:
 		break;
 	default:
 		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
@@ -3634,6 +3637,13 @@  static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	if (err)
 		return err;
 
+	if (insn->imm == BPF_CMPXCHG) {
+		/* Check comparison of R0 with memory location */
+		err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+		if (err)
+			return err;
+	}
+
 	if (is_pointer_value(env, insn->src_reg)) {
 		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
 		return -EACCES;
@@ -3664,8 +3674,13 @@  static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	if (!(insn->imm & BPF_FETCH))
 		return 0;
 
-	/* check and record load of old value into src reg  */
-	err = check_reg_arg(env, insn->src_reg, DST_OP);
+	if (insn->imm == BPF_CMPXCHG)
+		load_reg = BPF_REG_0;
+	else
+		load_reg = insn->src_reg;
+
+	/* check and record load of old value */
+	err = check_reg_arg(env, load_reg, DST_OP);
 	if (err)
 		return err;
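
One user-visible consequence of the R0 handling above: r0 is read as
the comparison value (SRC_OP) and then overwritten with the old memory
value (DST_OP), so a program must initialise r0 before a CMPXCHG and
must not rely on its previous contents afterwards. A sketch, assuming
r2 holds a valid pointer and r1 the new value:

	/* Rejected: r0 is read while still uninitialised */
	BPF_ATOMIC_CMPXCHG(BPF_DW, BPF_REG_2, BPF_REG_1, 0),

	/* Accepted shape: give r0 the expected old value first */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_ATOMIC_CMPXCHG(BPF_DW, BPF_REG_2, BPF_REG_1, 0),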
 
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index 4e0100ba52c2..21598053fd40 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -174,6 +174,8 @@ 
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
  *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
+ *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
  */
 
 #define BPF_ATOMIC64(OP, DST, SRC, OFF)				\
@@ -212,6 +214,26 @@ 
 		.off   = OFF,					\
 		.imm   = BPF_ADD | BPF_FETCH })
 
+/* Atomic exchange, src_reg = atomic_xchg(dst_reg + off, src_reg) */
+
+#define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_XCHG })
+
+/* Atomic compare-exchange, r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg) */
+
+#define BPF_ATOMIC_CMPXCHG(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_CMPXCHG })
+
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
 #define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d5389119291e..b733af50a5b9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -45,7 +45,9 @@ 
 #define BPF_EXIT	0x90	/* function return */
 
 /* atomic op type fields (stored in immediate) */
-#define BPF_FETCH	0x01	/* fetch previous value into src reg */
+#define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
+#define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
+#define BPF_FETCH	0x01	/* not an opcode on its own, used to build others */
 
 /* Register numbers */
 enum {
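
Finally, for how programs reach these instructions from C: with a
Clang/LLVM recent enough to emit the new BPF atomics, the GCC-style
builtins map onto them, __sync_lock_test_and_set() becoming XCHG and
__sync_val_compare_and_swap() becoming CMPXCHG. A hypothetical fragment
(names made up for illustration):

	#include <linux/types.h>

	/* old = atomic_xchg(slot, 1) via the new XCHG instruction */
	static __u64 set_busy(__u64 *slot)
	{
		return __sync_lock_test_and_set(slot, 1);
	}

	/* Classic compare-and-swap retry loop built on the new CMPXCHG:
	 * bump *counter, but never past limit.
	 */
	static void add_if_below(__u64 *counter, __u64 limit)
	{
		__u64 old = *counter;

		while (old < limit) {
			__u64 seen = __sync_val_compare_and_swap(counter, old, old + 1);

			if (seen == old)
				break;		/* CAS won: memory held 'old' */
			old = seen;		/* lost the race: retry with the fresh value */
		}
	}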