
[bpf-next,v3,10/14] bpf: Add bitwise atomic instructions

Message ID 20201203160245.1014867-11-jackmanb@google.com (mailing list archive)
State Superseded
Delegated to: BPF
Series Atomics for eBPF

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 4938 this patch: 4938
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch fail CHECK: Blank lines aren't necessary before a close brace '}' ERROR: Remove Gerrit Change-Id's before submitting upstream WARNING: line length of 109 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 5306 this patch: 5306
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Brendan Jackman Dec. 3, 2020, 4:02 p.m. UTC
This adds instructions for

atomic[64]_[fetch_]and
atomic[64]_[fetch_]or
atomic[64]_[fetch_]xor

All these operations are isomorphic enough to implement with the same
verifier, interpreter, and x86 JIT code, hence being a single commit.

The main interesting thing here is that x86 doesn't directly support
the fetch_ version of these operations, so we need to generate a
CMPXCHG loop in the JIT. This requires two temporary registers; IIUC
it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
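
For illustration, the loop the JIT generates has roughly the semantics
of the following C sketch (the function name and helpers here are
illustrative only, not part of the patch):

	static u64 fetch_and_sketch(u64 *addr, u64 src)
	{
		u64 old, new;

		do {
			old = READ_ONCE(*addr);	/* load the old value */
			new = old & src;	/* apply the ALU op locally */
		} while (cmpxchg(addr, old, new) != old);	/* lost the race, retry */

		return old;	/* the fetch_ variants return the pre-modification value */
	}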

Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
 include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c            |  5 ++-
 kernel/bpf/disasm.c          | 21 ++++++++++---
 kernel/bpf/verifier.c        |  6 ++++
 tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
 6 files changed, 196 insertions(+), 6 deletions(-)

Comments

Yonghong Song Dec. 4, 2020, 6:42 a.m. UTC | #1
On 12/3/20 8:02 AM, Brendan Jackman wrote:
> This adds instructions for
> 
> atomic[64]_[fetch_]and
> atomic[64]_[fetch_]or
> atomic[64]_[fetch_]xor
> 
> All these operations are isomorphic enough to implement with the same
> verifier, interpreter, and x86 JIT code, hence being a single commit.
> 
> The main interesting thing here is that x86 doesn't directly support
> the fetch_ version these operations, so we need to generate a CMPXCHG
> loop in the JIT. This requires the use of two temporary registers,
> IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
> 
> Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---
>   arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
>   include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
>   kernel/bpf/core.c            |  5 ++-
>   kernel/bpf/disasm.c          | 21 ++++++++++---
>   kernel/bpf/verifier.c        |  6 ++++
>   tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
>   6 files changed, 196 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 7d29bc3bb4ff..4ab0f821326c 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -824,6 +824,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
>   	/* emit opcode */
>   	switch (atomic_op) {
>   	case BPF_ADD:
> +	case BPF_SUB:
> +	case BPF_AND:
> +	case BPF_OR:
> +	case BPF_XOR:
>   		/* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
>   		EMIT1(simple_alu_opcodes[atomic_op]);
>   		break;
> @@ -1306,8 +1310,52 @@ st:			if (is_imm8(insn->off))
>   
>   		case BPF_STX | BPF_ATOMIC | BPF_W:
>   		case BPF_STX | BPF_ATOMIC | BPF_DW:
> +			if (insn->imm == (BPF_AND | BPF_FETCH) ||
> +			    insn->imm == (BPF_OR | BPF_FETCH) ||
> +			    insn->imm == (BPF_XOR | BPF_FETCH)) {
> +				u8 *branch_target;
> +				bool is64 = BPF_SIZE(insn->code) == BPF_DW;
> +
> +				/*
> +				 * Can't be implemented with a single x86 insn.
> +				 * Need to do a CMPXCHG loop.
> +				 */
> +
> +				/* Will need RAX as a CMPXCHG operand so save R0 */
> +				emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
> +				branch_target = prog;
> +				/* Load old value */
> +				emit_ldx(&prog, BPF_SIZE(insn->code),
> +					 BPF_REG_0, dst_reg, insn->off);
> +				/*
> +				 * Perform the (commutative) operation locally,
> +				 * put the result in the AUX_REG.
> +				 */
> +				emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
> +				maybe_emit_mod(&prog, AUX_REG, src_reg, is64);
> +				EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
> +				      add_2reg(0xC0, AUX_REG, src_reg));
> +				/* Attempt to swap in new value */
> +				err = emit_atomic(&prog, BPF_CMPXCHG,
> +						  dst_reg, AUX_REG, insn->off,
> +						  BPF_SIZE(insn->code));
> +				if (WARN_ON(err))
> +					return err;
> +				/*
> +				 * ZF tells us whether we won the race. If it's
> +				 * cleared we need to try again.
> +				 */
> +				EMIT2(X86_JNE, -(prog - branch_target) - 2);
> +				/* Return the pre-modification value */
> +				emit_mov_reg(&prog, is64, src_reg, BPF_REG_0);
> +				/* Restore R0 after clobbering RAX */
> +				emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
> +				break;
> +
> +			}
> +
>   			err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
> -					  insn->off, BPF_SIZE(insn->code));
> +						  insn->off, BPF_SIZE(insn->code));
>   			if (err)
>   				return err;
>   			break;
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 6186280715ed..698f82897b0d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
>   		.off   = OFF,					\
>   		.imm   = BPF_ADD | BPF_FETCH })
>   
> +/* Atomic memory and, *(uint *)(dst_reg + off16) &= src_reg */
> +
> +#define BPF_ATOMIC_AND(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_AND })
> +
> +/* Atomic memory and with fetch, src_reg = atomic_fetch_and(*(dst_reg + off), src_reg); */

src_reg = atomic_fetch_and(dst_reg + off, src_reg)?

> +
> +#define BPF_ATOMIC_FETCH_AND(SIZE, DST, SRC, OFF)		\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_AND | BPF_FETCH })
> +
> +/* Atomic memory or, *(uint *)(dst_reg + off16) |= src_reg */
> +
> +#define BPF_ATOMIC_OR(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_OR })
> +
> +/* Atomic memory or with fetch, src_reg = atomic_fetch_or(*(dst_reg + off), src_reg); */

src_reg = atomic_fetch_or(dst_reg + off, src_reg)?

> +
> +#define BPF_ATOMIC_FETCH_OR(SIZE, DST, SRC, OFF)		\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_OR | BPF_FETCH })
> +
> +/* Atomic memory xor, *(uint *)(dst_reg + off16) ^= src_reg */
> +
> +#define BPF_ATOMIC_XOR(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_XOR })
> +
> +/* Atomic memory xor with fetch, src_reg = atomic_fetch_xor(*(dst_reg + off), src_reg); */

src_reg = atomic_fetch_xor(dst_reg + off, src_reg)?

> +
> +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_XOR | BPF_FETCH })
> +
>   /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
>   

Looks like BPF_ATOMIC_XOR/OR/AND/... are all similar to each other.
The same is true for BPF_ATOMIC_FETCH_XOR/OR/AND/...

I am wondering whether it makes sense to have
BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and
BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF)
so that we have fewer macros?

>   #define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 498d3f067be7..27eac4d5724c 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1642,7 +1642,10 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
>   	STX_ATOMIC_W:
>   		switch (IMM) {
>   		ATOMIC(BPF_ADD, add)
> -
> +		ATOMIC(BPF_AND, and)
> +		ATOMIC(BPF_OR, or)
> +		ATOMIC(BPF_XOR, xor)
> +#undef ATOMIC
>   		case BPF_XCHG:
>   			if (BPF_SIZE(insn->code) == BPF_W)
>   				SRC = (u32) atomic_xchg(
> diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
> index 18357ea9a17d..0c7c1c31a57b 100644
> --- a/kernel/bpf/disasm.c
> +++ b/kernel/bpf/disasm.c
> @@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = {
>   	[BPF_END >> 4]  = "endian",
>   };
>   
> +static const char *const bpf_atomic_alu_string[16] = {
> +	[BPF_ADD >> 4]  = "add",
> +	[BPF_AND >> 4]  = "and",
> +	[BPF_OR >> 4]  = "or",
> +	[BPF_XOR >> 4]  = "xor",
> +};
> +
>   static const char *const bpf_ldst_string[] = {
>   	[BPF_W >> 3]  = "u32",
>   	[BPF_H >> 3]  = "u16",
> @@ -154,17 +161,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
>   				insn->dst_reg,
>   				insn->off, insn->src_reg);
>   		else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
> -			 insn->imm == BPF_ADD) {
> -			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
> +			 (insn->imm == BPF_ADD || insn->imm == BPF_AND ||
> +			  insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
> +			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
>   				insn->code,
>   				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
>   				insn->dst_reg, insn->off,
> +				bpf_alu_string[BPF_OP(insn->imm) >> 4],
>   				insn->src_reg);
>   		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
> -			   insn->imm == (BPF_ADD | BPF_FETCH)) {
> -			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add(*(%s *)(r%d %+d), r%d)\n",

(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)?

> +			   (insn->imm == (BPF_ADD | BPF_FETCH) ||
> +			    insn->imm == (BPF_AND | BPF_FETCH) ||
> +			    insn->imm == (BPF_OR | BPF_FETCH) ||
> +			    insn->imm == (BPF_XOR | BPF_FETCH))) {
> +			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s(*(%s *)(r%d %+d), r%d)\n",

(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)?

>   				insn->code, insn->src_reg,
>   				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
> +				bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
>   				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
>   				insn->dst_reg, insn->off, insn->src_reg);
>   		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ccf4315e54e7..dd30eb9a6c1b 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3606,6 +3606,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
>   	switch (insn->imm) {
>   	case BPF_ADD:
>   	case BPF_ADD | BPF_FETCH:
> +	case BPF_AND:
> +	case BPF_AND | BPF_FETCH:
> +	case BPF_OR:
> +	case BPF_OR | BPF_FETCH:
> +	case BPF_XOR:
> +	case BPF_XOR | BPF_FETCH:
>   	case BPF_XCHG:
>   	case BPF_CMPXCHG:
>   		break;
> diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
> index ea99bd17d003..b74febf83eb1 100644
> --- a/tools/include/linux/filter.h
> +++ b/tools/include/linux/filter.h
> @@ -190,6 +190,66 @@
>   		.off   = OFF,					\
>   		.imm   = BPF_ADD | BPF_FETCH })
>   
> +/* Atomic memory and, *(uint *)(dst_reg + off16) &= src_reg */
> +
> +#define BPF_ATOMIC_AND(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_AND })
> +
> +/* Atomic memory and with fetch, src_reg = atomic_fetch_and(*(dst_reg + off), src_reg); */
> +
> +#define BPF_ATOMIC_FETCH_AND(SIZE, DST, SRC, OFF)		\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_AND | BPF_FETCH })
> +
> +/* Atomic memory or, *(uint *)(dst_reg + off16) |= src_reg */
> +
> +#define BPF_ATOMIC_OR(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_OR })
> +
> +/* Atomic memory or with fetch, src_reg = atomic_fetch_or(*(dst_reg + off), src_reg); */
> +
> +#define BPF_ATOMIC_FETCH_OR(SIZE, DST, SRC, OFF)		\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_OR | BPF_FETCH })
> +
> +/* Atomic memory xor, *(uint *)(dst_reg + off16) ^= src_reg */
> +
> +#define BPF_ATOMIC_XOR(SIZE, DST, SRC, OFF)			\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_XOR })
> +
> +/* Atomic memory xor with fetch, src_reg = atomic_fetch_xor(*(dst_reg + off), src_reg); */
> +
> +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
> +	((struct bpf_insn) {					\
> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> +		.dst_reg = DST,					\
> +		.src_reg = SRC,					\
> +		.off   = OFF,					\
> +		.imm   = BPF_XOR | BPF_FETCH })
> +
>   /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
>   
>   #define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\
>
Brendan Jackman Dec. 4, 2020, 9:36 a.m. UTC | #2
On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote:
> 
> 
> On 12/3/20 8:02 AM, Brendan Jackman wrote:
> > This adds instructions for
> > 
> > atomic[64]_[fetch_]and
> > atomic[64]_[fetch_]or
> > atomic[64]_[fetch_]xor
> > 
> > All these operations are isomorphic enough to implement with the same
> > verifier, interpreter, and x86 JIT code, hence being a single commit.
> > 
> > The main interesting thing here is that x86 doesn't directly support
> > the fetch_ version these operations, so we need to generate a CMPXCHG
> > loop in the JIT. This requires the use of two temporary registers,
> > IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
> > 
> > Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
> > Signed-off-by: Brendan Jackman <jackmanb@google.com>
> > ---
> >   arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
> >   include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
> >   kernel/bpf/core.c            |  5 ++-
> >   kernel/bpf/disasm.c          | 21 ++++++++++---
> >   kernel/bpf/verifier.c        |  6 ++++
> >   tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
> >   6 files changed, 196 insertions(+), 6 deletions(-)
> > 
[...]
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index 6186280715ed..698f82897b0d 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
[...]
> > +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
> > +	((struct bpf_insn) {					\
> > +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> > +		.dst_reg = DST,					\
> > +		.src_reg = SRC,					\
> > +		.off   = OFF,					\
> > +		.imm   = BPF_XOR | BPF_FETCH })
> > +
> >   /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
> 
> Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other.
> The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/...
> 
> I am wondering whether it makes sence to have to
> BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and
> BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF)
> can have less number of macros?

Hmm yeah I think that's probably a good idea, it would be consistent
with the macros for non-atomic ALU ops.

I don't think 'BOP' would be very clear though, 'ALU' might be more
obvious.
Yonghong Song Dec. 4, 2020, 3:21 p.m. UTC | #3
On 12/4/20 1:36 AM, Brendan Jackman wrote:
> On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote:
>>
>>
>> On 12/3/20 8:02 AM, Brendan Jackman wrote:
>>> This adds instructions for
>>>
>>> atomic[64]_[fetch_]and
>>> atomic[64]_[fetch_]or
>>> atomic[64]_[fetch_]xor
>>>
>>> All these operations are isomorphic enough to implement with the same
>>> verifier, interpreter, and x86 JIT code, hence being a single commit.
>>>
>>> The main interesting thing here is that x86 doesn't directly support
>>> the fetch_ version these operations, so we need to generate a CMPXCHG
>>> loop in the JIT. This requires the use of two temporary registers,
>>> IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
>>>
>>> Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
>>> Signed-off-by: Brendan Jackman <jackmanb@google.com>
>>> ---
>>>    arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
>>>    include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
>>>    kernel/bpf/core.c            |  5 ++-
>>>    kernel/bpf/disasm.c          | 21 ++++++++++---
>>>    kernel/bpf/verifier.c        |  6 ++++
>>>    tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
>>>    6 files changed, 196 insertions(+), 6 deletions(-)
>>>
> [...]
>>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>>> index 6186280715ed..698f82897b0d 100644
>>> --- a/include/linux/filter.h
>>> +++ b/include/linux/filter.h
>>> @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
> [...]
>>> +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
>>> +	((struct bpf_insn) {					\
>>> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
>>> +		.dst_reg = DST,					\
>>> +		.src_reg = SRC,					\
>>> +		.off   = OFF,					\
>>> +		.imm   = BPF_XOR | BPF_FETCH })
>>> +
>>>    /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
>>
>> Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other.
>> The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/...
>>
>> I am wondering whether it makes sence to have to
>> BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and
>> BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF)
>> can have less number of macros?
> 
> Hmm yeah I think that's probably a good idea, it would be consistent
> with the macros for non-atomic ALU ops.
> 
> I don't think 'BOP' would be very clear though, 'ALU' might be more
> obvious.

BPF_ATOMIC_ALU and BPF_ATOMIC_FETCH_ALU indeed better.

>
Brendan Jackman Dec. 7, 2020, 11:28 a.m. UTC | #4
On Fri, Dec 04, 2020 at 07:21:22AM -0800, Yonghong Song wrote:
> 
> 
> On 12/4/20 1:36 AM, Brendan Jackman wrote:
> > On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote:
> > > 
> > > 
> > > On 12/3/20 8:02 AM, Brendan Jackman wrote:
> > > > This adds instructions for
> > > > 
> > > > atomic[64]_[fetch_]and
> > > > atomic[64]_[fetch_]or
> > > > atomic[64]_[fetch_]xor
> > > > 
> > > > All these operations are isomorphic enough to implement with the same
> > > > verifier, interpreter, and x86 JIT code, hence being a single commit.
> > > > 
> > > > The main interesting thing here is that x86 doesn't directly support
> > > > the fetch_ version these operations, so we need to generate a CMPXCHG
> > > > loop in the JIT. This requires the use of two temporary registers,
> > > > IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
> > > > 
> > > > Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
> > > > Signed-off-by: Brendan Jackman <jackmanb@google.com>
> > > > ---
> > > >    arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
> > > >    include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
> > > >    kernel/bpf/core.c            |  5 ++-
> > > >    kernel/bpf/disasm.c          | 21 ++++++++++---
> > > >    kernel/bpf/verifier.c        |  6 ++++
> > > >    tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
> > > >    6 files changed, 196 insertions(+), 6 deletions(-)
> > > > 
> > [...]
> > > > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > > > index 6186280715ed..698f82897b0d 100644
> > > > --- a/include/linux/filter.h
> > > > +++ b/include/linux/filter.h
> > > > @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
> > [...]
> > > > +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
> > > > +	((struct bpf_insn) {					\
> > > > +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> > > > +		.dst_reg = DST,					\
> > > > +		.src_reg = SRC,					\
> > > > +		.off   = OFF,					\
> > > > +		.imm   = BPF_XOR | BPF_FETCH })
> > > > +
> > > >    /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
> > > 
> > > Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other.
> > > The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/...
> > > 
> > > I am wondering whether it makes sence to have to
> > > BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and
> > > BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF)
> > > can have less number of macros?
> > 
> > Hmm yeah I think that's probably a good idea, it would be consistent
> > with the macros for non-atomic ALU ops.
> > 
> > I don't think 'BOP' would be very clear though, 'ALU' might be more
> > obvious.
> 
> BPF_ATOMIC_ALU and BPF_ATOMIC_FETCH_ALU indeed better.

On second thoughts I think it feels right (i.e. it would be roughly
consistent with the level of abstraction of the rest of this macro API)
to go further and just have two macros BPF_ATOMIC64 and BPF_ATOMIC32:

	/*
	 * Atomic ALU ops:
	 *
	 *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
	 *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
	 *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
	 *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
	 *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
	 *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
	 *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
	 *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
	 *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
	 *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
	 */

	#define BPF_ATOMIC64(OP, DST, SRC, OFF)                         \
		((struct bpf_insn) {                                    \
			.code  = BPF_STX | BPF_DW | BPF_ATOMIC,         \
			.dst_reg = DST,                                 \
			.src_reg = SRC,                                 \
			.off   = OFF,                                   \
			.imm   = OP })

	#define BPF_ATOMIC32(OP, DST, SRC, OFF)                         \
		((struct bpf_insn) {                                    \
			.code  = BPF_STX | BPF_W | BPF_ATOMIC,         \
			.dst_reg = DST,                                 \
			.src_reg = SRC,                                 \
			.off   = OFF,                                   \
			.imm   = OP })

The downside compared to what's currently in the patchset is that the
user can write e.g. BPF_ATOMIC64(BPF_SUB, BPF_REG_1, BPF_REG_2, 0) and
it will compile. On the other hand they'll get a pretty clear
"BPF_ATOMIC uses invalid atomic opcode 10" when they try to load the
prog, and the valid atomic ops are clearly listed in Documentation as
well as the comments here.
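
As a usage sketch (assuming the macros land with the names above; the
registers and offsets here are purely illustrative):

	/* r2 = atomic64_fetch_and(r1 + 0, r2) */
	BPF_ATOMIC64(BPF_AND | BPF_FETCH, BPF_REG_1, BPF_REG_2, 0),
	/* 32-bit atomic or without fetch: *(u32 *)(r1 + 8) |= r2 */
	BPF_ATOMIC32(BPF_OR, BPF_REG_1, BPF_REG_2, 8),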
Yonghong Song Dec. 7, 2020, 3:58 p.m. UTC | #5
On 12/7/20 3:28 AM, Brendan Jackman wrote:
> On Fri, Dec 04, 2020 at 07:21:22AM -0800, Yonghong Song wrote:
>>
>>
>> On 12/4/20 1:36 AM, Brendan Jackman wrote:
>>> On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote:
>>>>
>>>>
>>>> On 12/3/20 8:02 AM, Brendan Jackman wrote:
>>>>> This adds instructions for
>>>>>
>>>>> atomic[64]_[fetch_]and
>>>>> atomic[64]_[fetch_]or
>>>>> atomic[64]_[fetch_]xor
>>>>>
>>>>> All these operations are isomorphic enough to implement with the same
>>>>> verifier, interpreter, and x86 JIT code, hence being a single commit.
>>>>>
>>>>> The main interesting thing here is that x86 doesn't directly support
>>>>> the fetch_ version these operations, so we need to generate a CMPXCHG
>>>>> loop in the JIT. This requires the use of two temporary registers,
>>>>> IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
>>>>>
>>>>> Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
>>>>> Signed-off-by: Brendan Jackman <jackmanb@google.com>
>>>>> ---
>>>>>     arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
>>>>>     include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
>>>>>     kernel/bpf/core.c            |  5 ++-
>>>>>     kernel/bpf/disasm.c          | 21 ++++++++++---
>>>>>     kernel/bpf/verifier.c        |  6 ++++
>>>>>     tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
>>>>>     6 files changed, 196 insertions(+), 6 deletions(-)
>>>>>
>>> [...]
>>>>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>>>>> index 6186280715ed..698f82897b0d 100644
>>>>> --- a/include/linux/filter.h
>>>>> +++ b/include/linux/filter.h
>>>>> @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
>>> [...]
>>>>> +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
>>>>> +	((struct bpf_insn) {					\
>>>>> +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
>>>>> +		.dst_reg = DST,					\
>>>>> +		.src_reg = SRC,					\
>>>>> +		.off   = OFF,					\
>>>>> +		.imm   = BPF_XOR | BPF_FETCH })
>>>>> +
>>>>>     /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
>>>>
>>>> Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other.
>>>> The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/...
>>>>
>>>> I am wondering whether it makes sence to have to
>>>> BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and
>>>> BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF)
>>>> can have less number of macros?
>>>
>>> Hmm yeah I think that's probably a good idea, it would be consistent
>>> with the macros for non-atomic ALU ops.
>>>
>>> I don't think 'BOP' would be very clear though, 'ALU' might be more
>>> obvious.
>>
>> BPF_ATOMIC_ALU and BPF_ATOMIC_FETCH_ALU indeed better.
> 
> On second thoughts I think it feels right (i.e. it would be roughly
> consistent with the level of abstraction of the rest of this macro API)
> to go further and just have two macros BPF_ATOMIC64 and BPF_ATOMIC32:
> 
> 	/*
> 	 * Atomic ALU ops:
> 	 *
> 	 *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
> 	 *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
> 	 *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
> 	 *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg

"uint *" => "size_type *"?
and give an explanation that "size_type" is either "u32" or "u64"?

> 	 *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
> 	 *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
> 	 *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
> 	 *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
> 	 *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
> 	 *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
> 	 */
> 
> 	#define BPF_ATOMIC64(OP, DST, SRC, OFF)                         \
> 		((struct bpf_insn) {                                    \
> 			.code  = BPF_STX | BPF_DW | BPF_ATOMIC,         \
> 			.dst_reg = DST,                                 \
> 			.src_reg = SRC,                                 \
> 			.off   = OFF,                                   \
> 			.imm   = OP })
> 
> 	#define BPF_ATOMIC32(OP, DST, SRC, OFF)                         \
> 		((struct bpf_insn) {                                    \
> 			.code  = BPF_STX | BPF_W | BPF_ATOMIC,         \
> 			.dst_reg = DST,                                 \
> 			.src_reg = SRC,                                 \
> 			.off   = OFF,                                   \
> 			.imm   = OP })

You could have
   BPF_ATOMIC(OP, SIZE, DST, SRC, OFF)
where SIZE is BPF_DW or BPF_W.
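
A minimal sketch of that single-macro variant (illustrative only; named
BPF_ATOMIC_OP here so it doesn't clash with the existing BPF_ATOMIC mode
flag):

	#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
		((struct bpf_insn) {					\
			.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
			.dst_reg = DST,					\
			.src_reg = SRC,					\
			.off   = OFF,					\
			.imm   = OP })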

> 
> The downside compared to what's currently in the patchset is that the
> user can write e.g. BPF_ATOMIC64(BPF_SUB, BPF_REG_1, BPF_REG_2, 0) and
> it will compile. On the other hand they'll get a pretty clear
> "BPF_ATOMIC uses invalid atomic opcode 10" when they try to load the
> prog, and the valid atomic ops are clearly listed in Documentation as
> well as the comments here.

This should be fine. As you mentioned, the documentation already spells
out what is supported and what is not...
Brendan Jackman Dec. 7, 2020, 4:14 p.m. UTC | #6
On Mon, Dec 07, 2020 at 07:58:09AM -0800, Yonghong Song wrote:
> 
> 
> On 12/7/20 3:28 AM, Brendan Jackman wrote:
> > On Fri, Dec 04, 2020 at 07:21:22AM -0800, Yonghong Song wrote:
> > > 
> > > 
> > > On 12/4/20 1:36 AM, Brendan Jackman wrote:
> > > > On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote:
> > > > > 
> > > > > 
> > > > > On 12/3/20 8:02 AM, Brendan Jackman wrote:
> > > > > > This adds instructions for
> > > > > > 
> > > > > > atomic[64]_[fetch_]and
> > > > > > atomic[64]_[fetch_]or
> > > > > > atomic[64]_[fetch_]xor
> > > > > > 
> > > > > > All these operations are isomorphic enough to implement with the same
> > > > > > verifier, interpreter, and x86 JIT code, hence being a single commit.
> > > > > > 
> > > > > > The main interesting thing here is that x86 doesn't directly support
> > > > > > the fetch_ version these operations, so we need to generate a CMPXCHG
> > > > > > loop in the JIT. This requires the use of two temporary registers,
> > > > > > IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.
> > > > > > 
> > > > > > Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4
> > > > > > Signed-off-by: Brendan Jackman <jackmanb@google.com>
> > > > > > ---
> > > > > >     arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++-
> > > > > >     include/linux/filter.h       | 60 ++++++++++++++++++++++++++++++++++++
> > > > > >     kernel/bpf/core.c            |  5 ++-
> > > > > >     kernel/bpf/disasm.c          | 21 ++++++++++---
> > > > > >     kernel/bpf/verifier.c        |  6 ++++
> > > > > >     tools/include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++++++
> > > > > >     6 files changed, 196 insertions(+), 6 deletions(-)
> > > > > > 
> > > > [...]
> > > > > > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > > > > > index 6186280715ed..698f82897b0d 100644
> > > > > > --- a/include/linux/filter.h
> > > > > > +++ b/include/linux/filter.h
> > > > > > @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
> > > > [...]
> > > > > > +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
> > > > > > +	((struct bpf_insn) {					\
> > > > > > +		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
> > > > > > +		.dst_reg = DST,					\
> > > > > > +		.src_reg = SRC,					\
> > > > > > +		.off   = OFF,					\
> > > > > > +		.imm   = BPF_XOR | BPF_FETCH })
> > > > > > +
> > > > > >     /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
> > > > > 
> > > > > Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other.
> > > > > The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/...
> > > > > 
> > > > > I am wondering whether it makes sence to have to
> > > > > BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and
> > > > > BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF)
> > > > > can have less number of macros?
> > > > 
> > > > Hmm yeah I think that's probably a good idea, it would be consistent
> > > > with the macros for non-atomic ALU ops.
> > > > 
> > > > I don't think 'BOP' would be very clear though, 'ALU' might be more
> > > > obvious.
> > > 
> > > BPF_ATOMIC_ALU and BPF_ATOMIC_FETCH_ALU indeed better.
> > 
> > On second thoughts I think it feels right (i.e. it would be roughly
> > consistent with the level of abstraction of the rest of this macro API)
> > to go further and just have two macros BPF_ATOMIC64 and BPF_ATOMIC32:
> > 
> > 	/*
> > 	 * Atomic ALU ops:
> > 	 *
> > 	 *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
> > 	 *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
> > 	 *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
> > 	 *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
> 
> "uint *" => "size_type *"?
> and give an explanation that "size_type" is either "u32" or "u64"?

"uint *" is already used in the file so I'll follow the precedent there.

> 
> > 	 *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
> > 	 *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
> > 	 *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
> > 	 *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
> > 	 *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
> > 	 *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
> > 	 */
> > 
> > 	#define BPF_ATOMIC64(OP, DST, SRC, OFF)                         \
> > 		((struct bpf_insn) {                                    \
> > 			.code  = BPF_STX | BPF_DW | BPF_ATOMIC,         \
> > 			.dst_reg = DST,                                 \
> > 			.src_reg = SRC,                                 \
> > 			.off   = OFF,                                   \
> > 			.imm   = OP })
> > 
> > 	#define BPF_ATOMIC32(OP, DST, SRC, OFF)                         \
> > 		((struct bpf_insn) {                                    \
> > 			.code  = BPF_STX | BPF_W | BPF_ATOMIC,         \
> > 			.dst_reg = DST,                                 \
> > 			.src_reg = SRC,                                 \
> > 			.off   = OFF,                                   \
> > 			.imm   = OP })
> 
> You could have
>   BPF_ATOMIC(OP, SIZE, DST, SRC, OFF)
> where SIZE is BPF_DW or BPF_W.

Ah sorry, I didn't see this mail and have just posted v4 with the 2
separate macros. Let's see if anyone else has an opinion on
this point.

> > 
> > The downside compared to what's currently in the patchset is that the
> > user can write e.g. BPF_ATOMIC64(BPF_SUB, BPF_REG_1, BPF_REG_2, 0) and
> > it will compile. On the other hand they'll get a pretty clear
> > "BPF_ATOMIC uses invalid atomic opcode 10" when they try to load the
> > prog, and the valid atomic ops are clearly listed in Documentation as
> > well as the comments here.
> 
> This should be fine. As you mentioned, documentation has mentioned
> what is supported and what is not...

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7d29bc3bb4ff..4ab0f821326c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -824,6 +824,10 @@  static int emit_atomic(u8 **pprog, u8 atomic_op,
 	/* emit opcode */
 	switch (atomic_op) {
 	case BPF_ADD:
+	case BPF_SUB:
+	case BPF_AND:
+	case BPF_OR:
+	case BPF_XOR:
 		/* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
 		EMIT1(simple_alu_opcodes[atomic_op]);
 		break;
@@ -1306,8 +1310,52 @@  st:			if (is_imm8(insn->off))
 
 		case BPF_STX | BPF_ATOMIC | BPF_W:
 		case BPF_STX | BPF_ATOMIC | BPF_DW:
+			if (insn->imm == (BPF_AND | BPF_FETCH) ||
+			    insn->imm == (BPF_OR | BPF_FETCH) ||
+			    insn->imm == (BPF_XOR | BPF_FETCH)) {
+				u8 *branch_target;
+				bool is64 = BPF_SIZE(insn->code) == BPF_DW;
+
+				/*
+				 * Can't be implemented with a single x86 insn.
+				 * Need to do a CMPXCHG loop.
+				 */
+
+				/* Will need RAX as a CMPXCHG operand so save R0 */
+				emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
+				branch_target = prog;
+				/* Load old value */
+				emit_ldx(&prog, BPF_SIZE(insn->code),
+					 BPF_REG_0, dst_reg, insn->off);
+				/*
+				 * Perform the (commutative) operation locally,
+				 * put the result in the AUX_REG.
+				 */
+				emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
+				maybe_emit_mod(&prog, AUX_REG, src_reg, is64);
+				EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
+				      add_2reg(0xC0, AUX_REG, src_reg));
+				/* Attempt to swap in new value */
+				err = emit_atomic(&prog, BPF_CMPXCHG,
+						  dst_reg, AUX_REG, insn->off,
+						  BPF_SIZE(insn->code));
+				if (WARN_ON(err))
+					return err;
+				/*
+				 * ZF tells us whether we won the race. If it's
+				 * cleared we need to try again.
+				 */
+				EMIT2(X86_JNE, -(prog - branch_target) - 2);
+				/* Return the pre-modification value */
+				emit_mov_reg(&prog, is64, src_reg, BPF_REG_0);
+				/* Restore R0 after clobbering RAX */
+				emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
+				break;
+
+			}
+
 			err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
-					  insn->off, BPF_SIZE(insn->code));
+						  insn->off, BPF_SIZE(insn->code));
 			if (err)
 				return err;
 			break;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6186280715ed..698f82897b0d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -280,6 +280,66 @@  static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = BPF_ADD | BPF_FETCH })
 
+/* Atomic memory and, *(uint *)(dst_reg + off16) &= src_reg */
+
+#define BPF_ATOMIC_AND(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_AND })
+
+/* Atomic memory and with fetch, src_reg = atomic_fetch_and(*(dst_reg + off), src_reg); */
+
+#define BPF_ATOMIC_FETCH_AND(SIZE, DST, SRC, OFF)		\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_AND | BPF_FETCH })
+
+/* Atomic memory or, *(uint *)(dst_reg + off16) |= src_reg */
+
+#define BPF_ATOMIC_OR(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_OR })
+
+/* Atomic memory or with fetch, src_reg = atomic_fetch_or(*(dst_reg + off), src_reg); */
+
+#define BPF_ATOMIC_FETCH_OR(SIZE, DST, SRC, OFF)		\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_OR | BPF_FETCH })
+
+/* Atomic memory xor, *(uint *)(dst_reg + off16) ^= src_reg */
+
+#define BPF_ATOMIC_XOR(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_XOR })
+
+/* Atomic memory xor with fetch, src_reg = atomic_fetch_xor(*(dst_reg + off), src_reg); */
+
+#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_XOR | BPF_FETCH })
+
 /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
 
 #define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 498d3f067be7..27eac4d5724c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1642,7 +1642,10 @@  static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 	STX_ATOMIC_W:
 		switch (IMM) {
 		ATOMIC(BPF_ADD, add)
-
+		ATOMIC(BPF_AND, and)
+		ATOMIC(BPF_OR, or)
+		ATOMIC(BPF_XOR, xor)
+#undef ATOMIC
 		case BPF_XCHG:
 			if (BPF_SIZE(insn->code) == BPF_W)
 				SRC = (u32) atomic_xchg(
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 18357ea9a17d..0c7c1c31a57b 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -80,6 +80,13 @@  const char *const bpf_alu_string[16] = {
 	[BPF_END >> 4]  = "endian",
 };
 
+static const char *const bpf_atomic_alu_string[16] = {
+	[BPF_ADD >> 4]  = "add",
+	[BPF_AND >> 4]  = "and",
+	[BPF_OR >> 4]  = "or",
+	[BPF_XOR >> 4]  = "xor",
+};
+
 static const char *const bpf_ldst_string[] = {
 	[BPF_W >> 3]  = "u32",
 	[BPF_H >> 3]  = "u16",
@@ -154,17 +161,23 @@  void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				insn->dst_reg,
 				insn->off, insn->src_reg);
 		else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
-			 insn->imm == BPF_ADD) {
-			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+			 (insn->imm == BPF_ADD || insn->imm == BPF_AND ||
+			  insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
+			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
+				bpf_alu_string[BPF_OP(insn->imm) >> 4],
 				insn->src_reg);
 		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
-			   insn->imm == (BPF_ADD | BPF_FETCH)) {
-			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add(*(%s *)(r%d %+d), r%d)\n",
+			   (insn->imm == (BPF_ADD | BPF_FETCH) ||
+			    insn->imm == (BPF_AND | BPF_FETCH) ||
+			    insn->imm == (BPF_OR | BPF_FETCH) ||
+			    insn->imm == (BPF_XOR | BPF_FETCH))) {
+			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s(*(%s *)(r%d %+d), r%d)\n",
 				insn->code, insn->src_reg,
 				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off, insn->src_reg);
 		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ccf4315e54e7..dd30eb9a6c1b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3606,6 +3606,12 @@  static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	switch (insn->imm) {
 	case BPF_ADD:
 	case BPF_ADD | BPF_FETCH:
+	case BPF_AND:
+	case BPF_AND | BPF_FETCH:
+	case BPF_OR:
+	case BPF_OR | BPF_FETCH:
+	case BPF_XOR:
+	case BPF_XOR | BPF_FETCH:
 	case BPF_XCHG:
 	case BPF_CMPXCHG:
 		break;
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index ea99bd17d003..b74febf83eb1 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -190,6 +190,66 @@ 
 		.off   = OFF,					\
 		.imm   = BPF_ADD | BPF_FETCH })
 
+/* Atomic memory and, *(uint *)(dst_reg + off16) &= src_reg */
+
+#define BPF_ATOMIC_AND(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_AND })
+
+/* Atomic memory and with fetch, src_reg = atomic_fetch_and(*(dst_reg + off), src_reg); */
+
+#define BPF_ATOMIC_FETCH_AND(SIZE, DST, SRC, OFF)		\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_AND | BPF_FETCH })
+
+/* Atomic memory or, *(uint *)(dst_reg + off16) |= src_reg */
+
+#define BPF_ATOMIC_OR(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_OR })
+
+/* Atomic memory or with fetch, src_reg = atomic_fetch_or(*(dst_reg + off), src_reg); */
+
+#define BPF_ATOMIC_FETCH_OR(SIZE, DST, SRC, OFF)		\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_OR | BPF_FETCH })
+
+/* Atomic memory xor, *(uint *)(dst_reg + off16) ^= src_reg */
+
+#define BPF_ATOMIC_XOR(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_XOR })
+
+/* Atomic memory xor with fetch, src_reg = atomic_fetch_xor(*(dst_reg + off), src_reg); */
+
+#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF)		\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = BPF_XOR | BPF_FETCH })
+
 /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */
 
 #define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF)			\