diff mbox series

[bpf-next,v4,3/4] bpf, arm64: adjust the offset of str/ldr(immediate) to positive number

Message ID 20220317140243.212509-4-xukuohai@huawei.com (mailing list archive)
State New, archived
Headers show
Series bpf, arm64: Optimize BPF store/load using | expand

Commit Message

Xu Kuohai March 17, 2022, 2:02 p.m. UTC
The BPF STX/LDX instruction uses offset relative to the FP to address
stack space. Since the BPF_FP locates at the top of the frame, the offset
is usually a negative number. However, arm64 str/ldr immediate instruction
requires that the offset be a positive number. Therefore, this patch tries to
convert the offsets.

The method is to find the negative offset furthest from the FP first.
Then add it to the FP to calculate a bottom position, called FPB, and then
adjust the offsets in other STR/LDR instructions relative to FPB.

FPB is saved using the callee-saved register x27 of arm64 which is not
used yet.

Before adjusting the offset, the patch checks every instruction to ensure
that the FP does not change at run-time. If the FP may change, no offset
is adjusted.

For example, for the following bpftrace command:

  bpftrace -e 'kprobe:do_sys_open { printf("opening: %s\n", str(arg1)); }'

Without this patch, jited code(fragment):

   0:   bti     c
   4:   stp     x29, x30, [sp, #-16]!
   8:   mov     x29, sp
   c:   stp     x19, x20, [sp, #-16]!
  10:   stp     x21, x22, [sp, #-16]!
  14:   stp     x25, x26, [sp, #-16]!
  18:   mov     x25, sp
  1c:   mov     x26, #0x0                       // #0
  20:   bti     j
  24:   sub     sp, sp, #0x90
  28:   add     x19, x0, #0x0
  2c:   mov     x0, #0x0                        // #0
  30:   mov     x10, #0xffffffffffffff78        // #-136
  34:   str     x0, [x25, x10]
  38:   mov     x10, #0xffffffffffffff80        // #-128
  3c:   str     x0, [x25, x10]
  40:   mov     x10, #0xffffffffffffff88        // #-120
  44:   str     x0, [x25, x10]
  48:   mov     x10, #0xffffffffffffff90        // #-112
  4c:   str     x0, [x25, x10]
  50:   mov     x10, #0xffffffffffffff98        // #-104
  54:   str     x0, [x25, x10]
  58:   mov     x10, #0xffffffffffffffa0        // #-96
  5c:   str     x0, [x25, x10]
  60:   mov     x10, #0xffffffffffffffa8        // #-88
  64:   str     x0, [x25, x10]
  68:   mov     x10, #0xffffffffffffffb0        // #-80
  6c:   str     x0, [x25, x10]
  70:   mov     x10, #0xffffffffffffffb8        // #-72
  74:   str     x0, [x25, x10]
  78:   mov     x10, #0xffffffffffffffc0        // #-64
  7c:   str     x0, [x25, x10]
  80:   mov     x10, #0xffffffffffffffc8        // #-56
  84:   str     x0, [x25, x10]
  88:   mov     x10, #0xffffffffffffffd0        // #-48
  8c:   str     x0, [x25, x10]
  90:   mov     x10, #0xffffffffffffffd8        // #-40
  94:   str     x0, [x25, x10]
  98:   mov     x10, #0xffffffffffffffe0        // #-32
  9c:   str     x0, [x25, x10]
  a0:   mov     x10, #0xffffffffffffffe8        // #-24
  a4:   str     x0, [x25, x10]
  a8:   mov     x10, #0xfffffffffffffff0        // #-16
  ac:   str     x0, [x25, x10]
  b0:   mov     x10, #0xfffffffffffffff8        // #-8
  b4:   str     x0, [x25, x10]
  b8:   mov     x10, #0x8                       // #8
  bc:   ldr     x2, [x19, x10]
  [...]

With this patch, jited code(fragment):

   0:   bti     c
   4:   stp     x29, x30, [sp, #-16]!
   8:   mov     x29, sp
   c:   stp     x19, x20, [sp, #-16]!
  10:   stp     x21, x22, [sp, #-16]!
  14:   stp     x25, x26, [sp, #-16]!
  18:   stp     x27, x28, [sp, #-16]!
  1c:   mov     x25, sp
  20:   sub     x27, x25, #0x88
  24:   mov     x26, #0x0                       // #0
  28:   bti     j
  2c:   sub     sp, sp, #0x90
  30:   add     x19, x0, #0x0
  34:   mov     x0, #0x0                        // #0
  38:   str     x0, [x27]
  3c:   str     x0, [x27, #8]
  40:   str     x0, [x27, #16]
  44:   str     x0, [x27, #24]
  48:   str     x0, [x27, #32]
  4c:   str     x0, [x27, #40]
  50:   str     x0, [x27, #48]
  54:   str     x0, [x27, #56]
  58:   str     x0, [x27, #64]
  5c:   str     x0, [x27, #72]
  60:   str     x0, [x27, #80]
  64:   str     x0, [x27, #88]
  68:   str     x0, [x27, #96]
  6c:   str     x0, [x27, #104]
  70:   str     x0, [x27, #112]
  74:   str     x0, [x27, #120]
  78:   str     x0, [x27, #128]
  7c:   ldr     x2, [x19, #8]
  [...]

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
---
 arch/arm64/net/bpf_jit_comp.c | 155 ++++++++++++++++++++++++++++------
 1 file changed, 128 insertions(+), 27 deletions(-)

Comments

Daniel Borkmann March 18, 2022, 2:41 p.m. UTC | #1
On 3/17/22 3:02 PM, Xu Kuohai wrote:
> The BPF STX/LDX instruction uses offset relative to the FP to address
> stack space. Since the BPF_FP locates at the top of the frame, the offset
> is usually a negative number. However, arm64 str/ldr immediate instruction
> requires that offset be a positive number.  Therefore, this patch tries to
> convert the offsets.
> 
> The method is to find the negative offset furthest from the FP firstly.
> Then add it to the FP, calculate a bottom position, called FPB, and then
> adjust the offsets in other STR/LDX instructions relative to FPB.
> 
> FPB is saved using the callee-saved register x27 of arm64 which is not
> used yet.
> 
> Before adjusting the offset, the patch checks every instruction to ensure
> that the FP does not change in run-time. If the FP may change, no offset
> is adjusted.
> 
> For example, for the following bpftrace command:
> 
>    bpftrace -e 'kprobe:do_sys_open { printf("opening: %s\n", str(arg1)); }'
> 
> Without this patch, jited code(fragment):
> 
>     0:   bti     c
>     4:   stp     x29, x30, [sp, #-16]!
>     8:   mov     x29, sp
>     c:   stp     x19, x20, [sp, #-16]!
>    10:   stp     x21, x22, [sp, #-16]!
>    14:   stp     x25, x26, [sp, #-16]!
>    18:   mov     x25, sp
>    1c:   mov     x26, #0x0                       // #0
>    20:   bti     j
>    24:   sub     sp, sp, #0x90
>    28:   add     x19, x0, #0x0
>    2c:   mov     x0, #0x0                        // #0
>    30:   mov     x10, #0xffffffffffffff78        // #-136
>    34:   str     x0, [x25, x10]
>    38:   mov     x10, #0xffffffffffffff80        // #-128
>    3c:   str     x0, [x25, x10]
>    40:   mov     x10, #0xffffffffffffff88        // #-120
>    44:   str     x0, [x25, x10]
>    48:   mov     x10, #0xffffffffffffff90        // #-112
>    4c:   str     x0, [x25, x10]
>    50:   mov     x10, #0xffffffffffffff98        // #-104
>    54:   str     x0, [x25, x10]
>    58:   mov     x10, #0xffffffffffffffa0        // #-96
>    5c:   str     x0, [x25, x10]
>    60:   mov     x10, #0xffffffffffffffa8        // #-88
>    64:   str     x0, [x25, x10]
>    68:   mov     x10, #0xffffffffffffffb0        // #-80
>    6c:   str     x0, [x25, x10]
>    70:   mov     x10, #0xffffffffffffffb8        // #-72
>    74:   str     x0, [x25, x10]
>    78:   mov     x10, #0xffffffffffffffc0        // #-64
>    7c:   str     x0, [x25, x10]
>    80:   mov     x10, #0xffffffffffffffc8        // #-56
>    84:   str     x0, [x25, x10]
>    88:   mov     x10, #0xffffffffffffffd0        // #-48
>    8c:   str     x0, [x25, x10]
>    90:   mov     x10, #0xffffffffffffffd8        // #-40
>    94:   str     x0, [x25, x10]
>    98:   mov     x10, #0xffffffffffffffe0        // #-32
>    9c:   str     x0, [x25, x10]
>    a0:   mov     x10, #0xffffffffffffffe8        // #-24
>    a4:   str     x0, [x25, x10]
>    a8:   mov     x10, #0xfffffffffffffff0        // #-16
>    ac:   str     x0, [x25, x10]
>    b0:   mov     x10, #0xfffffffffffffff8        // #-8
>    b4:   str     x0, [x25, x10]
>    b8:   mov     x10, #0x8                       // #8
>    bc:   ldr     x2, [x19, x10]
>    [...]
> 
> With this patch, jited code(fragment):
> 
>     0:   bti     c
>     4:   stp     x29, x30, [sp, #-16]!
>     8:   mov     x29, sp
>     c:   stp     x19, x20, [sp, #-16]!
>    10:   stp     x21, x22, [sp, #-16]!
>    14:   stp     x25, x26, [sp, #-16]!
>    18:   stp     x27, x28, [sp, #-16]!
>    1c:   mov     x25, sp
>    20:   sub     x27, x25, #0x88
>    24:   mov     x26, #0x0                       // #0
>    28:   bti     j
>    2c:   sub     sp, sp, #0x90
>    30:   add     x19, x0, #0x0
>    34:   mov     x0, #0x0                        // #0
>    38:   str     x0, [x27]
>    3c:   str     x0, [x27, #8]
>    40:   str     x0, [x27, #16]
>    44:   str     x0, [x27, #24]
>    48:   str     x0, [x27, #32]
>    4c:   str     x0, [x27, #40]
>    50:   str     x0, [x27, #48]
>    54:   str     x0, [x27, #56]
>    58:   str     x0, [x27, #64]
>    5c:   str     x0, [x27, #72]
>    60:   str     x0, [x27, #80]
>    64:   str     x0, [x27, #88]
>    68:   str     x0, [x27, #96]
>    6c:   str     x0, [x27, #104]
>    70:   str     x0, [x27, #112]
>    74:   str     x0, [x27, #120]
>    78:   str     x0, [x27, #128]
>    7c:   ldr     x2, [x19, #8]
>    [...]
> 
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>

Could you elaborate how this works across tail calls / bpf2bpf calls? Could we
run into a situation where is_lsi_offset() on off_adj doesn't hold true anymore
and we ended up emitting e.g. STR32I instead of MOV_I+STR32?

Thanks,
Daniel
Xu Kuohai March 19, 2022, 8 a.m. UTC | #2
On 2022/3/18 22:41, Daniel Borkmann wrote:
> On 3/17/22 3:02 PM, Xu Kuohai wrote:
>> The BPF STX/LDX instruction uses offset relative to the FP to address
>> stack space. Since the BPF_FP locates at the top of the frame, the offset
>> is usually a negative number. However, arm64 str/ldr immediate 
>> instruction
>> requires that offset be a positive number.  Therefore, this patch 
>> tries to
>> convert the offsets.
>>
>> The method is to find the negative offset furthest from the FP firstly.
>> Then add it to the FP, calculate a bottom position, called FPB, and then
>> adjust the offsets in other STR/LDX instructions relative to FPB.
>>
>> FPB is saved using the callee-saved register x27 of arm64 which is not
>> used yet.
>>
>> Before adjusting the offset, the patch checks every instruction to ensure
>> that the FP does not change in run-time. If the FP may change, no offset
>> is adjusted.
>>
>> For example, for the following bpftrace command:
>>
>>    bpftrace -e 'kprobe:do_sys_open { printf("opening: %s\n", 
>> str(arg1)); }'
>>
>> Without this patch, jited code(fragment):
>>
>>     0:   bti     c
>>     4:   stp     x29, x30, [sp, #-16]!
>>     8:   mov     x29, sp
>>     c:   stp     x19, x20, [sp, #-16]!
>>    10:   stp     x21, x22, [sp, #-16]!
>>    14:   stp     x25, x26, [sp, #-16]!
>>    18:   mov     x25, sp
>>    1c:   mov     x26, #0x0                       // #0
>>    20:   bti     j
>>    24:   sub     sp, sp, #0x90
>>    28:   add     x19, x0, #0x0
>>    2c:   mov     x0, #0x0                        // #0
>>    30:   mov     x10, #0xffffffffffffff78        // #-136
>>    34:   str     x0, [x25, x10]
>>    38:   mov     x10, #0xffffffffffffff80        // #-128
>>    3c:   str     x0, [x25, x10]
>>    40:   mov     x10, #0xffffffffffffff88        // #-120
>>    44:   str     x0, [x25, x10]
>>    48:   mov     x10, #0xffffffffffffff90        // #-112
>>    4c:   str     x0, [x25, x10]
>>    50:   mov     x10, #0xffffffffffffff98        // #-104
>>    54:   str     x0, [x25, x10]
>>    58:   mov     x10, #0xffffffffffffffa0        // #-96
>>    5c:   str     x0, [x25, x10]
>>    60:   mov     x10, #0xffffffffffffffa8        // #-88
>>    64:   str     x0, [x25, x10]
>>    68:   mov     x10, #0xffffffffffffffb0        // #-80
>>    6c:   str     x0, [x25, x10]
>>    70:   mov     x10, #0xffffffffffffffb8        // #-72
>>    74:   str     x0, [x25, x10]
>>    78:   mov     x10, #0xffffffffffffffc0        // #-64
>>    7c:   str     x0, [x25, x10]
>>    80:   mov     x10, #0xffffffffffffffc8        // #-56
>>    84:   str     x0, [x25, x10]
>>    88:   mov     x10, #0xffffffffffffffd0        // #-48
>>    8c:   str     x0, [x25, x10]
>>    90:   mov     x10, #0xffffffffffffffd8        // #-40
>>    94:   str     x0, [x25, x10]
>>    98:   mov     x10, #0xffffffffffffffe0        // #-32
>>    9c:   str     x0, [x25, x10]
>>    a0:   mov     x10, #0xffffffffffffffe8        // #-24
>>    a4:   str     x0, [x25, x10]
>>    a8:   mov     x10, #0xfffffffffffffff0        // #-16
>>    ac:   str     x0, [x25, x10]
>>    b0:   mov     x10, #0xfffffffffffffff8        // #-8
>>    b4:   str     x0, [x25, x10]
>>    b8:   mov     x10, #0x8                       // #8
>>    bc:   ldr     x2, [x19, x10]
>>    [...]
>>
>> With this patch, jited code(fragment):
>>
>>     0:   bti     c
>>     4:   stp     x29, x30, [sp, #-16]!
>>     8:   mov     x29, sp
>>     c:   stp     x19, x20, [sp, #-16]!
>>    10:   stp     x21, x22, [sp, #-16]!
>>    14:   stp     x25, x26, [sp, #-16]!
>>    18:   stp     x27, x28, [sp, #-16]!
>>    1c:   mov     x25, sp
>>    20:   sub     x27, x25, #0x88
>>    24:   mov     x26, #0x0                       // #0
>>    28:   bti     j
>>    2c:   sub     sp, sp, #0x90
>>    30:   add     x19, x0, #0x0
>>    34:   mov     x0, #0x0                        // #0
>>    38:   str     x0, [x27]
>>    3c:   str     x0, [x27, #8]
>>    40:   str     x0, [x27, #16]
>>    44:   str     x0, [x27, #24]
>>    48:   str     x0, [x27, #32]
>>    4c:   str     x0, [x27, #40]
>>    50:   str     x0, [x27, #48]
>>    54:   str     x0, [x27, #56]
>>    58:   str     x0, [x27, #64]
>>    5c:   str     x0, [x27, #72]
>>    60:   str     x0, [x27, #80]
>>    64:   str     x0, [x27, #88]
>>    68:   str     x0, [x27, #96]
>>    6c:   str     x0, [x27, #104]
>>    70:   str     x0, [x27, #112]
>>    74:   str     x0, [x27, #120]
>>    78:   str     x0, [x27, #128]
>>    7c:   ldr     x2, [x19, #8]
>>    [...]
>>
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> 
> Could you elaborate how this works across tail calls / bpf2bpf calls? 
> Could we
> run into a situation where is_lsi_offset() on off_adj doesn't hold true 
> anymore
> and we ended up emitting e.g. STR32I instead of MOV_I+STR32?
> 
> Thanks,
> Daniel
> .

Thanks for pointing this out. There's a bug here.

1. The fp bottom is calculated in the prologue before tail call entry,
    so if the caller has a different fp bottom than the callee, the
    off_adj in callee's str/ldr will be wrong.

    For example, for the following two pieces of code, prog1 loads 64-bit
    data from FP - 16, and prog2 writes 7 to FP - 8. When prog2 calls prog1
    via tail call, the data prog1 reads out is 7!

    prog1:
     BPF_LDX_MEM(BPF_DW, R0, BPF_REG_FP, -16)
     BPF_EXIT_INSN()

    prog2:
     BPF_ALU64_IMM(BPF_MOV, R0, 7)
     BPF_STX_MEM(BPF_DW, BPF_REG_FP, R0, -8)
     TAIL_CALL(-1)

    The reason is that the fp bottom in prog2 is FP - 8, and the fp
    bottom is not updated during the tail call, so when prog1 reads data
    from the fp bottom, it actually reads from FP - 8.

    jited prog1:
     sub x27, x25, #0x10 // calculate and store fp bottom, x27 = fp - 16
     bti j               // tail call entry     <----------------------
     ldr x7, [x27]       // load data from fp bottom + 0               |
                                                                       |
    jited prog2:                                                       |
     sub x27, x25, #0x8 // calculate and store fp bottom, x27 = fp - 8 |
     bti j                                                             |
     mov x7, #0x7                                                      |
     str x7, [x27]      // store 7 to fp bottom + 0                    |
     ...                                                               |
     br x10 // tail call prog1  ---------------------------------------

    This issue can be fixed by calculating fp bottom after the tail call
    entry:

     bti j               // tail call entry
     sub x27, x25, #0x10 // calculate and store fp bottom, x27 = fp - 16
     ldr x7, [x27]       // load data from bottom + 0

2. For bpf2bpf calls, the call entry is the callee's first instruction, so
    the fp bottom will be updated every time, which seems to be fine.
diff mbox series

Patch

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 06037607c04a..d91bd9d43c5a 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -26,6 +26,7 @@ 
 #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
 #define TCALL_CNT (MAX_BPF_JIT_REG + 2)
 #define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
+#define FP_BOTTOM (MAX_BPF_JIT_REG + 4)
 
 #define check_imm(bits, imm) do {				\
 	if ((((imm) > 0) && ((imm) >> (bits))) ||		\
@@ -63,6 +64,7 @@  static const int bpf2a64[] = {
 	[TCALL_CNT] = A64_R(26),
 	/* temporary register for blinding constants */
 	[BPF_REG_AX] = A64_R(9),
+	[FP_BOTTOM] = A64_R(27),
 };
 
 struct jit_ctx {
@@ -73,6 +75,7 @@  struct jit_ctx {
 	int exentry_idx;
 	__le32 *image;
 	u32 stack_size;
+	int fpb_offset;
 };
 
 static inline void emit(const u32 insn, struct jit_ctx *ctx)
@@ -218,7 +221,7 @@  static bool is_addsub_imm(u32 imm)
  *
  * offset = (u64)imm12 << scale
  */
-static bool is_lsi_offset(s16 offset, int scale)
+static bool is_lsi_offset(int offset, int scale)
 {
 	if (offset < 0)
 		return false;
@@ -234,9 +237,9 @@  static bool is_lsi_offset(s16 offset, int scale)
 
 /* Tail call offset to jump into */
 #if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)
-#define PROLOGUE_OFFSET 8
+#define PROLOGUE_OFFSET 10
 #else
-#define PROLOGUE_OFFSET 7
+#define PROLOGUE_OFFSET 9
 #endif
 
 static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
@@ -248,6 +251,7 @@  static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 	const u8 r9 = bpf2a64[BPF_REG_9];
 	const u8 fp = bpf2a64[BPF_REG_FP];
 	const u8 tcc = bpf2a64[TCALL_CNT];
+	const u8 fpb = bpf2a64[FP_BOTTOM];
 	const int idx0 = ctx->idx;
 	int cur_offset;
 
@@ -286,9 +290,11 @@  static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 	emit(A64_PUSH(r6, r7, A64_SP), ctx);
 	emit(A64_PUSH(r8, r9, A64_SP), ctx);
 	emit(A64_PUSH(fp, tcc, A64_SP), ctx);
+	emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx);
 
 	/* Set up BPF prog stack base register */
 	emit(A64_MOV(1, fp, A64_SP), ctx);
+	emit(A64_SUB_I(1, fpb, fp, ctx->fpb_offset), ctx);
 
 	if (!ebpf_from_cbpf) {
 		/* Initialize tail_call_cnt */
@@ -553,10 +559,13 @@  static void build_epilogue(struct jit_ctx *ctx)
 	const u8 r8 = bpf2a64[BPF_REG_8];
 	const u8 r9 = bpf2a64[BPF_REG_9];
 	const u8 fp = bpf2a64[BPF_REG_FP];
+	const u8 fpb = bpf2a64[FP_BOTTOM];
 
 	/* We're done with BPF stack */
 	emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
 
+	/* Restore x27 and x28 */
+	emit(A64_POP(fpb, A64_R(28), A64_SP), ctx);
 	/* Restore fs (x25) and x26 */
 	emit(A64_POP(fp, A64_R(26), A64_SP), ctx);
 
@@ -650,6 +659,8 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	const u8 src = bpf2a64[insn->src_reg];
 	const u8 tmp = bpf2a64[TMP_REG_1];
 	const u8 tmp2 = bpf2a64[TMP_REG_2];
+	const u8 fp = bpf2a64[BPF_REG_FP];
+	const u8 fpb = bpf2a64[FP_BOTTOM];
 	const s16 off = insn->off;
 	const s32 imm = insn->imm;
 	const int i = insn - ctx->prog->insnsi;
@@ -658,6 +669,9 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	u8 jmp_cond;
 	s32 jmp_offset;
 	u32 a64_insn;
+	u8 src_adj;
+	u8 dst_adj;
+	int off_adj;
 	int ret;
 
 	switch (code) {
@@ -1012,34 +1026,41 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	case BPF_LDX | BPF_PROBE_MEM | BPF_W:
 	case BPF_LDX | BPF_PROBE_MEM | BPF_H:
 	case BPF_LDX | BPF_PROBE_MEM | BPF_B:
+		if (ctx->fpb_offset > 0 && src == fp) {
+			src_adj = fpb;
+			off_adj = off + ctx->fpb_offset;
+		} else {
+			src_adj = src;
+			off_adj = off;
+		}
 		switch (BPF_SIZE(code)) {
 		case BPF_W:
-			if (is_lsi_offset(off, 2)) {
-				emit(A64_LDR32I(dst, src, off), ctx);
+			if (is_lsi_offset(off_adj, 2)) {
+				emit(A64_LDR32I(dst, src_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_LDR32(dst, src, tmp), ctx);
 			}
 			break;
 		case BPF_H:
-			if (is_lsi_offset(off, 1)) {
-				emit(A64_LDRHI(dst, src, off), ctx);
+			if (is_lsi_offset(off_adj, 1)) {
+				emit(A64_LDRHI(dst, src_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_LDRH(dst, src, tmp), ctx);
 			}
 			break;
 		case BPF_B:
-			if (is_lsi_offset(off, 0)) {
-				emit(A64_LDRBI(dst, src, off), ctx);
+			if (is_lsi_offset(off_adj, 0)) {
+				emit(A64_LDRBI(dst, src_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_LDRB(dst, src, tmp), ctx);
 			}
 			break;
 		case BPF_DW:
-			if (is_lsi_offset(off, 3)) {
-				emit(A64_LDR64I(dst, src, off), ctx);
+			if (is_lsi_offset(off_adj, 3)) {
+				emit(A64_LDR64I(dst, src_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_LDR64(dst, src, tmp), ctx);
@@ -1070,36 +1091,43 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	case BPF_ST | BPF_MEM | BPF_H:
 	case BPF_ST | BPF_MEM | BPF_B:
 	case BPF_ST | BPF_MEM | BPF_DW:
+		if (ctx->fpb_offset > 0 && dst == fp) {
+			dst_adj = fpb;
+			off_adj = off + ctx->fpb_offset;
+		} else {
+			dst_adj = dst;
+			off_adj = off;
+		}
 		/* Load imm to a register then store it */
 		emit_a64_mov_i(1, tmp, imm, ctx);
 		switch (BPF_SIZE(code)) {
 		case BPF_W:
-			if (is_lsi_offset(off, 2)) {
-				emit(A64_STR32I(tmp, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 2)) {
+				emit(A64_STR32I(tmp, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp2, off, ctx);
 				emit(A64_STR32(tmp, dst, tmp2), ctx);
 			}
 			break;
 		case BPF_H:
-			if (is_lsi_offset(off, 1)) {
-				emit(A64_STRHI(tmp, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 1)) {
+				emit(A64_STRHI(tmp, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp2, off, ctx);
 				emit(A64_STRH(tmp, dst, tmp2), ctx);
 			}
 			break;
 		case BPF_B:
-			if (is_lsi_offset(off, 0)) {
-				emit(A64_STRBI(tmp, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 0)) {
+				emit(A64_STRBI(tmp, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp2, off, ctx);
 				emit(A64_STRB(tmp, dst, tmp2), ctx);
 			}
 			break;
 		case BPF_DW:
-			if (is_lsi_offset(off, 3)) {
-				emit(A64_STR64I(tmp, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 3)) {
+				emit(A64_STR64I(tmp, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp2, off, ctx);
 				emit(A64_STR64(tmp, dst, tmp2), ctx);
@@ -1113,34 +1141,41 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	case BPF_STX | BPF_MEM | BPF_H:
 	case BPF_STX | BPF_MEM | BPF_B:
 	case BPF_STX | BPF_MEM | BPF_DW:
+		if (ctx->fpb_offset > 0 && dst == fp) {
+			dst_adj = fpb;
+			off_adj = off + ctx->fpb_offset;
+		} else {
+			dst_adj = dst;
+			off_adj = off;
+		}
 		switch (BPF_SIZE(code)) {
 		case BPF_W:
-			if (is_lsi_offset(off, 2)) {
-				emit(A64_STR32I(src, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 2)) {
+				emit(A64_STR32I(src, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_STR32(src, dst, tmp), ctx);
 			}
 			break;
 		case BPF_H:
-			if (is_lsi_offset(off, 1)) {
-				emit(A64_STRHI(src, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 1)) {
+				emit(A64_STRHI(src, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_STRH(src, dst, tmp), ctx);
 			}
 			break;
 		case BPF_B:
-			if (is_lsi_offset(off, 0)) {
-				emit(A64_STRBI(src, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 0)) {
+				emit(A64_STRBI(src, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_STRB(src, dst, tmp), ctx);
 			}
 			break;
 		case BPF_DW:
-			if (is_lsi_offset(off, 3)) {
-				emit(A64_STR64I(src, dst, off), ctx);
+			if (is_lsi_offset(off_adj, 3)) {
+				emit(A64_STR64I(src, dst_adj, off_adj), ctx);
 			} else {
 				emit_a64_mov_i(1, tmp, off, ctx);
 				emit(A64_STR64(src, dst, tmp), ctx);
@@ -1167,6 +1202,70 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	return 0;
 }
 
+/*
+ * Return 0 if FP may change at runtime, otherwise find the minimum negative
+ * offset to FP and converts it to positive number.
+ */
+static int find_fpb_offset(struct bpf_prog *prog)
+{
+	int i;
+	int offset = 0;
+
+	for (i = 0; i < prog->len; i++) {
+		const struct bpf_insn *insn = &prog->insnsi[i];
+		const u8 class = BPF_CLASS(insn->code);
+		const u8 mode = BPF_MODE(insn->code);
+		const u8 src = insn->src_reg;
+		const u8 dst = insn->dst_reg;
+		const s32 imm = insn->imm;
+		const s16 off = insn->off;
+
+		switch (class) {
+		case BPF_STX:
+		case BPF_ST:
+			/* fp holds atomic operation result */
+			if (class == BPF_STX && mode == BPF_ATOMIC &&
+				((imm == BPF_XCHG ||
+				  imm == (BPF_FETCH | BPF_ADD) ||
+				  imm == (BPF_FETCH | BPF_AND) ||
+				  imm == (BPF_FETCH | BPF_XOR) ||
+				  imm == (BPF_FETCH | BPF_OR)) &&
+				 src == BPF_REG_FP))
+				return 0;
+
+			if (mode == BPF_MEM && dst == BPF_REG_FP &&
+				off < offset)
+				offset = insn->off;
+			break;
+
+		case BPF_JMP32:
+		case BPF_JMP:
+			break;
+
+		case BPF_LDX:
+		case BPF_LD:
+			/* fp holds load result */
+			if (dst == BPF_REG_FP)
+				return 0;
+
+			if (class == BPF_LDX && mode == BPF_MEM &&
+				src == BPF_REG_FP && off < offset)
+				offset = off;
+			break;
+
+		case BPF_ALU:
+		case BPF_ALU64:
+		default:
+			/* fp holds ALU result */
+			if (dst == BPF_REG_FP)
+				return 0;
+		}
+	}
+
+	/* safely be converted to a positive 'int', since insn->off is 's16' */
+	return -offset;
+}
+
 static int build_body(struct jit_ctx *ctx, bool extra_pass)
 {
 	const struct bpf_prog *prog = ctx->prog;
@@ -1288,6 +1387,8 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_off;
 	}
 
+	ctx.fpb_offset = find_fpb_offset(prog);
+
 	/*
 	 * 1. Initial fake pass to compute ctx->idx and ctx->offset.
 	 *