diff mbox series

[bpf-next,v3,1/2] arm64, bpf: add internal-only MOV instruction to resolve per-CPU addrs

Message ID 20240426121349.97651-2-puranjay@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf, arm64: Support per-cpu instruction | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 20 of 20 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 69 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Puranjay Mohan April 26, 2024, 12:13 p.m. UTC
From: Puranjay Mohan <puranjay12@gmail.com>

Support an instruction for resolving absolute addresses of per-CPU
data from their per-CPU offsets. This instruction is internal-only and
users are not allowed to use it directly. It will only be used for
internal inlining optimizations for now between the BPF verifier and
the BPF JITs.

Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu
access using tpidr_el1"), the per-cpu offset for the CPU is stored in
the tpidr_el1/2 register of that CPU.

To support this BPF instruction in the ARM64 JIT, the following ARM64
instructions are emitted:

mov dst, src		// Move src to dst, if src != dst
mrs tmp, tpidr_el1/2	// Move per-cpu offset of the current cpu into tmp.
add dst, dst, tmp	// Add the per-cpu offset to dst.

To measure the performance improvement provided by this change, the
benchmark in [1] was used:

Before:
glob-arr-inc   :   23.597 ± 0.012M/s
arr-inc        :   23.173 ± 0.019M/s
hash-inc       :   12.186 ± 0.028M/s

After:
glob-arr-inc   :   23.819 ± 0.034M/s
arr-inc        :   23.285 ± 0.017M/s
hash-inc       :   12.419 ± 0.011M/s

[1] https://github.com/anakryiko/linux/commit/8dec900975ef

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
---
 arch/arm64/include/asm/insn.h |  7 +++++++
 arch/arm64/lib/insn.c         | 11 +++++++++++
 arch/arm64/net/bpf_jit.h      |  6 ++++++
 arch/arm64/net/bpf_jit_comp.c | 14 ++++++++++++++
 4 files changed, 38 insertions(+)

Comments

Andrii Nakryiko April 26, 2024, 4:19 p.m. UTC | #1
On Fri, Apr 26, 2024 at 5:14 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> From: Puranjay Mohan <puranjay12@gmail.com>
>
> Support an instruction for resolving absolute addresses of per-CPU
> data from their per-CPU offsets. This instruction is internal-only and
> users are not allowed to use them directly. They will only be used for
> internal inlining optimizations for now between BPF verifier and BPF
> JITs.
>
> Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu
> access using tpidr_el1"), the per-cpu offset for the CPU is stored in
> the tpidr_el1/2 register of that CPU.
>
> To support this BPF instruction in the ARM64 JIT, the following ARM64
> instructions are emitted:
>
> mov dst, src            // Move src to dst, if src != dst
> mrs tmp, tpidr_el1/2    // Move per-cpu offset of the current cpu in tmp.
> add dst, dst, tmp       // Add the per cpu offset to the dst.
>
> To measure the performance improvement provided by this change, the
> benchmark in [1] was used:
>
> Before:
> glob-arr-inc   :   23.597 ± 0.012M/s
> arr-inc        :   23.173 ± 0.019M/s
> hash-inc       :   12.186 ± 0.028M/s
>
> After:
> glob-arr-inc   :   23.819 ± 0.034M/s
> arr-inc        :   23.285 ± 0.017M/s

I still expected a better improvement (glob-arr-inc's results
improved more than arr-inc's, which is completely different from
x86-64), but it's still a good thing to support this for arm64, of
course.

ack for generic parts I can understand:

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> hash-inc       :   12.419 ± 0.011M/s
>
> [1] https://github.com/anakryiko/linux/commit/8dec900975ef
>
> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
> ---
>  arch/arm64/include/asm/insn.h |  7 +++++++
>  arch/arm64/lib/insn.c         | 11 +++++++++++
>  arch/arm64/net/bpf_jit.h      |  6 ++++++
>  arch/arm64/net/bpf_jit_comp.c | 14 ++++++++++++++
>  4 files changed, 38 insertions(+)
>

[...]
Puranjay Mohan April 26, 2024, 4:55 p.m. UTC | #2
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Fri, Apr 26, 2024 at 5:14 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>>
>> From: Puranjay Mohan <puranjay12@gmail.com>
>>
>> Support an instruction for resolving absolute addresses of per-CPU
>> data from their per-CPU offsets. This instruction is internal-only and
>> users are not allowed to use them directly. They will only be used for
>> internal inlining optimizations for now between BPF verifier and BPF
>> JITs.
>>
>> Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu
>> access using tpidr_el1"), the per-cpu offset for the CPU is stored in
>> the tpidr_el1/2 register of that CPU.
>>
>> To support this BPF instruction in the ARM64 JIT, the following ARM64
>> instructions are emitted:
>>
>> mov dst, src            // Move src to dst, if src != dst
>> mrs tmp, tpidr_el1/2    // Move per-cpu offset of the current cpu in tmp.
>> add dst, dst, tmp       // Add the per cpu offset to the dst.
>>
>> To measure the performance improvement provided by this change, the
>> benchmark in [1] was used:
>>
>> Before:
>> glob-arr-inc   :   23.597 ± 0.012M/s
>> arr-inc        :   23.173 ± 0.019M/s
>> hash-inc       :   12.186 ± 0.028M/s
>>
>> After:
>> glob-arr-inc   :   23.819 ± 0.034M/s
>> arr-inc        :   23.285 ± 0.017M/s
>
> I still expected a better improvement (global-arr-inc's results
> improved more than arr-inc, which is completely different from
> x86-64), but it's still a good thing to support this for arm64, of
> course.
>
> ack for generic parts I can understand:
>
> Acked-by: Andrii Nakryiko <andrii@kernel.org>
>

I will have to do more research to find out why we don't see a bigger
improvement.

But this is what is happening here:

This was the complete picture before inlining:

int cpu = bpf_get_smp_processor_id();
mov     x10, #0xffffffffffffd4a8
movk    x10, #0x802c, lsl #16
movk    x10, #0x8000, lsl #32
blr     x10 ---------------------------------------> nop
                                                     nop
                                                     adrp    x0, 0xffff800082128000
                                                     mrs     x1, tpidr_el1
                                                     add     x0, x0, #0x8
                                                     ldrsw   x0, [x0, x1]
            <----------------------------------------ret
add     x7, x0, #0x0


Now we have:

int cpu = bpf_get_smp_processor_id();
mov     x7, #0xffff8000ffffffff
movk    x7, #0x8212, lsl #16
movk    x7, #0x8008
mrs     x10, tpidr_el1
add     x7, x7, x10
ldr     w7, [x7]


So, we have removed multiple instructions including a branch and a
return. I was expecting to see more improvement. This benchmark was run
in a KVM-based virtual machine; maybe if I ran it on bare metal I would
see more improvement?

Thanks,
Puranjay
Andrii Nakryiko April 26, 2024, 5:35 p.m. UTC | #3
On Fri, Apr 26, 2024 at 9:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Fri, Apr 26, 2024 at 5:14 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >>
> >> From: Puranjay Mohan <puranjay12@gmail.com>
> >>
> >> Support an instruction for resolving absolute addresses of per-CPU
> >> data from their per-CPU offsets. This instruction is internal-only and
> >> users are not allowed to use them directly. They will only be used for
> >> internal inlining optimizations for now between BPF verifier and BPF
> >> JITs.
> >>
> >> Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu
> >> access using tpidr_el1"), the per-cpu offset for the CPU is stored in
> >> the tpidr_el1/2 register of that CPU.
> >>
> >> To support this BPF instruction in the ARM64 JIT, the following ARM64
> >> instructions are emitted:
> >>
> >> mov dst, src            // Move src to dst, if src != dst
> >> mrs tmp, tpidr_el1/2    // Move per-cpu offset of the current cpu in tmp.
> >> add dst, dst, tmp       // Add the per cpu offset to the dst.
> >>
> >> To measure the performance improvement provided by this change, the
> >> benchmark in [1] was used:
> >>
> >> Before:
> >> glob-arr-inc   :   23.597 ± 0.012M/s
> >> arr-inc        :   23.173 ± 0.019M/s
> >> hash-inc       :   12.186 ± 0.028M/s
> >>
> >> After:
> >> glob-arr-inc   :   23.819 ± 0.034M/s
> >> arr-inc        :   23.285 ± 0.017M/s
> >
> > I still expected a better improvement (global-arr-inc's results
> > improved more than arr-inc, which is completely different from
> > x86-64), but it's still a good thing to support this for arm64, of
> > course.
> >
> > ack for generic parts I can understand:
> >
> > Acked-by: Andrii Nakryiko <andrii@kernel.org>
> >
>
> I will have to do more research to find why we don't see very high
> improvement.
>
> But this is what is happening here:
>
> This was the complete picture before inlining:
>
> int cpu = bpf_get_smp_processor_id();
> mov     x10, #0xffffffffffffd4a8
> movk    x10, #0x802c, lsl #16
> movk    x10, #0x8000, lsl #32
> blr     x10 ---------------------------------------> nop
>                                                      nop
>                                                      adrp    x0, 0xffff800082128000
>                                                      mrs     x1, tpidr_el1
>                                                      add     x0, x0, #0x8
>                                                      ldrsw   x0, [x0, x1]
>             <----------------------------------------ret
> add     x7, x0, #0x0
>
>
> Now we have:
>
> int cpu = bpf_get_smp_processor_id();
> mov     x7, #0xffff8000ffffffff
> movk    x7, #0x8212, lsl #16
> movk    x7, #0x8008
> mrs     x10, tpidr_el1
> add     x7, x7, x10
> ldr     w7, [x7]
>
>
> So, we have removed multiple instructions including a branch and a
> return. I was expecting to see more improvement. This benchmark is taken
> from a KVM based virtual machine, maybe if I do it on bare-metal I would
> see more improvement ?

I see, yeah, I think it might change significantly. I remember that
back when I was benchmarking BPF ringbuf, I was getting very
different results inside QEMU vs. on bare metal. And I don't mean
just in absolute numbers. QEMU/KVM seems to change a lot of things
when it comes to contention, atomic instructions, etc.
Anyway, for benchmarking, always try to use bare metal.

>
> Thanks,
> Puranjay
Puranjay Mohan April 30, 2024, 6:30 p.m. UTC | #4
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Fri, Apr 26, 2024 at 9:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>>
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>
>> > On Fri, Apr 26, 2024 at 5:14 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>> >>
>> >> From: Puranjay Mohan <puranjay12@gmail.com>
>> >>
>> >> Support an instruction for resolving absolute addresses of per-CPU
>> >> data from their per-CPU offsets. This instruction is internal-only and
>> >> users are not allowed to use them directly. They will only be used for
>> >> internal inlining optimizations for now between BPF verifier and BPF
>> >> JITs.
>> >>
>> >> Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu
>> >> access using tpidr_el1"), the per-cpu offset for the CPU is stored in
>> >> the tpidr_el1/2 register of that CPU.
>> >>
>> >> To support this BPF instruction in the ARM64 JIT, the following ARM64
>> >> instructions are emitted:
>> >>
>> >> mov dst, src            // Move src to dst, if src != dst
>> >> mrs tmp, tpidr_el1/2    // Move per-cpu offset of the current cpu in tmp.
>> >> add dst, dst, tmp       // Add the per cpu offset to the dst.
>> >>
>> >> To measure the performance improvement provided by this change, the
>> >> benchmark in [1] was used:
>> >>
>> >> Before:
>> >> glob-arr-inc   :   23.597 ± 0.012M/s
>> >> arr-inc        :   23.173 ± 0.019M/s
>> >> hash-inc       :   12.186 ± 0.028M/s
>> >>
>> >> After:
>> >> glob-arr-inc   :   23.819 ± 0.034M/s
>> >> arr-inc        :   23.285 ± 0.017M/s
>> >
>> > I still expected a better improvement (global-arr-inc's results
>> > improved more than arr-inc, which is completely different from
>> > x86-64), but it's still a good thing to support this for arm64, of
>> > course.
>> >
>> > ack for generic parts I can understand:
>> >
>> > Acked-by: Andrii Nakryiko <andrii@kernel.org>
>> >
>>
>> I will have to do more research to find why we don't see very high
>> improvement.
>>
>> But this is what is happening here:
>>
>> This was the complete picture before inlining:
>>
>> int cpu = bpf_get_smp_processor_id();
>> mov     x10, #0xffffffffffffd4a8
>> movk    x10, #0x802c, lsl #16
>> movk    x10, #0x8000, lsl #32
>> blr     x10 ---------------------------------------> nop
>>                                                      nop
>>                                                      adrp    x0, 0xffff800082128000
>>                                                      mrs     x1, tpidr_el1
>>                                                      add     x0, x0, #0x8
>>                                                      ldrsw   x0, [x0, x1]
>>             <----------------------------------------ret
>> add     x7, x0, #0x0
>>
>>
>> Now we have:
>>
>> int cpu = bpf_get_smp_processor_id();
>> mov     x7, #0xffff8000ffffffff
>> movk    x7, #0x8212, lsl #16
>> movk    x7, #0x8008
>> mrs     x10, tpidr_el1
>> add     x7, x7, x10
>> ldr     w7, [x7]
>>
>>
>> So, we have removed multiple instructions including a branch and a
>> return. I was expecting to see more improvement. This benchmark is taken
>> from a KVM based virtual machine, maybe if I do it on bare-metal I would
>> see more improvement ?
>
> I see, yeah, I think it might change significantly. I remember back
> from times when I was benchmarking BPF ringbuf, I was getting
> very-very different results from inside QEMU vs bare metal. And I
> don't mean just in absolute numbers. QEMU/KVM seems to change a lot of
> things when it comes to contentions, atomic instructions, etc, etc.
> Anyways, for benchmarking, always try to do bare metal.
>

I found the solution to this. I am seeing much better performance when
implementing this inlining in the JIT through another method, similar to
what I did for riscv; see [1].

[1] https://lore.kernel.org/all/20240430175834.33152-3-puranjay@kernel.org/

Will do the same for ARM64 in V5 of this series.

Thanks,
Puranjay
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index db1aeacd4cd9..8de0e39b29f3 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -135,6 +135,11 @@  enum aarch64_insn_special_register {
 	AARCH64_INSN_SPCLREG_SP_EL2	= 0xF210
 };
 
+enum aarch64_insn_system_register {
+	AARCH64_INSN_SYSREG_TPIDR_EL1	= 0x4684,
+	AARCH64_INSN_SYSREG_TPIDR_EL2	= 0x6682,
+};
+
 enum aarch64_insn_variant {
 	AARCH64_INSN_VARIANT_32BIT,
 	AARCH64_INSN_VARIANT_64BIT
@@ -686,6 +691,8 @@  u32 aarch64_insn_gen_cas(enum aarch64_insn_register result,
 }
 #endif
 u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type);
+u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result,
+			 enum aarch64_insn_system_register sysreg);
 
 s32 aarch64_get_branch_offset(u32 insn);
 u32 aarch64_set_branch_offset(u32 insn, s32 offset);
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index a635ab83fee3..b008a9b46a7f 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -1515,3 +1515,14 @@  u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
 
 	return insn;
 }
+
+u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result,
+			 enum aarch64_insn_system_register sysreg)
+{
+	u32 insn = aarch64_insn_get_mrs_value();
+
+	insn &= ~GENMASK(19, 0);
+	insn |= sysreg << 5;
+	return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT,
+					    insn, result);
+}
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index 23b1b34db088..b627ef7188c7 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -297,4 +297,10 @@ 
 #define A64_ADR(Rd, offset) \
 	aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR)
 
+/* MRS */
+#define A64_MRS_TPIDR_EL1(Rt) \
+	aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1)
+#define A64_MRS_TPIDR_EL2(Rt) \
+	aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2)
+
 #endif /* _BPF_JIT_H */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 76b91f36c729..ed8f9716d9d5 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -877,6 +877,15 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 			emit(A64_ORR(1, tmp, dst, tmp), ctx);
 			emit(A64_MOV(1, dst, tmp), ctx);
 			break;
+		} else if (insn_is_mov_percpu_addr(insn)) {
+			if (dst != src)
+				emit(A64_MOV(1, dst, src), ctx);
+			if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+				emit(A64_MRS_TPIDR_EL2(tmp), ctx);
+			else
+				emit(A64_MRS_TPIDR_EL1(tmp), ctx);
+			emit(A64_ADD(1, dst, dst, tmp), ctx);
+			break;
 		}
 		switch (insn->off) {
 		case 0:
@@ -2527,6 +2536,11 @@  bool bpf_jit_supports_arena(void)
 	return true;
 }
 
+bool bpf_jit_supports_percpu_insn(void)
+{
+	return true;
+}
+
 void bpf_jit_free(struct bpf_prog *prog)
 {
 	if (prog->jited) {