
[bpf-next,v4,3/5] bpf: Inline calls to bpf_loop when callback is known

Message ID 20220608192630.3710333-4-eddyz87@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf_loop inlining

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc
bpf/vmtest-bpf-next-PR success PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1432 this patch: 1436
netdev/cc_maintainers warning 6 maintainers not CCed: netdev@vger.kernel.org songliubraving@fb.com yhs@fb.com john.fastabend@gmail.com kafai@fb.com kpsingh@kernel.org
netdev/build_clang fail Errors and warnings before: 166 this patch: 169
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1439 this patch: 1443
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15

Commit Message

Eduard Zingerman June 8, 2022, 7:26 p.m. UTC
Calls to `bpf_loop` are replaced with direct loops to avoid
indirection. E.g. the following:

  bpf_loop(10, foo, NULL, 0);

is replaced by the equivalent of the following:

  for (int i = 0; i < 10; ++i)
    foo(i, NULL);

This transformation can be applied when:
- the callback is known and does not change during program execution;
- the flags passed to `bpf_loop` are always zero (see the sketch after
  this list).
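
For illustration, a minimal BPF-side sketch (not part of the patch) of
one call that fits these conditions and one that does not. The includes,
section name, callback name and the use of skb->mark are hypothetical;
the callback signature follows the convention used in kernel selftests:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

/* returning 0 means "continue looping"; non-zero stops bpf_loop */
static int cb(__u32 index, void *ctx)
{
	return 0;
}

SEC("tc")
int fits_for_inline(struct __sk_buff *skb)
{
	/* constant callback, constant zero flags -> can be inlined */
	bpf_loop(10, cb, NULL, 0);
	return 0;
}

SEC("tc")
int does_not_fit(struct __sk_buff *skb)
{
	/* flags are not provably zero -> the helper call is kept
	 * (bpf_loop also returns -EINVAL at runtime for non-zero flags)
	 */
	bpf_loop(10, cb, NULL, skb->mark);
	return 0;
}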

The inlining logic works as follows:

- During execution simulation, the function `update_loop_inline_state`
  tracks the following information for each `bpf_loop` call
  instruction:
  - is the callback known and constant?
  - are the flags constant and zero?
- The function `optimize_bpf_loop` increases the stack depth for
  functions where `bpf_loop` calls can be inlined and invokes
  `inline_bpf_loop` to apply the inlining. The additional stack space
  is used to spill registers R6, R7 and R8, which serve as the loop
  counter, maximal loop bound and callback context parameter. A rough
  C equivalent of the generated instruction sequence is sketched below.
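
In rough C terms, the instruction sequence emitted by `inline_bpf_loop`
behaves like the sketch below. This is an approximation for readability
only; the authoritative version is the `insn_buf` array in the patch,
and the variable names stand for the spilled-and-reused registers noted
in the comments:

	/* a sketch, not real kernel code */
	if (nr_loops > BPF_MAX_LOOPS)
		return -E2BIG;
	/* R6, R7, R8 are spilled to the new stack slots here */
	max = nr_loops;      /* R6: maximal loop bound */
	cnt = 0;             /* R7: loop counter       */
	ctx = callback_ctx;  /* R8: callback context   */
	while (cnt < max) {
		ret = callback(cnt, ctx);
		cnt++;
		if (ret)     /* non-zero return stops the loop */
			break;
	}
	/* R6, R7, R8 are restored from the stack slots here */
	return cnt;          /* R0: number of iterations */

Note that the flags check from `bpf_iter.c:bpf_loop` is omitted: the
transformation is only applied when the verifier has proven the flags
to be constant zero.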

Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU/KVM on
an i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
---
 include/linux/bpf.h          |   3 +
 include/linux/bpf_verifier.h |  12 +++
 kernel/bpf/bpf_iter.c        |   9 +-
 kernel/bpf/verifier.c        | 168 +++++++++++++++++++++++++++++++++--
 4 files changed, 183 insertions(+), 9 deletions(-)

Comments

kernel test robot June 9, 2022, 2:56 p.m. UTC | #1
Hi Eduard,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Eduard-Zingerman/bpf_loop-inlining/20220609-033007
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: arm-randconfig-r014-20220608 (https://download.01.org/0day-ci/archive/20220609/202206092228.SWNGbTFO-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 971e13d69e3e7b687213fef22952be6a328c426c)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/intel-lab-lkp/linux/commit/10671a6a0479bb7ee4d437c4ce7307f198cb96fd
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Eduard-Zingerman/bpf_loop-inlining/20220609-033007
        git checkout 10671a6a0479bb7ee4d437c4ce7307f198cb96fd
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash kernel/bpf/

If you fix the issue, kindly add the following tag where applicable:
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> kernel/bpf/verifier.c:7111:6: warning: no previous prototype for function 'update_loop_inline_state' [-Wmissing-prototypes]
   void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
        ^
   kernel/bpf/verifier.c:7111:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
   ^
   static 
>> kernel/bpf/verifier.c:14318:18: warning: no previous prototype for function 'inline_bpf_loop' [-Wmissing-prototypes]
   struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
                    ^
   kernel/bpf/verifier.c:14318:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
   ^
   static 
   2 warnings generated.


vim +/update_loop_inline_state +7111 kernel/bpf/verifier.c

  7110	
> 7111	void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
  7112	{
  7113		struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
  7114		struct bpf_reg_state *regs = cur_regs(env);
  7115		struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
  7116	
  7117		int flags_is_zero =
  7118			register_is_const(flags_reg) && flags_reg->var_off.value == 0;
  7119	
  7120		if (state->initialized) {
  7121			state->fit_for_inline &=
  7122				flags_is_zero &&
  7123				state->callback_subprogno == subprogno;
  7124		} else {
  7125			state->initialized = 1;
  7126			state->fit_for_inline = flags_is_zero;
  7127			state->callback_subprogno = subprogno;
  7128		}
  7129	}
  7130
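
The fix the robot asks for is mechanical; a sketch of the two
definitions with the missing qualifier applied (this is also what Song
requests in the review below):

static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
	/* body unchanged */
}

static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
					int position,
					s32 stack_base,
					u32 callback_subprogno,
					u32 *cnt)
{
	/* body unchanged */
}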
Song Liu June 10, 2022, 8:54 p.m. UTC | #2
On Wed, Jun 8, 2022 at 12:27 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
[...]

>
> Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
> ---
>  include/linux/bpf.h          |   3 +
>  include/linux/bpf_verifier.h |  12 +++
>  kernel/bpf/bpf_iter.c        |   9 +-
>  kernel/bpf/verifier.c        | 168 +++++++++++++++++++++++++++++++++--
>  4 files changed, 183 insertions(+), 9 deletions(-)

[...]

> +struct bpf_loop_inline_state {
> +       bool initialized; /* set to true upon first entry */
> +       bool fit_for_inline; /* true if callback function is the same
> +                             * at each call and flags are always zero
> +                             */
> +       u32 callback_subprogno; /* valid when fit_for_inline is true */
> +};

nit: We only need one bit each for initialized and fit_for_inline.

> +
>  /* Possible states for alu_state member. */
>  #define BPF_ALU_SANITIZE_SRC           (1U << 0)
>  #define BPF_ALU_SANITIZE_DST           (1U << 1)
> @@ -373,6 +381,10 @@ struct bpf_insn_aux_data {
>                                 u32 mem_size;   /* mem_size for non-struct typed var */
>                         };

[...]

> +
> +void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)

static void ...

> +{
> +       struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> +       struct bpf_reg_state *regs = cur_regs(env);
> +       struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
> +

nit: we usually don't have empty lines here.

> +       int flags_is_zero =
> +               register_is_const(flags_reg) && flags_reg->var_off.value == 0;

If we replace "fit_for_inline" with "not_fit_for_inline", we can make
the cannot-inline case faster with:

  if (state->not_fit_for_inline)
      return;

> +
> +       if (state->initialized) {
> +               state->fit_for_inline &=
> +                       flags_is_zero &&
> +                       state->callback_subprogno == subprogno;
> +       } else {
> +               state->initialized = 1;
> +               state->fit_for_inline = flags_is_zero;
> +               state->callback_subprogno = subprogno;
> +       }
> +}
> +
[...]

>
> +struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
> +                                int position,
> +                                s32 stack_base,
> +                                u32 callback_subprogno,
> +                                u32 *cnt)

missing static

> +{
> +       s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
> +       s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
> +       s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
> +       int reg_loop_max = BPF_REG_6;
> +       int reg_loop_cnt = BPF_REG_7;
> +       int reg_loop_ctx = BPF_REG_8;
> +
[...]
Eduard Zingerman June 10, 2022, 9:54 p.m. UTC | #3
> On Fri, 2022-06-10 at 13:54 -0700, Song Liu wrote:

> > +
> > +void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> 
> static void ...
> 
> > +{
> > +       struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> > +       struct bpf_reg_state *regs = cur_regs(env);
> > +       struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
> > +
> 
> nit: we usually don't have empty lines here.
> 
> > +       int flags_is_zero =
> > +               register_is_const(flags_reg) && flags_reg->var_off.value == 0;
> 
> If we replace "fit_for_inline" with "not_fit_for_inline", we can make
> the cannot-inline case faster with:
> 
>   if (state->not_fit_for_inline)
>       return;
> 
> > +
> > +       if (state->initialized) {
> > +               state->fit_for_inline &=
> > +                       flags_is_zero &&
> > +                       state->callback_subprogno == subprogno;
> > +       } else {
> > +               state->initialized = 1;
> > +               state->fit_for_inline = flags_is_zero;
> > +               state->callback_subprogno = subprogno;
> > +       }
> > +}
> > +

Sorry, I'm not sure that I understand you correctly. Do you want me to
rewrite the code as follows:

struct bpf_loop_inline_state {
	int initialized:1; /* set to true upon first entry */
	int not_fit_for_inline:1; /* false if callback function is the same
				   * at each call and flags are always zero
				   */
	u32 callback_subprogno; /* valid when fit_for_inline is true */
};

static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
	int flags_is_zero =
		register_is_const(flags_reg) && flags_reg->var_off.value == 0;

	if (state->not_fit_for_inline)
		return;

	if (state->initialized) {
		state->not_fit_for_inline |=
			!flags_is_zero ||
			state->callback_subprogno != subprogno;
	} else {
		state->initialized = 1;
		state->not_fit_for_inline = !flags_is_zero;
		state->callback_subprogno = subprogno;
	}
}

// ...

static int optimize_bpf_loop(struct bpf_verifier_env *env)
{
	// ...
		if (is_bpf_loop_call(insn) && !inline_state->not_fit_for_inline) {
	// ...
}

IMO, the code is less clear after such a rewrite; also,
`update_loop_inline_state` is not on a hot path (it is called from
`check_helper_call` only when the helper is `bpf_loop`). Are you sure
this rewrite is necessary?

Thanks,
Eduard
Song Liu June 10, 2022, 10:40 p.m. UTC | #4
On Fri, Jun 10, 2022 at 2:55 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> > On Fri, 2022-06-10 at 13:54 -0700, Song Liu wrote:
>
> > > +
> > > +void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> >
> > static void ...
> >
> > > +{
> > > +       struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> > > +       struct bpf_reg_state *regs = cur_regs(env);
> > > +       struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
> > > +
> >
> > nit: we usually don't have empty lines here.
> >
> > > +       int flags_is_zero =
> > > +               register_is_const(flags_reg) && flags_reg->var_off.value == 0;
> >
> > If we replace "fit_for_inline" with "not_fit_for_inline", we can make
> > the cannot-inline case faster with:
> >
> >   if (state->not_fit_for_inline)
> >       return;
> >
> > > +
> > > +       if (state->initialized) {
> > > +               state->fit_for_inline &=
> > > +                       flags_is_zero &&
> > > +                       state->callback_subprogno == subprogno;
> > > +       } else {
> > > +               state->initialized = 1;
> > > +               state->fit_for_inline = flags_is_zero;
> > > +               state->callback_subprogno = subprogno;
> > > +       }
> > > +}
> > > +
>
> Sorry, I'm not sure that I understand you correctly. Do you want me to
> rewrite the code as follows:

Yes, I was thinking about this change. I guess this version can also be clear:

static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
        struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
        int flags_is_zero;

        if (state->cannot_inline)
                return;

        flags_is_zero = register_is_const(flags_reg) &&
                        flags_reg->var_off.value == 0;

        if (!state->initialized) {
                state->initialized = 1;
                state->cannot_inline = !flags_is_zero;
                state->callback_subprogno = subprogno;
                return;
        }

        state->cannot_inline = !flags_is_zero ||
                state->callback_subprogno != subprogno;
}

What do you think about this version?

Thanks,
Song
Eduard Zingerman June 10, 2022, 10:49 p.m. UTC | #5
> On Fri, 2022-06-10 at 15:40 -0700, Song Liu wrote:
> Yes, I was thinking about this change. I guess it can also be clear:
> 
> static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> {
>         struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
>         struct bpf_reg_state *regs = cur_regs(env);
>         struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
>         int flags_is_zero;
> 
>         if (state->cannot_inline)
>                 return;
> 
>         flags_is_zero = register_is_const(flags_reg) &&
>                         flags_reg->var_off.value == 0;
> 
>         if (!state->initialized) {
>                 state->initialized = 1;
>                 state->cannot_inline = !flags_is_zero;
>                 state->callback_subprogno = subprogno;
>                 return;
>         }
> 
>         state->cannot_inline = !flags_is_zero ||
>                 state->callback_subprogno != subprogno;
> }
> 
> What do you think about this version?

Maybe keep `fit_for_inline` to minimize the number of negations?
As below:

static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
        struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
        int flags_is_zero;

        /* this should be compiled as a single instruction anyway */
        if (!state->fit_for_inline)
                return;

        flags_is_zero = register_is_const(flags_reg) && flags_reg->var_off.value == 0;

        if (!state->initialized) {
                state->initialized = 1;
                state->fit_for_inline = flags_is_zero;
                state->callback_subprogno = subprogno;
                return;
        }

        state->fit_for_inline = flags_is_zero &&
                state->callback_subprogno == subprogno;
}

// ...

static int optimize_bpf_loop(struct bpf_verifier_env *env)
{
	if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { ... }
// vs
	if (is_bpf_loop_call(insn) && !inline_state->cannot_inline) { ... }
}

Thanks,
Eduard
Song Liu June 10, 2022, 11:01 p.m. UTC | #6
On Fri, Jun 10, 2022 at 3:49 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
[...]
> >
> >         state->cannot_inline = !flags_is_zero ||
> >                 state->callback_subprogno != subprogno;
> > }
> >
> > What do you think about this version?
>
> Maybe keep `fit_for_inline` to minimize the number of negations?
> As below:
>
> static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> {
>         struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
>         struct bpf_reg_state *regs = cur_regs(env);
>         struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
>         int flags_is_zero;
>
>         /* this should be compiled as a single instruction anyway */
>         if (!state->fit_for_inline)
>                 return;

In this case, we need to initialize fit_for_inline to true, no?

Thanks,
Song

>
>         flags_is_zero = register_is_const(flags_reg) && flags_reg->var_off.value == 0;
>
>         if (!state->initialized) {
>                 state->initialized = 1;
>                 state->fit_for_inline = flags_is_zero;
>                 state->callback_subprogno = subprogno;
>                 return;
>         }
>
>         state->fit_for_inline = flags_is_zero &&
>                 state->callback_subprogno == subprogno;
> }
>
> // ...
>
> static int optimize_bpf_loop(struct bpf_verifier_env *env)
> {
>         if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { ... }
> // vs
>         if (is_bpf_loop_call(insn) && !inline_state->cannot_inline) { ... }
> }
>
> Thanks,
> Eduard
>
Eduard Zingerman June 10, 2022, 11:21 p.m. UTC | #7
> On Fri, 2022-06-10 at 16:01 -0700, Song Liu wrote:
> 
> In this case, we need to initialize fit_for_inline to true, no?

You are right...

My last try is below; if you don't like it, I'll proceed with your
version. I just really don't like the "not-cannot" part in the
expression "!inline_state->cannot_inline" :)

static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
	int flags_is_zero =
		register_is_const(flags_reg) && flags_reg->var_off.value == 0;

	if (!state->initialized) {
		state->initialized = 1;
		state->fit_for_inline = flags_is_zero;
		state->callback_subprogno = subprogno;
		return;
	}

	if (!state->fit_for_inline)
		return;

	state->fit_for_inline =
		flags_is_zero &&
		state->callback_subprogno == subprogno;
}

static int optimize_bpf_loop(struct bpf_verifier_env *env)
{
	// ...
	if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { ... }
	// ...
}

Thanks,
Eduard
Song Liu June 11, 2022, 1:46 a.m. UTC | #8
On Fri, Jun 10, 2022 at 4:21 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> > On Fri, 2022-06-10 at 16:01 -0700, Song Liu wrote:
> >
> > In this case, we need to initialize fit_for_inline to true, no?
>
> You are right...
>
> My Last try is below, if you don't like it I'll proceed with your
> version.  I just really don't like the "not-cannot" part in the
> expression "!inline_state->cannot_inline" :)

Yeah, this version looks good to me.

Thanks,
Song

>
> static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> {
>         struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
>         struct bpf_reg_state *regs = cur_regs(env);
>         struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
>         int flags_is_zero =
>                 register_is_const(flags_reg) && flags_reg->var_off.value == 0;
>
>         if (!state->initialized) {
>                 state->initialized = 1;
>                 state->fit_for_inline = flags_is_zero;
>                 state->callback_subprogno = subprogno;
>                 return;
>         }
>
>         if (!state->fit_for_inline)
>                 return;
>
>         state->fit_for_inline =
>                 flags_is_zero &&
>                 state->callback_subprogno == subprogno;
> }
>
> static int optimize_bpf_loop(struct bpf_verifier_env *env)
> {
>         // ...
>         if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { ... }
>         // ...
> }
>
> Thanks,
> Eduard
>

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e6092d0ea95..3c75ede138b5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1236,6 +1236,9 @@  struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
+/* Maximum number of loops for bpf_loop */
+#define BPF_MAX_LOOPS	BIT(23)
+
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
 				 BPF_F_WRONLY |		\
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e8439f6cbe57..5df09fc46043 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -344,6 +344,14 @@  struct bpf_verifier_state_list {
 	int miss_cnt, hit_cnt;
 };
 
+struct bpf_loop_inline_state {
+	bool initialized; /* set to true upon first entry */
+	bool fit_for_inline; /* true if callback function is the same
+			      * at each call and flags are always zero
+			      */
+	u32 callback_subprogno; /* valid when fit_for_inline is true */
+};
+
 /* Possible states for alu_state member. */
 #define BPF_ALU_SANITIZE_SRC		(1U << 0)
 #define BPF_ALU_SANITIZE_DST		(1U << 1)
@@ -373,6 +381,10 @@  struct bpf_insn_aux_data {
 				u32 mem_size;	/* mem_size for non-struct typed var */
 			};
 		} btf_var;
+		/* if instruction is a call to bpf_loop this field tracks
+		 * the state of the relevant registers to make decision about inlining
+		 */
+		struct bpf_loop_inline_state loop_inline_state;
 	};
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index d5d96ceca105..7e8fd49406f6 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -723,9 +723,6 @@  const struct bpf_func_proto bpf_for_each_map_elem_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-/* maximum number of loops */
-#define MAX_LOOPS	BIT(23)
-
 BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	   u64, flags)
 {
@@ -733,9 +730,13 @@  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	u64 ret;
 	u32 i;
 
+	/* Note: these safety checks are also verified when bpf_loop
+	 * is inlined, be careful to modify this code in sync. See
+	 * function verifier.c:inline_bpf_loop.
+	 */
 	if (flags)
 		return -EINVAL;
-	if (nr_loops > MAX_LOOPS)
+	if (nr_loops > BPF_MAX_LOOPS)
 		return -E2BIG;
 
 	for (i = 0; i < nr_loops; i++) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2d2872682278..81291dfa63cf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7103,6 +7103,31 @@  static int check_get_func_ip(struct bpf_verifier_env *env)
 	return -ENOTSUPP;
 }
 
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+	return &env->insn_aux_data[env->insn_idx];
+}
+
+void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
+
+	int flags_is_zero =
+		register_is_const(flags_reg) && flags_reg->var_off.value == 0;
+
+	if (state->initialized) {
+		state->fit_for_inline &=
+			flags_is_zero &&
+			state->callback_subprogno == subprogno;
+	} else {
+		state->initialized = 1;
+		state->fit_for_inline = flags_is_zero;
+		state->callback_subprogno = subprogno;
+	}
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -7255,6 +7280,7 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		err = check_bpf_snprintf_call(env, regs);
 		break;
 	case BPF_FUNC_loop:
+		update_loop_inline_state(env, meta.subprogno);
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_loop_callback_state);
 		break;
@@ -7661,11 +7687,6 @@  static bool check_reg_sane_offset(struct bpf_verifier_env *env,
 	return true;
 }
 
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
-	return &env->insn_aux_data[env->insn_idx];
-}
-
 enum {
 	REASON_BOUNDS	= -1,
 	REASON_TYPE	= -2,
@@ -14294,6 +14315,140 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 	return 0;
 }
 
+struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+				 int position,
+				 s32 stack_base,
+				 u32 callback_subprogno,
+				 u32 *cnt)
+{
+	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+	int reg_loop_max = BPF_REG_6;
+	int reg_loop_cnt = BPF_REG_7;
+	int reg_loop_ctx = BPF_REG_8;
+
+	struct bpf_prog *new_prog;
+	u32 callback_start;
+	u32 call_insn_offset;
+	s32 callback_offset;
+
+	/* This represents an inlined version of bpf_iter.c:bpf_loop,
+	 * be careful to modify this code in sync.
+	 */
+	struct bpf_insn insn_buf[] = {
+		/* Return error and jump to the end of the patch if
+		 * expected number of iterations is too big.
+		 */
+		BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+		BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+		/* spill R6, R7, R8 to use these as loop vars */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+		/* initialize loop vars */
+		BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+		BPF_MOV32_IMM(reg_loop_cnt, 0),
+		BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+		/* loop header,
+		 * if reg_loop_cnt >= reg_loop_max skip the loop body
+		 */
+		BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+		/* callback call,
+		 * correct callback offset would be set after patching
+		 */
+		BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+		BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+		BPF_CALL_REL(0),
+		/* increment loop counter */
+		BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+		/* jump to loop header if callback returned 0 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+		/* return value of bpf_loop,
+		 * set R0 to the number of iterations
+		 */
+		BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+		/* restore original values of R6, R7, R8 */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+	};
+
+	*cnt = ARRAY_SIZE(insn_buf);
+	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+	if (!new_prog)
+		return new_prog;
+
+	/* callback start is known only after patching */
+	callback_start = env->subprog_info[callback_subprogno].start;
+	/* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+	call_insn_offset = position + 12;
+	callback_offset = callback_start - call_insn_offset - 1;
+	env->prog->insnsi[call_insn_offset].imm = callback_offset;
+
+	return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+		insn->src_reg == 0 &&
+		insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8.  These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	int i, cur_subprog = 0, cnt, delta = 0;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	u16 stack_depth = subprogs[cur_subprog].stack_depth;
+	u16 stack_depth_extra = 0;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		struct bpf_loop_inline_state *inline_state =
+			&env->insn_aux_data[i + delta].loop_inline_state;
+
+		if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+			struct bpf_prog *new_prog;
+
+			stack_depth_extra = BPF_REG_SIZE * 3;
+			new_prog = inline_bpf_loop(env,
+						   i + delta,
+						   -(stack_depth + stack_depth_extra),
+						   inline_state->callback_subprogno,
+						   &cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta     += cnt - 1;
+			env->prog  = new_prog;
+			insn       = new_prog->insnsi + i + delta;
+		}
+
+		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+			subprogs[cur_subprog].stack_depth += stack_depth_extra;
+			cur_subprog++;
+			stack_depth = subprogs[cur_subprog].stack_depth;
+			stack_depth_extra = 0;
+		}
+	}
+
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+	return 0;
+}
+
 static void free_states(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state_list *sl, *sln;
@@ -15030,6 +15185,9 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 	if (ret == 0)
 		ret = check_max_stack_depth(env);
 
+	if (ret == 0)
+		optimize_bpf_loop(env);
+
 	/* instruction rewrites happen after this point */
 	if (is_priv) {
 		if (ret == 0)