
[bpf-next,2/3] bpf: Add verifier checks for bpf_timer.

Message ID 20210527040259.77823-3-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf: Introduce BPF timers.

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers warning 9 maintainers not CCed: haoluo@google.com yhs@fb.com kpsingh@kernel.org jolsa@kernel.org kafai@fb.com ast@kernel.org john.fastabend@gmail.com songliubraving@fb.com alan.maguire@oracle.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 9983 this patch: 9981
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch warning CHECK: Alignment should match open parenthesis; CHECK: Blank lines aren't necessary after an open brace '{'; WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP; WARNING: lines of 81, 83, 92, and 94 characters exceed 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 10397 this patch: 10395
netdev/header_inline success

Commit Message

Alexei Starovoitov May 27, 2021, 4:02 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Add appropriate safety checks for bpf_timer:
- restrict to array, hash, lru. per-cpu maps cannot be supported.
- kfree bpf_timer during map_delete_elem and map_free.
- verifier btf checks.
- safe interaction with lookup/update/delete operations and iterator.
- relax the first field only requirement of the previous patch.
- allow bpf_timer in global data and search for it in datasec.
- check prog_rdonly, frozen flags.
- mmap is allowed. otherwise global timer is not possible.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h        | 36 +++++++++++++-----
 include/linux/btf.h        |  1 +
 kernel/bpf/arraymap.c      |  7 ++++
 kernel/bpf/btf.c           | 77 +++++++++++++++++++++++++++++++-------
 kernel/bpf/hashtab.c       | 53 ++++++++++++++++++++------
 kernel/bpf/helpers.c       |  2 +-
 kernel/bpf/local_storage.c |  4 +-
 kernel/bpf/syscall.c       | 23 ++++++++++--
 kernel/bpf/verifier.c      | 30 +++++++++++++--
 9 files changed, 190 insertions(+), 43 deletions(-)

Comments

Andrii Nakryiko June 2, 2021, 10:34 p.m. UTC | #1
On Wed, May 26, 2021 at 9:03 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Add appropriate safety checks for bpf_timer:
> - restrict to array, hash, lru. per-cpu maps cannot be supported.
> - kfree bpf_timer during map_delete_elem and map_free.
> - verifier btf checks.
> - safe interaction with lookup/update/delete operations and iterator.
> - relax the first field only requirement of the previous patch.
> - allow bpf_timer in global data and search for it in datasec.
> - check prog_rdonly, frozen flags.
> - mmap is allowed. otherwise global timer is not possible.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf.h        | 36 +++++++++++++-----
>  include/linux/btf.h        |  1 +
>  kernel/bpf/arraymap.c      |  7 ++++
>  kernel/bpf/btf.c           | 77 +++++++++++++++++++++++++++++++-------
>  kernel/bpf/hashtab.c       | 53 ++++++++++++++++++++------
>  kernel/bpf/helpers.c       |  2 +-
>  kernel/bpf/local_storage.c |  4 +-
>  kernel/bpf/syscall.c       | 23 ++++++++++--
>  kernel/bpf/verifier.c      | 30 +++++++++++++--
>  9 files changed, 190 insertions(+), 43 deletions(-)
>

[...]

>  /* copy everything but bpf_spin_lock */
>  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
>  {
> +       u32 off = 0, size = 0;
> +
>         if (unlikely(map_value_has_spin_lock(map))) {
> -               u32 off = map->spin_lock_off;
> +               off = map->spin_lock_off;
> +               size = sizeof(struct bpf_spin_lock);
> +       } else if (unlikely(map_value_has_timer(map))) {
> +               off = map->timer_off;
> +               size = sizeof(struct bpf_timer);
> +       }

so the need to handle 0, 1, or 2 gaps seems to be the only reason to
disallow both bpf_spinlock and bpf_timer in one map element, right?
Isn't it worth addressing it from the very beginning to lift the
artificial restriction? E.g., for speed, you'd do:

if (likely(neither spinlock nor timer)) {
 /* fastest pass */
} else if (only one of spinlock or timer) {
  /* do what you do here */
} else {
  int off1, off2, sz1, sz2;

  if (spinlock_off < timer_off) {
    off1 = spinlock_off;
    sz1 = spinlock_sz;
    off2 = timer_off;
    sz2 = timer_sz;
  } else {
    ... you get the idea
  }

  memcpy(0, off1);
  memcpy(off1+sz1, off2);
  memcpy(off2+sz2, total_sz);
}

It's not that bad, right?
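
(Filled out as a self-contained sketch, the two-gap copy described above could look like the following. It is an illustration against the struct bpf_map fields this patch adds; the helper name copy_map_value_2gaps and the exact branch layout are illustrative, not part of the series.)

/* Illustration only: copy a map value around up to two non-copyable
 * regions (a bpf_spin_lock and/or a bpf_timer). Assumes the
 * spin_lock_off/timer_off fields and the map_value_has_*() helpers
 * from this patch.
 */
static inline void copy_map_value_2gaps(struct bpf_map *map, void *dst, void *src)
{
	u32 off1 = 0, sz1 = 0, off2 = 0, sz2 = 0;

	if (unlikely(map_value_has_spin_lock(map))) {
		off1 = map->spin_lock_off;
		sz1 = sizeof(struct bpf_spin_lock);
	}
	if (unlikely(map_value_has_timer(map))) {
		u32 t_off = map->timer_off;
		u32 t_sz = sizeof(struct bpf_timer);

		if (!sz1) {			/* timer only */
			off1 = t_off;
			sz1 = t_sz;
		} else if (t_off > off1) {	/* lock first, then timer */
			off2 = t_off;
			sz2 = t_sz;
		} else {			/* timer first, then lock: swap */
			off2 = off1;
			sz2 = sz1;
			off1 = t_off;
			sz1 = t_sz;
		}
	}
	if (likely(!sz1)) {		/* no gaps: one plain copy */
		memcpy(dst, src, map->value_size);
	} else if (!sz2) {		/* one gap */
		memcpy(dst, src, off1);
		memcpy(dst + off1 + sz1, src + off1 + sz1,
		       map->value_size - off1 - sz1);
	} else {			/* two gaps, off1 < off2 guaranteed */
		memcpy(dst, src, off1);
		memcpy(dst + off1 + sz1, src + off1 + sz1,
		       off2 - off1 - sz1);
		memcpy(dst + off2 + sz2, src + off2 + sz2,
		       map->value_size - off2 - sz2);
	}
}

Sorting the two regions up front keeps a single three-memcpy tail, and the common no-gap case still takes the one-memcpy fast path.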

>
> +       if (unlikely(size)) {
>                 memcpy(dst, src, off);
> -               memcpy(dst + off + sizeof(struct bpf_spin_lock),
> -                      src + off + sizeof(struct bpf_spin_lock),
> -                      map->value_size - off - sizeof(struct bpf_spin_lock));
> +               memcpy(dst + off + size,
> +                      src + off + size,
> +                      map->value_size - off - size);
>         } else {
>                 memcpy(dst, src, map->value_size);
>         }

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f386f85aee5c..0a828dc4968e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
>                         return -EACCES;
>                 }
>         }
> +       if (map_value_has_timer(map)) {
> +               u32 t = map->timer_off;
> +
> +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&

<= ? Otherwise we allow accessing the first byte, unless I'm mistaken.

> +                    t < reg->umax_value + off + size) {
> +                       verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
> +                       return -EACCES;
> +               }
> +       }
>         return err;
>  }
>
> @@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
>                         map->name);
>                 return -EINVAL;
>         }
> -       if (val) {
> -               /* todo: relax this requirement */
> -               verbose(env, "bpf_timer field can only be first in the map value element\n");

ok, this was confusing, but now I see why you did that...

> +       if (!map_value_has_timer(map)) {
> +               if (map->timer_off == -E2BIG)
> +                       verbose(env,
> +                               "map '%s' has more than one 'struct bpf_timer'\n",
> +                               map->name);
> +               else if (map->timer_off == -ENOENT)
> +                       verbose(env,
> +                               "map '%s' doesn't have 'struct bpf_timer'\n",
> +                               map->name);
> +               else
> +                       verbose(env,
> +                               "map '%s' is not a struct type or bpf_timer is mangled\n",
> +                               map->name);
> +               return -EINVAL;
> +       }
> +       if (map->timer_off != val + reg->off) {
> +               verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
> +                       val + reg->off, map->timer_off);
>                 return -EINVAL;
>         }
>         WARN_ON(meta->map_ptr);
> --
> 2.30.2
>
Andrii Nakryiko June 2, 2021, 10:38 p.m. UTC | #2
On Wed, Jun 2, 2021 at 3:34 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, May 26, 2021 at 9:03 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Add appropriate safety checks for bpf_timer:
> > - restrict to array, hash, lru. per-cpu maps cannot be supported.
> > - kfree bpf_timer during map_delete_elem and map_free.
> > - verifier btf checks.
> > - safe interaction with lookup/update/delete operations and iterator.
> > - relax the first field only requirement of the previous patch.
> > - allow bpf_timer in global data and search for it in datasec.

I'll mention it here for completeness. I don't think the safety
implications are worth it to support timers or spinlocks in
memory-mapped maps. It's way too easy to abuse them (or even
accidentally corrupt kernel state). Sure, it's nice, but doing an
explicit single-element map for a "global" timer is just fine. And it
generalizes nicely to having 2, 3, ..., N timers.
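
(For concreteness, such an explicit single-element timer map would look roughly like this on the BPF program side. This is a sketch assuming the struct bpf_timer UAPI type added earlier in the series; the map name, program, and section are made up for illustration.)

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct elem {
	struct bpf_timer t;	/* found via btf_find_timer() in the value type */
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);	/* one element acts as the "global" timer */
	__type(key, int);
	__type(value, struct elem);
} timer_map SEC(".maps");

SEC("xdp")
int probe(struct xdp_md *ctx)
{
	int key = 0;
	struct elem *e = bpf_map_lookup_elem(&timer_map, &key);

	if (e) {
		/* &e->t is what the bpf_timer_* helpers would operate on */
	}
	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";

Scaling to N timers is then just a matter of max_entries = N.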

> > - check prog_rdonly, frozen flags.
> > - mmap is allowed. otherwise global timer is not possible.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf.h        | 36 +++++++++++++-----
> >  include/linux/btf.h        |  1 +
> >  kernel/bpf/arraymap.c      |  7 ++++
> >  kernel/bpf/btf.c           | 77 +++++++++++++++++++++++++++++++-------
> >  kernel/bpf/hashtab.c       | 53 ++++++++++++++++++++------
> >  kernel/bpf/helpers.c       |  2 +-
> >  kernel/bpf/local_storage.c |  4 +-
> >  kernel/bpf/syscall.c       | 23 ++++++++++--
> >  kernel/bpf/verifier.c      | 30 +++++++++++++--
> >  9 files changed, 190 insertions(+), 43 deletions(-)
> >
>
> [...]
>
> >  /* copy everything but bpf_spin_lock */
> >  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
> >  {
> > +       u32 off = 0, size = 0;
> > +
> >         if (unlikely(map_value_has_spin_lock(map))) {
> > -               u32 off = map->spin_lock_off;
> > +               off = map->spin_lock_off;
> > +               size = sizeof(struct bpf_spin_lock);
> > +       } else if (unlikely(map_value_has_timer(map))) {
> > +               off = map->timer_off;
> > +               size = sizeof(struct bpf_timer);
> > +       }
>
> so the need to handle 0, 1, or 2 gaps seems to be the only reason to
> disallow both bpf_spinlock and bpf_timer in one map element, right?
> Isn't it worth addressing it from the very beginning to lift the
> artificial restriction? E.g., for speed, you'd do:
>
> if (likely(neither spinlock nor timer)) {
>  /* fastest pass */
> } else if (only one of spinlock or timer) {
>   /* do what you do here */
> } else {
>   int off1, off2, sz1, sz2;
>
>   if (spinlock_off < timer_off) {
>     off1 = spinlock_off;
>     sz1 = spinlock_sz;
>     off2 = timer_off;
>     sz2 = timer_sz;
>   } else {
>     ... you get the idea
>   }
>
>   memcpy(0, off1);
>   memcpy(off1+sz1, off2);
>   memcpy(off2+sz2, total_sz);
> }
>
> It's not that bad, right?
>
> >
> > +       if (unlikely(size)) {
> >                 memcpy(dst, src, off);
> > -               memcpy(dst + off + sizeof(struct bpf_spin_lock),
> > -                      src + off + sizeof(struct bpf_spin_lock),
> > -                      map->value_size - off - sizeof(struct bpf_spin_lock));
> > +               memcpy(dst + off + size,
> > +                      src + off + size,
> > +                      map->value_size - off - size);
> >         } else {
> >                 memcpy(dst, src, map->value_size);
> >         }
>
> [...]
>
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index f386f85aee5c..0a828dc4968e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
> >                         return -EACCES;
> >                 }
> >         }
> > +       if (map_value_has_timer(map)) {
> > +               u32 t = map->timer_off;
> > +
> > +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
>
> <= ? Otherwise we allow accessing the first byte, unless I'm mistaken.
>
> > +                    t < reg->umax_value + off + size) {
> > +                       verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
> > +                       return -EACCES;
> > +               }
> > +       }
> >         return err;
> >  }
> >
> > @@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
> >                         map->name);
> >                 return -EINVAL;
> >         }
> > -       if (val) {
> > -               /* todo: relax this requirement */
> > -               verbose(env, "bpf_timer field can only be first in the map value element\n");
>
> ok, this was confusing, but now I see why you did that...
>
> > +       if (!map_value_has_timer(map)) {
> > +               if (map->timer_off == -E2BIG)
> > +                       verbose(env,
> > +                               "map '%s' has more than one 'struct bpf_timer'\n",
> > +                               map->name);
> > +               else if (map->timer_off == -ENOENT)
> > +                       verbose(env,
> > +                               "map '%s' doesn't have 'struct bpf_timer'\n",
> > +                               map->name);
> > +               else
> > +                       verbose(env,
> > +                               "map '%s' is not a struct type or bpf_timer is mangled\n",
> > +                               map->name);
> > +               return -EINVAL;
> > +       }
> > +       if (map->timer_off != val + reg->off) {
> > +               verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
> > +                       val + reg->off, map->timer_off);
> >                 return -EINVAL;
> >         }
> >         WARN_ON(meta->map_ptr);
> > --
> > 2.30.2
> >
Alexei Starovoitov June 3, 2021, 2:04 a.m. UTC | #3
On Wed, Jun 02, 2021 at 03:34:29PM -0700, Andrii Nakryiko wrote:
> 
> >  /* copy everything but bpf_spin_lock */
> >  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
> >  {
> > +       u32 off = 0, size = 0;
> > +
> >         if (unlikely(map_value_has_spin_lock(map))) {
> > -               u32 off = map->spin_lock_off;
> > +               off = map->spin_lock_off;
> > +               size = sizeof(struct bpf_spin_lock);
> > +       } else if (unlikely(map_value_has_timer(map))) {
> > +               off = map->timer_off;
> > +               size = sizeof(struct bpf_timer);
> > +       }
> 
> so the need to handle 0, 1, or 2 gaps seems to be the only reason to
> disallow both bpf_spinlock and bpf_timer in one map element, right?

exactly.

> Isn't it worth addressing it from the very beginning to lift the
> artificial restriction? E.g., for speed, you'd do:
> 
> if (likely(neither spinlock nor timer)) {
>  /* fastest pass */
> } else if (only one of spinlock or timer) {
>   /* do what you do here */
> } else {
>   int off1, off2, sz1, sz2;
> 
>   if (spinlock_off < timer_off) {
>     off1 = spinlock_off;
>     sz1 = spinlock_sz;
>     off2 = timer_off;
>     sz2 = timer_sz;
>   } else {
>     ... you get the idea

Not really :)
Are you suggesting to support one bpf_spin_lock and one
bpf_timer inside a single map element, but not two spin_locks
and/or two bpf_timers?
To me it's either one or support any.
Anything in between doesn't seem worth the extra code.

> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index f386f85aee5c..0a828dc4968e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
> >                         return -EACCES;
> >                 }
> >         }
> > +       if (map_value_has_timer(map)) {
> > +               u32 t = map->timer_off;
> > +
> > +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
> 
> <= ? Otherwise we allow accessing the first byte, unless I'm mistaken.

I don't think so. See the comment above, in the if (map_value_has_spin_lock(map)) branch.
I didn't copy-paste it because it's the same logic.

> > -       if (val) {
> > -               /* todo: relax this requirement */
> > -               verbose(env, "bpf_timer field can only be first in the map value element\n");
> 
> ok, this was confusing, but now I see why you did that...

I'll clarify the comment to say that the next patch fixes it.
Andrii Nakryiko June 3, 2021, 5:35 p.m. UTC | #4
On Wed, Jun 2, 2021 at 7:04 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jun 02, 2021 at 03:34:29PM -0700, Andrii Nakryiko wrote:
> >
> > >  /* copy everything but bpf_spin_lock */
> > >  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
> > >  {
> > > +       u32 off = 0, size = 0;
> > > +
> > >         if (unlikely(map_value_has_spin_lock(map))) {
> > > -               u32 off = map->spin_lock_off;
> > > +               off = map->spin_lock_off;
> > > +               size = sizeof(struct bpf_spin_lock);
> > > +       } else if (unlikely(map_value_has_timer(map))) {
> > > +               off = map->timer_off;
> > > +               size = sizeof(struct bpf_timer);
> > > +       }
> >
> > so the need to handle 0, 1, or 2 gaps seems to be the only reason to
> > disallow both bpf_spinlock and bpf_timer in one map element, right?
>
> exactly.
>
> > Isn't it worth addressing it from the very beginning to lift the
> > artificial restriction? E.g., for speed, you'd do:
> >
> > if (likely(neither spinlock nor timer)) {
> >  /* fastest pass */
> > } else if (only one of spinlock or timer) {
> >   /* do what you do here */
> > } else {
> >   int off1, off2, sz1, sz2;
> >
> >   if (spinlock_off < timer_off) {
> >     off1 = spinlock_off;
> >     sz1 = spinlock_sz;
> >     off2 = timer_off;
> >     sz2 = timer_sz;
> >   } else {
> >     ... you get the idea
>
> Not really :)

hm, really? I meant that the else branch will be:

     off1 = timer_off;
     sz1 = timer_sz;
     off2 = spinlock_off;
     sz2 = spinlock_sz;

Just making sure that off1 < off2 always holds and that sz1 and sz2 match

> Are you suggesting to support one bpf_spin_lock and one
> bpf_timer inside a single map element, but not two spin_locks
> and/or two bpf_timers?

Yes, exactly. I see bpf_spinlock and bpf_timer as two independent,
orthogonal features and I don't understand why we restrict users to just
one of them in a given map element. I think the 20 or so lines of code
that decouple them and remove an artificial restriction that users need
to remember (or discover with surprise) are totally worth it.

> To me it's either one or support any.

I think it's fine to start with supporting one. But one of each. They
are independent of each other.

> Anything in between doesn't seem worth the extra code.

Up to you, but I disagree, obviously. It's possible to work around
that limitation with extra maps/complexity, so if I ever need to both
lock an element and schedule a timer with it, it's not going to stop
me. :)

>
> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index f386f85aee5c..0a828dc4968e 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
> > >                         return -EACCES;
> > >                 }
> > >         }
> > > +       if (map_value_has_timer(map)) {
> > > +               u32 t = map->timer_off;
> > > +
> > > +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
> >
> > <= ? Otherwise we allow accessing the first byte, unless I'm mistaken.
>
> I don't think so. See the comment above in if (map_value_has_spin_lock(map))
> I didn't copy-paste it, because it's the same logic.

Oh, I didn't realize that this is the interval intersection check I
suggested a long time ago :) yeah, that still looks correct
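
(Spelled out, since the same question comes up twice in the thread: this is the standard overlap test for two half-open byte ranges, so strict '<' is correct on both sides. An annotated example follows, assuming a 16-byte struct bpf_timer; the offsets are illustrative.)

/* access touches  [A, B) = [reg->smin_value + off, reg->umax_value + off + size)
 * timer occupies  [C, D) = [t, t + sizeof(struct bpf_timer))
 * the ranges overlap iff  A < D && C < B
 *
 * e.g. with t = 8:
 *   access [0, 8)   :  0 < 24 && 8 < 8   -> no overlap, allowed
 *   access [8, 9)   :  8 < 24 && 8 < 9   -> overlap, rejected (first timer byte)
 *   access [23, 24) : 23 < 24 && 8 < 24  -> overlap, rejected (last timer byte)
 *   access [24, 32) : 24 < 24            -> no overlap, allowed
 *
 * With '<=' on the first comparison the [0, 8) access, which stops
 * right at the timer, would be falsely rejected.
 */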

>
> > > -       if (val) {
> > > -               /* todo: relax this requirement */
> > > -               verbose(env, "bpf_timer field can only be first in the map value element\n");
> >
> > ok, this was confusing, but now I see why you did that...
>
> I'll clarify the comment to say that the next patch fixes it.

ok, thanks

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 925b8416ea0a..7deff4438808 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -168,6 +168,7 @@  struct bpf_map {
 	u32 max_entries;
 	u32 map_flags;
 	int spin_lock_off; /* >=0 valid offset, <0 error */
+	int timer_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
@@ -197,24 +198,41 @@  static inline bool map_value_has_spin_lock(const struct bpf_map *map)
 	return map->spin_lock_off >= 0;
 }
 
-static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+static inline bool map_value_has_timer(const struct bpf_map *map)
 {
-	if (likely(!map_value_has_spin_lock(map)))
-		return;
-	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
-		(struct bpf_spin_lock){};
+	return map->timer_off >= 0;
+}
+
+void bpf_timer_cancel_and_free(void *timer);
+
+static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
+{
+	if (unlikely(map_value_has_spin_lock(map)))
+		*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+			(struct bpf_spin_lock){};
+	if (unlikely(map_value_has_timer(map)))
+		*(struct bpf_timer *)(dst + map->timer_off) =
+			(struct bpf_timer){};
 }
 
 /* copy everything but bpf_spin_lock */
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
+	u32 off = 0, size = 0;
+
 	if (unlikely(map_value_has_spin_lock(map))) {
-		u32 off = map->spin_lock_off;
+		off = map->spin_lock_off;
+		size = sizeof(struct bpf_spin_lock);
+	} else if (unlikely(map_value_has_timer(map))) {
+		off = map->timer_off;
+		size = sizeof(struct bpf_timer);
+	}
 
+	if (unlikely(size)) {
 		memcpy(dst, src, off);
-		memcpy(dst + off + sizeof(struct bpf_spin_lock),
-		       src + off + sizeof(struct bpf_spin_lock),
-		       map->value_size - off - sizeof(struct bpf_spin_lock));
+		memcpy(dst + off + size,
+		       src + off + size,
+		       map->value_size - off - size);
 	} else {
 		memcpy(dst, src, map->value_size);
 	}
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 94a0c976c90f..214fde93214b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -99,6 +99,7 @@  bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
+int btf_find_timer(const struct btf *btf, const struct btf_type *t);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..3fedd9209770 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -378,10 +378,17 @@  static void *array_map_vmalloc_addr(struct bpf_array *array)
 static void array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
 
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
 
+	if (unlikely(map_value_has_timer(map)))
+		for (i = 0; i < array->map.max_entries; i++)
+			bpf_timer_cancel_and_free(array->value +
+						  array->elem_size * i +
+						  map->timer_off);
+
 	if (array->map.map_flags & BPF_F_MMAPABLE)
 		bpf_map_area_free(array_map_vmalloc_addr(array));
 	else
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a6e39c5ea0bf..28a8014b8379 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3046,43 +3046,92 @@  static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+				 const char *name, int sz, int align)
 {
 	const struct btf_member *member;
 	u32 i, off = -ENOENT;
 
-	if (!__btf_type_is_struct(t))
-		return -EINVAL;
-
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								    member->type);
 		if (!__btf_type_is_struct(member_type))
 			continue;
-		if (member_type->size != sizeof(struct bpf_spin_lock))
+		if (member_type->size != sz)
 			continue;
-		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
-			   "bpf_spin_lock"))
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
 			continue;
 		if (off != -ENOENT)
-			/* only one 'struct bpf_spin_lock' is allowed */
+			/* only one such field is allowed */
 			return -E2BIG;
 		off = btf_member_bit_offset(t, member);
 		if (off % 8)
 			/* valid C code cannot generate such BTF */
 			return -EINVAL;
 		off /= 8;
-		if (off % __alignof__(struct bpf_spin_lock))
-			/* valid struct bpf_spin_lock will be 4 byte aligned */
+		if (off % align)
+			return -EINVAL;
+	}
+	return off;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+				const char *name, int sz, int align)
+{
+	const struct btf_var_secinfo *vsi;
+	u32 i, off = -ENOENT;
+
+	for_each_vsi(i, t, vsi) {
+		const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+		const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+		if (!__btf_type_is_struct(var_type))
+			continue;
+		if (var_type->size != sz)
+			continue;
+		if (vsi->size != sz)
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+			continue;
+		if (off != -ENOENT)
+			/* only one such field is allowed */
+			return -E2BIG;
+		off = vsi->offset;
+		if (off % align)
 			return -EINVAL;
 	}
 	return off;
 }
 
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+			  const char *name, int sz, int align)
+{
+
+	if (__btf_type_is_struct(t))
+		return btf_find_struct_field(btf, t, name, sz, align);
+	else if (btf_type_is_datasec(t))
+		return btf_find_datasec_var(btf, t, name, sz, align);
+	return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	return btf_find_field(btf, t, "bpf_spin_lock",
+			      sizeof(struct bpf_spin_lock),
+			      __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+	return btf_find_field(btf, t, "bpf_timer",
+			      sizeof(struct bpf_timer),
+			      __alignof__(struct bpf_timer));
+}
+
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6f6681b07364..28d66fa74780 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,6 +244,17 @@  static void htab_free_elems(struct bpf_htab *htab)
 		cond_resched();
 	}
 free_elems:
+	if (unlikely(map_value_has_timer(&htab->map)))
+		for (i = 0; i < htab->map.max_entries; i++) {
+			struct htab_elem *elem;
+
+			elem = get_htab_elem(htab, i);
+			bpf_timer_cancel_and_free(elem->key +
+						  round_up(htab->map.key_size, 8) +
+						  htab->map.timer_off);
+			cond_resched();
+		}
+
 	bpf_map_area_free(htab->elems);
 }
 
@@ -265,8 +276,11 @@  static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 	struct htab_elem *l;
 
 	if (node) {
+		u32 key_size = htab->map.key_size;
 		l = container_of(node, struct htab_elem, lru_node);
-		memcpy(l->key, key, htab->map.key_size);
+		memcpy(l->key, key, key_size);
+		check_and_init_map_value(&htab->map,
+					 l->key + round_up(key_size, 8));
 		return l;
 	}
 
@@ -785,10 +799,19 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return -ENOENT;
 }
 
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	if (unlikely(map_value_has_timer(&htab->map)))
+		bpf_timer_cancel_and_free(elem->key +
+					  round_up(htab->map.key_size, 8) +
+					  htab->map.timer_off);
+}
+
 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+	check_and_free_timer(htab, l);
 	kfree(l);
 }
 
@@ -816,6 +839,7 @@  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
+		check_and_free_timer(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -919,8 +943,8 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
-		check_and_init_map_lock(&htab->map,
-					l_new->key + round_up(key_size, 8));
+		check_and_init_map_value(&htab->map,
+					 l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -1067,6 +1091,12 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return ret;
 }
 
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	check_and_free_timer(htab, elem);
+	bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 				    u64 map_flags)
 {
@@ -1099,7 +1129,8 @@  static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	l_new = prealloc_lru_pop(htab, key, hash);
 	if (!l_new)
 		return -ENOMEM;
-	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+	copy_map_value(&htab->map,
+		       l_new->key + round_up(map->key_size, 8), value);
 
 	ret = htab_lock_bucket(htab, b, hash, &flags);
 	if (ret)
@@ -1125,9 +1156,9 @@  static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	htab_unlock_bucket(htab, b, hash, flags);
 
 	if (ret)
-		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+		htab_lru_push_free(htab, l_new);
 	else if (l_old)
-		bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+		htab_lru_push_free(htab, l_old);
 
 	return ret;
 }
@@ -1332,7 +1363,7 @@  static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	htab_unlock_bucket(htab, b, hash, flags);
 	if (l)
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 	return ret;
 }
 
@@ -1449,7 +1480,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 			else
 				copy_map_value(map, value, l->key +
 					       roundup_key_size);
-			check_and_init_map_lock(map, value);
+			check_and_init_map_value(map, value);
 		}
 
 		hlist_nulls_del_rcu(&l->hash_node);
@@ -1460,7 +1491,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	htab_unlock_bucket(htab, b, hash, bflags);
 
 	if (is_lru_map && l)
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 
 	return ret;
 }
@@ -1638,7 +1669,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 						      true);
 			else
 				copy_map_value(map, dst_val, value);
-			check_and_init_map_lock(map, dst_val);
+			check_and_init_map_value(map, dst_val);
 		}
 		if (do_delete) {
 			hlist_nulls_del_rcu(&l->hash_node);
@@ -1665,7 +1696,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	while (node_to_free) {
 		l = node_to_free;
 		node_to_free = node_to_free->batch_flink;
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 	}
 
 next_batch:
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6f9620cbe95d..8580b7bfc8bb 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1039,7 +1039,7 @@  BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flag
 	if (!t)
 		return -ENOMEM;
 	t->callback_fn = cb;
-	t->value = (void *)timer /* - offset of bpf_timer inside elem */;
+	t->value = (void *)timer - map->timer_off;
 	t->key = t->value - round_up(map->key_size, 8);
 	t->map = map;
 	t->prog = prog;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..95d70a08325d 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -173,7 +173,7 @@  static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
 		return -ENOMEM;
 
 	memcpy(&new->data[0], value, map->value_size);
-	check_and_init_map_lock(map, new->data);
+	check_and_init_map_value(map, new->data);
 
 	new = xchg(&storage->buf, new);
 	kfree_rcu(new, rcu);
@@ -509,7 +509,7 @@  struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 						    map->numa_node);
 		if (!storage->buf)
 			goto enomem;
-		check_and_init_map_lock(map, storage->buf->data);
+		check_and_init_map_value(map, storage->buf->data);
 	} else {
 		storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
 		if (!storage->percpu_buf)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50457019da27..d49ba04d549e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -259,8 +259,8 @@  static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
 				copy_map_value_locked(map, value, ptr, true);
 			else
 				copy_map_value(map, value, ptr);
-			/* mask lock, since value wasn't zero inited */
-			check_and_init_map_lock(map, value);
+			/* mask lock and timer, since value wasn't zero inited */
+			check_and_init_map_value(map, value);
 		}
 		rcu_read_unlock();
 	}
@@ -792,6 +792,21 @@  static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		}
 	}
 
+	map->timer_off = btf_find_timer(btf, value_type);
+	if (map_value_has_timer(map)) {
+		if (map->map_flags & BPF_F_RDONLY_PROG)
+			return -EACCES;
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map_value_has_spin_lock(map))
+			/* map values with bpf_spin_lock and bpf_timer
+			 * are not supported yet.
+			 */
+			return -EOPNOTSUPP;
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -843,6 +858,7 @@  static int map_create(union bpf_attr *attr)
 	mutex_init(&map->freeze_mutex);
 
 	map->spin_lock_off = -EINVAL;
+	map->timer_off = -EINVAL;
 	if (attr->btf_key_type_id || attr->btf_value_type_id ||
 	    /* Even the map's value is a kernel's struct,
 	     * the bpf_prog.o must have BTF to begin with
@@ -1590,7 +1606,8 @@  static int map_freeze(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
+	    map_value_has_timer(map)) {
 		fdput(f);
 		return -ENOTSUPP;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f386f85aee5c..0a828dc4968e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3241,6 +3241,15 @@  static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 	}
+	if (map_value_has_timer(map)) {
+		u32 t = map->timer_off;
+
+		if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
+		     t < reg->umax_value + off + size) {
+			verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
+			return -EACCES;
+		}
+	}
 	return err;
 }
 
@@ -4675,9 +4684,24 @@  static int process_timer_func(struct bpf_verifier_env *env, int regno,
 			map->name);
 		return -EINVAL;
 	}
-	if (val) {
-		/* todo: relax this requirement */
-		verbose(env, "bpf_timer field can only be first in the map value element\n");
+	if (!map_value_has_timer(map)) {
+		if (map->timer_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_timer'\n",
+				map->name);
+		else if (map->timer_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_timer'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_timer is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->timer_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+			val + reg->off, map->timer_off);
 		return -EINVAL;
 	}
 	WARN_ON(meta->map_ptr);