
[3/3] bpf: Implement prealloc for task_local_storage

Message ID YXB5hWFCzJDISnrK@slm.duckdns.org (mailing list archive)
State Not Applicable
Delegated to: BPF
Series [1/3] cgroup: Drop cgroup_ prefix from cgroup_threadgroup_rwsem and friends

Checks

Context Check Description
netdev/tree_selection success Not a local patch
bpf/vmtest-bpf-PR fail merge-conflict
bpf/vmtest-bpf-next-PR fail merge-conflict

Commit Message

Tejun Heo Oct. 20, 2021, 8:18 p.m. UTC
From 5e3ad0d4a0b0732e7ebe035582d282ab752397ed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 20 Oct 2021 08:56:53 -1000

task_local_storage currently does not support pre-allocation; memory is
allocated on demand with GFP_ATOMIC. While atomic allocations succeed most of
the time and the occasional failures aren't a problem for many use cases,
some can benefit from reliable allocations - e.g. tracking acquisitions and
releases of specific resources to root-cause long-term reference leaks.

Prealloc semantics for task_local_storage:

* When a prealloc map is created, the map's elements for all existing tasks
  are allocated.

* Afterwards, whenever a new task is forked, elements for the existing
  prealloc maps are automatically allocated for it.

To synchronize against concurrent forks, CONFIG_BPF_SYSCALL now selects
CONFIG_THREADGROUP_RWSEM, and the prealloc task_local_storage creation path
write-locks threadgroup_rwsem. The rest of the implementation is
straightforward.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h                           |   6 +
 include/linux/bpf_local_storage.h             |  12 ++
 kernel/bpf/Kconfig                            |   1 +
 kernel/bpf/bpf_local_storage.c                | 112 ++++++++++----
 kernel/bpf/bpf_task_storage.c                 | 138 +++++++++++++++++-
 kernel/fork.c                                 |   8 +-
 .../bpf/prog_tests/task_local_storage.c       | 101 +++++++++++++
 .../selftests/bpf/progs/task_ls_prealloc.c    |  15 ++
 8 files changed, 361 insertions(+), 32 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/task_ls_prealloc.c
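
In user-facing terms, preallocation is requested by leaving BPF_F_NO_PREALLOC
out of map_flags when creating a BPF_MAP_TYPE_TASK_STORAGE map (today that
flag is mandatory). A minimal BPF-side sketch contrasting the two modes;
struct my_data is just a placeholder value type:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct my_data {			/* placeholder value type */
	__u64 acquired;
	__u64 released;
};

/* Current on-demand behaviour: BPF_F_NO_PREALLOC is required and elements
 * are allocated with GFP_ATOMIC on first bpf_task_storage_get(..., F_CREATE).
 */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct my_data);
} ondemand_map SEC(".maps");

/* With this patch: omitting BPF_F_NO_PREALLOC makes the map prealloc -
 * elements are created for all existing tasks at map creation time and for
 * every new task on fork, so lookups don't depend on atomic allocations
 * succeeding.
 */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, 0);
	__type(key, int);
	__type(value, struct my_data);
} prealloc_map SEC(".maps");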

Comments

Martin KaFai Lau Oct. 22, 2021, 10:47 p.m. UTC | #1
On Wed, Oct 20, 2021 at 10:18:13AM -1000, Tejun Heo wrote:
> From 5e3ad0d4a0b0732e7ebe035582d282ab752397ed Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@kernel.org>
> Date: Wed, 20 Oct 2021 08:56:53 -1000
> 
> task_local_storage currently does not support pre-allocation and the memory
> is allocated on demand using the GFP_ATOMIC mask. While atomic allocations
> succeed most of the time and the occasional failures aren't a problem for
> many use-cases, there are some which can benefit from reliable allocations -
> e.g. tracking acquisitions and releases of specific resources to root cause
> long-term reference leaks.
>
> Prealloc semantics for task_local_storage:
> 
> * When a prealloc map is created, the map's elements for all existing tasks
>   are allocated.
> 
> * Afterwards, whenever a new task is forked, it automatically allocates the
>   elements for the existing preallocated maps.
> 
> To synchronize against concurrent forks, CONFIG_BPF_SYSCALL now enables
> CONFIG_THREADGROUP_RWSEM and prealloc task_local_storage creation path
> write-locks threadgroup_rwsem, and the rest of the implementation is
> straight-forward.

[ ... ]

> +static int task_storage_map_populate(struct bpf_local_storage_map *smap)
> +{
> +	struct bpf_local_storage *storage = NULL;
> +	struct bpf_local_storage_elem *selem = NULL;
> +	struct task_struct *p, *g;
> +	int err = 0;
> +
> +	lockdep_assert_held(&threadgroup_rwsem);
> +retry:
> +	if (!storage)
> +		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
> +					  GFP_USER);
> +	if (!selem)
> +		selem = bpf_map_kzalloc(&smap->map, smap->elem_size, GFP_USER);
> +	if (!storage || !selem) {
> +		err = -ENOMEM;
> +		goto out_free;
> +	}
> +
> +	rcu_read_lock();
> +	bpf_task_storage_lock();
> +
> +	for_each_process_thread(g, p) {
I am wondering whether this loop could be done in a bpf iter.

If the bpf_local_storage_map were sleepable-safe (not yet done, but there is
an earlier attempt [0]), bpf_local_storage_update() should be able to
allocate without GFP_ATOMIC when called from a sleepable bpf prog, which
could also be useful for other sleepable use cases in general.

For example, if a sleepable bpf iter prog could run in this loop (or is the
existing bpf task iter loop just as good?), the iter bpf prog could call
bpf_task_storage_get(BPF_SK_STORAGE_GET_F_CREATE) on a sleepable
bpf_local_storage_map.

The same pre-alloc could then be done on the tcp/udp socket side by running
a bpf prog in the existing bpf tcp/udp iter.

[0]: https://lore.kernel.org/bpf/20210826235127.303505-1-kpsingh@kernel.org/
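
For illustration, a rough sketch of that direction - assuming sleepable task
storage maps land per [0] and a sleepable task iterator program
(SEC("iter.s/task")); none of this is part of the patch under review:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, int);
} task_stg SEC(".maps");

/* Pinning and reading this iterator once from userspace would visit every
 * task and force element creation, replacing the in-kernel
 * for_each_process_thread() walk.
 */
SEC("iter.s/task")
int prealloc_all_tasks(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;

	if (!task)
		return 0;

	/* With a sleepable-safe map this could allocate without GFP_ATOMIC. */
	bpf_task_storage_get(&task_stg, task, 0,
			     BPF_LOCAL_STORAGE_GET_F_CREATE);
	return 0;
}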

> +		struct bpf_local_storage_data *sdata;
> +
> +		/* Try inserting with atomic allocations. On failure, retry with
> +		 * the preallocated ones.
> +		 */
> +		sdata = bpf_local_storage_update(p, smap, NULL, BPF_NOEXIST);
> +
> +		if (PTR_ERR(sdata) == -ENOMEM && storage && selem) {
> +			sdata = __bpf_local_storage_update(p, smap, NULL,
> +							   BPF_NOEXIST,
> +							   &storage, &selem);
> +		}
> +
> +		/* Check -EEXIST before need_resched() to guarantee forward
> +		 * progress.
> +		 */
> +		if (PTR_ERR(sdata) == -EEXIST)
> +			continue;
> +
> +		/* If requested or alloc failed, take a breather and loop back
> +		 * to preallocate.
> +		 */
> +		if (need_resched() ||
> +		    PTR_ERR(sdata) == -EAGAIN || PTR_ERR(sdata) == -ENOMEM) {
> +			bpf_task_storage_unlock();
> +			rcu_read_unlock();
> +			cond_resched();
> +			goto retry;
> +		}
> +
> +		if (IS_ERR(sdata)) {
> +			err = PTR_ERR(sdata);
> +			goto out_unlock;
> +		}
> +	}
> +out_unlock:
> +	bpf_task_storage_unlock();
> +	rcu_read_unlock();
> +out_free:
> +	if (storage)
> +		kfree(storage);
> +	if (selem)
> +		kfree(selem);
> +	return err;
> +}

[ ... ]

> +int bpf_task_storage_fork(struct task_struct *task)
> +{
> +	struct bpf_local_storage_map *smap;
> +
> +	percpu_rwsem_assert_held(&threadgroup_rwsem);
> +
> +	list_for_each_entry(smap, &prealloc_smaps, prealloc_node) {
Mostly a comment here from the networking side: I suspect the common use
case will be more selective, based on protocol (tcp or udp) and even port.
There are existing bpf hooks at inet_sock creation time, bind time, etc., and
the bpf prog can be selective about which bpf_sk_storage it needs by
inspecting different fields of the sk.

e.g. in inet_create(), there is BPF_CGROUP_RUN_PROG_INET_SOCK(sk).
Would a similar hook be useful on the fork side?
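
For reference, this is roughly what that selectivity looks like at the
existing socket-creation hook today - a minimal sketch assuming
bpf_sk_storage_get() is usable from a cgroup sock_create program; port-based
filtering would need e.g. the bind-time hooks instead:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, int);
} sk_stg SEC(".maps");

/* Invoked via BPF_CGROUP_RUN_PROG_INET_SOCK() in inet_create(). */
SEC("cgroup/sock_create")
int select_tcp_storage(struct bpf_sock *ctx)
{
	int *val;

	/* Only attach storage to TCP sockets. */
	if (ctx->protocol != IPPROTO_TCP)
		return 1;

	val = bpf_sk_storage_get(&sk_stg, ctx, 0,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (val)
		*val = 1;	/* e.g. mark the socket as tracked */

	return 1;		/* always allow socket creation */
}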

> +		struct bpf_local_storage *storage;
> +		struct bpf_local_storage_elem *selem;
> +		struct bpf_local_storage_data *sdata;
> +
> +		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
> +					  GFP_USER);
> +		selem = bpf_map_kzalloc(&smap->map, smap->elem_size, GFP_USER);
> +
> +		rcu_read_lock();
> +		bpf_task_storage_lock();
> +		sdata = __bpf_local_storage_update(task, smap, NULL, BPF_NOEXIST,
> +						   &storage, &selem);
> +		bpf_task_storage_unlock();
> +		rcu_read_unlock();
> +
> +		if (storage)
> +			kfree(storage);
> +		if (selem)
> +			kfree(selem);
> +
> +		if (IS_ERR(sdata)) {
> +			bpf_task_storage_free(task);
> +			return PTR_ERR(sdata);
> +		}
> +	}
> +
> +	return 0;
> +}
Tejun Heo Oct. 22, 2021, 11 p.m. UTC | #2
Hello,

On Fri, Oct 22, 2021 at 03:47:33PM -0700, Martin KaFai Lau wrote:
...
> > +	for_each_process_thread(g, p) {
> I am thinking if this loop can be done in bpf iter.
> 
> If the bpf_local_storage_map is sleepable safe (not yet done but there is
> an earlier attempt [0]),  bpf_local_storage_update() should be able to
> alloc without GFP_ATOMIC by sleepable bpf prog and this potentially
> will be useful in general for other sleepable use cases.
> 
> For example, if a sleepable bpf iter prog can run in this loop (or the existing
> bpf task iter loop is as good?), the iter bpf prog can call
> bpf_task_storage_get(BPF_SK_STORAGE_GET_F_CREATE) on a sleepable
> bpf_local_storage_map.

Yeah, whatever can walk all the existing tasks should do, and I think the
locked section can be shrunk too.

        percpu_down_write(&threadgroup_rwsem);
        list_add_tail(&smap->prealloc_node, &prealloc_smaps);
        percpu_up_write(&threadgroup_rwsem);

        // Here, all new tasks are guaranteed to prealloc on fork.

        Iterate all tasks in whatever way and allocate if necessary;

Thanks.
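
Concretely, a hypothetical reshuffle of task_storage_map_alloc() along those
lines might look like the sketch below; populate_existing_tasks() stands in
for whatever ends up walking existing tasks (e.g. a task iterator) and is
purely illustrative:

static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
	struct bpf_local_storage_map *smap;
	int err;

	smap = bpf_local_storage_map_alloc(attr);
	if (IS_ERR(smap))
		return ERR_CAST(smap);

	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
		smap->map.ops = &task_storage_map_ops;

		/* Publish the map first; from here on every fork
		 * preallocates via bpf_task_storage_fork(). */
		percpu_down_write(&threadgroup_rwsem);
		list_add_tail(&smap->prealloc_node, &prealloc_smaps);
		percpu_up_write(&threadgroup_rwsem);

		/* Walk existing tasks without holding the rwsem; races
		 * with forks are harmless since duplicate inserts fail
		 * with -EEXIST. */
		err = populate_existing_tasks(smap);
		if (err) {
			percpu_down_write(&threadgroup_rwsem);
			list_del_init(&smap->prealloc_node);
			percpu_up_write(&threadgroup_rwsem);
			bpf_local_storage_map_free(smap,
						   &bpf_task_storage_busy);
			return ERR_PTR(err);
		}
	}

	smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
	return &smap->map;
}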

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d604c8251d88..7f9e5dea0660 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1673,6 +1673,7 @@  struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+int bpf_task_storage_fork(struct task_struct *task);
 void bpf_task_storage_free(struct task_struct *task);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
@@ -1882,6 +1883,11 @@  bpf_base_func_proto(enum bpf_func_id func_id)
 	return NULL;
 }
 
+static inline int bpf_task_storage_fork(struct task_struct *p)
+{
+	return 0;
+}
+
 static inline void bpf_task_storage_free(struct task_struct *task)
 {
 }
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 24496bc28e7b..bbb4cedbd2b2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -51,6 +51,12 @@  struct bpf_local_storage_map {
 	u32 bucket_log;
 	u16 elem_size;
 	u16 cache_idx;
+	/* Maps with prealloc need to be tracked and allocated when a new
+	 * containing object is created. The following node can be used to keep
+	 * track of the prealloc maps. Outside of initializing the field, the
+	 * shared local_storage code doesn't use it directly.
+	 */
+	struct list_head prealloc_node;
 };
 
 struct bpf_local_storage_data {
@@ -118,6 +124,7 @@  void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
 
 /* Helper functions for bpf_local_storage */
 int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
+int bpf_local_storage_prealloc_map_alloc_check(union bpf_attr *attr);
 
 struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
 
@@ -158,6 +165,11 @@  bpf_local_storage_alloc(void *owner,
 			struct bpf_local_storage_elem *first_selem);
 
 struct bpf_local_storage_data *
+__bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+			   void *value, u64 map_flags,
+			   struct bpf_local_storage **local_storage_prealloc,
+			   struct bpf_local_storage_elem **selem_prealloc);
+struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags);
 
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index a82d6de86522..4d816664026f 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -29,6 +29,7 @@  config BPF_SYSCALL
 	select IRQ_WORK
 	select TASKS_TRACE_RCU
 	select BINARY_PRINTF
+	select THREADGROUP_RWSEM
 	select NET_SOCK_MSG if NET
 	default n
 	help
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..0a6bf3e4bbcd 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -258,24 +258,13 @@  static int check_flags(const struct bpf_local_storage_data *old_sdata,
 	return 0;
 }
 
-int bpf_local_storage_alloc(void *owner,
-			    struct bpf_local_storage_map *smap,
-			    struct bpf_local_storage_elem *first_selem)
+static int bpf_local_storage_link(void *owner,
+				  struct bpf_local_storage_map *smap,
+				  struct bpf_local_storage_elem *first_selem,
+				  struct bpf_local_storage *storage)
 {
-	struct bpf_local_storage *prev_storage, *storage;
+	struct bpf_local_storage *prev_storage;
 	struct bpf_local_storage **owner_storage_ptr;
-	int err;
-
-	err = mem_charge(smap, owner, sizeof(*storage));
-	if (err)
-		return err;
-
-	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  GFP_ATOMIC | __GFP_NOWARN);
-	if (!storage) {
-		err = -ENOMEM;
-		goto uncharge;
-	}
 
 	INIT_HLIST_HEAD(&storage->list);
 	raw_spin_lock_init(&storage->lock);
@@ -299,8 +288,7 @@  int bpf_local_storage_alloc(void *owner,
 	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
 	if (unlikely(prev_storage)) {
 		bpf_selem_unlink_map(first_selem);
-		err = -EAGAIN;
-		goto uncharge;
+		return -EAGAIN;
 
 		/* Note that even first_selem was linked to smap's
 		 * bucket->list, first_selem can be freed immediately
@@ -313,6 +301,31 @@  int bpf_local_storage_alloc(void *owner,
 	}
 
 	return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+			   struct bpf_local_storage_map *smap,
+			   struct bpf_local_storage_elem *first_selem)
+{
+	struct bpf_local_storage *storage;
+	int err;
+
+	err = mem_charge(smap, owner, sizeof(*storage));
+	if (err)
+		return err;
+
+	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+				  GFP_ATOMIC | __GFP_NOWARN);
+	if (!storage) {
+		err = -ENOMEM;
+		goto uncharge;
+	}
+
+	err = bpf_local_storage_link(owner, smap, first_selem, storage);
+	if (err)
+		goto uncharge;
+
+	return 0;
 
 uncharge:
 	kfree(storage);
@@ -326,8 +339,10 @@  int bpf_local_storage_alloc(void *owner,
  * during map destruction).
  */
 struct bpf_local_storage_data *
-bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags)
+__bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+			   void *value, u64 map_flags,
+			   struct bpf_local_storage **local_storage_prealloc,
+			   struct bpf_local_storage_elem **selem_prealloc)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *selem;
@@ -349,17 +364,30 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, owner, value, true);
+		if (*selem_prealloc)
+			selem = *selem_prealloc;
+		else
+			selem = bpf_selem_alloc(smap, owner, value, true);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
-		err = bpf_local_storage_alloc(owner, smap, selem);
+		if (*local_storage_prealloc) {
+			err = bpf_local_storage_link(owner, smap, selem,
+						     *local_storage_prealloc);
+		} else {
+			err = bpf_local_storage_alloc(owner, smap, selem);
+		}
 		if (err) {
-			kfree(selem);
-			mem_uncharge(smap, owner, smap->elem_size);
+			if (!*selem_prealloc) {
+				kfree(selem);
+				mem_uncharge(smap, owner, smap->elem_size);
+			}
 			return ERR_PTR(err);
 		}
 
+		*selem_prealloc = NULL;
+		*local_storage_prealloc = NULL;
+
 		return SDATA(selem);
 	}
 
@@ -414,10 +442,15 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	 * old_sdata will not be uncharged later during
 	 * bpf_selem_unlink_storage_nolock().
 	 */
-	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
-	if (!selem) {
-		err = -ENOMEM;
-		goto unlock_err;
+	if (*selem_prealloc) {
+		selem = *selem_prealloc;
+		*selem_prealloc = NULL;
+	} else {
+		selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
+		if (!selem) {
+			err = -ENOMEM;
+			goto unlock_err;
+		}
 	}
 
 	/* First, link the new selem to the map */
@@ -442,6 +475,17 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	return ERR_PTR(err);
 }
 
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+			 void *value, u64 map_flags)
+{
+	struct bpf_local_storage *local_storage_prealloc = NULL;
+	struct bpf_local_storage_elem *selem_prealloc = NULL;
+
+	return __bpf_local_storage_update(owner, smap, value, map_flags,
+					  &local_storage_prealloc, &selem_prealloc);
+}
+
 u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
 {
 	u64 min_usage = U64_MAX;
@@ -536,10 +580,9 @@  void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
 	kfree(smap);
 }
 
-int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+int bpf_local_storage_prealloc_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
-	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
 	    attr->max_entries ||
 	    attr->key_size != sizeof(int) || !attr->value_size ||
 	    /* Enforce BTF for userspace sk dumping */
@@ -555,6 +598,13 @@  int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC))
+		return -EINVAL;
+	return bpf_local_storage_prealloc_map_alloc_check(attr);
+}
+
 struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_local_storage_map *smap;
@@ -586,6 +636,8 @@  struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 	smap->elem_size =
 		sizeof(struct bpf_local_storage_elem) + attr->value_size;
 
+	INIT_LIST_HEAD(&smap->prealloc_node);
+
 	return smap;
 }
 
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ebfa8bc90892..6f8b781647f7 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,11 +17,15 @@ 
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
+#include <linux/sched/threadgroup_rwsem.h>
 
 DEFINE_BPF_STORAGE_CACHE(task_cache);
 
 static DEFINE_PER_CPU(int, bpf_task_storage_busy);
 
+/* Protected by threadgroup_rwsem. */
+static LIST_HEAD(prealloc_smaps);
+
 static void bpf_task_storage_lock(void)
 {
 	migrate_disable();
@@ -280,14 +284,103 @@  static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return -ENOTSUPP;
 }
 
+static int task_storage_map_populate(struct bpf_local_storage_map *smap)
+{
+	struct bpf_local_storage *storage = NULL;
+	struct bpf_local_storage_elem *selem = NULL;
+	struct task_struct *p, *g;
+	int err = 0;
+
+	lockdep_assert_held(&threadgroup_rwsem);
+retry:
+	if (!storage)
+		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+					  GFP_USER);
+	if (!selem)
+		selem = bpf_map_kzalloc(&smap->map, smap->elem_size, GFP_USER);
+	if (!storage || !selem) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	rcu_read_lock();
+	bpf_task_storage_lock();
+
+	for_each_process_thread(g, p) {
+		struct bpf_local_storage_data *sdata;
+
+		/* Try inserting with atomic allocations. On failure, retry with
+		 * the preallocated ones.
+		 */
+		sdata = bpf_local_storage_update(p, smap, NULL, BPF_NOEXIST);
+
+		if (PTR_ERR(sdata) == -ENOMEM && storage && selem) {
+			sdata = __bpf_local_storage_update(p, smap, NULL,
+							   BPF_NOEXIST,
+							   &storage, &selem);
+		}
+
+		/* Check -EEXIST before need_resched() to guarantee forward
+		 * progress.
+		 */
+		if (PTR_ERR(sdata) == -EEXIST)
+			continue;
+
+		/* If requested or alloc failed, take a breather and loop back
+		 * to preallocate.
+		 */
+		if (need_resched() ||
+		    PTR_ERR(sdata) == -EAGAIN || PTR_ERR(sdata) == -ENOMEM) {
+			bpf_task_storage_unlock();
+			rcu_read_unlock();
+			cond_resched();
+			goto retry;
+		}
+
+		if (IS_ERR(sdata)) {
+			err = PTR_ERR(sdata);
+			goto out_unlock;
+		}
+	}
+out_unlock:
+	bpf_task_storage_unlock();
+	rcu_read_unlock();
+out_free:
+	if (storage)
+		kfree(storage);
+	if (selem)
+		kfree(selem);
+	return err;
+}
+
 static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_local_storage_map *smap;
+	int err;
 
 	smap = bpf_local_storage_map_alloc(attr);
 	if (IS_ERR(smap))
 		return ERR_CAST(smap);
 
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+		/* We're going to exercise the regular update path to populate
+		 * the map for the existing tasks, which will call into map ops
+		 * which is normally initialized after this function returns.
+		 * Initialize it early here.
+		 */
+		smap->map.ops = &task_storage_map_ops;
+
+		percpu_down_write(&threadgroup_rwsem);
+		list_add_tail(&smap->prealloc_node, &prealloc_smaps);
+		err = task_storage_map_populate(smap);
+		percpu_up_write(&threadgroup_rwsem);
+		if (err) {
+			bpf_local_storage_map_free(smap,
+						   &bpf_task_storage_busy);
+			return ERR_PTR(err);
+		}
+	}
+
 	smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
 	return &smap->map;
 }
@@ -298,13 +391,20 @@  static void task_storage_map_free(struct bpf_map *map)
 
 	smap = (struct bpf_local_storage_map *)map;
 	bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
+
+	if (!list_empty(&smap->prealloc_node)) {
+		percpu_down_write(&threadgroup_rwsem);
+		list_del_init(&smap->prealloc_node);
+		percpu_up_write(&threadgroup_rwsem);
+	}
+
 	bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
 }
 
 static int task_storage_map_btf_id;
 const struct bpf_map_ops task_storage_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
-	.map_alloc_check = bpf_local_storage_map_alloc_check,
+	.map_alloc_check = bpf_local_storage_prealloc_map_alloc_check,
 	.map_alloc = task_storage_map_alloc,
 	.map_free = task_storage_map_free,
 	.map_get_next_key = notsupp_get_next_key,
@@ -317,6 +417,42 @@  const struct bpf_map_ops task_storage_map_ops = {
 	.map_owner_storage_ptr = task_storage_ptr,
 };
 
+int bpf_task_storage_fork(struct task_struct *task)
+{
+	struct bpf_local_storage_map *smap;
+
+	percpu_rwsem_assert_held(&threadgroup_rwsem);
+
+	list_for_each_entry(smap, &prealloc_smaps, prealloc_node) {
+		struct bpf_local_storage *storage;
+		struct bpf_local_storage_elem *selem;
+		struct bpf_local_storage_data *sdata;
+
+		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+					  GFP_USER);
+		selem = bpf_map_kzalloc(&smap->map, smap->elem_size, GFP_USER);
+
+		rcu_read_lock();
+		bpf_task_storage_lock();
+		sdata = __bpf_local_storage_update(task, smap, NULL, BPF_NOEXIST,
+						   &storage, &selem);
+		bpf_task_storage_unlock();
+		rcu_read_unlock();
+
+		if (storage)
+			kfree(storage);
+		if (selem)
+			kfree(selem);
+
+		if (IS_ERR(sdata)) {
+			bpf_task_storage_free(task);
+			return PTR_ERR(sdata);
+		}
+	}
+
+	return 0;
+}
+
 const struct bpf_func_proto bpf_task_storage_get_proto = {
 	.func = bpf_task_storage_get,
 	.gpl_only = false,
diff --git a/kernel/fork.c b/kernel/fork.c
index 34fb9db59148..845c49c6e89b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2290,6 +2290,10 @@  static __latent_entropy struct task_struct *copy_process(
 
 	threadgroup_change_begin(current);
 
+	retval = bpf_task_storage_fork(p);
+	if (retval)
+		goto bad_fork_threadgroup_change_end;
+
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted that the new process's css_set can be changed
@@ -2298,7 +2302,7 @@  static __latent_entropy struct task_struct *copy_process(
 	 */
 	retval = cgroup_can_fork(p, args);
 	if (retval)
-		goto bad_fork_threadgroup_change_end;
+		goto bad_fork_bpf_task_storage_free;
 
 	/*
 	 * From this point on we must avoid any synchronous user-space
@@ -2427,6 +2431,8 @@  static __latent_entropy struct task_struct *copy_process(
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p, args);
+bad_fork_bpf_task_storage_free:
+	bpf_task_storage_free(p);
 bad_fork_threadgroup_change_end:
 	threadgroup_change_end(current);
 bad_fork_put_pidfd:
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 035c263aab1b..ad35470db991 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -5,10 +5,21 @@ 
 #include <unistd.h>
 #include <sys/syscall.h>   /* For SYS_xxx definitions */
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <test_progs.h>
 #include "task_local_storage.skel.h"
 #include "task_local_storage_exit_creds.skel.h"
 #include "task_ls_recursion.skel.h"
+#include "task_ls_prealloc.skel.h"
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open 434
+#endif
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(__NR_pidfd_open, pid, flags);
+}
 
 static void test_sys_enter_exit(void)
 {
@@ -81,6 +92,94 @@  static void test_recursion(void)
 	task_ls_recursion__destroy(skel);
 }
 
+static int fork_prealloc_child(int *pipe_fd)
+{
+	int pipe_fds[2], pid_fd, err;
+	pid_t pid;
+
+	err = pipe(pipe_fds);
+	if (!ASSERT_OK(err, "pipe"))
+		return -1;
+
+	*pipe_fd = pipe_fds[1];
+
+	pid = fork();
+	if (pid == 0) {
+		char ch;
+		close(pipe_fds[1]);
+		read(pipe_fds[0], &ch, 1);
+		exit(0);
+	}
+
+	if (!ASSERT_GE(pid, 0, "fork"))
+		return -1;
+
+	pid_fd = sys_pidfd_open(pid, 0);
+	if (!ASSERT_GE(pid_fd, 0, "pidfd_open"))
+		return -1;
+
+	return pid_fd;
+}
+
+static void test_prealloc_elem(int map_fd, int pid_fd)
+{
+	int val, err;
+
+	err = bpf_map_lookup_elem(map_fd, &pid_fd, &val);
+	if (ASSERT_OK(err, "bpf_map_lookup_elem"))
+		ASSERT_EQ(val, 0, "elem value == 0");
+
+	val = 0xdeadbeef;
+	err = bpf_map_update_elem(map_fd, &pid_fd, &val, BPF_EXIST);
+	ASSERT_OK(err, "bpf_map_update_elem to 0xdeadbeef");
+
+	err = bpf_map_lookup_elem(map_fd, &pid_fd, &val);
+	if (ASSERT_OK(err, "bpf_map_lookup_elem"))
+		ASSERT_EQ(val, 0xdeadbeef, "elem value == 0xdeadbeef");
+}
+
+static void test_prealloc(void)
+{
+	struct task_ls_prealloc *skel = NULL;
+	int pre_pipe_fd = -1, post_pipe_fd = -1;
+	int pre_pid_fd, post_pid_fd;
+	int map_fd, err;
+
+	pre_pid_fd = fork_prealloc_child(&pre_pipe_fd);
+	if (pre_pid_fd < 0)
+		goto out;
+
+	skel = task_ls_prealloc__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		goto out;
+
+	err = task_ls_prealloc__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto out;
+
+	post_pid_fd = fork_prealloc_child(&post_pipe_fd);
+	if (post_pid_fd < 0)
+		goto out;
+
+	map_fd = bpf_map__fd(skel->maps.prealloc_map);
+	if (!ASSERT_GE(map_fd, 0, "bpf_map__fd"))
+		goto out;
+
+	test_prealloc_elem(map_fd, pre_pid_fd);
+	test_prealloc_elem(map_fd, post_pid_fd);
+out:
+	if (pre_pipe_fd >= 0)
+		close(pre_pipe_fd);
+	if (post_pipe_fd >= 0)
+		close(post_pipe_fd);
+	do {
+		err = wait4(-1, NULL, 0, NULL);
+	} while (!err);
+
+	if (skel)
+		task_ls_prealloc__destroy(skel);
+}
+
 void test_task_local_storage(void)
 {
 	if (test__start_subtest("sys_enter_exit"))
@@ -89,4 +188,6 @@  void test_task_local_storage(void)
 		test_exit_creds();
 	if (test__start_subtest("recursion"))
 		test_recursion();
+	if (test__start_subtest("prealloc"))
+		test_prealloc();
 }
diff --git a/tools/testing/selftests/bpf/progs/task_ls_prealloc.c b/tools/testing/selftests/bpf/progs/task_ls_prealloc.c
new file mode 100644
index 000000000000..8b252ee3511e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_ls_prealloc.c
@@ -0,0 +1,15 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, 0);
+	__type(key, int);
+	__type(value, int);
+} prealloc_map SEC(".maps");