
[v1,bpf-next,1/2] bpf: Support BPF_F_MMAPABLE task_local storage

Message ID 20231120175925.733167-2-davemarchevsky@fb.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf: Add mmapable task_local storage

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/codegen success Generated files up to date
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1198 this patch: 1205
netdev/cc_maintainers warning 8 maintainers not CCed: yonghong.song@linux.dev john.fastabend@gmail.com song@kernel.org haoluo@google.com jolsa@kernel.org kpsingh@kernel.org martin.lau@linux.dev sdf@google.com
netdev/build_clang fail Errors and warnings before: 1162 this patch: 1169
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1225 this patch: 1232
netdev/checkpatch warning WARNING: Missing a blank line after declarations WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-3 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-llvm-16 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-llvm-16 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-16 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-16 / build / build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-16 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-16 / veristat
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Dave Marchevsky Nov. 20, 2023, 5:59 p.m. UTC
This patch modifies the generic bpf_local_storage infrastructure to
support mmapable map values and adds mmap() handling to task_local
storage leveraging this new functionality. A userspace task which
mmap's a task_local storage map will receive a pointer to the map_value
corresponding to that task's key - mmap'ing in other tasks' mapvals is
not supported in this patch.

Currently, struct bpf_local_storage_elem contains both bookkeeping
information as well as a struct bpf_local_storage_data with additional
bookkeeping information and the actual mapval data. We can't simply map
the page containing this struct into userspace. Instead, mmapable
local_storage uses bpf_local_storage_data's data field to point to the
actual mapval, which is allocated separately such that it can be
mmapped. Only the mapval lives on the page(s) allocated for it.

The lifetime of the actual_data mmapable region is tied to the
bpf_local_storage_elem which points to it. This doesn't necessarily mean
that the pages go away when the bpf_local_storage_elem is free'd - if
they're mapped into some userspace process they will remain until
unmapped, but are no longer the task_local storage's mapval.

Implementation details:

  * A few small helpers are added to deal with bpf_local_storage_data's
    'data' field having different semantics when the local_storage map
    is mmapable. With their help, many of the changes to existing code
    are purely mechanical (e.g. sdata->data becomes sdata_mapval(sdata),
    smap->elem_size becomes selem_bytes_used(smap)); a rough sketch of
    these helpers follows this list.

  * The map flags are copied into bpf_local_storage_data when its
    containing bpf_local_storage_elem is alloc'd, since the
    bpf_local_storage_map associated with them may be gone when
    bpf_local_storage_data is free'd, and testing flags for
    BPF_F_MMAPABLE is necessary when free'ing to ensure that the
    mmapable region is free'd.
    * The extra field doesn't change bpf_local_storage_elem's size.
      There were 48 bytes of padding after the bpf_local_storage_data
      field, now there are 40.

  * Currently, bpf_local_storage_update always creates a new
    bpf_local_storage_elem for the 'updated' value - the only exception
    being if the map_value has a bpf_spin_lock field, in which case the
    spin lock is grabbed instead of the less granular bpf_local_storage
    lock, and the value is updated in place. This in-place update behavior
    is desired for mmapable local_storage map_values as well, since
    creating a new selem would result in new mmapable pages.

  * The size of the mmapable pages is accounted for when calling
    mem_{charge,uncharge}. If the pages are mmap'd into a userspace task,
    mem_uncharge may be called before they actually go away.
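
For illustration, a rough sketch of the two helpers mentioned above.
sdata_mapval() mirrors the helper added by the patch (quoted later in
this thread); the body of selem_bytes_used() here is only an assumption
based on the accounting description, not the exact patch code:

  void *sdata_mapval(struct bpf_local_storage_data *data)
  {
          /* BPF_F_MMAPABLE: 'actual_data' points to the separately-alloc'd,
           * page-aligned mapval; otherwise the mapval lives inline in 'data'
           */
          if (data->smap_map_flags & BPF_F_MMAPABLE)
                  return data->actual_data;
          return &data->data;
  }

  static u32 selem_bytes_used(struct bpf_local_storage_map *smap)
  {
          /* Assumption: charge the selem itself plus the page-aligned
           * region backing the mmapable mapval
           */
          if (smap->map.map_flags & BPF_F_MMAPABLE)
                  return smap->elem_size + PAGE_ALIGN(smap->map.value_size);
          return smap->elem_size;
  }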

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
---
 include/linux/bpf_local_storage.h |  14 ++-
 kernel/bpf/bpf_local_storage.c    | 145 ++++++++++++++++++++++++------
 kernel/bpf/bpf_task_storage.c     |  35 ++++++--
 kernel/bpf/syscall.c              |   2 +-
 4 files changed, 163 insertions(+), 33 deletions(-)

Comments

Johannes Weiner Nov. 20, 2023, 9:41 p.m. UTC | #1
On Mon, Nov 20, 2023 at 09:59:24AM -0800, Dave Marchevsky wrote:
> This patch modifies the generic bpf_local_storage infrastructure to
> support mmapable map values and adds mmap() handling to task_local
> storage leveraging this new functionality. A userspace task which
> mmap's a task_local storage map will receive a pointer to the map_value
> corresponding to that tasks' key - mmap'ing in other tasks' mapvals is
> not supported in this patch.
> 
> Currently, struct bpf_local_storage_elem contains both bookkeeping
> information as well as a struct bpf_local_storage_data with additional
> bookkeeping information and the actual mapval data. We can't simply map
> the page containing this struct into userspace. Instead, mmapable
> local_storage uses bpf_local_storage_data's data field to point to the
> actual mapval, which is allocated separately such that it can be
> mmapped. Only the mapval lives on the page(s) allocated for it.
> 
> The lifetime of the actual_data mmapable region is tied to the
> bpf_local_storage_elem which points to it. This doesn't necessarily mean
> that the pages go away when the bpf_local_storage_elem is free'd - if
> they're mapped into some userspace process they will remain until
> unmapped, but are no longer the task_local storage's mapval.

Those bits look good to me. vfree() uses __free_pages(), which
participates in refcounting. remap_vmalloc_range() acquires references
to the individual pages, which will be dropped once the page tables
disappear on munmap(). The vmalloc area doesn't need to stick around.
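
A minimal sketch of that lifetime (the patch uses
bpf_map_area_mmapable_alloc() rather than vmalloc_user() directly, so
this is only illustrative):

  /* allocation: VM_USERMAP memory that can later be remapped to userspace */
  buf = vmalloc_user(PAGE_ALIGN(size));    /* pages start with one reference */

  /* mmap() handler */
  err = remap_vmalloc_range(vma, buf, 0);  /* each mapped page gains a reference */

  /* selem teardown */
  vfree(buf);  /* drops the vmalloc references; pages still mapped into
                * userspace stay alive until munmap()/exit drops the
                * page-table references
                */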
Martin KaFai Lau Nov. 21, 2023, 12:42 a.m. UTC | #2
On 11/20/23 9:59 AM, Dave Marchevsky wrote:
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 173ec7f43ed1..114973f925ea 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>   	 * the number of cachelines accessed during the cache hit case.
>   	 */
>   	struct bpf_local_storage_map __rcu *smap;
> -	u8 data[] __aligned(8);
> +	/* Need to duplicate smap's map_flags as smap may be gone when
> +	 * it's time to free bpf_local_storage_data
> +	 */
> +	u64 smap_map_flags;
> +	/* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
> +	 * Otherwise the actual mapval data lives here
> +	 */
> +	union {
> +		DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
> +		void *actual_data __aligned(8);

The pages (that can be mmap'ed later) feel like a specific kind of kptr.

Have you thought about allowing a kptr (pointing to some pages that can be 
mmap'ed later) to be stored as one of the members of the map's value as a kptr. 
bpf_local_storage_map is one of the maps that supports kptr.

struct normal_and_mmap_value {
     int some_int;
     int __percpu_kptr *some_cnts;

     struct bpf_mmap_page __kptr *some_stats;
};

struct mmap_only_value {
     struct bpf_mmap_page __kptr *some_stats;
};

[ ... ]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 146824cc9689..9b3becbcc1a3 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -15,7 +15,8 @@
>   #include <linux/rcupdate_trace.h>
>   #include <linux/rcupdate_wait.h>
>   
> -#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK \
> +	(BPF_F_NO_PREALLOC | BPF_F_CLONE | BPF_F_MMAPABLE)
>   
>   static struct bpf_local_storage_map_bucket *
>   select_bucket(struct bpf_local_storage_map *smap,
> @@ -24,6 +25,51 @@ select_bucket(struct bpf_local_storage_map *smap,
>   	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
>   }
>   
> +struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map);
> +
> +void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
> +{
> +	struct mem_cgroup *memcg, *old_memcg;
> +	void *ptr;
> +
> +	memcg = bpf_map_get_memcg(&smap->map);
> +	old_memcg = set_active_memcg(memcg);
> +	ptr = bpf_map_area_mmapable_alloc(PAGE_ALIGN(smap->map.value_size),
> +					  NUMA_NO_NODE);
> +	set_active_memcg(old_memcg);
> +	mem_cgroup_put(memcg);
> +
> +	return ptr;
> +}

[ ... ]

> @@ -76,10 +122,19 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>   		void *value, bool charge_mem, gfp_t gfp_flags)
>   {
>   	struct bpf_local_storage_elem *selem;
> +	void *mmapable_value = NULL;
> +	u32 selem_mem;
>   
> -	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
> +	selem_mem = selem_bytes_used(smap);
> +	if (charge_mem && mem_charge(smap, owner, selem_mem))
>   		return NULL;
>   
> +	if (smap->map.map_flags & BPF_F_MMAPABLE) {
> +		mmapable_value = alloc_mmapable_selem_value(smap);

This probably is not always safe for a bpf prog to do. Leaving aside that the
gfp_flags argument is not used, bpf local storage is moving to bpf's memalloc
because of
https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/

> +		if (!mmapable_value)
> +			goto err_out;
> +	}
> +
kernel test robot Nov. 21, 2023, 2:32 a.m. UTC | #3
Hi Dave,

kernel test robot noticed the following build warnings:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Dave-Marchevsky/bpf-Support-BPF_F_MMAPABLE-task_local-storage/20231121-020345
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20231120175925.733167-2-davemarchevsky%40fb.com
patch subject: [PATCH v1 bpf-next 1/2] bpf: Support BPF_F_MMAPABLE task_local storage
config: sparc-randconfig-r081-20231121 (https://download.01.org/0day-ci/archive/20231121/202311211037.6BmDdrr4-lkp@intel.com/config)
compiler: sparc-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231121/202311211037.6BmDdrr4-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311211037.6BmDdrr4-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> kernel/bpf/syscall.c:407:20: warning: no previous prototype for 'bpf_map_get_memcg' [-Wmissing-prototypes]
     407 | struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
         |                    ^~~~~~~~~~~~~~~~~
--
>> kernel/bpf/bpf_local_storage.c:30:7: warning: no previous prototype for 'alloc_mmapable_selem_value' [-Wmissing-prototypes]
      30 | void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
         |       ^~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/bpf_local_storage.c: In function 'bpf_selem_free_rcu':
   kernel/bpf/bpf_local_storage.c:288:39: warning: variable 'smap' set but not used [-Wunused-but-set-variable]
     288 |         struct bpf_local_storage_map *smap;
         |                                       ^~~~


vim +/bpf_map_get_memcg +407 kernel/bpf/syscall.c

   406	
 > 407	struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
   408	{
   409		if (map->objcg)
   410			return get_mem_cgroup_from_objcg(map->objcg);
   411	
   412		return root_mem_cgroup;
   413	}
   414
kernel test robot Nov. 21, 2023, 5:06 a.m. UTC | #4
Hi Dave,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Dave-Marchevsky/bpf-Support-BPF_F_MMAPABLE-task_local-storage/20231121-020345
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20231120175925.733167-2-davemarchevsky%40fb.com
patch subject: [PATCH v1 bpf-next 1/2] bpf: Support BPF_F_MMAPABLE task_local storage
config: i386-buildonly-randconfig-005-20231121 (https://download.01.org/0day-ci/archive/20231121/202311211247.KBiJyddD-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231121/202311211247.KBiJyddD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311211247.KBiJyddD-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: kernel/bpf/bpf_local_storage.o: in function `alloc_mmapable_selem_value':
   bpf_local_storage.c:(.text+0x207): undefined reference to `bpf_map_get_memcg'
   ld: kernel/bpf/bpf_local_storage.o: in function `bpf_selem_alloc':
   bpf_local_storage.c:(.text+0x410): undefined reference to `bpf_map_get_memcg'
>> ld: bpf_local_storage.c:(.text+0x543): undefined reference to `bpf_map_get_memcg'
kernel test robot Nov. 21, 2023, 5:20 a.m. UTC | #5
Hi Dave,

kernel test robot noticed the following build warnings:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Dave-Marchevsky/bpf-Support-BPF_F_MMAPABLE-task_local-storage/20231121-020345
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20231120175925.733167-2-davemarchevsky%40fb.com
patch subject: [PATCH v1 bpf-next 1/2] bpf: Support BPF_F_MMAPABLE task_local storage
config: x86_64-randconfig-121-20231121 (https://download.01.org/0day-ci/archive/20231121/202311211358.O24bsL46-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231121/202311211358.O24bsL46-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311211358.O24bsL46-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> kernel/bpf/bpf_local_storage.c:30:6: sparse: sparse: symbol 'alloc_mmapable_selem_value' was not declared. Should it be static?
>> kernel/bpf/bpf_local_storage.c:291:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct bpf_local_storage_map *smap @@     got struct bpf_local_storage_map [noderef] __rcu *smap @@
   kernel/bpf/bpf_local_storage.c:291:14: sparse:     expected struct bpf_local_storage_map *smap
   kernel/bpf/bpf_local_storage.c:291:14: sparse:     got struct bpf_local_storage_map [noderef] __rcu *smap
   kernel/bpf/bpf_local_storage.c: note: in included file (through include/linux/mmzone.h, include/linux/gfp.h, include/linux/umh.h, include/linux/kmod.h, ...):
   include/linux/page-flags.h:242:46: sparse: sparse: self-comparison always evaluates to false

vim +/alloc_mmapable_selem_value +30 kernel/bpf/bpf_local_storage.c

    29	
  > 30	void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
    31	{
    32		struct mem_cgroup *memcg, *old_memcg;
    33		void *ptr;
    34	
    35		memcg = bpf_map_get_memcg(&smap->map);
    36		old_memcg = set_active_memcg(memcg);
    37		ptr = bpf_map_area_mmapable_alloc(PAGE_ALIGN(smap->map.value_size),
    38						  NUMA_NO_NODE);
    39		set_active_memcg(old_memcg);
    40		mem_cgroup_put(memcg);
    41	
    42		return ptr;
    43	}
    44
Alexei Starovoitov Nov. 21, 2023, 5:44 a.m. UTC | #6
On Mon, Nov 20, 2023 at 09:59:24AM -0800, Dave Marchevsky wrote:
> +void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)

should be static?

> +static int task_storage_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
> +{
> +	void *data;
> +
> +	if (!(map->map_flags & BPF_F_MMAPABLE) || vma->vm_pgoff ||
> +	    (vma->vm_end - vma->vm_start) < map->value_size)
> +		return -EINVAL;
> +
> +	WARN_ON_ONCE(!bpf_rcu_lock_held());

why? This is mmap() syscall. What is the concern?

> +	bpf_task_storage_lock();
> +	data = __bpf_task_storage_get(map, current, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE,
> +				      0, true);

0 for gfp_flags? It probably should be GFP_USER?
David Marchevsky Nov. 21, 2023, 6:11 a.m. UTC | #7
On 11/20/23 7:42 PM, Martin KaFai Lau wrote:
> On 11/20/23 9:59 AM, Dave Marchevsky wrote:
>> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
>> index 173ec7f43ed1..114973f925ea 100644
>> --- a/include/linux/bpf_local_storage.h
>> +++ b/include/linux/bpf_local_storage.h
>> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>>        * the number of cachelines accessed during the cache hit case.
>>        */
>>       struct bpf_local_storage_map __rcu *smap;
>> -    u8 data[] __aligned(8);
>> +    /* Need to duplicate smap's map_flags as smap may be gone when
>> +     * it's time to free bpf_local_storage_data
>> +     */
>> +    u64 smap_map_flags;
>> +    /* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
>> +     * Otherwise the actual mapval data lives here
>> +     */
>> +    union {
>> +        DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
>> +        void *actual_data __aligned(8);
> 
> The pages (that can be mmap'ed later) feel like a specific kind of kptr.
> 
> Have you thought about allowing a kptr (pointing to some pages that can be mmap'ed later) to be stored as one of the members of the map's value as a kptr. bpf_local_storage_map is one of the maps that supports kptr.
> 
> struct normal_and_mmap_value {
>     int some_int;
>     int __percpu_kptr *some_cnts;
> 
>     struct bpf_mmap_page __kptr *some_stats;
> };
> 
> struct mmap_only_value {
>     struct bpf_mmap_page __kptr *some_stats;
> };
> 
> [ ... ]
> 

This is an intriguing idea. For conciseness I'll call this specific
kind of kptr 'mmapable kptrs' for the rest of this message. Below is
more of a brainstorming dump than a cohesive response; separate trains
of thought are separated by two newlines.


My initial thought upon seeing struct normal_and_mmap_value was to note
that we currently don't support mmap'ing for map_value types with _any_
special fields ('special' as determined by btf_parse_fields). But IIUC
you're actually talking about exposing the some_stats pointee memory via
mmap, not the containing struct with kptr fields. That is, for maps that
support these kptrs, mmap()ing a map with value type struct
normal_and_mmap_value would return the some_stats pointer value, and
likely initialize the pointer similarly to BPF_LOCAL_STORAGE_GET_F_CREATE
logic in this patch. We'd only be able to support one such mmapable kptr
field per mapval type, but that isn't a dealbreaker.

Some maps, like task_storage, would only support mmap() on a map_value
with mmapable kptr field, as mmap()ing the mapval itself doesn't make
sense or is unsafe. Seems like arraymap would do the opposite, only
supporting mmap()ing the mapval itself. I'm curious if any map could
feasibly support both, and if so, might have to do logic like:

  if (map_val has mmapable kptr)
     mmap the pointee of mmapable kptr
  else
     mmap the map_val itself

Which is maybe too confusing of a detail to expose to BPF program
writers. Maybe a little too presumptuous and brainstorm-ey given the
limited number of mmap()able maps currently, but making this a kptr type
means maps should either ignore/fail if they don't support it, or have
consistent semantics amongst maps that do support it.


Instead of  struct bpf_mmap_page __kptr *some_stats;  I'd prefer
something like

struct my_type { long count; long another_count; };

struct mmap_only_value {
  struct my_type __mmapable_kptr *some_stats;
};

This way the type of mmap()able field is known to BPF programs that
interact with it. This is all assuming that struct bpf_mmap_page is an
opaque page-sized blob of bytes.


We could then support structs like

struct mmap_value_and_lock {
  struct bpf_spin_lock l;
  int some_int;
  struct my_type __mmapable_kptr *some_stats;
};

and have bpf_map_update_elem handler use the spin_lock instead of
map-internal lock where appropriate. But no way to ensure userspace task
using the mmap()ed region uses the spin_lock.

>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>> index 146824cc9689..9b3becbcc1a3 100644
>> --- a/kernel/bpf/bpf_local_storage.c
>> +++ b/kernel/bpf/bpf_local_storage.c
>> @@ -15,7 +15,8 @@
>>   #include <linux/rcupdate_trace.h>
>>   #include <linux/rcupdate_wait.h>
>>   -#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
>> +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK \
>> +    (BPF_F_NO_PREALLOC | BPF_F_CLONE | BPF_F_MMAPABLE)
>>     static struct bpf_local_storage_map_bucket *
>>   select_bucket(struct bpf_local_storage_map *smap,
>> @@ -24,6 +25,51 @@ select_bucket(struct bpf_local_storage_map *smap,
>>       return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
>>   }
>>   +struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map);
>> +
>> +void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
>> +{
>> +    struct mem_cgroup *memcg, *old_memcg;
>> +    void *ptr;
>> +
>> +    memcg = bpf_map_get_memcg(&smap->map);
>> +    old_memcg = set_active_memcg(memcg);
>> +    ptr = bpf_map_area_mmapable_alloc(PAGE_ALIGN(smap->map.value_size),
>> +                      NUMA_NO_NODE);
>> +    set_active_memcg(old_memcg);
>> +    mem_cgroup_put(memcg);
>> +
>> +    return ptr;
>> +}
> 
> [ ... ]
> 
>> @@ -76,10 +122,19 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>>           void *value, bool charge_mem, gfp_t gfp_flags)
>>   {
>>       struct bpf_local_storage_elem *selem;
>> +    void *mmapable_value = NULL;
>> +    u32 selem_mem;
>>   -    if (charge_mem && mem_charge(smap, owner, smap->elem_size))
>> +    selem_mem = selem_bytes_used(smap);
>> +    if (charge_mem && mem_charge(smap, owner, selem_mem))
>>           return NULL;
>>   +    if (smap->map.map_flags & BPF_F_MMAPABLE) {
>> +        mmapable_value = alloc_mmapable_selem_value(smap);
> 
> This probably is not always safe for bpf prog to do. Leaving the gfp_flags was not used aside, the bpf local storage is moving to the bpf's memalloc because of https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/
> 

Minor point: alloc_mmapable_selem_value's bpf_map_area_mmapable_alloc
call will always call vmalloc under the hood. vmalloc has locks as well,
so your point stands.

I think I see how this ties into your 'specific kptr type' proposal
above. Let me know if this sounds right: if there was a bpf_mem_alloc
type focused on providing vmalloc'd mmap()able memory, we could use it
here instead of raw vmalloc and avoid the lock recursion problem linked
above. Such an allocator could be used in something like bpf_obj_new to
create the __mmapable_kptr - either from BPF prog or mmap path before
remap_vmalloc_range.

re: gfp_flags, looks like verifier is setting this param to either
GFP_ATOMIC or GFP_KERNEL. Looks like we should not allow GFP_KERNEL
allocs here?

>> +        if (!mmapable_value)
>> +            goto err_out;
>> +    }
>> +
>
Yonghong Song Nov. 21, 2023, 6:41 a.m. UTC | #8
On 11/20/23 12:59 PM, Dave Marchevsky wrote:
> This patch modifies the generic bpf_local_storage infrastructure to
> support mmapable map values and adds mmap() handling to task_local
> storage leveraging this new functionality. A userspace task which
> mmap's a task_local storage map will receive a pointer to the map_value
> corresponding to that tasks' key - mmap'ing in other tasks' mapvals is
> not supported in this patch.
>
> Currently, struct bpf_local_storage_elem contains both bookkeeping
> information as well as a struct bpf_local_storage_data with additional
> bookkeeping information and the actual mapval data. We can't simply map
> the page containing this struct into userspace. Instead, mmapable
> local_storage uses bpf_local_storage_data's data field to point to the
> actual mapval, which is allocated separately such that it can be
> mmapped. Only the mapval lives on the page(s) allocated for it.
>
> The lifetime of the actual_data mmapable region is tied to the
> bpf_local_storage_elem which points to it. This doesn't necessarily mean
> that the pages go away when the bpf_local_storage_elem is free'd - if
> they're mapped into some userspace process they will remain until
> unmapped, but are no longer the task_local storage's mapval.
>
> Implementation details:
>
>    * A few small helpers are added to deal with bpf_local_storage_data's
>      'data' field having different semantics when the local_storage map
>      is mmapable. With their help, many of the changes to existing code
>      are purely mechanical (e.g. sdata->data becomes sdata_mapval(sdata),
>      selem->elem_size becomes selem_bytes_used(selem)).
>
>    * The map flags are copied into bpf_local_storage_data when its
>      containing bpf_local_storage_elem is alloc'd, since the
>      bpf_local_storage_map associated with them may be gone when
>      bpf_local_storage_data is free'd, and testing flags for
>      BPF_F_MMAPABLE is necessary when free'ing to ensure that the
>      mmapable region is free'd.
>      * The extra field doesn't change bpf_local_storage_elem's size.
>        There were 48 bytes of padding after the bpf_local_storage_data
>        field, now there are 40.
>
>    * Currently, bpf_local_storage_update always creates a new
>      bpf_local_storage_elem for the 'updated' value - the only exception
>      being if the map_value has a bpf_spin_lock field, in which case the
>      spin lock is grabbed instead of the less granular bpf_local_storage
>      lock, and the value updated in place. This inplace update behavior
>      is desired for mmapable local_storage map_values as well, since
>      creating a new selem would result in new mmapable pages.
>
>    * The size of the mmapable pages are accounted for when calling
>      mem_{charge,uncharge}. If the pages are mmap'd into a userspace task
>      mem_uncharge may be called before they actually go away.
>
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>   include/linux/bpf_local_storage.h |  14 ++-
>   kernel/bpf/bpf_local_storage.c    | 145 ++++++++++++++++++++++++------
>   kernel/bpf/bpf_task_storage.c     |  35 ++++++--
>   kernel/bpf/syscall.c              |   2 +-
>   4 files changed, 163 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 173ec7f43ed1..114973f925ea 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>   	 * the number of cachelines accessed during the cache hit case.
>   	 */
>   	struct bpf_local_storage_map __rcu *smap;
> -	u8 data[] __aligned(8);
> +	/* Need to duplicate smap's map_flags as smap may be gone when
> +	 * it's time to free bpf_local_storage_data
> +	 */
> +	u64 smap_map_flags;
> +	/* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
> +	 * Otherwise the actual mapval data lives here
> +	 */
> +	union {
> +		DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
> +		void *actual_data __aligned(8);

I think we can remove __aligned(8) from 'void *actual_data __aligned(8)'
in the above, for two reasons. First, the first element in the union is
__aligned(8), so the rest of the union members will be at least 8-byte
aligned. Second, IIUC, the code does
     actual_data = mmapable_value
where mmapable_value has type 'void *'. So actual_data only holds a
pointer to the mmap'ed pages and needs just pointer alignment, which
'void *' already provides. So we can remove __aligned(8) safely. It also
avoids giving the impression that 'actual_data' has to be 8-byte aligned
on a 32-bit system, which 'actual_data' itself does not require.
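
i.e. the union could then simply be:

        union {
                DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
                void *actual_data;
        };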


> +	};
>   };
>   
>   /* Linked to bpf_local_storage and bpf_local_storage_map */
> @@ -124,6 +134,8 @@ static struct bpf_local_storage_cache name = {			\
>   /* Helper functions for bpf_local_storage */
>   int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
>   
> +void *sdata_mapval(struct bpf_local_storage_data *data);
> +
>   struct bpf_map *
>   bpf_local_storage_map_alloc(union bpf_attr *attr,
>   			    struct bpf_local_storage_cache *cache,

[...]
Yonghong Song Nov. 21, 2023, 3:34 p.m. UTC | #9
On 11/20/23 12:59 PM, Dave Marchevsky wrote:
> This patch modifies the generic bpf_local_storage infrastructure to
> support mmapable map values and adds mmap() handling to task_local
> storage leveraging this new functionality. A userspace task which
> mmap's a task_local storage map will receive a pointer to the map_value
> corresponding to that tasks' key - mmap'ing in other tasks' mapvals is
> not supported in this patch.
>
> Currently, struct bpf_local_storage_elem contains both bookkeeping
> information as well as a struct bpf_local_storage_data with additional
> bookkeeping information and the actual mapval data. We can't simply map
> the page containing this struct into userspace. Instead, mmapable
> local_storage uses bpf_local_storage_data's data field to point to the
> actual mapval, which is allocated separately such that it can be
> mmapped. Only the mapval lives on the page(s) allocated for it.
>
> The lifetime of the actual_data mmapable region is tied to the
> bpf_local_storage_elem which points to it. This doesn't necessarily mean
> that the pages go away when the bpf_local_storage_elem is free'd - if
> they're mapped into some userspace process they will remain until
> unmapped, but are no longer the task_local storage's mapval.
>
> Implementation details:
>
>    * A few small helpers are added to deal with bpf_local_storage_data's
>      'data' field having different semantics when the local_storage map
>      is mmapable. With their help, many of the changes to existing code
>      are purely mechanical (e.g. sdata->data becomes sdata_mapval(sdata),
>      selem->elem_size becomes selem_bytes_used(selem)).
>
>    * The map flags are copied into bpf_local_storage_data when its
>      containing bpf_local_storage_elem is alloc'd, since the
>      bpf_local_storage_map associated with them may be gone when
>      bpf_local_storage_data is free'd, and testing flags for
>      BPF_F_MMAPABLE is necessary when free'ing to ensure that the
>      mmapable region is free'd.
>      * The extra field doesn't change bpf_local_storage_elem's size.
>        There were 48 bytes of padding after the bpf_local_storage_data
>        field, now there are 40.
>
>    * Currently, bpf_local_storage_update always creates a new
>      bpf_local_storage_elem for the 'updated' value - the only exception
>      being if the map_value has a bpf_spin_lock field, in which case the
>      spin lock is grabbed instead of the less granular bpf_local_storage
>      lock, and the value updated in place. This inplace update behavior
>      is desired for mmapable local_storage map_values as well, since
>      creating a new selem would result in new mmapable pages.
>
>    * The size of the mmapable pages are accounted for when calling
>      mem_{charge,uncharge}. If the pages are mmap'd into a userspace task
>      mem_uncharge may be called before they actually go away.
>
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>   include/linux/bpf_local_storage.h |  14 ++-
>   kernel/bpf/bpf_local_storage.c    | 145 ++++++++++++++++++++++++------
>   kernel/bpf/bpf_task_storage.c     |  35 ++++++--
>   kernel/bpf/syscall.c              |   2 +-
>   4 files changed, 163 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 173ec7f43ed1..114973f925ea 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>   	 * the number of cachelines accessed during the cache hit case.
>   	 */
>   	struct bpf_local_storage_map __rcu *smap;
> -	u8 data[] __aligned(8);
> +	/* Need to duplicate smap's map_flags as smap may be gone when
> +	 * it's time to free bpf_local_storage_data
> +	 */
> +	u64 smap_map_flags;
> +	/* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
> +	 * Otherwise the actual mapval data lives here
> +	 */
> +	union {
> +		DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
> +		void *actual_data __aligned(8);
> +	};
>   };
>   
>   /* Linked to bpf_local_storage and bpf_local_storage_map */
> @@ -124,6 +134,8 @@ static struct bpf_local_storage_cache name = {			\
>   /* Helper functions for bpf_local_storage */
>   int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
>   
> +void *sdata_mapval(struct bpf_local_storage_data *data);
> +
>   struct bpf_map *
>   bpf_local_storage_map_alloc(union bpf_attr *attr,
>   			    struct bpf_local_storage_cache *cache,
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 146824cc9689..9b3becbcc1a3 100644
> --- a/kernel/bpf/bpf_local_storage.c
[...]
> @@ -583,14 +665,14 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>   		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
>   		if (err) {
>   			bpf_selem_free(selem, smap, true);
> -			mem_uncharge(smap, owner, smap->elem_size);
> +			mem_uncharge(smap, owner, selem_bytes_used(smap));
>   			return ERR_PTR(err);
>   		}
>   
>   		return SDATA(selem);
>   	}
>   
> -	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
> +	if (can_update_existing_selem(smap, map_flags) && !(map_flags & BPF_NOEXIST)) {
>   		/* Hoping to find an old_sdata to do inline update
>   		 * such that it can avoid taking the local_storage->lock
>   		 * and changing the lists.
> @@ -601,8 +683,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>   		if (err)
>   			return ERR_PTR(err);
>   		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
> -			copy_map_value_locked(&smap->map, old_sdata->data,
> -					      value, false);
> +			if (map_flags & BPF_F_LOCK)
> +				copy_map_value_locked(&smap->map,
> +						      sdata_mapval(old_sdata),
> +						      value, false);
> +			else
> +				copy_map_value(&smap->map, sdata_mapval(old_sdata),
> +					       value);

IIUC, if there are two 'storage_update's to the same map/key, the two
updates will be serialized by the spin_lock. What about concurrent
updates to the mmap'ed sdata - do we need any protection here?

>   			return old_sdata;
>   		}
>   	}
> @@ -633,8 +720,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>   		goto unlock;
>   
>   	if (old_sdata && (map_flags & BPF_F_LOCK)) {
> -		copy_map_value_locked(&smap->map, old_sdata->data, value,
> -				      false);
> +		copy_map_value_locked(&smap->map, sdata_mapval(old_sdata),
> +				      value, false);
>   		selem = SELEM(old_sdata);
>   		goto unlock;
>   	}
> @@ -656,7 +743,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>   unlock:
>   	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
>   	if (alloc_selem) {
> -		mem_uncharge(smap, owner, smap->elem_size);
> +		mem_uncharge(smap, owner, selem_bytes_used(smap));
>   		bpf_selem_free(alloc_selem, smap, true);
>   	}
>   	return err ? ERR_PTR(err) : SDATA(selem);
> @@ -707,6 +794,10 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
>   	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
>   		return -E2BIG;
>   
> +	if ((attr->map_flags & BPF_F_MMAPABLE) &&
> +	    attr->map_type != BPF_MAP_TYPE_TASK_STORAGE)
> +		return -EINVAL;
> +
>   	return 0;
>   }
>   

[...]
Martin KaFai Lau Nov. 21, 2023, 7:27 p.m. UTC | #10
On 11/20/23 10:11 PM, David Marchevsky wrote:
> 
> 
> On 11/20/23 7:42 PM, Martin KaFai Lau wrote:
>> On 11/20/23 9:59 AM, Dave Marchevsky wrote:
>>> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
>>> index 173ec7f43ed1..114973f925ea 100644
>>> --- a/include/linux/bpf_local_storage.h
>>> +++ b/include/linux/bpf_local_storage.h
>>> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>>>         * the number of cachelines accessed during the cache hit case.
>>>         */
>>>        struct bpf_local_storage_map __rcu *smap;
>>> -    u8 data[] __aligned(8);
>>> +    /* Need to duplicate smap's map_flags as smap may be gone when
>>> +     * it's time to free bpf_local_storage_data
>>> +     */
>>> +    u64 smap_map_flags;
>>> +    /* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
>>> +     * Otherwise the actual mapval data lives here
>>> +     */
>>> +    union {
>>> +        DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
>>> +        void *actual_data __aligned(8);
>>
>> The pages (that can be mmap'ed later) feel like a specific kind of kptr.
>>
>> Have you thought about allowing a kptr (pointing to some pages that can be mmap'ed later) to be stored as one of the members of the map's value as a kptr. bpf_local_storage_map is one of the maps that supports kptr.
>>
>> struct normal_and_mmap_value {
>>      int some_int;
>>      int __percpu_kptr *some_cnts;
>>
>>      struct bpf_mmap_page __kptr *some_stats;
>> };
>>
>> struct mmap_only_value {
>>      struct bpf_mmap_page __kptr *some_stats;
>> };
>>
>> [ ... ]
>>
> 
> This is an intriguing idea. For conciseness I'll call this specific
> kind of kptr 'mmapable kptrs' for the rest of this message. Below is
> more of a brainstorming dump than a cohesive response, separate trains
> of thought are separated by two newlines.

Thanks for bearing with me; some of these ideas could be crazy. I am trying to
see how this would look for the other local storages, sk and inode. Allocating
a page for each sk will not be nice for a server with half a million sk(s),
e.g. half a million sk(s) sharing a few bandwidth policies or a few tuning
parameters. Creating something that is mmap'able to user space and also
sharable among many sk(s) would be useful.

> 
> 
> My initial thought upon seeing struct normal_and_mmap_value was to note
> that we currently don't support mmaping for map_value types with _any_
> special fields ('special' as determined by btf_parse_fields). But IIUC
> you're actually talking about exposing the some_stats pointee memory via
> mmap, not the containing struct with kptr fields. That is, for maps that
> support these kptrs, mmap()ing a map with value type struct
> normal_and_mmap_value would return the some_stats pointer value, and
> likely initialize the pointer similarly to BPF_LOCAL_STORAGE_GET_F_CREATE
> logic in this patch. We'd only be able to support one such mmapable kptr
> field per mapval type, but that isn't a dealbreaker.
> 
> Some maps, like task_storage, would only support mmap() on a map_value
> with mmapable kptr field, as mmap()ing the mapval itself doesn't make
> sense or is unsafe. Seems like arraymap would do the opposite, only

Changing direction a bit since arraymap is brought up. :)

arraymap supports BPF_F_MMAPABLE. If the local storage map's value can store an
arraymap as a kptr, the bpf prog should be able to access it as a map, much
like the current map-in-map setup. The arraymap can also be used as a regular
map from user space (e.g. via pinning). It may need some btf plumbing to tell
the verifier the value type of the arraymap.

The syscall bpf_map_update_elem(task_storage_map_fd, &task_pidfd, &value, flags)
can be used, where value->array_mmap is initialized with an arraymap fd. This
would limit arraymap kptr updates to the syscall side only, which seems to be
your use case also? Allocating the arraymap from the bpf prog side needs some
thought and a whitelist.

The same goes for the syscall bpf_map_lookup_elem(task_storage_map_fd,
&task_pidfd, &value). The kernel can return an fd in value->array_mmap. Maybe
we can create a libbpf helper that uses the value's btf to free the fd
resources held in the looked-up value.

The bpf_local_storage_map side probably does not need to support mmap() then.
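
A rough userspace sketch of that flow, assuming the proposed semantics
existed. The 'array_mmap' field and storing an arraymap fd into the task
storage value via bpf_map_update_elem() are hypothetical;
bpf_map_create(), bpf_map_update_elem() and mmap() on a BPF_F_MMAPABLE
arraymap fd are existing APIs, and task_storage_map_fd is assumed to be
an already-created task storage map:

  /* needs <bpf/bpf.h>, <linux/bpf.h>, <sys/mman.h>, <sys/syscall.h>, <unistd.h> */
  struct my_type { long count; long another_count; };
  /* hypothetical userspace view of the task storage value */
  struct mmap_only_value { __u64 array_mmap; /* arraymap fd on update */ };

  /* create a BPF_F_MMAPABLE arraymap holding the shared, mmap'able data */
  LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
  int array_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "stats", sizeof(int),
                                sizeof(struct my_type), 1, &opts);

  /* store the arraymap into this task's local storage value (proposed) */
  int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
  struct mmap_only_value val = { .array_mmap = array_fd };
  bpf_map_update_elem(task_storage_map_fd, &pidfd, &val, BPF_ANY);

  /* userspace mmap()s the arraymap itself, not the local storage map */
  struct my_type *stats = mmap(NULL, sizeof(*stats), PROT_READ | PROT_WRITE,
                               MAP_SHARED, array_fd, 0);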


> supporting mmap()ing the mapval itself. I'm curious if any map could
> feasibly support both, and if so, might have to do logic like:
> 
>    if (map_val has mmapable kptr)
>       mmap the pointee of mmapable kptr
>    else
>       mmap the map_val itself
> 
> Which is maybe too confusing of a detail to expose to BPF program
> writers. Maybe a little too presumptuous and brainstorm-ey given the
> limited number of mmap()able maps currently, but making this a kptr type
> means maps should either ignore/fail if they don't support it, or have
> consistent semantics amongst maps that do support it.
> 
> 
> Instead of  struct bpf_mmap_page __kptr *some_stats;  I'd prefer
> something like
> 
> struct my_type { long count; long another_count; };
> 
> struct mmap_only_value {
>    struct my_type __mmapable_kptr *some_stats;
> };
> 
> This way the type of mmap()able field is known to BPF programs that
> interact with it. This is all assuming that struct bpf_mmap_page is an
> opaque page-sized blob of bytes.
> 
> 
> We could then support structs like
> 
> struct mmap_value_and_lock {
>    struct bpf_spin_lock l;
>    int some_int;
>    struct my_type __mmapable_kptr *some_stats;
> };
> 
> and have bpf_map_update_elem handler use the spin_lock instead of
> map-internal lock where appropriate. But no way to ensure userspace task
> using the mmap()ed region uses the spin_lock.
> 
>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>> index 146824cc9689..9b3becbcc1a3 100644
>>> --- a/kernel/bpf/bpf_local_storage.c
>>> +++ b/kernel/bpf/bpf_local_storage.c
>>> @@ -15,7 +15,8 @@
>>>    #include <linux/rcupdate_trace.h>
>>>    #include <linux/rcupdate_wait.h>
>>>    -#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
>>> +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK \
>>> +    (BPF_F_NO_PREALLOC | BPF_F_CLONE | BPF_F_MMAPABLE)
>>>      static struct bpf_local_storage_map_bucket *
>>>    select_bucket(struct bpf_local_storage_map *smap,
>>> @@ -24,6 +25,51 @@ select_bucket(struct bpf_local_storage_map *smap,
>>>        return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
>>>    }
>>>    +struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map);
>>> +
>>> +void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
>>> +{
>>> +    struct mem_cgroup *memcg, *old_memcg;
>>> +    void *ptr;
>>> +
>>> +    memcg = bpf_map_get_memcg(&smap->map);
>>> +    old_memcg = set_active_memcg(memcg);
>>> +    ptr = bpf_map_area_mmapable_alloc(PAGE_ALIGN(smap->map.value_size),
>>> +                      NUMA_NO_NODE);
>>> +    set_active_memcg(old_memcg);
>>> +    mem_cgroup_put(memcg);
>>> +
>>> +    return ptr;
>>> +}
>>
>> [ ... ]
>>
>>> @@ -76,10 +122,19 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>>>            void *value, bool charge_mem, gfp_t gfp_flags)
>>>    {
>>>        struct bpf_local_storage_elem *selem;
>>> +    void *mmapable_value = NULL;
>>> +    u32 selem_mem;
>>>    -    if (charge_mem && mem_charge(smap, owner, smap->elem_size))
>>> +    selem_mem = selem_bytes_used(smap);
>>> +    if (charge_mem && mem_charge(smap, owner, selem_mem))
>>>            return NULL;
>>>    +    if (smap->map.map_flags & BPF_F_MMAPABLE) {
>>> +        mmapable_value = alloc_mmapable_selem_value(smap);
>>
>> This probably is not always safe for bpf prog to do. Leaving the gfp_flags was not used aside, the bpf local storage is moving to the bpf's memalloc because of https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/
>>
> 
> Minor point: alloc_mmapable_selem_value's bpf_map_area_mmapable_alloc
> call will always call vmalloc under the hood. vmalloc has locks as well,
> so your point stands.
> 
> I think I see how this ties into your 'specific kptr type' proposal
> above. Let me know if this sounds right: if there was a bpf_mem_alloc
> type focused on providing vmalloc'd mmap()able memory, we could use it
> here instead of raw vmalloc and avoid the lock recursion problem linked
> above. Such an allocator could be used in something like bpf_obj_new to
> create the __mmapable_kptr - either from BPF prog or mmap path before
> remap_vmalloc_range.
> 
> re: gfp_flags, looks like verifier is setting this param to either
> GFP_ATOMIC or GFP_KERNEL. Looks like we should not allow GFP_KERNEL
> allocs here?

Going back to this patch, I am not sure what it would take to make
bpf_mem_alloc() mmap()able. Maybe we can limit the blast radius for now by
restricting this allocation to the user space mmap() call. Does that fit your
use case? A whitelist for bpf progs could also be created later if needed.

> 
>>> +        if (!mmapable_value)
>>> +            goto err_out;
>>> +    }
>>> +
>>
Andrii Nakryiko Nov. 21, 2023, 7:30 p.m. UTC | #11
On Mon, Nov 20, 2023 at 9:59 AM Dave Marchevsky <davemarchevsky@fb.com> wrote:
>
> This patch modifies the generic bpf_local_storage infrastructure to
> support mmapable map values and adds mmap() handling to task_local
> storage leveraging this new functionality. A userspace task which
> mmap's a task_local storage map will receive a pointer to the map_value
> corresponding to that tasks' key - mmap'ing in other tasks' mapvals is
> not supported in this patch.
>
> Currently, struct bpf_local_storage_elem contains both bookkeeping
> information as well as a struct bpf_local_storage_data with additional
> bookkeeping information and the actual mapval data. We can't simply map
> the page containing this struct into userspace. Instead, mmapable
> local_storage uses bpf_local_storage_data's data field to point to the
> actual mapval, which is allocated separately such that it can be
> mmapped. Only the mapval lives on the page(s) allocated for it.
>
> The lifetime of the actual_data mmapable region is tied to the
> bpf_local_storage_elem which points to it. This doesn't necessarily mean
> that the pages go away when the bpf_local_storage_elem is free'd - if
> they're mapped into some userspace process they will remain until
> unmapped, but are no longer the task_local storage's mapval.
>
> Implementation details:
>
>   * A few small helpers are added to deal with bpf_local_storage_data's
>     'data' field having different semantics when the local_storage map
>     is mmapable. With their help, many of the changes to existing code
>     are purely mechanical (e.g. sdata->data becomes sdata_mapval(sdata),
>     selem->elem_size becomes selem_bytes_used(selem)).

might be worth doing this as a pre-patch with no functional changes to
make the main change a bit smaller and more focused?

>
>   * The map flags are copied into bpf_local_storage_data when its
>     containing bpf_local_storage_elem is alloc'd, since the
>     bpf_local_storage_map associated with them may be gone when
>     bpf_local_storage_data is free'd, and testing flags for
>     BPF_F_MMAPABLE is necessary when free'ing to ensure that the
>     mmapable region is free'd.
>     * The extra field doesn't change bpf_local_storage_elem's size.
>       There were 48 bytes of padding after the bpf_local_storage_data
>       field, now there are 40.
>
>   * Currently, bpf_local_storage_update always creates a new
>     bpf_local_storage_elem for the 'updated' value - the only exception
>     being if the map_value has a bpf_spin_lock field, in which case the
>     spin lock is grabbed instead of the less granular bpf_local_storage
>     lock, and the value updated in place. This inplace update behavior
>     is desired for mmapable local_storage map_values as well, since
>     creating a new selem would result in new mmapable pages.
>
>   * The size of the mmapable pages are accounted for when calling
>     mem_{charge,uncharge}. If the pages are mmap'd into a userspace task
>     mem_uncharge may be called before they actually go away.
>
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>  include/linux/bpf_local_storage.h |  14 ++-
>  kernel/bpf/bpf_local_storage.c    | 145 ++++++++++++++++++++++++------
>  kernel/bpf/bpf_task_storage.c     |  35 ++++++--
>  kernel/bpf/syscall.c              |   2 +-
>  4 files changed, 163 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 173ec7f43ed1..114973f925ea 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>          * the number of cachelines accessed during the cache hit case.
>          */
>         struct bpf_local_storage_map __rcu *smap;
> -       u8 data[] __aligned(8);
> +       /* Need to duplicate smap's map_flags as smap may be gone when
> +        * it's time to free bpf_local_storage_data
> +        */
> +       u64 smap_map_flags;
> +       /* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
> +        * Otherwise the actual mapval data lives here
> +        */
> +       union {
> +               DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
> +               void *actual_data __aligned(8);

I don't know if it's the issue, but it's probably best to keep the
FLEX_ARRAY member last even within the union, just in case some tool
doesn't handle a FLEX_ARRAY member that isn't last, line-number-wise?
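
i.e. something like:

        union {
                void *actual_data __aligned(8);
                DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
        };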


> +       };
>  };
>
>  /* Linked to bpf_local_storage and bpf_local_storage_map */
> @@ -124,6 +134,8 @@ static struct bpf_local_storage_cache name = {                      \
>  /* Helper functions for bpf_local_storage */
>  int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
>
> +void *sdata_mapval(struct bpf_local_storage_data *data);
> +
>  struct bpf_map *
>  bpf_local_storage_map_alloc(union bpf_attr *attr,
>                             struct bpf_local_storage_cache *cache,
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 146824cc9689..9b3becbcc1a3 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -15,7 +15,8 @@
>  #include <linux/rcupdate_trace.h>
>  #include <linux/rcupdate_wait.h>
>
> -#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK \
> +       (BPF_F_NO_PREALLOC | BPF_F_CLONE | BPF_F_MMAPABLE)
>
>  static struct bpf_local_storage_map_bucket *
>  select_bucket(struct bpf_local_storage_map *smap,
> @@ -24,6 +25,51 @@ select_bucket(struct bpf_local_storage_map *smap,
>         return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
>  }
>
> +struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map);
> +
> +void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
> +{
> +       struct mem_cgroup *memcg, *old_memcg;
> +       void *ptr;
> +
> +       memcg = bpf_map_get_memcg(&smap->map);
> +       old_memcg = set_active_memcg(memcg);
> +       ptr = bpf_map_area_mmapable_alloc(PAGE_ALIGN(smap->map.value_size),
> +                                         NUMA_NO_NODE);
> +       set_active_memcg(old_memcg);
> +       mem_cgroup_put(memcg);
> +
> +       return ptr;
> +}
> +
> +void *sdata_mapval(struct bpf_local_storage_data *data)
> +{
> +       if (data->smap_map_flags & BPF_F_MMAPABLE)
> +               return data->actual_data;
> +       return &data->data;
> +}

given that this is a potentially high-frequency helper called from other
.o files, and it is simple, should it be a static inline in the .h header
instead?
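
e.g. roughly, moved into bpf_local_storage.h:

        static inline void *sdata_mapval(struct bpf_local_storage_data *data)
        {
                if (data->smap_map_flags & BPF_F_MMAPABLE)
                        return data->actual_data;
                return &data->data;
        }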

> +
> +static size_t sdata_data_field_size(struct bpf_local_storage_map *smap,
> +                                   struct bpf_local_storage_data *data)
> +{
> +       if (smap->map.map_flags & BPF_F_MMAPABLE)
> +               return sizeof(void *);
> +       return (size_t)smap->map.value_size;
> +}
> +

[...]

> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index adf6dfe0ba68..ce75c8d8b2ce 100644
> --- a/kernel/bpf/bpf_task_storage.c
> +++ b/kernel/bpf/bpf_task_storage.c
> @@ -90,6 +90,7 @@ void bpf_task_storage_free(struct task_struct *task)
>  static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
>  {
>         struct bpf_local_storage_data *sdata;
> +       struct bpf_local_storage_map *smap;
>         struct task_struct *task;
>         unsigned int f_flags;
>         struct pid *pid;
> @@ -114,7 +115,8 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
>         sdata = task_storage_lookup(task, map, true);
>         bpf_task_storage_unlock();
>         put_pid(pid);
> -       return sdata ? sdata->data : NULL;
> +       smap = (struct bpf_local_storage_map *)map;

smap seems unused?

> +       return sdata ? sdata_mapval(sdata) : NULL;
>  out:
>         put_pid(pid);
>         return ERR_PTR(err);
> @@ -209,18 +211,19 @@ static void *__bpf_task_storage_get(struct bpf_map *map,
>                                     u64 flags, gfp_t gfp_flags, bool nobusy)
>  {
>         struct bpf_local_storage_data *sdata;
> +       struct bpf_local_storage_map *smap;
>
> +       smap = (struct bpf_local_storage_map *)map;

It's only used much later, so maybe move it down? Or just leave this part
unchanged?

>         sdata = task_storage_lookup(task, map, nobusy);
>         if (sdata)
> -               return sdata->data;
> +               return sdata_mapval(sdata);
>
>         /* only allocate new storage, when the task is refcounted */
>         if (refcount_read(&task->usage) &&
>             (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
> -               sdata = bpf_local_storage_update(
> -                       task, (struct bpf_local_storage_map *)map, value,
> -                       BPF_NOEXIST, gfp_flags);
> -               return IS_ERR(sdata) ? NULL : sdata->data;
> +               sdata = bpf_local_storage_update(task, smap, value,
> +                                                BPF_NOEXIST, gfp_flags);
> +               return IS_ERR(sdata) ? NULL : sdata_mapval(sdata);
>         }
>
>         return NULL;
> @@ -317,6 +320,25 @@ static void task_storage_map_free(struct bpf_map *map)
>         bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
>  }
>
> +static int task_storage_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
> +{
> +       void *data;
> +
> +       if (!(map->map_flags & BPF_F_MMAPABLE) || vma->vm_pgoff ||
> +           (vma->vm_end - vma->vm_start) < map->value_size)

So we enforce that vm_pgoff is zero, which is understandable. But why
disallow mmap()'ing only a smaller portion of the map value?

Also, more importantly, I think you should reject `vma->vm_end -
vma->vm_start > PAGE_ALIGN(map->value_size)`, no?
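
I.e., keeping your existing checks and adding the upper bound, something
like this (untested):

        size_t len = vma->vm_end - vma->vm_start;

        if (!(map->map_flags & BPF_F_MMAPABLE) || vma->vm_pgoff ||
            len < map->value_size || len > PAGE_ALIGN(map->value_size))
                return -EINVAL;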

Another question. I might have missed it, but where do we disallow
mmap()'ing maps that have "special" fields in map_value, like kptrs,
spin_locks, timers, etc?

> +               return -EINVAL;
> +
> +       WARN_ON_ONCE(!bpf_rcu_lock_held());
> +       bpf_task_storage_lock();
> +       data = __bpf_task_storage_get(map, current, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE,
> +                                     0, true);
> +       bpf_task_storage_unlock();
> +       if (!data)
> +               return -EINVAL;
> +
> +       return remap_vmalloc_range(vma, data, vma->vm_pgoff);
> +}
> +
>  BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
>  const struct bpf_map_ops task_storage_map_ops = {
>         .map_meta_equal = bpf_map_meta_equal,
> @@ -331,6 +353,7 @@ const struct bpf_map_ops task_storage_map_ops = {
>         .map_mem_usage = bpf_local_storage_map_mem_usage,
>         .map_btf_id = &bpf_local_storage_map_btf_id[0],
>         .map_owner_storage_ptr = task_storage_ptr,
> +       .map_mmap = task_storage_map_mmap,
>  };
>
>  const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 5e43ddd1b83f..d7c05a509870 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -404,7 +404,7 @@ static void bpf_map_release_memcg(struct bpf_map *map)
>                 obj_cgroup_put(map->objcg);
>  }
>
> -static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
> +struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
>  {
>         if (map->objcg)
>                 return get_mem_cgroup_from_objcg(map->objcg);
> --
> 2.34.1
>
Alexei Starovoitov Nov. 21, 2023, 7:49 p.m. UTC | #12
On Tue, Nov 21, 2023 at 11:27 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/20/23 10:11 PM, David Marchevsky wrote:
> >
> >
> > On 11/20/23 7:42 PM, Martin KaFai Lau wrote:
> >> On 11/20/23 9:59 AM, Dave Marchevsky wrote:
> >>> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> >>> index 173ec7f43ed1..114973f925ea 100644
> >>> --- a/include/linux/bpf_local_storage.h
> >>> +++ b/include/linux/bpf_local_storage.h
> >>> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
> >>>         * the number of cachelines accessed during the cache hit case.
> >>>         */
> >>>        struct bpf_local_storage_map __rcu *smap;
> >>> -    u8 data[] __aligned(8);
> >>> +    /* Need to duplicate smap's map_flags as smap may be gone when
> >>> +     * it's time to free bpf_local_storage_data
> >>> +     */
> >>> +    u64 smap_map_flags;
> >>> +    /* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
> >>> +     * Otherwise the actual mapval data lives here
> >>> +     */
> >>> +    union {
> >>> +        DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
> >>> +        void *actual_data __aligned(8);
> >>
> >> The pages (that can be mmap'ed later) feel like a specific kind of kptr.
> >>
> >> Have you thought about allowing a kptr (pointing to some pages that can be mmap'ed later) to be stored as one of the members of the map's value as a kptr. bpf_local_storage_map is one of the maps that supports kptr.
> >>
> >> struct normal_and_mmap_value {
> >>      int some_int;
> >>      int __percpu_kptr *some_cnts;
> >>
> >>      struct bpf_mmap_page __kptr *some_stats;
> >> };
> >>
> >> struct mmap_only_value {
> >>      struct bpf_mmap_page __kptr *some_stats;
> >> };
> >>
> >> [ ... ]
> >>
> >
> > This is an intriguing idea. For conciseness I'll call this specific
> > kind of kptr 'mmapable kptrs' for the rest of this message. Below is
> > more of a brainstorming dump than a cohesive response, separate trains
> > of thought are separated by two newlines.
>
> Thanks for bearing with me while some ideas could be crazy. I am trying to see
> how this would look like for other local storage, sk and inode. Allocating a
> page for each sk will not be nice for server with half a million sk(s). e.g.
> half a million sk(s) sharing a few bandwidth policies or a few tuning
> parameters. Creating something mmap'able to the user space and also sharable
> among many sk(s) will be useful.
>
> >
> >
> > My initial thought upon seeing struct normal_and_mmap_value was to note
> > that we currently don't support mmaping for map_value types with _any_
> > special fields ('special' as determined by btf_parse_fields). But IIUC
> > you're actually talking about exposing the some_stats pointee memory via
> > mmap, not the containing struct with kptr fields. That is, for maps that
> > support these kptrs, mmap()ing a map with value type struct
> > normal_and_mmap_value would return the some_stats pointer value, and
> > likely initialize the pointer similarly to BPF_LOCAL_STORAGE_GET_F_CREATE
> > logic in this patch. We'd only be able to support one such mmapable kptr
> > field per mapval type, but that isn't a dealbreaker.
> >
> > Some maps, like task_storage, would only support mmap() on a map_value
> > with mmapable kptr field, as mmap()ing the mapval itself doesn't make
> > sense or is unsafe. Seems like arraymap would do the opposite, only
>
> Changing direction a bit since arraymap is brought up. :)
>
> arraymap supports BPF_F_MMAPABLE. If the local storage map's value can store an
> arraymap as kptr, the bpf prog should be able to access it as a map. More like
> the current map-in-map setup. The arraymap can be used as regular map in the
> user space also (like pinning). It may need some btf plumbing to tell the value
> type of the arrayamp to the verifier.
>
> The syscall bpf_map_update_elem(task_storage_map_fd, &task_pidfd, &value, flags)
> can be used where the value->array_mmap initialized as an arraymap_fd. This will
> limit the arraymap kptr update only from the syscall side which seems to be your
> usecase also? Allocating the arraymap from the bpf prog side needs some thoughts
> and need a whitelist.
>
> The same goes for the syscall bpf_map_lookup_elem(task_storage_map_fd,
> &task_pidfd, &value). The kernel can return a fd in value->array_mmap. May be we
> can create a libbpf helper to free the fd(s) resources held in the looked-up
> value by using the value's btf.
>
> The bpf_local_storage_map side probably does not need to support mmap() then.

Martin,
that's an interesting idea!
I kinda like it and I think it's worth exploring further.

I think the main quirk of the proposed mmap-of-task-local-storage
is using 'current' task as an implicit 'key' in task local storage map.
It fits here, but I'm not sure it addresses sched-ext use case.

Tejun, David,
could you please chime in?
Do you think mmap(..., task_local_storage_map_fd, ...)
that returns a page that belongs to current task only is enough ?

If not, we need to think through how to mmap the local storage of other
tasks. One proposal was to use pgoff to carry the key somehow, like
io_uring does, but that approach falls apart if we want to generalize it
so that __mmapable_kptr works the way Martin is proposing above, since
the key will not fit in the 64 bits of pgoff.

Maybe we need an office hours slot to discuss. This looks to be a big
topic. Not sure we can converge over email.
Just getting everyone on the same page will take a lot of email reading.
David Marchevsky Dec. 11, 2023, 5:31 p.m. UTC | #13
On 11/21/23 2:49 PM, Alexei Starovoitov wrote:
> On Tue, Nov 21, 2023 at 11:27 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/20/23 10:11 PM, David Marchevsky wrote:
>>>
>>>
>>> On 11/20/23 7:42 PM, Martin KaFai Lau wrote:
>>>> On 11/20/23 9:59 AM, Dave Marchevsky wrote:
>>>>> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
>>>>> index 173ec7f43ed1..114973f925ea 100644
>>>>> --- a/include/linux/bpf_local_storage.h
>>>>> +++ b/include/linux/bpf_local_storage.h
>>>>> @@ -69,7 +69,17 @@ struct bpf_local_storage_data {
>>>>>         * the number of cachelines accessed during the cache hit case.
>>>>>         */
>>>>>        struct bpf_local_storage_map __rcu *smap;
>>>>> -    u8 data[] __aligned(8);
>>>>> +    /* Need to duplicate smap's map_flags as smap may be gone when
>>>>> +     * it's time to free bpf_local_storage_data
>>>>> +     */
>>>>> +    u64 smap_map_flags;
>>>>> +    /* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
>>>>> +     * Otherwise the actual mapval data lives here
>>>>> +     */
>>>>> +    union {
>>>>> +        DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
>>>>> +        void *actual_data __aligned(8);
>>>>
>>>> The pages (that can be mmap'ed later) feel like a specific kind of kptr.
>>>>
>>>> Have you thought about allowing a kptr (pointing to some pages that can be mmap'ed later) to be stored as one of the members of the map's value as a kptr. bpf_local_storage_map is one of the maps that supports kptr.
>>>>
>>>> struct normal_and_mmap_value {
>>>>      int some_int;
>>>>      int __percpu_kptr *some_cnts;
>>>>
>>>>      struct bpf_mmap_page __kptr *some_stats;
>>>> };
>>>>
>>>> struct mmap_only_value {
>>>>      struct bpf_mmap_page __kptr *some_stats;
>>>> };
>>>>
>>>> [ ... ]
>>>>
>>>
>>> This is an intriguing idea. For conciseness I'll call this specific
>>> kind of kptr 'mmapable kptrs' for the rest of this message. Below is
>>> more of a brainstorming dump than a cohesive response, separate trains
>>> of thought are separated by two newlines.
>>
>> Thanks for bearing with me while some ideas could be crazy. I am trying to see
>> how this would look like for other local storage, sk and inode. Allocating a
>> page for each sk will not be nice for server with half a million sk(s). e.g.
>> half a million sk(s) sharing a few bandwidth policies or a few tuning
>> parameters. Creating something mmap'able to the user space and also sharable
>> among many sk(s) will be useful.
>>
>>>
>>>
>>> My initial thought upon seeing struct normal_and_mmap_value was to note
>>> that we currently don't support mmaping for map_value types with _any_
>>> special fields ('special' as determined by btf_parse_fields). But IIUC
>>> you're actually talking about exposing the some_stats pointee memory via
>>> mmap, not the containing struct with kptr fields. That is, for maps that
>>> support these kptrs, mmap()ing a map with value type struct
>>> normal_and_mmap_value would return the some_stats pointer value, and
>>> likely initialize the pointer similarly to BPF_LOCAL_STORAGE_GET_F_CREATE
>>> logic in this patch. We'd only be able to support one such mmapable kptr
>>> field per mapval type, but that isn't a dealbreaker.
>>>
>>> Some maps, like task_storage, would only support mmap() on a map_value
>>> with mmapable kptr field, as mmap()ing the mapval itself doesn't make
>>> sense or is unsafe. Seems like arraymap would do the opposite, only
>>
>> Changing direction a bit since arraymap is brought up. :)
>>
>> arraymap supports BPF_F_MMAPABLE. If the local storage map's value can store an
>> arraymap as kptr, the bpf prog should be able to access it as a map. More like
>> the current map-in-map setup. The arraymap can be used as regular map in the
>> user space also (like pinning). It may need some btf plumbing to tell the value
>> type of the arrayamp to the verifier.
>>
>> The syscall bpf_map_update_elem(task_storage_map_fd, &task_pidfd, &value, flags)
>> can be used where the value->array_mmap initialized as an arraymap_fd. This will
>> limit the arraymap kptr update only from the syscall side which seems to be your
>> usecase also? Allocating the arraymap from the bpf prog side needs some thoughts
>> and need a whitelist.
>>
>> The same goes for the syscall bpf_map_lookup_elem(task_storage_map_fd,
>> &task_pidfd, &value). The kernel can return a fd in value->array_mmap. May be we
>> can create a libbpf helper to free the fd(s) resources held in the looked-up
>> value by using the value's btf.
>>
>> The bpf_local_storage_map side probably does not need to support mmap() then.
> 
> Martin,
> that's an interesting idea!
> I kinda like it and I think it's worth exploring further.
> 
> I think the main quirk of the proposed mmap-of-task-local-storage
> is using 'current' task as an implicit 'key' in task local storage map.
> It fits here, but I'm not sure it addresses sched-ext use case.
> 
> Tejun, David,
> could you please chime in ?
> Do you think mmap(..., task_local_storage_map_fd, ...)
> that returns a page that belongs to current task only is enough ?
> 
> If not we need to think through how to mmap local storage of other
> tasks. One proposal was to use pgoff to carry the key somehow
> like io-uring does, but if we want to generalize that the pgoff approach
> falls apart if we want __mmapable_kptr to work like Martin is proposing above,
> since the key will not fit in 64-bit of pgoff.
> 
> Maybe we need an office hours slot to discuss. This looks to be a big
> topic. Not sure we can converge over email.
> Just getting everyone on the same page will take a lot of email reading.

Meta BPF folks were all in one place for reasons unrelated to this
thread, and we took the opportunity to have a design discussion about
this mmapable task_local storage implementation and the other
implementation ideas raised in this thread. I will summarize that
discussion below. We didn't arrive at a confident conclusion, though, so
there is plenty of time for others to chime in and for a proper office
hours discussion to happen if necessary. Below, anything attributed to a
specific person is paraphrased from my notes; there will certainly be
errors and misrememberings.



mmapable task_local storage design discussion 11/29

We first summarized approaches that were discussed in this thread:

1) Current implementation in this series

2) pseudo-map_in_map, kptr arraymap type:

  struct mmapable_data_map {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(map_flags, BPF_F_MMAPABLE);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, __u64);
  };

  struct my_mapval {
    int whatever;
    struct bpf_arraymap __kptr __arraymap_type(mmapable_data_map) *m;
    /* Need special logic to support getting / updating above field from userspace (as fd) */
  };

3) "mmapable page" kptr type, or "mmapable kptr" tag

  struct my_mapval {
    int whatever;
    struct bpf_mmapable_page *m;
  };
  
  struct my_mapval2 { /* Separate approach than my_mapval above */
    struct bpf_spin_lock l;
    int some_int;
    struct my_type __mmapable_kptr *some_stats;
  };

Points of consideration regardless of implementation approach:
  * The mmap() syscall's return address must be page-aligned. If we want to
    reduce or eliminate wasted memory by packing multiple mapvals onto a
    single page, we need to be able to return a non-page-aligned addr.
    Using a BPF syscall subcommand in lieu of, or in addition to, the
    mmap() syscall would be a way to support this.
    * Dave wants to avoid implementing packing at all, but having a
      reasonable path forward would be nice in case an actual usecase
      arises.
    * Andrii suggested an actual new mmap syscall that supports passing
      custom params, useful for any subsystem using mmap in
      nontraditional ways. This was initially a response to the "use
      offset to pass task id" discussion re: selecting a non-current task.

  * How orthogonal is Martin's "kptr to arraymap" suggestion to the
    general mmapable local_storage goal? Is there a world where we
    choose a different approach for this work, and then do "kptr to
    arraymap" independently later?

The above was mostly a summary of the existing discussion in this thread.
The rest of the discussion flowed from there.

Q&A:

- Do we need to be able to query other tasks' mapvals? (for David Vernet
  / Tejun Heo)

TJ had a usecase where this might've been necessary, but he rewrote it to
avoid the need.
David: Being able to do this in general, aside from TJ's specific case,
would be useful. David provided an example from the ghOSt project: central
scheduling. Another example: the Folly runtime framework farms out work to
worker threads and might want to tag them.

- Which usecases actually care about avoiding syscall overhead?

Alexei: Most services that would want to use this functionality to tag
their tasks don't need it, as they just set the value once.
Dave: Some tracing usecases (e.g. strobemeta) need it.

- Do we want to use mmap() in a current-task-only, limited way, or add a
  BPF subcommand or something more exotic?

TJ: What if a bpf subcommand returned an FD that can be mmap'd? The fd
identifies which task_local storage elem is mmap'd. Subcommand:
bpf_map_lookup_elem_fd(map *, u64 elem_id).
Alexei: Such a thing should return an fd to an arbitrary mapval, and should
support other common ops (open, close, etc.).
David: What's the problem with having an fd that only supports mmap for now?
TJ: 'Dangling' fds exist in some usecases already.

Discussion around the bpf_map_lookup_elem_fd idea continued for quite a
while. Folks liked that it avoids the "can only have one mmapable field"
issue from proposal (3) above, without making any implicit assumptions.
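
For illustration, the rough userspace flow being discussed would look
something like the following; everything here is hypothetical, since
neither the subcommand nor any wrapper for it exists today:

  /* hypothetical: lookup returns an fd identifying one storage elem */
  int elem_fd = bpf_map_lookup_elem_fd(task_storage_map_fd, elem_id);
  /* mmap() that elem fd instead of the map fd */
  void *val = mmap(NULL, value_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
                   elem_fd, 0);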

Alexei: Can we instead have a pointer to a userspace blob, similar to rseq,
to avoid wasting most of a page?

TJ: My instinct is to stick to something more generic; I would rather pay
the 4k.

Discussion around the userspace-pointer idea continued for a while as well,
ending in the conclusion that we should take a look at using
get_user_pages, perhaps wrapping such functionality in a 'guppable' kptr
type. Folks liked the 'guppable' idea as it mostly avoids the wasted-memory
issue by pushing the details to userspace, and it punts on working out a
path forward for the mmap interface, which the other implementation ideas
require.
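
To make the 'guppable' idea a bit more concrete, here is a very rough
kernel-side sketch of just the pinning step. It is purely illustrative:
the name and overall shape are exactly what needs prototyping.

  /* Purely illustrative: pin one page of a page-aligned userspace buffer
   * so kernel-side BPF code could use it as the backing store for a
   * mapval field.
   */
  static int guppable_pin(unsigned long uaddr, struct page **pagep)
  {
          int npinned;

          if (uaddr & ~PAGE_MASK)
                  return -EINVAL;

          npinned = pin_user_pages_fast(uaddr, 1,
                                        FOLL_WRITE | FOLL_LONGTERM, pagep);
          if (npinned != 1)
                  return npinned < 0 ? npinned : -EFAULT;
          return 0;
  }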

Action items based on convo, priority order:

  - Think more about / prototype 'guppable' kptr idea
  - If the above has issues, try bpf_map_lookup_elem_fd
  - If both above have issues, consider earlier approaches

I will start tackling these soon.
diff mbox series

Patch

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 173ec7f43ed1..114973f925ea 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -69,7 +69,17 @@  struct bpf_local_storage_data {
 	 * the number of cachelines accessed during the cache hit case.
 	 */
 	struct bpf_local_storage_map __rcu *smap;
-	u8 data[] __aligned(8);
+	/* Need to duplicate smap's map_flags as smap may be gone when
+	 * it's time to free bpf_local_storage_data
+	 */
+	u64 smap_map_flags;
+	/* If BPF_F_MMAPABLE, this is a void * to separately-alloc'd data
+	 * Otherwise the actual mapval data lives here
+	 */
+	union {
+		DECLARE_FLEX_ARRAY(u8, data) __aligned(8);
+		void *actual_data __aligned(8);
+	};
 };
 
 /* Linked to bpf_local_storage and bpf_local_storage_map */
@@ -124,6 +134,8 @@  static struct bpf_local_storage_cache name = {			\
 /* Helper functions for bpf_local_storage */
 int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
 
+void *sdata_mapval(struct bpf_local_storage_data *data);
+
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146824cc9689..9b3becbcc1a3 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -15,7 +15,8 @@ 
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
 
-#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK \
+	(BPF_F_NO_PREALLOC | BPF_F_CLONE | BPF_F_MMAPABLE)
 
 static struct bpf_local_storage_map_bucket *
 select_bucket(struct bpf_local_storage_map *smap,
@@ -24,6 +25,51 @@  select_bucket(struct bpf_local_storage_map *smap,
 	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
 }
 
+struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map);
+
+void *alloc_mmapable_selem_value(struct bpf_local_storage_map *smap)
+{
+	struct mem_cgroup *memcg, *old_memcg;
+	void *ptr;
+
+	memcg = bpf_map_get_memcg(&smap->map);
+	old_memcg = set_active_memcg(memcg);
+	ptr = bpf_map_area_mmapable_alloc(PAGE_ALIGN(smap->map.value_size),
+					  NUMA_NO_NODE);
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+
+	return ptr;
+}
+
+void *sdata_mapval(struct bpf_local_storage_data *data)
+{
+	if (data->smap_map_flags & BPF_F_MMAPABLE)
+		return data->actual_data;
+	return &data->data;
+}
+
+static size_t sdata_data_field_size(struct bpf_local_storage_map *smap,
+				    struct bpf_local_storage_data *data)
+{
+	if (smap->map.map_flags & BPF_F_MMAPABLE)
+		return sizeof(void *);
+	return (size_t)smap->map.value_size;
+}
+
+static u32 selem_bytes_used(struct bpf_local_storage_map *smap)
+{
+	if (smap->map.map_flags & BPF_F_MMAPABLE)
+		return smap->elem_size + PAGE_ALIGN(smap->map.value_size);
+	return smap->elem_size;
+}
+
+static bool can_update_existing_selem(struct bpf_local_storage_map *smap,
+				      u64 flags)
+{
+	return flags & BPF_F_LOCK || smap->map.map_flags & BPF_F_MMAPABLE;
+}
+
 static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
 {
 	struct bpf_map *map = &smap->map;
@@ -76,10 +122,19 @@  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 		void *value, bool charge_mem, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_elem *selem;
+	void *mmapable_value = NULL;
+	u32 selem_mem;
 
-	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+	selem_mem = selem_bytes_used(smap);
+	if (charge_mem && mem_charge(smap, owner, selem_mem))
 		return NULL;
 
+	if (smap->map.map_flags & BPF_F_MMAPABLE) {
+		mmapable_value = alloc_mmapable_selem_value(smap);
+		if (!mmapable_value)
+			goto err_out;
+	}
+
 	if (smap->bpf_ma) {
 		migrate_disable();
 		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
@@ -92,22 +147,28 @@  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 			 * only does bpf_mem_cache_free when there is
 			 * no other bpf prog is using the selem.
 			 */
-			memset(SDATA(selem)->data, 0, smap->map.value_size);
+			memset(SDATA(selem)->data, 0,
+			       sdata_data_field_size(smap, SDATA(selem)));
 	} else {
 		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
 					gfp_flags | __GFP_NOWARN);
 	}
 
-	if (selem) {
-		if (value)
-			copy_map_value(&smap->map, SDATA(selem)->data, value);
-		/* No need to call check_and_init_map_value as memory is zero init */
-		return selem;
-	}
-
+	if (!selem)
+		goto err_out;
+
+	selem->sdata.smap_map_flags = smap->map.map_flags;
+	if (smap->map.map_flags & BPF_F_MMAPABLE)
+		selem->sdata.actual_data = mmapable_value;
+	if (value)
+		copy_map_value(&smap->map, sdata_mapval(SDATA(selem)), value);
+	/* No need to call check_and_init_map_value as memory is zero init */
+	return selem;
+err_out:
+	if (mmapable_value)
+		bpf_map_area_free(mmapable_value);
 	if (charge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
-
+		mem_uncharge(smap, owner, selem_mem);
 	return NULL;
 }
 
@@ -184,6 +245,21 @@  static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 	}
 }
 
+static void __bpf_selem_kfree(struct bpf_local_storage_elem *selem)
+{
+	if (selem->sdata.smap_map_flags & BPF_F_MMAPABLE)
+		bpf_map_area_free(selem->sdata.actual_data);
+	kfree(selem);
+}
+
+static void __bpf_selem_kfree_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	__bpf_selem_kfree(selem);
+}
+
 /* rcu tasks trace callback for bpf_ma == false */
 static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 {
@@ -191,9 +267,9 @@  static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
 	if (rcu_trace_implies_rcu_gp())
-		kfree(selem);
+		__bpf_selem_kfree(selem);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(rcu, __bpf_selem_kfree_rcu);
 }
 
 /* Handle bpf_ma == false */
@@ -201,7 +277,7 @@  static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
 			     bool vanilla_rcu)
 {
 	if (vanilla_rcu)
-		kfree_rcu(selem, rcu);
+		call_rcu(&selem->rcu, __bpf_selem_kfree_rcu);
 	else
 		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
 }
@@ -209,8 +285,12 @@  static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
 static void bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	smap = selem->sdata.smap;
+	if (selem->sdata.smap_map_flags & BPF_F_MMAPABLE)
+		bpf_map_area_free(selem->sdata.actual_data);
 	bpf_mem_cache_raw_free(selem);
 }
 
@@ -241,6 +321,8 @@  void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		 * immediately.
 		 */
 		migrate_disable();
+		if (smap->map.map_flags & BPF_F_MMAPABLE)
+			bpf_map_area_free(selem->sdata.actual_data);
 		bpf_mem_cache_free(&smap->selem_ma, selem);
 		migrate_enable();
 	}
@@ -266,7 +348,7 @@  static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	 * from local_storage.
 	 */
 	if (uncharge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
+		mem_uncharge(smap, owner, selem_bytes_used(smap));
 
 	free_local_storage = hlist_is_singular_node(&selem->snode,
 						    &local_storage->list);
@@ -583,14 +665,14 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
 		if (err) {
 			bpf_selem_free(selem, smap, true);
-			mem_uncharge(smap, owner, smap->elem_size);
+			mem_uncharge(smap, owner, selem_bytes_used(smap));
 			return ERR_PTR(err);
 		}
 
 		return SDATA(selem);
 	}
 
-	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
+	if (can_update_existing_selem(smap, map_flags) && !(map_flags & BPF_NOEXIST)) {
 		/* Hoping to find an old_sdata to do inline update
 		 * such that it can avoid taking the local_storage->lock
 		 * and changing the lists.
@@ -601,8 +683,13 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
-			copy_map_value_locked(&smap->map, old_sdata->data,
-					      value, false);
+			if (map_flags & BPF_F_LOCK)
+				copy_map_value_locked(&smap->map,
+						      sdata_mapval(old_sdata),
+						      value, false);
+			else
+				copy_map_value(&smap->map, sdata_mapval(old_sdata),
+					       value);
 			return old_sdata;
 		}
 	}
@@ -633,8 +720,8 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		goto unlock;
 
 	if (old_sdata && (map_flags & BPF_F_LOCK)) {
-		copy_map_value_locked(&smap->map, old_sdata->data, value,
-				      false);
+		copy_map_value_locked(&smap->map, sdata_mapval(old_sdata),
+				      value, false);
 		selem = SELEM(old_sdata);
 		goto unlock;
 	}
@@ -656,7 +743,7 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	if (alloc_selem) {
-		mem_uncharge(smap, owner, smap->elem_size);
+		mem_uncharge(smap, owner, selem_bytes_used(smap));
 		bpf_selem_free(alloc_selem, smap, true);
 	}
 	return err ? ERR_PTR(err) : SDATA(selem);
@@ -707,6 +794,10 @@  int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
 		return -E2BIG;
 
+	if ((attr->map_flags & BPF_F_MMAPABLE) &&
+	    attr->map_type != BPF_MAP_TYPE_TASK_STORAGE)
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -820,8 +911,12 @@  bpf_local_storage_map_alloc(union bpf_attr *attr,
 		raw_spin_lock_init(&smap->buckets[i].lock);
 	}
 
-	smap->elem_size = offsetof(struct bpf_local_storage_elem,
-				   sdata.data[attr->value_size]);
+	if (attr->map_flags & BPF_F_MMAPABLE)
+		smap->elem_size = offsetof(struct bpf_local_storage_elem,
+					   sdata.data[sizeof(void *)]);
+	else
+		smap->elem_size = offsetof(struct bpf_local_storage_elem,
+					   sdata.data[attr->value_size]);
 
 	smap->bpf_ma = bpf_ma;
 	if (bpf_ma) {
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index adf6dfe0ba68..ce75c8d8b2ce 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -90,6 +90,7 @@  void bpf_task_storage_free(struct task_struct *task)
 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_local_storage_data *sdata;
+	struct bpf_local_storage_map *smap;
 	struct task_struct *task;
 	unsigned int f_flags;
 	struct pid *pid;
@@ -114,7 +115,8 @@  static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
 	sdata = task_storage_lookup(task, map, true);
 	bpf_task_storage_unlock();
 	put_pid(pid);
-	return sdata ? sdata->data : NULL;
+	smap = (struct bpf_local_storage_map *)map;
+	return sdata ? sdata_mapval(sdata) : NULL;
 out:
 	put_pid(pid);
 	return ERR_PTR(err);
@@ -209,18 +211,19 @@  static void *__bpf_task_storage_get(struct bpf_map *map,
 				    u64 flags, gfp_t gfp_flags, bool nobusy)
 {
 	struct bpf_local_storage_data *sdata;
+	struct bpf_local_storage_map *smap;
 
+	smap = (struct bpf_local_storage_map *)map;
 	sdata = task_storage_lookup(task, map, nobusy);
 	if (sdata)
-		return sdata->data;
+		return sdata_mapval(sdata);
 
 	/* only allocate new storage, when the task is refcounted */
 	if (refcount_read(&task->usage) &&
 	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
-		sdata = bpf_local_storage_update(
-			task, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST, gfp_flags);
-		return IS_ERR(sdata) ? NULL : sdata->data;
+		sdata = bpf_local_storage_update(task, smap, value,
+						 BPF_NOEXIST, gfp_flags);
+		return IS_ERR(sdata) ? NULL : sdata_mapval(sdata);
 	}
 
 	return NULL;
@@ -317,6 +320,25 @@  static void task_storage_map_free(struct bpf_map *map)
 	bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
 }
 
+static int task_storage_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	void *data;
+
+	if (!(map->map_flags & BPF_F_MMAPABLE) || vma->vm_pgoff ||
+	    (vma->vm_end - vma->vm_start) < map->value_size)
+		return -EINVAL;
+
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
+	bpf_task_storage_lock();
+	data = __bpf_task_storage_get(map, current, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE,
+				      0, true);
+	bpf_task_storage_unlock();
+	if (!data)
+		return -EINVAL;
+
+	return remap_vmalloc_range(vma, data, vma->vm_pgoff);
+}
+
 BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
 const struct bpf_map_ops task_storage_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
@@ -331,6 +353,7 @@  const struct bpf_map_ops task_storage_map_ops = {
 	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = task_storage_ptr,
+	.map_mmap = task_storage_map_mmap,
 };
 
 const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5e43ddd1b83f..d7c05a509870 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -404,7 +404,7 @@  static void bpf_map_release_memcg(struct bpf_map *map)
 		obj_cgroup_put(map->objcg);
 }
 
-static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
 {
 	if (map->objcg)
 		return get_mem_cgroup_from_objcg(map->objcg);