diff mbox series

[bpf-next,v1,1/3] bpf: implement relay map basis

Message ID 20231227100130.84501-2-lulie@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: introduce BPF_MAP_TYPE_RELAY | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 fail Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-6 fail Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-30 fail Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 fail Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 fail Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2917 this patch: 2917
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 1278 this patch: 1278
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2998 this patch: 2998
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: braces {} should be used on all arms of this statement WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Philo Lu Dec. 27, 2023, 10:01 a.m. UTC
BPF_MAP_TYPE_RELAY is implemented based on relay interface, which
creates per-cpu buffer to transfer data. Each buffer is essentially a
list of fix-sized sub-buffers, and is exposed to user space as files in
debugfs. All of these compose a "relay chanel", which is the kernel of a
relay map.

Currently, attr->max_entries is used as subbuf size and attr->map_extra
is used as subbuf num, and the default value of subbuf num is 8. A new
map flag named BPF_F_OVERWRITE is also introduced to set overwrite mode
of relay map.

A relay map is represented as a directory in debugfs, and the per-cpu
buffers are files in this directory. Users can get the data through read
or mmap.

To avoid directory name conflicting, relay_map_update_elem is provided
to set the name. In fact, we create the relay channel and buffers with
BPF_MAP_CREATE, and create relay files and bind them with the channel
using BPF_MAP_UPDATE_ELEM. Generally, map_update_elem should be called
once and only once.

Here is an example:
```
struct {
__uint(type, BPF_MAP_TYPE_RELAY);
__uint(max_entries, 4096);
} my_relay SEC(".maps");
...
char dir_name[] = "relay_test";
bpf_map_update_elem(map_fd, NULL, dir_name, 0);
```

Assume there are 2 cpus, we will have 2 files:
```
/sys/kerenl/debug/relay_test/my_relay0
/sys/kerenl/debug/relay_test/my_relay1
```
Each represents a per-cpu buffer with size 8 * 4096 B (there are 8
subbufs by default, each with size 4096B).

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
---
 include/linux/bpf_types.h |   3 +
 include/uapi/linux/bpf.h  |   7 ++
 kernel/bpf/Makefile       |   3 +
 kernel/bpf/relaymap.c     | 199 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c      |   2 +
 5 files changed, 214 insertions(+)
 create mode 100644 kernel/bpf/relaymap.c

Comments

Yafang Shao Dec. 27, 2023, 1:41 p.m. UTC | #1
On Wed, Dec 27, 2023 at 6:01 PM Philo Lu <lulie@linux.alibaba.com> wrote:
>
> BPF_MAP_TYPE_RELAY is implemented based on relay interface, which
> creates per-cpu buffer to transfer data. Each buffer is essentially a
> list of fix-sized sub-buffers, and is exposed to user space as files in
> debugfs. All of these compose a "relay chanel", which is the kernel of a
> relay map.
>
> Currently, attr->max_entries is used as subbuf size and attr->map_extra
> is used as subbuf num, and the default value of subbuf num is 8. A new
> map flag named BPF_F_OVERWRITE is also introduced to set overwrite mode
> of relay map.
>
> A relay map is represented as a directory in debugfs, and the per-cpu
> buffers are files in this directory. Users can get the data through read
> or mmap.
>
> To avoid directory name conflicting, relay_map_update_elem is provided
> to set the name. In fact, we create the relay channel and buffers with
> BPF_MAP_CREATE, and create relay files and bind them with the channel
> using BPF_MAP_UPDATE_ELEM. Generally, map_update_elem should be called
> once and only once.
>
> Here is an example:
> ```
> struct {
> __uint(type, BPF_MAP_TYPE_RELAY);
> __uint(max_entries, 4096);
> } my_relay SEC(".maps");
> ...
> char dir_name[] = "relay_test";
> bpf_map_update_elem(map_fd, NULL, dir_name, 0);
> ```
>
> Assume there are 2 cpus, we will have 2 files:
> ```
> /sys/kerenl/debug/relay_test/my_relay0
> /sys/kerenl/debug/relay_test/my_relay1
> ```

Is there a specific reason why relayfs necessitates creating an
individual file for each CPU? Alternatively, are there any approaches
available to collectively expose all CPUs using a single file?

When dealing with a large number of available CPUs, such as 236,
reading the buffer using the command `cat
/sys/kernel/debug/relay_test/my_relay{0...236} | awk '{}' ` can become
a bit cumbersome and tedious.

> Each represents a per-cpu buffer with size 8 * 4096 B (there are 8
> subbufs by default, each with size 4096B).
>
> Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
> ---
>  include/linux/bpf_types.h |   3 +
>  include/uapi/linux/bpf.h  |   7 ++
>  kernel/bpf/Makefile       |   3 +
>  kernel/bpf/relaymap.c     | 199 ++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c      |   2 +
>  5 files changed, 214 insertions(+)
>  create mode 100644 kernel/bpf/relaymap.c
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index fc0d6f32c687..c122d7b494c5 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -132,6 +132,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
> +#ifdef CONFIG_RELAY
> +BPF_MAP_TYPE(BPF_MAP_TYPE_RELAY, relay_map_ops)
> +#endif
>
>  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
>  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 754e68ca8744..143b75676bd3 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -951,6 +951,7 @@ enum bpf_map_type {
>         BPF_MAP_TYPE_BLOOM_FILTER,
>         BPF_MAP_TYPE_USER_RINGBUF,
>         BPF_MAP_TYPE_CGRP_STORAGE,
> +       BPF_MAP_TYPE_RELAY,
>  };
>
>  /* Note that tracing related programs such as
> @@ -1330,6 +1331,9 @@ enum {
>
>  /* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */
>         BPF_F_PATH_FD           = (1U << 14),
> +
> +/* Enable overwrite for relay map */
> +       BPF_F_OVERWRITE         = (1U << 15),
>  };
>
>  /* Flags for BPF_PROG_QUERY. */
> @@ -1401,6 +1405,9 @@ union bpf_attr {
>                  * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
>                  * number of hash functions (if 0, the bloom filter will default
>                  * to using 5 hash functions).
> +                *
> +                * BPF_MAP_TYPE_RELAY - the lowest 32 bits indicate the number of
> +                * relay subbufs (if 0, the number will be set to 8 by default).
>                  */
>                 __u64   map_extra;
>         };
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index f526b7573e97..45b35bb0e572 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -10,6 +10,9 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> +ifeq ($(CONFIG_RELAY),y)
> +obj-$(CONFIG_BPF_SYSCALL) += relaymap.o
> +endif
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
>  obj-${CONFIG_BPF_LSM}    += bpf_inode_storage.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
> diff --git a/kernel/bpf/relaymap.c b/kernel/bpf/relaymap.c
> new file mode 100644
> index 000000000000..02b33a8e6b6c
> --- /dev/null
> +++ b/kernel/bpf/relaymap.c
> @@ -0,0 +1,199 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cpumask.h>
> +#include <linux/debugfs.h>
> +#include <linux/filter.h>
> +#include <linux/relay.h>
> +#include <linux/slab.h>
> +#include <linux/bpf.h>
> +#include <linux/err.h>
> +
> +#define RELAY_CREATE_FLAG_MASK (BPF_F_OVERWRITE)
> +
> +struct bpf_relay_map {
> +       struct bpf_map map;
> +       struct rchan *relay_chan;
> +       struct rchan_callbacks relay_cb;
> +};
> +
> +static struct dentry *create_buf_file_handler(const char *filename,
> +                                      struct dentry *parent, umode_t mode,
> +                                      struct rchan_buf *buf, int *is_global)
> +{
> +       /* Because we do relay_late_setup_files(), create_buf_file(NULL, NULL, ...)
> +        * will be called by relay_open.
> +        */
> +       if (!filename)
> +               return NULL;
> +
> +       return debugfs_create_file(filename, mode, parent, buf,
> +                                  &relay_file_operations);
> +}
> +
> +static int remove_buf_file_handler(struct dentry *dentry)
> +{
> +       debugfs_remove(dentry);
> +       return 0;
> +}
> +
> +/* For non-overwrite, use default subbuf_start cb */
> +static int subbuf_start_overwrite(struct rchan_buf *buf, void *subbuf,
> +                                      void *prev_subbuf, size_t prev_padding)
> +{
> +       return 1;
> +}
> +
> +/* bpf_attr is used as follows:
> + * - key size: must be 0
> + * - value size: value will be used as directory name by map_update_elem
> + *   (to create relay files). If passed as 0, it will be set to NAME_MAX as
> + *   default
> + *
> + * - max_entries: subbuf size
> + * - map_extra: subbuf num, default as 8
> + *
> + * When alloc, we do not set up relay files considering dir_name conflicts.
> + * Instead we use relay_late_setup_files() in map_update_elem(), and thus the
> + * value is used as dir_name, and map->name is used as base_filename.
> + */
> +static struct bpf_map *relay_map_alloc(union bpf_attr *attr)
> +{
> +       struct bpf_relay_map *rmap;
> +
> +       if (unlikely(attr->map_flags & ~RELAY_CREATE_FLAG_MASK))
> +               return ERR_PTR(-EINVAL);
> +
> +       /* key size must be 0 in relay map */
> +       if (unlikely(attr->key_size))
> +               return ERR_PTR(-EINVAL);
> +
> +       /* value size is used as directory name length */
> +       if (unlikely(attr->value_size > NAME_MAX)) {
> +               pr_warn("value_size should be no more than %d\n", NAME_MAX);
> +               return ERR_PTR(-EINVAL);
> +       } else if (attr->value_size == 0)
> +               attr->value_size = NAME_MAX;
> +
> +       /* set default subbuf num */
> +       if (unlikely(attr->map_extra & ~UINT_MAX))
> +               return ERR_PTR(-EINVAL);
> +       attr->map_extra = attr->map_extra & UINT_MAX;
> +       if (!attr->map_extra)
> +               attr->map_extra = 8;
> +
> +       if (strlen(attr->map_name) == 0)
> +               return ERR_PTR(-EINVAL);
> +
> +       rmap = bpf_map_area_alloc(sizeof(*rmap), NUMA_NO_NODE);
> +       if (!rmap)
> +               return ERR_PTR(-ENOMEM);
> +
> +       bpf_map_init_from_attr(&rmap->map, attr);
> +
> +       rmap->relay_cb.create_buf_file = create_buf_file_handler;
> +       rmap->relay_cb.remove_buf_file = remove_buf_file_handler;
> +       if (attr->map_flags & BPF_F_OVERWRITE)
> +               rmap->relay_cb.subbuf_start = subbuf_start_overwrite;
> +
> +       rmap->relay_chan = relay_open(NULL, NULL,
> +                               attr->max_entries, attr->map_extra,
> +                               &rmap->relay_cb, NULL);
> +       if (!rmap->relay_chan) {
> +               bpf_map_area_free(rmap);
> +               return ERR_PTR(-EINVAL);
> +       }
> +
> +       return &rmap->map;
> +}
> +
> +static void relay_map_free(struct bpf_map *map)
> +{
> +       struct bpf_relay_map *rmap;
> +       struct dentry *parent;
> +
> +       rmap = container_of(map, struct bpf_relay_map, map);
> +
> +       parent = rmap->relay_chan->parent;
> +       relay_close(rmap->relay_chan);
> +       /* relay_chan->parent should be removed mannually if exists. */
> +       debugfs_remove_recursive(parent);
> +       bpf_map_area_free(rmap);
> +}
> +
> +static void *relay_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> +       return ERR_PTR(-EOPNOTSUPP);
> +}
> +
> +static long relay_map_update_elem(struct bpf_map *map, void *key, void *value,
> +                                  u64 flags)
> +{
> +       struct bpf_relay_map *rmap;
> +       struct dentry *parent;
> +       int err;
> +
> +       if (unlikely(flags))
> +               return -EINVAL;
> +
> +       if (unlikely(key))
> +               return -EINVAL;
> +
> +       /* If the directory already exists, debugfs_create_dir will fail. It could
> +        * have been created by map_update_elem before, or another system that uses
> +        * debugfs.
> +        *
> +        * Note that the directory name passed as value should not be longer than
> +        * map->value_size, including the '\0' at the end.
> +        */
> +       ((char *)value)[map->value_size - 1] = '\0';
> +       parent = debugfs_create_dir(value, NULL);
> +       if (IS_ERR_OR_NULL(parent))
> +               return PTR_ERR(parent);
> +
> +       /* We don't need a lock here, because the relay channel is protected in
> +        * relay_late_setup_files() with a mutex.
> +        */
> +       rmap = container_of(map, struct bpf_relay_map, map);
> +       err = relay_late_setup_files(rmap->relay_chan, map->name, parent);
> +       if (err) {
> +               debugfs_remove_recursive(parent);
> +               return err;
> +       }
> +
> +       return 0;
> +}
> +
> +static long relay_map_delete_elem(struct bpf_map *map, void *key)
> +{
> +       return -EOPNOTSUPP;
> +}
> +
> +static int relay_map_get_next_key(struct bpf_map *map, void *key,
> +                                   void *next_key)
> +{
> +       return -EOPNOTSUPP;
> +}
> +
> +static u64 relay_map_mem_usage(const struct bpf_map *map)
> +{
> +       struct bpf_relay_map *rmap;
> +       u64 usage = sizeof(struct bpf_relay_map);
> +
> +       rmap = container_of(map, struct bpf_relay_map, map);
> +       usage += sizeof(struct rchan);
> +       usage += (sizeof(struct rchan_buf) + rmap->relay_chan->alloc_size)
> +                        * num_online_cpus();
> +       return usage;
> +}
> +
> +BTF_ID_LIST_SINGLE(relay_map_btf_ids, struct, bpf_relay_map)
> +const struct bpf_map_ops relay_map_ops = {
> +       .map_meta_equal = bpf_map_meta_equal,
> +       .map_alloc = relay_map_alloc,
> +       .map_free = relay_map_free,
> +       .map_lookup_elem = relay_map_lookup_elem,
> +       .map_update_elem = relay_map_update_elem,
> +       .map_delete_elem = relay_map_delete_elem,
> +       .map_get_next_key = relay_map_get_next_key,
> +       .map_mem_usage = relay_map_mem_usage,
> +       .map_btf_id = &relay_map_btf_ids[0],
> +};
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 1bf9805ee185..d6b7949e29c7 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1147,6 +1147,7 @@ static int map_create(union bpf_attr *attr)
>         }
>
>         if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
> +           attr->map_type != BPF_MAP_TYPE_RELAY &&
>             attr->map_extra != 0)
>                 return -EINVAL;
>
> @@ -1202,6 +1203,7 @@ static int map_create(union bpf_attr *attr)
>         case BPF_MAP_TYPE_USER_RINGBUF:
>         case BPF_MAP_TYPE_CGROUP_STORAGE:
>         case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
> +       case BPF_MAP_TYPE_RELAY:
>                 /* unprivileged */
>                 break;
>         case BPF_MAP_TYPE_SK_STORAGE:
> --
> 2.32.0.3.g01195cf9f
>
diff mbox series

Patch

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..c122d7b494c5 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -132,6 +132,9 @@  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
+#ifdef CONFIG_RELAY
+BPF_MAP_TYPE(BPF_MAP_TYPE_RELAY, relay_map_ops)
+#endif
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 754e68ca8744..143b75676bd3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -951,6 +951,7 @@  enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	BPF_MAP_TYPE_RELAY,
 };
 
 /* Note that tracing related programs such as
@@ -1330,6 +1331,9 @@  enum {
 
 /* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */
 	BPF_F_PATH_FD		= (1U << 14),
+
+/* Enable overwrite for relay map */
+	BPF_F_OVERWRITE		= (1U << 15),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1401,6 +1405,9 @@  union bpf_attr {
 		 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
 		 * number of hash functions (if 0, the bloom filter will default
 		 * to using 5 hash functions).
+		 *
+		 * BPF_MAP_TYPE_RELAY - the lowest 32 bits indicate the number of
+		 * relay subbufs (if 0, the number will be set to 8 by default).
 		 */
 		__u64	map_extra;
 	};
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..45b35bb0e572 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -10,6 +10,9 @@  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+ifeq ($(CONFIG_RELAY),y)
+obj-$(CONFIG_BPF_SYSCALL) += relaymap.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
diff --git a/kernel/bpf/relaymap.c b/kernel/bpf/relaymap.c
new file mode 100644
index 000000000000..02b33a8e6b6c
--- /dev/null
+++ b/kernel/bpf/relaymap.c
@@ -0,0 +1,199 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/filter.h>
+#include <linux/relay.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+
+#define RELAY_CREATE_FLAG_MASK (BPF_F_OVERWRITE)
+
+struct bpf_relay_map {
+	struct bpf_map map;
+	struct rchan *relay_chan;
+	struct rchan_callbacks relay_cb;
+};
+
+static struct dentry *create_buf_file_handler(const char *filename,
+				       struct dentry *parent, umode_t mode,
+				       struct rchan_buf *buf, int *is_global)
+{
+	/* Because we do relay_late_setup_files(), create_buf_file(NULL, NULL, ...)
+	 * will be called by relay_open.
+	 */
+	if (!filename)
+		return NULL;
+
+	return debugfs_create_file(filename, mode, parent, buf,
+				   &relay_file_operations);
+}
+
+static int remove_buf_file_handler(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+/* For non-overwrite, use default subbuf_start cb */
+static int subbuf_start_overwrite(struct rchan_buf *buf, void *subbuf,
+				       void *prev_subbuf, size_t prev_padding)
+{
+	return 1;
+}
+
+/* bpf_attr is used as follows:
+ * - key size: must be 0
+ * - value size: value will be used as directory name by map_update_elem
+ *   (to create relay files). If passed as 0, it will be set to NAME_MAX as
+ *   default
+ *
+ * - max_entries: subbuf size
+ * - map_extra: subbuf num, default as 8
+ *
+ * When alloc, we do not set up relay files considering dir_name conflicts.
+ * Instead we use relay_late_setup_files() in map_update_elem(), and thus the
+ * value is used as dir_name, and map->name is used as base_filename.
+ */
+static struct bpf_map *relay_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_relay_map *rmap;
+
+	if (unlikely(attr->map_flags & ~RELAY_CREATE_FLAG_MASK))
+		return ERR_PTR(-EINVAL);
+
+	/* key size must be 0 in relay map */
+	if (unlikely(attr->key_size))
+		return ERR_PTR(-EINVAL);
+
+	/* value size is used as directory name length */
+	if (unlikely(attr->value_size > NAME_MAX)) {
+		pr_warn("value_size should be no more than %d\n", NAME_MAX);
+		return ERR_PTR(-EINVAL);
+	} else if (attr->value_size == 0)
+		attr->value_size = NAME_MAX;
+
+	/* set default subbuf num */
+	if (unlikely(attr->map_extra & ~UINT_MAX))
+		return ERR_PTR(-EINVAL);
+	attr->map_extra = attr->map_extra & UINT_MAX;
+	if (!attr->map_extra)
+		attr->map_extra = 8;
+
+	if (strlen(attr->map_name) == 0)
+		return ERR_PTR(-EINVAL);
+
+	rmap = bpf_map_area_alloc(sizeof(*rmap), NUMA_NO_NODE);
+	if (!rmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&rmap->map, attr);
+
+	rmap->relay_cb.create_buf_file = create_buf_file_handler;
+	rmap->relay_cb.remove_buf_file = remove_buf_file_handler;
+	if (attr->map_flags & BPF_F_OVERWRITE)
+		rmap->relay_cb.subbuf_start = subbuf_start_overwrite;
+
+	rmap->relay_chan = relay_open(NULL, NULL,
+				attr->max_entries, attr->map_extra,
+				&rmap->relay_cb, NULL);
+	if (!rmap->relay_chan) {
+		bpf_map_area_free(rmap);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return &rmap->map;
+}
+
+static void relay_map_free(struct bpf_map *map)
+{
+	struct bpf_relay_map *rmap;
+	struct dentry *parent;
+
+	rmap = container_of(map, struct bpf_relay_map, map);
+
+	parent = rmap->relay_chan->parent;
+	relay_close(rmap->relay_chan);
+	/* relay_chan->parent should be removed mannually if exists. */
+	debugfs_remove_recursive(parent);
+	bpf_map_area_free(rmap);
+}
+
+static void *relay_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static long relay_map_update_elem(struct bpf_map *map, void *key, void *value,
+				   u64 flags)
+{
+	struct bpf_relay_map *rmap;
+	struct dentry *parent;
+	int err;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	if (unlikely(key))
+		return -EINVAL;
+
+	/* If the directory already exists, debugfs_create_dir will fail. It could
+	 * have been created by map_update_elem before, or another system that uses
+	 * debugfs.
+	 *
+	 * Note that the directory name passed as value should not be longer than
+	 * map->value_size, including the '\0' at the end.
+	 */
+	((char *)value)[map->value_size - 1] = '\0';
+	parent = debugfs_create_dir(value, NULL);
+	if (IS_ERR_OR_NULL(parent))
+		return PTR_ERR(parent);
+
+	/* We don't need a lock here, because the relay channel is protected in
+	 * relay_late_setup_files() with a mutex.
+	 */
+	rmap = container_of(map, struct bpf_relay_map, map);
+	err = relay_late_setup_files(rmap->relay_chan, map->name, parent);
+	if (err) {
+		debugfs_remove_recursive(parent);
+		return err;
+	}
+
+	return 0;
+}
+
+static long relay_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EOPNOTSUPP;
+}
+
+static int relay_map_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	return -EOPNOTSUPP;
+}
+
+static u64 relay_map_mem_usage(const struct bpf_map *map)
+{
+	struct bpf_relay_map *rmap;
+	u64 usage = sizeof(struct bpf_relay_map);
+
+	rmap = container_of(map, struct bpf_relay_map, map);
+	usage += sizeof(struct rchan);
+	usage += (sizeof(struct rchan_buf) + rmap->relay_chan->alloc_size)
+			 * num_online_cpus();
+	return usage;
+}
+
+BTF_ID_LIST_SINGLE(relay_map_btf_ids, struct, bpf_relay_map)
+const struct bpf_map_ops relay_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc = relay_map_alloc,
+	.map_free = relay_map_free,
+	.map_lookup_elem = relay_map_lookup_elem,
+	.map_update_elem = relay_map_update_elem,
+	.map_delete_elem = relay_map_delete_elem,
+	.map_get_next_key = relay_map_get_next_key,
+	.map_mem_usage = relay_map_mem_usage,
+	.map_btf_id = &relay_map_btf_ids[0],
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1bf9805ee185..d6b7949e29c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1147,6 +1147,7 @@  static int map_create(union bpf_attr *attr)
 	}
 
 	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+	    attr->map_type != BPF_MAP_TYPE_RELAY &&
 	    attr->map_extra != 0)
 		return -EINVAL;
 
@@ -1202,6 +1203,7 @@  static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_USER_RINGBUF:
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
 	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+	case BPF_MAP_TYPE_RELAY:
 		/* unprivileged */
 		break;
 	case BPF_MAP_TYPE_SK_STORAGE: