@@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
@@ -928,6 +928,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
BPF_MAP_TYPE_BLOOM_FILTER,
+ BPF_MAP_TYPE_USER_RINGBUF,
};
/* Note that tracing related programs such as
@@ -38,10 +38,27 @@ struct bpf_ringbuf {
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
@@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + RINGBUF_PGOFF);
}
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ } else {
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
unsigned long cons_pos, prod_pos;
@@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
+ .map_mmap = ringbuf_map_mmap_kern,
.map_poll = ringbuf_map_poll,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
@@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_btf_id = &ringbuf_map_btf_ids[0],
};
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
* calculate offset from record metadata to ring buffer in pages, rounded
* down. This page offset is stored as part of record metadata and allows to
@@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_ringbuf_discard_dynptr)
goto error;
break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ goto error;
case BPF_MAP_TYPE_STACK_TRACE:
if (func_id != BPF_FUNC_get_stackid)
goto error;
@@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
@@ -55,7 +55,7 @@ MAP COMMANDS
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
-| | **task_storage** | **bloom_filter** }
+| | **task_storage** | **bloom_filter** | **user_ringbuf** }
DESCRIPTION
===========
@@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv)
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
" queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
- " task_storage | bloom_filter }\n"
+ " task_storage | bloom_filter | user_ringbuf }\n"
" " HELP_SPEC_OPTIONS " |\n"
" {-f|--bpffs} | {-n|--nomount} }\n"
"",
@@ -928,6 +928,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
BPF_MAP_TYPE_BLOOM_FILTER,
+ BPF_MAP_TYPE_USER_RINGBUF,
};
/* Note that tracing related programs such as
@@ -163,6 +163,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage",
[BPF_MAP_TYPE_TASK_STORAGE] = "task_storage",
[BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter",
+ [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf",
};
static const char * const prog_type_name[] = {