diff mbox series

[v3,bpf-next,2/3] bpf: Introduce task_vma open-coded iterator kfuncs

Message ID 20230822050558.2937659-3-davemarchevsky@fb.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Open-coded task_vma iter | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-30 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-PR fail PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 3084 this patch: 3090
netdev/cc_maintainers warning 10 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com shuah@kernel.org song@kernel.org kuifeng@meta.com mykolal@fb.com linux-kselftest@vger.kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang fail Errors and warnings before: 1532 this patch: 1536
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 3116 this patch: 3122
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: Prefer __aligned(8) over __attribute__((aligned(8))) WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 106 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc

Commit Message

Dave Marchevsky Aug. 22, 2023, 5:05 a.m. UTC
This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_task_vma in open-coded
iterator style. BPF programs can use these kfuncs directly or through
bpf_for_each macro for natural-looking iteration of all task vmas.

The implementation borrows heavily from bpf_find_vma helper's locking -
differing only in that it holds the mmap_read lock for all iterations
while the helper only executes its provided callback on a maximum of 1
vma. Aside from locking, struct vma_iterator and vma_next do all the
heavy lifting.

The newly-added struct bpf_iter_task_vma has a name collision with a
selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
file is renamed in order to avoid the collision.

A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
only field in struct bpf_iter_task_vma. This is because the inner data
struct contains a struct vma_iterator (not ptr), whose size is likely to
change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
such a change would require change in opaque bpf_iter_task_vma struct's
size. So better to allocate vma_iterator using BPF allocator, and since
that alloc must already succeed, might as well allocate all iter fields,
thereby freezing struct bpf_iter_task_vma size.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Cc: Nathan Slingerland <slinger@meta.com>
---
 include/uapi/linux/bpf.h                      |  4 +
 kernel/bpf/helpers.c                          |  3 +
 kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
 tools/include/uapi/linux/bpf.h                |  4 +
 tools/lib/bpf/bpf_helpers.h                   |  8 ++
 .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
 ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
 7 files changed, 116 insertions(+), 13 deletions(-)
 rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)

Comments

Yonghong Song Aug. 22, 2023, 5:42 p.m. UTC | #1
On 8/21/23 10:05 PM, Dave Marchevsky wrote:
> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task_vma in open-coded
> iterator style. BPF programs can use these kfuncs directly or through
> bpf_for_each macro for natural-looking iteration of all task vmas.
> 
> The implementation borrows heavily from bpf_find_vma helper's locking -
> differing only in that it holds the mmap_read lock for all iterations
> while the helper only executes its provided callback on a maximum of 1
> vma. Aside from locking, struct vma_iterator and vma_next do all the
> heavy lifting.
> 
> The newly-added struct bpf_iter_task_vma has a name collision with a
> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> file is renamed in order to avoid the collision.
> 
> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> only field in struct bpf_iter_task_vma. This is because the inner data
> struct contains a struct vma_iterator (not ptr), whose size is likely to
> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> such a change would require change in opaque bpf_iter_task_vma struct's
> size. So better to allocate vma_iterator using BPF allocator, and since
> that alloc must already succeed, might as well allocate all iter fields,
> thereby freezing struct bpf_iter_task_vma size.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> Cc: Nathan Slingerland <slinger@meta.com>
> ---
>   include/uapi/linux/bpf.h                      |  4 +
>   kernel/bpf/helpers.c                          |  3 +
>   kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>   tools/include/uapi/linux/bpf.h                |  4 +
>   tools/lib/bpf/bpf_helpers.h                   |  8 ++
>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>   7 files changed, 116 insertions(+), 13 deletions(-)
>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8790b3962e4b..49fc1989a548 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>   	__u64 __opaque[1];
>   } __attribute__((aligned(8)));
>   
> +struct bpf_iter_task_vma {
> +	__u64 __opaque[1]; /* See bpf_iter_num comment above */
> +} __attribute__((aligned(8)));

In the future, we might have bpf_iter_cgroup, bpf_iter_task, 
bpf_iter_cgroup_task, etc. They may all use the same struct
like
   struct bpf_iter_<...> {
     __u64 __opaque[1];
   } __attribute__((aligned(8)));

Maybe we want a generic one instead of having lots of
structs with the same underlying definition? For example,
   struct bpf_iter_generic
?

> +
>   #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..7a06dea749f1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..51c2dce435c1 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -7,7 +7,9 @@
>   #include <linux/fs.h>
>   #include <linux/fdtable.h>
>   #include <linux/filter.h>
> +#include <linux/bpf_mem_alloc.h>
>   #include <linux/btf_ids.h>
> +#include <linux/mm_types.h>
>   #include "mmap_unlock_work.h"
>   
>   static const char * const iter_task_type_names[] = {
> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>   	.arg5_type	= ARG_ANYTHING,
>   };
>   
> +struct bpf_iter_task_vma_kern_data {
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	struct mmap_unlock_irq_work *work;
> +	struct vma_iterator vmi;
> +};
> +
> +/* Non-opaque version of uapi bpf_iter_task_vma */
> +struct bpf_iter_task_vma_kern {
> +	struct bpf_iter_task_vma_kern_data *data;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +				      struct task_struct *task, u64 addr)
> +{
> +	struct bpf_iter_task_vma_kern *kit = (void *)it;
> +	bool irq_work_busy = false;
> +	int err;
> +
> +	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> +	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> +
> +	/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> +	 * before, so non-NULL kit->data doesn't point to previously
> +	 * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> +	 */
> +	kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
> +	if (!kit->data)
> +		return -ENOMEM;
> +	kit->data->task = NULL;
> +
> +	if (!task) {
> +		err = -ENOENT;
> +		goto err_cleanup_iter;
> +	}
> +
> +	kit->data->task = get_task_struct(task);

The above is not safe. Since there is no restriction on 'task',
the 'task' could be in a state for destruction with 'usage' count 0
and then get_task_struct(task) won't work since it unconditionally
tries to increase 'usage' count from 0 to 1.

Or, 'task' may be valid at the entry of the function, but when
'task' is in get_task_struct(), 'task' may have been destroyed
and 'task' memory is reused by somebody else.

I suggest that we check input parameter 'task' must be
PTR_TRUSTED or MEM_RCU. This way, the above !task checking
is not necessary and get_task_struct() can correctly
hold a reference to 'task'.

> +	kit->data->mm = task->mm;
> +	if (!kit->data->mm) {
> +		err = -ENOENT;
> +		goto err_cleanup_iter;
> +	}
> +
> +	/* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
> +	irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
> +	if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
> +		err = -EBUSY;
> +		goto err_cleanup_iter;
> +	}
> +
> +	vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
> +	return 0;
> +
> +err_cleanup_iter:
> +	if (kit->data->task)
> +		put_task_struct(kit->data->task);
> +	bpf_mem_free(&bpf_global_ma, kit->data);
> +	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
> +	kit->data = NULL;
> +	return err;
> +}
> +
[...]
David Marchevsky Aug. 22, 2023, 7:19 p.m. UTC | #2
On 8/22/23 1:42 PM, Yonghong Song wrote:
> 
> 
> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_task_vma in open-coded
>> iterator style. BPF programs can use these kfuncs directly or through
>> bpf_for_each macro for natural-looking iteration of all task vmas.
>>
>> The implementation borrows heavily from bpf_find_vma helper's locking -
>> differing only in that it holds the mmap_read lock for all iterations
>> while the helper only executes its provided callback on a maximum of 1
>> vma. Aside from locking, struct vma_iterator and vma_next do all the
>> heavy lifting.
>>
>> The newly-added struct bpf_iter_task_vma has a name collision with a
>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
>> file is renamed in order to avoid the collision.
>>
>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
>> only field in struct bpf_iter_task_vma. This is because the inner data
>> struct contains a struct vma_iterator (not ptr), whose size is likely to
>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
>> such a change would require change in opaque bpf_iter_task_vma struct's
>> size. So better to allocate vma_iterator using BPF allocator, and since
>> that alloc must already succeed, might as well allocate all iter fields,
>> thereby freezing struct bpf_iter_task_vma size.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> Cc: Nathan Slingerland <slinger@meta.com>
>> ---
>>   include/uapi/linux/bpf.h                      |  4 +
>>   kernel/bpf/helpers.c                          |  3 +
>>   kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h                |  4 +
>>   tools/lib/bpf/bpf_helpers.h                   |  8 ++
>>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>>   7 files changed, 116 insertions(+), 13 deletions(-)
>>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 8790b3962e4b..49fc1989a548 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>>       __u64 __opaque[1];
>>   } __attribute__((aligned(8)));
>>   +struct bpf_iter_task_vma {
>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
>> +} __attribute__((aligned(8)));
> 
> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
> like
>   struct bpf_iter_<...> {
>     __u64 __opaque[1];
>   } __attribute__((aligned(8)));
> 
> Maybe we want a generic one instead of having lots of
> structs with the same underline definition? For example,
>   struct bpf_iter_generic
> ?
> 

The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
of iters would break the scheme. We could:

  * Add bpf_for_each_generic that only uses bpf_iter_generic
    * This exposes implementation details in an ugly way, though.
  * Do some macro magic to pick bpf_iter_generic for some types of iters, and
    use consistent naming pattern for others.
    * I'm not sure how to do this with preprocessor
  * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
    data struct, and use bpf_iter_generic for all of them
    * Probably need to see more iter implementation / usage before making such
      a change
  * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
    * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
      logic. Could do similar typedef w/ struct to try to work around
      it.

Let me know what you think. Personally I considered doing typedef while
implementing this, so that's the alternative I'd choose. 

>> +
>>   #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index eb91cae0612a..7a06dea749f1 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index c4ab9d6cdbe9..51c2dce435c1 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -7,7 +7,9 @@
>>   #include <linux/fs.h>
>>   #include <linux/fdtable.h>
>>   #include <linux/filter.h>
>> +#include <linux/bpf_mem_alloc.h>
>>   #include <linux/btf_ids.h>
>> +#include <linux/mm_types.h>
>>   #include "mmap_unlock_work.h"
>>     static const char * const iter_task_type_names[] = {
>> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>>       .arg5_type    = ARG_ANYTHING,
>>   };
>>   +struct bpf_iter_task_vma_kern_data {
>> +    struct task_struct *task;
>> +    struct mm_struct *mm;
>> +    struct mmap_unlock_irq_work *work;
>> +    struct vma_iterator vmi;
>> +};
>> +
>> +/* Non-opaque version of uapi bpf_iter_task_vma */
>> +struct bpf_iter_task_vma_kern {
>> +    struct bpf_iter_task_vma_kern_data *data;
>> +} __attribute__((aligned(8)));
>> +
>> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
>> +                      struct task_struct *task, u64 addr)
>> +{
>> +    struct bpf_iter_task_vma_kern *kit = (void *)it;
>> +    bool irq_work_busy = false;
>> +    int err;
>> +
>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
>> +
>> +    /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
>> +     * before, so non-NULL kit->data doesn't point to previously
>> +     * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
>> +     */
>> +    kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
>> +    if (!kit->data)
>> +        return -ENOMEM;
>> +    kit->data->task = NULL;
>> +
>> +    if (!task) {
>> +        err = -ENOENT;
>> +        goto err_cleanup_iter;
>> +    }
>> +
>> +    kit->data->task = get_task_struct(task);
> 
> The above is not safe. Since there is no restriction on 'task',
> the 'task' could be in a state for destruction with 'usage' count 0
> and then get_task_struct(task) won't work since it unconditionally
> tries to increase 'usage' count from 0 to 1.
> 
> Or, 'task' may be valid at the entry of the function, but when
> 'task' is in get_task_struct(), 'task' may have been destroyed
> and 'task' memory is reused by somebody else.
> 
> I suggest that we check input parameter 'task' must be
> PTR_TRUSTED or MEM_RCU. This way, the above !task checking
> is not necessary and get_task_struct() can correctly
> hold a reference to 'task'.
> 

Adding a PTR_TRUSTED or MEM_RCU check seems reasonable. I'm curious
whether there's any way to feed a 'plain' struct task_struct PTR_TO_BTF_ID
to this kfunc currently.

* bpf_get_current_task_btf helper returns PTR_TRUSTED | PTR_TO_BTF_ID
* ptr hop from trusted task_struct to 'real_parent' or similar should
  yield MEM_RCU (due to BTF_TYPE_SAFE_RCU(struct task_struct) def
* if task kptr is in map_val, direct reference to it should result
  in PTR_UNTRUSTED PTR_TO_BTF_ID, must kptr_xchg it or acquire again
  using bpf_task_from_pid (?)

But regardless, better to be explicit. Will change.

>> +    kit->data->mm = task->mm;
>> +    if (!kit->data->mm) {
>> +        err = -ENOENT;
>> +        goto err_cleanup_iter;
>> +    }
>> +
>> +    /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
>> +    irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
>> +    if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
>> +        err = -EBUSY;
>> +        goto err_cleanup_iter;
>> +    }
>> +
>> +    vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
>> +    return 0;
>> +
>> +err_cleanup_iter:
>> +    if (kit->data->task)
>> +        put_task_struct(kit->data->task);
>> +    bpf_mem_free(&bpf_global_ma, kit->data);
>> +    /* NULL kit->data signals failed bpf_iter_task_vma initialization */
>> +    kit->data = NULL;
>> +    return err;
>> +}
>> +
> [...]
Yonghong Song Aug. 22, 2023, 8:14 p.m. UTC | #3
On 8/22/23 12:19 PM, David Marchevsky wrote:
> On 8/22/23 1:42 PM, Yonghong Song wrote:
>>
>>
>> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
>>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
>>> creation and manipulation of struct bpf_iter_task_vma in open-coded
>>> iterator style. BPF programs can use these kfuncs directly or through
>>> bpf_for_each macro for natural-looking iteration of all task vmas.
>>>
>>> The implementation borrows heavily from bpf_find_vma helper's locking -
>>> differing only in that it holds the mmap_read lock for all iterations
>>> while the helper only executes its provided callback on a maximum of 1
>>> vma. Aside from locking, struct vma_iterator and vma_next do all the
>>> heavy lifting.
>>>
>>> The newly-added struct bpf_iter_task_vma has a name collision with a
>>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
>>> file is renamed in order to avoid the collision.
>>>
>>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
>>> only field in struct bpf_iter_task_vma. This is because the inner data
>>> struct contains a struct vma_iterator (not ptr), whose size is likely to
>>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
>>> such a change would require change in opaque bpf_iter_task_vma struct's
>>> size. So better to allocate vma_iterator using BPF allocator, and since
>>> that alloc must already succeed, might as well allocate all iter fields,
>>> thereby freezing struct bpf_iter_task_vma size.
>>>
>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>> Cc: Nathan Slingerland <slinger@meta.com>
>>> ---
>>>    include/uapi/linux/bpf.h                      |  4 +
>>>    kernel/bpf/helpers.c                          |  3 +
>>>    kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>>>    tools/include/uapi/linux/bpf.h                |  4 +
>>>    tools/lib/bpf/bpf_helpers.h                   |  8 ++
>>>    .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>>>    ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>>>    7 files changed, 116 insertions(+), 13 deletions(-)
>>>    rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 8790b3962e4b..49fc1989a548 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>>>        __u64 __opaque[1];
>>>    } __attribute__((aligned(8)));
>>>    +struct bpf_iter_task_vma {
>>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
>>> +} __attribute__((aligned(8)));
>>
>> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
>> like
>>    struct bpf_iter_<...> {
>>      __u64 __opaque[1];
>>    } __attribute__((aligned(8)));
>>
>> Maybe we want a generic one instead of having lots of
>> structs with the same underline definition? For example,
>>    struct bpf_iter_generic
>> ?
>>
> 
> The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
> and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
> of iters would break the scheme. We could:
> 
>    * Add bpf_for_each_generic that only uses bpf_iter_generic
>      * This exposes implementation details in an ugly way, though.
>    * Do some macro magic to pick bpf_iter_generic for some types of iters, and
>      use consistent naming pattern for others.
>      * I'm not sure how to do this with preprocessor
>    * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
>      data struct, and use bpf_iter_generic for all of them
>      * Probably need to see more iter implementation / usage before making such
>        a change
>    * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
>      * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
>        logic. Could do similar typedef w/ struct to try to work around
>        it.
> 
> Let me know what you think. Personally I considered doing typedef while
> implementing this, so that's the alternative I'd choose.

Okay, since we have naming convention restriction, typedef probably the
best option, something like
   typedef struct bpf_iter_num bpf_iter_task_vma
?

Verifier might need to be changed if verifier strips all modifiers
(including typedef) to find the struct name.

> 
>>> +
>>>    #endif /* _UAPI__LINUX_BPF_H__ */
>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>> index eb91cae0612a..7a06dea749f1 100644
>>> --- a/kernel/bpf/helpers.c
>>> +++ b/kernel/bpf/helpers.c
>>> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>>>    BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>>>    BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>>>    BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>>>    BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>>    BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>>    BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>>> index c4ab9d6cdbe9..51c2dce435c1 100644
>>> --- a/kernel/bpf/task_iter.c
>>> +++ b/kernel/bpf/task_iter.c
>>> @@ -7,7 +7,9 @@
>>>    #include <linux/fs.h>
>>>    #include <linux/fdtable.h>
>>>    #include <linux/filter.h>
>>> +#include <linux/bpf_mem_alloc.h>
>>>    #include <linux/btf_ids.h>
>>> +#include <linux/mm_types.h>
>>>    #include "mmap_unlock_work.h"
>>>      static const char * const iter_task_type_names[] = {
>>> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>>>        .arg5_type    = ARG_ANYTHING,
>>>    };
>>>    +struct bpf_iter_task_vma_kern_data {
>>> +    struct task_struct *task;
>>> +    struct mm_struct *mm;
>>> +    struct mmap_unlock_irq_work *work;
>>> +    struct vma_iterator vmi;
>>> +};
>>> +
>>> +/* Non-opaque version of uapi bpf_iter_task_vma */
>>> +struct bpf_iter_task_vma_kern {
>>> +    struct bpf_iter_task_vma_kern_data *data;
>>> +} __attribute__((aligned(8)));
>>> +
>>> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
>>> +                      struct task_struct *task, u64 addr)
>>> +{
>>> +    struct bpf_iter_task_vma_kern *kit = (void *)it;
>>> +    bool irq_work_busy = false;
>>> +    int err;
>>> +
>>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
>>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
>>> +
>>> +    /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
>>> +     * before, so non-NULL kit->data doesn't point to previously
>>> +     * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
>>> +     */
>>> +    kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
>>> +    if (!kit->data)
>>> +        return -ENOMEM;
>>> +    kit->data->task = NULL;
>>> +
>>> +    if (!task) {
>>> +        err = -ENOENT;
>>> +        goto err_cleanup_iter;
>>> +    }
>>> +
>>> +    kit->data->task = get_task_struct(task);
>>
>> The above is not safe. Since there is no restriction on 'task',
>> the 'task' could be in a state for destruction with 'usage' count 0
>> and then get_task_struct(task) won't work since it unconditionally
>> tries to increase 'usage' count from 0 to 1.
>>
>> Or, 'task' may be valid at the entry of the function, but when
>> 'task' is in get_task_struct(), 'task' may have been destroyed
>> and 'task' memory is reused by somebody else.
>>
>> I suggest that we check input parameter 'task' must be
>> PTR_TRUSTED or MEM_RCU. This way, the above !task checking
>> is not necessary and get_task_struct() can correctly
>> hold a reference to 'task'.
>>
> 
> Adding a PTR_TRUSTED or MEM_RCU check seems reasonable. I'm curious
> whether there's any way to feed a 'plain' struct task_struct PTR_TO_BTF_ID
> to this kfunc currently.
> 
> * bpf_get_current_task_btf helper returns PTR_TRUSTED | PTR_TO_BTF_ID
> * ptr hop from trusted task_struct to 'real_parent' or similar should
>    yield MEM_RCU (due to BTF_TYPE_SAFE_RCU(struct task_struct) def

This is true.

> * if task kptr is in map_val, direct reference to it should result
>    in PTR_UNTRUSTED PTR_TO_BTF_ID, must kptr_xchg it or acquire again
>    using bpf_task_from_pid (?)

If in the rcu region, direct access of the task kptr can be marked
with MEM_RCU | PTR_MAYBE_NULL. After a null ptr check, you will
get a rcu protected task ptr.

> 
> But regardless, better to be explicit. Will change.
> 
>>> +    kit->data->mm = task->mm;
>>> +    if (!kit->data->mm) {
>>> +        err = -ENOENT;
>>> +        goto err_cleanup_iter;
>>> +    }
>>> +
>>> +    /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
>>> +    irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
>>> +    if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
>>> +        err = -EBUSY;
>>> +        goto err_cleanup_iter;
>>> +    }
>>> +
>>> +    vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
>>> +    return 0;
>>> +
>>> +err_cleanup_iter:
>>> +    if (kit->data->task)
>>> +        put_task_struct(kit->data->task);
>>> +    bpf_mem_free(&bpf_global_ma, kit->data);
>>> +    /* NULL kit->data signals failed bpf_iter_task_vma initialization */
>>> +    kit->data = NULL;
>>> +    return err;
>>> +}
>>> +
>> [...]
Alexei Starovoitov Aug. 22, 2023, 10:36 p.m. UTC | #4
On Tue, Aug 22, 2023 at 1:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
>
> On 8/22/23 12:19 PM, David Marchevsky wrote:
> > On 8/22/23 1:42 PM, Yonghong Song wrote:
> >>
> >>
> >> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
> >>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> >>> creation and manipulation of struct bpf_iter_task_vma in open-coded
> >>> iterator style. BPF programs can use these kfuncs directly or through
> >>> bpf_for_each macro for natural-looking iteration of all task vmas.
> >>>
> >>> The implementation borrows heavily from bpf_find_vma helper's locking -
> >>> differing only in that it holds the mmap_read lock for all iterations
> >>> while the helper only executes its provided callback on a maximum of 1
> >>> vma. Aside from locking, struct vma_iterator and vma_next do all the
> >>> heavy lifting.
> >>>
> >>> The newly-added struct bpf_iter_task_vma has a name collision with a
> >>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> >>> file is renamed in order to avoid the collision.
> >>>
> >>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> >>> only field in struct bpf_iter_task_vma. This is because the inner data
> >>> struct contains a struct vma_iterator (not ptr), whose size is likely to
> >>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> >>> such a change would require change in opaque bpf_iter_task_vma struct's
> >>> size. So better to allocate vma_iterator using BPF allocator, and since
> >>> that alloc must already succeed, might as well allocate all iter fields,
> >>> thereby freezing struct bpf_iter_task_vma size.
> >>>
> >>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >>> Cc: Nathan Slingerland <slinger@meta.com>
> >>> ---
> >>>    include/uapi/linux/bpf.h                      |  4 +
> >>>    kernel/bpf/helpers.c                          |  3 +
> >>>    kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> >>>    tools/include/uapi/linux/bpf.h                |  4 +
> >>>    tools/lib/bpf/bpf_helpers.h                   |  8 ++
> >>>    .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> >>>    ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> >>>    7 files changed, 116 insertions(+), 13 deletions(-)
> >>>    rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> >>>
> >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>> index 8790b3962e4b..49fc1989a548 100644
> >>> --- a/include/uapi/linux/bpf.h
> >>> +++ b/include/uapi/linux/bpf.h
> >>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> >>>        __u64 __opaque[1];
> >>>    } __attribute__((aligned(8)));
> >>>    +struct bpf_iter_task_vma {
> >>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
> >>> +} __attribute__((aligned(8)));
> >>
> >> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
> >> like
> >>    struct bpf_iter_<...> {
> >>      __u64 __opaque[1];
> >>    } __attribute__((aligned(8)));
> >>
> >> Maybe we want a generic one instead of having lots of
> >> structs with the same underline definition? For example,
> >>    struct bpf_iter_generic
> >> ?
> >>
> >
> > The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
> > and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
> > of iters would break the scheme. We could:
> >
> >    * Add bpf_for_each_generic that only uses bpf_iter_generic
> >      * This exposes implementation details in an ugly way, though.
> >    * Do some macro magic to pick bpf_iter_generic for some types of iters, and
> >      use consistent naming pattern for others.
> >      * I'm not sure how to do this with preprocessor
> >    * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
> >      data struct, and use bpf_iter_generic for all of them
> >      * Probably need to see more iter implementation / usage before making such
> >        a change
> >    * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
> >      * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
> >        logic. Could do similar typedef w/ struct to try to work around
> >        it.
> >
> > Let me know what you think. Personally I considered doing typedef while
> > implementing this, so that's the alternative I'd choose.
>
> Okay, since we have naming convention restriction, typedef probably the
> best option, something like
>    typedef struct bpf_iter_num bpf_iter_task_vma
> ?
>
> Verifier might need to be changed if verifier strips all modifiers
> (including typedef) to find the struct name.

I don't quite see how typedef helps here.
Say we do:
struct bpf_iter_task_vma {
     __u64 __opaque[1];
} __attribute__((aligned(8)));

as Dave is proposing.
Then tomorrow we add another bpf_iter_foo that is exactly the same opaque[1].
And we will have bpf_iter_num, bpf_iter_task_vma, bpf_iter_foo structs
with the same layout. So what? Eye sore?
In case we need to extend task_vma from 1 to 2 it will be easier to do
when all of them are separate structs.

And typedef has unknown verification implications.

Either way we need to find a way to move these structs from uapi/bpf.h
along with bpf_rb_root and friends to some "obviously unstable" header.
Andrii Nakryiko Aug. 22, 2023, 11:52 p.m. UTC | #5
On Mon, Aug 21, 2023 at 10:06 PM Dave Marchevsky <davemarchevsky@fb.com> wrote:
>
> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task_vma in open-coded
> iterator style. BPF programs can use these kfuncs directly or through
> bpf_for_each macro for natural-looking iteration of all task vmas.
>
> The implementation borrows heavily from bpf_find_vma helper's locking -
> differing only in that it holds the mmap_read lock for all iterations
> while the helper only executes its provided callback on a maximum of 1
> vma. Aside from locking, struct vma_iterator and vma_next do all the
> heavy lifting.
>
> The newly-added struct bpf_iter_task_vma has a name collision with a
> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> file is renamed in order to avoid the collision.
>
> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> only field in struct bpf_iter_task_vma. This is because the inner data
> struct contains a struct vma_iterator (not ptr), whose size is likely to
> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> such a change would require change in opaque bpf_iter_task_vma struct's
> size. So better to allocate vma_iterator using BPF allocator, and since
> that alloc must already succeed, might as well allocate all iter fields,
> thereby freezing struct bpf_iter_task_vma size.
>
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> Cc: Nathan Slingerland <slinger@meta.com>
> ---
>  include/uapi/linux/bpf.h                      |  4 +
>  kernel/bpf/helpers.c                          |  3 +
>  kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>  tools/include/uapi/linux/bpf.h                |  4 +
>  tools/lib/bpf/bpf_helpers.h                   |  8 ++
>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>  7 files changed, 116 insertions(+), 13 deletions(-)
>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8790b3962e4b..49fc1989a548 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_task_vma {
> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..7a06dea749f1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..51c2dce435c1 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -7,7 +7,9 @@
>  #include <linux/fs.h>
>  #include <linux/fdtable.h>
>  #include <linux/filter.h>
> +#include <linux/bpf_mem_alloc.h>
>  #include <linux/btf_ids.h>
> +#include <linux/mm_types.h>
>  #include "mmap_unlock_work.h"
>
>  static const char * const iter_task_type_names[] = {
> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>         .arg5_type      = ARG_ANYTHING,
>  };
>
> +struct bpf_iter_task_vma_kern_data {
> +       struct task_struct *task;
> +       struct mm_struct *mm;
> +       struct mmap_unlock_irq_work *work;
> +       struct vma_iterator vmi;
> +};
> +
> +/* Non-opaque version of uapi bpf_iter_task_vma */
> +struct bpf_iter_task_vma_kern {
> +       struct bpf_iter_task_vma_kern_data *data;
> +} __attribute__((aligned(8)));
> +

it's a bit worrying that we'll rely on memory allocation inside NMI to
be able to use this. I'm missing previous email discussion (I declared
email bankruptcy after long vacation), but I suppose the option to fix
bpf_iter_task_vma (to 88 bytes: 64 for vma_iterator + 24 for extra
pointers), or even to 96 to have a bit of headroom in case we need a
bit more space was rejected? It seems unlikely that vma_iterator will
have to grow, but if it does, it has 5 bytes of padding right now for
various flags, plus we can have extra 8 bytes reserved just in case.

I know it's a big struct and will take a big chunk of the BPF stack,
but I'm a bit worried about both the performance implication of mem
alloc under NMI, and allocation failing.

Maybe the worry is overblown, but I thought I'll bring it up anyways.

> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +                                     struct task_struct *task, u64 addr)
> +{
> +       struct bpf_iter_task_vma_kern *kit = (void *)it;
> +       bool irq_work_busy = false;
> +       int err;
> +

[...]

>  static void do_mmap_read_unlock(struct irq_work *entry)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 8790b3962e4b..49fc1989a548 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_task_vma {
> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
> index bbab9ad9dc5a..d885ffee4d88 100644
> --- a/tools/lib/bpf/bpf_helpers.h
> +++ b/tools/lib/bpf/bpf_helpers.h
> @@ -302,6 +302,14 @@ extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak
>  extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
>  extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
>
> +struct bpf_iter_task_vma;
> +
> +extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +                                struct task_struct *task,
> +                                unsigned long addr) __weak __ksym;
> +extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __weak __ksym;
> +extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __weak __ksym;

my intent wasn't to add all open-coded iterators to bpf_helpers.h. I
think bpf_iter_num_* is rather an exception and isn't supposed to ever
change or be removed, while other iterators should be allowed to be
changed.

The goal is for all such kfuncs (and struct bpf_iter_task_vma state
itself, probably) to come from vmlinux.h, eventually, so let's leave
it out of libbpf's stable bpf_helpers.h header.


[...]

> @@ -1533,7 +1533,7 @@ static void test_task_vma_dead_task(void)
>  out:
>         waitpid(child_pid, &wstatus, 0);
>         close(iter_fd);
> -       bpf_iter_task_vma__destroy(skel);
> +       bpf_iter_task_vmas__destroy(skel);
>  }
>
>  void test_bpf_sockmap_map_iter_fd(void)
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
> similarity index 100%
> rename from tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> rename to tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
> --
> 2.34.1
>
Andrii Nakryiko Aug. 22, 2023, 11:57 p.m. UTC | #6
On Tue, Aug 22, 2023 at 3:37 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Aug 22, 2023 at 1:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >
> >
> >
> > On 8/22/23 12:19 PM, David Marchevsky wrote:
> > > On 8/22/23 1:42 PM, Yonghong Song wrote:
> > >>
> > >>
> > >> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
> > >>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> > >>> creation and manipulation of struct bpf_iter_task_vma in open-coded
> > >>> iterator style. BPF programs can use these kfuncs directly or through
> > >>> bpf_for_each macro for natural-looking iteration of all task vmas.
> > >>>
> > >>> The implementation borrows heavily from bpf_find_vma helper's locking -
> > >>> differing only in that it holds the mmap_read lock for all iterations
> > >>> while the helper only executes its provided callback on a maximum of 1
> > >>> vma. Aside from locking, struct vma_iterator and vma_next do all the
> > >>> heavy lifting.
> > >>>
> > >>> The newly-added struct bpf_iter_task_vma has a name collision with a
> > >>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> > >>> file is renamed in order to avoid the collision.
> > >>>
> > >>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> > >>> only field in struct bpf_iter_task_vma. This is because the inner data
> > >>> struct contains a struct vma_iterator (not ptr), whose size is likely to
> > >>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> > >>> such a change would require change in opaque bpf_iter_task_vma struct's
> > >>> size. So better to allocate vma_iterator using BPF allocator, and since
> > >>> that alloc must already succeed, might as well allocate all iter fields,
> > >>> thereby freezing struct bpf_iter_task_vma size.
> > >>>
> > >>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> > >>> Cc: Nathan Slingerland <slinger@meta.com>
> > >>> ---
> > >>>    include/uapi/linux/bpf.h                      |  4 +
> > >>>    kernel/bpf/helpers.c                          |  3 +
> > >>>    kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> > >>>    tools/include/uapi/linux/bpf.h                |  4 +
> > >>>    tools/lib/bpf/bpf_helpers.h                   |  8 ++
> > >>>    .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> > >>>    ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> > >>>    7 files changed, 116 insertions(+), 13 deletions(-)
> > >>>    rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> > >>>
> > >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > >>> index 8790b3962e4b..49fc1989a548 100644
> > >>> --- a/include/uapi/linux/bpf.h
> > >>> +++ b/include/uapi/linux/bpf.h
> > >>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> > >>>        __u64 __opaque[1];
> > >>>    } __attribute__((aligned(8)));
> > >>>    +struct bpf_iter_task_vma {
> > >>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
> > >>> +} __attribute__((aligned(8)));
> > >>
> > >> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
> > >> like
> > >>    struct bpf_iter_<...> {
> > >>      __u64 __opaque[1];
> > >>    } __attribute__((aligned(8)));
> > >>
> > >> Maybe we want a generic one instead of having lots of
> > >> structs with the same underline definition? For example,
> > >>    struct bpf_iter_generic
> > >> ?
> > >>
> > >
> > > The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
> > > and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
> > > of iters would break the scheme. We could:
> > >
> > >    * Add bpf_for_each_generic that only uses bpf_iter_generic
> > >      * This exposes implementation details in an ugly way, though.
> > >    * Do some macro magic to pick bpf_iter_generic for some types of iters, and
> > >      use consistent naming pattern for others.
> > >      * I'm not sure how to do this with preprocessor
> > >    * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
> > >      data struct, and use bpf_iter_generic for all of them
> > >      * Probably need to see more iter implementation / usage before making such
> > >        a change
> > >    * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
> > >      * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
> > >        logic. Could do similar typedef w/ struct to try to work around
> > >        it.
> > >
> > > Let me know what you think. Personally I considered doing typedef while
> > > implementing this, so that's the alternative I'd choose.
> >
> > Okay, since we have naming convention restriction, typedef probably the
> > best option, something like
> >    typedef struct bpf_iter_num bpf_iter_task_vma
> > ?
> >
> > Verifier might need to be changed if verifier strips all modifiers
> > (including typedef) to find the struct name.
>
> I don't quite see how typedef helps here.
> Say we do:
> struct bpf_iter_task_vma {
>      __u64 __opaque[1];
> } __attribute__((aligned(8)));
>
> as Dave is proposing.
> Then tomorrow we add another bpf_iter_foo that is exactly the same opaque[1].
> And we will have bpf_iter_num, bpf_iter_task_vma, bpf_iter_foo structs
> with the same layout. So what? Eye sore?
> In case we need to extend task_vma from 1 to 2 it will be easier to do
> when all of them are separate structs.
>
> And typedef has unknown verification implications.

+1, I wouldn't complicate things and have a proper struct with strict
naming convention for each iter kind

>
> Either way we need to find a way to move these structs from uapi/bpf.h
> along with bpf_rb_root and friends to some "obviously unstable" header.

I don't think we have to add struct bpf_iter_task_vma to uapi/bpf.h at
all and rely on it coming from vmlinux.h. As I mentioned in previous
email, num iterator is a bit special in its generality and simplicity
(which allows to be confident it won't need to be changed), while
other iterators should be treated as more unstable and come from
vmlinux.h (or wherever else kfuncs will be coming from in the future).

tl;dr: we can define this struct right next to where we defined
bpf_iter_task_vma_new() kfunc. Unless I'm missing some complication,
of course.
Andrii Nakryiko Aug. 23, 2023, 12:04 a.m. UTC | #7
On Tue, Aug 22, 2023 at 12:20 PM David Marchevsky
<david.marchevsky@linux.dev> wrote:
>
> On 8/22/23 1:42 PM, Yonghong Song wrote:
> >
> >
> > On 8/21/23 10:05 PM, Dave Marchevsky wrote:
> >> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_task_vma in open-coded
> >> iterator style. BPF programs can use these kfuncs directly or through
> >> bpf_for_each macro for natural-looking iteration of all task vmas.
> >>
> >> The implementation borrows heavily from bpf_find_vma helper's locking -
> >> differing only in that it holds the mmap_read lock for all iterations
> >> while the helper only executes its provided callback on a maximum of 1
> >> vma. Aside from locking, struct vma_iterator and vma_next do all the
> >> heavy lifting.
> >>
> >> The newly-added struct bpf_iter_task_vma has a name collision with a
> >> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> >> file is renamed in order to avoid the collision.
> >>
> >> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> >> only field in struct bpf_iter_task_vma. This is because the inner data
> >> struct contains a struct vma_iterator (not ptr), whose size is likely to
> >> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> >> such a change would require change in opaque bpf_iter_task_vma struct's
> >> size. So better to allocate vma_iterator using BPF allocator, and since
> >> that alloc must already succeed, might as well allocate all iter fields,
> >> thereby freezing struct bpf_iter_task_vma size.
> >>
> >> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >> Cc: Nathan Slingerland <slinger@meta.com>
> >> ---
> >>   include/uapi/linux/bpf.h                      |  4 +
> >>   kernel/bpf/helpers.c                          |  3 +
> >>   kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> >>   tools/include/uapi/linux/bpf.h                |  4 +
> >>   tools/lib/bpf/bpf_helpers.h                   |  8 ++
> >>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> >>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> >>   7 files changed, 116 insertions(+), 13 deletions(-)
> >>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 8790b3962e4b..49fc1989a548 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> >>       __u64 __opaque[1];
> >>   } __attribute__((aligned(8)));
> >>   +struct bpf_iter_task_vma {
> >> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
> >> +} __attribute__((aligned(8)));
> >
> > In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
> > like
> >   struct bpf_iter_<...> {
> >     __u64 __opaque[1];
> >   } __attribute__((aligned(8)));
> >
> > Maybe we want a generic one instead of having lots of
> > structs with the same underline definition? For example,
> >   struct bpf_iter_generic
> > ?
> >
>
> The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
> and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
> of iters would break the scheme. We could:
>
>   * Add bpf_for_each_generic that only uses bpf_iter_generic
>     * This exposes implementation details in an ugly way, though.
>   * Do some macro magic to pick bpf_iter_generic for some types of iters, and
>     use consistent naming pattern for others.
>     * I'm not sure how to do this with preprocessor
>   * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
>     data struct, and use bpf_iter_generic for all of them
>     * Probably need to see more iter implementation / usage before making such
>       a change
>   * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
>     * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
>       logic. Could do similar typedef w/ struct to try to work around
>       it.
>
> Let me know what you think. Personally I considered doing typedef while
> implementing this, so that's the alternative I'd choose.
>
> >> +
> >>   #endif /* _UAPI__LINUX_BPF_H__ */
> >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> >> index eb91cae0612a..7a06dea749f1 100644
> >> --- a/kernel/bpf/helpers.c
> >> +++ b/kernel/bpf/helpers.c
> >> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> >>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> >>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> >>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> index c4ab9d6cdbe9..51c2dce435c1 100644
> >> --- a/kernel/bpf/task_iter.c
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -7,7 +7,9 @@
> >>   #include <linux/fs.h>
> >>   #include <linux/fdtable.h>
> >>   #include <linux/filter.h>
> >> +#include <linux/bpf_mem_alloc.h>
> >>   #include <linux/btf_ids.h>
> >> +#include <linux/mm_types.h>
> >>   #include "mmap_unlock_work.h"
> >>     static const char * const iter_task_type_names[] = {
> >> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> >>       .arg5_type    = ARG_ANYTHING,
> >>   };
> >>   +struct bpf_iter_task_vma_kern_data {
> >> +    struct task_struct *task;
> >> +    struct mm_struct *mm;
> >> +    struct mmap_unlock_irq_work *work;
> >> +    struct vma_iterator vmi;
> >> +};
> >> +
> >> +/* Non-opaque version of uapi bpf_iter_task_vma */
> >> +struct bpf_iter_task_vma_kern {
> >> +    struct bpf_iter_task_vma_kern_data *data;
> >> +} __attribute__((aligned(8)));
> >> +
> >> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> >> +                      struct task_struct *task, u64 addr)
> >> +{
> >> +    struct bpf_iter_task_vma_kern *kit = (void *)it;
> >> +    bool irq_work_busy = false;
> >> +    int err;
> >> +
> >> +    BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> >> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> >> +
> >> +    /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> >> +     * before, so non-NULL kit->data doesn't point to previously
> >> +     * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> >> +     */
> >> +    kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
> >> +    if (!kit->data)
> >> +        return -ENOMEM;
> >> +    kit->data->task = NULL;
> >> +
> >> +    if (!task) {
> >> +        err = -ENOENT;
> >> +        goto err_cleanup_iter;
> >> +    }
> >> +
> >> +    kit->data->task = get_task_struct(task);
> >
> > The above is not safe. Since there is no restriction on 'task',
> > the 'task' could be in a state for destruction with 'usage' count 0
> > and then get_task_struct(task) won't work since it unconditionally
> > tries to increase 'usage' count from 0 to 1.
> >
> > Or, 'task' may be valid at the entry of the funciton, but when
> > 'task' is in get_task_struct(), 'task' may have been destroyed
> > and 'task' memory is reused by somebody else.
> >
> > I suggest that we check input parameter 'task' must be
> > PTR_TRUSTED or MEM_RCU. This way, the above !task checking
> > is not necessary and get_task_struct() can correctly
> > hold a reference to 'task'.
> >
>
> Adding a PTR_TRUSTED or MEM_RCU check seems reasonable. I'm curious
> whether there's any way to feed a 'plain' struct task_struct PTR_TO_BTF_ID
> to this kfunc currently.
>
> * bpf_get_current_task_btf helper returns PTR_TRUSTED | PTR_TO_BTF_ID
> * ptr hop from trusted task_struct to 'real_parent' or similar should
>   yield MEM_RCU (due to BTF_TYPE_SAFE_RCU(struct task_struct) def
> * if task kptr is in map_val, direct reference to it should result
>   in PTR_UNTRUSTED PTR_TO_BTF_ID, must kptr_xchg it or acquire again
>   using bpf_task_from_pid (?)
>
> But regardless, better to be explicit. Will change.

How horrible would it be to base an interface on TID/PID (i.e., int)
as input argument to specify a task? I'm just thinking it might be
more generic and easy to use in more situations:
   - for all the cases where we have struct task_struct, getting its
pid is trivial: `task->pid`;
   - but in some situations PID might be coming from outside: either
as an argument to CLI tool, or from old-style tracepoint (e.g.,
context_switch where we have prev/next task pid), etc.

The downside is that we'd need to look up a task, right? But on the
other hand we get more generality and won't have to rely on having
PTR_TRUSTED task_struct.

Thoughts?

>
> >> +    kit->data->mm = task->mm;
> >> +    if (!kit->data->mm) {
> >> +        err = -ENOENT;
> >> +        goto err_cleanup_iter;
> >> +    }
> >> +
> >> +    /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
> >> +    irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
> >> +    if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
> >> +        err = -EBUSY;
> >> +        goto err_cleanup_iter;
> >> +    }
> >> +
> >> +    vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
> >> +    return 0;
> >> +
> >> +err_cleanup_iter:
> >> +    if (kit->data->task)
> >> +        put_task_struct(kit->data->task);
> >> +    bpf_mem_free(&bpf_global_ma, kit->data);
> >> +    /* NULL kit->data signals failed bpf_iter_task_vma initialization */
> >> +    kit->data = NULL;
> >> +    return err;
> >> +}
> >> +
> > [...]
Yonghong Song Aug. 23, 2023, 12:11 a.m. UTC | #8
On 8/22/23 3:36 PM, Alexei Starovoitov wrote:
> On Tue, Aug 22, 2023 at 1:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>
>>
>> On 8/22/23 12:19 PM, David Marchevsky wrote:
>>> On 8/22/23 1:42 PM, Yonghong Song wrote:
>>>>
>>>>
>>>> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
>>>>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
>>>>> creation and manipulation of struct bpf_iter_task_vma in open-coded
>>>>> iterator style. BPF programs can use these kfuncs directly or through
>>>>> bpf_for_each macro for natural-looking iteration of all task vmas.
>>>>>
>>>>> The implementation borrows heavily from bpf_find_vma helper's locking -
>>>>> differing only in that it holds the mmap_read lock for all iterations
>>>>> while the helper only executes its provided callback on a maximum of 1
>>>>> vma. Aside from locking, struct vma_iterator and vma_next do all the
>>>>> heavy lifting.
>>>>>
>>>>> The newly-added struct bpf_iter_task_vma has a name collision with a
>>>>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
>>>>> file is renamed in order to avoid the collision.
>>>>>
>>>>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
>>>>> only field in struct bpf_iter_task_vma. This is because the inner data
>>>>> struct contains a struct vma_iterator (not ptr), whose size is likely to
>>>>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
>>>>> such a change would require change in opaque bpf_iter_task_vma struct's
>>>>> size. So better to allocate vma_iterator using BPF allocator, and since
>>>>> that alloc must already succeed, might as well allocate all iter fields,
>>>>> thereby freezing struct bpf_iter_task_vma size.
>>>>>
>>>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>>>> Cc: Nathan Slingerland <slinger@meta.com>
>>>>> ---
>>>>>     include/uapi/linux/bpf.h                      |  4 +
>>>>>     kernel/bpf/helpers.c                          |  3 +
>>>>>     kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>>>>>     tools/include/uapi/linux/bpf.h                |  4 +
>>>>>     tools/lib/bpf/bpf_helpers.h                   |  8 ++
>>>>>     .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>>>>>     ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>>>>>     7 files changed, 116 insertions(+), 13 deletions(-)
>>>>>     rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>>>>>
>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>> index 8790b3962e4b..49fc1989a548 100644
>>>>> --- a/include/uapi/linux/bpf.h
>>>>> +++ b/include/uapi/linux/bpf.h
>>>>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>>>>>         __u64 __opaque[1];
>>>>>     } __attribute__((aligned(8)));
>>>>>     +struct bpf_iter_task_vma {
>>>>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
>>>>> +} __attribute__((aligned(8)));
>>>>
>>>> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
>>>> like
>>>>     struct bpf_iter_<...> {
>>>>       __u64 __opaque[1];
>>>>     } __attribute__((aligned(8)));
>>>>
>>>> Maybe we want a generic one instead of having lots of
>>>> structs with the same underline definition? For example,
>>>>     struct bpf_iter_generic
>>>> ?
>>>>
>>>
>>> The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
>>> and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
>>> of iters would break the scheme. We could:
>>>
>>>     * Add bpf_for_each_generic that only uses bpf_iter_generic
>>>       * This exposes implementation details in an ugly way, though.
>>>     * Do some macro magic to pick bpf_iter_generic for some types of iters, and
>>>       use consistent naming pattern for others.
>>>       * I'm not sure how to do this with preprocessor
>>>     * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
>>>       data struct, and use bpf_iter_generic for all of them
>>>       * Probably need to see more iter implementation / usage before making such
>>>         a change
>>>     * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
>>>       * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
>>>         logic. Could do similar typedef w/ struct to try to work around
>>>         it.
>>>
>>> Let me know what you think. Personally I considered doing typedef while
>>> implementing this, so that's the alternative I'd choose.
>>
>> Okay, since we have naming convention restriction, typedef probably the
>> best option, something like
>>     typedef struct bpf_iter_num bpf_iter_task_vma
>> ?
>>
>> Verifier might need to be changed if verifier strips all modifiers
>> (including typedef) to find the struct name.
> 
> I don't quite see how typedef helps here.
> Say we do:
> struct bpf_iter_task_vma {
>       __u64 __opaque[1];
> } __attribute__((aligned(8)));
> 
> as Dave is proposing.
> Then tomorrow we add another bpf_iter_foo that is exactly the same opaque[1].
> And we will have bpf_iter_num, bpf_iter_task_vma, bpf_iter_foo structs
> with the same layout. So what? Eye sore?
> In case we need to extend task_vma from 1 to 2 it will be easier to do
> when all of them are separate structs.
> 
> And typedef has unknown verification implications.

This is true. Some investigation is needed.

> 
> Either way we need to find a way to move these structs from uapi/bpf.h
> along with bpf_rb_root and friends to some "obviously unstable" header.

If we move this out uapi/bpf.h to an unstable header, we will have much
more flexibility for future change. Original idea (v1) to allocate the
whole struct on the stack would be okay even if kernel changes
struct vma_iterator size, whether bigger or smaller.
David Marchevsky Aug. 23, 2023, 5:42 a.m. UTC | #9
On 8/22/23 8:04 PM, Andrii Nakryiko wrote:
> On Tue, Aug 22, 2023 at 12:20 PM David Marchevsky
> <david.marchevsky@linux.dev> wrote:
>>
>> On 8/22/23 1:42 PM, Yonghong Song wrote:
>>>
>>>
>>> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
>>>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
>>>> creation and manipulation of struct bpf_iter_task_vma in open-coded
>>>> iterator style. BPF programs can use these kfuncs directly or through
>>>> bpf_for_each macro for natural-looking iteration of all task vmas.
>>>>
>>>> The implementation borrows heavily from bpf_find_vma helper's locking -
>>>> differing only in that it holds the mmap_read lock for all iterations
>>>> while the helper only executes its provided callback on a maximum of 1
>>>> vma. Aside from locking, struct vma_iterator and vma_next do all the
>>>> heavy lifting.
>>>>
>>>> The newly-added struct bpf_iter_task_vma has a name collision with a
>>>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
>>>> file is renamed in order to avoid the collision.
>>>>
>>>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
>>>> only field in struct bpf_iter_task_vma. This is because the inner data
>>>> struct contains a struct vma_iterator (not ptr), whose size is likely to
>>>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
>>>> such a change would require change in opaque bpf_iter_task_vma struct's
>>>> size. So better to allocate vma_iterator using BPF allocator, and since
>>>> that alloc must already succeed, might as well allocate all iter fields,
>>>> thereby freezing struct bpf_iter_task_vma size.
>>>>
>>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>>> Cc: Nathan Slingerland <slinger@meta.com>
>>>> ---
>>>>   include/uapi/linux/bpf.h                      |  4 +
>>>>   kernel/bpf/helpers.c                          |  3 +
>>>>   kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>>>>   tools/include/uapi/linux/bpf.h                |  4 +
>>>>   tools/lib/bpf/bpf_helpers.h                   |  8 ++
>>>>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>>>>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>>>>   7 files changed, 116 insertions(+), 13 deletions(-)
>>>>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>>>>
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 8790b3962e4b..49fc1989a548 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>>>>       __u64 __opaque[1];
>>>>   } __attribute__((aligned(8)));
>>>>   +struct bpf_iter_task_vma {
>>>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
>>>> +} __attribute__((aligned(8)));
>>>
>>> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
>>> like
>>>   struct bpf_iter_<...> {
>>>     __u64 __opaque[1];
>>>   } __attribute__((aligned(8)));
>>>
>>> Maybe we want a generic one instead of having lots of
>>> structs with the same underline definition? For example,
>>>   struct bpf_iter_generic
>>> ?
>>>
>>
>> The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
>> and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
>> of iters would break the scheme. We could:
>>
>>   * Add bpf_for_each_generic that only uses bpf_iter_generic
>>     * This exposes implementation details in an ugly way, though.
>>   * Do some macro magic to pick bpf_iter_generic for some types of iters, and
>>     use consistent naming pattern for others.
>>     * I'm not sure how to do this with preprocessor
>>   * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
>>     data struct, and use bpf_iter_generic for all of them
>>     * Probably need to see more iter implementation / usage before making such
>>       a change
>>   * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
>>     * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
>>       logic. Could do similar typedef w/ struct to try to work around
>>       it.
>>
>> Let me know what you think. Personally I considered doing typedef while
>> implementing this, so that's the alternative I'd choose.
>>
>>>> +
>>>>   #endif /* _UAPI__LINUX_BPF_H__ */
>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>> index eb91cae0612a..7a06dea749f1 100644
>>>> --- a/kernel/bpf/helpers.c
>>>> +++ b/kernel/bpf/helpers.c
>>>> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>>>>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>>>>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>>>>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
>>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
>>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>>>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>>>> index c4ab9d6cdbe9..51c2dce435c1 100644
>>>> --- a/kernel/bpf/task_iter.c
>>>> +++ b/kernel/bpf/task_iter.c
>>>> @@ -7,7 +7,9 @@
>>>>   #include <linux/fs.h>
>>>>   #include <linux/fdtable.h>
>>>>   #include <linux/filter.h>
>>>> +#include <linux/bpf_mem_alloc.h>
>>>>   #include <linux/btf_ids.h>
>>>> +#include <linux/mm_types.h>
>>>>   #include "mmap_unlock_work.h"
>>>>     static const char * const iter_task_type_names[] = {
>>>> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>>>>       .arg5_type    = ARG_ANYTHING,
>>>>   };
>>>>   +struct bpf_iter_task_vma_kern_data {
>>>> +    struct task_struct *task;
>>>> +    struct mm_struct *mm;
>>>> +    struct mmap_unlock_irq_work *work;
>>>> +    struct vma_iterator vmi;
>>>> +};
>>>> +
>>>> +/* Non-opaque version of uapi bpf_iter_task_vma */
>>>> +struct bpf_iter_task_vma_kern {
>>>> +    struct bpf_iter_task_vma_kern_data *data;
>>>> +} __attribute__((aligned(8)));
>>>> +
>>>> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
>>>> +                      struct task_struct *task, u64 addr)
>>>> +{
>>>> +    struct bpf_iter_task_vma_kern *kit = (void *)it;
>>>> +    bool irq_work_busy = false;
>>>> +    int err;
>>>> +
>>>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
>>>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
>>>> +
>>>> +    /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
>>>> +     * before, so non-NULL kit->data doesn't point to previously
>>>> +     * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
>>>> +     */
>>>> +    kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
>>>> +    if (!kit->data)
>>>> +        return -ENOMEM;
>>>> +    kit->data->task = NULL;
>>>> +
>>>> +    if (!task) {
>>>> +        err = -ENOENT;
>>>> +        goto err_cleanup_iter;
>>>> +    }
>>>> +
>>>> +    kit->data->task = get_task_struct(task);
>>>
>>> The above is not safe. Since there is no restriction on 'task',
>>> the 'task' could be in a state for destruction with 'usage' count 0
>>> and then get_task_struct(task) won't work since it unconditionally
>>> tries to increase 'usage' count from 0 to 1.
>>>
>>> Or, 'task' may be valid at the entry of the function, but when
>>> 'task' is in get_task_struct(), 'task' may have been destroyed
>>> and 'task' memory is reused by somebody else.
>>>
>>> I suggest that we check input parameter 'task' must be
>>> PTR_TRUSTED or MEM_RCU. This way, the above !task checking
>>> is not necessary and get_task_struct() can correctly
>>> hold a reference to 'task'.
>>>
>>
>> Adding a PTR_TRUSTED or MEM_RCU check seems reasonable. I'm curious
>> whether there's any way to feed a 'plain' struct task_struct PTR_TO_BTF_ID
>> to this kfunc currently.
>>
>> * bpf_get_current_task_btf helper returns PTR_TRUSTED | PTR_TO_BTF_ID
>> * ptr hop from trusted task_struct to 'real_parent' or similar should
>>   yield MEM_RCU (due to BTF_TYPE_SAFE_RCU(struct task_struct) def
>> * if task kptr is in map_val, direct reference to it should result
>>   in PTR_UNTRUSTED PTR_TO_BTF_ID, must kptr_xchg it or acquire again
>>   using bpf_task_from_pid (?)
>>
>> But regardless, better to be explicit. Will change.
> 
> How horrible would it be to base an interface on TID/PID (i.e., int)
> as input argument to specify a task? I'm just thinking it might be
> more generic and easy to use in more situations:
>    - for all the cases where we have struct task_struct, getting its
> pid is trivial: `task->pid`;
>    - but in some situations PID might be coming from outside: either
> as an argument to CLI tool, or from old-style tracepoint (e.g.,
> context_switch where we have prev/next task pid), etc.
> 
> The downside is that we'd need to look up a task, right? But on the
> other hand we get more generality and won't have to rely on having
> PTR_TRUSTED task_struct.
> 
> Thoughts?
> 

Meh, taking tid here feels like the 'old-school' approach, before recent
efforts to teach the verifier more about resource acquisition, locking,
iteration, trustedness, etc. All allowing us to push more important logic
out of 'opaque' helper impl and into BPF programs.

In this tid -> struct task_struct case I think the provided resource acquisition 
is sufficient. Using your examples:

  * We have a TRUSTED or RCU struct task_struct
    * No need to do anything, can pass to bpf_iter_task_vma_new

  * We have a struct task_struct, but it's UNTRUSTED or has no trustedness
    type flags
    * Use bpf_task_acquire or bpf_task_from_pid

  * We just have a pid ('coming from outside')
    * Use bpf_task_from_pid

If there is some scenario where we can't get from pid to properly acquired task
in the BPF program, let's improve the resource acquisition instead of pushing
it into the kfunc impl.

Also, should we look up (and refcount incr) the task using task->rcu_users
refcount w/ bpf_task_{acquire,release}, or using task->usage refcount w/ 
{get,put}_task_struct ? More generally, if there are >1 valid ways to acquire
task_struct or some other resource, pushing acquisition to the BPF program
gives us the benefit of not having to pick one (or do possibly ugly / complex
flags). As long as type flags express that the resource will not go away,
this kfunc impl can ignore the details of how that property came about.

>>
>>>> +    kit->data->mm = task->mm;
>>>> +    if (!kit->data->mm) {
>>>> +        err = -ENOENT;
>>>> +        goto err_cleanup_iter;
>>>> +    }
>>>> +
>>>> +    /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
>>>> +    irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
>>>> +    if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
>>>> +        err = -EBUSY;
>>>> +        goto err_cleanup_iter;
>>>> +    }
>>>> +
>>>> +    vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
>>>> +    return 0;
>>>> +
>>>> +err_cleanup_iter:
>>>> +    if (kit->data->task)
>>>> +        put_task_struct(kit->data->task);
>>>> +    bpf_mem_free(&bpf_global_ma, kit->data);
>>>> +    /* NULL kit->data signals failed bpf_iter_task_vma initialization */
>>>> +    kit->data = NULL;
>>>> +    return err;
>>>> +}
>>>> +
>>> [...]
David Marchevsky Aug. 23, 2023, 7:26 a.m. UTC | #10
On 8/22/23 7:52 PM, Andrii Nakryiko wrote:
> On Mon, Aug 21, 2023 at 10:06 PM Dave Marchevsky <davemarchevsky@fb.com> wrote:
>>
>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_task_vma in open-coded
>> iterator style. BPF programs can use these kfuncs directly or through
>> bpf_for_each macro for natural-looking iteration of all task vmas.
>>
>> The implementation borrows heavily from bpf_find_vma helper's locking -
>> differing only in that it holds the mmap_read lock for all iterations
>> while the helper only executes its provided callback on a maximum of 1
>> vma. Aside from locking, struct vma_iterator and vma_next do all the
>> heavy lifting.
>>
>> The newly-added struct bpf_iter_task_vma has a name collision with a
>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
>> file is renamed in order to avoid the collision.
>>
>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
>> only field in struct bpf_iter_task_vma. This is because the inner data
>> struct contains a struct vma_iterator (not ptr), whose size is likely to
>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
>> such a change would require change in opaque bpf_iter_task_vma struct's
>> size. So better to allocate vma_iterator using BPF allocator, and since
>> that alloc must already succeed, might as well allocate all iter fields,
>> thereby freezing struct bpf_iter_task_vma size.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> Cc: Nathan Slingerland <slinger@meta.com>
>> ---
>>  include/uapi/linux/bpf.h                      |  4 +
>>  kernel/bpf/helpers.c                          |  3 +
>>  kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>>  tools/include/uapi/linux/bpf.h                |  4 +
>>  tools/lib/bpf/bpf_helpers.h                   |  8 ++
>>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>>  7 files changed, 116 insertions(+), 13 deletions(-)
>>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 8790b3962e4b..49fc1989a548 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>>         __u64 __opaque[1];
>>  } __attribute__((aligned(8)));
>>
>> +struct bpf_iter_task_vma {
>> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
>> +} __attribute__((aligned(8)));
>> +
>>  #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index eb91cae0612a..7a06dea749f1 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>>  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>>  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>>  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index c4ab9d6cdbe9..51c2dce435c1 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -7,7 +7,9 @@
>>  #include <linux/fs.h>
>>  #include <linux/fdtable.h>
>>  #include <linux/filter.h>
>> +#include <linux/bpf_mem_alloc.h>
>>  #include <linux/btf_ids.h>
>> +#include <linux/mm_types.h>
>>  #include "mmap_unlock_work.h"
>>
>>  static const char * const iter_task_type_names[] = {
>> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>>         .arg5_type      = ARG_ANYTHING,
>>  };
>>
>> +struct bpf_iter_task_vma_kern_data {
>> +       struct task_struct *task;
>> +       struct mm_struct *mm;
>> +       struct mmap_unlock_irq_work *work;
>> +       struct vma_iterator vmi;
>> +};
>> +
>> +/* Non-opaque version of uapi bpf_iter_task_vma */
>> +struct bpf_iter_task_vma_kern {
>> +       struct bpf_iter_task_vma_kern_data *data;
>> +} __attribute__((aligned(8)));
>> +
> 
> it's a bit worrying that we'll rely on memory allocation inside NMI to
> be able to use this. I'm missing previous email discussion (I declared
> email bankruptcy after long vacation), but I suppose the option to fix
> bpf_iter_task_vma (to 88 bytes: 64 for vma_iterator + 24 for extra
> pointers), or even to 96 to have a bit of headroom in case we need a
> bit more space was rejected? It seems unlikely that vma_iterator will
> have to grow, but if it does, it has 5 bytes of padding right now for
> various flags, plus we can have extra 8 bytes reserved just in case.
> 
> I know it's a big struct and will take a big chunk of the BPF stack,
> but I'm a bit worried about both the performance implication of mem
> alloc under NMI, and allocation failing.
> 
> Maybe the worry is overblown, but I thought I'll bring it up anyways.
> 

Few tangential trains of thought here, separated by multiple newlines
for easier reading.


IIUC the any-context BPF allocator will not actually allocate memory in NMI
context, instead relying on its existing pre-filled caches.

Alexei's patch adding the allocator says ([0]):

  The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.

So it sounds like bpf_mem_alloc in a kfunc called by a BPF program is NMI-safe.


That's not to say that I'm happy about adding a fallible bpf_mem_alloc call here
before the kfunc can do anything useful. But it seems like the best way to
guarantee that we never see a mailing list message like:

  Hello, I just added a field to 'struct ma_state' in my subsystem and it seems
  I've triggered a BUILD_BUG_ON in this far-away BPF subsystem. It looks like
  you're making stability guarantees based on the size of my internal struct.
  What the hell?

Sure, after I remove the kfuncs and struct bpf_iter_task_vma fwd decl from
bpf_helpers.h - per your other comment below - we can do the whole "kfuncs
aren't uapi and this struct bpf_iter_task_vma is coming from vmlinux.h,
not some stable header" spiel and convince this hypothetical person. Not having
to do the spiel here reinforces the more general "Modern BPF exposes
functionality w/ kfuncs and kptrs, which are inherently _unstable_" messaging
more effectively than having to explain.


If we go back to putting struct vma_iterator on the BPF stack, I think we
definitely want to keep the BUILD_BUG_ON. If it were removed and vma_iterator
size changes, that would affect portability of BPF programs that assume the old
size of bpf_iter_task_vma, no? Which bpf_for_each is doing since it puts
bpf_iter_task_vma on the stack.

Is there some CO-RE technique that would handle above scenario portably? I
can't think of anything straightforward. Maybe if BPF prog BTF only had
a fwd decl for bpf_iter_task_vma, and size thus had to be taken from
vmlinux BTF. But that would fail to compile since the struct goes
on the stack. Maybe use some placeholder size for compilation and use
BTF tag to tell libbpf to patch insns w/ vmlinux's size for this struct?


Re: padding bytes, seems worse to me than not using them. Have to make
assumptions about far-away struct, specifically vma_iterator
which landed quite recently as part of maple tree series. The assumptions
don't prevent my hypothetical mailing list confusion from happening, increases
the confusion if it does happen ("I added a small field recently, why didn't
this break then? If it's explicitly and intentionally unstable, why add
padding bytes?")

  [0]: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com

>> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
>> +                                     struct task_struct *task, u64 addr)
>> +{
>> +       struct bpf_iter_task_vma_kern *kit = (void *)it;
>> +       bool irq_work_busy = false;
>> +       int err;
>> +
> 
> [...]
> 
>>  static void do_mmap_read_unlock(struct irq_work *entry)
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 8790b3962e4b..49fc1989a548 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>>         __u64 __opaque[1];
>>  } __attribute__((aligned(8)));
>>
>> +struct bpf_iter_task_vma {
>> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
>> +} __attribute__((aligned(8)));
>> +
>>  #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
>> index bbab9ad9dc5a..d885ffee4d88 100644
>> --- a/tools/lib/bpf/bpf_helpers.h
>> +++ b/tools/lib/bpf/bpf_helpers.h
>> @@ -302,6 +302,14 @@ extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak
>>  extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
>>  extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
>>
>> +struct bpf_iter_task_vma;
>> +
>> +extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
>> +                                struct task_struct *task,
>> +                                unsigned long addr) __weak __ksym;
>> +extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __weak __ksym;
>> +extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __weak __ksym;
> 
> my intent wasn't to add all open-coded iterators to bpf_helpers.h. I
> think bpf_iter_num_* is rather an exception and isn't supposed to ever
> change or be removed, while other iterators should be allowed to be
> changed.
> 
> The goal is for all such kfuncs (and struct bpf_iter_task_vma state
> itself, probably) to come from vmlinux.h, eventually, so let's leave
> it out of libbpf's stable bpf_helpers.h header.
> 
> 
> [...]

As alluded to in my long response above, this sounds reasonable, will
remove from here.

> 
>> @@ -1533,7 +1533,7 @@ static void test_task_vma_dead_task(void)
>>  out:
>>         waitpid(child_pid, &wstatus, 0);
>>         close(iter_fd);
>> -       bpf_iter_task_vma__destroy(skel);
>> +       bpf_iter_task_vmas__destroy(skel);
>>  }
>>
>>  void test_bpf_sockmap_map_iter_fd(void)
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
>> similarity index 100%
>> rename from tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
>> rename to tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
>> --
>> 2.34.1
>>
Alexei Starovoitov Aug. 23, 2023, 2:57 p.m. UTC | #11
On Tue, Aug 22, 2023 at 10:42 PM David Marchevsky
<david.marchevsky@linux.dev> wrote:
>
> On 8/22/23 8:04 PM, Andrii Nakryiko wrote:
> > On Tue, Aug 22, 2023 at 12:20 PM David Marchevsky
> > <david.marchevsky@linux.dev> wrote:
> >>
> >> On 8/22/23 1:42 PM, Yonghong Song wrote:
> >>>
> >>>
> >>> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
> >>>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> >>>> creation and manipulation of struct bpf_iter_task_vma in open-coded
> >>>> iterator style. BPF programs can use these kfuncs directly or through
> >>>> bpf_for_each macro for natural-looking iteration of all task vmas.
> >>>>
> >>>> The implementation borrows heavily from bpf_find_vma helper's locking -
> >>>> differing only in that it holds the mmap_read lock for all iterations
> >>>> while the helper only executes its provided callback on a maximum of 1
> >>>> vma. Aside from locking, struct vma_iterator and vma_next do all the
> >>>> heavy lifting.
> >>>>
> >>>> The newly-added struct bpf_iter_task_vma has a name collision with a
> >>>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> >>>> file is renamed in order to avoid the collision.
> >>>>
> >>>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> >>>> only field in struct bpf_iter_task_vma. This is because the inner data
> >>>> struct contains a struct vma_iterator (not ptr), whose size is likely to
> >>>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> >>>> such a change would require change in opaque bpf_iter_task_vma struct's
> >>>> size. So better to allocate vma_iterator using BPF allocator, and since
> >>>> that alloc must already succeed, might as well allocate all iter fields,
> >>>> thereby freezing struct bpf_iter_task_vma size.
> >>>>
> >>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >>>> Cc: Nathan Slingerland <slinger@meta.com>
> >>>> ---
> >>>>   include/uapi/linux/bpf.h                      |  4 +
> >>>>   kernel/bpf/helpers.c                          |  3 +
> >>>>   kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> >>>>   tools/include/uapi/linux/bpf.h                |  4 +
> >>>>   tools/lib/bpf/bpf_helpers.h                   |  8 ++
> >>>>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> >>>>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> >>>>   7 files changed, 116 insertions(+), 13 deletions(-)
> >>>>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> >>>>
> >>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>>> index 8790b3962e4b..49fc1989a548 100644
> >>>> --- a/include/uapi/linux/bpf.h
> >>>> +++ b/include/uapi/linux/bpf.h
> >>>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> >>>>       __u64 __opaque[1];
> >>>>   } __attribute__((aligned(8)));
> >>>>   +struct bpf_iter_task_vma {
> >>>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
> >>>> +} __attribute__((aligned(8)));
> >>>
> >>> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
> >>> like
> >>>   struct bpf_iter_<...> {
> >>>     __u64 __opaque[1];
> >>>   } __attribute__((aligned(8)));
> >>>
> >>> Maybe we want a generic one instead of having lots of
> >>> structs with the same underline definition? For example,
> >>>   struct bpf_iter_generic
> >>> ?
> >>>
> >>
> >> The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
> >> and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
> >> of iters would break the scheme. We could:
> >>
> >>   * Add bpf_for_each_generic that only uses bpf_iter_generic
> >>     * This exposes implementation details in an ugly way, though.
> >>   * Do some macro magic to pick bpf_iter_generic for some types of iters, and
> >>     use consistent naming pattern for others.
> >>     * I'm not sure how to do this with preprocessor
> >>   * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
> >>     data struct, and use bpf_iter_generic for all of them
> >>     * Probably need to see more iter implementation / usage before making such
> >>       a change
> >>   * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
> >>     * BTF_KIND_TYPEDEF instead of BTF_KIND_STRUCT might throw off some verifier
> >>       logic. Could do similar typedef w/ struct to try to work around
> >>       it.
> >>
> >> Let me know what you think. Personally I considered doing typedef while
> >> implementing this, so that's the alternative I'd choose.
> >>
> >>>> +
> >>>>   #endif /* _UAPI__LINUX_BPF_H__ */
> >>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> >>>> index eb91cae0612a..7a06dea749f1 100644
> >>>> --- a/kernel/bpf/helpers.c
> >>>> +++ b/kernel/bpf/helpers.c
> >>>> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> >>>>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> >>>>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> >>>>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> >>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> >>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> >>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
> >>>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >>>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >>>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >>>> index c4ab9d6cdbe9..51c2dce435c1 100644
> >>>> --- a/kernel/bpf/task_iter.c
> >>>> +++ b/kernel/bpf/task_iter.c
> >>>> @@ -7,7 +7,9 @@
> >>>>   #include <linux/fs.h>
> >>>>   #include <linux/fdtable.h>
> >>>>   #include <linux/filter.h>
> >>>> +#include <linux/bpf_mem_alloc.h>
> >>>>   #include <linux/btf_ids.h>
> >>>> +#include <linux/mm_types.h>
> >>>>   #include "mmap_unlock_work.h"
> >>>>     static const char * const iter_task_type_names[] = {
> >>>> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> >>>>       .arg5_type    = ARG_ANYTHING,
> >>>>   };
> >>>>   +struct bpf_iter_task_vma_kern_data {
> >>>> +    struct task_struct *task;
> >>>> +    struct mm_struct *mm;
> >>>> +    struct mmap_unlock_irq_work *work;
> >>>> +    struct vma_iterator vmi;
> >>>> +};
> >>>> +
> >>>> +/* Non-opaque version of uapi bpf_iter_task_vma */
> >>>> +struct bpf_iter_task_vma_kern {
> >>>> +    struct bpf_iter_task_vma_kern_data *data;
> >>>> +} __attribute__((aligned(8)));
> >>>> +
> >>>> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> >>>> +                      struct task_struct *task, u64 addr)
> >>>> +{
> >>>> +    struct bpf_iter_task_vma_kern *kit = (void *)it;
> >>>> +    bool irq_work_busy = false;
> >>>> +    int err;
> >>>> +
> >>>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> >>>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> >>>> +
> >>>> +    /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> >>>> +     * before, so non-NULL kit->data doesn't point to previously
> >>>> +     * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> >>>> +     */
> >>>> +    kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
> >>>> +    if (!kit->data)
> >>>> +        return -ENOMEM;
> >>>> +    kit->data->task = NULL;
> >>>> +
> >>>> +    if (!task) {
> >>>> +        err = -ENOENT;
> >>>> +        goto err_cleanup_iter;
> >>>> +    }
> >>>> +
> >>>> +    kit->data->task = get_task_struct(task);
> >>>
> >>> The above is not safe. Since there is no restriction on 'task',
> >>> the 'task' could be in a state for destruction with 'usage' count 0
> >>> and then get_task_struct(task) won't work since it unconditionally
> >>> tries to increase 'usage' count from 0 to 1.
> >>>
> >>> Or, 'task' may be valid at the entry of the function, but when
> >>> 'task' is in get_task_struct(), 'task' may have been destroyed
> >>> and 'task' memory is reused by somebody else.
> >>>
> >>> I suggest that we check input parameter 'task' must be
> >>> PTR_TRUSTED or MEM_RCU. This way, the above !task checking
> >>> is not necessary and get_task_struct() can correctly
> >>> hold a reference to 'task'.
> >>>
> >>
> >> Adding a PTR_TRUSTED or MEM_RCU check seems reasonable. I'm curious
> >> whether there's any way to feed a 'plain' struct task_struct PTR_TO_BTF_ID
> >> to this kfunc currently.
> >>
> >> * bpf_get_current_task_btf helper returns PTR_TRUSTED | PTR_TO_BTF_ID
> >> * ptr hop from trusted task_struct to 'real_parent' or similar should
> >>   yield MEM_RCU (due to BTF_TYPE_SAFE_RCU(struct task_struct) def
> >> * if task kptr is in map_val, direct reference to it should result
> >>   in PTR_UNTRUSTED PTR_TO_BTF_ID, must kptr_xchg it or acquire again
> >>   using bpf_task_from_pid (?)
> >>
> >> But regardless, better to be explicit. Will change.
> >
> > How horrible would it be to base an interface on TID/PID (i.e., int)
> > as input argument to specify a task? I'm just thinking it might be
> > more generic and easy to use in more situations:
> >    - for all the cases where we have struct task_struct, getting its
> > pid is trivial: `task->pid`;
> >    - but in some situations PID might be coming from outside: either
> > as an argument to CLI tool, or from old-style tracepoint (e.g.,
> > context_switch where we have prev/next task pid), etc.
> >
> > The downside is that we'd need to look up a task, right? But on the
> > other hand we get more generality and won't have to rely on having
> > PTR_TRUSTED task_struct.
> >
> > Thoughts?
> >
>
> Meh, taking tid here feels like the 'old-school' approach, before recent
> efforts to teach the verifier more about resource acquisition, locking,
> iteration, trustedness, etc. All allowing us to push more important logic
> out of 'opaque' helper impl and into BPF programs.
>
> In this tid -> struct task_struct case I think the provided resource acquisition
> is sufficient. Using your examples:
>
>   * We have a TRUSTED or RCU struct task_struct
>     * No need to do anything, can pass to bpf_iter_task_vma_new
>
>   * We have a struct task_struct, but it's UNTRUSTED or has no trustedness
>     type flags
>     * Use bpf_task_acquire or bpf_task_from_pid
>
>   * We just have a pid ('coming from outside')
>     * Use bpf_task_from_pid
>
> If there is some scenario where we can't get from pid to properly acquired task
> in the BPF program, let's improve the resource acquisition instead of pushing
> it into the kfunc impl.
>
> Also, should we look up (and refcount incr) the task using task->rcu_users
> refcount w/ bpf_task_{acquire,release}, or using task->usage refcount w/
> {get,put}_task_struct ? More generally, if there are >1 valid ways to acquire
> task_struct or some other resource, pushing acquisition to the BPF program
> gives us the benefit of not having to pick one (or do possibly ugly / complex
> flags). As long as type flags express that the resource will not go away,
> this kfunc impl can ignore the details of how that property came about.

Agree with above reasoning. pid->task should be separate set of kfunc.
It's not a trivial task either. there are pid namespaces. there is u32 pid
and there is 'struct pid'. u32_pid->struct_pid->task has to be designed first.
Probably in some case the trusted task pointer is ready to be accessed,
but context could be such that pid->task cannot be done.
Alexei Starovoitov Aug. 23, 2023, 3:03 p.m. UTC | #12
On Wed, Aug 23, 2023 at 12:26 AM David Marchevsky
<david.marchevsky@linux.dev> wrote:
>
> On 8/22/23 7:52 PM, Andrii Nakryiko wrote:
> > On Mon, Aug 21, 2023 at 10:06 PM Dave Marchevsky <davemarchevsky@fb.com> wrote:
> >>
> >> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_task_vma in open-coded
> >> iterator style. BPF programs can use these kfuncs directly or through
> >> bpf_for_each macro for natural-looking iteration of all task vmas.
> >>
> >> The implementation borrows heavily from bpf_find_vma helper's locking -
> >> differing only in that it holds the mmap_read lock for all iterations
> >> while the helper only executes its provided callback on a maximum of 1
> >> vma. Aside from locking, struct vma_iterator and vma_next do all the
> >> heavy lifting.
> >>
> >> The newly-added struct bpf_iter_task_vma has a name collision with a
> >> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> >> file is renamed in order to avoid the collision.
> >>
> >> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> >> only field in struct bpf_iter_task_vma. This is because the inner data
> >> struct contains a struct vma_iterator (not ptr), whose size is likely to
> >> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> >> such a change would require change in opaque bpf_iter_task_vma struct's
> >> size. So better to allocate vma_iterator using BPF allocator, and since
> >> that alloc must already succeed, might as well allocate all iter fields,
> >> thereby freezing struct bpf_iter_task_vma size.
> >>
> >> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >> Cc: Nathan Slingerland <slinger@meta.com>
> >> ---
> >>  include/uapi/linux/bpf.h                      |  4 +
> >>  kernel/bpf/helpers.c                          |  3 +
> >>  kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> >>  tools/include/uapi/linux/bpf.h                |  4 +
> >>  tools/lib/bpf/bpf_helpers.h                   |  8 ++
> >>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> >>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> >>  7 files changed, 116 insertions(+), 13 deletions(-)
> >>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 8790b3962e4b..49fc1989a548 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> >>         __u64 __opaque[1];
> >>  } __attribute__((aligned(8)));
> >>
> >> +struct bpf_iter_task_vma {
> >> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> >> +} __attribute__((aligned(8)));
> >> +
> >>  #endif /* _UAPI__LINUX_BPF_H__ */
> >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> >> index eb91cae0612a..7a06dea749f1 100644
> >> --- a/kernel/bpf/helpers.c
> >> +++ b/kernel/bpf/helpers.c
> >> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> >>  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> >>  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> >>  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
> >>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> index c4ab9d6cdbe9..51c2dce435c1 100644
> >> --- a/kernel/bpf/task_iter.c
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -7,7 +7,9 @@
> >>  #include <linux/fs.h>
> >>  #include <linux/fdtable.h>
> >>  #include <linux/filter.h>
> >> +#include <linux/bpf_mem_alloc.h>
> >>  #include <linux/btf_ids.h>
> >> +#include <linux/mm_types.h>
> >>  #include "mmap_unlock_work.h"
> >>
> >>  static const char * const iter_task_type_names[] = {
> >> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> >>         .arg5_type      = ARG_ANYTHING,
> >>  };
> >>
> >> +struct bpf_iter_task_vma_kern_data {
> >> +       struct task_struct *task;
> >> +       struct mm_struct *mm;
> >> +       struct mmap_unlock_irq_work *work;
> >> +       struct vma_iterator vmi;
> >> +};
> >> +
> >> +/* Non-opaque version of uapi bpf_iter_task_vma */
> >> +struct bpf_iter_task_vma_kern {
> >> +       struct bpf_iter_task_vma_kern_data *data;
> >> +} __attribute__((aligned(8)));
> >> +
> >
> > it's a bit worrying that we'll rely on memory allocation inside NMI to
> > be able to use this. I'm missing previous email discussion (I declared
> > email bankruptcy after long vacation), but I suppose the option to fix
> > bpf_iter_task_vma (to 88 bytes: 64 for vma_iterator + 24 for extra
> > pointers), or even to 96 to have a bit of headroom in case we need a
> > bit more space was rejected? It seems unlikely that vma_iterator will
> > have to grow, but if it does, it has 5 bytes of padding right now for
> > various flags, plus we can have extra 8 bytes reserved just in case.
> >
> > I know it's a big struct and will take a big chunk of the BPF stack,
> > but I'm a bit worried about both the performance implication of mem
> > alloc under NMI, and allocation failing.
> >
> > Maybe the worry is overblown, but I thought I'll bring it up anyways.
> >
>
> Few tangential trains of thought here, separated by multiple newlines
> for easier reading.
>
>
> IIUC the any-context BPF allocator will not actually allocate memory in NMI
> context, instead relying on its existing pre-filled caches.
>
> Alexei's patch adding the allocator says ([0]):
>
>   The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
>
> So it sounds like bpf_mem_alloc in a kfunc called by a BPF program is NMI-safe.
>
>
> That's not to say that I'm happy about adding a fallible bpf_mem_alloc call here
> before the kfunc can do anything useful. But it seems like the best way to
> guarantee that we never see a mailing list message like:
>
>   Hello, I just added a field to 'struct ma_state' in my subsystem and it seems
>   I've triggered a BUILD_BUG_ON in this far-away BPF subsystem. It looks like
>   you're making stability guarantees based on the size of my internal struct.
>   What the hell?
>
> Sure, after I remove the kfuncs and struct bpf_iter_task_vma fwd decl from
> bpf_helpers.h - per your other comment below - we can do the whole "kfuncs
> aren't uapi and this struct bpf_iter_task_vma is coming from vmlinux.h,
> not some stable header" spiel and convince this hypothetical person. Not having
> to do the spiel here reinforces the more general "Modern BPF exposes
> functionality w/ kfuncs and kptrs, which are inherently _unstable_" messaging
> more effectively than having to explain.
>
>
> If we go back to putting struct vma_iterator on the BPF stack, I think we
> definitely want to keep the BUILD_BUG_ON. If it were removed and vma_iterator
> size changes, that would affect portability of BPF programs that assume the old
> size of bpf_iter_task_vma, no? Which bpf_for_each is doing since it puts
> bpf_iter_task_vma on the stack.
>
> Is there some CO-RE technique that would handle above scenario portably? I
> can't think of anything straightforward. Maybe if BPF prog BTF only had
> a fwd decl for bpf_iter_task_vma, and size thus had to be taken from
> vmlinux BTF. But that would fail to compile since the struct goes
> on the stack. Maybe use some placeholder size for compilation and use
> BTF tag to tell libbpf to patch insns w/ vmlinux's size for this struct?
>
>
> Re: padding bytes, seems worse to me than not using them. Have to make
> assumptions about far-away struct, specifically vma_iterator
> which landed quite recently as part of maple tree series. The assumptions
> don't prevent my hypothetical mailing list confusion from happening, increases
> the confusion if it does happen ("I added a small field recently, why didn't
> this break then? If it's explicitly and intentionally unstable, why add
> padding bytes?")
>
>   [0]: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com

+1
imo struct bpf_iter_task_vma_kern is a bit too big to put on bpf prog stack.
bpf_mem_alloc short term is a lesser evil imo.
Long term we need to think how to extend bpf ISA with alloca.
It's time to figure out how to grow the stack.
Andrii Nakryiko Aug. 23, 2023, 4:55 p.m. UTC | #13
On Wed, Aug 23, 2023 at 7:58 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Aug 22, 2023 at 10:42 PM David Marchevsky
> <david.marchevsky@linux.dev> wrote:
> >
> > On 8/22/23 8:04 PM, Andrii Nakryiko wrote:
> > > On Tue, Aug 22, 2023 at 12:20 PM David Marchevsky
> > > <david.marchevsky@linux.dev> wrote:
> > >>
> > >> On 8/22/23 1:42 PM, Yonghong Song wrote:
> > >>>
> > >>>
> > >>> On 8/21/23 10:05 PM, Dave Marchevsky wrote:
> > >>>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> > >>>> creation and manipulation of struct bpf_iter_task_vma in open-coded
> > >>>> iterator style. BPF programs can use these kfuncs directly or through
> > >>>> bpf_for_each macro for natural-looking iteration of all task vmas.
> > >>>>
> > >>>> The implementation borrows heavily from bpf_find_vma helper's locking -
> > >>>> differing only in that it holds the mmap_read lock for all iterations
> > >>>> while the helper only executes its provided callback on a maximum of 1
> > >>>> vma. Aside from locking, struct vma_iterator and vma_next do all the
> > >>>> heavy lifting.
> > >>>>
> > >>>> The newly-added struct bpf_iter_task_vma has a name collision with a
> > >>>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> > >>>> file is renamed in order to avoid the collision.
> > >>>>
> > >>>> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> > >>>> only field in struct bpf_iter_task_vma. This is because the inner data
> > >>>> struct contains a struct vma_iterator (not ptr), whose size is likely to
> > >>>> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> > >>>> such a change would require change in opaque bpf_iter_task_vma struct's
> > >>>> size. So better to allocate vma_iterator using BPF allocator, and since
> > >>>> that alloc must already succeed, might as well allocate all iter fields,
> > >>>> thereby freezing struct bpf_iter_task_vma size.
> > >>>>
> > >>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> > >>>> Cc: Nathan Slingerland <slinger@meta.com>
> > >>>> ---
> > >>>>   include/uapi/linux/bpf.h                      |  4 +
> > >>>>   kernel/bpf/helpers.c                          |  3 +
> > >>>>   kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> > >>>>   tools/include/uapi/linux/bpf.h                |  4 +
> > >>>>   tools/lib/bpf/bpf_helpers.h                   |  8 ++
> > >>>>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> > >>>>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> > >>>>   7 files changed, 116 insertions(+), 13 deletions(-)
> > >>>>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> > >>>>
> > >>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > >>>> index 8790b3962e4b..49fc1989a548 100644
> > >>>> --- a/include/uapi/linux/bpf.h
> > >>>> +++ b/include/uapi/linux/bpf.h
> > >>>> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> > >>>>       __u64 __opaque[1];
> > >>>>   } __attribute__((aligned(8)));
> > >>>>   +struct bpf_iter_task_vma {
> > >>>> +    __u64 __opaque[1]; /* See bpf_iter_num comment above */
> > >>>> +} __attribute__((aligned(8)));
> > >>>
> > >>> In the future, we might have bpf_iter_cgroup, bpf_iter_task, bpf_iter_cgroup_task, etc. They may all use the same struct
> > >>> like
> > >>>   struct bpf_iter_<...> {
> > >>>     __u64 __opaque[1];
> > >>>   } __attribute__((aligned(8)));
> > >>>
> > >>> Maybe we want a generic one instead of having lots of
> > >>> structs with the same underlying definition? For example,
> > >>>   struct bpf_iter_generic
> > >>> ?
> > >>>
> > >>
> > >> The bpf_for_each macro assumes a consistent naming scheme for opaque iter struct
> > >> and associated kfuncs. Having a 'bpf_iter_generic' shared amongst multiple types
> > >> of iters would break the scheme. We could:
> > >>
> > >>   * Add bpf_for_each_generic that only uses bpf_iter_generic
> > >>     * This exposes implementation details in an ugly way, though.
> > >>   * Do some macro magic to pick bpf_iter_generic for some types of iters, and
> > >>     use consistent naming pattern for others.
> > >>     * I'm not sure how to do this with preprocessor
> > >>   * Migrate all opaque iter structs to only contain pointer to bpf_mem_alloc'd
> > >>     data struct, and use bpf_iter_generic for all of them
> > >>     * Probably need to see more iter implementation / usage before making such
> > >>       a change
> > >>   * Do 'typedef __u64 __aligned(8) bpf_iter_<...>
> > >>     * BTF_KIND_TYPEDEF intead of BTF_KIND_STRUCT might throw off some verifier
> > >>       logic. Could do similar typedef w/ struct to try to work around
> > >>       it.
> > >>
> > >> Let me know what you think. Personally I considered doing typedef while
> > >> implementing this, so that's the alternative I'd choose.
> > >>
> > >>>> +
> > >>>>   #endif /* _UAPI__LINUX_BPF_H__ */
> > >>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > >>>> index eb91cae0612a..7a06dea749f1 100644
> > >>>> --- a/kernel/bpf/helpers.c
> > >>>> +++ b/kernel/bpf/helpers.c
> > >>>> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> > >>>>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> > >>>>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> > >>>>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> > >>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> > >>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> > >>>> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
> > >>>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> > >>>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> > >>>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> > >>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > >>>> index c4ab9d6cdbe9..51c2dce435c1 100644
> > >>>> --- a/kernel/bpf/task_iter.c
> > >>>> +++ b/kernel/bpf/task_iter.c
> > >>>> @@ -7,7 +7,9 @@
> > >>>>   #include <linux/fs.h>
> > >>>>   #include <linux/fdtable.h>
> > >>>>   #include <linux/filter.h>
> > >>>> +#include <linux/bpf_mem_alloc.h>
> > >>>>   #include <linux/btf_ids.h>
> > >>>> +#include <linux/mm_types.h>
> > >>>>   #include "mmap_unlock_work.h"
> > >>>>     static const char * const iter_task_type_names[] = {
> > >>>> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> > >>>>       .arg5_type    = ARG_ANYTHING,
> > >>>>   };
> > >>>>   +struct bpf_iter_task_vma_kern_data {
> > >>>> +    struct task_struct *task;
> > >>>> +    struct mm_struct *mm;
> > >>>> +    struct mmap_unlock_irq_work *work;
> > >>>> +    struct vma_iterator vmi;
> > >>>> +};
> > >>>> +
> > >>>> +/* Non-opaque version of uapi bpf_iter_task_vma */
> > >>>> +struct bpf_iter_task_vma_kern {
> > >>>> +    struct bpf_iter_task_vma_kern_data *data;
> > >>>> +} __attribute__((aligned(8)));
> > >>>> +
> > >>>> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > >>>> +                      struct task_struct *task, u64 addr)
> > >>>> +{
> > >>>> +    struct bpf_iter_task_vma_kern *kit = (void *)it;
> > >>>> +    bool irq_work_busy = false;
> > >>>> +    int err;
> > >>>> +
> > >>>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> > >>>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> > >>>> +
> > >>>> +    /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> > >>>> +     * before, so non-NULL kit->data doesn't point to previously
> > >>>> +     * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> > >>>> +     */
> > >>>> +    kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
> > >>>> +    if (!kit->data)
> > >>>> +        return -ENOMEM;
> > >>>> +    kit->data->task = NULL;
> > >>>> +
> > >>>> +    if (!task) {
> > >>>> +        err = -ENOENT;
> > >>>> +        goto err_cleanup_iter;
> > >>>> +    }
> > >>>> +
> > >>>> +    kit->data->task = get_task_struct(task);
> > >>>
> > >>> The above is not safe. Since there is no restriction on 'task',
> > >>> the 'task' could be in a state for destruction with 'usage' count 0
> > >>> and then get_task_struct(task) won't work since it unconditionally
> > >>> tries to increase 'usage' count from 0 to 1.
> > >>>
> > >>> Or, 'task' may be valid at the entry of the function, but when
> > >>> 'task' is in get_task_struct(), 'task' may have been destroyed
> > >>> and 'task' memory is reused by somebody else.
> > >>>
> > >>> I suggest that we check input parameter 'task' must be
> > >>> PTR_TRUSTED or MEM_RCU. This way, the above !task checking
> > >>> is not necessary and get_task_struct() can correctly
> > >>> hold a reference to 'task'.
> > >>>
> > >>
> > >> Adding a PTR_TRUSTED or MEM_RCU check seems reasonable. I'm curious
> > >> whether there's any way to feed a 'plain' struct task_struct PTR_TO_BTF_ID
> > >> to this kfunc currently.
> > >>
> > >> * bpf_get_current_task_btf helper returns PTR_TRUSTED | PTR_TO_BTF_ID
> > >> * ptr hop from trusted task_struct to 'real_parent' or similar should
> > >>   yield MEM_RCU (due to BTF_TYPE_SAFE_RCU(struct task_struct) def)
> > >> * if task kptr is in map_val, direct reference to it should result
> > >>   in PTR_UNTRUSTED PTR_TO_BTF_ID, must kptr_xchg it or acquire again
> > >>   using bpf_task_from_pid (?)
> > >>
> > >> But regardless, better to be explicit. Will change.
> > >
> > > How horrible would it be to base an interface on TID/PID (i.e., int)
> > > as input argument to specify a task? I'm just thinking it might be
> > > more generic and easy to use in more situations:
> > >    - for all the cases where we have struct task_struct, getting its
> > > pid is trivial: `task->pid`;
> > >    - but in some situations PID might be coming from outside: either
> > > as an argument to CLI tool, or from old-style tracepoint (e.g.,
> > > context_switch where we have prev/next task pid), etc.
> > >
> > > The downside is that we'd need to look up a task, right? But on the
> > > other hand we get more generality and won't have to rely on having
> > > PTR_TRUSTED task_struct.
> > >
> > > Thoughts?
> > >
> >
> > Meh, taking tid here feels like the 'old-school' approach, before recent
> > efforts to teach the verifier more about resource acquisition, locking,
> > iteration, trustedness, etc. All allowing us to push more important logic
> > out of 'opaque' helper impl and into BPF programs.
> >
> > In this tid -> struct task_struct case I think the provided resource acquisition
> > is sufficient. Using your examples:
> >
> >   * We have a TRUSTED or RCU struct task_struct
> >     * No need to do anything, can pass to bpf_iter_task_vma_new
> >
> >   * We have a struct task_struct, but it's UNTRUSTED or has no trustedness
> >     type flags
> >     * Use bpf_task_acquire or bpf_task_from_pid
> >
> >   * We just have a pid ('coming from outside')
> >     * Use bpf_task_from_pid
> >
> > If there is some scenario where we can't get from pid to properly acquired task
> > in the BPF program, let's improve the resource acquisition instead of pushing
> > it into the kfunc impl.
> >
> > Also, should we look up (and refcount incr) the task using task->rcu_users
> > refcount w/ bpf_task_{acquire,release}, or using task->usage refcount w/
> > {get,put}_task_struct ? More generally, if there are >1 valid ways to acquire
> > task_struct or some other resource, pushing acquisition to the BPF program
> > gives us the benefit of not having to pick one (or do possibly ugly / complex
> > flags). As long as type flags express that the resource will not go away,
> > this kfunc impl can ignore the details of how that property came about.
>
> Agree with above reasoning. pid->task should be separate set of kfunc.
> It's not a trivial task either. there are pid namespaces. there is u32 pid
> and there is 'struct pid'. u32_pid->struct_pid->task has to be designed first.
> Probably in some case the trusted task pointer is ready to be accessed,
> but context could be such that pid->task cannot be done.

makes sense, having bpf_task_from_pid() removes my concerns, I forgot about it.
Andrii Nakryiko Aug. 23, 2023, 5:07 p.m. UTC | #14
On Wed, Aug 23, 2023 at 12:26 AM David Marchevsky
<david.marchevsky@linux.dev> wrote:
>
> On 8/22/23 7:52 PM, Andrii Nakryiko wrote:
> > On Mon, Aug 21, 2023 at 10:06 PM Dave Marchevsky <davemarchevsky@fb.com> wrote:
> >>
> >> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_task_vma in open-coded
> >> iterator style. BPF programs can use these kfuncs directly or through
> >> bpf_for_each macro for natural-looking iteration of all task vmas.
> >>
> >> The implementation borrows heavily from bpf_find_vma helper's locking -
> >> differing only in that it holds the mmap_read lock for all iterations
> >> while the helper only executes its provided callback on a maximum of 1
> >> vma. Aside from locking, struct vma_iterator and vma_next do all the
> >> heavy lifting.
> >>
> >> The newly-added struct bpf_iter_task_vma has a name collision with a
> >> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> >> file is renamed in order to avoid the collision.
> >>
> >> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> >> only field in struct bpf_iter_task_vma. This is because the inner data
> >> struct contains a struct vma_iterator (not ptr), whose size is likely to
> >> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> >> such a change would require change in opaque bpf_iter_task_vma struct's
> >> size. So better to allocate vma_iterator using BPF allocator, and since
> >> that alloc must already succeed, might as well allocate all iter fields,
> >> thereby freezing struct bpf_iter_task_vma size.
> >>
> >> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >> Cc: Nathan Slingerland <slinger@meta.com>
> >> ---
> >>  include/uapi/linux/bpf.h                      |  4 +
> >>  kernel/bpf/helpers.c                          |  3 +
> >>  kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> >>  tools/include/uapi/linux/bpf.h                |  4 +
> >>  tools/lib/bpf/bpf_helpers.h                   |  8 ++
> >>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> >>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> >>  7 files changed, 116 insertions(+), 13 deletions(-)
> >>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 8790b3962e4b..49fc1989a548 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> >>         __u64 __opaque[1];
> >>  } __attribute__((aligned(8)));
> >>
> >> +struct bpf_iter_task_vma {
> >> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> >> +} __attribute__((aligned(8)));
> >> +
> >>  #endif /* _UAPI__LINUX_BPF_H__ */
> >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> >> index eb91cae0612a..7a06dea749f1 100644
> >> --- a/kernel/bpf/helpers.c
> >> +++ b/kernel/bpf/helpers.c
> >> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> >>  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> >>  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> >>  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
> >>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> index c4ab9d6cdbe9..51c2dce435c1 100644
> >> --- a/kernel/bpf/task_iter.c
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -7,7 +7,9 @@
> >>  #include <linux/fs.h>
> >>  #include <linux/fdtable.h>
> >>  #include <linux/filter.h>
> >> +#include <linux/bpf_mem_alloc.h>
> >>  #include <linux/btf_ids.h>
> >> +#include <linux/mm_types.h>
> >>  #include "mmap_unlock_work.h"
> >>
> >>  static const char * const iter_task_type_names[] = {
> >> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> >>         .arg5_type      = ARG_ANYTHING,
> >>  };
> >>
> >> +struct bpf_iter_task_vma_kern_data {
> >> +       struct task_struct *task;
> >> +       struct mm_struct *mm;
> >> +       struct mmap_unlock_irq_work *work;
> >> +       struct vma_iterator vmi;
> >> +};
> >> +
> >> +/* Non-opaque version of uapi bpf_iter_task_vma */
> >> +struct bpf_iter_task_vma_kern {
> >> +       struct bpf_iter_task_vma_kern_data *data;
> >> +} __attribute__((aligned(8)));
> >> +
> >
> > it's a bit worrying that we'll rely on memory allocation inside NMI to
> > be able to use this. I'm missing previous email discussion (I declared
> > email bankruptcy after long vacation), but I suppose the option to fix
> > bpf_iter_task_vma (to 88 bytes: 64 for vma_iterator + 24 for extra
> > pointers), or even to 96 to have a bit of headroom in case we need a
> > bit more space was rejected? It seems unlikely that vma_iterator will
> > have to grow, but if it does, it has 5 bytes of padding right now for
> > various flags, plus we can have extra 8 bytes reserved just in case.
> >
> > I know it's a big struct and will take a big chunk of the BPF stack,
> > but I'm a bit worried about both the performance implication of mem
> > alloc under NMI, and allocation failing.
> >
> > Maybe the worry is overblown, but I thought I'll bring it up anyways.
> >
>
> Few tangential trains of thought here, separated by multiple newlines
> for easier reading.
>
>
> IIUC the any-context BPF allocator will not actually allocate memory in NMI
> context, instead relying on its existing pre-filled caches.
>
> Alexei's patch adding the allocator says ([0]):
>
>   The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
>
> So it sounds like bpf_mem_alloc in a kfunc called by a BPF program is NMI-safe.

Right. My concern wasn't about safety of allocation, but rather about
it being drawn from a limited pool and potentially returning -ENOMEM
even if the system is not actually out of memory.

I guess only the actual usage in a big fleet will show how often
-ENOMEM comes up. It might be not a problem at all, which is why I'm
saying that my worries might be overblown.

>
>
> That's not to say that I'm happy about adding a fallible bpf_mem_alloc call here
> before the kfunc can do anything useful. But it seems like the best way to
> guarantee that we never see a mailing list message like:
>
>   Hello, I just added a field to 'struct ma_state' in my subsystem and it seems
>   I've triggered a BUILD_BUG_ON in this far-away BPF subsystem. It looks like
>   you're making stability guarantees based on the size of my internal struct.
>   What the hell?
>
> Sure, after I remove the kfuncs and struct bpf_iter_task_vma fwd decl from
> bpf_helpers.h - per your other comment below - we can do the whole "kfuncs
> aren't uapi and this struct bpf_iter_task_vma is coming from vmlinux.h,
> not some stable header" spiel and convince this hypothetical person. Not having
> to do the spiel here reinforces the more general "Modern BPF exposes
> functionality w/ kfuncs and kptrs, which are inherently _unstable_" messaging
> more effectively than having to explain.
>
>
> If we go back to putting struct vma_iterator on the BPF stack, I think we
> definitely want to keep the BUILD_BUG_ON. If it were removed and vma_iterator
> size changes, that would affect portability of BPF programs that assume the old
> size of bpf_iter_task_vma, no? Which bpf_for_each is doing since it puts
> bpf_iter_task_vma on the stack.

Yes, I think we'll have to have BUILD_BUG_ON. And yes, whoever
increases vma_iter by more than 13 bytes will have to interact with
us.

But my thinking was that in *unlikely* case of this happening, given
the unstable API rules, we'll just drop "v1" of task_vma iterator,
both kfuncs and iter struct itself, and will introduce v2 with a
bigger size of the state. We definitely don't want to just change the
size of the iter state struct, that will cause not just confusion, but
potentially stack variables clobbering, if old definition is used.

And the above seems acceptable only because it seems that vma_iter
changing its size so dramatically is very unlikely. Kernel folks won't
be increasing the already pretty big struct just for the fun of it.


>
> Is there some CO-RE technique that would handle above scenario portably? I
> can't think of anything straightforward. Maybe if BPF prog BTF only had
> a fwd decl for bpf_iter_task_vma, and size thus had to be taken from
> vmlinux BTF. But that would fail to compile since the struct goes
> on the stack. Maybe use some placeholder size for compilation and use
> BTF tag to tell libbpf to patch insns w/ vmlinux's size for this struct?
>

I don't think there is any magic that could be done when on the stack
variable size changes. But if we do v1 vs v2 change (if necessary),
then we have mechanisms to detect the presence of new kfuncs and then
choosing proper iterator (task_vma vs task_vma2, something like that).


>
> Re: padding bytes, seems worse to me than not using them. Have to make
> assumptions about far-away struct, specifically vma_iterator
> which landed quite recently as part of maple tree series. The assumptions
> don't prevent my hypothetical mailing list confusion from happening, and
> increase the confusion if it does happen ("I added a small field recently, why didn't
> this break then? If it's explicitly and intentionally unstable, why add
> padding bytes?")
>

To prevent unnecessary user breakage. Just because something is
unstable doesn't mean we want to break it every kernel release, right?
Otherwise why don't we rename each kfunc just for the fun of it, every
release ;) 8 extra bytes on the stack seems like a negligible price
for ensuring less user pain in the long run.

>   [0]: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com
>
> >> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> >> +                                     struct task_struct *task, u64 addr)
> >> +{
> >> +       struct bpf_iter_task_vma_kern *kit = (void *)it;
> >> +       bool irq_work_busy = false;
> >> +       int err;
> >> +
> >
> > [...]
> >
> >>  static void do_mmap_read_unlock(struct irq_work *entry)
> >> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> >> index 8790b3962e4b..49fc1989a548 100644
> >> --- a/tools/include/uapi/linux/bpf.h
> >> +++ b/tools/include/uapi/linux/bpf.h
> >> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> >>         __u64 __opaque[1];
> >>  } __attribute__((aligned(8)));
> >>
> >> +struct bpf_iter_task_vma {
> >> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> >> +} __attribute__((aligned(8)));
> >> +
> >>  #endif /* _UAPI__LINUX_BPF_H__ */
> >> diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
> >> index bbab9ad9dc5a..d885ffee4d88 100644
> >> --- a/tools/lib/bpf/bpf_helpers.h
> >> +++ b/tools/lib/bpf/bpf_helpers.h
> >> @@ -302,6 +302,14 @@ extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak
> >>  extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
> >>  extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
> >>
> >> +struct bpf_iter_task_vma;
> >> +
> >> +extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> >> +                                struct task_struct *task,
> >> +                                unsigned long addr) __weak __ksym;
> >> +extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __weak __ksym;
> >> +extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __weak __ksym;
> >
> > my intent wasn't to add all open-coded iterators to bpf_helpers.h. I
> > think bpf_iter_num_* is rather an exception and isn't supposed to ever
> > change or be removed, while other iterators should be allowed to be
> > changed.
> >
> > The goal is for all such kfuncs (and struct bpf_iter_task_vma state
> > itself, probably) to come from vmlinux.h, eventually, so let's leave
> > it out of libbpf's stable bpf_helpers.h header.
> >
> >
> > [...]
>
> As alluded to in my long response above, this sounds reasonable, will
> remove from here.
>

Cool.

> >
> >> @@ -1533,7 +1533,7 @@ static void test_task_vma_dead_task(void)
> >>  out:
> >>         waitpid(child_pid, &wstatus, 0);
> >>         close(iter_fd);
> >> -       bpf_iter_task_vma__destroy(skel);
> >> +       bpf_iter_task_vmas__destroy(skel);
> >>  }
> >>
> >>  void test_bpf_sockmap_map_iter_fd(void)
> >> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
> >> similarity index 100%
> >> rename from tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> >> rename to tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
> >> --
> >> 2.34.1
> >>
Andrii Nakryiko Aug. 23, 2023, 5:14 p.m. UTC | #15
On Wed, Aug 23, 2023 at 8:03 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 23, 2023 at 12:26 AM David Marchevsky
> <david.marchevsky@linux.dev> wrote:
> >
> > On 8/22/23 7:52 PM, Andrii Nakryiko wrote:
> > > On Mon, Aug 21, 2023 at 10:06 PM Dave Marchevsky <davemarchevsky@fb.com> wrote:
> > >>
> > >> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> > >> creation and manipulation of struct bpf_iter_task_vma in open-coded
> > >> iterator style. BPF programs can use these kfuncs directly or through
> > >> bpf_for_each macro for natural-looking iteration of all task vmas.
> > >>
> > >> The implementation borrows heavily from bpf_find_vma helper's locking -
> > >> differing only in that it holds the mmap_read lock for all iterations
> > >> while the helper only executes its provided callback on a maximum of 1
> > >> vma. Aside from locking, struct vma_iterator and vma_next do all the
> > >> heavy lifting.
> > >>
> > >> The newly-added struct bpf_iter_task_vma has a name collision with a
> > >> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> > >> file is renamed in order to avoid the collision.
> > >>
> > >> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> > >> only field in struct bpf_iter_task_vma. This is because the inner data
> > >> struct contains a struct vma_iterator (not ptr), whose size is likely to
> > >> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> > >> such a change would require change in opaque bpf_iter_task_vma struct's
> > >> size. So better to allocate vma_iterator using BPF allocator, and since
> > >> that alloc must already succeed, might as well allocate all iter fields,
> > >> thereby freezing struct bpf_iter_task_vma size.
> > >>
> > >> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> > >> Cc: Nathan Slingerland <slinger@meta.com>
> > >> ---
> > >>  include/uapi/linux/bpf.h                      |  4 +
> > >>  kernel/bpf/helpers.c                          |  3 +
> > >>  kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
> > >>  tools/include/uapi/linux/bpf.h                |  4 +
> > >>  tools/lib/bpf/bpf_helpers.h                   |  8 ++
> > >>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
> > >>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> > >>  7 files changed, 116 insertions(+), 13 deletions(-)
> > >>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> > >>
> > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > >> index 8790b3962e4b..49fc1989a548 100644
> > >> --- a/include/uapi/linux/bpf.h
> > >> +++ b/include/uapi/linux/bpf.h
> > >> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
> > >>         __u64 __opaque[1];
> > >>  } __attribute__((aligned(8)));
> > >>
> > >> +struct bpf_iter_task_vma {
> > >> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> > >> +} __attribute__((aligned(8)));
> > >> +
> > >>  #endif /* _UAPI__LINUX_BPF_H__ */
> > >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > >> index eb91cae0612a..7a06dea749f1 100644
> > >> --- a/kernel/bpf/helpers.c
> > >> +++ b/kernel/bpf/helpers.c
> > >> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> > >>  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> > >>  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> > >>  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> > >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> > >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> > >> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
> > >>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> > >>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> > >>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> > >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > >> index c4ab9d6cdbe9..51c2dce435c1 100644
> > >> --- a/kernel/bpf/task_iter.c
> > >> +++ b/kernel/bpf/task_iter.c
> > >> @@ -7,7 +7,9 @@
> > >>  #include <linux/fs.h>
> > >>  #include <linux/fdtable.h>
> > >>  #include <linux/filter.h>
> > >> +#include <linux/bpf_mem_alloc.h>
> > >>  #include <linux/btf_ids.h>
> > >> +#include <linux/mm_types.h>
> > >>  #include "mmap_unlock_work.h"
> > >>
> > >>  static const char * const iter_task_type_names[] = {
> > >> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> > >>         .arg5_type      = ARG_ANYTHING,
> > >>  };
> > >>
> > >> +struct bpf_iter_task_vma_kern_data {
> > >> +       struct task_struct *task;
> > >> +       struct mm_struct *mm;
> > >> +       struct mmap_unlock_irq_work *work;
> > >> +       struct vma_iterator vmi;
> > >> +};
> > >> +
> > >> +/* Non-opaque version of uapi bpf_iter_task_vma */
> > >> +struct bpf_iter_task_vma_kern {
> > >> +       struct bpf_iter_task_vma_kern_data *data;
> > >> +} __attribute__((aligned(8)));
> > >> +
> > >
> > > it's a bit worrying that we'll rely on memory allocation inside NMI to
> > > be able to use this. I'm missing previous email discussion (I declared
> > > email bankruptcy after long vacation), but I suppose the option to fix
> > > bpf_iter_task_vma (to 88 bytes: 64 for vma_iterator + 24 for extra
> > > pointers), or even to 96 to have a bit of headroom in case we need a
> > > bit more space was rejected? It seems unlikely that vma_iterator will
> > > have to grow, but if it does, it has 5 bytes of padding right now for
> > > various flags, plus we can have extra 8 bytes reserved just in case.
> > >
> > > I know it's a big struct and will take a big chunk of the BPF stack,
> > > but I'm a bit worried about both the performance implication of mem
> > > alloc under NMI, and allocation failing.
> > >
> > > Maybe the worry is overblown, but I thought I'll bring it up anyways.
> > >
> >
> > Few tangential trains of thought here, separated by multiple newlines
> > for easier reading.
> >
> >
> > IIUC the any-context BPF allocator will not actually allocate memory in NMI
> > context, instead relying on its existing pre-filled caches.
> >
> > Alexei's patch adding the allocator says ([0]):
> >
> >   The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
> >
> > So it sounds like bpf_mem_alloc in a kfunc called by a BPF program is NMI-safe.
> >
> >
> > That's not to say that I'm happy about adding a fallible bpf_mem_alloc call here
> > before the kfunc can do anything useful. But it seems like the best way to
> > guarantee that we never see a mailing list message like:
> >
> >   Hello, I just added a field to 'struct ma_state' in my subsystem and it seems
> >   I've triggered a BUILD_BUG_ON in this far-away BPF subsystem. It looks like
> >   you're making stability guarantees based on the size of my internal struct.
> >   What the hell?
> >
> > Sure, after I remove the kfuncs and struct bpf_iter_task_vma fwd decl from
> > bpf_helpers.h - per your other comment below - we can do the whole "kfuncs
> > aren't uapi and this struct bpf_iter_task_vma is coming from vmlinux.h,
> > not some stable header" spiel and convince this hypothetical person. Not having
> > to do the spiel here reinforces the more general "Modern BPF exposes
> > functionality w/ kfuncs and kptrs, which are inherently _unstable_" messaging
> > more effectively than having to explain.
> >
> >
> > If we go back to putting struct vma_iterator on the BPF stack, I think we
> > definitely want to keep the BUILD_BUG_ON. If it were removed and vma_iterator
> > size changes, that would affect portability of BPF programs that assume the old
> > size of bpf_iter_task_vma, no? Which bpf_for_each is doing since it puts
> > bpf_iter_task_vma on the stack.
> >
> > Is there some CO-RE technique that would handle above scenario portably? I
> > can't think of anything straightforward. Maybe if BPF prog BTF only had
> > a fwd decl for bpf_iter_task_vma, and size thus had to be taken from
> > vmlinux BTF. But that would fail to compile since the struct goes
> > on the stack. Maybe use some placeholder size for compilation and use
> > BTF tag to tell libbpf to patch insns w/ vmlinux's size for this struct?
> >
> >
> > Re: padding bytes, seems worse to me than not using them. Have to make
> > assumptions about far-away struct, specifically vma_iterator
> > which landed quite recently as part of maple tree series. The assumptions
> > don't prevent my hypothetical mailing list confusion from happening, and
> > increase the confusion if it does happen ("I added a small field recently, why didn't
> > this break then? If it's explicitly and intentionally unstable, why add
> > padding bytes?")
> >
> >   [0]: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com
>
> +1
> imo struct bpf_iter_task_vma_kern is a bit too big to put on bpf prog stack.
> bpf_mem_alloc short term is a lesser evil imo.

Yep, a bit big, though still reasonable to put on the stack. As I
mentioned above, my worry is the unreliability of being able to
instantiate task_vma iterator in NMI context (which is probably where
this will be used often). We'll see if this is actually a problem in
practice once this is used and deployed fleet-wide. If it is, we might
want to revisit it by introducing v2.

> Long term we need to think how to extend bpf ISA with alloca.

To be frank, I'm not following how alloca is relevant here. We don't
have anything dynamically sized on the stack.

Unless you envision protocol where we have a separate function to get
size of iter struct, then alloca enough space, then pass that to
bpf_iter_xxx_new()? Not sure whether this is statically verifiable,
but given it's long-term, we can put it on backburner for now.

> It's time to figure out how to grow the stack.

We effectively already grow the stack based on exact prog's usage.
It's more of "how to support a bigger stack" during verification with
reasonable increase in memory and cpu usage.


Anyways, as I said, I suspected the answer will be what it is, but
documenting the reasons and what alternatives were considered is still
worthwhile, IMO. Thanks for the discussion.
Alexei Starovoitov Aug. 23, 2023, 5:26 p.m. UTC | #16
On Wed, Aug 23, 2023 at 10:07 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> Yes, I think we'll have to have BUILD_BUG_ON. And yes, whoever
> increases vma_iter by more than 13 bytes will have to interact with
> us.

A bit of history.
Before maple tree the vma_iterator didn't exist. vma_next would walk rb tree.
So if we introduced task_vma iterator couple years ago the maple tree
change would have grown our bpf_iter_task_vma by 64 bytes.
If we reserved an 8 or 16 byte gap it wouldn't have helped.
Hence it's wishful thinking that a gap might help in the future.

Direct stack alloc of kernel data structure is also dangerous in
presence of kernel debug knobs. There are no locks inside vma_iterator
at the moment, but if it was there we wouldn't be able to use it
on bpf prog stack regardless of its size, because lockdep on/off
would have changed the size.
I think it's always better to have extra indirection between bpf prog
and kernel object.
Andrii Nakryiko Aug. 23, 2023, 5:43 p.m. UTC | #17
On Wed, Aug 23, 2023 at 10:26 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 23, 2023 at 10:07 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > Yes, I think we'll have to have BUILD_BUG_ON. And yes, whoever
> > increases vma_iter by more than 13 bytes will have to interact with
> > us.
>
> A bit of history.
> Before maple tree the vma_iterator didn't exist. vma_next would walk rb tree.
> So if we introduced task_vma iterator couple years ago the maple tree
> change would have grown our bpf_iter_task_vma by 64 bytes.
> If we reserved an 8 or 16 byte gap it wouldn't have helped.

Yep, we'd have to introduce v2, of course. And if tomorrow we switch
from maple tree to something else, we'd probably need another version
of iterator as well. I made no claims that padding will be a
future-proof approach, just a pragmatic mitigation of small reasonable
changes in a struct we can't control.

> Hence it's wishful thinking that a gap might help in the future.
>
> Direct stack alloc of kernel data structure is also dangerous in
> presence of kernel debug knobs. There are no locks inside vma_iterator
> at the moment, but if it was there we wouldn't be able to use it
> on bpf prog stack regardless of its size, because lockdep on/off
> would have changed the size.
> I think it's always better to have extra indirection between bpf prog
> and kernel object.

It's a tradeoff and I understand the pros and cons. Let's do mem_alloc
and see how that works in practice.
Alexei Starovoitov Aug. 23, 2023, 5:53 p.m. UTC | #18
On Wed, Aug 23, 2023 at 10:14 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> > Long term we need to think how to extend bpf ISA with alloca.
>
> To be frank, I'm not following how alloca is relevant here. We don't
> have anything dynamically sized on the stack.
>
> Unless you envision protocol where we have a separate function to get
> size of iter struct, then alloca enough space, then pass that to
> bpf_iter_xxx_new()? Not sure whether this is statically verifiable,
> but given it's long-term, we can put it on backburner for now.

With alloca bpf_for_each() macro can allocate whatever stack necessary.

In other words:

struct bpf_iter_task_vma *it;

it = bpf_alloca(bpf_core_type_size(struct bpf_iter_task_vma));
bpf_for_each2(task_vma, it, ...) { .. }

While struct bpf_iter_task_vma can come from vmlinux.h

size of kern data struct is CO-RE-able, so no worries about increase
in size due to maple tree or lockdep on/off.
And no concern of failing allocation at run-time.
(the verifier would reject big stack alloc at load time).
Andrii Nakryiko Aug. 23, 2023, 6:13 p.m. UTC | #19
On Wed, Aug 23, 2023 at 10:53 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 23, 2023 at 10:14 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > > Long term we need to think how to extend bpf ISA with alloca.
> >
> > To be frank, I'm not following how alloca is relevant here. We don't
> > have anything dynamically sized on the stack.
> >
> > Unless you envision protocol where we have a separate function to get
> > size of iter struct, then alloca enough space, then pass that to
> > bpf_iter_xxx_new()? Not sure whether this is statically verifiable,
> > but given it's long-term, we can put it on backburner for now.
>
> With alloca bpf_for_each() macro can allocate whatever stack necessary.
>
> In other words:
>
> struct bpf_iter_task_vma *it;
>
> it = bpf_alloca(bpf_core_type_size(struct bpf_iter_task_vma));
> bpf_for_each2(task_vma, it, ...) { .. }

ah, I see, not a dedicated kfunc, just CO-RE relocation. Makes sense.

>
> While struct bpf_iter_task_vma can come from vmlinux.h
>
> size of kern data struct is CO-RE-able, so no worries about increase
> in size due to maple tree or lockdep on/off.
> And no concern of failing allocation at run-time.
> (the verifier would reject big stack alloc at load time).

yep, makes sense, the size will be statically known to the verifier. I
was overcomplicating this in my mind with extra kfunc.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8790b3962e4b..49fc1989a548 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7311,4 +7311,8 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_task_vma {
+	__u64 __opaque[1]; /* See bpf_iter_num comment above */
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..7a06dea749f1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2482,6 +2482,9 @@  BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..51c2dce435c1 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,7 +7,9 @@ 
 #include <linux/fs.h>
 #include <linux/fdtable.h>
 #include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
 #include <linux/btf_ids.h>
+#include <linux/mm_types.h>
 #include "mmap_unlock_work.h"
 
 static const char * const iter_task_type_names[] = {
@@ -823,6 +825,88 @@  const struct bpf_func_proto bpf_find_vma_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_iter_task_vma_kern_data {
+	struct task_struct *task;
+	struct mm_struct *mm;
+	struct mmap_unlock_irq_work *work;
+	struct vma_iterator vmi;
+};
+
+/* Non-opaque version of uapi bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+	struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+				      struct task_struct *task, u64 addr)
+{
+	struct bpf_iter_task_vma_kern *kit = (void *)it;
+	bool irq_work_busy = false;
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+	/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+	 * before, so non-NULL kit->data doesn't point to previously
+	 * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+	 */
+	kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+	if (!kit->data)
+		return -ENOMEM;
+	kit->data->task = NULL;
+
+	if (!task) {
+		err = -ENOENT;
+		goto err_cleanup_iter;
+	}
+
+	kit->data->task = get_task_struct(task);
+	kit->data->mm = task->mm;
+	if (!kit->data->mm) {
+		err = -ENOENT;
+		goto err_cleanup_iter;
+	}
+
+	/* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+	irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+	if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+		err = -EBUSY;
+		goto err_cleanup_iter;
+	}
+
+	vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+	return 0;
+
+err_cleanup_iter:
+	if (kit->data->task)
+		put_task_struct(kit->data->task);
+	bpf_mem_free(&bpf_global_ma, kit->data);
+	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
+	kit->data = NULL;
+	return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+	struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+	if (!kit->data) /* bpf_iter_task_vma_new failed */
+		return NULL;
+	return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+	struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+	if (kit->data) {
+		bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+		put_task_struct(kit->data->task);
+		bpf_mem_free(&bpf_global_ma, kit->data);
+	}
+}
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8790b3962e4b..49fc1989a548 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7311,4 +7311,8 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_task_vma {
+	__u64 __opaque[1]; /* See bpf_iter_num comment above */
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index bbab9ad9dc5a..d885ffee4d88 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -302,6 +302,14 @@  extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak
 extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
 extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
 
+struct bpf_iter_task_vma;
+
+extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+				 struct task_struct *task,
+				 unsigned long addr) __weak __ksym;
+extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __weak __ksym;
+extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __weak __ksym;
+
 #ifndef bpf_for_each
 /* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
  * using BPF open-coded iterators without having to write mundane explicit
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 1f02168103dd..41aba139b20b 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -10,7 +10,7 @@ 
 #include "bpf_iter_task.skel.h"
 #include "bpf_iter_task_stack.skel.h"
 #include "bpf_iter_task_file.skel.h"
-#include "bpf_iter_task_vma.skel.h"
+#include "bpf_iter_task_vmas.skel.h"
 #include "bpf_iter_task_btf.skel.h"
 #include "bpf_iter_tcp4.skel.h"
 #include "bpf_iter_tcp6.skel.h"
@@ -1399,19 +1399,19 @@  static void str_strip_first_line(char *str)
 static void test_task_vma_common(struct bpf_iter_attach_opts *opts)
 {
 	int err, iter_fd = -1, proc_maps_fd = -1;
-	struct bpf_iter_task_vma *skel;
+	struct bpf_iter_task_vmas *skel;
 	int len, read_size = 4;
 	char maps_path[64];
 
-	skel = bpf_iter_task_vma__open();
-	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vma__open"))
+	skel = bpf_iter_task_vmas__open();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vmas__open"))
 		return;
 
 	skel->bss->pid = getpid();
 	skel->bss->one_task = opts ? 1 : 0;
 
-	err = bpf_iter_task_vma__load(skel);
-	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
+	err = bpf_iter_task_vmas__load(skel);
+	if (!ASSERT_OK(err, "bpf_iter_task_vmas__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
@@ -1462,25 +1462,25 @@  static void test_task_vma_common(struct bpf_iter_attach_opts *opts)
 out:
 	close(proc_maps_fd);
 	close(iter_fd);
-	bpf_iter_task_vma__destroy(skel);
+	bpf_iter_task_vmas__destroy(skel);
 }
 
 static void test_task_vma_dead_task(void)
 {
-	struct bpf_iter_task_vma *skel;
+	struct bpf_iter_task_vmas *skel;
 	int wstatus, child_pid = -1;
 	time_t start_tm, cur_tm;
 	int err, iter_fd = -1;
 	int wait_sec = 3;
 
-	skel = bpf_iter_task_vma__open();
-	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vma__open"))
+	skel = bpf_iter_task_vmas__open();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vmas__open"))
 		return;
 
 	skel->bss->pid = getpid();
 
-	err = bpf_iter_task_vma__load(skel);
-	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
+	err = bpf_iter_task_vmas__load(skel);
+	if (!ASSERT_OK(err, "bpf_iter_task_vmas__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
@@ -1533,7 +1533,7 @@  static void test_task_vma_dead_task(void)
 out:
 	waitpid(child_pid, &wstatus, 0);
 	close(iter_fd);
-	bpf_iter_task_vma__destroy(skel);
+	bpf_iter_task_vmas__destroy(skel);
 }
 
 void test_bpf_sockmap_map_iter_fd(void)
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
similarity index 100%
rename from tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
rename to tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c