diff mbox series

[bpf-next,2/3] bpf: Introduce task_vma open-coded iterator kfuncs

Message ID 20230810183513.684836-3-davemarchevsky@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Open-coded task_vma iter | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 3067 this patch: 1712
netdev/cc_maintainers warning 12 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com sdf@google.com shuah@kernel.org song@kernel.org yonghong.song@linux.dev kuifeng@meta.com mykolal@fb.com linux-kselftest@vger.kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang fail Errors and warnings before: 1530 this patch: 1534
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 3099 this patch: 3105
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: Prefer __aligned(8) over __attribute__((aligned(8))) WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 106 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on s390x with gcc

Commit Message

Dave Marchevsky Aug. 10, 2023, 6:35 p.m. UTC
This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_task_vma in open-coded
iterator style. BPF programs can use these kfuncs directly or through
bpf_for_each macro for natural-looking iteration of all task vmas.

The implementation borrows heavily from bpf_find_vma helper's locking -
differing only in that it holds the mmap_read lock for all iterations
while the helper only executes its provided callback on a maximum of 1
vma. Aside from locking, struct vma_iterator and vma_next do all the
heavy lifting.

The newly-added struct bpf_iter_task_vma has a name collision with a
selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
file is renamed in order to avoid the collision.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Cc: Nathan Slingerland <slinger@meta.com>
---
 include/uapi/linux/bpf.h                      |  5 ++
 kernel/bpf/helpers.c                          |  3 +
 kernel/bpf/task_iter.c                        | 56 +++++++++++++++++++
 tools/include/uapi/linux/bpf.h                |  5 ++
 tools/lib/bpf/bpf_helpers.h                   |  8 +++
 .../selftests/bpf/prog_tests/bpf_iter.c       | 26 ++++-----
 ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
 7 files changed, 90 insertions(+), 13 deletions(-)
 rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)

Comments

Stanislav Fomichev Aug. 10, 2023, 9:57 p.m. UTC | #1
On 08/10, Dave Marchevsky wrote:
> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task_vma in open-coded
> iterator style. BPF programs can use these kfuncs directly or through
> bpf_for_each macro for natural-looking iteration of all task vmas.
> 
> The implementation borrows heavily from bpf_find_vma helper's locking -
> differing only in that it holds the mmap_read lock for all iterations
> while the helper only executes its provided callback on a maximum of 1
> vma. Aside from locking, struct vma_iterator and vma_next do all the
> heavy lifting.
> 
> The newly-added struct bpf_iter_task_vma has a name collision with a
> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> file is renamed in order to avoid the collision.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> Cc: Nathan Slingerland <slinger@meta.com>
> ---
>  include/uapi/linux/bpf.h                      |  5 ++
>  kernel/bpf/helpers.c                          |  3 +
>  kernel/bpf/task_iter.c                        | 56 +++++++++++++++++++
>  tools/include/uapi/linux/bpf.h                |  5 ++
>  tools/lib/bpf/bpf_helpers.h                   |  8 +++
>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 ++++-----
>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>  7 files changed, 90 insertions(+), 13 deletions(-)
>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d21deb46f49f..c4a65968f9f5 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7291,4 +7291,9 @@ struct bpf_iter_num {
>  	__u64 __opaque[1];
>  } __attribute__((aligned(8)));
>  
> +struct bpf_iter_task_vma {

[..]

> +	__u64 __opaque[9]; /* See bpf_iter_num comment above */
> +	char __opaque_c[3];

Everything in the series makes sense, but this part is a big confusing
when reading without too much context. If you're gonna do a respin, maybe:

- __opaque_c[8*9+3] (or whatever the size is)? any reason for separate
  __u64 + char?
- maybe worth adding something like /* Opaque representation of
  bpf_iter_task_vma_kern; see bpf_iter_num comment above */.
  that bpf_iter_task_vma<>bpf_iter_task_vma_kern wasn't super apparent
  until I got to the BUG_ON part
David Marchevsky Aug. 11, 2023, 2:57 p.m. UTC | #2
On 8/10/23 5:57 PM, Stanislav Fomichev wrote:
> On 08/10, Dave Marchevsky wrote:
>> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_task_vma in open-coded
>> iterator style. BPF programs can use these kfuncs directly or through
>> bpf_for_each macro for natural-looking iteration of all task vmas.
>>
>> The implementation borrows heavily from bpf_find_vma helper's locking -
>> differing only in that it holds the mmap_read lock for all iterations
>> while the helper only executes its provided callback on a maximum of 1
>> vma. Aside from locking, struct vma_iterator and vma_next do all the
>> heavy lifting.
>>
>> The newly-added struct bpf_iter_task_vma has a name collision with a
>> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
>> file is renamed in order to avoid the collision.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> Cc: Nathan Slingerland <slinger@meta.com>
>> ---
>>  include/uapi/linux/bpf.h                      |  5 ++
>>  kernel/bpf/helpers.c                          |  3 +
>>  kernel/bpf/task_iter.c                        | 56 +++++++++++++++++++
>>  tools/include/uapi/linux/bpf.h                |  5 ++
>>  tools/lib/bpf/bpf_helpers.h                   |  8 +++
>>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 ++++-----
>>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>>  7 files changed, 90 insertions(+), 13 deletions(-)
>>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index d21deb46f49f..c4a65968f9f5 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7291,4 +7291,9 @@ struct bpf_iter_num {
>>  	__u64 __opaque[1];
>>  } __attribute__((aligned(8)));
>>  
>> +struct bpf_iter_task_vma {
> 
> [..]
> 
>> +	__u64 __opaque[9]; /* See bpf_iter_num comment above */
>> +	char __opaque_c[3];
> 
> Everything in the series makes sense, but this part is a big confusing
> when reading without too much context. If you're gonna do a respin, maybe:
> 
> - __opaque_c[8*9+3] (or whatever the size is)? any reason for separate
>   __u64 + char?

IIUC this is because BTF generation doesn't pick up __attribute__((aligned(8))),
so if a vmlinux.h is generated via 'bpftool btf dump file vmlinux format c' and
this struct only contains chars, it won't have the correct alignment.

I'm not sure if the bitfield approach taken by bpf_{list,rb}_node similar has
the same effect. Some quick googling indicates that if it does, it's probably
not in the C standard.

But yeah, I agree that it's ugly. While we're on the topic, WDYT about my
comment in the cover letter about this struct (copied here for convenience):

  * The struct vma_iterator wrapped by struct bpf_iter_task_vma itself wraps
    struct ma_state. Because we need the entire struct, not a ptr, changes to
    either struct vma_iterator or struct ma_state will necessitate changing the
    opaque struct bpf_iter_task_vma to account for the new size. This feels a
    bit brittle. We could instead use bpf_mem_alloc to allocate a struct
    vma_iterator in bpf_iter_task_vma_new and have struct bpf_iter_task_vma
    point to that, but that's not quite equivalent as BPF progs will usually
    use the stack for this struct via bpf_for_each. Went with the simpler route
    for now.

> - maybe worth adding something like /* Opaque representation of
>   bpf_iter_task_vma_kern; see bpf_iter_num comment above */.
>   that bpf_iter_task_vma<>bpf_iter_task_vma_kern wasn't super apparent
>   until I got to the BUG_ON part

It feels weird to refer to the non-UAPI _kern struct in uapi header. Maybe
better to add a comment to the _kern struct referring to this one? I don't
feel strongly either way, though.
Yonghong Song Aug. 11, 2023, 4:22 p.m. UTC | #3
On 8/10/23 11:35 AM, Dave Marchevsky wrote:
> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task_vma in open-coded
> iterator style. BPF programs can use these kfuncs directly or through
> bpf_for_each macro for natural-looking iteration of all task vmas.
> 
> The implementation borrows heavily from bpf_find_vma helper's locking -
> differing only in that it holds the mmap_read lock for all iterations
> while the helper only executes its provided callback on a maximum of 1
> vma. Aside from locking, struct vma_iterator and vma_next do all the
> heavy lifting.
> 
> The newly-added struct bpf_iter_task_vma has a name collision with a
> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> file is renamed in order to avoid the collision.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> Cc: Nathan Slingerland <slinger@meta.com>
> ---
>   include/uapi/linux/bpf.h                      |  5 ++
>   kernel/bpf/helpers.c                          |  3 +
>   kernel/bpf/task_iter.c                        | 56 +++++++++++++++++++
>   tools/include/uapi/linux/bpf.h                |  5 ++
>   tools/lib/bpf/bpf_helpers.h                   |  8 +++
>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 ++++-----
>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>   7 files changed, 90 insertions(+), 13 deletions(-)
>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d21deb46f49f..c4a65968f9f5 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7291,4 +7291,9 @@ struct bpf_iter_num {
>   	__u64 __opaque[1];
>   } __attribute__((aligned(8)));
>   
> +struct bpf_iter_task_vma {
> +	__u64 __opaque[9]; /* See bpf_iter_num comment above */
> +	char __opaque_c[3];
> +} __attribute__((aligned(8)));

I do see we have issues with this struct. See below.

> +
>   #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..7a06dea749f1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..76be9998a65a 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -8,6 +8,7 @@
>   #include <linux/fdtable.h>
>   #include <linux/filter.h>
>   #include <linux/btf_ids.h>
> +#include <linux/mm_types.h>
>   #include "mmap_unlock_work.h"
>   
>   static const char * const iter_task_type_names[] = {
> @@ -823,6 +824,61 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>   	.arg5_type	= ARG_ANYTHING,
>   };
>   
> +struct bpf_iter_task_vma_kern {
> +	struct mm_struct *mm;
> +	struct mmap_unlock_irq_work *work;
> +	struct vma_iterator vmi;
> +} __attribute__((aligned(8)));

Let us say in 6.5, There is an app developed with 6.5 and
everything works fine.

And in 6.6, vma_iterator size changed, either less or more than
the size in 6.5, then how you fix this issue? You need to update
uapi header bpf_iter_task_vma? Update the header file?
If the vma_iterator size is grew from 6.6, then the app won't work
any more.

So I suggest we do allocation for vma_iterator in bpf_iter_task_vma_new
to avoid this uapi issue.

> +
> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +				      struct task_struct *task, u64 addr)
> +{
> +	struct bpf_iter_task_vma_kern *i = (void *)it;

i => kit?

> +	bool irq_work_busy = false;
> +
> +	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> +	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> +
> +	BTF_TYPE_EMIT(struct bpf_iter_task_vma);

This is not needed.

> +
> +	/* NULL i->mm signals failed bpf_iter_task_vma initialization.
> +	 * i->work == NULL is valid.
> +	 */
> +	i->mm = NULL;
> +	if (!task)
> +		return -ENOENT;
> +
> +	i->mm = task->mm;
> +	if (!i->mm)
> +		return -ENOENT;
> +
> +	irq_work_busy = bpf_mmap_unlock_get_irq_work(&i->work);
> +	if (irq_work_busy || !mmap_read_trylock(i->mm)) {
> +		i->mm = NULL;
> +		return -EBUSY;
> +	}
> +
> +	vma_iter_init(&i->vmi, i->mm, addr);
> +	return 0;
> +}
> +
> +__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
> +{
> +	struct bpf_iter_task_vma_kern *i = (void *)it;
> +
> +	if (!i->mm) /* bpf_iter_task_vma_new failed */
> +		return NULL;
> +	return vma_next(&i->vmi);
> +}
> +
> +__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> +{
> +	struct bpf_iter_task_vma_kern *i = (void *)it;
> +
> +	if (i->mm)
> +		bpf_mmap_unlock_mm(i->work, i->mm);
> +}
> +
>   DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
>   
[...]
Yonghong Song Aug. 11, 2023, 4:41 p.m. UTC | #4
On 8/10/23 11:35 AM, Dave Marchevsky wrote:
> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task_vma in open-coded
> iterator style. BPF programs can use these kfuncs directly or through
> bpf_for_each macro for natural-looking iteration of all task vmas.
> 
> The implementation borrows heavily from bpf_find_vma helper's locking -
> differing only in that it holds the mmap_read lock for all iterations
> while the helper only executes its provided callback on a maximum of 1
> vma. Aside from locking, struct vma_iterator and vma_next do all the
> heavy lifting.
> 
> The newly-added struct bpf_iter_task_vma has a name collision with a
> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> file is renamed in order to avoid the collision.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> Cc: Nathan Slingerland <slinger@meta.com>
> ---
>   include/uapi/linux/bpf.h                      |  5 ++
>   kernel/bpf/helpers.c                          |  3 +
>   kernel/bpf/task_iter.c                        | 56 +++++++++++++++++++
>   tools/include/uapi/linux/bpf.h                |  5 ++
>   tools/lib/bpf/bpf_helpers.h                   |  8 +++
>   .../selftests/bpf/prog_tests/bpf_iter.c       | 26 ++++-----
>   ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>   7 files changed, 90 insertions(+), 13 deletions(-)
>   rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d21deb46f49f..c4a65968f9f5 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7291,4 +7291,9 @@ struct bpf_iter_num {
>   	__u64 __opaque[1];
>   } __attribute__((aligned(8)));
>   
> +struct bpf_iter_task_vma {
> +	__u64 __opaque[9]; /* See bpf_iter_num comment above */
> +	char __opaque_c[3];
> +} __attribute__((aligned(8)));
> +
>   #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..7a06dea749f1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>   BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>   BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>   BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..76be9998a65a 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -8,6 +8,7 @@
>   #include <linux/fdtable.h>
>   #include <linux/filter.h>
>   #include <linux/btf_ids.h>
> +#include <linux/mm_types.h>
>   #include "mmap_unlock_work.h"
>   
>   static const char * const iter_task_type_names[] = {
> @@ -823,6 +824,61 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>   	.arg5_type	= ARG_ANYTHING,
>   };
>   
> +struct bpf_iter_task_vma_kern {
> +	struct mm_struct *mm;
> +	struct mmap_unlock_irq_work *work;
> +	struct vma_iterator vmi;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +				      struct task_struct *task, u64 addr)
> +{
> +	struct bpf_iter_task_vma_kern *i = (void *)it;
> +	bool irq_work_busy = false;
> +
> +	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> +	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> +
> +	BTF_TYPE_EMIT(struct bpf_iter_task_vma);
> +
> +	/* NULL i->mm signals failed bpf_iter_task_vma initialization.
> +	 * i->work == NULL is valid.
> +	 */
> +	i->mm = NULL;
> +	if (!task)
> +		return -ENOENT;
> +
> +	i->mm = task->mm;
> +	if (!i->mm)
> +		return -ENOENT;

We might have an issue here as well if task is in __put_task_struct() 
stage. It is possible that we did i->mm from task->mm and then
task is freed and 'mm' is reused by somebody self.

To prevent such cases, I suggest we try to take a reference
of 'task' first. If we can get a reference then task is valid
and task->mm will not be freed and we will be fine.

> +
> +	irq_work_busy = bpf_mmap_unlock_get_irq_work(&i->work);
> +	if (irq_work_busy || !mmap_read_trylock(i->mm)) {
> +		i->mm = NULL;
> +		return -EBUSY;
> +	}
> +
> +	vma_iter_init(&i->vmi, i->mm, addr);
> +	return 0;
> +}
> +
[...]
Stanislav Fomichev Aug. 11, 2023, 5:03 p.m. UTC | #5
On 08/11, David Marchevsky wrote:
> On 8/10/23 5:57 PM, Stanislav Fomichev wrote:
> > On 08/10, Dave Marchevsky wrote:
> >> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_task_vma in open-coded
> >> iterator style. BPF programs can use these kfuncs directly or through
> >> bpf_for_each macro for natural-looking iteration of all task vmas.
> >>
> >> The implementation borrows heavily from bpf_find_vma helper's locking -
> >> differing only in that it holds the mmap_read lock for all iterations
> >> while the helper only executes its provided callback on a maximum of 1
> >> vma. Aside from locking, struct vma_iterator and vma_next do all the
> >> heavy lifting.
> >>
> >> The newly-added struct bpf_iter_task_vma has a name collision with a
> >> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> >> file is renamed in order to avoid the collision.
> >>
> >> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >> Cc: Nathan Slingerland <slinger@meta.com>
> >> ---
> >>  include/uapi/linux/bpf.h                      |  5 ++
> >>  kernel/bpf/helpers.c                          |  3 +
> >>  kernel/bpf/task_iter.c                        | 56 +++++++++++++++++++
> >>  tools/include/uapi/linux/bpf.h                |  5 ++
> >>  tools/lib/bpf/bpf_helpers.h                   |  8 +++
> >>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 ++++-----
> >>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
> >>  7 files changed, 90 insertions(+), 13 deletions(-)
> >>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index d21deb46f49f..c4a65968f9f5 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -7291,4 +7291,9 @@ struct bpf_iter_num {
> >>  	__u64 __opaque[1];
> >>  } __attribute__((aligned(8)));
> >>  
> >> +struct bpf_iter_task_vma {
> > 
> > [..]
> > 
> >> +	__u64 __opaque[9]; /* See bpf_iter_num comment above */
> >> +	char __opaque_c[3];
> > 
> > Everything in the series makes sense, but this part is a big confusing
> > when reading without too much context. If you're gonna do a respin, maybe:
> > 
> > - __opaque_c[8*9+3] (or whatever the size is)? any reason for separate
> >   __u64 + char?
> 
> IIUC this is because BTF generation doesn't pick up __attribute__((aligned(8))),
> so if a vmlinux.h is generated via 'bpftool btf dump file vmlinux format c' and
> this struct only contains chars, it won't have the correct alignment.
> 
> I'm not sure if the bitfield approach taken by bpf_{list,rb}_node similar has
> the same effect. Some quick googling indicates that if it does, it's probably
> not in the C standard.

Ugh, the alignment, right..

> But yeah, I agree that it's ugly. While we're on the topic, WDYT about my
> comment in the cover letter about this struct (copied here for convenience):
> 
>   * The struct vma_iterator wrapped by struct bpf_iter_task_vma itself wraps
>     struct ma_state. Because we need the entire struct, not a ptr, changes to
>     either struct vma_iterator or struct ma_state will necessitate changing the
>     opaque struct bpf_iter_task_vma to account for the new size. This feels a
>     bit brittle. We could instead use bpf_mem_alloc to allocate a struct
>     vma_iterator in bpf_iter_task_vma_new and have struct bpf_iter_task_vma
>     point to that, but that's not quite equivalent as BPF progs will usually
>     use the stack for this struct via bpf_for_each. Went with the simpler route
>     for now.

LGTM! (assuming you'll keep non-pointer; looking at that other thread
where Yonghong suggests to go with the ptr...)

> > - maybe worth adding something like /* Opaque representation of
> >   bpf_iter_task_vma_kern; see bpf_iter_num comment above */.
> >   that bpf_iter_task_vma<>bpf_iter_task_vma_kern wasn't super apparent
> >   until I got to the BUG_ON part
> 
> It feels weird to refer to the non-UAPI _kern struct in uapi header. Maybe
> better to add a comment to the _kern struct referring to this one? I don't
> feel strongly either way, though.

Yeah, good point, let's keep as is.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d21deb46f49f..c4a65968f9f5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7291,4 +7291,9 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_task_vma {
+	__u64 __opaque[9]; /* See bpf_iter_num comment above */
+	char __opaque_c[3];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..7a06dea749f1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2482,6 +2482,9 @@  BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..76be9998a65a 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@ 
 #include <linux/fdtable.h>
 #include <linux/filter.h>
 #include <linux/btf_ids.h>
+#include <linux/mm_types.h>
 #include "mmap_unlock_work.h"
 
 static const char * const iter_task_type_names[] = {
@@ -823,6 +824,61 @@  const struct bpf_func_proto bpf_find_vma_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_iter_task_vma_kern {
+	struct mm_struct *mm;
+	struct mmap_unlock_irq_work *work;
+	struct vma_iterator vmi;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+				      struct task_struct *task, u64 addr)
+{
+	struct bpf_iter_task_vma_kern *i = (void *)it;
+	bool irq_work_busy = false;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+	BTF_TYPE_EMIT(struct bpf_iter_task_vma);
+
+	/* NULL i->mm signals failed bpf_iter_task_vma initialization.
+	 * i->work == NULL is valid.
+	 */
+	i->mm = NULL;
+	if (!task)
+		return -ENOENT;
+
+	i->mm = task->mm;
+	if (!i->mm)
+		return -ENOENT;
+
+	irq_work_busy = bpf_mmap_unlock_get_irq_work(&i->work);
+	if (irq_work_busy || !mmap_read_trylock(i->mm)) {
+		i->mm = NULL;
+		return -EBUSY;
+	}
+
+	vma_iter_init(&i->vmi, i->mm, addr);
+	return 0;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+	struct bpf_iter_task_vma_kern *i = (void *)it;
+
+	if (!i->mm) /* bpf_iter_task_vma_new failed */
+		return NULL;
+	return vma_next(&i->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+	struct bpf_iter_task_vma_kern *i = (void *)it;
+
+	if (i->mm)
+		bpf_mmap_unlock_mm(i->work, i->mm);
+}
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d21deb46f49f..c4a65968f9f5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7291,4 +7291,9 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_task_vma {
+	__u64 __opaque[9]; /* See bpf_iter_num comment above */
+	char __opaque_c[3];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index bbab9ad9dc5a..d885ffee4d88 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -302,6 +302,14 @@  extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak
 extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
 extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
 
+struct bpf_iter_task_vma;
+
+extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+				 struct task_struct *task,
+				 unsigned long addr) __weak __ksym;
+extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __weak __ksym;
+extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __weak __ksym;
+
 #ifndef bpf_for_each
 /* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
  * using BPF open-coded iterators without having to write mundane explicit
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 1f02168103dd..41aba139b20b 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -10,7 +10,7 @@ 
 #include "bpf_iter_task.skel.h"
 #include "bpf_iter_task_stack.skel.h"
 #include "bpf_iter_task_file.skel.h"
-#include "bpf_iter_task_vma.skel.h"
+#include "bpf_iter_task_vmas.skel.h"
 #include "bpf_iter_task_btf.skel.h"
 #include "bpf_iter_tcp4.skel.h"
 #include "bpf_iter_tcp6.skel.h"
@@ -1399,19 +1399,19 @@  static void str_strip_first_line(char *str)
 static void test_task_vma_common(struct bpf_iter_attach_opts *opts)
 {
 	int err, iter_fd = -1, proc_maps_fd = -1;
-	struct bpf_iter_task_vma *skel;
+	struct bpf_iter_task_vmas *skel;
 	int len, read_size = 4;
 	char maps_path[64];
 
-	skel = bpf_iter_task_vma__open();
-	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vma__open"))
+	skel = bpf_iter_task_vmas__open();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vmas__open"))
 		return;
 
 	skel->bss->pid = getpid();
 	skel->bss->one_task = opts ? 1 : 0;
 
-	err = bpf_iter_task_vma__load(skel);
-	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
+	err = bpf_iter_task_vmas__load(skel);
+	if (!ASSERT_OK(err, "bpf_iter_task_vmas__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
@@ -1462,25 +1462,25 @@  static void test_task_vma_common(struct bpf_iter_attach_opts *opts)
 out:
 	close(proc_maps_fd);
 	close(iter_fd);
-	bpf_iter_task_vma__destroy(skel);
+	bpf_iter_task_vmas__destroy(skel);
 }
 
 static void test_task_vma_dead_task(void)
 {
-	struct bpf_iter_task_vma *skel;
+	struct bpf_iter_task_vmas *skel;
 	int wstatus, child_pid = -1;
 	time_t start_tm, cur_tm;
 	int err, iter_fd = -1;
 	int wait_sec = 3;
 
-	skel = bpf_iter_task_vma__open();
-	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vma__open"))
+	skel = bpf_iter_task_vmas__open();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vmas__open"))
 		return;
 
 	skel->bss->pid = getpid();
 
-	err = bpf_iter_task_vma__load(skel);
-	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
+	err = bpf_iter_task_vmas__load(skel);
+	if (!ASSERT_OK(err, "bpf_iter_task_vmas__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
@@ -1533,7 +1533,7 @@  static void test_task_vma_dead_task(void)
 out:
 	waitpid(child_pid, &wstatus, 0);
 	close(iter_fd);
-	bpf_iter_task_vma__destroy(skel);
+	bpf_iter_task_vmas__destroy(skel);
 }
 
 void test_bpf_sockmap_map_iter_fd(void)
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
similarity index 100%
rename from tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
rename to tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c