diff mbox series

[v3,bpf-next,1/3] bpf: Add bpf_iter_cpumask kfuncs

Message ID 20240117024823.4186-2-laoar.shao@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Add bpf_iter_cpumask | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1120 this patch: 1123
netdev/cc_maintainers success CCed 0 of 0 maintainers
netdev/build_clang success Errors and warnings before: 1108 this patch: 1108
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1135 this patch: 1138
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Yafang Shao Jan. 17, 2024, 2:48 a.m. UTC
Add three new kfuncs for bpf_iter_cpumask.
- bpf_iter_cpumask_new
  It is defined with KF_RCU_PROTECTED and KF_RCU.
  KF_RCU_PROTECTED is defined because we must use it under the
  protection of RCU.
  KF_RCU is defined because the cpumask must be a RCU trusted pointer
  such as task->cpus_ptr.
- bpf_iter_cpumask_next
- bpf_iter_cpumask_destroy

These new kfuncs facilitate the iteration of percpu data, such as
runqueues, psi_cgroup_cpu, and more.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

Comments

Yonghong Song Jan. 18, 2024, 10:27 p.m. UTC | #1
On 1/16/24 6:48 PM, Yafang Shao wrote:
> Add three new kfuncs for bpf_iter_cpumask.
> - bpf_iter_cpumask_new
>    It is defined with KF_RCU_PROTECTED and KF_RCU.
>    KF_RCU_PROTECTED is defined because we must use it under the
>    protection of RCU.
>    KF_RCU is defined because the cpumask must be a RCU trusted pointer
>    such as task->cpus_ptr.

I am not sure whether we need both or not.

KF_RCU_PROTECTED means the function call needs to be within the rcu cs.
KF_RCU means the argument usage needs to be within the rcu cs.
We only need one of them (preferably KF_RCU).

> - bpf_iter_cpumask_next
> - bpf_iter_cpumask_destroy
>
> These new kfuncs facilitate the iteration of percpu data, such as
> runqueues, psi_cgroup_cpu, and more.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>   kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 69 insertions(+)
>
> diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
> index 2e73533a3811..1840e48e6142 100644
> --- a/kernel/bpf/cpumask.c
> +++ b/kernel/bpf/cpumask.c
> @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
>   	return cpumask_weight(cpumask);
>   }
>   
> +struct bpf_iter_cpumask {
> +	__u64 __opaque[2];
> +} __aligned(8);
> +
> +struct bpf_iter_cpumask_kern {
> +	const struct cpumask *mask;
> +	int cpu;
> +} __aligned(8);
> +
> +/**
> + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask
> + * @it: The new bpf_iter_cpumask to be created.
> + * @mask: The cpumask to be iterated over.
> + *
> + * This function initializes a new bpf_iter_cpumask structure for iterating over
> + * the specified CPU mask. It assigns the provided cpumask to the newly created
> + * bpf_iter_cpumask @it for subsequent iteration operations.
> + *
> + * On success, 0 is returned. On failure, ERR is returned.
> + */
> +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask)
> +{
> +	struct bpf_iter_cpumask_kern *kit = (void *)it;
> +
> +	BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
> +	BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
> +		     __alignof__(struct bpf_iter_cpumask));
> +
> +	kit->mask = mask;
> +	kit->cpu = -1;
> +	return 0;
> +}

We have problem here. Let us say bpf_iter_cpumask_new() is called inside rcu cs.
Once the control goes out of rcu cs, 'mask' could be freed, right?
Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs
as bpf_iter_cpumask_new(). But such a requirement seems odd.

I think we can do things similar to bpf_iter_task_vma. You can allocate memory
with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask. This
way, you do not need to worry about potential use-after-free issue.
The memory can be freed with bpf_iter_cpumask_destroy().

> +
> +/**
> + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
> + * @it: The bpf_iter_cpumask
> + *
> + * This function retrieves a pointer to the number of the next CPU within the
> + * specified bpf_iter_cpumask. It allows sequential access to CPUs within the
> + * cpumask. If there are no further CPUs available, it returns NULL.
> + *
> + * Returns a pointer to the number of the next CPU in the cpumask or NULL if no
> + * further CPUs.
> + */
> +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
> +{
> +	struct bpf_iter_cpumask_kern *kit = (void *)it;
> +	const struct cpumask *mask = kit->mask;
> +	int cpu;
> +
> +	cpu = cpumask_next(kit->cpu, mask);
> +	if (cpu >= nr_cpu_ids)
> +		return NULL;
> +
> +	kit->cpu = cpu;
> +	return &kit->cpu;
> +}
> +
> +/**
> + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
> + * @it: The bpf_iter_cpumask to be destroyed.
> + */
> +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
> +{
> +}
> +
>   __bpf_kfunc_end_defs();
>   
>   BTF_SET8_START(cpumask_kfunc_btf_ids)
> @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
>   BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
>   BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
>   BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
> +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU_PROTECTED | KF_RCU)
> +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
>   BTF_SET8_END(cpumask_kfunc_btf_ids)
>   
>   static const struct btf_kfunc_id_set cpumask_kfunc_set = {
Hou Tao Jan. 19, 2024, 12:51 a.m. UTC | #2
Hi,

On 1/19/2024 6:27 AM, Yonghong Song wrote:
>
> On 1/16/24 6:48 PM, Yafang Shao wrote:
>> Add three new kfuncs for bpf_iter_cpumask.
>> - bpf_iter_cpumask_new
>>    It is defined with KF_RCU_PROTECTED and KF_RCU.
>>    KF_RCU_PROTECTED is defined because we must use it under the
>>    protection of RCU.
>>    KF_RCU is defined because the cpumask must be a RCU trusted pointer
>>    such as task->cpus_ptr.
>
> I am not sure whether we need both or not.
>
> KF_RCU_PROTECTED means the function call needs within the rcu cs.
> KF_RCU means the argument usage needs within the rcu cs.
> We only need one of them (preferrably KF_RCU).
>
>> - bpf_iter_cpumask_next
>> - bpf_iter_cpumask_destroy
>>
>> These new kfuncs facilitate the iteration of percpu data, such as
>> runqueues, psi_cgroup_cpu, and more.
>>
>> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
>> ---
>>   kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 69 insertions(+)
>>
>> diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
>> index 2e73533a3811..1840e48e6142 100644
>> --- a/kernel/bpf/cpumask.c
>> +++ b/kernel/bpf/cpumask.c
>> @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct
>> cpumask *cpumask)
>>       return cpumask_weight(cpumask);
>>   }
>>   +struct bpf_iter_cpumask {
>> +    __u64 __opaque[2];
>> +} __aligned(8);
>> +
>> +struct bpf_iter_cpumask_kern {
>> +    const struct cpumask *mask;
>> +    int cpu;
>> +} __aligned(8);
>> +
>> +/**
>> + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a
>> specified cpumask
>> + * @it: The new bpf_iter_cpumask to be created.
>> + * @mask: The cpumask to be iterated over.
>> + *
>> + * This function initializes a new bpf_iter_cpumask structure for
>> iterating over
>> + * the specified CPU mask. It assigns the provided cpumask to the
>> newly created
>> + * bpf_iter_cpumask @it for subsequent iteration operations.
>> + *
>> + * On success, 0 is returned. On failure, ERR is returned.
>> + */
>> +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it,
>> const struct cpumask *mask)
>> +{
>> +    struct bpf_iter_cpumask_kern *kit = (void *)it;
>> +
>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) >
>> sizeof(struct bpf_iter_cpumask));
>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
>> +             __alignof__(struct bpf_iter_cpumask));
>> +
>> +    kit->mask = mask;
>> +    kit->cpu = -1;
>> +    return 0;
>> +}
>
> We have problem here. Let us say bpf_iter_cpumask_new() is called
> inside rcu cs.
> Once the control goes out of rcu cs, 'mask' could be freed, right?
> Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs
> as bpf_iter_cpumask_new(). But such a requirement seems odd.

So the case is possible when using bpf_iter_cpumask_new() and
bpf_iter_cpumask_next() in sleepable program and these two kfuncs are
used in two different rcu_read_lock/rcu_read_unlock code blocks, right ?
>
> I think we can do things similar to bpf_iter_task_vma. You can
> allocate memory
> with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask.
> This
> way, you do not need to worry about potential use-after-free issue.
> The memory can be freed with bpf_iter_cpumask_destroy().
>
>> +
>> +/**
>> + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
>> + * @it: The bpf_iter_cpumask
>> + *
>> + * This function retrieves a pointer to the number of the next CPU
>> within the
>> + * specified bpf_iter_cpumask. It allows sequential access to CPUs
>> within the
>> + * cpumask. If there are no further CPUs available, it returns NULL.
>> + *
>> + * Returns a pointer to the number of the next CPU in the cpumask or
>> NULL if no
>> + * further CPUs.
>> + */
>> +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
>> +{
>> +    struct bpf_iter_cpumask_kern *kit = (void *)it;
>> +    const struct cpumask *mask = kit->mask;
>> +    int cpu;
>> +
>> +    cpu = cpumask_next(kit->cpu, mask);
>> +    if (cpu >= nr_cpu_ids)
>> +        return NULL;
>> +
>> +    kit->cpu = cpu;
>> +    return &kit->cpu;
>> +}
>> +
>> +/**
>> + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
>> + * @it: The bpf_iter_cpumask to be destroyed.
>> + */
>> +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
>> +{
>> +}
>> +
>>   __bpf_kfunc_end_defs();
>>     BTF_SET8_START(cpumask_kfunc_btf_ids)
>> @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
>>   BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
>>   BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
>>   BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW |
>> KF_RCU_PROTECTED | KF_RCU)
>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
>>   BTF_SET8_END(cpumask_kfunc_btf_ids)
>>     static const struct btf_kfunc_id_set cpumask_kfunc_set = {
>
> .
Yonghong Song Jan. 19, 2024, 3:45 a.m. UTC | #3
On 1/18/24 4:51 PM, Hou Tao wrote:
> Hi,
>
> On 1/19/2024 6:27 AM, Yonghong Song wrote:
>> On 1/16/24 6:48 PM, Yafang Shao wrote:
>>> Add three new kfuncs for bpf_iter_cpumask.
>>> - bpf_iter_cpumask_new
>>>     It is defined with KF_RCU_PROTECTED and KF_RCU.
>>>     KF_RCU_PROTECTED is defined because we must use it under the
>>>     protection of RCU.
>>>     KF_RCU is defined because the cpumask must be a RCU trusted pointer
>>>     such as task->cpus_ptr.
>> I am not sure whether we need both or not.
>>
>> KF_RCU_PROTECTED means the function call needs within the rcu cs.
>> KF_RCU means the argument usage needs within the rcu cs.
>> We only need one of them (preferrably KF_RCU).
>>
>>> - bpf_iter_cpumask_next
>>> - bpf_iter_cpumask_destroy
>>>
>>> These new kfuncs facilitate the iteration of percpu data, such as
>>> runqueues, psi_cgroup_cpu, and more.
>>>
>>> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
>>> ---
>>>    kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 69 insertions(+)
>>>
>>> diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
>>> index 2e73533a3811..1840e48e6142 100644
>>> --- a/kernel/bpf/cpumask.c
>>> +++ b/kernel/bpf/cpumask.c
>>> @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct
>>> cpumask *cpumask)
>>>        return cpumask_weight(cpumask);
>>>    }
>>>    +struct bpf_iter_cpumask {
>>> +    __u64 __opaque[2];
>>> +} __aligned(8);
>>> +
>>> +struct bpf_iter_cpumask_kern {
>>> +    const struct cpumask *mask;
>>> +    int cpu;
>>> +} __aligned(8);
>>> +
>>> +/**
>>> + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a
>>> specified cpumask
>>> + * @it: The new bpf_iter_cpumask to be created.
>>> + * @mask: The cpumask to be iterated over.
>>> + *
>>> + * This function initializes a new bpf_iter_cpumask structure for
>>> iterating over
>>> + * the specified CPU mask. It assigns the provided cpumask to the
>>> newly created
>>> + * bpf_iter_cpumask @it for subsequent iteration operations.
>>> + *
>>> + * On success, 0 is returned. On failure, ERR is returned.
>>> + */
>>> +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it,
>>> const struct cpumask *mask)
>>> +{
>>> +    struct bpf_iter_cpumask_kern *kit = (void *)it;
>>> +
>>> +    BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) >
>>> sizeof(struct bpf_iter_cpumask));
>>> +    BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
>>> +             __alignof__(struct bpf_iter_cpumask));
>>> +
>>> +    kit->mask = mask;
>>> +    kit->cpu = -1;
>>> +    return 0;
>>> +}
>> We have problem here. Let us say bpf_iter_cpumask_new() is called
>> inside rcu cs.
>> Once the control goes out of rcu cs, 'mask' could be freed, right?
>> Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs
>> as bpf_iter_cpumask_new(). But such a requirement seems odd.
> So the case is possible when using bpf_iter_cpumask_new() and
> bpf_iter_cpumask_next() in sleepable program and these two kfuncs are
> used in two different rcu_read_lock/rcu_read_unlock code blocks, right ?

Right, or bpf_iter_cpumask_new() inside rcu cs and bpf_iter_cpumask_next() not.

>> I think we can do things similar to bpf_iter_task_vma. You can
>> allocate memory
>> with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask.
>> This
>> way, you do not need to worry about potential use-after-free issue.
>> The memory can be freed with bpf_iter_cpumask_destroy().
>>
>>> +
>>> +/**
>>> + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
>>> + * @it: The bpf_iter_cpumask
>>> + *
>>> + * This function retrieves a pointer to the number of the next CPU
>>> within the
>>> + * specified bpf_iter_cpumask. It allows sequential access to CPUs
>>> within the
>>> + * cpumask. If there are no further CPUs available, it returns NULL.
>>> + *
>>> + * Returns a pointer to the number of the next CPU in the cpumask or
>>> NULL if no
>>> + * further CPUs.
>>> + */
>>> +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
>>> +{
>>> +    struct bpf_iter_cpumask_kern *kit = (void *)it;
>>> +    const struct cpumask *mask = kit->mask;
>>> +    int cpu;
>>> +
>>> +    cpu = cpumask_next(kit->cpu, mask);
>>> +    if (cpu >= nr_cpu_ids)
>>> +        return NULL;
>>> +
>>> +    kit->cpu = cpu;
>>> +    return &kit->cpu;
>>> +}
>>> +
>>> +/**
>>> + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
>>> + * @it: The bpf_iter_cpumask to be destroyed.
>>> + */
>>> +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
>>> +{
>>> +}
>>> +
>>>    __bpf_kfunc_end_defs();
>>>      BTF_SET8_START(cpumask_kfunc_btf_ids)
>>> @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
>>>    BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
>>>    BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
>>>    BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
>>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW |
>>> KF_RCU_PROTECTED | KF_RCU)
>>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL)
>>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
>>>    BTF_SET8_END(cpumask_kfunc_btf_ids)
>>>      static const struct btf_kfunc_id_set cpumask_kfunc_set = {
>> .
Yafang Shao Jan. 19, 2024, 9:50 a.m. UTC | #4
On Fri, Jan 19, 2024 at 6:27 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 1/16/24 6:48 PM, Yafang Shao wrote:
> > Add three new kfuncs for bpf_iter_cpumask.
> > - bpf_iter_cpumask_new
> >    It is defined with KF_RCU_PROTECTED and KF_RCU.
> >    KF_RCU_PROTECTED is defined because we must use it under the
> >    protection of RCU.
> >    KF_RCU is defined because the cpumask must be a RCU trusted pointer
> >    such as task->cpus_ptr.
>
> I am not sure whether we need both or not.
>
> KF_RCU_PROTECTED means the function call needs within the rcu cs.
> KF_RCU means the argument usage needs within the rcu cs.
> We only need one of them (preferrably KF_RCU).

As you explained below, KF_RCU_PROTECTED is actually for
bpf_iter_cpumask_next().

>
> > - bpf_iter_cpumask_next
> > - bpf_iter_cpumask_destroy
> >
> > These new kfuncs facilitate the iteration of percpu data, such as
> > runqueues, psi_cgroup_cpu, and more.
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >   kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 69 insertions(+)
> >
> > diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
> > index 2e73533a3811..1840e48e6142 100644
> > --- a/kernel/bpf/cpumask.c
> > +++ b/kernel/bpf/cpumask.c
> > @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
> >       return cpumask_weight(cpumask);
> >   }
> >
> > +struct bpf_iter_cpumask {
> > +     __u64 __opaque[2];
> > +} __aligned(8);
> > +
> > +struct bpf_iter_cpumask_kern {
> > +     const struct cpumask *mask;
> > +     int cpu;
> > +} __aligned(8);
> > +
> > +/**
> > + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask
> > + * @it: The new bpf_iter_cpumask to be created.
> > + * @mask: The cpumask to be iterated over.
> > + *
> > + * This function initializes a new bpf_iter_cpumask structure for iterating over
> > + * the specified CPU mask. It assigns the provided cpumask to the newly created
> > + * bpf_iter_cpumask @it for subsequent iteration operations.
> > + *
> > + * On success, 0 is returned. On failure, ERR is returned.
> > + */
> > +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask)
> > +{
> > +     struct bpf_iter_cpumask_kern *kit = (void *)it;
> > +
> > +     BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
> > +     BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
> > +                  __alignof__(struct bpf_iter_cpumask));
> > +
> > +     kit->mask = mask;
> > +     kit->cpu = -1;
> > +     return 0;
> > +}
>
> We have problem here. Let us say bpf_iter_cpumask_new() is called inside rcu cs.
> Once the control goes out of rcu cs, 'mask' could be freed, right?
> Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs
> as bpf_iter_cpumask_new(). But such a requirement seems odd.
>
> I think we can do things similar to bpf_iter_task_vma. You can allocate memory
> with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask. This
> way, you do not need to worry about potential use-after-free issue.
> The memory can be freed with bpf_iter_cpumask_destroy().

Good suggestion. That seems better.

>
> > +
> > +/**
> > + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
> > + * @it: The bpf_iter_cpumask
> > + *
> > + * This function retrieves a pointer to the number of the next CPU within the
> > + * specified bpf_iter_cpumask. It allows sequential access to CPUs within the
> > + * cpumask. If there are no further CPUs available, it returns NULL.
> > + *
> > + * Returns a pointer to the number of the next CPU in the cpumask or NULL if no
> > + * further CPUs.
> > + */
> > +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
> > +{
> > +     struct bpf_iter_cpumask_kern *kit = (void *)it;
> > +     const struct cpumask *mask = kit->mask;
> > +     int cpu;
> > +
> > +     cpu = cpumask_next(kit->cpu, mask);
> > +     if (cpu >= nr_cpu_ids)
> > +             return NULL;
> > +
> > +     kit->cpu = cpu;
> > +     return &kit->cpu;
> > +}
> > +
> > +/**
> > + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
> > + * @it: The bpf_iter_cpumask to be destroyed.
> > + */
> > +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
> > +{
> > +}
> > +
> >   __bpf_kfunc_end_defs();
> >
> >   BTF_SET8_START(cpumask_kfunc_btf_ids)
> > @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
> >   BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
> >   BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
> >   BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
> > +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU_PROTECTED | KF_RCU)
> > +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL)
> > +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
> >   BTF_SET8_END(cpumask_kfunc_btf_ids)
> >
> >   static const struct btf_kfunc_id_set cpumask_kfunc_set = {



--
Regards
Yafang
diff mbox series

Patch

diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2e73533a3811..1840e48e6142 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -422,6 +422,72 @@  __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
 	return cpumask_weight(cpumask);
 }
 
+struct bpf_iter_cpumask {
+	__u64 __opaque[2];
+} __aligned(8);
+
+struct bpf_iter_cpumask_kern {
+	const struct cpumask *mask;
+	int cpu;
+} __aligned(8);
+
+/**
+ * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask
+ * @it: The new bpf_iter_cpumask to be created.
+ * @mask: The cpumask to be iterated over.
+ *
+ * This function initializes a new bpf_iter_cpumask structure for iterating over
+ * the specified CPU mask. It assigns the provided cpumask to the newly created
+ * bpf_iter_cpumask @it for subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, ERR is returned.
+ */
+__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask)
+{
+	struct bpf_iter_cpumask_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
+		     __alignof__(struct bpf_iter_cpumask));
+
+	kit->mask = mask;
+	kit->cpu = -1;
+	return 0;
+}
+
+/**
+ * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
+ * @it: The bpf_iter_cpumask
+ *
+ * This function retrieves a pointer to the number of the next CPU within the
+ * specified bpf_iter_cpumask. It allows sequential access to CPUs within the
+ * cpumask. If there are no further CPUs available, it returns NULL.
+ *
+ * Returns a pointer to the number of the next CPU in the cpumask or NULL if no
+ * further CPUs.
+ */
+__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
+{
+	struct bpf_iter_cpumask_kern *kit = (void *)it;
+	const struct cpumask *mask = kit->mask;
+	int cpu;
+
+	cpu = cpumask_next(kit->cpu, mask);
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+
+	kit->cpu = cpu;
+	return &kit->cpu;
+}
+
+/**
+ * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
+ * @it: The bpf_iter_cpumask to be destroyed.
+ */
+__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
+{
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_SET8_START(cpumask_kfunc_btf_ids)
@@ -450,6 +516,9 @@  BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU_PROTECTED | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
 BTF_SET8_END(cpumask_kfunc_btf_ids)
 
 static const struct btf_kfunc_id_set cpumask_kfunc_set = {