diff mbox series

[bpf-next,1/3] bpf: Free elements after one RCU-tasks-trace grace period

Message ID 20221011071128.3470622-2-houtao@huaweicloud.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Remove unnecessary RCU grace period chaining | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5 this patch: 5
netdev/cc_maintainers warning 2 maintainers not CCed: song@kernel.org martin.lau@linux.dev
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5 this patch: 5
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 36 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs_no_alu32 on s390x with gcc

Commit Message

Hou Tao Oct. 11, 2022, 7:11 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

According to the implementation of RCU Tasks Trace, it invokes
->postscan_func() to wait for one RCU-tasks-trace grace period and
rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
normal RCU grace period in turn, so one RCU-tasks-trace grace period
will imply one RCU grace period.

So there is no need to do call_rcu() again in the callback of
call_rcu_tasks_trace() and it can just free these elements directly.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/memalloc.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

Comments

Paul E. McKenney Oct. 11, 2022, 9:07 a.m. UTC | #1
On Tue, Oct 11, 2022 at 03:11:26PM +0800, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
> 
> According to the implementation of RCU Tasks Trace, it invokes
> ->postscan_func() to wait for one RCU-tasks-trace grace period and
> rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
> normal RCU grace period in turn, so one RCU-tasks-trace grace period
> will imply one RCU grace period.
> 
> So there is no need to do call_rcu() again in the callback of
> call_rcu_tasks_trace() and it can just free these elements directly.

This is true, but this is an implementation detail that is not guaranteed
in future versions of the kernel.  But if this additional call_rcu()
is causing trouble, I could add some API member that returned true in
kernels where it does happen to be the case that call_rcu_tasks_trace()
implies a call_rcu()-style grace period.

The BPF memory allocator could then complain or adapt, as appropriate.

Thoughts?

							Thanx, Paul

> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  kernel/bpf/memalloc.c | 17 ++++++-----------
>  1 file changed, 6 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 5f83be1d2018..6f32dddc804f 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -209,6 +209,9 @@ static void free_one(struct bpf_mem_cache *c, void *obj)
>  	kfree(obj);
>  }
>  
> +/* Now RCU Tasks grace period implies RCU grace period, so no need to do
> + * an extra call_rcu().
> + */
>  static void __free_rcu(struct rcu_head *head)
>  {
>  	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> @@ -220,13 +223,6 @@ static void __free_rcu(struct rcu_head *head)
>  	atomic_set(&c->call_rcu_in_progress, 0);
>  }
>  
> -static void __free_rcu_tasks_trace(struct rcu_head *head)
> -{
> -	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> -
> -	call_rcu(&c->rcu, __free_rcu);
> -}
> -
>  static void enque_to_free(struct bpf_mem_cache *c, void *obj)
>  {
>  	struct llist_node *llnode = obj;
> @@ -252,11 +248,10 @@ static void do_call_rcu(struct bpf_mem_cache *c)
>  		 * from __free_rcu() and from drain_mem_cache().
>  		 */
>  		__llist_add(llnode, &c->waiting_for_gp);
> -	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
> -	 * Then use call_rcu() to wait for normal progs to finish
> -	 * and finally do free_one() on each element.
> +	/* Use call_rcu_tasks_trace() to wait for both sleepable and normal
> +	 * progs to finish and finally do free_one() on each element.
>  	 */
> -	call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
> +	call_rcu_tasks_trace(&c->rcu, __free_rcu);
>  }
>  
>  static void free_bulk(struct bpf_mem_cache *c)
> -- 
> 2.29.2
>
Hou Tao Oct. 11, 2022, 11:31 a.m. UTC | #2
Hi,

On 10/11/2022 5:07 PM, Paul E. McKenney wrote:
> On Tue, Oct 11, 2022 at 03:11:26PM +0800, Hou Tao wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
>> According to the implementation of RCU Tasks Trace, it invokes
>> ->postscan_func() to wait for one RCU-tasks-trace grace period and
>> rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
>> normal RCU grace period in turn, so one RCU-tasks-trace grace period
>> will imply one RCU grace period.
>>
>> So there is no need to do call_rcu() again in the callback of
>> call_rcu_tasks_trace() and it can just free these elements directly.
> This is true, but this is an implementation detail that is not guaranteed
> in future versions of the kernel.  But if this additional call_rcu()
> is causing trouble, I could add some API member that returned true in
> kernels where it does happen to be the case that call_rcu_tasks_trace()
> implies a call_rcu()-style grace period.
>
> The BPF memory allocator could then complain or adapt, as appropriate.
>
> Thoughts?
It is indeed an implementation detail. But in an idle KVM guest, for memory
reclamation in the bpf memory allocator an RCU Tasks Trace grace period is
about 30 ms and an RCU grace period is about 20 ms. Under stress, the grace
periods will be much longer. If the extra RCU grace period can be removed,
this memory can be reclaimed more quickly, which helps under memory pressure.

Now it seems we can use the RCU polling APIs (e.g. get_state_synchronize_rcu()
and poll_state_synchronize_rcu()) to check whether or not an RCU grace period
has passed. But that needs to add at least one unsigned long to the object
being freed. The extra memory overhead may be OK for the bpf memory allocator,
but it is not for small objects. So could you please show an example of how
these new APIs work? Does the to-be-freed object need to be modified?
Paul E. McKenney Oct. 12, 2022, 6:36 a.m. UTC | #3
On Tue, Oct 11, 2022 at 07:31:28PM +0800, Hou Tao wrote:
> Hi,
> 
> On 10/11/2022 5:07 PM, Paul E. McKenney wrote:
> > On Tue, Oct 11, 2022 at 03:11:26PM +0800, Hou Tao wrote:
> >> From: Hou Tao <houtao1@huawei.com>
> >>
> >> According to the implementation of RCU Tasks Trace, it invokes
> >> ->postscan_func() to wait for one RCU-tasks-trace grace period and
> >> rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
> >> normal RCU grace period in turn, so one RCU-tasks-trace grace period
> >> will imply one RCU grace period.
> >>
> >> So there is no need to do call_rcu() again in the callback of
> >> call_rcu_tasks_trace() and it can just free these elements directly.
> > This is true, but this is an implementation detail that is not guaranteed
> > in future versions of the kernel.  But if this additional call_rcu()
> > is causing trouble, I could add some API member that returned true in
> > kernels where it does happen to be the case that call_rcu_tasks_trace()
> > implies a call_rcu()-style grace period.
> >
> > The BPF memory allocator could then complain or adapt, as appropriate.
> >
> > Thoughts?
> It is indeed an implementation detail. But in an idle KVM guest, for memory
> reclamation in the bpf memory allocator an RCU Tasks Trace grace period is
> about 30 ms and an RCU grace period is about 20 ms. Under stress, the grace
> periods will be much longer. If the extra RCU grace period can be removed,
> this memory can be reclaimed more quickly, which helps under memory pressure.

I understand the benefits.  We just need to get a safe way to take
advantage of them.

> Now it seems we can use the RCU polling APIs (e.g. get_state_synchronize_rcu()
> and poll_state_synchronize_rcu()) to check whether or not an RCU grace period
> has passed. But that needs to add at least one unsigned long to the object
> being freed. The extra memory overhead may be OK for the bpf memory allocator,
> but it is not for small objects. So could you please show an example of how
> these new APIs work? Does the to-be-freed object need to be modified?

Good point on the polling APIs, more on this below.

I was thinking in terms of an API like this:

	static inline bool rcu_trace_implies_rcu_gp(void)
	{
		return true;
	}

Along with comments on the synchronize_rcu() pointing at the
rcu_trace_implies_rcu_gp().

Another approach is to wait for the grace periods concurrently, but this
requires not one but two rcu_head structures.

Back to the polling API.  Are these things freed individually, or can
they be grouped?  If they can be grouped, the storage for the grace-period
state could be associated with the group.
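
For instance, something like the following sketch.  The batch structure
and helper names below are made up for this example, not taken from the
patch:

	struct free_batch {
		struct llist_head objs;		/* objects waiting to be freed */
		unsigned long rcu_cookie;	/* RCU grace-period cookie */
	};

	/* When the batch is queued for freeing, snapshot the GP state. */
	b->rcu_cookie = get_state_synchronize_rcu();

	/* Later, free the whole batch (free_batch_objs() is a
	 * hypothetical helper) only once a normal RCU grace period
	 * has elapsed since the snapshot.
	 */
	if (poll_state_synchronize_rcu(b->rcu_cookie))
		free_batch_objs(b);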

							Thanx, Paul

Hou Tao Oct. 12, 2022, 9:26 a.m. UTC | #4
Hi,

On 10/12/2022 2:36 PM, Paul E. McKenney wrote:
> On Tue, Oct 11, 2022 at 07:31:28PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/11/2022 5:07 PM, Paul E. McKenney wrote:
>>> On Tue, Oct 11, 2022 at 03:11:26PM +0800, Hou Tao wrote:
>>>> From: Hou Tao <houtao1@huawei.com>
>>>>
>>>> According to the implementation of RCU Tasks Trace, it invokes
>>>> ->postscan_func() to wait for one RCU-tasks-trace grace period and
>>>> rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
>>>> normal RCU grace period in turn, so one RCU-tasks-trace grace period
>>>> will imply one RCU grace period.
>>>>
>>>> So there is no need to do call_rcu() again in the callback of
>>>> call_rcu_tasks_trace() and it can just free these elements directly.
>>> This is true, but this is an implementation detail that is not guaranteed
>>> in future versions of the kernel.  But if this additional call_rcu()
>>> is causing trouble, I could add some API member that returned true in
>>> kernels where it does happen to be the case that call_rcu_tasks_trace()
>>> implies a call_rcu()-style grace period.
>>>
>>> The BPF memory allocator could then complain or adapt, as appropriate.
>>>
>>> Thoughts?
>> It is indeed an implementation detail. But in an idle KVM guest, for memory
>> reclamation in the bpf memory allocator an RCU Tasks Trace grace period is
>> about 30 ms and an RCU grace period is about 20 ms. Under stress, the grace
>> periods will be much longer. If the extra RCU grace period can be removed,
>> this memory can be reclaimed more quickly, which helps under memory pressure.
> I understand the benefits.  We just need to get a safe way to take
> advantage of them.
>
>> Now it seems we can use the RCU polling APIs (e.g. get_state_synchronize_rcu()
>> and poll_state_synchronize_rcu()) to check whether or not an RCU grace period
>> has passed. But that needs to add at least one unsigned long to the object
>> being freed. The extra memory overhead may be OK for the bpf memory allocator,
>> but it is not for small objects. So could you please show an example of how
>> these new APIs work? Does the to-be-freed object need to be modified?
> Good point on the polling APIs, more on this below.
>
> I was thinking in terms of an API like this:
>
> 	static inline bool rcu_trace_implies_rcu_gp(void)
> 	{
> 		return true;
> 	}
>
> Along with comments on the synchronize_rcu() pointing at the
> rcu_trace_implies_rcu_gp().
It is a simple API and the modifications for call_rcu_tasks_trace() users will
also be simple. The callback of call_rcu_tasks_trace() will invoke
rcu_trace_implies_rcu_gp(); if it returns true, the callback invokes the
callback for call_rcu() directly, else it does so through call_rcu().
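For example, the callback could look like this (a sketch only, reusing
the __free_rcu() name from the patch above):

	static void __free_rcu_tasks_trace(struct rcu_head *head)
	{
		/* If an RCU Tasks Trace grace period also implies a
		 * normal RCU grace period, the objects are already safe
		 * from normal RCU readers and can be freed right away.
		 */
		if (rcu_trace_implies_rcu_gp())
			__free_rcu(head);
		else
			call_rcu(head, __free_rcu);
	}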
>
> Another approach is to wait for the grace periods concurrently, but this
> requires not one but two rcu_head structures.
Besides the extra space usage, does it also complicate the logic in the
callback function because it needs to handle concurrency?
>
> Back to the polling API.  Are these things freed individually, or can
> they be grouped?  If they can be grouped, the storage for the grace-period
> state could be associated with the group.
As said above, it may be OK for the bpf memory allocator because it frees
elements in batches, but bpf local storage and its elements are freed
individually. So if the implication of an RCU Tasks Trace grace period will
not change in the foreseeable future, I think adding rcu_trace_implies_rcu_gp()
and using it in bpf is a good idea. What do you think?
Paul E. McKenney Oct. 12, 2022, 4:11 p.m. UTC | #5
On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
> Hi,
> 
> On 10/12/2022 2:36 PM, Paul E. McKenney wrote:
> > On Tue, Oct 11, 2022 at 07:31:28PM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 10/11/2022 5:07 PM, Paul E. McKenney wrote:
> >>> On Tue, Oct 11, 2022 at 03:11:26PM +0800, Hou Tao wrote:
> >>>> From: Hou Tao <houtao1@huawei.com>
> >>>>
> >>>> According to the implementation of RCU Tasks Trace, it invokes
> >>>> ->postscan_func() to wait for one RCU-tasks-trace grace period and
> >>>> rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
> >>>> normal RCU grace period in turn, so one RCU-tasks-trace grace period
> >>>> will imply one RCU grace period.
> >>>>
> >>>> So there is no need to do call_rcu() again in the callback of
> >>>> call_rcu_tasks_trace() and it can just free these elements directly.
> >>> This is true, but this is an implementation detail that is not guaranteed
> >>> in future versions of the kernel.  But if this additional call_rcu()
> >>> is causing trouble, I could add some API member that returned true in
> >>> kernels where it does happen to be the case that call_rcu_tasks_trace()
> >>> implies a call_rcu()-style grace period.
> >>>
> >>> The BPF memory allocator could then complain or adapt, as appropriate.
> >>>
> >>> Thoughts?
> >> It is indeed an implementation detail. But in an idle KVM guest, for memory
> >> reclamation in the bpf memory allocator an RCU Tasks Trace grace period is
> >> about 30 ms and an RCU grace period is about 20 ms. Under stress, the grace
> >> periods will be much longer. If the extra RCU grace period can be removed,
> >> this memory can be reclaimed more quickly, which helps under memory pressure.
> > I understand the benefits.  We just need to get a safe way to take
> > advantage of them.
> >
> >> Now it seems we can use the RCU polling APIs (e.g. get_state_synchronize_rcu()
> >> and poll_state_synchronize_rcu()) to check whether or not an RCU grace period
> >> has passed. But that needs to add at least one unsigned long to the object
> >> being freed. The extra memory overhead may be OK for the bpf memory allocator,
> >> but it is not for small objects. So could you please show an example of how
> >> these new APIs work? Does the to-be-freed object need to be modified?
> > Good point on the polling APIs, more on this below.
> >
> > I was thinking in terms of an API like this:
> >
> > 	static inline bool rcu_trace_implies_rcu_gp(void)
> > 	{
> > 		return true;
> > 	}
> >
> > Along with comments on the synchronize_rcu() pointing at the
> > rcu_trace_implies_rcu_gp().
> 
> It is a simple API and the modifications for call_rcu_tasks_trace() users will
> also be simple. The callback of call_rcu_tasks_trace() will invoke
> rcu_trace_implies_rcu_gp(); if it returns true, the callback invokes the
> callback for call_rcu() directly, else it does so through call_rcu().

Sounds good!

Please note that if the callback function just does kfree() or equivalent,
this will work fine.  If it acquires spinlocks, you may need to do
local_bh_disable() before invoking it directly and local_bh_enable()
afterwards.
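
That is, something like this sketch, using the names from the patch
above:

	if (rcu_trace_implies_rcu_gp()) {
		/* RCU callbacks normally run in softirq context with
		 * BH disabled, so disable BH around the direct call in
		 * case the callback's locking depends on that.
		 */
		local_bh_disable();
		__free_rcu(&c->rcu);
		local_bh_enable();
	} else {
		call_rcu(&c->rcu, __free_rcu);
	}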

> > Another approach is to wait for the grace periods concurrently, but this
> > requires not one but two rcu_head structures.
> 
> Besides the extra space usage, does it also complicate the logic in the
> callback function because it needs to handle concurrency?

Definitely!!!

Perhaps something like this:

	static void cbf(struct rcu_head *rhp)
	{
		p = container_of(rhp, ...);

		if (atomic_dec_and_test(&p->cbs_awaiting))
			kfree(p);
	}

	atomic_set(&p->cbs_awaiting, 2);
	call_rcu(&p->rh1, cbf);
	call_rcu_tasks_trace(&p->rh2, cbf);

Is this worth it?  I have no idea.  I must defer to you.

> > Back to the polling API.  Are these things freed individually, or can
> > they be grouped?  If they can be grouped, the storage for the grace-period
> > state could be associated with the group.
> 
> As said above, it may be OK for the bpf memory allocator because it frees
> elements in batches, but bpf local storage and its elements are freed
> individually. So if the implication of an RCU Tasks Trace grace period will
> not change in the foreseeable future, I think adding rcu_trace_implies_rcu_gp()
> and using it in bpf is a good idea. What do you think?

Maybe the BPF memory allocator does it one way and BPF local storage
does it another way.

How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
you carry it with your series?  That way I don't have an unused function
in -rcu and you don't have to wait for me to send it upstream?

							Thanx, Paul

Hou Tao Oct. 13, 2022, 1:25 a.m. UTC | #6
Hi,

On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
> On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/12/2022 2:36 PM, Paul E. McKenney wrote:
>>> On Tue, Oct 11, 2022 at 07:31:28PM +0800, Hou Tao wrote:
>>>> Hi,
>>>>
>>>> On 10/11/2022 5:07 PM, Paul E. McKenney wrote:
>>>>> On Tue, Oct 11, 2022 at 03:11:26PM +0800, Hou Tao wrote:
>>>>>> From: Hou Tao <houtao1@huawei.com>
>>>>>>
>>>>>> According to the implementation of RCU Tasks Trace, it invokes
>>>>>> ->postscan_func() to wait for one RCU-tasks-trace grace period and
>>>>>> rcu_tasks_trace_postscan() invokes synchronize_rcu() to wait for one
>>>>>> normal RCU grace period in turn, so one RCU-tasks-trace grace period
>>>>>> will imply one RCU grace period.
>>>>>>
>>>>>> So there is no need to do call_rcu() again in the callback of
>>>>>> call_rcu_tasks_trace() and it can just free these elements directly.
>>>>> This is true, but this is an implementation detail that is not guaranteed
>>>>> in future versions of the kernel.  But if this additional call_rcu()
>>>>> is causing trouble, I could add some API member that returned true in
>>>>> kernels where it does happen to be the case that call_rcu_tasks_trace()
>>>>> implies a call_rcu()-style grace period.
>>>>>
>>>>> The BPF memory allocator could then complain or adapt, as appropriate.
>>>>>
>>>>> Thoughts?
>>>> It is indeed an implementation detail. But in an idle KVM guest, for memory
>>>> reclamation in the bpf memory allocator an RCU Tasks Trace grace period is
>>>> about 30 ms and an RCU grace period is about 20 ms. Under stress, the grace
>>>> periods will be much longer. If the extra RCU grace period can be removed,
>>>> this memory can be reclaimed more quickly, which helps under memory pressure.
>>> I understand the benefits.  We just need to get a safe way to take
>>> advantage of them.
>>>
>>>> Now it seems we can use the RCU polling APIs (e.g. get_state_synchronize_rcu()
>>>> and poll_state_synchronize_rcu()) to check whether or not an RCU grace period
>>>> has passed. But that needs to add at least one unsigned long to the object
>>>> being freed. The extra memory overhead may be OK for the bpf memory allocator,
>>>> but it is not for small objects. So could you please show an example of how
>>>> these new APIs work? Does the to-be-freed object need to be modified?
>>> Good point on the polling APIs, more on this below.
>>>
>>> I was thinking in terms of an API like this:
>>>
>>> 	static inline bool rcu_trace_implies_rcu_gp(void)
>>> 	{
>>> 		return true;
>>> 	}
>>>
>>> Along with comments on the synchronize_rcu() pointing at the
>>> rcu_trace_implies_rcu_gp().
>> It is a simple API and the modifications for call_rcu_tasks_trace() users will
>> also be simple. The callback of call_rcu_tasks_trace() will invoke
>> rcu_trace_implies_rcu_gp(); if it returns true, the callback invokes the
>> callback for call_rcu() directly, else it does so through call_rcu().
> Sounds good!
>
> Please note that if the callback function just does kfree() or equivalent,
> this will work fine.  If it acquires spinlocks, you may need to do
> local_bh_disable() before invoking it directly and local_bh_enable()
> afterwards.
What is the purpose of invoking local_bh_disable()? Is it to ensure the
callback is called in soft-irq context, or something else? As far as I know,
Tasks RCU already invokes its callbacks with soft-irqs disabled.
>
>>> Another approach is to wait for the grace periods concurrently, but this
>>> requires not one but two rcu_head structures.
>> Besides the extra space usage, does it also complicate the logic in the
>> callback function because it needs to handle concurrency?
> Definitely!!!
>
> Perhaps something like this:
>
> 	static void cbf(struct rcu_head *rhp)
> 	{
> 		p = container_of(rhp, ...);
>
> 		if (atomic_dec_and_test(&p->cbs_awaiting))
> 			kfree(p);
> 	}
>
> 	atomic_set(&p->cbs_awaiting, 2);
> 	call_rcu(&p->rh1, cbf);
> 	call_rcu_tasks_trace(&p->rh2, cbf);
>
> Is this worth it?  I have no idea.  I must defer to you.
I still prefer the simple solution.
>
>>> Back to the polling API.  Are these things freed individually, or can
>>> they be grouped?  If they can be grouped, the storage for the grace-period
>>> state could be associated with the group.
>> As said above, it may be OK for the bpf memory allocator because it frees
>> elements in batches, but bpf local storage and its elements are freed
>> individually. So if the implication of an RCU Tasks Trace grace period will
>> not change in the foreseeable future, I think adding rcu_trace_implies_rcu_gp()
>> and using it in bpf is a good idea. What do you think?
> Maybe the BPF memory allocator does it one way and BPF local storage
> does it another way.
Why not? Maybe the bpf experts think the space overhead is also reasonable in
the BPF local storage case.
>
> How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
> you carry it with your series?  That way I don't have an unused function
> in -rcu and you don't have to wait for me to send it upstream?
Sound reasonable to me. Also thanks for your suggestions.
Hou Tao Oct. 13, 2022, 1:41 a.m. UTC | #7
Hi,

On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
> On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/12/2022 2:36 PM, Paul E. McKenney wrote:
SNIP
>> As said above, it may be OK for the bpf memory allocator because it frees
>> elements in batches, but bpf local storage and its elements are freed
>> individually. So if the implication of an RCU Tasks Trace grace period will
>> not change in the foreseeable future, I think adding rcu_trace_implies_rcu_gp()
>> and using it in bpf is a good idea. What do you think?
> Maybe the BPF memory allocator does it one way and BPF local storage
> does it another way.
Another question: I just found out that there are new APIs for RCU polling
(e.g. get_state_synchronize_rcu_full()). According to the comments, the
advantage of the new APIs is that they will never miss a passed grace period.
So for this case, is get_state_synchronize_rcu() enough? Or should I switch
to get_state_synchronize_rcu_full() instead?

Regards
Martin KaFai Lau Oct. 13, 2022, 5:07 a.m. UTC | #8
On 10/12/22 6:25 PM, Hou Tao wrote:
>>>> Back to the polling API.  Are these things freed individually, or can
>>>> they be grouped?  If they can be grouped, the storage for the grace-period
>>>> state could be associated with the group.
>>> As said above, it may be OK for the bpf memory allocator because it frees
>>> elements in batches, but bpf local storage and its elements are freed
>>> individually. So if the implication of an RCU Tasks Trace grace period will
>>> not change in the foreseeable future, I think adding rcu_trace_implies_rcu_gp()
>>> and using it in bpf is a good idea. What do you think?
>> Maybe the BPF memory allocator does it one way and BPF local storage
>> does it another way.
> Why not? Maybe the bpf experts think the space overhead is also reasonable in
> the BPF local storage case.

There is only an 8-byte hole left in 'struct bpf_local_storage_elem', so it is 
precious.  Space overhead aside, only deletion of a local storage requires 
call_rcu_tasks_trace().  The common use case for local storage is to allocate 
once and stay there for the whole lifetime of the sk/task/inode.  Thus, 
deletion should happen very infrequently.

It would still be nice to optimize, though, if it does not need extra space, 
and that seems possible from reading the thread.
Hou Tao Oct. 13, 2022, 9:02 a.m. UTC | #9
Hi,

On 10/13/2022 1:07 PM, Martin KaFai Lau wrote:
> On 10/12/22 6:25 PM, Hou Tao wrote:
>>>>> Back to the polling API.  Are these things freed individually, or can
>>>>> they be grouped?  If they can be grouped, the storage for the grace-period
>>>>> state could be associated with the group.
>>>> As said above, it may be OK for the bpf memory allocator because it frees
>>>> elements in batches, but bpf local storage and its elements are freed
>>>> individually. So if the implication of an RCU Tasks Trace grace period will
>>>> not change in the foreseeable future, I think adding rcu_trace_implies_rcu_gp()
>>>> and using it in bpf is a good idea. What do you think?
>>> Maybe the BPF memory allocator does it one way and BPF local storage
>>> does it another way.
>> Why not? Maybe the bpf experts think the space overhead is also reasonable in
>> the BPF local storage case.
>
> There is only an 8-byte hole left in 'struct bpf_local_storage_elem', so it is
> precious.  Space overhead aside, only deletion of a local storage requires
> call_rcu_tasks_trace().  The common use case for local storage is to allocate
> once and stay there for the whole lifetime of the sk/task/inode.  Thus,
> deletion should happen very infrequently.
>
> It would still be nice to optimize, though, if it does not need extra space,
> and that seems possible from reading the thread.
Understood. Yes, using the newly added rcu_trace_implies_rcu_gp() there will
be no space overhead. I will use the rcu_trace_implies_rcu_gp() approach in
all these cases and defer the use of the RCU polling APIs to future work.
Paul E. McKenney Oct. 13, 2022, 7 p.m. UTC | #10
On Thu, Oct 13, 2022 at 09:25:31AM +0800, Hou Tao wrote:
> SNIP
> > How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
> > you carry it with your series?  That way I don't have an unused function
> > in -rcu and you don't have to wait for me to send it upstream?
> Sound reasonable to me. Also thanks for your suggestions.

Here you go!  Thoughts?

							Thanx, Paul

------------------------------------------------------------------------

commit 2eac2f7a9a6d8921e8084a6acdffa595e99dbd17
Author: Paul E. McKenney <paulmck@kernel.org>
Date:   Thu Oct 13 11:54:13 2022 -0700

    rcu-tasks: Provide rcu_trace_implies_rcu_gp()
    
    As an accident of implementation, an RCU Tasks Trace grace period also
    acts as an RCU grace period.  However, this could change at any time.
    This commit therefore creates an rcu_trace_implies_rcu_gp() that currently
    returns true to codify this accident.  Code relying on this accident
    must call this function to verify that this accident is still happening.
    
    Reported-by: Hou Tao <houtao@huaweicloud.com>
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
    Cc: Alexei Starovoitov <ast@kernel.org>
    Cc: Martin KaFai Lau <martin.lau@linux.dev>

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 08605ce7379d7..8822f06e4b40c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { }
 static inline void exit_tasks_rcu_finish(void) { }
 #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
 
+/**
+ * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
+ *
+ * As an accident of implementation, an RCU Tasks Trace grace period also
+ * acts as an RCU grace period.  However, this could change at any time.
+ * Code relying on this accident must call this function to verify that
+ * this accident is still happening.
+ *
+ * You have been warned!
+ */
+static inline bool rcu_trace_implies_rcu_gp(void) { return true; }
+
 /**
  * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
  *
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index b0b885e071fa8..fe9840d90e960 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1535,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
 {
 	// Wait for late-stage exiting tasks to finish exiting.
 	// These might have passed the call to exit_tasks_rcu_finish().
+
+	// If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
 	synchronize_rcu();
 	// Any tasks that exit after this point will set
 	// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
Paul E. McKenney Oct. 13, 2022, 7:04 p.m. UTC | #11
On Thu, Oct 13, 2022 at 09:41:46AM +0800, Hou Tao wrote:
> Hi,
> 
> SNIP
> 
> Another question: I just found out that there are new APIs for RCU polling
> (e.g. get_state_synchronize_rcu_full()). According to the comments, the
> advantage of the new APIs is that they will never miss a passed grace period.
> So for this case, is get_state_synchronize_rcu() enough? Or should I switch
> to get_state_synchronize_rcu_full() instead?

I suggest starting with get_state_synchronize_rcu(), and moving to the
_full() variants only if experience shows that it is necessary.

Please note that these functions work with normal RCU, that is,
call_rcu(), but not call_rcu_tasks(), call_rcu_tasks_trace(), or
call_rcu_rude().  Please note also that SRCU has its own set of polling
APIs, for example, get_state_synchronize_srcu().

								Thanx, Paul

Hou Tao Oct. 14, 2022, 4:04 a.m. UTC | #12
Hi,

On 10/14/2022 3:04 AM, Paul E. McKenney wrote:
> On Thu, Oct 13, 2022 at 09:41:46AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
>> SNIP
>> Another question: I just found out that there are new APIs for RCU polling
>> (e.g. get_state_synchronize_rcu_full()). According to the comments, the
>> advantage of the new APIs is that they will never miss a passed grace period.
>> So for this case, is get_state_synchronize_rcu() enough? Or should I switch
>> to get_state_synchronize_rcu_full() instead?
> I suggest starting with get_state_synchronize_rcu(), and moving to the
> _full() variants only if experience shows that it is necessary.
>
> Please note that these functions work with normal RCU, that is,
> call_rcu(), but not call_rcu_tasks(), call_rcu_tasks_trace(), or
> call_rcu_rude().  Please note also that SRCU has its own set of polling
> APIs, for example, get_state_synchronize_srcu().
I see. Thanks for your suggestions and detailed explanations.
Hi,

On 10/14/2022 3:00 AM, Paul E. McKenney wrote:
> On Thu, Oct 13, 2022 at 09:25:31AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
>>> On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
SNIP
>>> How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
>>> you carry it with your series?  That way I don't have an unused function
>>> in -rcu and you don't have to wait for me to send it upstream?
>> Sound reasonable to me. Also thanks for your suggestions.
> Here you go!  Thoughts?
It looks great and thanks for it.
>
> 							Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 2eac2f7a9a6d8921e8084a6acdffa595e99dbd17
> Author: Paul E. McKenney <paulmck@kernel.org>
> Date:   Thu Oct 13 11:54:13 2022 -0700
>
>     rcu-tasks: Provide rcu_trace_implies_rcu_gp()
>     
>     As an accident of implementation, an RCU Tasks Trace grace period also
>     acts as an RCU grace period.  However, this could change at any time.
>     This commit therefore creates an rcu_trace_implies_rcu_gp() that currently
>     returns true to codify this accident.  Code relying on this accident
>     must call this function to verify that this accident is still happening.
>     
>     Reported-by: Hou Tao <houtao@huaweicloud.com>
>     Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>     Cc: Alexei Starovoitov <ast@kernel.org>
>     Cc: Martin KaFai Lau <martin.lau@linux.dev>
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 08605ce7379d7..8822f06e4b40c 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { }
>  static inline void exit_tasks_rcu_finish(void) { }
>  #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
>  
> +/**
> + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
> + *
> + * As an accident of implementation, an RCU Tasks Trace grace period also
> + * acts as an RCU grace period.  However, this could change at any time.
> + * Code relying on this accident must call this function to verify that
> + * this accident is still happening.
> + *
> + * You have been warned!
> + */
> +static inline bool rcu_trace_implies_rcu_gp(void) { return true; }
> +
>  /**
>   * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
>   *
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index b0b885e071fa8..fe9840d90e960 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -1535,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
>  {
>  	// Wait for late-stage exiting tasks to finish exiting.
>  	// These might have passed the call to exit_tasks_rcu_finish().
> +
> +	// If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
>  	synchronize_rcu();
>  	// Any tasks that exit after this point will set
>  	// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
Paul E. McKenney Oct. 14, 2022, 12:15 p.m. UTC | #14
On Fri, Oct 14, 2022 at 12:20:19PM +0800, Hou Tao wrote:
> Hi,
> 
> On 10/14/2022 3:00 AM, Paul E. McKenney wrote:
> > On Thu, Oct 13, 2022 at 09:25:31AM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
> >>> On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
> SNIP
> >>> How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
> >>> you carry it with your series?  That way I don't have an unused function
> >>> in -rcu and you don't have to wait for me to send it upstream?
> >> Sounds reasonable to me. Also, thanks for your suggestions.
> > Here you go!  Thoughts?
> 
> It looks great and thanks for it.

Very good!  I will carry it in -rcu for some time, so please let me know
when/if you pull it into a series.

							Thanx, Paul

> > ------------------------------------------------------------------------
> >
> > commit 2eac2f7a9a6d8921e8084a6acdffa595e99dbd17
> > Author: Paul E. McKenney <paulmck@kernel.org>
> > Date:   Thu Oct 13 11:54:13 2022 -0700
> >
> >     rcu-tasks: Provide rcu_trace_implies_rcu_gp()
> >     
> >     As an accident of implementation, an RCU Tasks Trace grace period also
> >     acts as an RCU grace period.  However, this could change at any time.
> >     This commit therefore creates an rcu_trace_implies_rcu_gp() that currently
> >     returns true to codify this accident.  Code relying on this accident
> >     must call this function to verify that this accident is still happening.
> >     
> >     Reported-by: Hou Tao <houtao@huaweicloud.com>
> >     Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >     Cc: Alexei Starovoitov <ast@kernel.org>
> >     Cc: Martin KaFai Lau <martin.lau@linux.dev>
> >
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index 08605ce7379d7..8822f06e4b40c 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { }
> >  static inline void exit_tasks_rcu_finish(void) { }
> >  #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
> >  
> > +/**
> > + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
> > + *
> > + * As an accident of implementation, an RCU Tasks Trace grace period also
> > + * acts as an RCU grace period.  However, this could change at any time.
> > + * Code relying on this accident must call this function to verify that
> > + * this accident is still happening.
> > + *
> > + * You have been warned!
> > + */
> > +static inline bool rcu_trace_implies_rcu_gp(void) { return true; }
> > +
> >  /**
> >   * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
> >   *
> > diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> > index b0b885e071fa8..fe9840d90e960 100644
> > --- a/kernel/rcu/tasks.h
> > +++ b/kernel/rcu/tasks.h
> > @@ -1535,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
> >  {
> >  	// Wait for late-stage exiting tasks to finish exiting.
> >  	// These might have passed the call to exit_tasks_rcu_finish().
> > +
> > +	// If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
> >  	synchronize_rcu();
> >  	// Any tasks that exit after this point will set
> >  	// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
>
Hou Tao Oct. 17, 2022, 6:55 a.m. UTC | #15
Hi,

On 10/14/2022 8:15 PM, Paul E. McKenney wrote:
> On Fri, Oct 14, 2022 at 12:20:19PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/14/2022 3:00 AM, Paul E. McKenney wrote:
>>> On Thu, Oct 13, 2022 at 09:25:31AM +0800, Hou Tao wrote:
>>>> Hi,
>>>>
>>>> On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
>>>>> On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
>> SNIP
>>>>> How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
>>>>> you carry it with your series?  That way I don't have an unused function
>>>>> in -rcu and you don't have to wait for me to send it upstream?
>>>> Sounds reasonable to me. Also, thanks for your suggestions.
>>> Here you go!  Thoughts?
>> It looks great and thanks for it.
> Very good!  I will carry it in -rcu for some time, so please let me know
> when/if you pull it into a series.
I have already included the patch in the v2 patch set and sent it out [0].

0: https://lore.kernel.org/bpf/20221014113946.965131-1-houtao@huaweicloud.com/T/#t
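
(For reference, a minimal sketch of how the helper can be wired into the
allocator's free path. This is an assumption about the shape of v2, not
a quote from it; see the link above for the actual series.)

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If an RCU Tasks Trace grace period also implies an RCU grace
	 * period, elements can be freed directly; otherwise chain one
	 * extra normal-RCU grace period.
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}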

>
> 							Thanx, Paul
>
>>> ------------------------------------------------------------------------
>>>
>>> commit 2eac2f7a9a6d8921e8084a6acdffa595e99dbd17
>>> Author: Paul E. McKenney <paulmck@kernel.org>
>>> Date:   Thu Oct 13 11:54:13 2022 -0700
>>>
>>>     rcu-tasks: Provide rcu_trace_implies_rcu_gp()
>>>     
>>>     As an accident of implementation, an RCU Tasks Trace grace period also
>>>     acts as an RCU grace period.  However, this could change at any time.
>>>     This commit therefore creates an rcu_trace_implies_rcu_gp() that currently
>>>     returns true to codify this accident.  Code relying on this accident
>>>     must call this function to verify that this accident is still happening.
>>>     
>>>     Reported-by: Hou Tao <houtao@huaweicloud.com>
>>>     Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>>>     Cc: Alexei Starovoitov <ast@kernel.org>
>>>     Cc: Martin KaFai Lau <martin.lau@linux.dev>
>>>
>>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>>> index 08605ce7379d7..8822f06e4b40c 100644
>>> --- a/include/linux/rcupdate.h
>>> +++ b/include/linux/rcupdate.h
>>> @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { }
>>>  static inline void exit_tasks_rcu_finish(void) { }
>>>  #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
>>>  
>>> +/**
>>> + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
>>> + *
>>> + * As an accident of implementation, an RCU Tasks Trace grace period also
>>> + * acts as an RCU grace period.  However, this could change at any time.
>>> + * Code relying on this accident must call this function to verify that
>>> + * this accident is still happening.
>>> + *
>>> + * You have been warned!
>>> + */
>>> +static inline bool rcu_trace_implies_rcu_gp(void) { return true; }
>>> +
>>>  /**
>>>   * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
>>>   *
>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
>>> index b0b885e071fa8..fe9840d90e960 100644
>>> --- a/kernel/rcu/tasks.h
>>> +++ b/kernel/rcu/tasks.h
>>> @@ -1535,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
>>>  {
>>>  	// Wait for late-stage exiting tasks to finish exiting.
>>>  	// These might have passed the call to exit_tasks_rcu_finish().
>>> +
>>> +	// If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
>>>  	synchronize_rcu();
>>>  	// Any tasks that exit after this point will set
>>>  	// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
> .
Paul E. McKenney Oct. 17, 2022, 10:23 a.m. UTC | #16
On Mon, Oct 17, 2022 at 02:55:42PM +0800, Hou Tao wrote:
> Hi,
> 
> On 10/14/2022 8:15 PM, Paul E. McKenney wrote:
> > On Fri, Oct 14, 2022 at 12:20:19PM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 10/14/2022 3:00 AM, Paul E. McKenney wrote:
> >>> On Thu, Oct 13, 2022 at 09:25:31AM +0800, Hou Tao wrote:
> >>>> Hi,
> >>>>
> >>>> On 10/13/2022 12:11 AM, Paul E. McKenney wrote:
> >>>>> On Wed, Oct 12, 2022 at 05:26:26PM +0800, Hou Tao wrote:
> >> SNIP
> >>>>> How about if I produce a patch for rcu_trace_implies_rcu_gp() and let
> >>>>> you carry it with your series?  That way I don't have an unused function
> >>>>> in -rcu and you don't have to wait for me to send it upstream?
> >>>> Sounds reasonable to me. Also, thanks for your suggestions.
> >>> Here you go!  Thoughts?
> >> It looks great and thanks for it.
> > Very good!  I will carry it in -rcu for some time, so please let me know
> > when/if you pull it into a series.
> I have already included the patch in the v2 patch set and sent it out [0].
> 
> 0: https://lore.kernel.org/bpf/20221014113946.965131-1-houtao@huaweicloud.com/T/#t

Thank you, I will drop it from -rcu on my next rebase.  Should you need
to refer to it again, it will remain at tag rcutrace-rcu.2022.10.13a.

							Thanx, Paul

> >>> ------------------------------------------------------------------------
> >>>
> >>> commit 2eac2f7a9a6d8921e8084a6acdffa595e99dbd17
> >>> Author: Paul E. McKenney <paulmck@kernel.org>
> >>> Date:   Thu Oct 13 11:54:13 2022 -0700
> >>>
> >>>     rcu-tasks: Provide rcu_trace_implies_rcu_gp()
> >>>     
> >>>     As an accident of implementation, an RCU Tasks Trace grace period also
> >>>     acts as an RCU grace period.  However, this could change at any time.
> >>>     This commit therefore creates an rcu_trace_implies_rcu_gp() that currently
> >>>     returns true to codify this accident.  Code relying on this accident
> >>>     must call this function to verify that this accident is still happening.
> >>>     
> >>>     Reported-by: Hou Tao <houtao@huaweicloud.com>
> >>>     Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >>>     Cc: Alexei Starovoitov <ast@kernel.org>
> >>>     Cc: Martin KaFai Lau <martin.lau@linux.dev>
> >>>
> >>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> >>> index 08605ce7379d7..8822f06e4b40c 100644
> >>> --- a/include/linux/rcupdate.h
> >>> +++ b/include/linux/rcupdate.h
> >>> @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { }
> >>>  static inline void exit_tasks_rcu_finish(void) { }
> >>>  #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
> >>>  
> >>> +/**
> >>> + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
> >>> + *
> >>> + * As an accident of implementation, an RCU Tasks Trace grace period also
> >>> + * acts as an RCU grace period.  However, this could change at any time.
> >>> + * Code relying on this accident must call this function to verify that
> >>> + * this accident is still happening.
> >>> + *
> >>> + * You have been warned!
> >>> + */
> >>> +static inline bool rcu_trace_implies_rcu_gp(void) { return true; }
> >>> +
> >>>  /**
> >>>   * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
> >>>   *
> >>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> >>> index b0b885e071fa8..fe9840d90e960 100644
> >>> --- a/kernel/rcu/tasks.h
> >>> +++ b/kernel/rcu/tasks.h
> >>> @@ -1535,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
> >>>  {
> >>>  	// Wait for late-stage exiting tasks to finish exiting.
> >>>  	// These might have passed the call to exit_tasks_rcu_finish().
> >>> +
> >>> +	// If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
> >>>  	synchronize_rcu();
> >>>  	// Any tasks that exit after this point will set
> >>>  	// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
> > .
>

Patch

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5f83be1d2018..6f32dddc804f 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -209,6 +209,9 @@  static void free_one(struct bpf_mem_cache *c, void *obj)
 	kfree(obj);
 }
 
+/* Now an RCU Tasks Trace grace period implies an RCU grace period, so there
+ * is no need to do an extra call_rcu().
+ */
 static void __free_rcu(struct rcu_head *head)
 {
 	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
@@ -220,13 +223,6 @@  static void __free_rcu(struct rcu_head *head)
 	atomic_set(&c->call_rcu_in_progress, 0);
 }
 
-static void __free_rcu_tasks_trace(struct rcu_head *head)
-{
-	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
-
-	call_rcu(&c->rcu, __free_rcu);
-}
-
 static void enque_to_free(struct bpf_mem_cache *c, void *obj)
 {
 	struct llist_node *llnode = obj;
@@ -252,11 +248,10 @@  static void do_call_rcu(struct bpf_mem_cache *c)
 		 * from __free_rcu() and from drain_mem_cache().
 		 */
 		__llist_add(llnode, &c->waiting_for_gp);
-	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
-	 * Then use call_rcu() to wait for normal progs to finish
-	 * and finally do free_one() on each element.
+	/* Use call_rcu_tasks_trace() to wait for both sleepable and normal
+	 * progs to finish and finally do free_one() on each element.
 	 */
-	call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+	call_rcu_tasks_trace(&c->rcu, __free_rcu);
 }
 
 static void free_bulk(struct bpf_mem_cache *c)
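
To illustrate why the single call_rcu_tasks_trace() in do_call_rcu()
above is enough for both program types, here is an illustrative
reader-side sketch (struct obj and lookup() are hypothetical; the
locking APIs are the kernel's):

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>

struct obj;				/* hypothetical map element */
static struct obj *lookup(u32 key);	/* hypothetical lookup */

/* Sleepable progs read under rcu_read_lock_trace(); a Tasks Trace
 * grace period waits for these readers directly.
 */
static void sleepable_reader(u32 key)
{
	struct obj *p;

	rcu_read_lock_trace();
	p = lookup(key);
	/* ... may sleep while using p ... */
	rcu_read_unlock_trace();
}

/* Non-sleepable progs read under plain rcu_read_lock(); these are
 * covered as well because rcu_tasks_trace_postscan() invokes
 * synchronize_rcu() before the Tasks Trace grace period ends.
 */
static void normal_reader(u32 key)
{
	struct obj *p;

	rcu_read_lock();
	p = lookup(key);
	rcu_read_unlock();
}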