diff mbox series

percpu_ref: call wake_up_all() after percpu_ref_put() completes

Message ID 20220407103335.36885-1-zhengqi.arch@bytedance.com (mailing list archive)
State New
Headers show
Series percpu_ref: call wake_up_all() after percpu_ref_put() completes | expand

Commit Message

Qi Zheng April 7, 2022, 10:33 a.m. UTC
In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
before calling percpu_ref_put(), which will cause the value of
percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
returns.

	CPU0				CPU1

percpu_ref_switch_to_atomic_sync(&ref)
--> percpu_ref_switch_to_atomic(&ref)
    --> percpu_ref_get(ref);	/* put after confirmation */
	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);

					percpu_ref_switch_to_atomic_rcu
					--> percpu_ref_call_confirm_rcu
					    --> data->confirm_switch = NULL;
						wake_up_all(&percpu_ref_switch_waitq);

    /* here waiting to wake up */
    wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
						(A)percpu_ref_put(ref);
/* The value of &ref is unstable! */
percpu_ref_is_zero(&ref)
						(B)percpu_ref_put(ref);

As shown above, assuming that the counts on each cpu add up to 0 before
calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
to atomic mode, percpu_ref_is_zero() can return true. But actually it will
return different values in the two cases of A and B, which is not what
we expected.

Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
just to ensure that the conversion to atomic mode is completed, but it
should not return with an extra reference count.

Calling wake_up_all() after percpu_ref_put() ensures that the value of
percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
So just do it.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 lib/percpu-refcount.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Comments

Andrew Morton April 7, 2022, 10:57 p.m. UTC | #1
(cc Ming Lei)

On Thu,  7 Apr 2022 18:33:35 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:

> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> before calling percpu_ref_put(), which will cause the value of
> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> returns.
> 
> 	CPU0				CPU1
> 
> percpu_ref_switch_to_atomic_sync(&ref)
> --> percpu_ref_switch_to_atomic(&ref)
>     --> percpu_ref_get(ref);	/* put after confirmation */
> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> 
> 					percpu_ref_switch_to_atomic_rcu
> 					--> percpu_ref_call_confirm_rcu
> 					    --> data->confirm_switch = NULL;
> 						wake_up_all(&percpu_ref_switch_waitq);
> 
>     /* here waiting to wake up */
>     wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> 						(A)percpu_ref_put(ref);
> /* The value of &ref is unstable! */
> percpu_ref_is_zero(&ref)
> 						(B)percpu_ref_put(ref);
> 
> As shown above, assuming that the counts on each cpu add up to 0 before
> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
> return different values in the two cases of A and B, which is not what
> we expected.
> 
> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> just to ensure that the conversion to atomic mode is completed, but it
> should not return with an extra reference count.
> 
> Calling wake_up_all() after percpu_ref_put() ensures that the value of
> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> So just do it.

Thanks.  I'll grab this, but shall await input from others before doing
anything else with it.

> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> +++ b/lib/percpu-refcount.c
> @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
>  
>  	data->confirm_switch(ref);
>  	data->confirm_switch = NULL;
> -	wake_up_all(&percpu_ref_switch_waitq);
>  
>  	if (!data->allow_reinit)
>  		__percpu_ref_exit(ref);
>  
>  	/* drop ref from percpu_ref_switch_to_atomic() */
>  	percpu_ref_put(ref);
> +
> +	wake_up_all(&percpu_ref_switch_waitq);
>  }
>  
>  static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
Dennis Zhou April 8, 2022, 12:39 a.m. UTC | #2
Hello,

On Thu, Apr 07, 2022 at 03:57:52PM -0700, Andrew Morton wrote:
> (cc Ming Lei)
> 
> On Thu,  7 Apr 2022 18:33:35 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
> > In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> > before calling percpu_ref_put(), which will cause the value of
> > percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> > returns.
> > 
> > 	CPU0				CPU1
> > 
> > percpu_ref_switch_to_atomic_sync(&ref)
> > --> percpu_ref_switch_to_atomic(&ref)
> >     --> percpu_ref_get(ref);	/* put after confirmation */
> > 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> > 
> > 					percpu_ref_switch_to_atomic_rcu
> > 					--> percpu_ref_call_confirm_rcu
> > 					    --> data->confirm_switch = NULL;
> > 						wake_up_all(&percpu_ref_switch_waitq);
> > 
> >     /* here waiting to wake up */
> >     wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> > 						(A)percpu_ref_put(ref);
> > /* The value of &ref is unstable! */
> > percpu_ref_is_zero(&ref)
> > 						(B)percpu_ref_put(ref);
> > 
> > As shown above, assuming that the counts on each cpu add up to 0 before
> > calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> > to atomic mode, percpu_ref_is_zero() can return true. But actually it will
> > return different values in the two cases of A and B, which is not what
> > we expected.
> > 
> > Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> > just to ensure that the conversion to atomic mode is completed, but it
> > should not return with an extra reference count.
> > 
> > Calling wake_up_all() after percpu_ref_put() ensures that the value of
> > percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> > So just do it.
> 
> Thanks.  I'll grab this, but shall await input from others before doing
> anything else with it.
> 

Seems right to me. The percpu_ref protects the __percpu_ref_exit(), not
the waiters.

Acked-by: Dennis Zhou <dennis@kernel.org>

Thanks,
Dennis

> > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > +++ b/lib/percpu-refcount.c
> > @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
> >  
> >  	data->confirm_switch(ref);
> >  	data->confirm_switch = NULL;
> > -	wake_up_all(&percpu_ref_switch_waitq);
> >  
> >  	if (!data->allow_reinit)
> >  		__percpu_ref_exit(ref);
> >  
> >  	/* drop ref from percpu_ref_switch_to_atomic() */
> >  	percpu_ref_put(ref);
> > +
> > +	wake_up_all(&percpu_ref_switch_waitq);
> >  }
> >  
> >  static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
>
Ming Lei April 8, 2022, 1:40 a.m. UTC | #3
On Thu, Apr 07, 2022 at 03:57:52PM -0700, Andrew Morton wrote:
> (cc Ming Lei)
> 
> On Thu,  7 Apr 2022 18:33:35 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
> > In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> > before calling percpu_ref_put(), which will cause the value of
> > percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> > returns.
> > 
> > 	CPU0				CPU1
> > 
> > percpu_ref_switch_to_atomic_sync(&ref)
> > --> percpu_ref_switch_to_atomic(&ref)
> >     --> percpu_ref_get(ref);	/* put after confirmation */
> > 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> > 
> > 					percpu_ref_switch_to_atomic_rcu
> > 					--> percpu_ref_call_confirm_rcu
> > 					    --> data->confirm_switch = NULL;
> > 						wake_up_all(&percpu_ref_switch_waitq);
> > 
> >     /* here waiting to wake up */
> >     wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> > 						(A)percpu_ref_put(ref);
> > /* The value of &ref is unstable! */
> > percpu_ref_is_zero(&ref)
> > 						(B)percpu_ref_put(ref);
> > 
> > As shown above, assuming that the counts on each cpu add up to 0 before
> > calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> > to atomic mode, percpu_ref_is_zero() can return true. But actually it will

Looks like all current users expect the refcount to be stable after percpu_ref_switch_to_atomic_sync()
returns, even though the API itself doesn't mention the point explicitly.

> > return different values in the two cases of A and B, which is not what
> > we expected.
> > 
> > Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> > just to ensure that the conversion to atomic mode is completed, but it
> > should not return with an extra reference count.
> > 
> > Calling wake_up_all() after percpu_ref_put() ensures that the value of
> > percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> > So just do it.
> 
> Thanks.  I'll grab this, but shall await input from others before doing
> anything else with it.
> 
> > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > +++ b/lib/percpu-refcount.c
> > @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
> >  
> >  	data->confirm_switch(ref);
> >  	data->confirm_switch = NULL;
> > -	wake_up_all(&percpu_ref_switch_waitq);
> >  
> >  	if (!data->allow_reinit)
> >  		__percpu_ref_exit(ref);
> >  
> >  	/* drop ref from percpu_ref_switch_to_atomic() */
> >  	percpu_ref_put(ref);
> > +
> > +	wake_up_all(&percpu_ref_switch_waitq);
> >  }

Looks fine:

Reviewed-by: Ming Lei <ming.lei@redhat.com>


Thanks,
Ming
Muchun Song April 8, 2022, 2:54 a.m. UTC | #4
On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> before calling percpu_ref_put(), which will cause the value of
> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> returns.
> 
> 	CPU0				CPU1
> 
> percpu_ref_switch_to_atomic_sync(&ref)
> --> percpu_ref_switch_to_atomic(&ref)
>     --> percpu_ref_get(ref);	/* put after confirmation */
> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> 
> 					percpu_ref_switch_to_atomic_rcu
> 					--> percpu_ref_call_confirm_rcu
> 					    --> data->confirm_switch = NULL;
> 						wake_up_all(&percpu_ref_switch_waitq);
> 
>     /* here waiting to wake up */
>     wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> 						(A)percpu_ref_put(ref);
> /* The value of &ref is unstable! */
> percpu_ref_is_zero(&ref)
> 						(B)percpu_ref_put(ref);
> 
> As shown above, assuming that the counts on each cpu add up to 0 before
> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
> return different values in the two cases of A and B, which is not what
> we expected.
> 
> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> just to ensure that the conversion to atomic mode is completed, but it
> should not return with an extra reference count.
> 
> Calling wake_up_all() after percpu_ref_put() ensures that the value of
> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> So just do it.
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

Are any users affected by this?  If so, I think a Fixes tag
is necessary.

The fix LGTM.

Reviewed-by: Muchun Song <songmuchun@bytedance.com>

Thanks.
Qi Zheng April 8, 2022, 3:50 a.m. UTC | #5
On 2022/4/8 10:54 AM, Muchun Song wrote:
> On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
>> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
>> before calling percpu_ref_put(), which will cause the value of
>> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
>> returns.
>>
>> 	CPU0				CPU1
>>
>> percpu_ref_switch_to_atomic_sync(&ref)
>> --> percpu_ref_switch_to_atomic(&ref)
>>      --> percpu_ref_get(ref);	/* put after confirmation */
>> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
>>
>> 					percpu_ref_switch_to_atomic_rcu
>> 					--> percpu_ref_call_confirm_rcu
>> 					    --> data->confirm_switch = NULL;
>> 						wake_up_all(&percpu_ref_switch_waitq);
>>
>>      /* here waiting to wake up */
>>      wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
>> 						(A)percpu_ref_put(ref);
>> /* The value of &ref is unstable! */
>> percpu_ref_is_zero(&ref)
>> 						(B)percpu_ref_put(ref);
>>
>> As shown above, assuming that the counts on each cpu add up to 0 before
>> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
>> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
>> return different values in the two cases of A and B, which is not what
>> we expected.
>>
>> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
>> just to ensure that the conversion to atomic mode is completed, but it
>> should not return with an extra reference count.
>>
>> Calling wake_up_all() after percpu_ref_put() ensures that the value of
>> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
>> So just do it.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> Are any users affected by this?  If so, I think a Fixes tag
> is necessary.

Looks like all current users (blk_pre_runtime_suspend() and set_in_sync()) are
affected by this.

I see that this patch has been merged into the mm tree, can Andrew help
me add the following Fixes tag?

Fixes: 490c79a65708 ("percpu_ref: decouple switching to atomic mode and 
killing")

Thanks,
Qi

> 
> The fix LGTM.
> 
> Reviewed-by: Muchun Song <songmuchun@bytedance.com>
> 
> Thanks.
>
Andrew Morton April 8, 2022, 3:54 a.m. UTC | #6
On Fri, 8 Apr 2022 11:50:05 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:

> 
> 
> On 2022/4/8 10:54 AM, Muchun Song wrote:
> > On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
> >> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> >> before calling percpu_ref_put(), which will cause the value of
> >> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> >> returns.
> >>
> >> 	CPU0				CPU1
> >>
> >> percpu_ref_switch_to_atomic_sync(&ref)
> >> --> percpu_ref_switch_to_atomic(&ref)
> >>      --> percpu_ref_get(ref);	/* put after confirmation */
> >> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> >>
> >> 					percpu_ref_switch_to_atomic_rcu
> >> 					--> percpu_ref_call_confirm_rcu
> >> 					    --> data->confirm_switch = NULL;
> >> 						wake_up_all(&percpu_ref_switch_waitq);
> >>
> >>      /* here waiting to wake up */
> >>      wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> >> 						(A)percpu_ref_put(ref);
> >> /* The value of &ref is unstable! */
> >> percpu_ref_is_zero(&ref)
> >> 						(B)percpu_ref_put(ref);
> >>
> >> As shown above, assuming that the counts on each cpu add up to 0 before
> >> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> >> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
> >> return different values in the two cases of A and B, which is not what
> >> we expected.
> >>
> >> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> >> just to ensure that the conversion to atomic mode is completed, but it
> >> should not return with an extra reference count.
> >>
> >> Calling wake_up_all() after percpu_ref_put() ensures that the value of
> >> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> >> So just do it.
> >>
> >> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > 
> > Are any users affected by this?  If so, I think a Fixes tag
> > is necessary.
> 
> Looks all current users(blk_pre_runtime_suspend() and set_in_sync()) are
> affected by this.
> 
> I see that this patch has been merged into the mm tree, can Andrew help
> me add the following Fixes tag?

Andrew is helpful ;)

Do you see reasons why we should backport this into -stable trees? 
It's 8 years old, so my uninformed guess is "no"?
Qi Zheng April 8, 2022, 4:06 a.m. UTC | #7
On 2022/4/8 11:54 AM, Andrew Morton wrote:
> On Fri, 8 Apr 2022 11:50:05 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
>>
>>
>> On 2022/4/8 10:54 AM, Muchun Song wrote:
>>> On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
>>>> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
>>>> before calling percpu_ref_put(), which will cause the value of
>>>> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
>>>> returns.
>>>>
>>>> 	CPU0				CPU1
>>>>
>>>> percpu_ref_switch_to_atomic_sync(&ref)
>>>> --> percpu_ref_switch_to_atomic(&ref)
>>>>       --> percpu_ref_get(ref);	/* put after confirmation */
>>>> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
>>>>
>>>> 					percpu_ref_switch_to_atomic_rcu
>>>> 					--> percpu_ref_call_confirm_rcu
>>>> 					    --> data->confirm_switch = NULL;
>>>> 						wake_up_all(&percpu_ref_switch_waitq);
>>>>
>>>>       /* here waiting to wake up */
>>>>       wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
>>>> 						(A)percpu_ref_put(ref);
>>>> /* The value of &ref is unstable! */
>>>> percpu_ref_is_zero(&ref)
>>>> 						(B)percpu_ref_put(ref);
>>>>
>>>> As shown above, assuming that the counts on each cpu add up to 0 before
>>>> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
>>>> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
>>>> return different values in the two cases of A and B, which is not what
>>>> we expected.
>>>>
>>>> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
>>>> just to ensure that the conversion to atomic mode is completed, but it
>>>> should not return with an extra reference count.
>>>>
>>>> Calling wake_up_all() after percpu_ref_put() ensures that the value of
>>>> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
>>>> So just do it.
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>
>>> Are any users affected by this?  If so, I think a Fixes tag
>>> is necessary.
>>
>> Looks all current users(blk_pre_runtime_suspend() and set_in_sync()) are
>> affected by this.
>>
>> I see that this patch has been merged into the mm tree, can Andrew help
>> me add the following Fixes tag?
> 
> Andrew is helpful ;)
> 
> Do you see reasons why we should backport this into -stable trees?
> It's 8 years old, so my uninformed guess is "no"?

Hmm, although the commit 490c79a65708 add wake_up_all(), it is no
problem for the usage at that time, maybe the correct Fixes tag is the
following:

Fixes: 210f7cdcf088 ("percpu-refcount: support synchronous switch to 
atomic mode.")

In fact, there is no problem with that commit itself either, but all current users
expect the refcount to be stable after percpu_ref_switch_to_atomic_sync() returns.

I have no idea as which Fixes tag to add.

>
Andrew Morton April 8, 2022, 4:10 a.m. UTC | #8
On Fri, 8 Apr 2022 12:06:20 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:

> 
> 
> >>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> >>>
> >>> Are any users affected by this?  If so, I think a Fixes tag
> >>> is necessary.
> >>
> >> Looks all current users(blk_pre_runtime_suspend() and set_in_sync()) are
> >> affected by this.
> >>
> >> I see that this patch has been merged into the mm tree, can Andrew help
> >> me add the following Fixes tag?
> > 
> > Andrew is helpful ;)
> > 
> > Do you see reasons why we should backport this into -stable trees?
> > It's 8 years old, so my uninformed guess is "no"?
> 
Hmm, although commit 490c79a65708 added the wake_up_all(), it was not a
problem for the usage at that time, so maybe the correct Fixes tag is the
following:
> 
> Fixes: 210f7cdcf088 ("percpu-refcount: support synchronous switch to 
> atomic mode.")
> 
> But in fact, there is no problem with it, but all current users expect
> the refcount is stable after percpu_ref_switch_to_atomic_sync() returns.
> 
> I have no idea as which Fixes tag to add.

Well the solution to that problem is to add cc:stable and let Greg
figure it out ;)

The more serious question is "should we backport this".  What is the
end-user-visible impact of the bug?  Do our users need the fix or not?
Qi Zheng April 8, 2022, 4:14 a.m. UTC | #9
On 2022/4/8 12:10 PM, Andrew Morton wrote:
> On Fri, 8 Apr 2022 12:06:20 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
>>
>>
>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>
>>>>> Are any users affected by this?  If so, I think a Fixes tag
>>>>> is necessary.
>>>>
>>>> Looks all current users(blk_pre_runtime_suspend() and set_in_sync()) are
>>>> affected by this.
>>>>
>>>> I see that this patch has been merged into the mm tree, can Andrew help
>>>> me add the following Fixes tag?
>>>
>>> Andrew is helpful ;)
>>>
>>> Do you see reasons why we should backport this into -stable trees?
>>> It's 8 years old, so my uninformed guess is "no"?
>>
>> Hmm, although the commit 490c79a65708 add wake_up_all(), it is no
>> problem for the usage at that time, maybe the correct Fixes tag is the
>> following:
>>
>> Fixes: 210f7cdcf088 ("percpu-refcount: support synchronous switch to
>> atomic mode.")
>>
>> But in fact, there is no problem with it, but all current users expect
>> the refcount is stable after percpu_ref_switch_to_atomic_sync() returns.
>>
>> I have no idea as which Fixes tag to add.
> 
> Well the solution to that problem is to add cc:stable and let Greg
> figure it out ;)
> 
> The more serious question is "should we backport this".  What is the
> end-user-visible impact of the bug?  Do our users need the fix or not?

The impact on current users is that it is possible to miss an
opportunity to observe the refcount reaching 0, due to case B in the commit message:

/* The value of &ref is unstable! */
percpu_ref_is_zero(&ref)
						(B)percpu_ref_put(ref);

Thanks,
Qi

>
Qi Zheng April 8, 2022, 4:16 a.m. UTC | #10
On 2022/4/8 12:14 PM, Qi Zheng wrote:
> 
> 
> On 2022/4/8 12:10 PM, Andrew Morton wrote:
>> On Fri, 8 Apr 2022 12:06:20 +0800 Qi Zheng 
>> <zhengqi.arch@bytedance.com> wrote:
>>
>>>
>>>
>>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>
>>>>>> Are any users affected by this?  If so, I think a Fixes tag
>>>>>> is necessary.
>>>>>
>>>>> Looks all current users(blk_pre_runtime_suspend() and 
>>>>> set_in_sync()) are
>>>>> affected by this.
>>>>>
>>>>> I see that this patch has been merged into the mm tree, can Andrew 
>>>>> help
>>>>> me add the following Fixes tag?
>>>>
>>>> Andrew is helpful ;)
>>>>
>>>> Do you see reasons why we should backport this into -stable trees?
>>>> It's 8 years old, so my uninformed guess is "no"?
>>>
>>> Hmm, although the commit 490c79a65708 add wake_up_all(), it is no
>>> problem for the usage at that time, maybe the correct Fixes tag is the
>>> following:
>>>
>>> Fixes: 210f7cdcf088 ("percpu-refcount: support synchronous switch to
>>> atomic mode.")
>>>
>>> But in fact, there is no problem with it, but all current users expect
>>> the refcount is stable after percpu_ref_switch_to_atomic_sync() returns.
>>>
>>> I have no idea as which Fixes tag to add.
>>
>> Well the solution to that problem is to add cc:stable and let Greg
>> figure it out ;)
>>
>> The more serious question is "should we backport this".  What is the
>> end-user-visible impact of the bug?  Do our users need the fix or not?
> 
> The impact on the current user is that it is possible to miss an 
> opportunity to reach 0 due to the case B in the commit message:

There may be performance issues, but it should not cause serious bugs.

> 
> /* The value of &ref is unstable! */
> percpu_ref_is_zero(&ref)
>                          (B)percpu_ref_put(ref);
> 
> Thanks,
> Qi
> 
>>
>
Dennis Zhou April 8, 2022, 5:57 a.m. UTC | #11
On Fri, Apr 08, 2022 at 12:14:54PM +0800, Qi Zheng wrote:
> 
> 
> On 2022/4/8 12:10 PM, Andrew Morton wrote:
> > On Fri, 8 Apr 2022 12:06:20 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> > 
> > > 
> > > 
> > > > > > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > > > > 
> > > > > > Are any users affected by this?  If so, I think a Fixes tag
> > > > > > is necessary.
> > > > > 
> > > > > Looks all current users(blk_pre_runtime_suspend() and set_in_sync()) are
> > > > > affected by this.
> > > > > 
> > > > > I see that this patch has been merged into the mm tree, can Andrew help
> > > > > me add the following Fixes tag?
> > > > 
> > > > Andrew is helpful ;)
> > > > 
> > > > Do you see reasons why we should backport this into -stable trees?
> > > > It's 8 years old, so my uninformed guess is "no"?
> > > 
> > > Hmm, although the commit 490c79a65708 add wake_up_all(), it is no
> > > problem for the usage at that time, maybe the correct Fixes tag is the
> > > following:
> > > 
> > > Fixes: 210f7cdcf088 ("percpu-refcount: support synchronous switch to
> > > atomic mode.")
> > > 
> > > But in fact, there is no problem with it, but all current users expect
> > > the refcount is stable after percpu_ref_switch_to_atomic_sync() returns.
> > > 
> > > I have no idea as which Fixes tag to add.
> > 
> > Well the solution to that problem is to add cc:stable and let Greg
> > figure it out ;)
> > 
> > The more serious question is "should we backport this".  What is the
> > end-user-visible impact of the bug?  Do our users need the fix or not?
> 
> The impact on the current user is that it is possible to miss an opportunity
> to reach 0 due to the case B in the commit message:
> 

Did you find this bug through code inspection or was the finding
motivated by a production incident?

The usage in block/blk-pm.c looks problematic, but I'm guessing this is
a really, really hard bug to trigger. You need to have the wake up be
faster than an atomic decrement. The q_usage_counter allows reinit so it
skips the __percpu_ref_exit() call.

Thanks,
Dennis
Qi Zheng April 8, 2022, 6:28 a.m. UTC | #12
On 2022/4/8 1:57 PM, Dennis Zhou wrote:
> On Fri, Apr 08, 2022 at 12:14:54PM +0800, Qi Zheng wrote:
>>
>>
>> On 2022/4/8 12:10 PM, Andrew Morton wrote:
>>> On Fri, 8 Apr 2022 12:06:20 +0800 Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>>
>>>>
>>>>
>>>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>>
>>>>>>> Are any users affected by this?  If so, I think a Fixes tag
>>>>>>> is necessary.
>>>>>>
>>>>>> Looks all current users(blk_pre_runtime_suspend() and set_in_sync()) are
>>>>>> affected by this.
>>>>>>
>>>>>> I see that this patch has been merged into the mm tree, can Andrew help
>>>>>> me add the following Fixes tag?
>>>>>
>>>>> Andrew is helpful ;)
>>>>>
>>>>> Do you see reasons why we should backport this into -stable trees?
>>>>> It's 8 years old, so my uninformed guess is "no"?
>>>>
>>>> Hmm, although the commit 490c79a65708 add wake_up_all(), it is no
>>>> problem for the usage at that time, maybe the correct Fixes tag is the
>>>> following:
>>>>
>>>> Fixes: 210f7cdcf088 ("percpu-refcount: support synchronous switch to
>>>> atomic mode.")
>>>>
>>>> But in fact, there is no problem with it, but all current users expect
>>>> the refcount is stable after percpu_ref_switch_to_atomic_sync() returns.
>>>>
>>>> I have no idea as which Fixes tag to add.
>>>
>>> Well the solution to that problem is to add cc:stable and let Greg
>>> figure it out ;)
>>>
>>> The more serious question is "should we backport this".  What is the
>>> end-user-visible impact of the bug?  Do our users need the fix or not?
>>
>> The impact on the current user is that it is possible to miss an opportunity
>> to reach 0 due to the case B in the commit message:
>>
> 
> Did you find this bug through code inspection or was the finding
> motivated by a production incident?

I find this bug through code inspection, because I want to use
percpu_ref_switch_to_atomic_sync()+percpu_ref_is_zero() to do something
similar.

> 
> The usage in block/blk-pm.c looks problematic, but I'm guessing this is
> a really, really hard bug to trigger. You need to have the wake up be

Agree, I manually added the delay in wake_up_all() and percpu_ref_put()
to trigger the case B.

> faster than an atomic decrement. The q_usage_counter allows reinit so it
> skips the __percpu_ref_exit() call.
> 
> Thanks,
> Dennis
Tejun Heo April 8, 2022, 5:41 p.m. UTC | #13
Hello,

On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> before calling percpu_ref_put(), which will cause the value of
> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> returns.
> 
> 	CPU0				CPU1
> 
> percpu_ref_switch_to_atomic_sync(&ref)
> --> percpu_ref_switch_to_atomic(&ref)
>     --> percpu_ref_get(ref);	/* put after confirmation */
> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> 
> 					percpu_ref_switch_to_atomic_rcu
> 					--> percpu_ref_call_confirm_rcu
> 					    --> data->confirm_switch = NULL;
> 						wake_up_all(&percpu_ref_switch_waitq);
> 
>     /* here waiting to wake up */
>     wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> 						(A)percpu_ref_put(ref);
> /* The value of &ref is unstable! */
> percpu_ref_is_zero(&ref)
> 						(B)percpu_ref_put(ref);
> 
> As shown above, assuming that the counts on each cpu add up to 0 before
> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
> return different values in the two cases of A and B, which is not what
> we expected.
> 
> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> just to ensure that the conversion to atomic mode is completed, but it
> should not return with an extra reference count.
> 
> Calling wake_up_all() after percpu_ref_put() ensures that the value of
> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> So just do it.
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  lib/percpu-refcount.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
> index af9302141bcf..b11b4152c8cd 100644
> --- a/lib/percpu-refcount.c
> +++ b/lib/percpu-refcount.c
> @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
>  
>  	data->confirm_switch(ref);
>  	data->confirm_switch = NULL;
> -	wake_up_all(&percpu_ref_switch_waitq);
>  
>  	if (!data->allow_reinit)
>  		__percpu_ref_exit(ref);
>  
>  	/* drop ref from percpu_ref_switch_to_atomic() */
>  	percpu_ref_put(ref);
> +
> +	wake_up_all(&percpu_ref_switch_waitq);

The interface, at least originally, doesn't give any guarantee over whether
there's gonna be a residual reference on it or not. There's nothing
necessarily wrong with guaranteeing that but it's rather unusual and given
that putting the base ref in a percpu_ref is a special "kill" operation and
a ref in percpu mode always returns %false on is_zero(), I'm not quite sure
how such semantics would be useful. Do you care to explain the use case with
concrete examples?

Also, the proposed patch is racy. There's nothing preventing
percpu_ref_switch_to_atomic_sync() from waking up early between
confirm_switch clearing and the wake_up_all, so the above change doesn't
guarantee what it tries to guarantee. For that, you'd have to move
confirm_switch clearing *after* percpu_ref_put() but then, you'd be
accessing the ref after its final ref is put which can lead to
use-after-free.

In fact, the whole premise seems wrong. The switching needs a reference to
the percpu_ref because it is accessing it asynchronously. The switching side
doesn't know when the ref is gonna go away once it puts its reference and
thus can't signal that they're done after putting their reference.

We *can* make that work by putting the whole thing in its own critical
section so that we can make confirm_switch clearing atomic with the possibly
final put, but that's gonna add some complexity and begs the question why
we'd need such a thing.

Andrew, I don't think the patch as proposed makes much sense. Maybe it'd be
better to keep it out of the tree for the time being?

Thanks.
Dennis Zhou April 8, 2022, 7:19 p.m. UTC | #14
On Fri, Apr 08, 2022 at 07:41:05AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
> > In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
> > before calling percpu_ref_put(), which will cause the value of
> > percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
> > returns.
> > 
> > 	CPU0				CPU1
> > 
> > percpu_ref_switch_to_atomic_sync(&ref)
> > --> percpu_ref_switch_to_atomic(&ref)
> >     --> percpu_ref_get(ref);	/* put after confirmation */
> > 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
> > 
> > 					percpu_ref_switch_to_atomic_rcu
> > 					--> percpu_ref_call_confirm_rcu
> > 					    --> data->confirm_switch = NULL;
> > 						wake_up_all(&percpu_ref_switch_waitq);
> > 
> >     /* here waiting to wake up */
> >     wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> > 						(A)percpu_ref_put(ref);
> > /* The value of &ref is unstable! */
> > percpu_ref_is_zero(&ref)
> > 						(B)percpu_ref_put(ref);
> > 
> > As shown above, assuming that the counts on each cpu add up to 0 before
> > calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
> > to atomic mode, percpu_ref_is_zero() can return true. But actually it will
> > return different values in the two cases of A and B, which is not what
> > we expected.
> > 
> > Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
> > just to ensure that the conversion to atomic mode is completed, but it
> > should not return with an extra reference count.
> > 
> > Calling wake_up_all() after percpu_ref_put() ensures that the value of
> > percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
> > So just do it.
> > 
> > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > ---
> >  lib/percpu-refcount.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
> > index af9302141bcf..b11b4152c8cd 100644
> > --- a/lib/percpu-refcount.c
> > +++ b/lib/percpu-refcount.c
> > @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
> >  
> >  	data->confirm_switch(ref);
> >  	data->confirm_switch = NULL;
> > -	wake_up_all(&percpu_ref_switch_waitq);
> >  
> >  	if (!data->allow_reinit)
> >  		__percpu_ref_exit(ref);
> >  
> >  	/* drop ref from percpu_ref_switch_to_atomic() */
> >  	percpu_ref_put(ref);
> > +
> > +	wake_up_all(&percpu_ref_switch_waitq);
> 
> The interface, at least originally, doesn't give any guarantee over whether
> there's gonna be a residual reference on it or not. There's nothing
> necessarily wrong with guaranteeing that but it's rather unusual and given
> that putting the base ref in a percpu_ref is a special "kill" operation and
> a ref in percpu mode always returns %false on is_zero(), I'm not quite sure
> how such semantics would be useful. Do you care to explain the use case with
> concrete examples?

block/blk-pm.c has:
        percpu_ref_switch_to_atomic_sync(&q->q_usage_counter);
        if (percpu_ref_is_zero(&q->q_usage_counter))

> 
> Also, the proposed patch is racy. There's nothing preventing
> percpu_ref_switch_to_atomic_sync() from waking up early between
> confirm_switch clearing and the wake_up_all, so the above change doesn't
> guarantee what it tries to guarantee. For that, you'd have to move
> confirm_switch clearing *after* percpu_ref_put() but then, you'd be
> accessing the ref after its final ref is put which can lead to
> use-after-free.
> 

Sadly, that is my bad for missing that.

> In fact, the whole premise seems wrong. The switching needs a reference to
> the percpu_ref because it is accessing it asynchronously. The switching side
> doesn't know when the ref is gonna go away once it puts its reference and
> thus can't signal that they're done after putting their reference.
> 

I read it as 2 usages of percpu_ref. 1 is as the tie a lifetime to an
object, the 2nd is just as a raw reference counter which md and
request_queue use.

In the first use case, I don't think it makes any sense to call
percpu_ref_switch_to_atomic_sync(). And if you did, wouldn't going from
percpu_ref_switch_to_atomic_sync() to percpu_ref_is_zero() be either a
use-after-free or always false?

I feel like the 2nd use case is fair game though because if you're using
percpu_ref_switch_to_atomic_*(), the lifetime of percpu_ref has to be
guaranteed outside of the kill callback.

> We *can* make that work by putting the whole thing in its own critical
> section so that we can make confirm_switch clearing atomic with the possibly
> final put, but that's gonna add some complexity and begs the question why
> we'd need such a thing.
> 
> Andrew, I don't think the patch as proposed makes much sense. Maybe it'd be
> better to keep it out of the tree for the time being?
> 
> Thanks.
> 

Thanks,
Dennis
Qi Zheng April 9, 2022, 12:40 a.m. UTC | #15
On 2022/4/9 1:41 AM, Tejun Heo wrote:
> Hello,
> 
> On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
>> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
>> before calling percpu_ref_put(), which will cause the value of
>> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
>> returns.
>>
>> 	CPU0				CPU1
>>
>> percpu_ref_switch_to_atomic_sync(&ref)
>> --> percpu_ref_switch_to_atomic(&ref)
>>      --> percpu_ref_get(ref);	/* put after confirmation */
>> 	call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
>>
>> 					percpu_ref_switch_to_atomic_rcu
>> 					--> percpu_ref_call_confirm_rcu
>> 					    --> data->confirm_switch = NULL;
>> 						wake_up_all(&percpu_ref_switch_waitq);
>>
>>      /* here waiting to wake up */
>>      wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
>> 						(A)percpu_ref_put(ref);
>> /* The value of &ref is unstable! */
>> percpu_ref_is_zero(&ref)
>> 						(B)percpu_ref_put(ref);
>>
>> As shown above, assuming that the counts on each cpu add up to 0 before
>> calling percpu_ref_switch_to_atomic_sync(), we expect that after switching
>> to atomic mode, percpu_ref_is_zero() can return true. But actually it will
>> return different values in the two cases of A and B, which is not what
>> we expected.
>>
>> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
>> just to ensure that the conversion to atomic mode is completed, but it
>> should not return with an extra reference count.
>>
>> Calling wake_up_all() after percpu_ref_put() ensures that the value of
>> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
>> So just do it.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   lib/percpu-refcount.c | 3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
>> index af9302141bcf..b11b4152c8cd 100644
>> --- a/lib/percpu-refcount.c
>> +++ b/lib/percpu-refcount.c
>> @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
>>   
>>   	data->confirm_switch(ref);
>>   	data->confirm_switch = NULL;
>> -	wake_up_all(&percpu_ref_switch_waitq);
>>   
>>   	if (!data->allow_reinit)
>>   		__percpu_ref_exit(ref);
>>   
>>   	/* drop ref from percpu_ref_switch_to_atomic() */
>>   	percpu_ref_put(ref);
>> +
>> +	wake_up_all(&percpu_ref_switch_waitq);
> 
> The interface, at least originally, doesn't give any guarantee over whether
> there's gonna be a residual reference on it or not. There's nothing
> necessarily wrong with guaranteeing that but it's rather unusual and given
> that putting the base ref in a percpu_ref is a special "kill" operation and
> a ref in percpu mode always returns %false on is_zero(), I'm not quite sure
> how such semantics would be useful. Do you care to explain the use case with
> concrete examples?

There are currently two users of percpu_ref_switch_to_atomic_sync(), and
both are used as in the example: one is mddev->writes_pending in
drivers/md/md.c and the other is q->q_usage_counter in block/blk-pm.c.

The former discards the initial reference count after percpu_ref_init(),
and the latter kills the initial reference count(by calling 
percpu_ref_kill() in blk_freeze_queue_start()) before
percpu_ref_switch_to_atomic_sync(). Looks like they all expect
percpu_ref to be stable when percpu_ref_switch_to_atomic_sync() returns.

> 
> Also, the proposed patch is racy. There's nothing preventing
> percpu_ref_switch_to_atomic_sync() from waking up early between
> confirm_switch clearing and the wake_up_all, so the above change doesn't
> guarantee what it tries to guarantee. For that, you'd have to move
> confirm_switch clearing *after* percpu_ref_put() but then, you'd be
> accessing the ref after its final ref is put which can lead to
> use-after-free.
> 

Oh sorry, that is my bad for missing it.

> In fact, the whole premise seems wrong. The switching needs a reference to
> the percpu_ref because it is accessing it asynchronously. The switching side
> doesn't know when the ref is gonna go away once it puts its reference and
> thus can't signal that they're done after putting their reference.
> 
> We *can* make that work by putting the whole thing in its own critical
> section so that we can make confirm_switch clearing atomic with the possibly
> final put, but that's gonna add some complexity and begs the question why
> we'd need such a thing.

How about moving the last percpu_ref_put() outside of the
percpu_ref_switch_to_atomic_rcu() in sync mode like below? But this may 
not be elegant.

diff --git a/include/linux/percpu-refcount.h 
b/include/linux/percpu-refcount.h
index d73a1c08c3e3..07f92e7e3e19 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -98,6 +98,7 @@ struct percpu_ref_data {
         percpu_ref_func_t       *confirm_switch;
         bool                    force_atomic:1;
         bool                    allow_reinit:1;
+       bool                    sync;
         struct rcu_head         rcu;
         struct percpu_ref       *ref;
  };
@@ -123,7 +124,8 @@ int __must_check percpu_ref_init(struct percpu_ref *ref,
                                  gfp_t gfp);
  void percpu_ref_exit(struct percpu_ref *ref);
  void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
-                                percpu_ref_func_t *confirm_switch);
+                                percpu_ref_func_t *confirm_switch,
+                                bool sync);
  void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
  void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
  void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index af9302141bcf..2a9d777bcf35 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -99,6 +99,7 @@ int percpu_ref_init(struct percpu_ref *ref, 
percpu_ref_func_t *release,
         data->release = release;
         data->confirm_switch = NULL;
         data->ref = ref;
+       data->sync = false;
         ref->data = data;
         return 0;
  }
@@ -146,21 +147,30 @@ void percpu_ref_exit(struct percpu_ref *ref)
  }
  EXPORT_SYMBOL_GPL(percpu_ref_exit);

+static inline void percpu_ref_switch_to_atomic_post(struct percpu_ref *ref)
+{
+       struct percpu_ref_data *data = ref->data;
+
+       if (!data->allow_reinit)
+               __percpu_ref_exit(ref);
+
+       /* drop ref from percpu_ref_switch_to_atomic() */
+       percpu_ref_put(ref);
+}
+
  static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
  {
         struct percpu_ref_data *data = container_of(rcu,
                         struct percpu_ref_data, rcu);
         struct percpu_ref *ref = data->ref;
+       bool need_put = !data->sync;

         data->confirm_switch(ref);
         data->confirm_switch = NULL;
         wake_up_all(&percpu_ref_switch_waitq);

-       if (!data->allow_reinit)
-               __percpu_ref_exit(ref);
-
-       /* drop ref from percpu_ref_switch_to_atomic() */
-       percpu_ref_put(ref);
+       if (need_put)
+               percpu_ref_switch_to_atomic_post(ref);
  }

  static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
@@ -302,12 +312,14 @@ static void __percpu_ref_switch_mode(struct 
percpu_ref *ref,
   * switching to atomic mode, this function can be called from any context.
   */
  void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
-                                percpu_ref_func_t *confirm_switch)
+                                percpu_ref_func_t *confirm_switch,
+                                bool sync)
  {
         unsigned long flags;

         spin_lock_irqsave(&percpu_ref_switch_lock, flags);

+       ref->data->sync = sync;
         ref->data->force_atomic = true;
         __percpu_ref_switch_mode(ref, confirm_switch);

@@ -325,8 +337,9 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
   */
  void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
  {
-       percpu_ref_switch_to_atomic(ref, NULL);
+       percpu_ref_switch_to_atomic(ref, NULL, true);
         wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
+       percpu_ref_switch_to_atomic_post(ref);
  }
  EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);

> 
> Andrew, I don't think the patch as proposed makes much sense. Maybe it'd be
> better to keep it out of the tree for the time being?
> 
> Thanks.
>
Qi Zheng April 11, 2022, 7:19 a.m. UTC | #16
On 2022/4/9 8:40 AM, Qi Zheng wrote:
> 
> 
> On 2022/4/9 1:41 AM, Tejun Heo wrote:
>> Hello,
>>
>> On Thu, Apr 07, 2022 at 06:33:35PM +0800, Qi Zheng wrote:
>>> In the percpu_ref_call_confirm_rcu(), we call the wake_up_all()
>>> before calling percpu_ref_put(), which will cause the value of
>>> percpu_ref to be unstable when percpu_ref_switch_to_atomic_sync()
>>> returns.
>>>
>>>     CPU0                CPU1
>>>
>>> percpu_ref_switch_to_atomic_sync(&ref)
>>> --> percpu_ref_switch_to_atomic(&ref)
>>>      --> percpu_ref_get(ref);    /* put after confirmation */
>>>     call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
>>>
>>>                     percpu_ref_switch_to_atomic_rcu
>>>                     --> percpu_ref_call_confirm_rcu
>>>                         --> data->confirm_switch = NULL;
>>>                         wake_up_all(&percpu_ref_switch_waitq);
>>>
>>>      /* here waiting to wake up */
>>>      wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
>>>                         (A)percpu_ref_put(ref);
>>> /* The value of &ref is unstable! */
>>> percpu_ref_is_zero(&ref)
>>>                         (B)percpu_ref_put(ref);
>>>
>>> As shown above, assuming that the counts on each cpu add up to 0 before
>>> calling percpu_ref_switch_to_atomic_sync(), we expect that after 
>>> switching
>>> to atomic mode, percpu_ref_is_zero() can return true. But actually it 
>>> will
>>> return different values in the two cases of A and B, which is not what
>>> we expected.
>>>
>>> Maybe the original purpose of percpu_ref_switch_to_atomic_sync() is
>>> just to ensure that the conversion to atomic mode is completed, but it
>>> should not return with an extra reference count.
>>>
>>> Calling wake_up_all() after percpu_ref_put() ensures that the value of
>>> percpu_ref is stable after percpu_ref_switch_to_atomic_sync() returns.
>>> So just do it.
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>>   lib/percpu-refcount.c | 3 ++-
>>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
>>> index af9302141bcf..b11b4152c8cd 100644
>>> --- a/lib/percpu-refcount.c
>>> +++ b/lib/percpu-refcount.c
>>> @@ -154,13 +154,14 @@ static void percpu_ref_call_confirm_rcu(struct 
>>> rcu_head *rcu)
>>>       data->confirm_switch(ref);
>>>       data->confirm_switch = NULL;
>>> -    wake_up_all(&percpu_ref_switch_waitq);
>>>       if (!data->allow_reinit)
>>>           __percpu_ref_exit(ref);
>>>       /* drop ref from percpu_ref_switch_to_atomic() */
>>>       percpu_ref_put(ref);
>>> +
>>> +    wake_up_all(&percpu_ref_switch_waitq);
>>
>> The interface, at least originally, doesn't give any guarantee over 
>> whether
>> there's gonna be a residual reference on it or not. There's nothing
>> necessarily wrong with guaranteeing that but it's rather unusual and 
>> given
>> that putting the base ref in a percpu_ref is a special "kill" 
>> operation and
>> a ref in percpu mode always returns %false on is_zero(), I'm not quite 
>> sure
>> how such semantics would be useful. Do you care to explain the use 
>> case with
>> concrete examples?
> 
> There are currently two users of percpu_ref_switch_to_atomic_sync(), and 
> both are used in the example, one is mddev->writes_pending in
> driver/md/md.c and the other is q->q_usage_counter in block/blk-pm.c.
> 
> The former discards the initial reference count after percpu_ref_init(),
> and the latter kills the initial reference count(by calling 
> percpu_ref_kill() in blk_freeze_queue_start()) before
> percpu_ref_switch_to_atomic_sync(). Looks like they all expect
> percpu_ref to be stable when percpu_ref_switch_to_atomic_sync() returns.
> 
>>
>> Also, the proposed patch is racy. There's nothing preventing
>> percpu_ref_switch_to_atomic_sync() from waking up early between
>> confirm_switch clearing and the wake_up_all, so the above change doesn't
>> guarantee what it tries to guarantee. For that, you'd have to move
>> confirm_switch clearing *after* percpu_ref_put() but then, you'd be
>> accessing the ref after its final ref is put which can lead to
>> use-after-free.
>>
> 
> Oh sorry, it is my bad missing.
> 
>> In fact, the whole premise seems wrong. The switching needs a 
>> reference to
>> the percpu_ref because it is accessing it asynchronously. The 
>> switching side
>> doesn't know when the ref is gonna go away once it puts its reference and
>> thus can't signal that they're done after putting their reference.
>>
>> We *can* make that work by putting the whole thing in its own critical
>> section so that we can make confirm_switch clearing atomic with the 
>> possibly
>> final put, but that's gonna add some complexity and begs the question why
>> we'd need such a thing.
> 
> How about moving the last percpu_ref_put() outside of the
> percpu_ref_switch_to_atomic_rcu() in sync mode like below? But this may 
> not be elegant.
> 
> diff --git a/include/linux/percpu-refcount.h 
> b/include/linux/percpu-refcount.h
> index d73a1c08c3e3..07f92e7e3e19 100644
> --- a/include/linux/percpu-refcount.h
> +++ b/include/linux/percpu-refcount.h
> @@ -98,6 +98,7 @@ struct percpu_ref_data {
>          percpu_ref_func_t       *confirm_switch;
>          bool                    force_atomic:1;
>          bool                    allow_reinit:1;
> +       bool                    sync;
>          struct rcu_head         rcu;
>          struct percpu_ref       *ref;
>   };
> @@ -123,7 +124,8 @@ int __must_check percpu_ref_init(struct percpu_ref 
> *ref,
>                                   gfp_t gfp);
>   void percpu_ref_exit(struct percpu_ref *ref);
>   void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
> -                                percpu_ref_func_t *confirm_switch);
> +                                percpu_ref_func_t *confirm_switch,
> +                                bool sync);
>   void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
>   void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
>   void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
> diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
> index af9302141bcf..2a9d777bcf35 100644
> --- a/lib/percpu-refcount.c
> +++ b/lib/percpu-refcount.c
> @@ -99,6 +99,7 @@ int percpu_ref_init(struct percpu_ref *ref, 
> percpu_ref_func_t *release,
>          data->release = release;
>          data->confirm_switch = NULL;
>          data->ref = ref;
> +       data->sync = false;
>          ref->data = data;
>          return 0;
>   }
> @@ -146,21 +147,30 @@ void percpu_ref_exit(struct percpu_ref *ref)
>   }
>   EXPORT_SYMBOL_GPL(percpu_ref_exit);
> 
> +static inline void percpu_ref_switch_to_atomic_post(struct percpu_ref 
> *ref)
> +{
> +       struct percpu_ref_data *data = ref->data;
> +
> +       if (!data->allow_reinit)
> +               __percpu_ref_exit(ref);
> +
> +       /* drop ref from percpu_ref_switch_to_atomic() */
> +       percpu_ref_put(ref);
> +}
> +
>   static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
>   {
>          struct percpu_ref_data *data = container_of(rcu,
>                          struct percpu_ref_data, rcu);
>          struct percpu_ref *ref = data->ref;
> +       bool need_put = !data->sync;
> 
>          data->confirm_switch(ref);
>          data->confirm_switch = NULL;
>          wake_up_all(&percpu_ref_switch_waitq);
> 
> -       if (!data->allow_reinit)
> -               __percpu_ref_exit(ref);
> -
> -       /* drop ref from percpu_ref_switch_to_atomic() */
> -       percpu_ref_put(ref);
> +       if (need_put)
> +               percpu_ref_switch_to_atomic_post(ref);
>   }
> 
>   static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
> @@ -302,12 +312,14 @@ static void __percpu_ref_switch_mode(struct 
> percpu_ref *ref,
>    * switching to atomic mode, this function can be called from any 
> context.
>    */
>   void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
> -                                percpu_ref_func_t *confirm_switch)
> +                                percpu_ref_func_t *confirm_switch,
> +                                bool sync)
>   {
>          unsigned long flags;
> 
>          spin_lock_irqsave(&percpu_ref_switch_lock, flags);
> 
> +       ref->data->sync = sync;
>          ref->data->force_atomic = true;
>          __percpu_ref_switch_mode(ref, confirm_switch);
> 
> @@ -325,8 +337,9 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
>    */
>   void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
>   {
> -       percpu_ref_switch_to_atomic(ref, NULL);
> +       percpu_ref_switch_to_atomic(ref, NULL, true);
>          wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
> +       percpu_ref_switch_to_atomic_post(ref);
>   }
>   EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);

Hi all, is this fix ok? I can send the v2 version if it looks good.

Thanks,
Qi

> 
>>
>> Andrew, I don't think the patch as proposed makes much sense. Maybe 
>> it'd be
>> better to keep it out of the tree for the time being?
>>
>> Thanks.
>>
>
diff mbox series

Patch

diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index af9302141bcf..b11b4152c8cd 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -154,13 +154,14 @@  static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
 
 	data->confirm_switch(ref);
 	data->confirm_switch = NULL;
-	wake_up_all(&percpu_ref_switch_waitq);
 
 	if (!data->allow_reinit)
 		__percpu_ref_exit(ref);
 
 	/* drop ref from percpu_ref_switch_to_atomic() */
 	percpu_ref_put(ref);
+
+	wake_up_all(&percpu_ref_switch_waitq);
 }
 
 static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)