
[1/1] NFSD: fix WARN_ON_ONCE in __queue_delayed_work

Message ID 1673333310-24837-1-git-send-email-dai.ngo@oracle.com (mailing list archive)
State New, archived
Series [1/1] NFSD: fix WARN_ON_ONCE in __queue_delayed_work

Commit Message

Dai Ngo Jan. 10, 2023, 6:48 a.m. UTC
Currently nfsd4_state_shrinker_worker can be scheduled multiple times
from nfsd4_state_shrinker_count when memory is low. This causes
the WARN_ON_ONCE in __queue_delayed_work to trigger.

This patch allows only one instance of nfsd4_state_shrinker_worker
at a time using the nfsd_shrinker_active flag, protected by the
client_lock.

Replace mod_delayed_work with queue_delayed_work since we
don't expect to modify the delay of any pending work.

Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/netns.h     |  1 +
 fs/nfsd/nfs4state.c | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

Comments

Jeffrey Layton Jan. 10, 2023, 10:30 a.m. UTC | #1
On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
> from nfsd4_state_shrinker_count when memory is low. This causes
> the WARN_ON_ONCE in __queue_delayed_work to trigger.
> 
> This patch allows only one instance of nfsd4_state_shrinker_worker
> at a time using the nfsd_shrinker_active flag, protected by the
> client_lock.
> 
> Replace mod_delayed_work with queue_delayed_work since we
> don't expect to modify the delay of any pending work.
> 
> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
> Reported-by: Mike Galbraith <efault@gmx.de>
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>  fs/nfsd/netns.h     |  1 +
>  fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>  2 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index 8c854ba3285b..801d70926442 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -196,6 +196,7 @@ struct nfsd_net {
>  	atomic_t		nfsd_courtesy_clients;
>  	struct shrinker		nfsd_client_shrinker;
>  	struct delayed_work	nfsd_shrinker_work;
> +	bool			nfsd_shrinker_active;
>  };
>  
>  /* Simple check to find out if a given net was properly initialized */
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index ee56c9466304..e00551af6a11 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>  	struct nfsd_net *nn = container_of(shrink,
>  			struct nfsd_net, nfsd_client_shrinker);
>  
> +	spin_lock(&nn->client_lock);
> +	if (nn->nfsd_shrinker_active) {
> +		spin_unlock(&nn->client_lock);
> +		return 0;
> +	}

Is this extra machinery really necessary? The bool and spinlock don't
seem to be needed. Typically there is no issue with calling
queue_delayed_work when the work is already queued. It just returns
false in that case without doing anything.
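
For illustration, a minimal sketch of that pattern (hypothetical; it
reuses laundry_wq and nn from the patch above and is not part of it):

	/*
	 * queue_delayed_work() atomically tests the PENDING bit, so a
	 * concurrent caller simply gets false back while the work is
	 * still queued; no extra gating is normally required.
	 */
	if (!queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0))
		pr_debug("nfsd: shrinker work already pending\n");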

> 
>  	count = atomic_read(&nn->nfsd_courtesy_clients);
>  	if (!count)
>  		count = atomic_long_read(&num_delegations);
> -	if (count)
> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> +	if (count) {
> +		nn->nfsd_shrinker_active = true;
> +		spin_unlock(&nn->client_lock);
> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> +	} else
> +		spin_unlock(&nn->client_lock);
>  	return (unsigned long)count;
>  }
>  
> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>  
>  	courtesy_client_reaper(nn);
>  	deleg_reaper(nn);
> +	spin_lock(&nn->client_lock);
> +	nn->nfsd_shrinker_active = 0;
> +	spin_unlock(&nn->client_lock);
>  }
>  
>  static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
Chuck Lever Jan. 10, 2023, 2:26 p.m. UTC | #2
> On Jan 10, 2023, at 1:48 AM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
> from nfsd4_state_shrinker_count when memory is low. This causes
> the WARN_ON_ONCE in __queue_delayed_work to trigger.
> 
> This patch allows only one instance of nfsd4_state_shrinker_worker
> at a time using the nfsd_shrinker_active flag, protected by the
> client_lock.
> 
> Replace mod_delayed_work with queue_delayed_work since we
> don't expect to modify the delay of any pending work.
> 
> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
> Reported-by: Mike Galbraith <efault@gmx.de>
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
> fs/nfsd/netns.h     |  1 +
> fs/nfsd/nfs4state.c | 16 ++++++++++++++--
> 2 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index 8c854ba3285b..801d70926442 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -196,6 +196,7 @@ struct nfsd_net {
> 	atomic_t		nfsd_courtesy_clients;
> 	struct shrinker		nfsd_client_shrinker;
> 	struct delayed_work	nfsd_shrinker_work;
> +	bool			nfsd_shrinker_active;
> };
> 
> /* Simple check to find out if a given net was properly initialized */
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index ee56c9466304..e00551af6a11 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
> 	struct nfsd_net *nn = container_of(shrink,
> 			struct nfsd_net, nfsd_client_shrinker);
> 
> +	spin_lock(&nn->client_lock);
> +	if (nn->nfsd_shrinker_active) {
> +		spin_unlock(&nn->client_lock);
> +		return 0;
> +	}
> 	count = atomic_read(&nn->nfsd_courtesy_clients);
> 	if (!count)
> 		count = atomic_long_read(&num_delegations);
> -	if (count)
> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> +	if (count) {
> +		nn->nfsd_shrinker_active = true;
> +		spin_unlock(&nn->client_lock);
> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);

As Jeff said, try just replacing the mod_delayed_work() call
with the queue_delayed_work() call, without the extra gating.


> +	} else
> +		spin_unlock(&nn->client_lock);
> 	return (unsigned long)count;
> }
> 
> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
> 
> 	courtesy_client_reaper(nn);
> 	deleg_reaper(nn);
> +	spin_lock(&nn->client_lock);
> +	nn->nfsd_shrinker_active = 0;
> +	spin_unlock(&nn->client_lock);
> }
> 
> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> -- 
> 2.9.5
> 

--
Chuck Lever
Dai Ngo Jan. 10, 2023, 5:33 p.m. UTC | #3
On 1/10/23 2:30 AM, Jeff Layton wrote:
> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>> from nfsd4_state_shrinker_count when memory is low. This causes
>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>
>> This patch allows only one instance of nfsd4_state_shrinker_worker
>> at a time using the nfsd_shrinker_active flag, protected by the
>> client_lock.
>>
>> Replace mod_delayed_work with queue_delayed_work since we
>> don't expect to modify the delay of any pending work.
>>
>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>> Reported-by: Mike Galbraith <efault@gmx.de>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>>   fs/nfsd/netns.h     |  1 +
>>   fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>   2 files changed, 15 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>> index 8c854ba3285b..801d70926442 100644
>> --- a/fs/nfsd/netns.h
>> +++ b/fs/nfsd/netns.h
>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>   	atomic_t		nfsd_courtesy_clients;
>>   	struct shrinker		nfsd_client_shrinker;
>>   	struct delayed_work	nfsd_shrinker_work;
>> +	bool			nfsd_shrinker_active;
>>   };
>>   
>>   /* Simple check to find out if a given net was properly initialized */
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index ee56c9466304..e00551af6a11 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>   	struct nfsd_net *nn = container_of(shrink,
>>   			struct nfsd_net, nfsd_client_shrinker);
>>   
>> +	spin_lock(&nn->client_lock);
>> +	if (nn->nfsd_shrinker_active) {
>> +		spin_unlock(&nn->client_lock);
>> +		return 0;
>> +	}
> Is this extra machinery really necessary? The bool and spinlock don't
> seem to be needed. Typically there is no issue with calling
> queue_delayed_work when the work is already queued. It just returns
> false in that case without doing anything.

When there are multiple calls to mod_delayed_work/queue_delayed_work
we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
the work is queued but not executed yet.
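
For reference, the checks being tripped sit at the top of
__queue_delayed_work() in kernel/workqueue.c, approximately (excerpted,
not from this patch):

	static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
					 struct delayed_work *dwork, unsigned long delay)
	{
		struct timer_list *timer = &dwork->timer;
		struct work_struct *work = &dwork->work;

		WARN_ON_ONCE(!wq);
		WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
		WARN_ON_ONCE(timer_pending(timer));
		WARN_ON_ONCE(!list_empty(&work->entry));
		...
	}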

This problem was reported by Mike. I initially tried with only the
bool, but that was not enough, which is why the spinlock was added.
Mike verified that the patch fixed the problem.

-Dai

>
>>   	count = atomic_read(&nn->nfsd_courtesy_clients);
>>   	if (!count)
>>   		count = atomic_long_read(&num_delegations);
>> -	if (count)
>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>> +	if (count) {
>> +		nn->nfsd_shrinker_active = true;
>> +		spin_unlock(&nn->client_lock);
>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>> +	} else
>> +		spin_unlock(&nn->client_lock);
>>   	return (unsigned long)count;
>>   }
>>   
>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>   
>>   	courtesy_client_reaper(nn);
>>   	deleg_reaper(nn);
>> +	spin_lock(&nn->client_lock);
>> +	nn->nfsd_shrinker_active = 0;
>> +	spin_unlock(&nn->client_lock);
>>   }
>>   
>>   static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
Chuck Lever Jan. 10, 2023, 6:17 p.m. UTC | #4
> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> 
> On 1/10/23 2:30 AM, Jeff Layton wrote:
>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>>> from nfsd4_state_shrinker_count when memory is low. This causes
>>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>> 
>>> This patch allows only one instance of nfsd4_state_shrinker_worker
>>> at a time using the nfsd_shrinker_active flag, protected by the
>>> client_lock.
>>> 
>>> Replace mod_delayed_work with queue_delayed_work since we
>>> don't expect to modify the delay of any pending work.
>>> 
>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>>> Reported-by: Mike Galbraith <efault@gmx.de>
>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>> ---
>>>  fs/nfsd/netns.h     |  1 +
>>>  fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>>  2 files changed, 15 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>> index 8c854ba3285b..801d70926442 100644
>>> --- a/fs/nfsd/netns.h
>>> +++ b/fs/nfsd/netns.h
>>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>>  	atomic_t		nfsd_courtesy_clients;
>>>  	struct shrinker		nfsd_client_shrinker;
>>>  	struct delayed_work	nfsd_shrinker_work;
>>> +	bool			nfsd_shrinker_active;
>>>  };
>>>    /* Simple check to find out if a given net was properly initialized */
>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>> index ee56c9466304..e00551af6a11 100644
>>> --- a/fs/nfsd/nfs4state.c
>>> +++ b/fs/nfsd/nfs4state.c
>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>>  	struct nfsd_net *nn = container_of(shrink,
>>>  			struct nfsd_net, nfsd_client_shrinker);
>>>  +	spin_lock(&nn->client_lock);
>>> +	if (nn->nfsd_shrinker_active) {
>>> +		spin_unlock(&nn->client_lock);
>>> +		return 0;
>>> +	}
>> Is this extra machinery really necessary? The bool and spinlock don't
>> seem to be needed. Typically there is no issue with calling
>> queue_delayed_work when the work is already queued. It just returns
>> false in that case without doing anything.
> 
> When there are multiple calls to mod_delayed_work/queue_delayed_work
> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
> the work is queued but not executed yet.

The delay argument of zero is interesting. If it's set to a value
greater than zero, do you still see a problem?


> This problem was reported by Mike. I initially tried with only the
> bool, but that was not enough, which is why the spinlock was added.
> Mike verified that the patch fixed the problem.
> 
> -Dai
> 
>> 
>>>  	count = atomic_read(&nn->nfsd_courtesy_clients);
>>>  	if (!count)
>>>  		count = atomic_long_read(&num_delegations);
>>> -	if (count)
>>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>> +	if (count) {
>>> +		nn->nfsd_shrinker_active = true;
>>> +		spin_unlock(&nn->client_lock);
>>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>> +	} else
>>> +		spin_unlock(&nn->client_lock);
>>>  	return (unsigned long)count;
>>>  }
>>>  @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>>    	courtesy_client_reaper(nn);
>>>  	deleg_reaper(nn);
>>> +	spin_lock(&nn->client_lock);
>>> +	nn->nfsd_shrinker_active = 0;
>>> +	spin_unlock(&nn->client_lock);
>>>  }
>>>    static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)

--
Chuck Lever
Jeffrey Layton Jan. 10, 2023, 6:34 p.m. UTC | #5
On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote:
> 
> > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > 
> > 
> > On 1/10/23 2:30 AM, Jeff Layton wrote:
> > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
> > > > Currently nfsd4_state_shrinker_worker can be scheduled multiple times
> > > > from nfsd4_state_shrinker_count when memory is low. This causes
> > > > the WARN_ON_ONCE in __queue_delayed_work to trigger.
> > > > 
> > > > This patch allows only one instance of nfsd4_state_shrinker_worker
> > > > at a time using the nfsd_shrinker_active flag, protected by the
> > > > client_lock.
> > > > 
> > > > Replace mod_delayed_work with queue_delayed_work since we
> > > > don't expect to modify the delay of any pending work.
> > > > 
> > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
> > > > Reported-by: Mike Galbraith <efault@gmx.de>
> > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > > ---
> > > >  fs/nfsd/netns.h     |  1 +
> > > >  fs/nfsd/nfs4state.c | 16 ++++++++++++++--
> > > >  2 files changed, 15 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > > > index 8c854ba3285b..801d70926442 100644
> > > > --- a/fs/nfsd/netns.h
> > > > +++ b/fs/nfsd/netns.h
> > > > @@ -196,6 +196,7 @@ struct nfsd_net {
> > > >  	atomic_t		nfsd_courtesy_clients;
> > > >  	struct shrinker		nfsd_client_shrinker;
> > > >  	struct delayed_work	nfsd_shrinker_work;
> > > > +	bool			nfsd_shrinker_active;
> > > >  };
> > > >    /* Simple check to find out if a given net was properly initialized */
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index ee56c9466304..e00551af6a11 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
> > > >  	struct nfsd_net *nn = container_of(shrink,
> > > >  			struct nfsd_net, nfsd_client_shrinker);
> > > >  +	spin_lock(&nn->client_lock);
> > > > +	if (nn->nfsd_shrinker_active) {
> > > > +		spin_unlock(&nn->client_lock);
> > > > +		return 0;
> > > > +	}
> > > Is this extra machinery really necessary? The bool and spinlock don't
> > > seem to be needed. Typically there is no issue with calling
> > > queue_delayed_work when the work is already queued. It just returns
> > > false in that case without doing anything.
> > 
> > When there are multiple calls to mod_delayed_work/queue_delayed_work
> > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
> > the work is queued but not executed yet.
> 
> The delay argument of zero is interesting. If it's set to a value
> greater than zero, do you still see a problem?
> 

It should be safe to call it with a delay of 0. If it's always queued
with a delay of 0 though (and it seems to be), you could save a little
space by using a struct work_struct instead.
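
A minimal sketch of that conversion (hypothetical, not part of the
posted patch; the worker keeps its signature, only its container_of()
changes):

	/* fs/nfsd/netns.h: a plain work item, since the delay is always 0 */
	struct work_struct	nfsd_shrinker_work;

	/* at shrinker registration time: */
	INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);

	/* in nfsd4_state_shrinker_count(): */
	if (count)
		queue_work(laundry_wq, &nn->nfsd_shrinker_work);

	/* and at net teardown: */
	cancel_work_sync(&nn->nfsd_shrinker_work);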

Also, I'm not sure if this is related, but where do you cancel the
nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure
that would explain the problem Mike reported, but I think that needs to
be addressed.

> 
> > This problem was reported by Mike. I initially tried with only the
> > bool, but that was not enough, which is why the spinlock was added.
> > Mike verified that the patch fixed the problem.
> > 
> > -Dai
> > 
> > > 
> > > >  	count = atomic_read(&nn->nfsd_courtesy_clients);
> > > >  	if (!count)
> > > >  		count = atomic_long_read(&num_delegations);
> > > > -	if (count)
> > > > -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > +	if (count) {
> > > > +		nn->nfsd_shrinker_active = true;
> > > > +		spin_unlock(&nn->client_lock);
> > > > +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > +	} else
> > > > +		spin_unlock(&nn->client_lock);
> > > >  	return (unsigned long)count;
> > > >  }
> > > >  @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
> > > >    	courtesy_client_reaper(nn);
> > > >  	deleg_reaper(nn);
> > > > +	spin_lock(&nn->client_lock);
> > > > +	nn->nfsd_shrinker_active = 0;
> > > > +	spin_unlock(&nn->client_lock);
> > > >  }
> > > >    static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> 
> --
> Chuck Lever
> 
> 
>
Dai Ngo Jan. 10, 2023, 6:46 p.m. UTC | #6
On 1/10/23 10:17 AM, Chuck Lever III wrote:
>
>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>
>>
>> On 1/10/23 2:30 AM, Jeff Layton wrote:
>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>>>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>>>> from nfsd4_state_shrinker_count when memory is low. This causes
>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>>>
>>>> This patch allows only one instance of nfsd4_state_shrinker_worker
>>>> at a time using the nfsd_shrinker_active flag, protected by the
>>>> client_lock.
>>>>
>>>> Replace mod_delayed_work with queue_delayed_work since we
>>>> don't expect to modify the delay of any pending work.
>>>>
>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>>>> Reported-by: Mike Galbraith <efault@gmx.de>
>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>> ---
>>>>   fs/nfsd/netns.h     |  1 +
>>>>   fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>>>   2 files changed, 15 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>>> index 8c854ba3285b..801d70926442 100644
>>>> --- a/fs/nfsd/netns.h
>>>> +++ b/fs/nfsd/netns.h
>>>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>>>   	atomic_t		nfsd_courtesy_clients;
>>>>   	struct shrinker		nfsd_client_shrinker;
>>>>   	struct delayed_work	nfsd_shrinker_work;
>>>> +	bool			nfsd_shrinker_active;
>>>>   };
>>>>     /* Simple check to find out if a given net was properly initialized */
>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>> index ee56c9466304..e00551af6a11 100644
>>>> --- a/fs/nfsd/nfs4state.c
>>>> +++ b/fs/nfsd/nfs4state.c
>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>>>   	struct nfsd_net *nn = container_of(shrink,
>>>>   			struct nfsd_net, nfsd_client_shrinker);
>>>>   +	spin_lock(&nn->client_lock);
>>>> +	if (nn->nfsd_shrinker_active) {
>>>> +		spin_unlock(&nn->client_lock);
>>>> +		return 0;
>>>> +	}
>>> Is this extra machinery really necessary? The bool and spinlock don't
>>> seem to be needed. Typically there is no issue with calling
>>> queue_delayed_work when the work is already queued. It just returns
>>> false in that case without doing anything.
>> When there are multiple calls to mod_delayed_work/queue_delayed_work
>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
>> the work is queued but not executed yet.
> The delay argument of zero is interesting. If it's set to a value
> greater than zero, do you still see a problem?

I tried and tried but could not reproduce the problem that Mike
reported. I guess my VMs don't have fast enough cpus to make it
happen.

As Jeff mentioned, delay 0 should be safe and we want to run
the shrinker as soon as possible when memory is low.

-Dai

>
>
>> This problem was reported by Mike. I initially tried with only the
>> bool, but that was not enough, which is why the spinlock was added.
>> Mike verified that the patch fixed the problem.
>>
>> -Dai
>>
>>>>   	count = atomic_read(&nn->nfsd_courtesy_clients);
>>>>   	if (!count)
>>>>   		count = atomic_long_read(&num_delegations);
>>>> -	if (count)
>>>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>> +	if (count) {
>>>> +		nn->nfsd_shrinker_active = true;
>>>> +		spin_unlock(&nn->client_lock);
>>>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>> +	} else
>>>> +		spin_unlock(&nn->client_lock);
>>>>   	return (unsigned long)count;
>>>>   }
>>>>   @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>>>     	courtesy_client_reaper(nn);
>>>>   	deleg_reaper(nn);
>>>> +	spin_lock(&nn->client_lock);
>>>> +	nn->nfsd_shrinker_active = 0;
>>>> +	spin_unlock(&nn->client_lock);
>>>>   }
>>>>     static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> --
> Chuck Lever
>
>
>
Chuck Lever Jan. 10, 2023, 6:53 p.m. UTC | #7
> On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> 
> On 1/10/23 10:17 AM, Chuck Lever III wrote:
>> 
>>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>> 
>>> 
>>> On 1/10/23 2:30 AM, Jeff Layton wrote:
>>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>>>>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>>>>> from nfsd4_state_shrinker_count when memory is low. This causes
>>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>>>> 
>>>>> This patch allows only one instance of nfsd4_state_shrinker_worker
>>>>> at a time using the nfsd_shrinker_active flag, protected by the
>>>>> client_lock.
>>>>> 
>>>>> Replace mod_delayed_work with queue_delayed_work since we
>>>>> don't expect to modify the delay of any pending work.
>>>>> 
>>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>>>>> Reported-by: Mike Galbraith <efault@gmx.de>
>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>> ---
>>>>>  fs/nfsd/netns.h     |  1 +
>>>>>  fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>>>>  2 files changed, 15 insertions(+), 2 deletions(-)
>>>>> 
>>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>>>> index 8c854ba3285b..801d70926442 100644
>>>>> --- a/fs/nfsd/netns.h
>>>>> +++ b/fs/nfsd/netns.h
>>>>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>>>>  	atomic_t		nfsd_courtesy_clients;
>>>>>  	struct shrinker		nfsd_client_shrinker;
>>>>>  	struct delayed_work	nfsd_shrinker_work;
>>>>> +	bool			nfsd_shrinker_active;
>>>>>  };
>>>>>    /* Simple check to find out if a given net was properly initialized */
>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>> index ee56c9466304..e00551af6a11 100644
>>>>> --- a/fs/nfsd/nfs4state.c
>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>>>>  	struct nfsd_net *nn = container_of(shrink,
>>>>>  			struct nfsd_net, nfsd_client_shrinker);
>>>>>  +	spin_lock(&nn->client_lock);
>>>>> +	if (nn->nfsd_shrinker_active) {
>>>>> +		spin_unlock(&nn->client_lock);
>>>>> +		return 0;
>>>>> +	}
>>>> Is this extra machinery really necessary? The bool and spinlock don't
>>>> seem to be needed. Typically there is no issue with calling
>>>> queue_delayed_work when the work is already queued. It just returns
>>>> false in that case without doing anything.
>>> When there are multiple calls to mod_delayed_work/queue_delayed_work
>>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
>>> the work is queued but not executed yet.
>> The delay argument of zero is interesting. If it's set to a value
>> greater than zero, do you still see a problem?
> 
> I tried and tried but could not reproduce the problem that Mike
> reported. I guess my VMs don't have fast enough cpus to make it
> happen.

I'd prefer not to guess... it sounds like we don't have a clear
root cause on this one yet.

I think I agree with Jeff: a spinlock shouldn't be required to
make queuing work safe via this API.


> As Jeff mentioned, delay 0 should be safe and we want to run
> the shrinker as soon as possible when memory is low.

I suggested that because the !delay code paths seem to lead
directly to the WARN_ONs in queue_work(). <shrug>
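
For context, the zero-delay shortcut in __queue_delayed_work() bypasses
the timer entirely (approximate excerpt from kernel/workqueue.c, not
from this patch):

	/* If @delay is 0, queue @dwork->work immediately. */
	if (!delay) {
		__queue_work(cpu, wq, &dwork->work);
		return;
	}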


> -Dai
> 
>> 
>> 
>>> This problem was reported by Mike. I initially tried with only the
>>> bool, but that was not enough, which is why the spinlock was added.
>>> Mike verified that the patch fixed the problem.
>>> 
>>> -Dai
>>> 
>>>>>  	count = atomic_read(&nn->nfsd_courtesy_clients);
>>>>>  	if (!count)
>>>>>  		count = atomic_long_read(&num_delegations);
>>>>> -	if (count)
>>>>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>> +	if (count) {
>>>>> +		nn->nfsd_shrinker_active = true;
>>>>> +		spin_unlock(&nn->client_lock);
>>>>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>> +	} else
>>>>> +		spin_unlock(&nn->client_lock);
>>>>>  	return (unsigned long)count;
>>>>>  }
>>>>>  @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>>>>    	courtesy_client_reaper(nn);
>>>>>  	deleg_reaper(nn);
>>>>> +	spin_lock(&nn->client_lock);
>>>>> +	nn->nfsd_shrinker_active = 0;
>>>>> +	spin_unlock(&nn->client_lock);
>>>>>  }
>>>>>    static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
>> --
>> Chuck Lever

--
Chuck Lever
Dai Ngo Jan. 10, 2023, 7:07 p.m. UTC | #8
On 1/10/23 10:53 AM, Chuck Lever III wrote:
>
>> On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>
>>
>> On 1/10/23 10:17 AM, Chuck Lever III wrote:
>>>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>>>
>>>>
>>>> On 1/10/23 2:30 AM, Jeff Layton wrote:
>>>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>>>>>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>>>>>> from nfsd4_state_shrinker_count when memory is low. This causes
>>>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>>>>>
>>>>>> This patch allows only one instance of nfsd4_state_shrinker_worker
>>>>>> at a time using the nfsd_shrinker_active flag, protected by the
>>>>>> client_lock.
>>>>>>
>>>>>> Replace mod_delayed_work with queue_delayed_work since we
>>>>>> don't expect to modify the delay of any pending work.
>>>>>>
>>>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>>>>>> Reported-by: Mike Galbraith <efault@gmx.de>
>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>>> ---
>>>>>>   fs/nfsd/netns.h     |  1 +
>>>>>>   fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>>>>>   2 files changed, 15 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>>>>> index 8c854ba3285b..801d70926442 100644
>>>>>> --- a/fs/nfsd/netns.h
>>>>>> +++ b/fs/nfsd/netns.h
>>>>>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>>>>>   	atomic_t		nfsd_courtesy_clients;
>>>>>>   	struct shrinker		nfsd_client_shrinker;
>>>>>>   	struct delayed_work	nfsd_shrinker_work;
>>>>>> +	bool			nfsd_shrinker_active;
>>>>>>   };
>>>>>>     /* Simple check to find out if a given net was properly initialized */
>>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>>> index ee56c9466304..e00551af6a11 100644
>>>>>> --- a/fs/nfsd/nfs4state.c
>>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>>>>>   	struct nfsd_net *nn = container_of(shrink,
>>>>>>   			struct nfsd_net, nfsd_client_shrinker);
>>>>>>   +	spin_lock(&nn->client_lock);
>>>>>> +	if (nn->nfsd_shrinker_active) {
>>>>>> +		spin_unlock(&nn->client_lock);
>>>>>> +		return 0;
>>>>>> +	}
>>>>> Is this extra machinery really necessary? The bool and spinlock don't
>>>>> seem to be needed. Typically there is no issue with calling
>>>>> queue_delayed_work when the work is already queued. It just returns
>>>>> false in that case without doing anything.
>>>> When there are multiple calls to mod_delayed_work/queue_delayed_work
>>>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
>>>> the work is queued but not executed yet.
>>> The delay argument of zero is interesting. If it's set to a value
>>> greater than zero, do you still see a problem?
>> I tried and tried but could not reproduce the problem that Mike
>> reported. I guess my VMs don't have fast enough cpus to make it
>> happen.
> I'd prefer not to guess... it sounds like we don't have a clear
> root cause on this one yet.

Yes, we do, as I explained above. The reason I could not reproduce
it is that my system is not fast enough to get multiple calls to
nfsd4_state_shrinker_count simultaneously.

>
> I think I agree with Jeff: a spinlock shouldn't be required to
> make queuing work safe via this API.
>
>
>> As Jeff mentioned, delay 0 should be safe and we want to run
>> the shrinker as soon as possible when memory is low.
> I suggested that because the !delay code paths seem to lead
> directly to the WARN_ONs in queue_work(). <shrug>

If we use 0 delay then we need the spinlock, as proven by Mike's test.
If we use delay != 0 then it may work without the spinlock; we will
ask Mike to retest it.

You and Jeff decide what the delay is and I will make the change
and retest.

-Dai

>
>
>> -Dai
>>
>>>
>>>> This problem was reported by Mike. I initially tried with only the
>>>> bool, but that was not enough, which is why the spinlock was added.
>>>> Mike verified that the patch fixed the problem.
>>>>
>>>> -Dai
>>>>
>>>>>>   	count = atomic_read(&nn->nfsd_courtesy_clients);
>>>>>>   	if (!count)
>>>>>>   		count = atomic_long_read(&num_delegations);
>>>>>> -	if (count)
>>>>>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>>> +	if (count) {
>>>>>> +		nn->nfsd_shrinker_active = true;
>>>>>> +		spin_unlock(&nn->client_lock);
>>>>>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>>> +	} else
>>>>>> +		spin_unlock(&nn->client_lock);
>>>>>>   	return (unsigned long)count;
>>>>>>   }
>>>>>>   @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>>>>>     	courtesy_client_reaper(nn);
>>>>>>   	deleg_reaper(nn);
>>>>>> +	spin_lock(&nn->client_lock);
>>>>>> +	nn->nfsd_shrinker_active = 0;
>>>>>> +	spin_unlock(&nn->client_lock);
>>>>>>   }
>>>>>>     static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
>>> --
>>> Chuck Lever
> --
> Chuck Lever
>
>
>
Jeffrey Layton Jan. 10, 2023, 7:16 p.m. UTC | #9
On Tue, 2023-01-10 at 18:53 +0000, Chuck Lever III wrote:
> 
> > On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > 
> > 
> > On 1/10/23 10:17 AM, Chuck Lever III wrote:
> > > 
> > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > > > 
> > > > 
> > > > On 1/10/23 2:30 AM, Jeff Layton wrote:
> > > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
> > > > > > Currently nfsd4_state_shrinker_worker can be scheduled multiple times
> > > > > > from nfsd4_state_shrinker_count when memory is low. This causes
> > > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger.
> > > > > > 
> > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker
> > > > > > at a time using the nfsd_shrinker_active flag, protected by the
> > > > > > client_lock.
> > > > > > 
> > > > > > Replace mod_delayed_work with queue_delayed_work since we
> > > > > > don't expect to modify the delay of any pending work.
> > > > > > 
> > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
> > > > > > Reported-by: Mike Galbraith <efault@gmx.de>
> > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > > > > ---
> > > > > >  fs/nfsd/netns.h     |  1 +
> > > > > >  fs/nfsd/nfs4state.c | 16 ++++++++++++++--
> > > > > >  2 files changed, 15 insertions(+), 2 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > > > > > index 8c854ba3285b..801d70926442 100644
> > > > > > --- a/fs/nfsd/netns.h
> > > > > > +++ b/fs/nfsd/netns.h
> > > > > > @@ -196,6 +196,7 @@ struct nfsd_net {
> > > > > >  	atomic_t		nfsd_courtesy_clients;
> > > > > >  	struct shrinker		nfsd_client_shrinker;
> > > > > >  	struct delayed_work	nfsd_shrinker_work;
> > > > > > +	bool			nfsd_shrinker_active;
> > > > > >  };
> > > > > >    /* Simple check to find out if a given net was properly initialized */
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index ee56c9466304..e00551af6a11 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
> > > > > >  	struct nfsd_net *nn = container_of(shrink,
> > > > > >  			struct nfsd_net, nfsd_client_shrinker);
> > > > > >  +	spin_lock(&nn->client_lock);
> > > > > > +	if (nn->nfsd_shrinker_active) {
> > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > +		return 0;
> > > > > > +	}
> > > > > Is this extra machinery really necessary? The bool and spinlock don't
> > > > > seem to be needed. Typically there is no issue with calling
> > > > > queue_delayed_work when the work is already queued. It just returns
> > > > > false in that case without doing anything.
> > > > When there are multiple calls to mod_delayed_work/queue_delayed_work
> > > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
> > > > the work is queued but not executed yet.
> > > The delay argument of zero is interesting. If it's set to a value
> > > greater than zero, do you still see a problem?
> > 
> > I tried and tried but could not reproduce the problem that Mike
> > reported. I guess my VMs don't have fast enough cpus to make it
> > happen.
> 
> I'd prefer not to guess... it sounds like we don't have a clear
> root cause on this one yet.
> 
> I think I agree with Jeff: a spinlock shouldn't be required to
> make queuing work safe via this API.
> 
> 
> > As Jeff mentioned, delay 0 should be safe and we want to run
> > the shrinker as soon as possible when memory is low.
> 
> I suggested that because the !delay code paths seem to lead
> directly to the WARN_ONs in queue_work(). <shrug>
> 
> 


One of the WARNs that Mike hit was this:

        WARN_ON_ONCE(timer->function != delayed_work_timer_fn);

nfsd isn't doing anything exotic with that function pointer, so that
really looks like something got corrupted. Given that this is happening
under low-memory conditions, I have to wonder if we're just ending
up with a workqueue job that remained on the queue after the nfsd_net
got freed and recycled.

I'd start with a patch like this (note, untested):

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2f4a2449b314..86da6663806e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8158,6 +8158,7 @@ nfs4_state_shutdown_net(struct net *net)
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
        unregister_shrinker(&nn->nfsd_client_shrinker);
+       cancel_delayed_work_sync(&nn->nfsd_shrinker_work);
        cancel_delayed_work_sync(&nn->laundromat_work);
        locks_end_grace(&nn->nfsd4_manager);


Either way, I think that longer term, nfsd_shrinker_work ought to be
converted to a normal work_struct since you don't ever use the delay.

> > -Dai
> > 
> > > 
> > > 
> > > > This problem was reported by Mike. I initially tried with only the
> > > > bool, but that was not enough, which is why the spinlock was added.
> > > > Mike verified that the patch fixed the problem.
> > > > 
> > > > -Dai
> > > > 
> > > > > >  	count = atomic_read(&nn->nfsd_courtesy_clients);
> > > > > >  	if (!count)
> > > > > >  		count = atomic_long_read(&num_delegations);
> > > > > > -	if (count)
> > > > > > -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > > > +	if (count) {
> > > > > > +		nn->nfsd_shrinker_active = true;
> > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > > > +	} else
> > > > > > +		spin_unlock(&nn->client_lock);
> > > > > >  	return (unsigned long)count;
> > > > > >  }
> > > > > >  @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
> > > > > >    	courtesy_client_reaper(nn);
> > > > > >  	deleg_reaper(nn);
> > > > > > +	spin_lock(&nn->client_lock);
> > > > > > +	nn->nfsd_shrinker_active = 0;
> > > > > > +	spin_unlock(&nn->client_lock);
> > > > > >  }
> > > > > >    static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> > > --
> > > Chuck Lever
> 
> --
> Chuck Lever
> 
> 
>
Dai Ngo Jan. 10, 2023, 7:17 p.m. UTC | #10
On 1/10/23 10:34 AM, Jeff Layton wrote:
> On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote:
>>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>>
>>>
>>> On 1/10/23 2:30 AM, Jeff Layton wrote:
>>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>>>>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>>>>> from nfsd4_state_shrinker_count when memory is low. This causes
>>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>>>>
>>>>> This patch allows only one instance of nfsd4_state_shrinker_worker
>>>>> at a time using the nfsd_shrinker_active flag, protected by the
>>>>> client_lock.
>>>>>
>>>>> Replace mod_delayed_work with queue_delayed_work since we
>>>>> don't expect to modify the delay of any pending work.
>>>>>
>>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>>>>> Reported-by: Mike Galbraith <efault@gmx.de>
>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>> ---
>>>>>   fs/nfsd/netns.h     |  1 +
>>>>>   fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>>>>   2 files changed, 15 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>>>> index 8c854ba3285b..801d70926442 100644
>>>>> --- a/fs/nfsd/netns.h
>>>>> +++ b/fs/nfsd/netns.h
>>>>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>>>>   	atomic_t		nfsd_courtesy_clients;
>>>>>   	struct shrinker		nfsd_client_shrinker;
>>>>>   	struct delayed_work	nfsd_shrinker_work;
>>>>> +	bool			nfsd_shrinker_active;
>>>>>   };
>>>>>     /* Simple check to find out if a given net was properly initialized */
>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>> index ee56c9466304..e00551af6a11 100644
>>>>> --- a/fs/nfsd/nfs4state.c
>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>>>>   	struct nfsd_net *nn = container_of(shrink,
>>>>>   			struct nfsd_net, nfsd_client_shrinker);
>>>>>   +	spin_lock(&nn->client_lock);
>>>>> +	if (nn->nfsd_shrinker_active) {
>>>>> +		spin_unlock(&nn->client_lock);
>>>>> +		return 0;
>>>>> +	}
>>>> Is this extra machinery really necessary? The bool and spinlock don't
>>>> seem to be needed. Typically there is no issue with calling
>>>> queue_delayed_work when the work is already queued. It just returns
>>>> false in that case without doing anything.
>>> When there are multiple calls to mod_delayed_work/queue_delayed_work
>>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
>>> the work is queued but not executed yet.
>> The delay argument of zero is interesting. If it's set to a value
>> greater than zero, do you still see a problem?
>>
> It should be safe to call it with a delay of 0. If it's always queued
> with a delay of 0 though (and it seems to be), you could save a little
> space by using a struct work_struct instead.

Can I defer this optimization for now? I need some time to look into it.

>
> Also, I'm not sure if this is related, but where do you cancel the
> nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure
> that would explain the problem Mike reported, but I think that needs to
> be addressed.

Yes, good catch. I will add the cancelling in the v2 patch.

Thanks,
-Dai

>
>>> This problem was reported by Mike. I initially tried with only the
>>> bool, but that was not enough, which is why the spinlock was added.
>>> Mike verified that the patch fixed the problem.
>>>
>>> -Dai
>>>
>>>>>   	count = atomic_read(&nn->nfsd_courtesy_clients);
>>>>>   	if (!count)
>>>>>   		count = atomic_long_read(&num_delegations);
>>>>> -	if (count)
>>>>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>> +	if (count) {
>>>>> +		nn->nfsd_shrinker_active = true;
>>>>> +		spin_unlock(&nn->client_lock);
>>>>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>> +	} else
>>>>> +		spin_unlock(&nn->client_lock);
>>>>>   	return (unsigned long)count;
>>>>>   }
>>>>>   @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>>>>     	courtesy_client_reaper(nn);
>>>>>   	deleg_reaper(nn);
>>>>> +	spin_lock(&nn->client_lock);
>>>>> +	nn->nfsd_shrinker_active = 0;
>>>>> +	spin_unlock(&nn->client_lock);
>>>>>   }
>>>>>     static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
>> --
>> Chuck Lever
>>
>>
>>
Jeffrey Layton Jan. 10, 2023, 7:27 p.m. UTC | #11
On Tue, 2023-01-10 at 11:07 -0800, dai.ngo@oracle.com wrote:
> On 1/10/23 10:53 AM, Chuck Lever III wrote:
> > 
> > > On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > > 
> > > 
> > > On 1/10/23 10:17 AM, Chuck Lever III wrote:
> > > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > > > > 
> > > > > 
> > > > > On 1/10/23 2:30 AM, Jeff Layton wrote:
> > > > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
> > > > > > > Currently nfsd4_state_shrinker_worker can be scheduled multiple times
> > > > > > > from nfsd4_state_shrinker_count when memory is low. This causes
> > > > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger.
> > > > > > > 
> > > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker
> > > > > > > at a time using the nfsd_shrinker_active flag, protected by the
> > > > > > > client_lock.
> > > > > > > 
> > > > > > > Replace mod_delayed_work with queue_delayed_work since we
> > > > > > > don't expect to modify the delay of any pending work.
> > > > > > > 
> > > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
> > > > > > > Reported-by: Mike Galbraith <efault@gmx.de>
> > > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > > > > > ---
> > > > > > >   fs/nfsd/netns.h     |  1 +
> > > > > > >   fs/nfsd/nfs4state.c | 16 ++++++++++++++--
> > > > > > >   2 files changed, 15 insertions(+), 2 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > > > > > > index 8c854ba3285b..801d70926442 100644
> > > > > > > --- a/fs/nfsd/netns.h
> > > > > > > +++ b/fs/nfsd/netns.h
> > > > > > > @@ -196,6 +196,7 @@ struct nfsd_net {
> > > > > > >   	atomic_t		nfsd_courtesy_clients;
> > > > > > >   	struct shrinker		nfsd_client_shrinker;
> > > > > > >   	struct delayed_work	nfsd_shrinker_work;
> > > > > > > +	bool			nfsd_shrinker_active;
> > > > > > >   };
> > > > > > >     /* Simple check to find out if a given net was properly initialized */
> > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > index ee56c9466304..e00551af6a11 100644
> > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
> > > > > > >   	struct nfsd_net *nn = container_of(shrink,
> > > > > > >   			struct nfsd_net, nfsd_client_shrinker);
> > > > > > >   +	spin_lock(&nn->client_lock);
> > > > > > > +	if (nn->nfsd_shrinker_active) {
> > > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > > +		return 0;
> > > > > > > +	}
> > > > > > Is this extra machinery really necessary? The bool and spinlock don't
> > > > > > seem to be needed. Typically there is no issue with calling
> > > > > > queue_delayed_work when the work is already queued. It just returns
> > > > > > false in that case without doing anything.
> > > > > When there are multiple calls to mod_delayed_work/queue_delayed_work
> > > > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
> > > > > the work is queued but not executed yet.
> > > > The delay argument of zero is interesting. If it's set to a value
> > > > greater than zero, do you still see a problem?
> > > I tried and tried but could not reproduce the problem that Mike
> > > reported. I guess my VMs don't have fast enough cpus to make it
> > > happen.
> > I'd prefer not to guess... it sounds like we don't have a clear
> > root cause on this one yet.
> 
> Yes, we do, as I explained above. The reason I could not reproduce
> it is that my system is not fast enough to get multiple calls to
> nfsd4_state_shrinker_count simultaneously.
> 
> > 
> > I think I agree with Jeff: a spinlock shouldn't be required to
> > make queuing work safe via this API.
> > 
> > 
> > > As Jeff mentioned, delay 0 should be safe and we want to run
> > > the shrinker as soon as possible when memory is low.
> > I suggested that because the !delay code paths seem to lead
> > directly to the WARN_ONs in queue_work(). <shrug>
> 
> If we use 0 delay then we need the spinlock, as proven by Mike's test.
> If we use delay != 0 then it may work without the spinlock; we will
> ask Mike to retest it.
> 
> You and Jeff decide what the delay is and I will make the change
> and retest.
> 

I don't think this is the issue either. It's more likely that this is
just changing the timing such that it didn't happen. Looking at the
traces that Mike posted, it really looks like the delayed_work is
corrupt.

> > 
> > 
> > > -Dai
> > > 
> > > > 
> > > > > This problem was reported by Mike. I initially tried with only the
> > > > > bool, but that was not enough, which is why the spinlock was added.
> > > > > Mike verified that the patch fixed the problem.
> > > > > 
> > > > > -Dai
> > > > > 
> > > > > > >   	count = atomic_read(&nn->nfsd_courtesy_clients);
> > > > > > >   	if (!count)
> > > > > > >   		count = atomic_long_read(&num_delegations);
> > > > > > > -	if (count)
> > > > > > > -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > > > > +	if (count) {
> > > > > > > +		nn->nfsd_shrinker_active = true;
> > > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > > +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > > > > +	} else
> > > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > >   	return (unsigned long)count;
> > > > > > >   }
> > > > > > >   @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
> > > > > > >     	courtesy_client_reaper(nn);
> > > > > > >   	deleg_reaper(nn);
> > > > > > > +	spin_lock(&nn->client_lock);
> > > > > > > +	nn->nfsd_shrinker_active = 0;
> > > > > > > +	spin_unlock(&nn->client_lock);
> > > > > > >   }
> > > > > > >     static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> > > > --
> > > > Chuck Lever
> > --
> > Chuck Lever
> > 
> > 
> >
Jeffrey Layton Jan. 10, 2023, 7:30 p.m. UTC | #12
On Tue, 2023-01-10 at 11:17 -0800, dai.ngo@oracle.com wrote:
> On 1/10/23 10:34 AM, Jeff Layton wrote:
> > On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote:
> > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > > > 
> > > > 
> > > > On 1/10/23 2:30 AM, Jeff Layton wrote:
> > > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
> > > > > > Currently nfsd4_state_shrinker_worker can be scheduled multiple times
> > > > > > from nfsd4_state_shrinker_count when memory is low. This causes
> > > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger.
> > > > > > 
> > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker
> > > > > > at a time using the nfsd_shrinker_active flag, protected by the
> > > > > > client_lock.
> > > > > > 
> > > > > > Replace mod_delayed_work with queue_delayed_work since we
> > > > > > don't expect to modify the delay of any pending work.
> > > > > > 
> > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
> > > > > > Reported-by: Mike Galbraith <efault@gmx.de>
> > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > > > > ---
> > > > > >   fs/nfsd/netns.h     |  1 +
> > > > > >   fs/nfsd/nfs4state.c | 16 ++++++++++++++--
> > > > > >   2 files changed, 15 insertions(+), 2 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > > > > > index 8c854ba3285b..801d70926442 100644
> > > > > > --- a/fs/nfsd/netns.h
> > > > > > +++ b/fs/nfsd/netns.h
> > > > > > @@ -196,6 +196,7 @@ struct nfsd_net {
> > > > > >   	atomic_t		nfsd_courtesy_clients;
> > > > > >   	struct shrinker		nfsd_client_shrinker;
> > > > > >   	struct delayed_work	nfsd_shrinker_work;
> > > > > > +	bool			nfsd_shrinker_active;
> > > > > >   };
> > > > > >     /* Simple check to find out if a given net was properly initialized */
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index ee56c9466304..e00551af6a11 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
> > > > > >   	struct nfsd_net *nn = container_of(shrink,
> > > > > >   			struct nfsd_net, nfsd_client_shrinker);
> > > > > >   +	spin_lock(&nn->client_lock);
> > > > > > +	if (nn->nfsd_shrinker_active) {
> > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > +		return 0;
> > > > > > +	}
> > > > > Is this extra machinery really necessary? The bool and spinlock don't
> > > > > seem to be needed. Typically there is no issue with calling
> > > > > queue_delayed_work when the work is already queued. It just returns
> > > > > false in that case without doing anything.
> > > > When there are multiple calls to mod_delayed_work/queue_delayed_work
> > > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
> > > > the work is queued but not executed yet.
> > > The delay argument of zero is interesting. If it's set to a value
> > > greater than zero, do you still see a problem?
> > > 
> > It should be safe to call it with a delay of 0. If it's always queued
> > with a delay of 0 though (and it seems to be), you could save a little
> > space by using a struct work_struct instead.
> 
> Can I defer this optimization for now? I need some time to look into it.
> 

I'd like to see it as part of the eventual patch that's merged. There's
no reason to use a delayed_work struct here at all. You always want the
shrinker work to run ASAP. It should be a simple conversion.
> > 
> > Also, I'm not sure if this is related, but where do you cancel the
> > nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure
> > that would explain the problem Mike reported, but I think that needs to
> > be addressed.
> 
> Yes, good catch. I will add the cancelling in the v2 patch.
> 
> 

Looking over the traces that Mike posted, I suspect this is the real
bug, particularly if the server is being restarted during this test.

> > 
> > > > This problem was reported by Mike. I initially tried with only the
> > > > bool, but that was not enough, which is why the spinlock was added.
> > > > Mike verified that the patch fixed the problem.
> > > > 
> > > > -Dai
> > > > 
> > > > > >   	count = atomic_read(&nn->nfsd_courtesy_clients);
> > > > > >   	if (!count)
> > > > > >   		count = atomic_long_read(&num_delegations);
> > > > > > -	if (count)
> > > > > > -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > > > +	if (count) {
> > > > > > +		nn->nfsd_shrinker_active = true;
> > > > > > +		spin_unlock(&nn->client_lock);
> > > > > > +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
> > > > > > +	} else
> > > > > > +		spin_unlock(&nn->client_lock);
> > > > > >   	return (unsigned long)count;
> > > > > >   }
> > > > > >   @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
> > > > > >     	courtesy_client_reaper(nn);
> > > > > >   	deleg_reaper(nn);
> > > > > > +	spin_lock(&nn->client_lock);
> > > > > > +	nn->nfsd_shrinker_active = 0;
> > > > > > +	spin_unlock(&nn->client_lock);
> > > > > >   }
> > > > > >     static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> > > --
> > > Chuck Lever
> > > 
> > > 
> > >
Dai Ngo Jan. 10, 2023, 7:58 p.m. UTC | #13
On 1/10/23 11:30 AM, Jeff Layton wrote:
> On Tue, 2023-01-10 at 11:17 -0800, dai.ngo@oracle.com wrote:
>> On 1/10/23 10:34 AM, Jeff Layton wrote:
>>> On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote:
>>>>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>>>>
>>>>>
>>>>> On 1/10/23 2:30 AM, Jeff Layton wrote:
>>>>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote:
>>>>>> Currently nfsd4_state_shrinker_worker can be scheduled multiple times
>>>>>>> from nfsd4_state_shrinker_count when memory is low. This causes
>>>>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger.
>>>>>>>
>>>>>>> This patch allows only one instance of nfsd4_state_shrinker_worker
>>>>>>> at a time using the nfsd_shrinker_active flag, protected by the
>>>>>>> client_lock.
>>>>>>>
>>>>>>> Replace mod_delayed_work with queue_delayed_work since we
>>>>>>> don't expect to modify the delay of any pending work.
>>>>>>>
>>>>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition")
>>>>>>> Reported-by: Mike Galbraith <efault@gmx.de>
>>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>>>> ---
>>>>>>>    fs/nfsd/netns.h     |  1 +
>>>>>>>    fs/nfsd/nfs4state.c | 16 ++++++++++++++--
>>>>>>>    2 files changed, 15 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>>>>>> index 8c854ba3285b..801d70926442 100644
>>>>>>> --- a/fs/nfsd/netns.h
>>>>>>> +++ b/fs/nfsd/netns.h
>>>>>>> @@ -196,6 +196,7 @@ struct nfsd_net {
>>>>>>>    	atomic_t		nfsd_courtesy_clients;
>>>>>>>    	struct shrinker		nfsd_client_shrinker;
>>>>>>>    	struct delayed_work	nfsd_shrinker_work;
>>>>>>> +	bool			nfsd_shrinker_active;
>>>>>>>    };
>>>>>>>      /* Simple check to find out if a given net was properly initialized */
>>>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>>>> index ee56c9466304..e00551af6a11 100644
>>>>>>> --- a/fs/nfsd/nfs4state.c
>>>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
>>>>>>>    	struct nfsd_net *nn = container_of(shrink,
>>>>>>>    			struct nfsd_net, nfsd_client_shrinker);
>>>>>>>    +	spin_lock(&nn->client_lock);
>>>>>>> +	if (nn->nfsd_shrinker_active) {
>>>>>>> +		spin_unlock(&nn->client_lock);
>>>>>>> +		return 0;
>>>>>>> +	}
>>>>>> Is this extra machinery really necessary? The bool and spinlock don't
>>>>>> seem to be needed. Typically there is no issue with calling
>>>>>> queue_delayed_work when the work is already queued. It just returns
>>>>>> false in that case without doing anything.
>>>>> When there are multiple calls to mod_delayed_work/queue_delayed_work
>>>>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if
>>>>> the work is queued but not executed yet.
>>>> The delay argument of zero is interesting. If it's set to a value
>>>> greater than zero, do you still see a problem?
>>>>
>>> It should be safe to call it with a delay of 0. If it's always queued
>>> with a delay of 0 though (and it seems to be), you could save a little
>>> space by using a struct work_struct instead.
>> Can I defer this optimization for now? I need some time to look into it.
>>
> I'd like to see it as part of the eventual patch that's merged. There's
> no reason to use a delayed_work struct here at all. You always want the
> shrinker work to run ASAP. It should be a simple conversion.

ok, I'll make the change in v2.

>>> Also, I'm not sure if this is related, but where do you cancel the
>>> nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure
>>> that would explain the problem Mike reported, but I think that needs to
>>> be addressed.
>> Yes, good catch. I will add the cancelling in v2 patch.
>>
>>
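
A sketch of the missing cancellation, assuming it sits in the pernet
shutdown path next to the shrinker unregistration (the exact hook is a
guess, not the posted fix; while the field is still a delayed_work the
call would be cancel_delayed_work_sync() instead):

	static void nfs4_state_shutdown_net(struct net *net)
	{
		struct nfsd_net *nn = net_generic(net, nfsd_net_id);

		/* unregister first so the count callback can't requeue... */
		unregister_shrinker(&nn->nfsd_client_shrinker);
		/* ...then wait out any instance that is already running */
		cancel_work_sync(&nn->nfsd_shrinker_work);
		/* ... existing teardown ... */
	}

The ordering matters: cancelling before unregistering would leave a
window in which the shrinker could requeue the work against a dying
nfsd_net.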
> Looking over the traces that Mike posted, I suspect this is the real
> bug, particularly if the server is being restarted during this test.

Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn)
too and this seems to indicate some kind of corruption. However, I'm not
sure if Mike's test restarts the nfs-server service. This could be a bug
in the workqueue module when it's under stress.

-Dai

>
>>>>> This problem was reported by Mike. I initially tried with only the
>>>>> bool but that was not enough, which was why the spinlock was added.
>>>>> Mike verified that the patch fixed the problem.
>>>>>
>>>>> -Dai
>>>>>
>>>>>>>    	count = atomic_read(&nn->nfsd_courtesy_clients);
>>>>>>>    	if (!count)
>>>>>>>    		count = atomic_long_read(&num_delegations);
>>>>>>> -	if (count)
>>>>>>> -		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>>>> +	if (count) {
>>>>>>> +		nn->nfsd_shrinker_active = true;
>>>>>>> +		spin_unlock(&nn->client_lock);
>>>>>>> +		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
>>>>>>> +	} else
>>>>>>> +		spin_unlock(&nn->client_lock);
>>>>>>>    	return (unsigned long)count;
>>>>>>>    }
>>>>>>>    @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work)
>>>>>>>      	courtesy_client_reaper(nn);
>>>>>>>    	deleg_reaper(nn);
>>>>>>> +	spin_lock(&nn->client_lock);
>>>>>>> +	nn->nfsd_shrinker_active = 0;
>>>>>>> +	spin_unlock(&nn->client_lock);
>>>>>>>    }
>>>>>>>      static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
>>>> --
>>>> Chuck Lever
>>>>
>>>>
>>>>
Mike Galbraith Jan. 11, 2023, 2:34 a.m. UTC | #14
On Tue, 2023-01-10 at 11:58 -0800, dai.ngo@oracle.com wrote:
>
> On 1/10/23 11:30 AM, Jeff Layton wrote:
>
> > >
> > >
> > Looking over the traces that Mike posted, I suspect this is the real
> > bug, particularly if the server is being restarted during this test.
>
> Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn)
> too and this seems to indicate some kind of corruption. However, I'm not
> sure if Mike's test restarts the nfs-server service. This could be a bug
> in the workqueue module when it's under stress.

My reproducer was to merely mount and traverse/md5sum, while that was
going on, fire up LTP's min_free_kbytes testcase (memory hog from hell)
on the server.  Systemthing may well be restarting the server service
in response to oomkill.  In fact, the struct delayed_work in question
at WARN_ON_ONCE() time didn't look the least bit ready for business.

FWIW, I had noticed the missing cancel while eyeballing, and stuck one
next to the existing one as a hail-mary, but that helped not at all.

crash> delayed_work ffff8881601fab48
struct delayed_work {
  work = {
    data = {
      counter = 1
    },
    entry = {
      next = 0x0,
      prev = 0x0
    },
    func = 0x0
  },
  timer = {
    entry = {
      next = 0x0,
      pprev = 0x0
    },
    expires = 0,
    function = 0x0,
    flags = 0
  },
  wq = 0x0,
  cpu = 0
}
Jeffrey Layton Jan. 11, 2023, 10:15 a.m. UTC | #15
On Wed, 2023-01-11 at 03:34 +0100, Mike Galbraith wrote:
> On Tue, 2023-01-10 at 11:58 -0800, dai.ngo@oracle.com wrote:
> > 
> > On 1/10/23 11:30 AM, Jeff Layton wrote:
> > 
> > > > 
> > > > 
> > > Looking over the traces that Mike posted, I suspect this is the real
> > > bug, particularly if the server is being restarted during this test.
> > 
> > Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn)
> > too and this seems to indicate some kind of corruption. However, I'm not
> > sure if Mike's test restarts the nfs-server service. This could be a bug
> > in the workqueue module when it's under stress.
> 
> My reproducer was to merely mount and traverse/md5sum, while that was
> going on, fire up LTP's min_free_kbytes testcase (memory hog from hell)
> on the server.  Systemthing may well be restarting the server service
> in response to oomkill.  In fact, the struct delayed_work in question
> at WARN_ON_ONCE() time didn't look the least bit ready for business.
> 
> FWIW, I had noticed the missing cancel while eyeballing, and stuck one
> next to the existing one as a hail-mary, but that helped not at all.
> 

Ok, thanks, that's good to know.

I still doubt that the problem is the race that Dai seems to think it
is. The workqueue infrastructure has been fairly stable for years. If
there were problems with concurrent tasks queueing the same work, the
kernel would be blowing up all over the place.
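
That is, queue_work() and queue_delayed_work() already serialize
concurrent callers on the work's PENDING bit, so the usual pattern
needs no extra locking. A sketch for context (placement in the count
callback is hypothetical):

	/* only the caller that wins the PENDING bit enqueues the work;
	 * everyone else gets false back and can simply carry on */
	if (count)
		queue_work(laundry_wq, &nn->nfsd_shrinker_work);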

> crash> delayed_work ffff8881601fab48
> struct delayed_work {
>   work = {
>     data = {
>       counter = 1
>     },
>     entry = {
>       next = 0x0,
>       prev = 0x0
>     },
>     func = 0x0
>   },
>   timer = {
>     entry = {
>       next = 0x0,
>       pprev = 0x0
>     },
>     expires = 0,
>     function = 0x0,
>     flags = 0
>   },
>   wq = 0x0,
>   cpu = 0
> }

That looks more like a memory scribble or UAF. Merely having multiple
tasks calling queue_work at the same time wouldn't be enough to trigger
this, IMO. It's more likely that the extra locking is changing the
timing of your reproducer somehow.

It might be interesting to turn up KASAN if you're able.
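
For anyone reproducing: a typical KASAN configuration, using the
standard Kconfig symbols (generic mode shown):

	CONFIG_KASAN=y
	CONFIG_KASAN_GENERIC=y
	CONFIG_KASAN_INLINE=y	# faster checks; OUTLINE also works
	CONFIG_SLUB_DEBUG=y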
Jeffrey Layton Jan. 11, 2023, 10:55 a.m. UTC | #16
On Wed, 2023-01-11 at 05:15 -0500, Jeff Layton wrote:
> On Wed, 2023-01-11 at 03:34 +0100, Mike Galbraith wrote:
> > On Tue, 2023-01-10 at 11:58 -0800, dai.ngo@oracle.com wrote:
> > > 
> > > On 1/10/23 11:30 AM, Jeff Layton wrote:
> > > 
> > > > > 
> > > > > 
> > > > Looking over the traces that Mike posted, I suspect this is the real
> > > > bug, particularly if the server is being restarted during this test.
> > > 
> > > Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn)
> > > too and this seems to indicate some kind of corruption. However, I'm not
> > > sure if Mike's test restarts the nfs-server service. This could be a bug
> > > in the workqueue module when it's under stress.
> > 
> > My reproducer was to merely mount and traverse/md5sum, while that was
> > going on, fire up LTP's min_free_kbytes testcase (memory hog from hell)
> > on the server.  Systemthing may well be restarting the server service
> > in response to oomkill.  In fact, the struct delayed_work in question
> > at WARN_ON_ONCE() time didn't look the least bit ready for business.
> > 
> > FWIW, I had noticed the missing cancel while eyeballing, and stuck one
> > next to the existing one as a hail-mary, but that helped not at all.
> > 
> 
> Ok, thanks, that's good to know.
> 
> I still doubt that the problem is the race that Dai seems to think it
> is. The workqueue infrastructure has been fairly stable for years. If
> there were problems with concurrent tasks queueing the same work, the
> kernel would be blowing up all over the place.
> 
> > crash> delayed_work ffff8881601fab48
> > struct delayed_work {
> >   work = {
> >     data = {
> >       counter = 1
> >     },
> >     entry = {
> >       next = 0x0,
> >       prev = 0x0
> >     },
> >     func = 0x0
> >   },
> >   timer = {
> >     entry = {
> >       next = 0x0,
> >       pprev = 0x0
> >     },
> >     expires = 0,
> >     function = 0x0,
> >     flags = 0
> >   },
> >   wq = 0x0,
> >   cpu = 0
> > }
> 
> That looks more like a memory scribble or UAF. Merely having multiple
> tasks calling queue_work at the same time wouldn't be enough to trigger
> this, IMO. It's more likely that the extra locking is changing the
> timing of your reproducer somehow.
> 
> It might be interesting to turn up KASAN if you're able. 

If you still have this vmcore, it might be interesting to do the pointer
math and find the nfsd_net structure that contains the above
delayed_work. Does the rest of it also seem to be corrupt? My guess is
that the corrupted structure extends beyond just the delayed_work above.

Also, it might be helpful to do this:

     kmem -s ffff8881601fab48

...which should tell us whether and what part of the slab this object is
now a part of. That said, net-namespace object allocations are somewhat
weird, and I'm not 100% sure they come out of the slab.
Mike Galbraith Jan. 11, 2023, 11:19 a.m. UTC | #17
On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> >
> > > crash> delayed_work ffff8881601fab48
> > > struct delayed_work {
> > >   work = {
> > >     data = {
> > >       counter = 1
> > >     },
> > >     entry = {
> > >       next = 0x0,
> > >       prev = 0x0
> > >     },
> > >     func = 0x0
> > >   },
> > >   timer = {
> > >     entry = {
> > >       next = 0x0,
> > >       pprev = 0x0
> > >     },
> > >     expires = 0,
> > >     function = 0x0,
> > >     flags = 0
> > >   },
> > >   wq = 0x0,
> > >   cpu = 0
> > > }
> >
> > That looks more like a memory scribble or UAF. Merely having multiple
> > tasks calling queue_work at the same time wouldn't be enough to trigger
> > this, IMO. It's more likely that the extra locking is changing the
> > timing of your reproducer somehow.
> >
> > It might be interesting to turn up KASAN if you're able.

I can try that.

> If you still have this vmcore, it might be interesting to do the pointer
> math and find the nfsd_net structure that contains the above
> delayed_work. Does the rest of it also seem to be corrupt? My guess is
> that the corrupted structure extends beyond just the delayed_work above.
>
> Also, it might be helpful to do this:
>
>      kmem -s ffff8881601fab48
>
> ...which should tell us whether and what part of the slab this object is
> now a part of. That said, net-namespace object allocations are somewhat
> weird, and I'm not 100% sure they come out of the slab.

I tossed the vmcore, but can generate another.  I had done kmem sans -s
previously, still have that.

crash> kmem ffff8881601fab48
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
kmem: kmalloc-1k: partial list slab: ffffea0005b20c08 invalid page.inuse: -1
ffff888100041840     1024       2329      2432     76    32k  kmalloc-1k
  SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
  ffffea0005807e00  ffff8881601f8000     0     32         32     0
  FREE / [ALLOCATED]
  [ffff8881601fa800]

      PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffea0005807e80 1601fa000 dead000000000400        0  0 200000000000000
crash
Dai Ngo Jan. 11, 2023, 11:31 a.m. UTC | #18
On 1/11/23 3:19 AM, Mike Galbraith wrote:
> On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
>>>> crash> delayed_work ffff8881601fab48
>>>> struct delayed_work {
>>>>    work = {
>>>>      data = {
>>>>        counter = 1
>>>>      },
>>>>      entry = {
>>>>        next = 0x0,
>>>>        prev = 0x0
>>>>      },
>>>>      func = 0x0
>>>>    },
>>>>    timer = {
>>>>      entry = {
>>>>        next = 0x0,
>>>>        pprev = 0x0
>>>>      },
>>>>      expires = 0,
>>>>      function = 0x0,
>>>>      flags = 0
>>>>    },
>>>>    wq = 0x0,
>>>>    cpu = 0
>>>> }
>>> That looks more like a memory scribble or UAF. Merely having multiple
>>> tasks calling queue_work at the same time wouldn't be enough to trigger
>>> this, IMO. It's more likely that the extra locking is changing the
>>> timing of your reproducer somehow.
>>>
>>> It might be interesting to turn up KASAN if you're able.
> I can try that.
>
>> If you still have this vmcore, it might be interesting to do the pointer
>> math and find the nfsd_net structure that contains the above
>> delayed_work. Does the rest of it also seem to be corrupt? My guess is
>> that the corrupted structure extends beyond just the delayed_work above.
>>
>> Also, it might be helpful to do this:
>>
>>       kmem -s ffff8881601fab48
>>
>> ...which should tell us whether and what part of the slab this object is
>> now a part of. That said, net-namespace object allocations are somewhat
>> weird, and I'm not 100% sure they come out of the slab.
> I tossed the vmcore, but can generate another.  I had done kmem sans -s
> previously, still have that.
>
> crash> kmem ffff8881601fab48
> CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
> kmem: kmalloc-1k: partial list slab: ffffea0005b20c08 invalid page.inuse: -1
> ffff888100041840     1024       2329      2432     76    32k  kmalloc-1k
>    SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
>    ffffea0005807e00  ffff8881601f8000     0     32         32     0
>    FREE / [ALLOCATED]
>    [ffff8881601fa800]
>
>        PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
> ffffea0005807e80 1601fa000 dead000000000400        0  0 200000000000000
> crash

Can you try:

crash7latest> nfsd_net_id
nfsd_net_id = $2 = 9                <<===
crash7latest> struct net.gen  init_net
   gen = 0xffff97fc17d07d80
crash7latest> x /10g 0xffff97fc17d07d80
0xffff97fc17d07d80:	0x000000000000000d	0x0000000000000000
0xffff97fc17d07d90:	0x0000000000000000	0xffff97fc0ac40060
0xffff97fc17d07da0:	0xffff994e7bf87600	0xffff98f731172a20
0xffff97fc17d07db0:	0xffff9844b05d9c00	0xffff9832a6a0add0
0xffff97fc17d07dc0:	0xffff984a4470d740	0xffff984a93eb0600    <<=== entry for nfsd_net_id
crash7latest> nfsd_net 0xffff984a93eb0600

Thanks,
-Dai
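
For context, Dai's recipe is the crash-side equivalent of the kernel's
pernet accessor: struct net_generic overlays its header and the pointer
array in a union, so ptr[] indexes from the start of the block, and
word 9 of the x/10g dump is ptr[nfsd_net_id]. In C the whole walk is
just the usual lookup:

	/* pernet lookup, as done throughout fs/nfsd */
	struct nfsd_net *nn = net_generic(net, nfsd_net_id);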

>
Jeffrey Layton Jan. 11, 2023, noon UTC | #19
On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote:
> On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> > > 
> > > > crash> delayed_work ffff8881601fab48
> > > > struct delayed_work {
> > > >   work = {
> > > >     data = {
> > > >       counter = 1
> > > >     },
> > > >     entry = {
> > > >       next = 0x0,
> > > >       prev = 0x0
> > > >     },
> > > >     func = 0x0
> > > >   },
> > > >   timer = {
> > > >     entry = {
> > > >       next = 0x0,
> > > >       pprev = 0x0
> > > >     },
> > > >     expires = 0,
> > > >     function = 0x0,
> > > >     flags = 0
> > > >   },
> > > >   wq = 0x0,
> > > >   cpu = 0
> > > > }
> > > 
> > > That looks more like a memory scribble or UAF. Merely having multiple
> > > tasks calling queue_work at the same time wouldn't be enough to trigger
> > > this, IMO. It's more likely that the extra locking is changing the
> > > timing of your reproducer somehow.
> > > 
> > > It might be interesting to turn up KASAN if you're able.
> 
> I can try that.
> 
> > If you still have this vmcore, it might be interesting to do the pointer
> > math and find the nfsd_net structure that contains the above
> > delayed_work. Does the rest of it also seem to be corrupt? My guess is
> > that the corrupted structure extends beyond just the delayed_work above.
> > 
> > Also, it might be helpful to do this:
> > 
> >      kmem -s ffff8881601fab48
> > 
> > ...which should tell us whether and what part of the slab this object is
> > now a part of. That said, net-namespace object allocations are somewhat
> > weird, and I'm not 100% sure they come out of the slab.
> 
> I tossed the vmcore, but can generate another.  I had done kmem sans -s
> previously, still have that.
> 
> crash> kmem ffff8881601fab48
> CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
> kmem: kmalloc-1k: partial list slab: ffffea0005b20c08 invalid page.inuse: -1
> ffff888100041840     1024       2329      2432     76    32k  kmalloc-1k
>   SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
>   ffffea0005807e00  ffff8881601f8000     0     32         32     0
>   FREE / [ALLOCATED]
>   [ffff8881601fa800]
> 
>       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
> ffffea0005807e80 1601fa000 dead000000000400        0  0 200000000000000
> crash
> 

Thanks. The pernet allocations do come out of the slab. The allocation
is done in ops_init() in net/core/net_namespace.c. This one is a 1k
allocation, which jibes with the size of nfsd_net (which is 976 bytes).

So, this seems to be consistent with where an nfsd_net would have come
from. Maybe not a UAF, but I do think we have some sort of memory
corruption going on.
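
Doing the pointer math with the addresses from Mike's earlier kmem
output (assumed still valid): the delayed_work at 0xffff8881601fab48
sits 0x348 bytes into the 1k slab object that starts at
0xffff8881601fa800 — near the tail of the object, consistent with
nfsd_shrinker_work being the last member of struct nfsd_net, though the
exact offset depends on kernel config. In crash that is:

	crash> nfsd_net 0xffff8881601fa800

which is the same arithmetic container_of() does in kernel C:

	/* dwork stands in for the delayed_work pointer above */
	struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
					   nfsd_shrinker_work);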
Mike Galbraith Jan. 11, 2023, 12:15 p.m. UTC | #20
On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote:
> On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> > >
> > >
> > >
> > > It might be interesting to turn up KASAN if you're able.
>
> I can try that.

KASAN did not squeak.

> > If you still have this vmcore, it might be interesting to do the pointer
> > math and find the nfsd_net structure that contains the above
> > delayed_work. Does the rest of it also seem to be corrupt?

Virgin source with workqueue.c WARN_ON_ONCE() landmine.

crash> nfsd_net -x 0xFFFF8881114E9800
struct nfsd_net {
  cld_net = 0x0,
  svc_expkey_cache = 0xffff8881420f8a00,
  svc_export_cache = 0xffff8881420f8800,
  idtoname_cache = 0xffff8881420f9a00,
  nametoid_cache = 0xffff8881420f9c00,
  nfsd4_manager = {
    list = {
      next = 0x0,
      prev = 0x0
    },
    block_opens = 0x0
  },
  grace_ended = 0x0,
  boot_time = 0x0,
  nfsd_client_dir = 0x0,
  reclaim_str_hashtbl = 0x0,
  reclaim_str_hashtbl_size = 0x0,
  conf_id_hashtbl = 0x0,
  conf_name_tree = {
    rb_node = 0x0
  },
  unconf_id_hashtbl = 0x0,
  unconf_name_tree = {
    rb_node = 0x0
  },
  sessionid_hashtbl = 0x0,
  client_lru = {
    next = 0x0,
    prev = 0x0
  },
  close_lru = {
    next = 0x0,
    prev = 0x0
  },
  del_recall_lru = {
    next = 0x0,
    prev = 0x0
  },
  blocked_locks_lru = {
    next = 0x0,
    prev = 0x0
  },
  laundromat_work = {
    work = {
      data = {
        counter = 0x0
      },
      entry = {
        next = 0x0,
        prev = 0x0
      },
      func = 0x0
    },
    timer = {
      entry = {
        next = 0x0,
        pprev = 0x0
      },
      expires = 0x0,
      function = 0x0,
      flags = 0x0
    },
    wq = 0x0,
    cpu = 0x0
  },
  client_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 0x0
            },
            {
              locked = 0x0,
              pending = 0x0
            },
            {
              locked_pending = 0x0,
              tail = 0x0
            }
          }
        }
      }
    }
  },
  blocked_locks_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 0x0
            },
            {
              locked = 0x0,
              pending = 0x0
            },
            {
              locked_pending = 0x0,
              tail = 0x0
            }
          }
        }
      }
    }
  },
  rec_file = 0x0,
  in_grace = 0x0,
  client_tracking_ops = 0x0,
  nfsd4_lease = 0x5a,
  nfsd4_grace = 0x5a,
  somebody_reclaimed = 0x0,
  track_reclaim_completes = 0x0,
  nr_reclaim_complete = {
    counter = 0x0
  },
  nfsd_net_up = 0x0,
  lockd_up = 0x0,
  writeverf_lock = {
    seqcount = {
      seqcount = {
        sequence = 0x0
      }
    },
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              val = {
                counter = 0x0
              },
              {
                locked = 0x0,
                pending = 0x0
              },
              {
                locked_pending = 0x0,
                tail = 0x0
              }
            }
          }
        }
      }
    }
  },
  writeverf = "\000\000\000\000\000\000\000",
  max_connections = 0x0,
  clientid_base = 0x37b4ca7b,
  clientid_counter = 0x37b4ca7d,
  clverifier_counter = 0xa8ee910d,
  nfsd_serv = 0x0,
  keep_active = 0x0,
  s2s_cp_cl_id = 0x37b4ca7c,
  s2s_cp_stateids = {
    idr_rt = {
      xa_lock = {
        {
          rlock = {
            raw_lock = {
              {
                val = {
                  counter = 0x0
                },
                {
                  locked = 0x0,
                  pending = 0x0
                },
                {
                  locked_pending = 0x0,
                  tail = 0x0
                }
              }
            }
          }
        }
      },
      xa_flags = 0x0,
      xa_head = 0x0
    },
    idr_base = 0x0,
    idr_next = 0x0
  },
  s2s_cp_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 0x0
            },
            {
              locked = 0x0,
              pending = 0x0
            },
            {
              locked_pending = 0x0,
              tail = 0x0
            }
          }
        }
      }
    }
  },
  nfsd_versions = 0x0,
  nfsd4_minorversions = 0x0,
  drc_hashtbl = 0xffff88810a2f0000,
  max_drc_entries = 0x14740,
  maskbits = 0xb,
  drc_hashsize = 0x800,
  num_drc_entries = {
    counter = 0x0
  },
  counter = {{
      lock = {
        raw_lock = {
          {
            val = {
              counter = 0x0
            },
            {
              locked = 0x0,
              pending = 0x0
            },
            {
              locked_pending = 0x0,
              tail = 0x0
            }
          }
        }
      },
      count = 0x0,
      list = {
        next = 0xffff888103f98dd0,
        prev = 0xffff8881114e9a18
      },
      counters = 0x607dc8402e10
    }, {
      lock = {
        raw_lock = {
          {
            val = {
              counter = 0x0
            },
            {
              locked = 0x0,
              pending = 0x0
            },
            {
              locked_pending = 0x0,
              tail = 0x0
            }
          }
        }
      },
      count = 0x0,
      list = {
        next = 0xffff8881114e99f0,
        prev = 0xffff88810b5743e0
      },
      counters = 0x607dc8402e14
    }},
  longest_chain = 0x0,
  longest_chain_cachesize = 0x0,
  nfsd_reply_cache_shrinker = {
    count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>,
    scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>,
    batch = 0x0,
    seeks = 0x1,
    flags = 0x1,
    list = {
      next = 0xffff888111daf420,
      prev = 0xffff8881114e9b30
    },
    nr_deferred = 0xffff88813a544a00
  },
  nfsd_ssc_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 0x0
            },
            {
              locked = 0x0,
              pending = 0x0
            },
            {
              locked_pending = 0x0,
              tail = 0x0
            }
          }
        }
      }
    }
  },
  nfsd_ssc_mount_list = {
    next = 0x0,
    prev = 0x0
  },
  nfsd_ssc_waitq = {
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              val = {
                counter = 0x0
              },
              {
                locked = 0x0,
                pending = 0x0
              },
              {
                locked_pending = 0x0,
                tail = 0x0
              }
            }
          }
        }
      }
    },
    head = {
      next = 0x0,
      prev = 0x0
    }
  },
  nfsd_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
  fcache_disposal = 0x0,
  siphash_key = {
    key = {0x2a5ba10a35b36754, 0xd6b3a5a0e7696876}
  },
  nfs4_client_count = {
    counter = 0x0
  },
  nfs4_max_clients = 0x1800,
  nfsd_courtesy_clients = {
    counter = 0x0
  },
  nfsd_client_shrinker = {
    count_objects = 0xffffffffa0e742c0 <nfsd4_state_shrinker_count>,
    scan_objects = 0xffffffffa0e73a90 <nfsd4_state_shrinker_scan>,
    batch = 0x0,
    seeks = 0x2,
    flags = 0x1,
    list = {
      next = 0xffff8881114e9a58,
      prev = 0xffffffffa131b280 <mmu_shrinker+32>
    },
    nr_deferred = 0xffff88813a5449d8
  },
  nfsd_shrinker_work = {
    work = {
      data = {
        counter = 0x1
      },
      entry = {
        next = 0x0,
        prev = 0x0
      },
      func = 0x0
    },
    timer = {
      entry = {
        next = 0x0,
        pprev = 0x0
      },
      expires = 0x0,
      function = 0x0,
      flags = 0x0
    },
    wq = 0x0,
    cpu = 0x0
  }
}
crash> kmem -s 0xFFFF8881114E9800
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff888100042dc0     1024      18325     18352   1147    32k  kmalloc-1k
  SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
  ffffea0004453a00  ffff8881114e8000     0     16         16     0
  FREE / [ALLOCATED]
  [ffff8881114e9800]
crash>
Mike Galbraith Jan. 11, 2023, 12:26 p.m. UTC | #21
On Wed, 2023-01-11 at 03:31 -0800, dai.ngo@oracle.com wrote:
>
> Can you try:
>
> crash7latest> nfsd_net_id
> nfsd_net_id = $2 = 9                <<===
> crash7latest> struct net.gen  init_net
>    gen = 0xffff97fc17d07d80
> crash7latest> x /10g 0xffff97fc17d07d80
> 0xffff97fc17d07d80:     0x000000000000000d      0x0000000000000000
> 0xffff97fc17d07d90:     0x0000000000000000      0xffff97fc0ac40060
> 0xffff97fc17d07da0:     0xffff994e7bf87600      0xffff98f731172a20
> 0xffff97fc17d07db0:     0xffff9844b05d9c00      0xffff9832a6a0add0
> 0xffff97fc17d07dc0:     0xffff984a4470d740      0xffff984a93eb0600   
> <<=== entry for nfsd_net_id
> crash7latest> nfsd_net 0xffff984a93eb0600

(monkey see monkey do.. eep eep)

crash> nfsd_net_id
p: gdb request failed: p nfsd_net_id
crash> struct net.gen  init_net
  gen = 0xffff88810b7b8a00,
crash> x /10g 0xffff88810b7b8a00
0xffff88810b7b8a00:	0x0000000000000010	0x0000000000000000
0xffff88810b7b8a10:	0x0000000000000000	0xffff888101563380
0xffff88810b7b8a20:	0xffff888101ebd900	0xffff888101ebda00
0xffff88810b7b8a30:	0xffff888101f88b80	0xffff8881022056c0
0xffff88810b7b8a40:	0xffff888133b79e00	0xffff888110a2ca00
crash> nfsd_net 0xffff888110a2ca00
struct nfsd_net {
  cld_net = 0xffff888131c3c000,
  svc_expkey_cache = 0xffff888110a2cc00,
  svc_export_cache = 0xffff888110a2ce00,
  idtoname_cache = 0xffff8881061a8a00,
  nametoid_cache = 0xffff8881061a8c00,
  nfsd4_manager = {
    list = {
      next = 0xffff888141efa000,
      prev = 0xffff888133e6ea00
    },
    block_opens = false
  },
  grace_ended = false,
  boot_time = -131387065447864,
  nfsd_client_dir = 0xffff888110a2ca48,
  reclaim_str_hashtbl = 0xffff88810bed7408,
  reclaim_str_hashtbl_size = 1083333640,
  conf_id_hashtbl = 0x0,
  conf_name_tree = {
    rb_node = 0xffff888140925c00
  },
  unconf_id_hashtbl = 0xffff88810181c800,
  unconf_name_tree = {
    rb_node = 0x200000000
  },
  sessionid_hashtbl = 0x1,
  client_lru = {
    next = 0x0,
    prev = 0x0
  },
  close_lru = {
    next = 0xffff888110a2caa0,
    prev = 0xffff888110a2caa0
  },
  del_recall_lru = {
    next = 0x0,
    prev = 0xffffffffffffffff
  },
  blocked_locks_lru = {
    next = 0x0,
    prev = 0xffff88810a0e0f00
  },
  laundromat_work = {
    work = {
      data = {
        counter = 0
      },
      entry = {
        next = 0x0,
        prev = 0x0
      },
      func = 0x0
    },
    timer = {
      entry = {
        next = 0x0,
        pprev = 0x0
      },
      expires = 520729437059154371,
      function = 0x0,
      flags = 3526430787
    },
    wq = 0x24448948f6314540,
    cpu = 1133332496
  },
  client_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 344528932
            },
            {
              locked = 36 '$',
              pending = 24 '\030'
            },
            {
              locked_pending = 6180,
              tail = 5257
            }
          }
        }
      }
    }
  },
  blocked_locks_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 1820937252
            },
            {
              locked = 36 '$',
              pending = 76 'L'
            },
            {
              locked_pending = 19492,
              tail = 27785
            }
          }
        }
      }
    }
  },
  rec_file = 0x4808245c89483824,
  in_grace = 137,
  client_tracking_ops = 0xe8df8948005d8b,
  nfsd4_lease = -8266309238763028480,
  nfsd4_grace = 5476377146897729659,
  somebody_reclaimed = 139,
  track_reclaim_completes = 99,
  nr_reclaim_complete = {
    counter = -402096755
  },
  nfsd_net_up = false,
  lockd_up = false,
  writeverf_lock = {
    seqcount = {
      seqcount = {
        sequence = 140872013
      }
    },
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              val = {
                counter = -387479220
              },
              {
                locked = 76 'L',
                pending = 137 '\211'
              },
              {
                locked_pending = 35148,
                tail = 59623
              }
            }
          }
        }
      }
    }
  },
  writeverf = "\000\000\000\000M\211,$",
  max_connections = 612141896,
  clientid_base = 59416,
  clientid_counter = 2336751616,
  clverifier_counter = 1275601988,
  nfsd_serv = 0x1024448b48185889,
  keep_active = 140740940,
  s2s_cp_cl_id = 1223133516,
  s2s_cp_stateids = {
    idr_rt = {
      xa_lock = {
        {
          rlock = {
            raw_lock = {
              {
                val = {
                  counter = 15205257
                },
                {
                  locked = 137 '\211',
                  pending = 3 '\003'
                },
                {
                  locked_pending = 905,
                  tail = 232
                }
              }
            }
          }
        }
      },
      xa_flags = 1224736768,
      xa_head = 0xf74f6854d241c89
    },
    idr_base = 276532552,
    idr_next = 232
  },
  s2s_cp_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 1933134848
            },
            {
              locked = 0 '\000',
              pending = 76 'L'
            },
            {
              locked_pending = 19456,
              tail = 29497
            }
          }
        }
      }
    }
  },
  nfsd_versions = 0x443924048b012404,
  nfsd4_minorversions = 0x2b4820f2424,
  drc_hashtbl = 0x8678d4d107b8d48,
  max_drc_entries = 232,
  maskbits = 1938508800,
  drc_hashsize = 4287187984,
  num_drc_entries = {
    counter = 232
  },
  counter = {{
      lock = {
        raw_lock = {
          {
            val = {
              counter = 931745024
            },
            {
              locked = 0 '\000',
              pending = 77 'M'
            },
            {
              locked_pending = 19712,
              tail = 14217
            }
          }
        }
      },
      count = -8858645092202691189,
      list = {
        next = 0x24648b4cffffff43,
        prev = 0x246c8b4c24148b40
      },
      counters = 0xffffffffa0d0b540 <__this_module>
    }, {
      lock = {
        raw_lock = {
          {
            val = {
              counter = 256
            },
            {
              locked = 0 '\000',
              pending = 1 '\001'
            },
            {
              locked_pending = 256,
              tail = 0
            }
          }
        }
      },
      count = -131387314532352,
      list = {
        next = 0x0,
        prev = 0xffffffffa0c949c0 <svc_udp_ops+1248>
      },
      counters = 0xffffffffa0c67f00 <ip_map_put>
    }},
  longest_chain = 2697366144,
  longest_chain_cachesize = 4294967295,
  nfsd_reply_cache_shrinker = {
    count_objects = 0xffffffffa0c67cd0 <ip_map_request>,
    scan_objects = 0xffffffffa0c68e40 <ip_map_parse>,
    batch = -1597606560,
    seeks = 0,
    flags = 0,
    list = {
      next = 0xffffffffa0c67350 <ip_map_alloc>,
      prev = 0x0
    },
    nr_deferred = 0xffffffffa0c68a00 <ip_map_match>
  },
  nfsd_ssc_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = -1597603936
            },
            {
              locked = 160 '\240',
              pending = 127 '\177'
            },
            {
              locked_pending = 32672,
              tail = 41158
            }
          }
        }
      }
    }
  },
  nfsd_ssc_mount_list = {
    next = 0xffffffffa0c68b10 <update>,
    prev = 0x49
  },
  nfsd_ssc_waitq = {
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              val = {
                counter = -1596979232
              },
              {
                locked = 224 '\340',
                pending = 7 '\a'
              },
              {
                locked_pending = 2016,
                tail = 41168
              }
            }
          }
        }
      }
    },
    head = {
      next = 0xffff888110a2ce88,
      prev = 0xc2
    }
  },
  nfsd_name = "\001\000\000\000\000\000\000\000\200\t\021D\201\210\377\377\200\t\021D\201\210\377\377\001\000\000\000\000\000\000\000\032\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\301\303\061\201\210\377\377@$\234\203\377\377\377\377",
  fcache_disposal = 0x0,
  siphash_key = {
    key = {0, 0}
  },
  nfs4_client_count = {
    counter = 451
  },
  nfs4_max_clients = 122552490,
  nfsd_courtesy_clients = {
    counter = 0
  },
  nfsd_client_shrinker = {
    count_objects = 0xe8000002a0a3894c,
    scan_objects = 0x98b3894400000000,
    batch = 5483261796049485826,
    seeks = 15267721,
    flags = 1275068416,
    list = {
      next = 0x18247c8d4918658b,
      prev = 0x7c8b4900000000e8
    },
    nr_deferred = 0x4800000000e81824
  },
  nfsd_shrinker_work = {
    work = {
      data = {
        counter = -8554306017173128307
      },
      entry = {
        next = 0x894c00000000e8c4,
        prev = 0xf7894c00000268a3
      },
      func = 0x6d8b4800000000e8
    },
    timer = {
      entry = {
        next = 0x270bb8d4818,
        pprev = 0xbb8d4800000000e8
      },
      expires = 8118733695596102332,
      function = 0xe8000002,
      flags = 45908935
    },
    wq = 0x147424e783410000,
    cpu = 553616193
  }
}
crash>
Jeffrey Layton Jan. 11, 2023, 12:33 p.m. UTC | #22
On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote:
> On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote:
> > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> > > > 
> > > > 
> > > > 
> > > > It might be interesting to turn up KASAN if you're able.
> > 
> > I can try that.
> 
> KASAN did not squeak.
> 
> > > If you still have this vmcore, it might be interesting to do the pointer
> > > math and find the nfsd_net structure that contains the above
> > > delayed_work. Does the rest of it also seem to be corrupt?
> 
> Virgin source with workqueue.c WARN_ON_ONCE() landmine.
> 

Thanks. Mixed bag here...


> crash> nfsd_net -x 0xFFFF8881114E9800
> struct nfsd_net {
>   cld_net = 0x0,
>   svc_expkey_cache = 0xffff8881420f8a00,
>   svc_export_cache = 0xffff8881420f8800,
>   idtoname_cache = 0xffff8881420f9a00,
>   nametoid_cache = 0xffff8881420f9c00,
>   nfsd4_manager = {
>     list = {
>       next = 0x0,
>       prev = 0x0
>     },
>     block_opens = 0x0
>   },
>   grace_ended = 0x0,


>   boot_time = 0x0,
>   nfsd_client_dir = 0x0,
>   reclaim_str_hashtbl = 0x0,
>   reclaim_str_hashtbl_size = 0x0,
>   conf_id_hashtbl = 0x0,
>   conf_name_tree = {
>     rb_node = 0x0
>   },
>   unconf_id_hashtbl = 0x0,
>   unconf_name_tree = {
>     rb_node = 0x0
>   },
>   sessionid_hashtbl = 0x0,
>   client_lru = {
>     next = 0x0,
>     prev = 0x0
>   },
>   close_lru = {
>     next = 0x0,
>     prev = 0x0
>   },
>   del_recall_lru = {
>     next = 0x0,
>     prev = 0x0
>   },
>   blocked_locks_lru = {
>     next = 0x0,
>     prev = 0x0
>   },

All of the above list_heads are zeroed out and they shouldn't be.

>   laundromat_work = {
>     work = {
>       data = {
>         counter = 0x0
>       },
>       entry = {
>         next = 0x0,
>         prev = 0x0
>       },
>       func = 0x0
>     },
>     timer = {
>       entry = {
>         next = 0x0,
>         pprev = 0x0
>       },
>       expires = 0x0,
>       function = 0x0,
>       flags = 0x0
>     },
>     wq = 0x0,
>     cpu = 0x0
>   },
>   client_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 0x0
>             },
>             {
>               locked = 0x0,
>               pending = 0x0
>             },
>             {
>               locked_pending = 0x0,
>               tail = 0x0
>             }
>           }
>         }
>       }
>     }
>   },
>   blocked_locks_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 0x0
>             },
>             {
>               locked = 0x0,
>               pending = 0x0
>             },
>             {
>               locked_pending = 0x0,
>               tail = 0x0
>             }
>           }
>         }
>       }
>     }
>   },
>   rec_file = 0x0,
>   in_grace = 0x0,
>   client_tracking_ops = 0x0,
>   nfsd4_lease = 0x5a,
>   nfsd4_grace = 0x5a,

The grace and lease times look ok, oddly enough.

>   somebody_reclaimed = 0x0,
>   track_reclaim_completes = 0x0,
>   nr_reclaim_complete = {
>     counter = 0x0
>   },
>   nfsd_net_up = 0x0,

nfsd_net_up is false, which means that this server isn't running (or
that the memory here was scribbled over).

>   lockd_up = 0x0,
>   writeverf_lock = {
>     seqcount = {
>       seqcount = {
>         sequence = 0x0
>       }
>     },
>     lock = {
>       {
>         rlock = {
>           raw_lock = {
>             {
>               val = {
>                 counter = 0x0
>               },
>               {
>                 locked = 0x0,
>                 pending = 0x0
>               },
>               {
>                 locked_pending = 0x0,
>                 tail = 0x0
>               }
>             }
>           }
>         }
>       }
>     }
>   },
>   writeverf = "\000\000\000\000\000\000\000",
>   max_connections = 0x0,
>   clientid_base = 0x37b4ca7b,
>   clientid_counter = 0x37b4ca7d,
>   clverifier_counter = 0xa8ee910d,
>   nfsd_serv = 0x0,
>   keep_active = 0x0,
>   s2s_cp_cl_id = 0x37b4ca7c,
>   s2s_cp_stateids = {
>     idr_rt = {
>       xa_lock = {
>         {
>           rlock = {
>             raw_lock = {
>               {
>                 val = {
>                   counter = 0x0
>                 },
>                 {
>                   locked = 0x0,
>                   pending = 0x0
>                 },
>                 {
>                   locked_pending = 0x0,
>                   tail = 0x0
>                 }
>               }
>             }
>           }
>         }
>       },
>       xa_flags = 0x0,
>       xa_head = 0x0
>     },
>     idr_base = 0x0,
>     idr_next = 0x0
>   },
>   s2s_cp_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 0x0
>             },
>             {
>               locked = 0x0,
>               pending = 0x0
>             },
>             {
>               locked_pending = 0x0,
>               tail = 0x0
>             }
>           }
>         }
>       }
>     }
>   },
>   nfsd_versions = 0x0,
>   nfsd4_minorversions = 0x0,
>   drc_hashtbl = 0xffff88810a2f0000,
>   max_drc_entries = 0x14740,
>   maskbits = 0xb,
>   drc_hashsize = 0x800,
>   num_drc_entries = {
>     counter = 0x0
>   },
>   counter = {{
>       lock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 0x0
>             },
>             {
>               locked = 0x0,
>               pending = 0x0
>             },
>             {
>               locked_pending = 0x0,
>               tail = 0x0
>             }
>           }
>         }
>       },
>       count = 0x0,
>       list = {
>         next = 0xffff888103f98dd0,
>         prev = 0xffff8881114e9a18
>       },
>       counters = 0x607dc8402e10
>     }, {
>       lock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 0x0
>             },
>             {
>               locked = 0x0,
>               pending = 0x0
>             },
>             {
>               locked_pending = 0x0,
>               tail = 0x0
>             }
>           }
>         }
>       },
>       count = 0x0,
>       list = {
>         next = 0xffff8881114e99f0,
>         prev = 0xffff88810b5743e0
>       },
>       counters = 0x607dc8402e14
>     }},
>   longest_chain = 0x0,
>   longest_chain_cachesize = 0x0,
>   nfsd_reply_cache_shrinker = {
>     count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>,
>     scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>,

Shrinker pointers look ok, as does the shrinker's list_head.

>     batch = 0x0,
>     seeks = 0x1,
>     flags = 0x1,
>     list = {
>       next = 0xffff888111daf420,
>       prev = 0xffff8881114e9b30
>     },
>     nr_deferred = 0xffff88813a544a00
>   },
>   nfsd_ssc_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 0x0
>             },
>             {
>               locked = 0x0,
>               pending = 0x0
>             },
>             {
>               locked_pending = 0x0,
>               tail = 0x0
>             }
>           }
>         }
>       }
>     }
>   },
>   nfsd_ssc_mount_list = {
>     next = 0x0,
>     prev = 0x0
>   },
>   nfsd_ssc_waitq = {
>     lock = {
>       {
>         rlock = {
>           raw_lock = {
>             {
>               val = {
>                 counter = 0x0
>               },
>               {
>                 locked = 0x0,
>                 pending = 0x0
>               },
>               {
>                 locked_pending = 0x0,
>                 tail = 0x0
>               }
>             }
>           }
>         }
>       }
>     },
>     head = {
>       next = 0x0,
>       prev = 0x0
>     }
>   },
>   nfsd_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
> 
> 

nfsd_name is usually set to utsname, so that looks bogus.

>   fcache_disposal = 0x0,
>   siphash_key = {
>     key = {0x2a5ba10a35b36754, 0xd6b3a5a0e7696876}
>   },
>   nfs4_client_count = {
>     counter = 0x0
>   },
>   nfs4_max_clients = 0x1800,
>   nfsd_courtesy_clients = {
>     counter = 0x0
>   },
>   nfsd_client_shrinker = {
>     count_objects = 0xffffffffa0e742c0 <nfsd4_state_shrinker_count>,
>     scan_objects = 0xffffffffa0e73a90 <nfsd4_state_shrinker_scan>,
>     batch = 0x0,
>     seeks = 0x2,
>     flags = 0x1,
>     list = {
>       next = 0xffff8881114e9a58,
>       prev = 0xffffffffa131b280 <mmu_shrinker+32>
>     },
>     nr_deferred = 0xffff88813a5449d8
>   },
>   nfsd_shrinker_work = {
>     work = {
>       data = {
>         counter = 0x1
>       },
>       entry = {
>         next = 0x0,
>         prev = 0x0
>       },
>       func = 0x0
>     },
>     timer = {
>       entry = {
>         next = 0x0,
>         pprev = 0x0
>       },
>       expires = 0x0,
>       function = 0x0,
>       flags = 0x0
>     },
>     wq = 0x0,
>     cpu = 0x0
>   }
> }
> crash> kmem -s 0xFFFF8881114E9800
> CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
> ffff888100042dc0     1024      18325     18352   1147    32k  kmalloc-1k
>   SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
>   ffffea0004453a00  ffff8881114e8000     0     16         16     0
>   FREE / [ALLOCATED]
>   [ffff8881114e9800]
> crash>
> 

Bit of a mixed bag here. A lot of these fields are corrupt, but not all
of them.

One thing that might be interesting to rule out a UAF would be to
explicitly poison this struct in nfsd_exit_net. Basically do something
like this at the end of exit_net:

	memset(net, 0x7c, sizeof(*net));

That might help trigger an oops sooner after the problem occurs.
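
One nit on the sketch above: nfsd_exit_net() takes a struct net *, so a
literal memset(net, ...) poisons the whole struct net — which matches
Mike's report below that blasting "net" rendered the VM unbootable. The
pernet nfsd state is presumably what's meant; a hedged sketch, with the
placement assumed:

	static __net_exit void nfsd_exit_net(struct net *net)
	{
		struct nfsd_net *nn = net_generic(net, nfsd_net_id);

		/* ... existing teardown ... */
		memset(nn, 0x7c, sizeof(*nn));	/* poison; catch a UAF sooner */
	}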

If you're feeling ambitious, another thing you could do is track down
some of the running nfsd's in the vmcore, find their rqstp values and
see whether the sockets are pointed at the same nfsd_net as the one you
found above (see the nfsd() function for how to get from one to the
other).

If they're pointed at a different nfsd_net, that would suggest that
we are looking at a UAF. If it's the same nfsd_net, then I'd lean more
toward some sort of memory scribble.
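
A sketch of that walk in crash, using the standard sunrpc field names
(svc_rqst.rq_xprt and svc_xprt.xpt_net — hedged accordingly):

	crash> ps | grep nfsd                    # pick a running nfsd thread
	crash> bt <pid>                          # locate its rqstp
	crash> struct svc_rqst.rq_xprt <rqstp>
	crash> struct svc_xprt.xpt_net <xprt>    # compare against the net above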
Jeffrey Layton Jan. 11, 2023, 12:44 p.m. UTC | #23
On Wed, 2023-01-11 at 13:26 +0100, Mike Galbraith wrote:
> On Wed, 2023-01-11 at 03:31 -0800, dai.ngo@oracle.com wrote:
> > 
> > Can you try:
> > 
> > crash7latest> nfsd_net_id
> > nfsd_net_id = $2 = 9                <<===
> > crash7latest> struct net.gen  init_net
> >    gen = 0xffff97fc17d07d80
> > crash7latest> x /10g 0xffff97fc17d07d80
> > 0xffff97fc17d07d80:     0x000000000000000d      0x0000000000000000
> > 0xffff97fc17d07d90:     0x0000000000000000      0xffff97fc0ac40060
> > 0xffff97fc17d07da0:     0xffff994e7bf87600      0xffff98f731172a20
> > 0xffff97fc17d07db0:     0xffff9844b05d9c00      0xffff9832a6a0add0
> > 0xffff97fc17d07dc0:     0xffff984a4470d740      0xffff984a93eb0600   
> > <<=== entry for nfsd_net_id
> > crash7latest> nfsd_net 0xffff984a93eb0600
> 
> (monkey see monkey do.. eep eep)
> 
> crash> nfsd_net_id
> p: gdb request failed: p nfsd_net_id
> crash> struct net.gen  init_net
>   gen = 0xffff88810b7b8a00,
> crash> x /10g 0xffff88810b7b8a00
> 0xffff88810b7b8a00:	0x0000000000000010	0x0000000000000000
> 0xffff88810b7b8a10:	0x0000000000000000	0xffff888101563380
> 0xffff88810b7b8a20:	0xffff888101ebd900	0xffff888101ebda00
> 0xffff88810b7b8a30:	0xffff888101f88b80	0xffff8881022056c0
> 0xffff88810b7b8a40:	0xffff888133b79e00	0xffff888110a2ca00
> crash> nfsd_net 0xffff888110a2ca00
> struct nfsd_net {
>   cld_net = 0xffff888131c3c000,
>   svc_expkey_cache = 0xffff888110a2cc00,
>   svc_export_cache = 0xffff888110a2ce00,
>   idtoname_cache = 0xffff8881061a8a00,
>   nametoid_cache = 0xffff8881061a8c00,
>   nfsd4_manager = {
>     list = {
>       next = 0xffff888141efa000,
>       prev = 0xffff888133e6ea00
>     },
>     block_opens = false
>   },
>   grace_ended = false,
>   boot_time = -131387065447864,
>   nfsd_client_dir = 0xffff888110a2ca48,
>   reclaim_str_hashtbl = 0xffff88810bed7408,
>   reclaim_str_hashtbl_size = 1083333640,
>   conf_id_hashtbl = 0x0,
>   conf_name_tree = {
>     rb_node = 0xffff888140925c00
>   },
>   unconf_id_hashtbl = 0xffff88810181c800,
>   unconf_name_tree = {
>     rb_node = 0x200000000
>   },
>   sessionid_hashtbl = 0x1,
>   client_lru = {
>     next = 0x0,
>     prev = 0x0
>   },
>   close_lru = {
>     next = 0xffff888110a2caa0,
>     prev = 0xffff888110a2caa0
>   },
>   del_recall_lru = {
>     next = 0x0,
>     prev = 0xffffffffffffffff
>   },
>   blocked_locks_lru = {
>     next = 0x0,
>     prev = 0xffff88810a0e0f00
>   },
>   laundromat_work = {
>     work = {
>       data = {
>         counter = 0
>       },
>       entry = {
>         next = 0x0,
>         prev = 0x0
>       },
>       func = 0x0
>     },
>     timer = {
>       entry = {
>         next = 0x0,
>         pprev = 0x0
>       },
>       expires = 520729437059154371,
>       function = 0x0,
>       flags = 3526430787
>     },
>     wq = 0x24448948f6314540,
>     cpu = 1133332496
>   },
>   client_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 344528932
>             },
>             {
>               locked = 36 '$',
>               pending = 24 '\030'
>             },
>             {
>               locked_pending = 6180,
>               tail = 5257
>             }
>           }
>         }
>       }
>     }
>   },
>   blocked_locks_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 1820937252
>             },
>             {
>               locked = 36 '$',
>               pending = 76 'L'
>             },
>             {
>               locked_pending = 19492,
>               tail = 27785
>             }
>           }
>         }
>       }
>     }
>   },
>   rec_file = 0x4808245c89483824,
>   in_grace = 137,
>   client_tracking_ops = 0xe8df8948005d8b,
>   nfsd4_lease = -8266309238763028480,
>   nfsd4_grace = 5476377146897729659,
>   somebody_reclaimed = 139,
>   track_reclaim_completes = 99,
>   nr_reclaim_complete = {
>     counter = -402096755
>   },
>   nfsd_net_up = false,
>   lockd_up = false,
>   writeverf_lock = {
>     seqcount = {
>       seqcount = {
>         sequence = 140872013
>       }
>     },
>     lock = {
>       {
>         rlock = {
>           raw_lock = {
>             {
>               val = {
>                 counter = -387479220
>               },
>               {
>                 locked = 76 'L',
>                 pending = 137 '\211'
>               },
>               {
>                 locked_pending = 35148,
>                 tail = 59623
>               }
>             }
>           }
>         }
>       }
>     }
>   },
>   writeverf = "\000\000\000\000M\211,$",
>   max_connections = 612141896,
>   clientid_base = 59416,
>   clientid_counter = 2336751616,
>   clverifier_counter = 1275601988,
>   nfsd_serv = 0x1024448b48185889,
>   keep_active = 140740940,
>   s2s_cp_cl_id = 1223133516,
>   s2s_cp_stateids = {
>     idr_rt = {
>       xa_lock = {
>         {
>           rlock = {
>             raw_lock = {
>               {
>                 val = {
>                   counter = 15205257
>                 },
>                 {
>                   locked = 137 '\211',
>                   pending = 3 '\003'
>                 },
>                 {
>                   locked_pending = 905,
>                   tail = 232
>                 }
>               }
>             }
>           }
>         }
>       },
>       xa_flags = 1224736768,
>       xa_head = 0xf74f6854d241c89
>     },
>     idr_base = 276532552,
>     idr_next = 232
>   },
>   s2s_cp_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 1933134848
>             },
>             {
>               locked = 0 '\000',
>               pending = 76 'L'
>             },
>             {
>               locked_pending = 19456,
>               tail = 29497
>             }
>           }
>         }
>       }
>     }
>   },
>   nfsd_versions = 0x443924048b012404,
>   nfsd4_minorversions = 0x2b4820f2424,
>   drc_hashtbl = 0x8678d4d107b8d48,
>   max_drc_entries = 232,
>   maskbits = 1938508800,
>   drc_hashsize = 4287187984,
>   num_drc_entries = {
>     counter = 232
>   },
>   counter = {{
>       lock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 931745024
>             },
>             {
>               locked = 0 '\000',
>               pending = 77 'M'
>             },
>             {
>               locked_pending = 19712,
>               tail = 14217
>             }
>           }
>         }
>       },
>       count = -8858645092202691189,
>       list = {
>         next = 0x24648b4cffffff43,
>         prev = 0x246c8b4c24148b40
>       },
>       counters = 0xffffffffa0d0b540 <__this_module>
>     }, {
>       lock = {
>         raw_lock = {
>           {
>             val = {
>               counter = 256
>             },
>             {
>               locked = 0 '\000',
>               pending = 1 '\001'
>             },
>             {
>               locked_pending = 256,
>               tail = 0
>             }
>           }
>         }
>       },
>       count = -131387314532352,
>       list = {
>         next = 0x0,
>         prev = 0xffffffffa0c949c0 <svc_udp_ops+1248>
>       },
>       counters = 0xffffffffa0c67f00 <ip_map_put>
>     }},
>   longest_chain = 2697366144,
>   longest_chain_cachesize = 4294967295,
>   nfsd_reply_cache_shrinker = {
>     count_objects = 0xffffffffa0c67cd0 <ip_map_request>,
>     scan_objects = 0xffffffffa0c68e40 <ip_map_parse>,

Looks like this part of the struct may have been overwritten with
ip_map_cache_template? Nothing else in here looks recognizable, so I
have to wonder if you actually have the correct nfsd_net pointer here.

>     batch = -1597606560,
>     seeks = 0,
>     flags = 0,
>     list = {
>       next = 0xffffffffa0c67350 <ip_map_alloc>,
>       prev = 0x0
>     },
>     nr_deferred = 0xffffffffa0c68a00 <ip_map_match>
>   },
>   nfsd_ssc_lock = {
>     {
>       rlock = {
>         raw_lock = {
>           {
>             val = {
>               counter = -1597603936
>             },
>             {
>               locked = 160 '\240',
>               pending = 127 '\177'
>             },
>             {
>               locked_pending = 32672,
>               tail = 41158
>             }
>           }
>         }
>       }
>     }
>   },
>   nfsd_ssc_mount_list = {
>     next = 0xffffffffa0c68b10 <update>,
>     prev = 0x49
>   },
>   nfsd_ssc_waitq = {
>     lock = {
>       {
>         rlock = {
>           raw_lock = {
>             {
>               val = {
>                 counter = -1596979232
>               },
>               {
>                 locked = 224 '\340',
>                 pending = 7 '\a'
>               },
>               {
>                 locked_pending = 2016,
>                 tail = 41168
>               }
>             }
>           }
>         }
>       }
>     },
>     head = {
>       next = 0xffff888110a2ce88,
>       prev = 0xc2
>     }
>   },
>   nfsd_name = "\001\000\000\000\000\000\000\000\200\t\021D\201\210\377\377\200\t\021D\201\210\377\377\001\000\000\000\000\000\000\000\032\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\301\303\061\201\210\377\377@$\234\203\377\377\377\377",
>   fcache_disposal = 0x0,
>   siphash_key = {
>     key = {0, 0}
>   },
>   nfs4_client_count = {
>     counter = 451
>   },
>   nfs4_max_clients = 122552490,
>   nfsd_courtesy_clients = {
>     counter = 0
>   },
>   nfsd_client_shrinker = {
>     count_objects = 0xe8000002a0a3894c,
>     scan_objects = 0x98b3894400000000,
>     batch = 5483261796049485826,
>     seeks = 15267721,
>     flags = 1275068416,
>     list = {
>       next = 0x18247c8d4918658b,
>       prev = 0x7c8b4900000000e8
>     },
>     nr_deferred = 0x4800000000e81824
>   },
>   nfsd_shrinker_work = {
>     work = {
>       data = {
>         counter = -8554306017173128307
>       },
>       entry = {
>         next = 0x894c00000000e8c4,
>         prev = 0xf7894c00000268a3
>       },
>       func = 0x6d8b4800000000e8
>     },
>     timer = {
>       entry = {
>         next = 0x270bb8d4818,
>         pprev = 0xbb8d4800000000e8
>       },
>       expires = 8118733695596102332,
>       function = 0xe8000002,
>       flags = 45908935
>     },
>     wq = 0x147424e783410000,
>     cpu = 553616193
>   }
> }
> crash>
>
Mike Galbraith Jan. 11, 2023, 1:48 p.m. UTC | #24
On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote:
>
> One thing that might be interesting to rule out a UAF would be to
> explicitly poison this struct in nfsd_exit_net. Basically do something
> like this at the end of exit_net:
>
>         memset(net, 0x7c, sizeof(*net));
>
> That might help trigger an oops sooner after the problem occurs.

Blasting net rendered the VM non-booting.  Blasting nn OTOH seems to
have changed nothing at all.

> If you're feeling ambitious, another thing you could do is track down
> some of the running nfsd's in the vmcore, find their rqstp values and
> see whether the sockets are pointed at the same nfsd_net as the one you
> found above (see the nfsd() function for how to get from one to the
> other).
>
> If they're pointed at a different nfsd_net, that would suggest that
> we are looking at a UAF. If it's the same nfsd_net, then I'd lean more
> toward some sort of memory scribble.

Way better: scrawny NFS chimp hands dump to big/strong NFS gorilla :)

	-Mike
Jeffrey Layton Jan. 11, 2023, 2:01 p.m. UTC | #25
On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote:
> On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote:
> > On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote:
> > > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> > > > > 
> > > > > 
> > > > > 
> > > > > It might be interesting to turn up KASAN if you're able.
> > > 
> > > I can try that.
> > 
> > KASAN did not squeak.
> > 
> > > > If you still have this vmcore, it might be interesting to do the pointer
> > > > math and find the nfsd_net structure that contains the above
> > > > delayed_work. Does the rest of it also seem to be corrupt?
> > 
> > Virgin source with workqueue.c WARN_ON_ONCE() landmine.
> > 
> 
> Thanks. Mixed bag here...
> 
> 
> > crash> nfsd_net -x 0xFFFF8881114E9800
> > struct nfsd_net {
> >   cld_net = 0x0,
> >   svc_expkey_cache = 0xffff8881420f8a00,
> >   svc_export_cache = 0xffff8881420f8800,
> >   idtoname_cache = 0xffff8881420f9a00,
> >   nametoid_cache = 0xffff8881420f9c00,
> >   nfsd4_manager = {
> >     list = {
> >       next = 0x0,
> >       prev = 0x0
> >     },
> >     block_opens = 0x0
> >   },
> >   grace_ended = 0x0,
> 
> 
> >   boot_time = 0x0,
> >   nfsd_client_dir = 0x0,
> >   reclaim_str_hashtbl = 0x0,
> >   reclaim_str_hashtbl_size = 0x0,
> >   conf_id_hashtbl = 0x0,
> >   conf_name_tree = {
> >     rb_node = 0x0
> >   },
> >   unconf_id_hashtbl = 0x0,
> >   unconf_name_tree = {
> >     rb_node = 0x0
> >   },
> >   sessionid_hashtbl = 0x0,
> >   client_lru = {
> >     next = 0x0,
> >     prev = 0x0
> >   },
> >   close_lru = {
> >     next = 0x0,
> >     prev = 0x0
> >   },
> >   del_recall_lru = {
> >     next = 0x0,
> >     prev = 0x0
> >   },
> >   blocked_locks_lru = {
> >     next = 0x0,
> >     prev = 0x0
> >   },
> 
> All of the above list_heads are zeroed out and they shouldn't be.
> 
> >   laundromat_work = {
> >     work = {
> >       data = {
> >         counter = 0x0
> >       },
> >       entry = {
> >         next = 0x0,
> >         prev = 0x0
> >       },
> >       func = 0x0
> >     },
> >     timer = {
> >       entry = {
> >         next = 0x0,
> >         pprev = 0x0
> >       },
> >       expires = 0x0,
> >       function = 0x0,
> >       flags = 0x0
> >     },
> >     wq = 0x0,
> >     cpu = 0x0
> >   },
> >   client_lock = {
> >     {
> >       rlock = {
> >         raw_lock = {
> >           {
> >             val = {
> >               counter = 0x0
> >             },
> >             {
> >               locked = 0x0,
> >               pending = 0x0
> >             },
> >             {
> >               locked_pending = 0x0,
> >               tail = 0x0
> >             }
> >           }
> >         }
> >       }
> >     }
> >   },
> >   blocked_locks_lock = {
> >     {
> >       rlock = {
> >         raw_lock = {
> >           {
> >             val = {
> >               counter = 0x0
> >             },
> >             {
> >               locked = 0x0,
> >               pending = 0x0
> >             },
> >             {
> >               locked_pending = 0x0,
> >               tail = 0x0
> >             }
> >           }
> >         }
> >       }
> >     }
> >   },
> >   rec_file = 0x0,
> >   in_grace = 0x0,
> >   client_tracking_ops = 0x0,
> >   nfsd4_lease = 0x5a,
> >   nfsd4_grace = 0x5a,
> 
> The grace and lease times look ok, oddly enough.
> 
> >   somebody_reclaimed = 0x0,
> >   track_reclaim_completes = 0x0,
> >   nr_reclaim_complete = {
> >     counter = 0x0
> >   },
> >   nfsd_net_up = 0x0,
> 
> nfsd_net_up is false, which means that this server isn't running (or
> that the memory here was scribbled over).
> 
> >   lockd_up = 0x0,
> >   writeverf_lock = {
> >     seqcount = {
> >       seqcount = {
> >         sequence = 0x0
> >       }
> >     },
> >     lock = {
> >       {
> >         rlock = {
> >           raw_lock = {
> >             {
> >               val = {
> >                 counter = 0x0
> >               },
> >               {
> >                 locked = 0x0,
> >                 pending = 0x0
> >               },
> >               {
> >                 locked_pending = 0x0,
> >                 tail = 0x0
> >               }
> >             }
> >           }
> >         }
> >       }
> >     }
> >   },
> >   writeverf = "\000\000\000\000\000\000\000",
> >   max_connections = 0x0,
> >   clientid_base = 0x37b4ca7b,
> >   clientid_counter = 0x37b4ca7d,
> >   clverifier_counter = 0xa8ee910d,
> >   nfsd_serv = 0x0,
> >   keep_active = 0x0,
> >   s2s_cp_cl_id = 0x37b4ca7c,
> >   s2s_cp_stateids = {
> >     idr_rt = {
> >       xa_lock = {
> >         {
> >           rlock = {
> >             raw_lock = {
> >               {
> >                 val = {
> >                   counter = 0x0
> >                 },
> >                 {
> >                   locked = 0x0,
> >                   pending = 0x0
> >                 },
> >                 {
> >                   locked_pending = 0x0,
> >                   tail = 0x0
> >                 }
> >               }
> >             }
> >           }
> >         }
> >       },
> >       xa_flags = 0x0,
> >       xa_head = 0x0
> >     },
> >     idr_base = 0x0,
> >     idr_next = 0x0
> >   },
> >   s2s_cp_lock = {
> >     {
> >       rlock = {
> >         raw_lock = {
> >           {
> >             val = {
> >               counter = 0x0
> >             },
> >             {
> >               locked = 0x0,
> >               pending = 0x0
> >             },
> >             {
> >               locked_pending = 0x0,
> >               tail = 0x0
> >             }
> >           }
> >         }
> >       }
> >     }
> >   },
> >   nfsd_versions = 0x0,
> >   nfsd4_minorversions = 0x0,
> >   drc_hashtbl = 0xffff88810a2f0000,
> >   max_drc_entries = 0x14740,
> >   maskbits = 0xb,
> >   drc_hashsize = 0x800,
> >   num_drc_entries = {
> >     counter = 0x0
> >   },
> >   counter = {{
> >       lock = {
> >         raw_lock = {
> >           {
> >             val = {
> >               counter = 0x0
> >             },
> >             {
> >               locked = 0x0,
> >               pending = 0x0
> >             },
> >             {
> >               locked_pending = 0x0,
> >               tail = 0x0
> >             }
> >           }
> >         }
> >       },
> >       count = 0x0,
> >       list = {
> >         next = 0xffff888103f98dd0,
> >         prev = 0xffff8881114e9a18
> >       },
> >       counters = 0x607dc8402e10
> >     }, {
> >       lock = {
> >         raw_lock = {
> >           {
> >             val = {
> >               counter = 0x0
> >             },
> >             {
> >               locked = 0x0,
> >               pending = 0x0
> >             },
> >             {
> >               locked_pending = 0x0,
> >               tail = 0x0
> >             }
> >           }
> >         }
> >       },
> >       count = 0x0,
> >       list = {
> >         next = 0xffff8881114e99f0,
> >         prev = 0xffff88810b5743e0
> >       },
> >       counters = 0x607dc8402e14
> >     }},
> >   longest_chain = 0x0,
> >   longest_chain_cachesize = 0x0,
> >   nfsd_reply_cache_shrinker = {
> >     count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>,
> >     scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>,
> 
> Shrinker pointers look ok, as does its list_head.

I think I might see what's going on here:

This struct is consistent with an nfsd_net structure that was allocated
and had its ->init routine run on it, but that has not had nfsd started
in its namespace.

pernet structs are kzalloc'ed. The shrinker registrations and the
lease/grace times are set just after allocation in the ->init routine.
The rest of the fields are not set until nfsd is started (specifically
when /proc/fs/nfsd/threads is written to).

My guess is that Mike is doing some activity that creates new net
namespaces, and then once we start getting into OOM conditions the
shrinker ends up hitting uninitialized fields in the nfsd_net and
kaboom.

I haven't yet looked to see when this bug was introduced, but the right
fix is probably to ensure that everything important for this job is
initialized after the pernet ->init operation runs.
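
To make that ordering concrete, here is a condensed sketch of the pernet
lifecycle just described. The shape follows fs/nfsd/nfsctl.c, but the bodies
are abbreviated and illustrative rather than the exact upstream code:

unsigned int nfsd_net_id;

static __net_init int nfsd_init_net(struct net *net)
{
	/* The net_generic() area is kzalloc'ed before ->init runs,
	 * so every field below starts out zeroed. */
	struct nfsd_net *nn = net_generic(net, nfsd_net_id);

	nn->nfsd4_lease = 90;	/* lease/grace get sane values here... */
	nn->nfsd4_grace = 90;

	/* ...and the client shrinker is registered here too, while
	 * client_lru, laundromat_work, etc. stay all-zero until nfsd
	 * is started by a write to /proc/fs/nfsd/threads. */
	nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
	nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
	return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
}

static struct pernet_operations nfsd_net_ops = {
	.init	= nfsd_init_net,
	.exit	= nfsd_exit_net,
	.id	= &nfsd_net_id,
	.size	= sizeof(struct nfsd_net),
};

A shrinker callback that fires between ->init and nfsd startup therefore sees
exactly the mix in the dump above: valid shrinker pointers and lease/grace
times, zeroed everything else.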
Jeffrey Layton Jan. 11, 2023, 2:16 p.m. UTC | #26
On Wed, 2023-01-11 at 09:01 -0500, Jeff Layton wrote:
> On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote:
> > On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote:
> > > On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote:
> > > > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > It might be interesting to turn up KASAN if you're able.
> > > > 
> > > > I can try that.
> > > 
> > > KASAN did not squeak.
> > > 
> > > > > If you still have this vmcore, it might be interesting to do the pointer
> > > > > math and find the nfsd_net structure that contains the above
> > > > > delayed_work. Does the rest of it also seem to be corrupt?
> > > 
> > > Virgin source with workqueue.c WARN_ON_ONCE() landmine.
> > > 
> > 
> > Thanks. Mixed bag here...
> > 
> > 
> > > [ ... nfsd_net struct dump and inline analysis snipped; quoted in full in #25 above ... ]
> 
> I think I might see what's going on here:
> 
> This struct is consistent with an nfsd_net structure that was allocated
> and had its ->init routine run on it, but that has not had nfsd started
> in its namespace.
> 
> pernet structs are kzalloc'ed. The shrinker registrations and the
> lease/grace times are set just after allocation in the ->init routine.
> The rest of the fields are not set until nfsd is started (specifically
> when /proc/fs/nfsd/threads is written to).
> 
> My guess is that Mike is doing some activity that creates new net
> namespaces, and then once we start getting into OOM conditions the
> shrinker ends up hitting uninitialized fields in the nfsd_net and
> kaboom.
> 
> I haven't yet looked to see when this bug was introduced, but the right
> fix is probably to ensure that everything important for this job is
> initialized after the pernet ->init operation runs.

It might even be best to just move the shrinker registrations to occur
when nfsd is started rather than doing it at nfsd_net initialization
time. Obviously, if nfsd isn't running, then it'll have nothing to free,
and it's a waste of time to register the shrinker at all.
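
In rough patch form, that suggestion might look like the sketch below. It is
illustrative only (not the posted fix): registration moves into the startup
path so the shrinker can never observe a half-initialized nfsd_net, and is
dropped again before that state is torn down.

static int nfsd_startup_net(struct net *net)
{
	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
	int ret;

	ret = nfs4_state_start_net(net); /* sets up client_lru et al. */
	if (ret)
		return ret;

	/* Only now is it safe for count_objects to walk this nfsd_net. */
	ret = register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
	if (ret) {
		nfs4_state_shutdown_net(net);
		return ret;
	}

	nn->nfsd_net_up = true;
	return 0;
}

static void nfsd_shutdown_net(struct net *net)
{
	struct nfsd_net *nn = net_generic(net, nfsd_net_id);

	unregister_shrinker(&nn->nfsd_client_shrinker);
	nfs4_state_shutdown_net(net);
	nn->nfsd_net_up = false;
}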
diff mbox series

Patch

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 8c854ba3285b..801d70926442 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -196,6 +196,7 @@  struct nfsd_net {
 	atomic_t		nfsd_courtesy_clients;
 	struct shrinker		nfsd_client_shrinker;
 	struct delayed_work	nfsd_shrinker_work;
+	bool			nfsd_shrinker_active;
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ee56c9466304..e00551af6a11 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4407,11 +4407,20 @@  nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
 	struct nfsd_net *nn = container_of(shrink,
 			struct nfsd_net, nfsd_client_shrinker);
 
+	spin_lock(&nn->client_lock);
+	if (nn->nfsd_shrinker_active) {
+		spin_unlock(&nn->client_lock);
+		return 0;
+	}
 	count = atomic_read(&nn->nfsd_courtesy_clients);
 	if (!count)
 		count = atomic_long_read(&num_delegations);
-	if (count)
-		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
+	if (count) {
+		nn->nfsd_shrinker_active = true;
+		spin_unlock(&nn->client_lock);
+		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
+	} else
+		spin_unlock(&nn->client_lock);
 	return (unsigned long)count;
 }
 
@@ -6239,6 +6248,9 @@  nfsd4_state_shrinker_worker(struct work_struct *work)
 
 	courtesy_client_reaper(nn);
 	deleg_reaper(nn);
+	spin_lock(&nn->client_lock);
+	nn->nfsd_shrinker_active = false;
+	spin_unlock(&nn->client_lock);
 }
 
 static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)