diff mbox

[2/6] nfsd: swap fs root in NFSd kthreads

Message ID 20121206153447.30693.54128.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show

Commit Message

Stanislav Kinsbursky Dec. 6, 2012, 3:34 p.m. UTC
NFSd does lookup. Lookup is done starting from current->fs->root.
NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
unshared) root.
So we have to swap root to those, which process, started NFSd, has. Because
that process can be in a container with it's own root.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
 fs/nfsd/netns.h  |    1 +
 fs/nfsd/nfssvc.c |   33 ++++++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

J. Bruce Fields Dec. 10, 2012, 8:28 p.m. UTC | #1
On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
> NFSd does lookup. Lookup is done starting from current->fs->root.
> NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
> unshared) root.
> So we have to swap root to those, which process, started NFSd, has. Because
> that process can be in a container with it's own root.

This doesn't sound right to me.

Which lookups exactly do you see being done relative to
current->fs->root ?

--b.

> 
> Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
> ---
>  fs/nfsd/netns.h  |    1 +
>  fs/nfsd/nfssvc.c |   33 ++++++++++++++++++++++++++++++++-
>  2 files changed, 33 insertions(+), 1 deletions(-)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index abfc97c..5c423c6 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -101,6 +101,7 @@ struct nfsd_net {
>  	struct timeval nfssvc_boot;
>  
>  	struct svc_serv *nfsd_serv;
> +	struct path	root;
>  };
>  
>  extern int nfsd_net_id;
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index cee62ab..177bb60 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -392,6 +392,7 @@ int nfsd_create_serv(struct net *net)
>  
>  	set_max_drc();
>  	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
> +	get_fs_root(current->fs, &nn->root);
>  	return 0;
>  }
>  
> @@ -426,8 +427,10 @@ void nfsd_destroy(struct net *net)
>  	if (destroy)
>  		svc_shutdown_net(nn->nfsd_serv, net);
>  	svc_destroy(nn->nfsd_serv);
> -	if (destroy)
> +	if (destroy) {
> +		path_put(&nn->root);
>  		nn->nfsd_serv = NULL;
> +	}
>  }
>  
>  int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
> @@ -533,6 +536,25 @@ out:
>  	return error;
>  }
>  
> +/*
> + * This function is actually slightly modified set_fs_root()<F9>
> + */
> +static void nfsd_swap_root(struct net *net)
> +{
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	struct fs_struct *fs = current->fs;
> +	struct path old_root;
> +
> +	path_get(&nn->root);
> +	spin_lock(&fs->lock);
> +	write_seqcount_begin(&fs->seq);
> +	old_root = fs->root;
> +	fs->root = nn->root;
> +	write_seqcount_end(&fs->seq);
> +	spin_unlock(&fs->lock);
> +	if (old_root.dentry)
> +		path_put(&old_root);
> +}
>  
>  /*
>   * This is the NFS server kernel thread
> @@ -559,6 +581,15 @@ nfsd(void *vrqstp)
>  	current->fs->umask = 0;
>  
>  	/*
> +	 * We have to swap NFSd kthread's fs->root.
> +	 * Why so? Because NFSd can be started in container, which has it's own
> +	 * root.
> +	 * And so what? NFSd lookup files, and lookup start from
> +	 * current->fs->root.
> +	 */
> +	nfsd_swap_root(net);
> +
> +	/*
>  	 * thread is spawned with all signals set to SIG_IGN, re-enable
>  	 * the ones that will bring down the thread
>  	 */
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 11, 2012, 2 p.m. UTC | #2
11.12.2012 00:28, J. Bruce Fields ?????:
> On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
>> NFSd does lookup. Lookup is done starting from current->fs->root.
>> NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
>> unshared) root.
>> So we have to swap root to those, which process, started NFSd, has. Because
>> that process can be in a container with it's own root.
>
> This doesn't sound right to me.
>
> Which lookups exactly do you see being done relative to
> current->fs->root ?
>

Ok, you are right. I was mistaken here.
This is not a exactly lookup, but d_path() problem in svc_export_request().
I.e. without root swapping, d_path() will give not local export path (like "/export")
but something like this "/root/containers_root/export".

> --b.
>
>>
>> Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
>> ---
>>   fs/nfsd/netns.h  |    1 +
>>   fs/nfsd/nfssvc.c |   33 ++++++++++++++++++++++++++++++++-
>>   2 files changed, 33 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>> index abfc97c..5c423c6 100644
>> --- a/fs/nfsd/netns.h
>> +++ b/fs/nfsd/netns.h
>> @@ -101,6 +101,7 @@ struct nfsd_net {
>>   	struct timeval nfssvc_boot;
>>
>>   	struct svc_serv *nfsd_serv;
>> +	struct path	root;
>>   };
>>
>>   extern int nfsd_net_id;
>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>> index cee62ab..177bb60 100644
>> --- a/fs/nfsd/nfssvc.c
>> +++ b/fs/nfsd/nfssvc.c
>> @@ -392,6 +392,7 @@ int nfsd_create_serv(struct net *net)
>>
>>   	set_max_drc();
>>   	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
>> +	get_fs_root(current->fs, &nn->root);
>>   	return 0;
>>   }
>>
>> @@ -426,8 +427,10 @@ void nfsd_destroy(struct net *net)
>>   	if (destroy)
>>   		svc_shutdown_net(nn->nfsd_serv, net);
>>   	svc_destroy(nn->nfsd_serv);
>> -	if (destroy)
>> +	if (destroy) {
>> +		path_put(&nn->root);
>>   		nn->nfsd_serv = NULL;
>> +	}
>>   }
>>
>>   int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
>> @@ -533,6 +536,25 @@ out:
>>   	return error;
>>   }
>>
>> +/*
>> + * This function is actually slightly modified set_fs_root()<F9>
>> + */
>> +static void nfsd_swap_root(struct net *net)
>> +{
>> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>> +	struct fs_struct *fs = current->fs;
>> +	struct path old_root;
>> +
>> +	path_get(&nn->root);
>> +	spin_lock(&fs->lock);
>> +	write_seqcount_begin(&fs->seq);
>> +	old_root = fs->root;
>> +	fs->root = nn->root;
>> +	write_seqcount_end(&fs->seq);
>> +	spin_unlock(&fs->lock);
>> +	if (old_root.dentry)
>> +		path_put(&old_root);
>> +}
>>
>>   /*
>>    * This is the NFS server kernel thread
>> @@ -559,6 +581,15 @@ nfsd(void *vrqstp)
>>   	current->fs->umask = 0;
>>
>>   	/*
>> +	 * We have to swap NFSd kthread's fs->root.
>> +	 * Why so? Because NFSd can be started in container, which has it's own
>> +	 * root.
>> +	 * And so what? NFSd lookup files, and lookup start from
>> +	 * current->fs->root.
>> +	 */
>> +	nfsd_swap_root(net);
>> +
>> +	/*
>>   	 * thread is spawned with all signals set to SIG_IGN, re-enable
>>   	 * the ones that will bring down the thread
>>   	 */
>>
Stanislav Kinsbursky Dec. 11, 2012, 2:12 p.m. UTC | #3
11.12.2012 18:00, Stanislav Kinsbursky ?????:
> 11.12.2012 00:28, J. Bruce Fields ?????:
>> On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
>>> NFSd does lookup. Lookup is done starting from current->fs->root.
>>> NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
>>> unshared) root.
>>> So we have to swap root to those, which process, started NFSd, has. Because
>>> that process can be in a container with it's own root.
>>
>> This doesn't sound right to me.
>>
>> Which lookups exactly do you see being done relative to
>> current->fs->root ?
>>
>
> Ok, you are right. I was mistaken here.
> This is not a exactly lookup, but d_path() problem in svc_export_request().
> I.e. without root swapping, d_path() will give not local export path (like "/export")
> but something like this "/root/containers_root/export".
>

We, actually, can do it less in less aggressive way.
I.e. instead root swap and current svc_export_request() implementation:

void svc_export_request(...)
{
	<snip>
         pth = d_path(&exp->ex_path, *bpp, *blen);
	<snip>
}

we can do something like this:

void svc_export_request(...)
{
	struct nfsd_net *nn = ...
	<snip>
	spin_lock(&dcache_lock);
         pth = __d_path(&exp->ex_path, &nn->root, *bpp, *blen);
	spin_unlock(&dcache_lock);
	<snip>
}

>> --b.
>>
>>>
>>> Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
>>> ---
>>>   fs/nfsd/netns.h  |    1 +
>>>   fs/nfsd/nfssvc.c |   33 ++++++++++++++++++++++++++++++++-
>>>   2 files changed, 33 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>> index abfc97c..5c423c6 100644
>>> --- a/fs/nfsd/netns.h
>>> +++ b/fs/nfsd/netns.h
>>> @@ -101,6 +101,7 @@ struct nfsd_net {
>>>       struct timeval nfssvc_boot;
>>>
>>>       struct svc_serv *nfsd_serv;
>>> +    struct path    root;
>>>   };
>>>
>>>   extern int nfsd_net_id;
>>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>>> index cee62ab..177bb60 100644
>>> --- a/fs/nfsd/nfssvc.c
>>> +++ b/fs/nfsd/nfssvc.c
>>> @@ -392,6 +392,7 @@ int nfsd_create_serv(struct net *net)
>>>
>>>       set_max_drc();
>>>       do_gettimeofday(&nn->nfssvc_boot);        /* record boot time */
>>> +    get_fs_root(current->fs, &nn->root);
>>>       return 0;
>>>   }
>>>
>>> @@ -426,8 +427,10 @@ void nfsd_destroy(struct net *net)
>>>       if (destroy)
>>>           svc_shutdown_net(nn->nfsd_serv, net);
>>>       svc_destroy(nn->nfsd_serv);
>>> -    if (destroy)
>>> +    if (destroy) {
>>> +        path_put(&nn->root);
>>>           nn->nfsd_serv = NULL;
>>> +    }
>>>   }
>>>
>>>   int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
>>> @@ -533,6 +536,25 @@ out:
>>>       return error;
>>>   }
>>>
>>> +/*
>>> + * This function is actually slightly modified set_fs_root()<F9>
>>> + */
>>> +static void nfsd_swap_root(struct net *net)
>>> +{
>>> +    struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>>> +    struct fs_struct *fs = current->fs;
>>> +    struct path old_root;
>>> +
>>> +    path_get(&nn->root);
>>> +    spin_lock(&fs->lock);
>>> +    write_seqcount_begin(&fs->seq);
>>> +    old_root = fs->root;
>>> +    fs->root = nn->root;
>>> +    write_seqcount_end(&fs->seq);
>>> +    spin_unlock(&fs->lock);
>>> +    if (old_root.dentry)
>>> +        path_put(&old_root);
>>> +}
>>>
>>>   /*
>>>    * This is the NFS server kernel thread
>>> @@ -559,6 +581,15 @@ nfsd(void *vrqstp)
>>>       current->fs->umask = 0;
>>>
>>>       /*
>>> +     * We have to swap NFSd kthread's fs->root.
>>> +     * Why so? Because NFSd can be started in container, which has it's own
>>> +     * root.
>>> +     * And so what? NFSd lookup files, and lookup start from
>>> +     * current->fs->root.
>>> +     */
>>> +    nfsd_swap_root(net);
>>> +
>>> +    /*
>>>        * thread is spawned with all signals set to SIG_IGN, re-enable
>>>        * the ones that will bring down the thread
>>>        */
>>>
>
>
Stanislav Kinsbursky Dec. 11, 2012, 2:51 p.m. UTC | #4
11.12.2012 18:12, Stanislav Kinsbursky ?????:
> 11.12.2012 18:00, Stanislav Kinsbursky ?????:
>> 11.12.2012 00:28, J. Bruce Fields ?????:
>>> On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
>>>> NFSd does lookup. Lookup is done starting from current->fs->root.
>>>> NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
>>>> unshared) root.
>>>> So we have to swap root to those, which process, started NFSd, has. Because
>>>> that process can be in a container with it's own root.
>>>
>>> This doesn't sound right to me.
>>>
>>> Which lookups exactly do you see being done relative to
>>> current->fs->root ?
>>>
>>
>> Ok, you are right. I was mistaken here.
>> This is not a exactly lookup, but d_path() problem in svc_export_request().
>> I.e. without root swapping, d_path() will give not local export path (like "/export")
>> but something like this "/root/containers_root/export".
>>
>
> We, actually, can do it less in less aggressive way.
> I.e. instead root swap and current svc_export_request() implementation:
>
> void svc_export_request(...)
> {
>      <snip>
>          pth = d_path(&exp->ex_path, *bpp, *blen);
>      <snip>
> }
>
> we can do something like this:
>
> void svc_export_request(...)
> {
>      struct nfsd_net *nn = ...
>      <snip>
>      spin_lock(&dcache_lock);
>          pth = __d_path(&exp->ex_path, &nn->root, *bpp, *blen);
>      spin_unlock(&dcache_lock);
>      <snip>
> }
>

No, this won't work. Sorry for noise.

>>> --b.
>>>
>>>>
>>>> Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
>>>> ---
>>>>   fs/nfsd/netns.h  |    1 +
>>>>   fs/nfsd/nfssvc.c |   33 ++++++++++++++++++++++++++++++++-
>>>>   2 files changed, 33 insertions(+), 1 deletions(-)
>>>>
>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
>>>> index abfc97c..5c423c6 100644
>>>> --- a/fs/nfsd/netns.h
>>>> +++ b/fs/nfsd/netns.h
>>>> @@ -101,6 +101,7 @@ struct nfsd_net {
>>>>       struct timeval nfssvc_boot;
>>>>
>>>>       struct svc_serv *nfsd_serv;
>>>> +    struct path    root;
>>>>   };
>>>>
>>>>   extern int nfsd_net_id;
>>>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>>>> index cee62ab..177bb60 100644
>>>> --- a/fs/nfsd/nfssvc.c
>>>> +++ b/fs/nfsd/nfssvc.c
>>>> @@ -392,6 +392,7 @@ int nfsd_create_serv(struct net *net)
>>>>
>>>>       set_max_drc();
>>>>       do_gettimeofday(&nn->nfssvc_boot);        /* record boot time */
>>>> +    get_fs_root(current->fs, &nn->root);
>>>>       return 0;
>>>>   }
>>>>
>>>> @@ -426,8 +427,10 @@ void nfsd_destroy(struct net *net)
>>>>       if (destroy)
>>>>           svc_shutdown_net(nn->nfsd_serv, net);
>>>>       svc_destroy(nn->nfsd_serv);
>>>> -    if (destroy)
>>>> +    if (destroy) {
>>>> +        path_put(&nn->root);
>>>>           nn->nfsd_serv = NULL;
>>>> +    }
>>>>   }
>>>>
>>>>   int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
>>>> @@ -533,6 +536,25 @@ out:
>>>>       return error;
>>>>   }
>>>>
>>>> +/*
>>>> + * This function is actually slightly modified set_fs_root()<F9>
>>>> + */
>>>> +static void nfsd_swap_root(struct net *net)
>>>> +{
>>>> +    struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>>>> +    struct fs_struct *fs = current->fs;
>>>> +    struct path old_root;
>>>> +
>>>> +    path_get(&nn->root);
>>>> +    spin_lock(&fs->lock);
>>>> +    write_seqcount_begin(&fs->seq);
>>>> +    old_root = fs->root;
>>>> +    fs->root = nn->root;
>>>> +    write_seqcount_end(&fs->seq);
>>>> +    spin_unlock(&fs->lock);
>>>> +    if (old_root.dentry)
>>>> +        path_put(&old_root);
>>>> +}
>>>>
>>>>   /*
>>>>    * This is the NFS server kernel thread
>>>> @@ -559,6 +581,15 @@ nfsd(void *vrqstp)
>>>>       current->fs->umask = 0;
>>>>
>>>>       /*
>>>> +     * We have to swap NFSd kthread's fs->root.
>>>> +     * Why so? Because NFSd can be started in container, which has it's own
>>>> +     * root.
>>>> +     * And so what? NFSd lookup files, and lookup start from
>>>> +     * current->fs->root.
>>>> +     */
>>>> +    nfsd_swap_root(net);
>>>> +
>>>> +    /*
>>>>        * thread is spawned with all signals set to SIG_IGN, re-enable
>>>>        * the ones that will bring down the thread
>>>>        */
>>>>
>>
>>
>
>
Al Viro Dec. 11, 2012, 2:54 p.m. UTC | #5
On Tue, Dec 11, 2012 at 06:00:00PM +0400, Stanislav Kinsbursky wrote:
> 11.12.2012 00:28, J. Bruce Fields ??????????:
> >On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
> >>NFSd does lookup. Lookup is done starting from current->fs->root.
> >>NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
> >>unshared) root.
> >>So we have to swap root to those, which process, started NFSd, has. Because
> >>that process can be in a container with it's own root.
> >
> >This doesn't sound right to me.
> >
> >Which lookups exactly do you see being done relative to
> >current->fs->root ?
> >
> 
> Ok, you are right. I was mistaken here.
> This is not a exactly lookup, but d_path() problem in svc_export_request().
> I.e. without root swapping, d_path() will give not local export path (like "/export")
> but something like this "/root/containers_root/export".

Now, *that* is a different story (and makes some sense).  Take a look
at __d_path(), please.  You don't need to set ->fs->root to get d_path()
equivalent relative to given point.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Dec. 11, 2012, 2:56 p.m. UTC | #6
On Tue, Dec 11, 2012 at 06:12:40PM +0400, Stanislav Kinsbursky wrote:
> UID: 9899
> 
> 11.12.2012 18:00, Stanislav Kinsbursky ?????:
> >11.12.2012 00:28, J. Bruce Fields ?????:
> >>On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
> >>>NFSd does lookup. Lookup is done starting from current->fs->root.
> >>>NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
> >>>unshared) root.
> >>>So we have to swap root to those, which process, started NFSd, has. Because
> >>>that process can be in a container with it's own root.
> >>
> >>This doesn't sound right to me.
> >>
> >>Which lookups exactly do you see being done relative to
> >>current->fs->root ?
> >>
> >
> >Ok, you are right. I was mistaken here.
> >This is not a exactly lookup, but d_path() problem in svc_export_request().
> >I.e. without root swapping, d_path() will give not local export path (like "/export")
> >but something like this "/root/containers_root/export".
> >
> 
> We, actually, can do it less in less aggressive way.
> I.e. instead root swap and current svc_export_request() implementation:
> 
> void svc_export_request(...)
> {
> 	<snip>
>         pth = d_path(&exp->ex_path, *bpp, *blen);
> 	<snip>
> }
> 
> we can do something like this:
> 
> void svc_export_request(...)
> {
> 	struct nfsd_net *nn = ...
> 	<snip>
> 	spin_lock(&dcache_lock);
>         pth = __d_path(&exp->ex_path, &nn->root, *bpp, *blen);
> 	spin_unlock(&dcache_lock);
> 	<snip>
> }

That looks simpler, but I still don't understand why we need it.

I'm confused about how d_path works; I would have thought that
filesystem namespaces would have their own vfsmount trees and hence that
the (vfsmount, dentry) would be enough to specify the path.  Is the root
argument for the case of chroot?  Do we care about that?

Also, svc_export_request is called from mountd's read of
/proc/net/rpc/nfsd.export/channel.  If mountd's root is wrong, then
nothing's going to work anyway.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 11, 2012, 2:57 p.m. UTC | #7
11.12.2012 18:54, Al Viro ?????:
> On Tue, Dec 11, 2012 at 06:00:00PM +0400, Stanislav Kinsbursky wrote:
>> 11.12.2012 00:28, J. Bruce Fields ??????????:
>>> On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
>>>> NFSd does lookup. Lookup is done starting from current->fs->root.
>>>> NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
>>>> unshared) root.
>>>> So we have to swap root to those, which process, started NFSd, has. Because
>>>> that process can be in a container with it's own root.
>>>
>>> This doesn't sound right to me.
>>>
>>> Which lookups exactly do you see being done relative to
>>> current->fs->root ?
>>>
>>
>> Ok, you are right. I was mistaken here.
>> This is not a exactly lookup, but d_path() problem in svc_export_request().
>> I.e. without root swapping, d_path() will give not local export path (like "/export")
>> but something like this "/root/containers_root/export".
>
> Now, *that* is a different story (and makes some sense).  Take a look
> at __d_path(), please.  You don't need to set ->fs->root to get d_path()
> equivalent relative to given point.
>

Thanks, Al.
But __d_path() is not exported and this code is called from NFSd module.
Is it suitable for you to export __d_path()?
Al Viro Dec. 11, 2012, 2:58 p.m. UTC | #8
On Tue, Dec 11, 2012 at 09:56:21AM -0500, J. Bruce Fields wrote:

> That looks simpler, but I still don't understand why we need it.
> 
> I'm confused about how d_path works; I would have thought that
> filesystem namespaces would have their own vfsmount trees and hence that
> the (vfsmount, dentry) would be enough to specify the path.  Is the root
> argument for the case of chroot?  Do we care about that?

__d_path() is relative pathname from here to there.  Whether (and what for)
is it wanted in case of nfsd patches is a separate question...

> Also, svc_export_request is called from mountd's read of
> /proc/net/rpc/nfsd.export/channel.  If mountd's root is wrong, then
> nothing's going to work anyway.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 11, 2012, 3:07 p.m. UTC | #9
11.12.2012 18:56, J. Bruce Fields ?????:
> On Tue, Dec 11, 2012 at 06:12:40PM +0400, Stanislav Kinsbursky wrote:
>> UID: 9899
>>
>> 11.12.2012 18:00, Stanislav Kinsbursky ?????:
>>> 11.12.2012 00:28, J. Bruce Fields ?????:
>>>> On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
>>>>> NFSd does lookup. Lookup is done starting from current->fs->root.
>>>>> NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
>>>>> unshared) root.
>>>>> So we have to swap root to those, which process, started NFSd, has. Because
>>>>> that process can be in a container with it's own root.
>>>>
>>>> This doesn't sound right to me.
>>>>
>>>> Which lookups exactly do you see being done relative to
>>>> current->fs->root ?
>>>>
>>>
>>> Ok, you are right. I was mistaken here.
>>> This is not a exactly lookup, but d_path() problem in svc_export_request().
>>> I.e. without root swapping, d_path() will give not local export path (like "/export")
>>> but something like this "/root/containers_root/export".
>>>
>>
>> We, actually, can do it less in less aggressive way.
>> I.e. instead root swap and current svc_export_request() implementation:
>>
>> void svc_export_request(...)
>> {
>> 	<snip>
>>          pth = d_path(&exp->ex_path, *bpp, *blen);
>> 	<snip>
>> }
>>
>> we can do something like this:
>>
>> void svc_export_request(...)
>> {
>> 	struct nfsd_net *nn = ...
>> 	<snip>
>> 	spin_lock(&dcache_lock);
>>          pth = __d_path(&exp->ex_path, &nn->root, *bpp, *blen);
>> 	spin_unlock(&dcache_lock);
>> 	<snip>
>> }
>
> That looks simpler, but I still don't understand why we need it.
>
> I'm confused about how d_path works; I would have thought that
> filesystem namespaces would have their own vfsmount trees and hence that
> the (vfsmount, dentry) would be enough to specify the path.  Is the root
> argument for the case of chroot?  Do we care about that?
>

It works very simple: just traverse the tree from specified dentry up to current->fs->root.dentry.
Having container in some fully separated mount point is great, of course. But:
1) this is a limitation we really want to avoid. I.e. container can be chrooted into some path like "/root/containers_root/" as in example above.
2) NFSd kthread works in init root environment. But we anyway want to get proper path string in container root, but not in kthreads root.

> Also, svc_export_request is called from mountd's read of
> /proc/net/rpc/nfsd.export/channel.  If mountd's root is wrong, then
> nothing's going to work anyway.
>

I don't really understand, how  mountd's root can be wrong. I.e. its' always right as I see it. NFSd kthreads have to swap/use relative path/whatever to 
communicate with proper mountd.
Or I'm missing something?

> --b.
>
J. Bruce Fields Dec. 11, 2012, 3:20 p.m. UTC | #10
On Tue, Dec 11, 2012 at 07:07:00PM +0400, Stanislav Kinsbursky wrote:
> 11.12.2012 18:56, J. Bruce Fields ?????:
> >On Tue, Dec 11, 2012 at 06:12:40PM +0400, Stanislav Kinsbursky wrote:
> >>UID: 9899
> >>
> >>11.12.2012 18:00, Stanislav Kinsbursky ?????:
> >>>11.12.2012 00:28, J. Bruce Fields ?????:
> >>>>On Thu, Dec 06, 2012 at 06:34:47PM +0300, Stanislav Kinsbursky wrote:
> >>>>>NFSd does lookup. Lookup is done starting from current->fs->root.
> >>>>>NFSd is a kthread, cloned by kthreadd, and thus have global (but luckely
> >>>>>unshared) root.
> >>>>>So we have to swap root to those, which process, started NFSd, has. Because
> >>>>>that process can be in a container with it's own root.
> >>>>
> >>>>This doesn't sound right to me.
> >>>>
> >>>>Which lookups exactly do you see being done relative to
> >>>>current->fs->root ?
> >>>>
> >>>
> >>>Ok, you are right. I was mistaken here.
> >>>This is not a exactly lookup, but d_path() problem in svc_export_request().
> >>>I.e. without root swapping, d_path() will give not local export path (like "/export")
> >>>but something like this "/root/containers_root/export".
> >>>
> >>
> >>We, actually, can do it less in less aggressive way.
> >>I.e. instead root swap and current svc_export_request() implementation:
> >>
> >>void svc_export_request(...)
> >>{
> >>	<snip>
> >>         pth = d_path(&exp->ex_path, *bpp, *blen);
> >>	<snip>
> >>}
> >>
> >>we can do something like this:
> >>
> >>void svc_export_request(...)
> >>{
> >>	struct nfsd_net *nn = ...
> >>	<snip>
> >>	spin_lock(&dcache_lock);
> >>         pth = __d_path(&exp->ex_path, &nn->root, *bpp, *blen);
> >>	spin_unlock(&dcache_lock);
> >>	<snip>
> >>}
> >
> >That looks simpler, but I still don't understand why we need it.
> >
> >I'm confused about how d_path works; I would have thought that
> >filesystem namespaces would have their own vfsmount trees and hence that
> >the (vfsmount, dentry) would be enough to specify the path.  Is the root
> >argument for the case of chroot?  Do we care about that?
> >
> 
> It works very simple: just traverse the tree from specified dentry up to current->fs->root.dentry.
> Having container in some fully separated mount point is great, of course. But:
> 1) this is a limitation we really want to avoid. I.e. container can be chrooted into some path like "/root/containers_root/" as in example above.
> 2) NFSd kthread works in init root environment. But we anyway want to get proper path string in container root, but not in kthreads root.
> 
> >Also, svc_export_request is called from mountd's read of
> >/proc/net/rpc/nfsd.export/channel.  If mountd's root is wrong, then
> >nothing's going to work anyway.
> >
> 
> I don't really understand, how  mountd's root can be wrong. I.e.
> its' always right as I see it. NFSd kthreads have to swap/use
> relative path/whatever to communicate with proper mountd.
> Or I'm missing something?

Ugh, I see the problem: I thought svc_export_request was called at the
time mountd does the read, but instead its done at the time nfsd does
the upcall.

I suspect that's wrong, and we really want this done in the context of
the mountd process when it does the read call.  If d_path is called
there then we have no problem.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Dec. 11, 2012, 3:35 p.m. UTC | #11
On Tue, Dec 11, 2012 at 10:20:36AM -0500, J. Bruce Fields wrote:
> On Tue, Dec 11, 2012 at 07:07:00PM +0400, Stanislav Kinsbursky wrote:
> > I don't really understand, how  mountd's root can be wrong. I.e.
> > its' always right as I see it. NFSd kthreads have to swap/use
> > relative path/whatever to communicate with proper mountd.
> > Or I'm missing something?
> 
> Ugh, I see the problem: I thought svc_export_request was called at the
> time mountd does the read, but instead its done at the time nfsd does
> the upcall.
> 
> I suspect that's wrong, and we really want this done in the context of
> the mountd process when it does the read call.  If d_path is called
> there then we have no problem.

Right, so I'd be happier if we could modify sunrpc_cache_pipe_upcall to
skip calling cache_request and instead delay that until cache_read().  I
think that should be possible.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 12, 2012, 7:45 a.m. UTC | #12
11.12.2012 19:35, J. Bruce Fields ?????:
> On Tue, Dec 11, 2012 at 10:20:36AM -0500, J. Bruce Fields wrote:
>> On Tue, Dec 11, 2012 at 07:07:00PM +0400, Stanislav Kinsbursky wrote:
>>> I don't really understand, how  mountd's root can be wrong. I.e.
>>> its' always right as I see it. NFSd kthreads have to swap/use
>>> relative path/whatever to communicate with proper mountd.
>>> Or I'm missing something?
>>
>> Ugh, I see the problem: I thought svc_export_request was called at the
>> time mountd does the read, but instead its done at the time nfsd does
>> the upcall.
>>
>> I suspect that's wrong, and we really want this done in the context of
>> the mountd process when it does the read call.  If d_path is called
>> there then we have no problem.
>
> Right, so I'd be happier if we could modify sunrpc_cache_pipe_upcall to
> skip calling cache_request and instead delay that until cache_read().  I
> think that should be possible.
>

Hmmm...
I'm not familiar with mountd to kernel exchange protocol.
It looks like kernel passes that path string to mountd on upcall.

> --b.
>
Stanislav Kinsbursky Jan. 11, 2013, 2:56 p.m. UTC | #13
11.12.2012 19:35, J. Bruce Fields ?????:
> On Tue, Dec 11, 2012 at 10:20:36AM -0500, J. Bruce Fields wrote:
>> On Tue, Dec 11, 2012 at 07:07:00PM +0400, Stanislav Kinsbursky wrote:
>>> I don't really understand, how  mountd's root can be wrong. I.e.
>>> its' always right as I see it. NFSd kthreads have to swap/use
>>> relative path/whatever to communicate with proper mountd.
>>> Or I'm missing something?
>>
>> Ugh, I see the problem: I thought svc_export_request was called at the
>> time mountd does the read, but instead its done at the time nfsd does
>> the upcall.
>>
>> I suspect that's wrong, and we really want this done in the context of
>> the mountd process when it does the read call.  If d_path is called
>> there then we have no problem.
>
> Right, so I'd be happier if we could modify sunrpc_cache_pipe_upcall to
> skip calling cache_request and instead delay that until cache_read().  I
> think that should be possible.
>

So, Bruce, what we going to do (or what you want me to do) with the rest of NFSd changes?
I.e. how I should solve this d_path() problem?
I.e. I don't understand what did you mean by "I'd be happier if we could modify sunrpc_cache_pipe_upcall to
skip calling cache_request and instead delay that until cache_read()".
Could you give me a hint?

> --b.
>
J. Bruce Fields Jan. 11, 2013, 5:03 p.m. UTC | #14
On Fri, Jan 11, 2013 at 06:56:58PM +0400, Stanislav Kinsbursky wrote:
> 11.12.2012 19:35, J. Bruce Fields ?????:
> >On Tue, Dec 11, 2012 at 10:20:36AM -0500, J. Bruce Fields wrote:
> >>On Tue, Dec 11, 2012 at 07:07:00PM +0400, Stanislav Kinsbursky wrote:
> >>>I don't really understand, how  mountd's root can be wrong. I.e.
> >>>its' always right as I see it. NFSd kthreads have to swap/use
> >>>relative path/whatever to communicate with proper mountd.
> >>>Or I'm missing something?
> >>
> >>Ugh, I see the problem: I thought svc_export_request was called at the
> >>time mountd does the read, but instead its done at the time nfsd does
> >>the upcall.
> >>
> >>I suspect that's wrong, and we really want this done in the context of
> >>the mountd process when it does the read call.  If d_path is called
> >>there then we have no problem.
> >
> >Right, so I'd be happier if we could modify sunrpc_cache_pipe_upcall to
> >skip calling cache_request and instead delay that until cache_read().  I
> >think that should be possible.
> >
> 
> So, Bruce, what we going to do (or what you want me to do) with the rest of NFSd changes?
> I.e. how I should solve this d_path() problem?
> I.e. I don't understand what did you mean by "I'd be happier if we could modify sunrpc_cache_pipe_upcall to
> skip calling cache_request and instead delay that until cache_read()".
> Could you give me a hint?

Definitely.  So normally the way these upcalls happen are:

	1. the kernel does a cache lookup, finds no matching item, and
	   calls sunrpc_cache_pipe_upcall().
	2. sunrpc_cache_pipe_upcall() formats the upcall: it allocates a
	   struct cache_request crq and fills crq->buf with the upcall
	   data by calling the cache's ->cache_request() method.
	3. Then rpc.mountd realizes there's data available in
	   /proc/net/rpc/nfsd.fh/content, so it does a read on that file.
	4. cache_read copies the formatted upcall from crq->buf to
	   to userspace.

So all I'm suggesting is that instead of calling ->cache_request() at
step 2, we do it at step 4.

Then cache_request will be called from rpc.mountd's read.  So we'll know
which container rpc.mountd is in.

Does that make sense?

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Jan. 14, 2013, 6:08 a.m. UTC | #15
11.01.2013 21:03, J. Bruce Fields ?????:
> On Fri, Jan 11, 2013 at 06:56:58PM +0400, Stanislav Kinsbursky wrote:
>> 11.12.2012 19:35, J. Bruce Fields ?????:
>>> On Tue, Dec 11, 2012 at 10:20:36AM -0500, J. Bruce Fields wrote:
>>>> On Tue, Dec 11, 2012 at 07:07:00PM +0400, Stanislav Kinsbursky wrote:
>>>>> I don't really understand, how  mountd's root can be wrong. I.e.
>>>>> its' always right as I see it. NFSd kthreads have to swap/use
>>>>> relative path/whatever to communicate with proper mountd.
>>>>> Or I'm missing something?
>>>>
>>>> Ugh, I see the problem: I thought svc_export_request was called at the
>>>> time mountd does the read, but instead its done at the time nfsd does
>>>> the upcall.
>>>>
>>>> I suspect that's wrong, and we really want this done in the context of
>>>> the mountd process when it does the read call.  If d_path is called
>>>> there then we have no problem.
>>>
>>> Right, so I'd be happier if we could modify sunrpc_cache_pipe_upcall to
>>> skip calling cache_request and instead delay that until cache_read().  I
>>> think that should be possible.
>>>
>>
>> So, Bruce, what we going to do (or what you want me to do) with the rest of NFSd changes?
>> I.e. how I should solve this d_path() problem?
>> I.e. I don't understand what did you mean by "I'd be happier if we could modify sunrpc_cache_pipe_upcall to
>> skip calling cache_request and instead delay that until cache_read()".
>> Could you give me a hint?
>
> Definitely.  So normally the way these upcalls happen are:
>
> 	1. the kernel does a cache lookup, finds no matching item, and
> 	   calls sunrpc_cache_pipe_upcall().
> 	2. sunrpc_cache_pipe_upcall() formats the upcall: it allocates a
> 	   struct cache_request crq and fills crq->buf with the upcall
> 	   data by calling the cache's ->cache_request() method.
> 	3. Then rpc.mountd realizes there's data available in
> 	   /proc/net/rpc/nfsd.fh/content, so it does a read on that file.
> 	4. cache_read copies the formatted upcall from crq->buf to
> 	   to userspace.
>
> So all I'm suggesting is that instead of calling ->cache_request() at
> step 2, we do it at step 4.
>
> Then cache_request will be called from rpc.mountd's read.  So we'll know
> which container rpc.mountd is in.
>
> Does that make sense?
>

Ok, now I understand.
If the upcall is just the way to inform rpc.mound, that there are some data to read and process, then it your proposing changes should work.
I'll prepare initial patch set.
Thanks!

> --b.
>
diff mbox

Patch

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index abfc97c..5c423c6 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -101,6 +101,7 @@  struct nfsd_net {
 	struct timeval nfssvc_boot;
 
 	struct svc_serv *nfsd_serv;
+	struct path	root;
 };
 
 extern int nfsd_net_id;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cee62ab..177bb60 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -392,6 +392,7 @@  int nfsd_create_serv(struct net *net)
 
 	set_max_drc();
 	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
+	get_fs_root(current->fs, &nn->root);
 	return 0;
 }
 
@@ -426,8 +427,10 @@  void nfsd_destroy(struct net *net)
 	if (destroy)
 		svc_shutdown_net(nn->nfsd_serv, net);
 	svc_destroy(nn->nfsd_serv);
-	if (destroy)
+	if (destroy) {
+		path_put(&nn->root);
 		nn->nfsd_serv = NULL;
+	}
 }
 
 int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
@@ -533,6 +536,25 @@  out:
 	return error;
 }
 
+/*
+ * This function is actually slightly modified set_fs_root()<F9>
+ */
+static void nfsd_swap_root(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct fs_struct *fs = current->fs;
+	struct path old_root;
+
+	path_get(&nn->root);
+	spin_lock(&fs->lock);
+	write_seqcount_begin(&fs->seq);
+	old_root = fs->root;
+	fs->root = nn->root;
+	write_seqcount_end(&fs->seq);
+	spin_unlock(&fs->lock);
+	if (old_root.dentry)
+		path_put(&old_root);
+}
 
 /*
  * This is the NFS server kernel thread
@@ -559,6 +581,15 @@  nfsd(void *vrqstp)
 	current->fs->umask = 0;
 
 	/*
+	 * We have to swap NFSd kthread's fs->root.
+	 * Why so? Because NFSd can be started in container, which has it's own
+	 * root.
+	 * And so what? NFSd lookup files, and lookup start from
+	 * current->fs->root.
+	 */
+	nfsd_swap_root(net);
+
+	/*
 	 * thread is spawned with all signals set to SIG_IGN, re-enable
 	 * the ones that will bring down the thread
 	 */