diff mbox series

[RFC,v5,19/19] ceph: add fscrypt ioctls

Message ID 20210326173227.96363-20-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series ceph+fscrypt: context, filename and symlink support | expand

Commit Message

Jeff Layton March 26, 2021, 5:32 p.m. UTC
We gate most of the ioctls on MDS feature support. The exception is the
key removal and status functions that we still want to work if the MDS's
were to (inexplicably) lose the feature.

For the set_policy ioctl, we take Fcx caps to ensure that nothing can
create files in the directory while the ioctl is running. That should
be enough to ensure that the "empty_dir" check is reliable.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/ioctl.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

Comments

Luis Henriques April 6, 2021, 3:38 p.m. UTC | #1
Hi Jeff!

On Fri, Mar 26, 2021 at 01:32:27PM -0400, Jeff Layton wrote:
> We gate most of the ioctls on MDS feature support. The exception is the
> key removal and status functions that we still want to work if the MDS's
> were to (inexplicably) lose the feature.
> 
> For the set_policy ioctl, we take Fcx caps to ensure that nothing can
> create files in the directory while the ioctl is running. That should
> be enough to ensure that the "empty_dir" check is reliable.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/ioctl.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 94 insertions(+)
> 
> diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> index 6e061bf62ad4..34b85bcfcfc7 100644
> --- a/fs/ceph/ioctl.c
> +++ b/fs/ceph/ioctl.c
> @@ -6,6 +6,7 @@
>  #include "mds_client.h"
>  #include "ioctl.h"
>  #include <linux/ceph/striper.h>
> +#include <linux/fscrypt.h>
>  
>  /*
>   * ioctls
> @@ -268,8 +269,56 @@ static long ceph_ioctl_syncio(struct file *file)
>  	return 0;
>  }
>  
> +static int vet_mds_for_fscrypt(struct file *file)
> +{
> +	int i, ret = -EOPNOTSUPP;
> +	struct ceph_mds_client	*mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
> +
> +	mutex_lock(&mdsc->mutex);
> +	for (i = 0; i < mdsc->max_sessions; i++) {
> +		struct ceph_mds_session *s = mdsc->sessions[i];
> +
> +		if (!s)
> +			continue;
> +		if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
> +			ret = 0;
> +		break;
> +	}
> +	mutex_unlock(&mdsc->mutex);
> +	return ret;
> +}
> +
> +static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
> +{
> +	int ret, got = 0;
> +	struct page *page = NULL;
> +	struct inode *inode = file_inode(file);
> +	struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +	ret = vet_mds_for_fscrypt(file);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * Ensure we hold these caps so that we _know_ that the rstats check
> +	 * in the empty_dir check is reliable.
> +	 */
> +	ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got, &page);
> +	if (ret)
> +		return ret;
> +	if (page)
> +		put_page(page);
> +	ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
> +	if (got)
> +		ceph_put_cap_refs(ci, got);
> +	return ret;
> +}
> +
>  long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  {
> +	int ret;
> +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
> +
>  	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
>  	switch (cmd) {
>  	case CEPH_IOC_GET_LAYOUT:
> @@ -289,6 +338,51 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  
>  	case CEPH_IOC_SYNCIO:
>  		return ceph_ioctl_syncio(file);
> +
> +	case FS_IOC_SET_ENCRYPTION_POLICY:
> +		return ceph_set_encryption_policy(file, arg);
> +
> +	case FS_IOC_GET_ENCRYPTION_POLICY:
> +		ret = vet_mds_for_fscrypt(file);
> +		if (ret)
> +			return ret;
> +		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
> +
> +	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
> +		ret = vet_mds_for_fscrypt(file);
> +		if (ret)
> +			return ret;
> +		return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
> +
> +	case FS_IOC_ADD_ENCRYPTION_KEY:
> +		ret = vet_mds_for_fscrypt(file);
> +		if (ret)
> +			return ret;
> +		atomic_inc(&ci->i_shared_gen);

I've spent a few hours already looking at the bug I reported before, and I
can't really understand this code.  What does it mean to increment
->i_shared_gen at this point?

The reason I'm asking is because it looks like the problem I'm seeing goes
away if I remove this code.  Here's what I'm doing/seeing:

# mount ...
# fscrypt unlock d

  -> 'd' dentry is eventually pruned at this point *if* ->i_shared_gen was
     incremented by the line above.

# cat d/f

  -> when ceph_fill_inode() is executed, 'd' isn't *not* set as encrypted
     because both ci->i_xattrs.version and info->xattr_version are both
     set to 0.

cat: d/f: No such file or directory

I'm not sure anymore if the issue is on the client or on the MDS side.
Before digging deeper, I wonder if this ring any bell. ;-)

Cheers,
--
Luís


> +		ceph_dir_clear_ordered(file_inode(file));
> +		ceph_dir_clear_complete(file_inode(file));
> +		return fscrypt_ioctl_add_key(file, (void __user *)arg);
> +
> +	case FS_IOC_REMOVE_ENCRYPTION_KEY:
> +		atomic_inc(&ci->i_shared_gen);
> +		ceph_dir_clear_ordered(file_inode(file));
> +		ceph_dir_clear_complete(file_inode(file));
> +		return fscrypt_ioctl_remove_key(file, (void __user *)arg);
> +
> +	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
> +		atomic_inc(&ci->i_shared_gen);
> +		ceph_dir_clear_ordered(file_inode(file));
> +		ceph_dir_clear_complete(file_inode(file));
> +		return fscrypt_ioctl_remove_key_all_users(file, (void __user *)arg);
> +
> +	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
> +		return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
> +
> +	case FS_IOC_GET_ENCRYPTION_NONCE:
> +		ret = vet_mds_for_fscrypt(file);
> +		if (ret)
> +			return ret;
> +		return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
>  	}
>  
>  	return -ENOTTY;
> -- 
> 2.30.2
>
Jeff Layton April 6, 2021, 4:03 p.m. UTC | #2
On Tue, 2021-04-06 at 16:38 +0100, Luis Henriques wrote:
> Hi Jeff!
> 
> On Fri, Mar 26, 2021 at 01:32:27PM -0400, Jeff Layton wrote:
> > We gate most of the ioctls on MDS feature support. The exception is the
> > key removal and status functions that we still want to work if the MDS's
> > were to (inexplicably) lose the feature.
> > 
> > For the set_policy ioctl, we take Fcx caps to ensure that nothing can
> > create files in the directory while the ioctl is running. That should
> > be enough to ensure that the "empty_dir" check is reliable.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/ioctl.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 94 insertions(+)
> > 
> > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> > index 6e061bf62ad4..34b85bcfcfc7 100644
> > --- a/fs/ceph/ioctl.c
> > +++ b/fs/ceph/ioctl.c
> > @@ -6,6 +6,7 @@
> >  #include "mds_client.h"
> >  #include "ioctl.h"
> >  #include <linux/ceph/striper.h>
> > +#include <linux/fscrypt.h>
> >  
> >  /*
> >   * ioctls
> > @@ -268,8 +269,56 @@ static long ceph_ioctl_syncio(struct file *file)
> >  	return 0;
> >  }
> >  
> > +static int vet_mds_for_fscrypt(struct file *file)
> > +{
> > +	int i, ret = -EOPNOTSUPP;
> > +	struct ceph_mds_client	*mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
> > +
> > +	mutex_lock(&mdsc->mutex);
> > +	for (i = 0; i < mdsc->max_sessions; i++) {
> > +		struct ceph_mds_session *s = mdsc->sessions[i];
> > +
> > +		if (!s)
> > +			continue;
> > +		if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
> > +			ret = 0;
> > +		break;
> > +	}
> > +	mutex_unlock(&mdsc->mutex);
> > +	return ret;
> > +}
> > +
> > +static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
> > +{
> > +	int ret, got = 0;
> > +	struct page *page = NULL;
> > +	struct inode *inode = file_inode(file);
> > +	struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +	ret = vet_mds_for_fscrypt(file);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/*
> > +	 * Ensure we hold these caps so that we _know_ that the rstats check
> > +	 * in the empty_dir check is reliable.
> > +	 */
> > +	ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got, &page);
> > +	if (ret)
> > +		return ret;
> > +	if (page)
> > +		put_page(page);
> > +	ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
> > +	if (got)
> > +		ceph_put_cap_refs(ci, got);
> > +	return ret;
> > +}
> > +
> >  long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >  {
> > +	int ret;
> > +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
> > +
> >  	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
> >  	switch (cmd) {
> >  	case CEPH_IOC_GET_LAYOUT:
> > @@ -289,6 +338,51 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >  
> >  	case CEPH_IOC_SYNCIO:
> >  		return ceph_ioctl_syncio(file);
> > +
> > +	case FS_IOC_SET_ENCRYPTION_POLICY:
> > +		return ceph_set_encryption_policy(file, arg);
> > +
> > +	case FS_IOC_GET_ENCRYPTION_POLICY:
> > +		ret = vet_mds_for_fscrypt(file);
> > +		if (ret)
> > +			return ret;
> > +		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
> > +
> > +	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
> > +		ret = vet_mds_for_fscrypt(file);
> > +		if (ret)
> > +			return ret;
> > +		return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
> > +
> > +	case FS_IOC_ADD_ENCRYPTION_KEY:
> > +		ret = vet_mds_for_fscrypt(file);
> > +		if (ret)
> > +			return ret;
> > +		atomic_inc(&ci->i_shared_gen);
> 
> I've spent a few hours already looking at the bug I reported before, and I
> can't really understand this code.  What does it mean to increment
> ->i_shared_gen at this point?
> 
> The reason I'm asking is because it looks like the problem I'm seeing goes
> away if I remove this code.  Here's what I'm doing/seeing:
> 
> # mount ...
> # fscrypt unlock d
> 
>   -> 'd' dentry is eventually pruned at this point *if* ->i_shared_gen was
>      incremented by the line above.
> 
> # cat d/f
> 
>   -> when ceph_fill_inode() is executed, 'd' isn't *not* set as encrypted
>      because both ci->i_xattrs.version and info->xattr_version are both
>      set to 0.
> 

Interesting. That sounds like it might be the bug right there. "d"
should clearly have a fscrypt context in its xattrs at that point. If
the MDS isn't passing that back, then that could be a problem.

I had a concern about that when I was developing this, and I *thought*
Zheng had assured us that the MDS will always pass along the xattr blob
in a trace. Maybe that's not correct?

> cat: d/f: No such file or directory
> 
> I'm not sure anymore if the issue is on the client or on the MDS side.
> Before digging deeper, I wonder if this ring any bell. ;-)
> 
> 

No, this is not something I've seen before.

Dentries that live in a directory have a copy of the i_shared_gen of the
directory when they are instantiated. Bumping that value on a directory
should basically ensure that its child dentries end up invalidated,
which is what we want once we add the key to the directory. Once we add
a key, any old dentries in that directory are no longer valid.

That said, I could certainly have missed some subtlety here.

> 
> > +		ceph_dir_clear_ordered(file_inode(file));
> > +		ceph_dir_clear_complete(file_inode(file));
> > +		return fscrypt_ioctl_add_key(file, (void __user *)arg);
> > +
> > +	case FS_IOC_REMOVE_ENCRYPTION_KEY:
> > +		atomic_inc(&ci->i_shared_gen);
> > +		ceph_dir_clear_ordered(file_inode(file));
> > +		ceph_dir_clear_complete(file_inode(file));
> > +		return fscrypt_ioctl_remove_key(file, (void __user *)arg);
> > +
> > +	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
> > +		atomic_inc(&ci->i_shared_gen);
> > +		ceph_dir_clear_ordered(file_inode(file));
> > +		ceph_dir_clear_complete(file_inode(file));
> > +		return fscrypt_ioctl_remove_key_all_users(file, (void __user *)arg);
> > +
> > +	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
> > +		return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
> > +
> > +	case FS_IOC_GET_ENCRYPTION_NONCE:
> > +		ret = vet_mds_for_fscrypt(file);
> > +		if (ret)
> > +			return ret;
> > +		return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
> >  	}
> >  
> >  	return -ENOTTY;
> > -- 
> > 2.30.2
> >
Luis Henriques April 6, 2021, 4:24 p.m. UTC | #3
On Tue, Apr 06, 2021 at 12:03:27PM -0400, Jeff Layton wrote:
> On Tue, 2021-04-06 at 16:38 +0100, Luis Henriques wrote:
> > Hi Jeff!
> > 
> > On Fri, Mar 26, 2021 at 01:32:27PM -0400, Jeff Layton wrote:
> > > We gate most of the ioctls on MDS feature support. The exception is the
> > > key removal and status functions that we still want to work if the MDS's
> > > were to (inexplicably) lose the feature.
> > > 
> > > For the set_policy ioctl, we take Fcx caps to ensure that nothing can
> > > create files in the directory while the ioctl is running. That should
> > > be enough to ensure that the "empty_dir" check is reliable.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/ioctl.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 94 insertions(+)
> > > 
> > > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> > > index 6e061bf62ad4..34b85bcfcfc7 100644
> > > --- a/fs/ceph/ioctl.c
> > > +++ b/fs/ceph/ioctl.c
> > > @@ -6,6 +6,7 @@
> > >  #include "mds_client.h"
> > >  #include "ioctl.h"
> > >  #include <linux/ceph/striper.h>
> > > +#include <linux/fscrypt.h>
> > >  
> > >  /*
> > >   * ioctls
> > > @@ -268,8 +269,56 @@ static long ceph_ioctl_syncio(struct file *file)
> > >  	return 0;
> > >  }
> > >  
> > > +static int vet_mds_for_fscrypt(struct file *file)
> > > +{
> > > +	int i, ret = -EOPNOTSUPP;
> > > +	struct ceph_mds_client	*mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
> > > +
> > > +	mutex_lock(&mdsc->mutex);
> > > +	for (i = 0; i < mdsc->max_sessions; i++) {
> > > +		struct ceph_mds_session *s = mdsc->sessions[i];
> > > +
> > > +		if (!s)
> > > +			continue;
> > > +		if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
> > > +			ret = 0;
> > > +		break;
> > > +	}
> > > +	mutex_unlock(&mdsc->mutex);
> > > +	return ret;
> > > +}
> > > +
> > > +static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
> > > +{
> > > +	int ret, got = 0;
> > > +	struct page *page = NULL;
> > > +	struct inode *inode = file_inode(file);
> > > +	struct ceph_inode_info *ci = ceph_inode(inode);
> > > +
> > > +	ret = vet_mds_for_fscrypt(file);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	/*
> > > +	 * Ensure we hold these caps so that we _know_ that the rstats check
> > > +	 * in the empty_dir check is reliable.
> > > +	 */
> > > +	ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got, &page);
> > > +	if (ret)
> > > +		return ret;
> > > +	if (page)
> > > +		put_page(page);
> > > +	ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
> > > +	if (got)
> > > +		ceph_put_cap_refs(ci, got);
> > > +	return ret;
> > > +}
> > > +
> > >  long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > >  {
> > > +	int ret;
> > > +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
> > > +
> > >  	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
> > >  	switch (cmd) {
> > >  	case CEPH_IOC_GET_LAYOUT:
> > > @@ -289,6 +338,51 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > >  
> > >  	case CEPH_IOC_SYNCIO:
> > >  		return ceph_ioctl_syncio(file);
> > > +
> > > +	case FS_IOC_SET_ENCRYPTION_POLICY:
> > > +		return ceph_set_encryption_policy(file, arg);
> > > +
> > > +	case FS_IOC_GET_ENCRYPTION_POLICY:
> > > +		ret = vet_mds_for_fscrypt(file);
> > > +		if (ret)
> > > +			return ret;
> > > +		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
> > > +
> > > +	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
> > > +		ret = vet_mds_for_fscrypt(file);
> > > +		if (ret)
> > > +			return ret;
> > > +		return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
> > > +
> > > +	case FS_IOC_ADD_ENCRYPTION_KEY:
> > > +		ret = vet_mds_for_fscrypt(file);
> > > +		if (ret)
> > > +			return ret;
> > > +		atomic_inc(&ci->i_shared_gen);
> > 
> > I've spent a few hours already looking at the bug I reported before, and I
> > can't really understand this code.  What does it mean to increment
> > ->i_shared_gen at this point?
> > 
> > The reason I'm asking is because it looks like the problem I'm seeing goes
> > away if I remove this code.  Here's what I'm doing/seeing:
> > 
> > # mount ...
> > # fscrypt unlock d
> > 
> >   -> 'd' dentry is eventually pruned at this point *if* ->i_shared_gen was
> >      incremented by the line above.
> > 
> > # cat d/f
> > 
> >   -> when ceph_fill_inode() is executed, 'd' isn't *not* set as encrypted
> >      because both ci->i_xattrs.version and info->xattr_version are both
> >      set to 0.
> > 
> 
> Interesting. That sounds like it might be the bug right there. "d"
> should clearly have a fscrypt context in its xattrs at that point. If
> the MDS isn't passing that back, then that could be a problem.
> 
> I had a concern about that when I was developing this, and I *thought*
> Zheng had assured us that the MDS will always pass along the xattr blob
> in a trace. Maybe that's not correct?

Hmm, that's what I thought too.  I was hoping not having to go look at the
MDS, but seems like I'll have to :-)

> > cat: d/f: No such file or directory
> > 
> > I'm not sure anymore if the issue is on the client or on the MDS side.
> > Before digging deeper, I wonder if this ring any bell. ;-)
> > 
> > 
> 
> No, this is not something I've seen before.
> 
> Dentries that live in a directory have a copy of the i_shared_gen of the
> directory when they are instantiated. Bumping that value on a directory
> should basically ensure that its child dentries end up invalidated,
> which is what we want once we add the key to the directory. Once we add
> a key, any old dentries in that directory are no longer valid.
> 
> That said, I could certainly have missed some subtlety here.

Great, thanks for clarifying.  This should help me investigate a little
bit more.

[ And I'm also surprised you don't see this behaviour as it's very easy to
  reproduce. ]

Cheers,
--
Luís

> > 
> > > +		ceph_dir_clear_ordered(file_inode(file));
> > > +		ceph_dir_clear_complete(file_inode(file));
> > > +		return fscrypt_ioctl_add_key(file, (void __user *)arg);
> > > +
> > > +	case FS_IOC_REMOVE_ENCRYPTION_KEY:
> > > +		atomic_inc(&ci->i_shared_gen);
> > > +		ceph_dir_clear_ordered(file_inode(file));
> > > +		ceph_dir_clear_complete(file_inode(file));
> > > +		return fscrypt_ioctl_remove_key(file, (void __user *)arg);
> > > +
> > > +	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
> > > +		atomic_inc(&ci->i_shared_gen);
> > > +		ceph_dir_clear_ordered(file_inode(file));
> > > +		ceph_dir_clear_complete(file_inode(file));
> > > +		return fscrypt_ioctl_remove_key_all_users(file, (void __user *)arg);
> > > +
> > > +	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
> > > +		return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
> > > +
> > > +	case FS_IOC_GET_ENCRYPTION_NONCE:
> > > +		ret = vet_mds_for_fscrypt(file);
> > > +		if (ret)
> > > +			return ret;
> > > +		return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
> > >  	}
> > >  
> > >  	return -ENOTTY;
> > > -- 
> > > 2.30.2
> > > 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
>
Jeff Layton April 6, 2021, 5:27 p.m. UTC | #4
On Tue, 2021-04-06 at 17:24 +0100, Luis Henriques wrote:
> On Tue, Apr 06, 2021 at 12:03:27PM -0400, Jeff Layton wrote:
> > On Tue, 2021-04-06 at 16:38 +0100, Luis Henriques wrote:
> > > Hi Jeff!
> > > 
> > > On Fri, Mar 26, 2021 at 01:32:27PM -0400, Jeff Layton wrote:
> > > > We gate most of the ioctls on MDS feature support. The exception is the
> > > > key removal and status functions that we still want to work if the MDS's
> > > > were to (inexplicably) lose the feature.
> > > > 
> > > > For the set_policy ioctl, we take Fcx caps to ensure that nothing can
> > > > create files in the directory while the ioctl is running. That should
> > > > be enough to ensure that the "empty_dir" check is reliable.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/ioctl.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 94 insertions(+)
> > > > 
> > > > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> > > > index 6e061bf62ad4..34b85bcfcfc7 100644
> > > > --- a/fs/ceph/ioctl.c
> > > > +++ b/fs/ceph/ioctl.c
> > > > @@ -6,6 +6,7 @@
> > > >  #include "mds_client.h"
> > > >  #include "ioctl.h"
> > > >  #include <linux/ceph/striper.h>
> > > > +#include <linux/fscrypt.h>
> > > >  
> > > >  /*
> > > >   * ioctls
> > > > @@ -268,8 +269,56 @@ static long ceph_ioctl_syncio(struct file *file)
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +static int vet_mds_for_fscrypt(struct file *file)
> > > > +{
> > > > +	int i, ret = -EOPNOTSUPP;
> > > > +	struct ceph_mds_client	*mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
> > > > +
> > > > +	mutex_lock(&mdsc->mutex);
> > > > +	for (i = 0; i < mdsc->max_sessions; i++) {
> > > > +		struct ceph_mds_session *s = mdsc->sessions[i];
> > > > +
> > > > +		if (!s)
> > > > +			continue;
> > > > +		if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
> > > > +			ret = 0;
> > > > +		break;
> > > > +	}
> > > > +	mutex_unlock(&mdsc->mutex);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
> > > > +{
> > > > +	int ret, got = 0;
> > > > +	struct page *page = NULL;
> > > > +	struct inode *inode = file_inode(file);
> > > > +	struct ceph_inode_info *ci = ceph_inode(inode);
> > > > +
> > > > +	ret = vet_mds_for_fscrypt(file);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	/*
> > > > +	 * Ensure we hold these caps so that we _know_ that the rstats check
> > > > +	 * in the empty_dir check is reliable.
> > > > +	 */
> > > > +	ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got, &page);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +	if (page)
> > > > +		put_page(page);
> > > > +	ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
> > > > +	if (got)
> > > > +		ceph_put_cap_refs(ci, got);
> > > > +	return ret;
> > > > +}
> > > > +
> > > >  long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > > >  {
> > > > +	int ret;
> > > > +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
> > > > +
> > > >  	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
> > > >  	switch (cmd) {
> > > >  	case CEPH_IOC_GET_LAYOUT:
> > > > @@ -289,6 +338,51 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > > >  
> > > >  	case CEPH_IOC_SYNCIO:
> > > >  		return ceph_ioctl_syncio(file);
> > > > +
> > > > +	case FS_IOC_SET_ENCRYPTION_POLICY:
> > > > +		return ceph_set_encryption_policy(file, arg);
> > > > +
> > > > +	case FS_IOC_GET_ENCRYPTION_POLICY:
> > > > +		ret = vet_mds_for_fscrypt(file);
> > > > +		if (ret)
> > > > +			return ret;
> > > > +		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
> > > > +
> > > > +	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
> > > > +		ret = vet_mds_for_fscrypt(file);
> > > > +		if (ret)
> > > > +			return ret;
> > > > +		return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
> > > > +
> > > > +	case FS_IOC_ADD_ENCRYPTION_KEY:
> > > > +		ret = vet_mds_for_fscrypt(file);
> > > > +		if (ret)
> > > > +			return ret;
> > > > +		atomic_inc(&ci->i_shared_gen);
> > > 
> > > I've spent a few hours already looking at the bug I reported before, and I
> > > can't really understand this code.  What does it mean to increment
> > > ->i_shared_gen at this point?
> > > 
> > > The reason I'm asking is because it looks like the problem I'm seeing goes
> > > away if I remove this code.  Here's what I'm doing/seeing:
> > > 
> > > # mount ...
> > > # fscrypt unlock d
> > > 
> > >   -> 'd' dentry is eventually pruned at this point *if* ->i_shared_gen was
> > >      incremented by the line above.
> > > 
> > > # cat d/f
> > > 
> > >   -> when ceph_fill_inode() is executed, 'd' isn't *not* set as encrypted
> > >      because both ci->i_xattrs.version and info->xattr_version are both
> > >      set to 0.
> > > 
> > 
> > Interesting. That sounds like it might be the bug right there. "d"
> > should clearly have a fscrypt context in its xattrs at that point. If
> > the MDS isn't passing that back, then that could be a problem.
> > 
> > I had a concern about that when I was developing this, and I *thought*
> > Zheng had assured us that the MDS will always pass along the xattr blob
> > in a trace. Maybe that's not correct?
> 
> Hmm, that's what I thought too.  I was hoping not having to go look at the
> MDS, but seems like I'll have to :-)
> 

That'd be good, if possible.

> > > cat: d/f: No such file or directory
> > > 
> > > I'm not sure anymore if the issue is on the client or on the MDS side.
> > > Before digging deeper, I wonder if this ring any bell. ;-)
> > > 
> > > 
> > 
> > No, this is not something I've seen before.
> > 
> > Dentries that live in a directory have a copy of the i_shared_gen of the
> > directory when they are instantiated. Bumping that value on a directory
> > should basically ensure that its child dentries end up invalidated,
> > which is what we want once we add the key to the directory. Once we add
> > a key, any old dentries in that directory are no longer valid.
> > 
> > That said, I could certainly have missed some subtlety here.
> 
> Great, thanks for clarifying.  This should help me investigate a little
> bit more.
> 
> [ And I'm also surprised you don't see this behaviour as it's very easy to
>   reproduce. ]
> 
> 

It is odd... fwiw, I ran this for 5 mins or so and never saw a problem:

    $ while [ $? -eq 0 ]; do sudo umount /mnt/crypt; sudo mount /mnt/crypt; fscrypt unlock --key=/home/jlayton/fscrypt-keyfile /mnt/crypt/d; cat /mnt/crypt/d/f; done

...do I need some other operations in between? Also, the cluster in this
case is Pacific. It's possible this is a result of changes since then if
you're on a vstart cluster or something.

$ sudo ./cephadm version
Using recent ceph image docker.io/ceph/ceph@sha256:9b04c0f15704c49591640a37c7adfd40ffad0a4b42fecb950c3407687cb4f29a
ceph version 16.2.0 (0c2054e95bcd9b30fdd908a79ac1d8bbc3394442) pacific (stable)


> > > 
> > > > +		ceph_dir_clear_ordered(file_inode(file));
> > > > +		ceph_dir_clear_complete(file_inode(file));
> > > > +		return fscrypt_ioctl_add_key(file, (void __user *)arg);
> > > > +
> > > > +	case FS_IOC_REMOVE_ENCRYPTION_KEY:
> > > > +		atomic_inc(&ci->i_shared_gen);
> > > > +		ceph_dir_clear_ordered(file_inode(file));
> > > > +		ceph_dir_clear_complete(file_inode(file));
> > > > +		return fscrypt_ioctl_remove_key(file, (void __user *)arg);
> > > > +
> > > > +	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
> > > > +		atomic_inc(&ci->i_shared_gen);
> > > > +		ceph_dir_clear_ordered(file_inode(file));
> > > > +		ceph_dir_clear_complete(file_inode(file));
> > > > +		return fscrypt_ioctl_remove_key_all_users(file, (void __user *)arg);
> > > > +
> > > > +	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
> > > > +		return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
> > > > +
> > > > +	case FS_IOC_GET_ENCRYPTION_NONCE:
> > > > +		ret = vet_mds_for_fscrypt(file);
> > > > +		if (ret)
> > > > +			return ret;
> > > > +		return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
> > > >  	}
> > > >  
> > > >  	return -ENOTTY;
> > > > -- 
> > > > 2.30.2
> > > > 
> > 
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> >
Luis Henriques April 6, 2021, 6:04 p.m. UTC | #5
On Tue, Apr 06, 2021 at 01:27:21PM -0400, Jeff Layton wrote:
<snip>
> > > > I've spent a few hours already looking at the bug I reported before, and I
> > > > can't really understand this code.  What does it mean to increment
> > > > ->i_shared_gen at this point?
> > > > 
> > > > The reason I'm asking is because it looks like the problem I'm seeing goes
> > > > away if I remove this code.  Here's what I'm doing/seeing:
> > > > 
> > > > # mount ...
> > > > # fscrypt unlock d
> > > > 
> > > >   -> 'd' dentry is eventually pruned at this point *if* ->i_shared_gen was
> > > >      incremented by the line above.
> > > > 
> > > > # cat d/f
> > > > 
> > > >   -> when ceph_fill_inode() is executed, 'd' isn't *not* set as encrypted
> > > >      because both ci->i_xattrs.version and info->xattr_version are both
> > > >      set to 0.
> > > > 
> > > 
> > > Interesting. That sounds like it might be the bug right there. "d"
> > > should clearly have a fscrypt context in its xattrs at that point. If
> > > the MDS isn't passing that back, then that could be a problem.
> > > 
> > > I had a concern about that when I was developing this, and I *thought*
> > > Zheng had assured us that the MDS will always pass along the xattr blob
> > > in a trace. Maybe that's not correct?
> > 
> > Hmm, that's what I thought too.  I was hoping not having to go look at the
> > MDS, but seems like I'll have to :-)
> > 
> 
> That'd be good, if possible.
> 
> > > > cat: d/f: No such file or directory
> > > > 
> > > > I'm not sure anymore if the issue is on the client or on the MDS side.
> > > > Before digging deeper, I wonder if this ring any bell. ;-)
> > > > 
> > > > 
> > > 
> > > No, this is not something I've seen before.
> > > 
> > > Dentries that live in a directory have a copy of the i_shared_gen of the
> > > directory when they are instantiated. Bumping that value on a directory
> > > should basically ensure that its child dentries end up invalidated,
> > > which is what we want once we add the key to the directory. Once we add
> > > a key, any old dentries in that directory are no longer valid.
> > > 
> > > That said, I could certainly have missed some subtlety here.
> > 
> > Great, thanks for clarifying.  This should help me investigate a little
> > bit more.
> > 
> > [ And I'm also surprised you don't see this behaviour as it's very easy to
> >   reproduce. ]
> > 
> > 
> 
> It is odd... fwiw, I ran this for 5 mins or so and never saw a problem:
> 
>     $ while [ $? -eq 0 ]; do sudo umount /mnt/crypt; sudo mount /mnt/crypt; fscrypt unlock --key=/home/jlayton/fscrypt-keyfile /mnt/crypt/d; cat /mnt/crypt/d/f; done
>

TBH I only do this operation once and it almost always fails.  The only
difference I see is that I don't really use a keyfile, but a passphrase
instead.  Not sure if it makes any difference.  Also, it may be worth
adding a delay before the 'cat' to make sure the dentry is pruned.

> ...do I need some other operations in between? Also, the cluster in this
> case is Pacific. It's possible this is a result of changes since then if
> you're on a vstart cluster or something.
> 
> $ sudo ./cephadm version
> Using recent ceph image docker.io/ceph/ceph@sha256:9b04c0f15704c49591640a37c7adfd40ffad0a4b42fecb950c3407687cb4f29a
> ceph version 16.2.0 (0c2054e95bcd9b30fdd908a79ac1d8bbc3394442) pacific (stable)

I've re-compiled the cluster after hard-resetting it to commit
6a19e303187c which you mentioned in a previous email in this thread.  But
the result was the same.

Anyway, using a vstart cluster is also a huge difference I guess.  I'll
keep debugging.  Thanks!

Cheers,
--
Luís
Jeff Layton April 7, 2021, 12:47 p.m. UTC | #6
On Tue, 2021-04-06 at 19:04 +0100, Luis Henriques wrote:
> On Tue, Apr 06, 2021 at 01:27:21PM -0400, Jeff Layton wrote:
> <snip>
> > > > > I've spent a few hours already looking at the bug I reported before, and I
> > > > > can't really understand this code.  What does it mean to increment
> > > > > ->i_shared_gen at this point?
> > > > > 
> > > > > The reason I'm asking is because it looks like the problem I'm seeing goes
> > > > > away if I remove this code.  Here's what I'm doing/seeing:
> > > > > 
> > > > > # mount ...
> > > > > # fscrypt unlock d
> > > > > 
> > > > >   -> 'd' dentry is eventually pruned at this point *if* ->i_shared_gen was
> > > > >      incremented by the line above.
> > > > > 
> > > > > # cat d/f
> > > > > 
> > > > >   -> when ceph_fill_inode() is executed, 'd' isn't *not* set as encrypted
> > > > >      because both ci->i_xattrs.version and info->xattr_version are both
> > > > >      set to 0.
> > > > > 
> > > > 
> > > > Interesting. That sounds like it might be the bug right there. "d"
> > > > should clearly have a fscrypt context in its xattrs at that point. If
> > > > the MDS isn't passing that back, then that could be a problem.
> > > > 
> > > > I had a concern about that when I was developing this, and I *thought*
> > > > Zheng had assured us that the MDS will always pass along the xattr blob
> > > > in a trace. Maybe that's not correct?
> > > 
> > > Hmm, that's what I thought too.  I was hoping not having to go look at the
> > > MDS, but seems like I'll have to :-)
> > > 
> > 
> > That'd be good, if possible.
> > 
> > > > > cat: d/f: No such file or directory
> > > > > 
> > > > > I'm not sure anymore if the issue is on the client or on the MDS side.
> > > > > Before digging deeper, I wonder if this ring any bell. ;-)
> > > > > 
> > > > > 
> > > > 
> > > > No, this is not something I've seen before.
> > > > 
> > > > Dentries that live in a directory have a copy of the i_shared_gen of the
> > > > directory when they are instantiated. Bumping that value on a directory
> > > > should basically ensure that its child dentries end up invalidated,
> > > > which is what we want once we add the key to the directory. Once we add
> > > > a key, any old dentries in that directory are no longer valid.
> > > > 
> > > > That said, I could certainly have missed some subtlety here.
> > > 
> > > Great, thanks for clarifying.  This should help me investigate a little
> > > bit more.
> > > 
> > > [ And I'm also surprised you don't see this behaviour as it's very easy to
> > >   reproduce. ]
> > > 
> > > 
> > 
> > It is odd... fwiw, I ran this for 5 mins or so and never saw a problem:
> > 
> >     $ while [ $? -eq 0 ]; do sudo umount /mnt/crypt; sudo mount /mnt/crypt; fscrypt unlock --key=/home/jlayton/fscrypt-keyfile /mnt/crypt/d; cat /mnt/crypt/d/f; done
> > 
> 
> TBH I only do this operation once and it almost always fails.  The only
> difference I see is that I don't really use a keyfile, but a passphrase
> instead.  Not sure if it makes any difference.  Also, it may be worth
> adding a delay before the 'cat' to make sure the dentry is pruned.
> 

No joy. I tried different delays between 1-5s and it didn't change
anything.

> > ...do I need some other operations in between? Also, the cluster in this
> > case is Pacific. It's possible this is a result of changes since then if
> > you're on a vstart cluster or something.
> > 
> > $ sudo ./cephadm version
> > Using recent ceph image docker.io/ceph/ceph@sha256:9b04c0f15704c49591640a37c7adfd40ffad0a4b42fecb950c3407687cb4f29a
> > ceph version 16.2.0 (0c2054e95bcd9b30fdd908a79ac1d8bbc3394442) pacific (stable)
> 
> I've re-compiled the cluster after hard-resetting it to commit
> 6a19e303187c which you mentioned in a previous email in this thread.  But
> the result was the same.
> 
> Anyway, using a vstart cluster is also a huge difference I guess.  I'll
> keep debugging.  Thanks!
> 

I may try to set one up today to see if I can reproduce it. Thanks for
the testing help so far!
diff mbox series

Patch

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 6e061bf62ad4..34b85bcfcfc7 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -6,6 +6,7 @@ 
 #include "mds_client.h"
 #include "ioctl.h"
 #include <linux/ceph/striper.h>
+#include <linux/fscrypt.h>
 
 /*
  * ioctls
@@ -268,8 +269,56 @@  static long ceph_ioctl_syncio(struct file *file)
 	return 0;
 }
 
+static int vet_mds_for_fscrypt(struct file *file)
+{
+	int i, ret = -EOPNOTSUPP;
+	struct ceph_mds_client	*mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = mdsc->sessions[i];
+
+		if (!s)
+			continue;
+		if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
+			ret = 0;
+		break;
+	}
+	mutex_unlock(&mdsc->mutex);
+	return ret;
+}
+
+static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
+{
+	int ret, got = 0;
+	struct page *page = NULL;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	ret = vet_mds_for_fscrypt(file);
+	if (ret)
+		return ret;
+
+	/*
+	 * Ensure we hold these caps so that we _know_ that the rstats check
+	 * in the empty_dir check is reliable.
+	 */
+	ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got, &page);
+	if (ret)
+		return ret;
+	if (page)
+		put_page(page);
+	ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
+	if (got)
+		ceph_put_cap_refs(ci, got);
+	return ret;
+}
+
 long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	int ret;
+	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
+
 	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
 	switch (cmd) {
 	case CEPH_IOC_GET_LAYOUT:
@@ -289,6 +338,51 @@  long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case CEPH_IOC_SYNCIO:
 		return ceph_ioctl_syncio(file);
+
+	case FS_IOC_SET_ENCRYPTION_POLICY:
+		return ceph_set_encryption_policy(file, arg);
+
+	case FS_IOC_GET_ENCRYPTION_POLICY:
+		ret = vet_mds_for_fscrypt(file);
+		if (ret)
+			return ret;
+		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
+
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+		ret = vet_mds_for_fscrypt(file);
+		if (ret)
+			return ret;
+		return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
+
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+		ret = vet_mds_for_fscrypt(file);
+		if (ret)
+			return ret;
+		atomic_inc(&ci->i_shared_gen);
+		ceph_dir_clear_ordered(file_inode(file));
+		ceph_dir_clear_complete(file_inode(file));
+		return fscrypt_ioctl_add_key(file, (void __user *)arg);
+
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+		atomic_inc(&ci->i_shared_gen);
+		ceph_dir_clear_ordered(file_inode(file));
+		ceph_dir_clear_complete(file_inode(file));
+		return fscrypt_ioctl_remove_key(file, (void __user *)arg);
+
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+		atomic_inc(&ci->i_shared_gen);
+		ceph_dir_clear_ordered(file_inode(file));
+		ceph_dir_clear_complete(file_inode(file));
+		return fscrypt_ioctl_remove_key_all_users(file, (void __user *)arg);
+
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+		return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
+
+	case FS_IOC_GET_ENCRYPTION_NONCE:
+		ret = vet_mds_for_fscrypt(file);
+		if (ret)
+			return ret;
+		return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
 	}
 
 	return -ENOTTY;