diff mbox series

[v6,10/13] ceph: decode interval_sets for delegated inos

Message ID 20200302141434.59825-11-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series ceph: async directory operations support | expand

Commit Message

Jeff Layton March 2, 2020, 2:14 p.m. UTC
Starting in Octopus, the MDS will hand out caps that allow the client
to do asynchronous file creates under certain conditions. As part of
that, the MDS will delegate ranges of inode numbers to the client.

Add the infrastructure to decode these ranges, and stuff them into an
xarray for later consumption by the async creation code.

Because the xarray code currently only handles unsigned long indexes,
and those are 32-bits on 32-bit arches, we only enable the decoding when
running on a 64-bit arch.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/mds_client.c | 122 +++++++++++++++++++++++++++++++++++++++----
 fs/ceph/mds_client.h |   9 +++-
 2 files changed, 121 insertions(+), 10 deletions(-)

Comments

Luis Henriques March 5, 2020, 11:45 a.m. UTC | #1
On Mon, Mar 02, 2020 at 09:14:31AM -0500, Jeff Layton wrote:
> Starting in Octopus, the MDS will hand out caps that allow the client
> to do asynchronous file creates under certain conditions. As part of
> that, the MDS will delegate ranges of inode numbers to the client.
> 
> Add the infrastructure to decode these ranges, and stuff them into an
> xarray for later consumption by the async creation code.
> 
> Because the xarray code currently only handles unsigned long indexes,
> and those are 32-bits on 32-bit arches, we only enable the decoding when
> running on a 64-bit arch.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/mds_client.c | 122 +++++++++++++++++++++++++++++++++++++++----
>  fs/ceph/mds_client.h |   9 +++-
>  2 files changed, 121 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index db8304447f35..87f75d05b004 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -415,21 +415,121 @@ static int parse_reply_info_filelock(void **p, void *end,
>  	return -EIO;
>  }
>  
> +
> +#if BITS_PER_LONG == 64
> +
> +#define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
> +
> +static int ceph_parse_deleg_inos(void **p, void *end,
> +				 struct ceph_mds_session *s)
> +{
> +	u32 sets;
> +
> +	ceph_decode_32_safe(p, end, sets, bad);
> +	dout("got %u sets of delegated inodes\n", sets);
> +	while (sets--) {
> +		u64 start, len, ino;
> +
> +		ceph_decode_64_safe(p, end, start, bad);
> +		ceph_decode_64_safe(p, end, len, bad);
> +		while (len--) {
> +			int err = xa_insert(&s->s_delegated_inos, ino = start++,
> +					    DELEGATED_INO_AVAILABLE,
> +					    GFP_KERNEL);
> +			if (!err) {
> +				dout("added delegated inode 0x%llx\n",
> +				     start - 1);
> +			} else if (err == -EBUSY) {
> +				pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
> +					start - 1);
> +			} else {
> +				return err;
> +			}
> +		}
> +	}
> +	return 0;
> +bad:
> +	return -EIO;
> +}
> +
> +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> +{
> +	unsigned long ino;
> +	void *val;
> +
> +	xa_for_each(&s->s_delegated_inos, ino, val) {
> +		val = xa_erase(&s->s_delegated_inos, ino);
> +		if (val == DELEGATED_INO_AVAILABLE)
> +			return ino;
> +	}
> +	return 0;
> +}
> +
> +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> +{
> +	return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
> +			 GFP_KERNEL);
> +}
> +#else /* BITS_PER_LONG == 64 */
> +/*
> + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
> + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
> + * and bottom words?
> + */
> +static int ceph_parse_deleg_inos(void **p, void *end,
> +				 struct ceph_mds_session *s)
> +{
> +	u32 sets;
> +
> +	ceph_decode_32_safe(p, end, sets, bad);
> +	if (sets)
> +		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
> +	return 0;
> +bad:
> +	return -EIO;
> +}
> +
> +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> +{
> +	return 0;
> +}
> +
> +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> +{
> +	return 0;
> +}
> +#endif /* BITS_PER_LONG == 64 */
> +
>  /*
>   * parse create results
>   */
>  static int parse_reply_info_create(void **p, void *end,
>  				  struct ceph_mds_reply_info_parsed *info,
> -				  u64 features)
> +				  u64 features, struct ceph_mds_session *s)
>  {
> +	int ret;
> +
>  	if (features == (u64)-1 ||
>  	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
> -		/* Malformed reply? */
>  		if (*p == end) {
> +			/* Malformed reply? */
>  			info->has_create_ino = false;
> -		} else {
> +		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
> +			u8 struct_v, struct_compat;
> +			u32 len;
> +
>  			info->has_create_ino = true;
> +			ceph_decode_8_safe(p, end, struct_v, bad);
> +			ceph_decode_8_safe(p, end, struct_compat, bad);
> +			ceph_decode_32_safe(p, end, len, bad);
> +			ceph_decode_64_safe(p, end, info->ino, bad);

I've done a quick test in current 'testing' branch and it seems that it's
currently broken.  A bisect identified this commit as 'bad' and it's
failing at this point.

I'm running an old (a few weeks) 'master' vstart cluster, so I don't have
the needed bits for using this DELEG_INO feature.  Running xfstest
generic/001 results in:

   ceph: mds parse_reply err -5
   ceph: mdsc_handle_reply got corrupt reply mds0(tid:9)
   ...

s->s_features does include the CEPHFS_FEATURE_DELEG_INO bit set;
'features' is -1 (0xffffffffffffffff) and s->s_features is 0x3fff.  Maybe
the issue is actually somewhere else (the cephfs feature handling code),
but I'm still looking.

Cheers,
--
Luís

> +			ret = ceph_parse_deleg_inos(p, end, s);
> +			if (ret)
> +				return ret;
> +		} else {
> +			/* legacy */
>  			ceph_decode_64_safe(p, end, info->ino, bad);
> +			info->has_create_ino = true;
>  		}
>  	} else {
>  		if (*p != end)
> @@ -448,7 +548,7 @@ static int parse_reply_info_create(void **p, void *end,
>   */
>  static int parse_reply_info_extra(void **p, void *end,
>  				  struct ceph_mds_reply_info_parsed *info,
> -				  u64 features)
> +				  u64 features, struct ceph_mds_session *s)
>  {
>  	u32 op = le32_to_cpu(info->head->op);
>  
> @@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
>  	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
>  		return parse_reply_info_readdir(p, end, info, features);
>  	else if (op == CEPH_MDS_OP_CREATE)
> -		return parse_reply_info_create(p, end, info, features);
> +		return parse_reply_info_create(p, end, info, features, s);
>  	else
>  		return -EIO;
>  }
> @@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
>  /*
>   * parse entire mds reply
>   */
> -static int parse_reply_info(struct ceph_msg *msg,
> +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
>  			    struct ceph_mds_reply_info_parsed *info,
>  			    u64 features)
>  {
> @@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
>  	ceph_decode_32_safe(&p, end, len, bad);
>  	if (len > 0) {
>  		ceph_decode_need(&p, end, len, bad);
> -		err = parse_reply_info_extra(&p, p+len, info, features);
> +		err = parse_reply_info_extra(&p, p+len, info, features, s);
>  		if (err < 0)
>  			goto out_bad;
>  	}
> @@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
>  	if (refcount_dec_and_test(&s->s_ref)) {
>  		if (s->s_auth.authorizer)
>  			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
> +		xa_destroy(&s->s_delegated_inos);
>  		kfree(s);
>  	}
>  }
> @@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
>  	refcount_set(&s->s_ref, 1);
>  	INIT_LIST_HEAD(&s->s_waiting);
>  	INIT_LIST_HEAD(&s->s_unsafe);
> +	xa_init(&s->s_delegated_inos);
>  	s->s_num_cap_releases = 0;
>  	s->s_cap_reconnect = 0;
>  	s->s_cap_iterator = NULL;
> @@ -2975,9 +3077,9 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
>  	dout("handle_reply tid %lld result %d\n", tid, result);
>  	rinfo = &req->r_reply_info;
>  	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
> -		err = parse_reply_info(msg, rinfo, (u64)-1);
> +		err = parse_reply_info(session, msg, rinfo, (u64)-1);
>  	else
> -		err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
> +		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
>  	mutex_unlock(&mdsc->mutex);
>  
>  	mutex_lock(&session->s_mutex);
> @@ -3673,6 +3775,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
>  	if (!reply)
>  		goto fail_nomsg;
>  
> +	xa_destroy(&session->s_delegated_inos);
> +
>  	mutex_lock(&session->s_mutex);
>  	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
>  	session->s_seq = 0;
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index f10d342ea585..4c3b71707470 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -23,8 +23,9 @@ enum ceph_feature_type {
>  	CEPHFS_FEATURE_RECLAIM_CLIENT,
>  	CEPHFS_FEATURE_LAZY_CAP_WANTED,
>  	CEPHFS_FEATURE_MULTI_RECONNECT,
> +	CEPHFS_FEATURE_DELEG_INO,
>  
> -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
> +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
>  };
>  
>  /*
> @@ -37,6 +38,7 @@ enum ceph_feature_type {
>  	CEPHFS_FEATURE_REPLY_ENCODING,		\
>  	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
>  	CEPHFS_FEATURE_MULTI_RECONNECT,		\
> +	CEPHFS_FEATURE_DELEG_INO,		\
>  						\
>  	CEPHFS_FEATURE_MAX,			\
>  }
> @@ -201,6 +203,7 @@ struct ceph_mds_session {
>  
>  	struct list_head  s_waiting;  /* waiting requests */
>  	struct list_head  s_unsafe;   /* unsafe requests */
> +	struct xarray	  s_delegated_inos;
>  };
>  
>  /*
> @@ -542,6 +545,7 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
>  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>  			  struct ceph_mds_session *session,
>  			  int max_caps);
> +
>  static inline int ceph_wait_on_async_create(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -549,4 +553,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
>  	return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
>  			   TASK_INTERRUPTIBLE);
>  }
> +
> +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
> +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
>  #endif
> -- 
> 2.24.1
>
Jeff Layton March 5, 2020, 12:02 p.m. UTC | #2
On Thu, 2020-03-05 at 11:45 +0000, Luis Henriques wrote:
> On Mon, Mar 02, 2020 at 09:14:31AM -0500, Jeff Layton wrote:
> > Starting in Octopus, the MDS will hand out caps that allow the client
> > to do asynchronous file creates under certain conditions. As part of
> > that, the MDS will delegate ranges of inode numbers to the client.
> > 
> > Add the infrastructure to decode these ranges, and stuff them into an
> > xarray for later consumption by the async creation code.
> > 
> > Because the xarray code currently only handles unsigned long indexes,
> > and those are 32-bits on 32-bit arches, we only enable the decoding when
> > running on a 64-bit arch.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/mds_client.c | 122 +++++++++++++++++++++++++++++++++++++++----
> >  fs/ceph/mds_client.h |   9 +++-
> >  2 files changed, 121 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index db8304447f35..87f75d05b004 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -415,21 +415,121 @@ static int parse_reply_info_filelock(void **p, void *end,
> >  	return -EIO;
> >  }
> >  
> > +
> > +#if BITS_PER_LONG == 64
> > +
> > +#define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
> > +
> > +static int ceph_parse_deleg_inos(void **p, void *end,
> > +				 struct ceph_mds_session *s)
> > +{
> > +	u32 sets;
> > +
> > +	ceph_decode_32_safe(p, end, sets, bad);
> > +	dout("got %u sets of delegated inodes\n", sets);
> > +	while (sets--) {
> > +		u64 start, len, ino;
> > +
> > +		ceph_decode_64_safe(p, end, start, bad);
> > +		ceph_decode_64_safe(p, end, len, bad);
> > +		while (len--) {
> > +			int err = xa_insert(&s->s_delegated_inos, ino = start++,
> > +					    DELEGATED_INO_AVAILABLE,
> > +					    GFP_KERNEL);
> > +			if (!err) {
> > +				dout("added delegated inode 0x%llx\n",
> > +				     start - 1);
> > +			} else if (err == -EBUSY) {
> > +				pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
> > +					start - 1);
> > +			} else {
> > +				return err;
> > +			}
> > +		}
> > +	}
> > +	return 0;
> > +bad:
> > +	return -EIO;
> > +}
> > +
> > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > +{
> > +	unsigned long ino;
> > +	void *val;
> > +
> > +	xa_for_each(&s->s_delegated_inos, ino, val) {
> > +		val = xa_erase(&s->s_delegated_inos, ino);
> > +		if (val == DELEGATED_INO_AVAILABLE)
> > +			return ino;
> > +	}
> > +	return 0;
> > +}
> > +
> > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > +{
> > +	return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
> > +			 GFP_KERNEL);
> > +}
> > +#else /* BITS_PER_LONG == 64 */
> > +/*
> > + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
> > + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
> > + * and bottom words?
> > + */
> > +static int ceph_parse_deleg_inos(void **p, void *end,
> > +				 struct ceph_mds_session *s)
> > +{
> > +	u32 sets;
> > +
> > +	ceph_decode_32_safe(p, end, sets, bad);
> > +	if (sets)
> > +		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
> > +	return 0;
> > +bad:
> > +	return -EIO;
> > +}
> > +
> > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > +{
> > +	return 0;
> > +}
> > +
> > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > +{
> > +	return 0;
> > +}
> > +#endif /* BITS_PER_LONG == 64 */
> > +
> >  /*
> >   * parse create results
> >   */
> >  static int parse_reply_info_create(void **p, void *end,
> >  				  struct ceph_mds_reply_info_parsed *info,
> > -				  u64 features)
> > +				  u64 features, struct ceph_mds_session *s)
> >  {
> > +	int ret;
> > +
> >  	if (features == (u64)-1 ||
> >  	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
> > -		/* Malformed reply? */
> >  		if (*p == end) {
> > +			/* Malformed reply? */
> >  			info->has_create_ino = false;
> > -		} else {
> > +		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
> > +			u8 struct_v, struct_compat;
> > +			u32 len;
> > +
> >  			info->has_create_ino = true;
> > +			ceph_decode_8_safe(p, end, struct_v, bad);
> > +			ceph_decode_8_safe(p, end, struct_compat, bad);
> > +			ceph_decode_32_safe(p, end, len, bad);
> > +			ceph_decode_64_safe(p, end, info->ino, bad);
> 
> I've done a quick test in current 'testing' branch and it seems that it's
> currently broken.  A bisect identified this commit as 'bad' and it's
> failing at this point.
> 
> I'm running an old (a few weeks) 'master' vstart cluster, so I don't have
> the needed bits for using this DELEG_INO feature.  Running xfstest
> generic/001 results in:
> 
>    ceph: mds parse_reply err -5
>    ceph: mdsc_handle_reply got corrupt reply mds0(tid:9)
>    ...
> 
> s->s_features does include the CEPHFS_FEATURE_DELEG_INO bit set;
> 'features' is -1 (0xffffffffffffffff) and s->s_features is 0x3fff.  Maybe
> the issue is actually somewhere else (the cephfs feature handling code),
> but I'm still looking.
> 

From the patch that added this feature in userland ceph code (commit
2bcf4b62643b5):

--- a/src/mds/cephfs_features.h
+++ b/src/mds/cephfs_features.h
@@ -32,6 +32,7 @@
 #define CEPHFS_FEATURE_LAZY_CAP_WANTED  11
 #define CEPHFS_FEATURE_MULTI_RECONNECT  12
 #define CEPHFS_FEATURE_NAUTILUS         12
+#define CEPHFS_FEATURE_DELEG_INO        13
 #define CEPHFS_FEATURE_OCTOPUS          13
 
 #define CEPHFS_FEATURES_ALL {          \
@@ -45,6 +46,7 @@
   CEPHFS_FEATURE_LAZY_CAP_WANTED,      \
   CEPHFS_FEATURE_MULTI_RECONNECT,      \
   CEPHFS_FEATURE_NAUTILUS,              \
+  CEPHFS_FEATURE_DELEG_INO,             \
   CEPHFS_FEATURE_OCTOPUS,               \
 }

...this feature was added under the aegis of the
CEPHFS_FEATURE_DELEG_INO flag, but that bit is shared with
CEPHFS_FEATURE_OCTOPUS, which was already enabled in octopus before we
ever added it (back on April 1st 2019).

Any version of the MDS that has commit 49930ad8a3402 but does not have 
2bcf4b62643b5 will not work properly with newer kernels. Personally, I
don't see that as a problem per-se, as that should only be the case with
bleeding-edge MDS builds. Official releases should never see this issue.

Going forward, I think commit 49930ad8a3402 was probably a bad idea. We
really should not add "release" cephfs feature bits to the mask until
just before an official release, and should just make it alias the last
"real" feature bit. That should help ensure that we don't hit this
problem in the future.

> > +			ret = ceph_parse_deleg_inos(p, end, s);
> > +			if (ret)
> > +				return ret;
> > +		} else {
> > +			/* legacy */
> >  			ceph_decode_64_safe(p, end, info->ino, bad);
> > +			info->has_create_ino = true;
> >  		}
> >  	} else {
> >  		if (*p != end)
> > @@ -448,7 +548,7 @@ static int parse_reply_info_create(void **p, void *end,
> >   */
> >  static int parse_reply_info_extra(void **p, void *end,
> >  				  struct ceph_mds_reply_info_parsed *info,
> > -				  u64 features)
> > +				  u64 features, struct ceph_mds_session *s)
> >  {
> >  	u32 op = le32_to_cpu(info->head->op);
> >  
> > @@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
> >  	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
> >  		return parse_reply_info_readdir(p, end, info, features);
> >  	else if (op == CEPH_MDS_OP_CREATE)
> > -		return parse_reply_info_create(p, end, info, features);
> > +		return parse_reply_info_create(p, end, info, features, s);
> >  	else
> >  		return -EIO;
> >  }
> > @@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
> >  /*
> >   * parse entire mds reply
> >   */
> > -static int parse_reply_info(struct ceph_msg *msg,
> > +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> >  			    struct ceph_mds_reply_info_parsed *info,
> >  			    u64 features)
> >  {
> > @@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
> >  	ceph_decode_32_safe(&p, end, len, bad);
> >  	if (len > 0) {
> >  		ceph_decode_need(&p, end, len, bad);
> > -		err = parse_reply_info_extra(&p, p+len, info, features);
> > +		err = parse_reply_info_extra(&p, p+len, info, features, s);
> >  		if (err < 0)
> >  			goto out_bad;
> >  	}
> > @@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
> >  	if (refcount_dec_and_test(&s->s_ref)) {
> >  		if (s->s_auth.authorizer)
> >  			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
> > +		xa_destroy(&s->s_delegated_inos);
> >  		kfree(s);
> >  	}
> >  }
> > @@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
> >  	refcount_set(&s->s_ref, 1);
> >  	INIT_LIST_HEAD(&s->s_waiting);
> >  	INIT_LIST_HEAD(&s->s_unsafe);
> > +	xa_init(&s->s_delegated_inos);
> >  	s->s_num_cap_releases = 0;
> >  	s->s_cap_reconnect = 0;
> >  	s->s_cap_iterator = NULL;
> > @@ -2975,9 +3077,9 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
> >  	dout("handle_reply tid %lld result %d\n", tid, result);
> >  	rinfo = &req->r_reply_info;
> >  	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
> > -		err = parse_reply_info(msg, rinfo, (u64)-1);
> > +		err = parse_reply_info(session, msg, rinfo, (u64)-1);
> >  	else
> > -		err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
> > +		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
> >  	mutex_unlock(&mdsc->mutex);
> >  
> >  	mutex_lock(&session->s_mutex);
> > @@ -3673,6 +3775,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
> >  	if (!reply)
> >  		goto fail_nomsg;
> >  
> > +	xa_destroy(&session->s_delegated_inos);
> > +
> >  	mutex_lock(&session->s_mutex);
> >  	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
> >  	session->s_seq = 0;
> > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > index f10d342ea585..4c3b71707470 100644
> > --- a/fs/ceph/mds_client.h
> > +++ b/fs/ceph/mds_client.h
> > @@ -23,8 +23,9 @@ enum ceph_feature_type {
> >  	CEPHFS_FEATURE_RECLAIM_CLIENT,
> >  	CEPHFS_FEATURE_LAZY_CAP_WANTED,
> >  	CEPHFS_FEATURE_MULTI_RECONNECT,
> > +	CEPHFS_FEATURE_DELEG_INO,
> >  
> > -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
> > +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
> >  };
> >  
> >  /*
> > @@ -37,6 +38,7 @@ enum ceph_feature_type {
> >  	CEPHFS_FEATURE_REPLY_ENCODING,		\
> >  	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
> >  	CEPHFS_FEATURE_MULTI_RECONNECT,		\
> > +	CEPHFS_FEATURE_DELEG_INO,		\
> >  						\
> >  	CEPHFS_FEATURE_MAX,			\
> >  }
> > @@ -201,6 +203,7 @@ struct ceph_mds_session {
> >  
> >  	struct list_head  s_waiting;  /* waiting requests */
> >  	struct list_head  s_unsafe;   /* unsafe requests */
> > +	struct xarray	  s_delegated_inos;
> >  };
> >  
> >  /*
> > @@ -542,6 +545,7 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> >  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> >  			  struct ceph_mds_session *session,
> >  			  int max_caps);
> > +
> >  static inline int ceph_wait_on_async_create(struct inode *inode)
> >  {
> >  	struct ceph_inode_info *ci = ceph_inode(inode);
> > @@ -549,4 +553,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
> >  	return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> >  			   TASK_INTERRUPTIBLE);
> >  }
> > +
> > +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
> > +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
> >  #endif
> > -- 
> > 2.24.1
> >
Luis Henriques March 5, 2020, 12:20 p.m. UTC | #3
On Thu, Mar 05, 2020 at 07:02:59AM -0500, Jeff Layton wrote:
> On Thu, 2020-03-05 at 11:45 +0000, Luis Henriques wrote:
> > On Mon, Mar 02, 2020 at 09:14:31AM -0500, Jeff Layton wrote:
> > > Starting in Octopus, the MDS will hand out caps that allow the client
> > > to do asynchronous file creates under certain conditions. As part of
> > > that, the MDS will delegate ranges of inode numbers to the client.
> > > 
> > > Add the infrastructure to decode these ranges, and stuff them into an
> > > xarray for later consumption by the async creation code.
> > > 
> > > Because the xarray code currently only handles unsigned long indexes,
> > > and those are 32-bits on 32-bit arches, we only enable the decoding when
> > > running on a 64-bit arch.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/mds_client.c | 122 +++++++++++++++++++++++++++++++++++++++----
> > >  fs/ceph/mds_client.h |   9 +++-
> > >  2 files changed, 121 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index db8304447f35..87f75d05b004 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -415,21 +415,121 @@ static int parse_reply_info_filelock(void **p, void *end,
> > >  	return -EIO;
> > >  }
> > >  
> > > +
> > > +#if BITS_PER_LONG == 64
> > > +
> > > +#define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
> > > +
> > > +static int ceph_parse_deleg_inos(void **p, void *end,
> > > +				 struct ceph_mds_session *s)
> > > +{
> > > +	u32 sets;
> > > +
> > > +	ceph_decode_32_safe(p, end, sets, bad);
> > > +	dout("got %u sets of delegated inodes\n", sets);
> > > +	while (sets--) {
> > > +		u64 start, len, ino;
> > > +
> > > +		ceph_decode_64_safe(p, end, start, bad);
> > > +		ceph_decode_64_safe(p, end, len, bad);
> > > +		while (len--) {
> > > +			int err = xa_insert(&s->s_delegated_inos, ino = start++,
> > > +					    DELEGATED_INO_AVAILABLE,
> > > +					    GFP_KERNEL);
> > > +			if (!err) {
> > > +				dout("added delegated inode 0x%llx\n",
> > > +				     start - 1);
> > > +			} else if (err == -EBUSY) {
> > > +				pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
> > > +					start - 1);
> > > +			} else {
> > > +				return err;
> > > +			}
> > > +		}
> > > +	}
> > > +	return 0;
> > > +bad:
> > > +	return -EIO;
> > > +}
> > > +
> > > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > > +{
> > > +	unsigned long ino;
> > > +	void *val;
> > > +
> > > +	xa_for_each(&s->s_delegated_inos, ino, val) {
> > > +		val = xa_erase(&s->s_delegated_inos, ino);
> > > +		if (val == DELEGATED_INO_AVAILABLE)
> > > +			return ino;
> > > +	}
> > > +	return 0;
> > > +}
> > > +
> > > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > > +{
> > > +	return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
> > > +			 GFP_KERNEL);
> > > +}
> > > +#else /* BITS_PER_LONG == 64 */
> > > +/*
> > > + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
> > > + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
> > > + * and bottom words?
> > > + */
> > > +static int ceph_parse_deleg_inos(void **p, void *end,
> > > +				 struct ceph_mds_session *s)
> > > +{
> > > +	u32 sets;
> > > +
> > > +	ceph_decode_32_safe(p, end, sets, bad);
> > > +	if (sets)
> > > +		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
> > > +	return 0;
> > > +bad:
> > > +	return -EIO;
> > > +}
> > > +
> > > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > > +{
> > > +	return 0;
> > > +}
> > > +
> > > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > > +{
> > > +	return 0;
> > > +}
> > > +#endif /* BITS_PER_LONG == 64 */
> > > +
> > >  /*
> > >   * parse create results
> > >   */
> > >  static int parse_reply_info_create(void **p, void *end,
> > >  				  struct ceph_mds_reply_info_parsed *info,
> > > -				  u64 features)
> > > +				  u64 features, struct ceph_mds_session *s)
> > >  {
> > > +	int ret;
> > > +
> > >  	if (features == (u64)-1 ||
> > >  	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
> > > -		/* Malformed reply? */
> > >  		if (*p == end) {
> > > +			/* Malformed reply? */
> > >  			info->has_create_ino = false;
> > > -		} else {
> > > +		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
> > > +			u8 struct_v, struct_compat;
> > > +			u32 len;
> > > +
> > >  			info->has_create_ino = true;
> > > +			ceph_decode_8_safe(p, end, struct_v, bad);
> > > +			ceph_decode_8_safe(p, end, struct_compat, bad);
> > > +			ceph_decode_32_safe(p, end, len, bad);
> > > +			ceph_decode_64_safe(p, end, info->ino, bad);
> > 
> > I've done a quick test in current 'testing' branch and it seems that it's
> > currently broken.  A bisect identified this commit as 'bad' and it's
> > failing at this point.
> > 
> > I'm running an old (a few weeks) 'master' vstart cluster, so I don't have
> > the needed bits for using this DELEG_INO feature.  Running xfstest
> > generic/001 results in:
> > 
> >    ceph: mds parse_reply err -5
> >    ceph: mdsc_handle_reply got corrupt reply mds0(tid:9)
> >    ...
> > 
> > s->s_features does include the CEPHFS_FEATURE_DELEG_INO bit set;
> > 'features' is -1 (0xffffffffffffffff) and s->s_features is 0x3fff.  Maybe
> > the issue is actually somewhere else (the cephfs feature handling code),
> > but I'm still looking.
> > 
> 
> From the patch that added this feature in userland ceph code (commit
> 2bcf4b62643b5):
> 
> --- a/src/mds/cephfs_features.h
> +++ b/src/mds/cephfs_features.h
> @@ -32,6 +32,7 @@
>  #define CEPHFS_FEATURE_LAZY_CAP_WANTED  11
>  #define CEPHFS_FEATURE_MULTI_RECONNECT  12
>  #define CEPHFS_FEATURE_NAUTILUS         12
> +#define CEPHFS_FEATURE_DELEG_INO        13
>  #define CEPHFS_FEATURE_OCTOPUS          13
>  
>  #define CEPHFS_FEATURES_ALL {          \
> @@ -45,6 +46,7 @@
>    CEPHFS_FEATURE_LAZY_CAP_WANTED,      \
>    CEPHFS_FEATURE_MULTI_RECONNECT,      \
>    CEPHFS_FEATURE_NAUTILUS,              \
> +  CEPHFS_FEATURE_DELEG_INO,             \
>    CEPHFS_FEATURE_OCTOPUS,               \
>  }
> 
> ...this feature was added under the aegis of the
> CEPHFS_FEATURE_DELEG_INO flag, but that bit is shared with
> CEPHFS_FEATURE_OCTOPUS, which was already enabled in octopus before we
> ever added it (back on April 1st 2019).
> 
> Any version of the MDS that has commit 49930ad8a3402 but does not have 
> 2bcf4b62643b5 will not work properly with newer kernels. Personally, I
> don't see that as a problem per-se, as that should only be the case with
> bleeding-edge MDS builds. Official releases should never see this issue.
> 
> Going forward, I think commit 49930ad8a3402 was probably a bad idea. We
> really should not add "release" cephfs feature bits to the mask until
> just before an official release, and should just make it alias the last
> "real" feature bit. That should help ensure that we don't hit this
> problem in the future.

Doh!  Thanks a lot for figuring this out, Jeff!  You may have saved me a
few hours with this explanation!

Cheers,
--
Luís


> 
> > > +			ret = ceph_parse_deleg_inos(p, end, s);
> > > +			if (ret)
> > > +				return ret;
> > > +		} else {
> > > +			/* legacy */
> > >  			ceph_decode_64_safe(p, end, info->ino, bad);
> > > +			info->has_create_ino = true;
> > >  		}
> > >  	} else {
> > >  		if (*p != end)
> > > @@ -448,7 +548,7 @@ static int parse_reply_info_create(void **p, void *end,
> > >   */
> > >  static int parse_reply_info_extra(void **p, void *end,
> > >  				  struct ceph_mds_reply_info_parsed *info,
> > > -				  u64 features)
> > > +				  u64 features, struct ceph_mds_session *s)
> > >  {
> > >  	u32 op = le32_to_cpu(info->head->op);
> > >  
> > > @@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
> > >  	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
> > >  		return parse_reply_info_readdir(p, end, info, features);
> > >  	else if (op == CEPH_MDS_OP_CREATE)
> > > -		return parse_reply_info_create(p, end, info, features);
> > > +		return parse_reply_info_create(p, end, info, features, s);
> > >  	else
> > >  		return -EIO;
> > >  }
> > > @@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
> > >  /*
> > >   * parse entire mds reply
> > >   */
> > > -static int parse_reply_info(struct ceph_msg *msg,
> > > +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> > >  			    struct ceph_mds_reply_info_parsed *info,
> > >  			    u64 features)
> > >  {
> > > @@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
> > >  	ceph_decode_32_safe(&p, end, len, bad);
> > >  	if (len > 0) {
> > >  		ceph_decode_need(&p, end, len, bad);
> > > -		err = parse_reply_info_extra(&p, p+len, info, features);
> > > +		err = parse_reply_info_extra(&p, p+len, info, features, s);
> > >  		if (err < 0)
> > >  			goto out_bad;
> > >  	}
> > > @@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
> > >  	if (refcount_dec_and_test(&s->s_ref)) {
> > >  		if (s->s_auth.authorizer)
> > >  			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
> > > +		xa_destroy(&s->s_delegated_inos);
> > >  		kfree(s);
> > >  	}
> > >  }
> > > @@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
> > >  	refcount_set(&s->s_ref, 1);
> > >  	INIT_LIST_HEAD(&s->s_waiting);
> > >  	INIT_LIST_HEAD(&s->s_unsafe);
> > > +	xa_init(&s->s_delegated_inos);
> > >  	s->s_num_cap_releases = 0;
> > >  	s->s_cap_reconnect = 0;
> > >  	s->s_cap_iterator = NULL;
> > > @@ -2975,9 +3077,9 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
> > >  	dout("handle_reply tid %lld result %d\n", tid, result);
> > >  	rinfo = &req->r_reply_info;
> > >  	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
> > > -		err = parse_reply_info(msg, rinfo, (u64)-1);
> > > +		err = parse_reply_info(session, msg, rinfo, (u64)-1);
> > >  	else
> > > -		err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
> > > +		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
> > >  	mutex_unlock(&mdsc->mutex);
> > >  
> > >  	mutex_lock(&session->s_mutex);
> > > @@ -3673,6 +3775,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
> > >  	if (!reply)
> > >  		goto fail_nomsg;
> > >  
> > > +	xa_destroy(&session->s_delegated_inos);
> > > +
> > >  	mutex_lock(&session->s_mutex);
> > >  	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
> > >  	session->s_seq = 0;
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index f10d342ea585..4c3b71707470 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -23,8 +23,9 @@ enum ceph_feature_type {
> > >  	CEPHFS_FEATURE_RECLAIM_CLIENT,
> > >  	CEPHFS_FEATURE_LAZY_CAP_WANTED,
> > >  	CEPHFS_FEATURE_MULTI_RECONNECT,
> > > +	CEPHFS_FEATURE_DELEG_INO,
> > >  
> > > -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
> > > +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
> > >  };
> > >  
> > >  /*
> > > @@ -37,6 +38,7 @@ enum ceph_feature_type {
> > >  	CEPHFS_FEATURE_REPLY_ENCODING,		\
> > >  	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
> > >  	CEPHFS_FEATURE_MULTI_RECONNECT,		\
> > > +	CEPHFS_FEATURE_DELEG_INO,		\
> > >  						\
> > >  	CEPHFS_FEATURE_MAX,			\
> > >  }
> > > @@ -201,6 +203,7 @@ struct ceph_mds_session {
> > >  
> > >  	struct list_head  s_waiting;  /* waiting requests */
> > >  	struct list_head  s_unsafe;   /* unsafe requests */
> > > +	struct xarray	  s_delegated_inos;
> > >  };
> > >  
> > >  /*
> > > @@ -542,6 +545,7 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> > >  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > >  			  struct ceph_mds_session *session,
> > >  			  int max_caps);
> > > +
> > >  static inline int ceph_wait_on_async_create(struct inode *inode)
> > >  {
> > >  	struct ceph_inode_info *ci = ceph_inode(inode);
> > > @@ -549,4 +553,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
> > >  	return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> > >  			   TASK_INTERRUPTIBLE);
> > >  }
> > > +
> > > +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
> > > +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
> > >  #endif
> > > -- 
> > > 2.24.1
> > > 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
>
Ilya Dryomov March 5, 2020, 1:36 p.m. UTC | #4
On Thu, Mar 5, 2020 at 1:03 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2020-03-05 at 11:45 +0000, Luis Henriques wrote:
> > On Mon, Mar 02, 2020 at 09:14:31AM -0500, Jeff Layton wrote:
> > > Starting in Octopus, the MDS will hand out caps that allow the client
> > > to do asynchronous file creates under certain conditions. As part of
> > > that, the MDS will delegate ranges of inode numbers to the client.
> > >
> > > Add the infrastructure to decode these ranges, and stuff them into an
> > > xarray for later consumption by the async creation code.
> > >
> > > Because the xarray code currently only handles unsigned long indexes,
> > > and those are 32-bits on 32-bit arches, we only enable the decoding when
> > > running on a 64-bit arch.
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/mds_client.c | 122 +++++++++++++++++++++++++++++++++++++++----
> > >  fs/ceph/mds_client.h |   9 +++-
> > >  2 files changed, 121 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index db8304447f35..87f75d05b004 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -415,21 +415,121 @@ static int parse_reply_info_filelock(void **p, void *end,
> > >     return -EIO;
> > >  }
> > >
> > > +
> > > +#if BITS_PER_LONG == 64
> > > +
> > > +#define DELEGATED_INO_AVAILABLE            xa_mk_value(1)
> > > +
> > > +static int ceph_parse_deleg_inos(void **p, void *end,
> > > +                            struct ceph_mds_session *s)
> > > +{
> > > +   u32 sets;
> > > +
> > > +   ceph_decode_32_safe(p, end, sets, bad);
> > > +   dout("got %u sets of delegated inodes\n", sets);
> > > +   while (sets--) {
> > > +           u64 start, len, ino;
> > > +
> > > +           ceph_decode_64_safe(p, end, start, bad);
> > > +           ceph_decode_64_safe(p, end, len, bad);
> > > +           while (len--) {
> > > +                   int err = xa_insert(&s->s_delegated_inos, ino = start++,
> > > +                                       DELEGATED_INO_AVAILABLE,
> > > +                                       GFP_KERNEL);
> > > +                   if (!err) {
> > > +                           dout("added delegated inode 0x%llx\n",
> > > +                                start - 1);
> > > +                   } else if (err == -EBUSY) {
> > > +                           pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
> > > +                                   start - 1);
> > > +                   } else {
> > > +                           return err;
> > > +                   }
> > > +           }
> > > +   }
> > > +   return 0;
> > > +bad:
> > > +   return -EIO;
> > > +}
> > > +
> > > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > > +{
> > > +   unsigned long ino;
> > > +   void *val;
> > > +
> > > +   xa_for_each(&s->s_delegated_inos, ino, val) {
> > > +           val = xa_erase(&s->s_delegated_inos, ino);
> > > +           if (val == DELEGATED_INO_AVAILABLE)
> > > +                   return ino;
> > > +   }
> > > +   return 0;
> > > +}
> > > +
> > > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > > +{
> > > +   return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
> > > +                    GFP_KERNEL);
> > > +}
> > > +#else /* BITS_PER_LONG == 64 */
> > > +/*
> > > + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
> > > + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
> > > + * and bottom words?
> > > + */
> > > +static int ceph_parse_deleg_inos(void **p, void *end,
> > > +                            struct ceph_mds_session *s)
> > > +{
> > > +   u32 sets;
> > > +
> > > +   ceph_decode_32_safe(p, end, sets, bad);
> > > +   if (sets)
> > > +           ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
> > > +   return 0;
> > > +bad:
> > > +   return -EIO;
> > > +}
> > > +
> > > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > > +{
> > > +   return 0;
> > > +}
> > > +
> > > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > > +{
> > > +   return 0;
> > > +}
> > > +#endif /* BITS_PER_LONG == 64 */
> > > +
> > >  /*
> > >   * parse create results
> > >   */
> > >  static int parse_reply_info_create(void **p, void *end,
> > >                               struct ceph_mds_reply_info_parsed *info,
> > > -                             u64 features)
> > > +                             u64 features, struct ceph_mds_session *s)
> > >  {
> > > +   int ret;
> > > +
> > >     if (features == (u64)-1 ||
> > >         (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
> > > -           /* Malformed reply? */
> > >             if (*p == end) {
> > > +                   /* Malformed reply? */
> > >                     info->has_create_ino = false;
> > > -           } else {
> > > +           } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
> > > +                   u8 struct_v, struct_compat;
> > > +                   u32 len;
> > > +
> > >                     info->has_create_ino = true;
> > > +                   ceph_decode_8_safe(p, end, struct_v, bad);
> > > +                   ceph_decode_8_safe(p, end, struct_compat, bad);
> > > +                   ceph_decode_32_safe(p, end, len, bad);
> > > +                   ceph_decode_64_safe(p, end, info->ino, bad);
> >
> > I've done a quick test in current 'testing' branch and it seems that it's
> > currently broken.  A bisect identified this commit as 'bad' and it's
> > failing at this point.
> >
> > I'm running an old (a few weeks) 'master' vstart cluster, so I don't have
> > the needed bits for using this DELEG_INO feature.  Running xfstest
> > generic/001 results in:
> >
> >    ceph: mds parse_reply err -5
> >    ceph: mdsc_handle_reply got corrupt reply mds0(tid:9)
> >    ...
> >
> > s->s_features does include the CEPHFS_FEATURE_DELEG_INO bit set;
> > 'features' is -1 (0xffffffffffffffff) and s->s_features is 0x3fff.  Maybe
> > the issue is actually somewhere else (the cephfs feature handling code),
> > but I'm still looking.
> >
>
> From the patch that added this feature in userland ceph code (commit
> 2bcf4b62643b5):
>
> --- a/src/mds/cephfs_features.h
> +++ b/src/mds/cephfs_features.h
> @@ -32,6 +32,7 @@
>  #define CEPHFS_FEATURE_LAZY_CAP_WANTED  11
>  #define CEPHFS_FEATURE_MULTI_RECONNECT  12
>  #define CEPHFS_FEATURE_NAUTILUS         12
> +#define CEPHFS_FEATURE_DELEG_INO        13
>  #define CEPHFS_FEATURE_OCTOPUS          13
>
>  #define CEPHFS_FEATURES_ALL {          \
> @@ -45,6 +46,7 @@
>    CEPHFS_FEATURE_LAZY_CAP_WANTED,      \
>    CEPHFS_FEATURE_MULTI_RECONNECT,      \
>    CEPHFS_FEATURE_NAUTILUS,              \
> +  CEPHFS_FEATURE_DELEG_INO,             \
>    CEPHFS_FEATURE_OCTOPUS,               \
>  }
>
> ...this feature was added under the aegis of the
> CEPHFS_FEATURE_DELEG_INO flag, but that bit is shared with
> CEPHFS_FEATURE_OCTOPUS, which was already enabled in octopus before we
> ever added it (back on April 1st 2019).
>
> Any version of the MDS that has commit 49930ad8a3402 but does not have
> 2bcf4b62643b5 will not work properly with newer kernels. Personally, I
> don't see that as a problem per-se, as that should only be the case with
> bleeding-edge MDS builds. Official releases should never see this issue.
>
> Going forward, I think commit 49930ad8a3402 was probably a bad idea. We
> really should not add "release" cephfs feature bits to the mask until
> just before an official release, and should just make it alias the last
> "real" feature bit. That should help ensure that we don't hit this
> problem in the future.

We should avoid "release" feature bits altogether, as discussed in
the last CDM.  They appeared because we were running out of free bits
for RADOS features (a 64-bit field).  CephFS features are encoded in
a bit vector that can grow as needed.

Feature masks for groups of related feature bits make sense, but they
should be handled at a higher level.  E.g. a set of feature bits that
a client should support in order for inline data to work properly that
we flip to required if the user runs "ceph fs set <fsname> inline_data
true".  The actual feature bits in the bit vector should all be
distinct -- no overlaps.

Thanks,

                Ilya
Jeff Layton March 5, 2020, 1:44 p.m. UTC | #5
On Thu, 2020-03-05 at 14:36 +0100, Ilya Dryomov wrote:
> On Thu, Mar 5, 2020 at 1:03 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Thu, 2020-03-05 at 11:45 +0000, Luis Henriques wrote:
> > > On Mon, Mar 02, 2020 at 09:14:31AM -0500, Jeff Layton wrote:
> > > > Starting in Octopus, the MDS will hand out caps that allow the client
> > > > to do asynchronous file creates under certain conditions. As part of
> > > > that, the MDS will delegate ranges of inode numbers to the client.
> > > > 
> > > > Add the infrastructure to decode these ranges, and stuff them into an
> > > > xarray for later consumption by the async creation code.
> > > > 
> > > > Because the xarray code currently only handles unsigned long indexes,
> > > > and those are 32-bits on 32-bit arches, we only enable the decoding when
> > > > running on a 64-bit arch.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/mds_client.c | 122 +++++++++++++++++++++++++++++++++++++++----
> > > >  fs/ceph/mds_client.h |   9 +++-
> > > >  2 files changed, 121 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index db8304447f35..87f75d05b004 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -415,21 +415,121 @@ static int parse_reply_info_filelock(void **p, void *end,
> > > >     return -EIO;
> > > >  }
> > > > 
> > > > +
> > > > +#if BITS_PER_LONG == 64
> > > > +
> > > > +#define DELEGATED_INO_AVAILABLE            xa_mk_value(1)
> > > > +
> > > > +static int ceph_parse_deleg_inos(void **p, void *end,
> > > > +                            struct ceph_mds_session *s)
> > > > +{
> > > > +   u32 sets;
> > > > +
> > > > +   ceph_decode_32_safe(p, end, sets, bad);
> > > > +   dout("got %u sets of delegated inodes\n", sets);
> > > > +   while (sets--) {
> > > > +           u64 start, len, ino;
> > > > +
> > > > +           ceph_decode_64_safe(p, end, start, bad);
> > > > +           ceph_decode_64_safe(p, end, len, bad);
> > > > +           while (len--) {
> > > > +                   int err = xa_insert(&s->s_delegated_inos, ino = start++,
> > > > +                                       DELEGATED_INO_AVAILABLE,
> > > > +                                       GFP_KERNEL);
> > > > +                   if (!err) {
> > > > +                           dout("added delegated inode 0x%llx\n",
> > > > +                                start - 1);
> > > > +                   } else if (err == -EBUSY) {
> > > > +                           pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
> > > > +                                   start - 1);
> > > > +                   } else {
> > > > +                           return err;
> > > > +                   }
> > > > +           }
> > > > +   }
> > > > +   return 0;
> > > > +bad:
> > > > +   return -EIO;
> > > > +}
> > > > +
> > > > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > > > +{
> > > > +   unsigned long ino;
> > > > +   void *val;
> > > > +
> > > > +   xa_for_each(&s->s_delegated_inos, ino, val) {
> > > > +           val = xa_erase(&s->s_delegated_inos, ino);
> > > > +           if (val == DELEGATED_INO_AVAILABLE)
> > > > +                   return ino;
> > > > +   }
> > > > +   return 0;
> > > > +}
> > > > +
> > > > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > > > +{
> > > > +   return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
> > > > +                    GFP_KERNEL);
> > > > +}
> > > > +#else /* BITS_PER_LONG == 64 */
> > > > +/*
> > > > + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
> > > > + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
> > > > + * and bottom words?
> > > > + */
> > > > +static int ceph_parse_deleg_inos(void **p, void *end,
> > > > +                            struct ceph_mds_session *s)
> > > > +{
> > > > +   u32 sets;
> > > > +
> > > > +   ceph_decode_32_safe(p, end, sets, bad);
> > > > +   if (sets)
> > > > +           ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
> > > > +   return 0;
> > > > +bad:
> > > > +   return -EIO;
> > > > +}
> > > > +
> > > > +u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > > > +{
> > > > +   return 0;
> > > > +}
> > > > +
> > > > +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
> > > > +{
> > > > +   return 0;
> > > > +}
> > > > +#endif /* BITS_PER_LONG == 64 */
> > > > +
> > > >  /*
> > > >   * parse create results
> > > >   */
> > > >  static int parse_reply_info_create(void **p, void *end,
> > > >                               struct ceph_mds_reply_info_parsed *info,
> > > > -                             u64 features)
> > > > +                             u64 features, struct ceph_mds_session *s)
> > > >  {
> > > > +   int ret;
> > > > +
> > > >     if (features == (u64)-1 ||
> > > >         (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
> > > > -           /* Malformed reply? */
> > > >             if (*p == end) {
> > > > +                   /* Malformed reply? */
> > > >                     info->has_create_ino = false;
> > > > -           } else {
> > > > +           } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
> > > > +                   u8 struct_v, struct_compat;
> > > > +                   u32 len;
> > > > +
> > > >                     info->has_create_ino = true;
> > > > +                   ceph_decode_8_safe(p, end, struct_v, bad);
> > > > +                   ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > +                   ceph_decode_32_safe(p, end, len, bad);
> > > > +                   ceph_decode_64_safe(p, end, info->ino, bad);
> > > 
> > > I've done a quick test in current 'testing' branch and it seems that it's
> > > currently broken.  A bisect identified this commit as 'bad' and it's
> > > failing at this point.
> > > 
> > > I'm running an old (a few weeks) 'master' vstart cluster, so I don't have
> > > the needed bits for using this DELEG_INO feature.  Running xfstest
> > > generic/001 results in:
> > > 
> > >    ceph: mds parse_reply err -5
> > >    ceph: mdsc_handle_reply got corrupt reply mds0(tid:9)
> > >    ...
> > > 
> > > s->s_features does include the CEPHFS_FEATURE_DELEG_INO bit set;
> > > 'features' is -1 (0xffffffffffffffff) and s->s_features is 0x3fff.  Maybe
> > > the issue is actually somewhere else (the cephfs feature handling code),
> > > but I'm still looking.
> > > 
> > 
> > From the patch that added this feature in userland ceph code (commit
> > 2bcf4b62643b5):
> > 
> > --- a/src/mds/cephfs_features.h
> > +++ b/src/mds/cephfs_features.h
> > @@ -32,6 +32,7 @@
> >  #define CEPHFS_FEATURE_LAZY_CAP_WANTED  11
> >  #define CEPHFS_FEATURE_MULTI_RECONNECT  12
> >  #define CEPHFS_FEATURE_NAUTILUS         12
> > +#define CEPHFS_FEATURE_DELEG_INO        13
> >  #define CEPHFS_FEATURE_OCTOPUS          13
> > 
> >  #define CEPHFS_FEATURES_ALL {          \
> > @@ -45,6 +46,7 @@
> >    CEPHFS_FEATURE_LAZY_CAP_WANTED,      \
> >    CEPHFS_FEATURE_MULTI_RECONNECT,      \
> >    CEPHFS_FEATURE_NAUTILUS,              \
> > +  CEPHFS_FEATURE_DELEG_INO,             \
> >    CEPHFS_FEATURE_OCTOPUS,               \
> >  }
> > 
> > ...this feature was added under the aegis of the
> > CEPHFS_FEATURE_DELEG_INO flag, but that bit is shared with
> > CEPHFS_FEATURE_OCTOPUS, which was already enabled in octopus before we
> > ever added it (back on April 1st 2019).
> > 
> > Any version of the MDS that has commit 49930ad8a3402 but does not have
> > 2bcf4b62643b5 will not work properly with newer kernels. Personally, I
> > don't see that as a problem per-se, as that should only be the case with
> > bleeding-edge MDS builds. Official releases should never see this issue.
> > 
> > Going forward, I think commit 49930ad8a3402 was probably a bad idea. We
> > really should not add "release" cephfs feature bits to the mask until
> > just before an official release, and should just make it alias the last
> > "real" feature bit. That should help ensure that we don't hit this
> > problem in the future.
> 
> We should avoid "release" feature bits altogether, as discussed in
> the last CDM.  They appeared because we were running out of free bits
> for RADOS features (a 64-bit field).  CephFS features are encoded in
> a bit vector that can grow as needed.
> 
> Feature masks for groups of related feature bits make sense, but they
> should be handled at a higher level.  E.g. a set of feature bits that
> a client should support in order for inline data to work properly that
> we flip to required if the user runs "ceph fs set <fsname> inline_data
> true".  The actual feature bits in the bit vector should all be
> distinct -- no overlaps.
> 

Yes, sorry, that was what we discussed, and that sounds fine to me.

The only reason those tags exist at all is that userland ceph has a
"min_compat_client" setting that disallows clients that don't support
release "X" from mounting.

I don't recall whether we had a plan to remove that setting from the
userland ceph code. Did we? It seems like if we're moving away from
declaring a particular release feature bit, then we should deprecate
min_compat_client.
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index db8304447f35..87f75d05b004 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -415,21 +415,121 @@  static int parse_reply_info_filelock(void **p, void *end,
 	return -EIO;
 }
 
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+				 struct ceph_mds_session *s)
+{
+	u32 sets;
+
+	ceph_decode_32_safe(p, end, sets, bad);
+	dout("got %u sets of delegated inodes\n", sets);
+	while (sets--) {
+		u64 start, len, ino;
+
+		ceph_decode_64_safe(p, end, start, bad);
+		ceph_decode_64_safe(p, end, len, bad);
+		while (len--) {
+			int err = xa_insert(&s->s_delegated_inos, ino = start++,
+					    DELEGATED_INO_AVAILABLE,
+					    GFP_KERNEL);
+			if (!err) {
+				dout("added delegated inode 0x%llx\n",
+				     start - 1);
+			} else if (err == -EBUSY) {
+				pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+					start - 1);
+			} else {
+				return err;
+			}
+		}
+	}
+	return 0;
+bad:
+	return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+	unsigned long ino;
+	void *val;
+
+	xa_for_each(&s->s_delegated_inos, ino, val) {
+		val = xa_erase(&s->s_delegated_inos, ino);
+		if (val == DELEGATED_INO_AVAILABLE)
+			return ino;
+	}
+	return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+	return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+			 GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+				 struct ceph_mds_session *s)
+{
+	u32 sets;
+
+	ceph_decode_32_safe(p, end, sets, bad);
+	if (sets)
+		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+	return 0;
+bad:
+	return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+	return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+	return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
+
 /*
  * parse create results
  */
 static int parse_reply_info_create(void **p, void *end,
 				  struct ceph_mds_reply_info_parsed *info,
-				  u64 features)
+				  u64 features, struct ceph_mds_session *s)
 {
+	int ret;
+
 	if (features == (u64)-1 ||
 	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
-		/* Malformed reply? */
 		if (*p == end) {
+			/* Malformed reply? */
 			info->has_create_ino = false;
-		} else {
+		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
+			u8 struct_v, struct_compat;
+			u32 len;
+
 			info->has_create_ino = true;
+			ceph_decode_8_safe(p, end, struct_v, bad);
+			ceph_decode_8_safe(p, end, struct_compat, bad);
+			ceph_decode_32_safe(p, end, len, bad);
+			ceph_decode_64_safe(p, end, info->ino, bad);
+			ret = ceph_parse_deleg_inos(p, end, s);
+			if (ret)
+				return ret;
+		} else {
+			/* legacy */
 			ceph_decode_64_safe(p, end, info->ino, bad);
+			info->has_create_ino = true;
 		}
 	} else {
 		if (*p != end)
@@ -448,7 +548,7 @@  static int parse_reply_info_create(void **p, void *end,
  */
 static int parse_reply_info_extra(void **p, void *end,
 				  struct ceph_mds_reply_info_parsed *info,
-				  u64 features)
+				  u64 features, struct ceph_mds_session *s)
 {
 	u32 op = le32_to_cpu(info->head->op);
 
@@ -457,7 +557,7 @@  static int parse_reply_info_extra(void **p, void *end,
 	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
 		return parse_reply_info_readdir(p, end, info, features);
 	else if (op == CEPH_MDS_OP_CREATE)
-		return parse_reply_info_create(p, end, info, features);
+		return parse_reply_info_create(p, end, info, features, s);
 	else
 		return -EIO;
 }
@@ -465,7 +565,7 @@  static int parse_reply_info_extra(void **p, void *end,
 /*
  * parse entire mds reply
  */
-static int parse_reply_info(struct ceph_msg *msg,
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
 			    struct ceph_mds_reply_info_parsed *info,
 			    u64 features)
 {
@@ -490,7 +590,7 @@  static int parse_reply_info(struct ceph_msg *msg,
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
 		ceph_decode_need(&p, end, len, bad);
-		err = parse_reply_info_extra(&p, p+len, info, features);
+		err = parse_reply_info_extra(&p, p+len, info, features, s);
 		if (err < 0)
 			goto out_bad;
 	}
@@ -558,6 +658,7 @@  void ceph_put_mds_session(struct ceph_mds_session *s)
 	if (refcount_dec_and_test(&s->s_ref)) {
 		if (s->s_auth.authorizer)
 			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+		xa_destroy(&s->s_delegated_inos);
 		kfree(s);
 	}
 }
@@ -645,6 +746,7 @@  static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	refcount_set(&s->s_ref, 1);
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
+	xa_init(&s->s_delegated_inos);
 	s->s_num_cap_releases = 0;
 	s->s_cap_reconnect = 0;
 	s->s_cap_iterator = NULL;
@@ -2975,9 +3077,9 @@  static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	dout("handle_reply tid %lld result %d\n", tid, result);
 	rinfo = &req->r_reply_info;
 	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
-		err = parse_reply_info(msg, rinfo, (u64)-1);
+		err = parse_reply_info(session, msg, rinfo, (u64)-1);
 	else
-		err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
 	mutex_unlock(&mdsc->mutex);
 
 	mutex_lock(&session->s_mutex);
@@ -3673,6 +3775,8 @@  static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	if (!reply)
 		goto fail_nomsg;
 
+	xa_destroy(&session->s_delegated_inos);
+
 	mutex_lock(&session->s_mutex);
 	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
 	session->s_seq = 0;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f10d342ea585..4c3b71707470 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -23,8 +23,9 @@  enum ceph_feature_type {
 	CEPHFS_FEATURE_RECLAIM_CLIENT,
 	CEPHFS_FEATURE_LAZY_CAP_WANTED,
 	CEPHFS_FEATURE_MULTI_RECONNECT,
+	CEPHFS_FEATURE_DELEG_INO,
 
-	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
 };
 
 /*
@@ -37,6 +38,7 @@  enum ceph_feature_type {
 	CEPHFS_FEATURE_REPLY_ENCODING,		\
 	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
 	CEPHFS_FEATURE_MULTI_RECONNECT,		\
+	CEPHFS_FEATURE_DELEG_INO,		\
 						\
 	CEPHFS_FEATURE_MAX,			\
 }
@@ -201,6 +203,7 @@  struct ceph_mds_session {
 
 	struct list_head  s_waiting;  /* waiting requests */
 	struct list_head  s_unsafe;   /* unsafe requests */
+	struct xarray	  s_delegated_inos;
 };
 
 /*
@@ -542,6 +545,7 @@  extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
 extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
 			  struct ceph_mds_session *session,
 			  int max_caps);
+
 static inline int ceph_wait_on_async_create(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -549,4 +553,7 @@  static inline int ceph_wait_on_async_create(struct inode *inode)
 	return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
 			   TASK_INTERRUPTIBLE);
 }
+
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
 #endif