diff mbox

[2/9] libceph: support crush tunables

Message ID 1342831308-18815-3-git-send-email-sage@inktank.com (mailing list archive)
State New, archived
Headers show

Commit Message

Sage Weil July 21, 2012, 12:41 a.m. UTC
From: caleb miles <caleb.miles@inktank.com>

The server side recently added support for tuning some magic
crush variables. Decode these variables if they are present, or use the
default values if they are not present.

Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.

Signed-off-by: caleb miles <caleb.miles@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/ceph_features.h |    4 ++-
 include/linux/crush/crush.h        |    8 +++++++
 net/ceph/crush/mapper.c            |   13 ++++++-----
 net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 7 deletions(-)

Comments

Yehuda Sadeh July 24, 2012, 10:24 p.m. UTC | #1
On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> From: caleb miles <caleb.miles@inktank.com>
>
> The server side recently added support for tuning some magic
> crush variables. Decode these variables if they are present, or use the
> default values if they are not present.
>
> Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
>
> Signed-off-by: caleb miles <caleb.miles@inktank.com>
> Reviewed-by: Sage Weil <sage@inktank.com>
> ---
>  include/linux/ceph/ceph_features.h |    4 ++-
>  include/linux/crush/crush.h        |    8 +++++++
>  net/ceph/crush/mapper.c            |   13 ++++++-----
>  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 57 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> index 342f93d..df25dcf 100644
> --- a/include/linux/ceph/ceph_features.h
> +++ b/include/linux/ceph/ceph_features.h
> @@ -12,12 +12,14 @@
>  #define CEPH_FEATURE_MONNAMES       (1<<5)
>  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
>  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)

any reason why this is 18 and not 8?

>
>  /*
>   * Features supported.
>   */
>  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> -       (CEPH_FEATURE_NOSRCADDR)
> +       (CEPH_FEATURE_NOSRCADDR |        \
> +        CEPH_FEATURE_CRUSH_TUNABLES)
>
>  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
>         (CEPH_FEATURE_NOSRCADDR)
> diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> index 7c47508..25baa28 100644
> --- a/include/linux/crush/crush.h
> +++ b/include/linux/crush/crush.h
> @@ -154,6 +154,14 @@ struct crush_map {
>         __s32 max_buckets;
>         __u32 max_rules;
>         __s32 max_devices;
> +
> +       /* choose local retries before re-descent */
> +       __u32 choose_local_tries;
> +       /* choose local attempts using a fallback permutation before
> +        * re-descent */
> +       __u32 choose_local_fallback_tries;
> +       /* choose attempts before giving up */
> +       __u32 choose_total_tries;
>  };
>
>
> diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> index d7edc24..35fce75 100644
> --- a/net/ceph/crush/mapper.c
> +++ b/net/ceph/crush/mapper.c
> @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
>         int item = 0;
>         int itemtype;
>         int collide, reject;
> -       const unsigned int orig_tries = 5; /* attempts before we fall back to search */
>
>         dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
>                 bucket->id, x, outpos, numrep);
> @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
>                                         reject = 1;
>                                         goto reject;
>                                 }
> -                               if (flocal >= (in->size>>1) &&
> -                                   flocal > orig_tries)
> +                               if (map->choose_local_fallback_tries > 0 &&
> +                                   flocal >= (in->size>>1) &&
> +                                   flocal > map->choose_local_fallback_tries)

is flocal right here or should it be ftotal?

>                                         item = bucket_perm_choose(in, x, r);
>                                 else
>                                         item = crush_bucket_choose(in, x, r);
> @@ -422,13 +422,14 @@ reject:
>                                         ftotal++;
>                                         flocal++;
>
> -                                       if (collide && flocal < 3)
> +                                       if (collide && flocal <= map->choose_local_tries)
>                                                 /* retry locally a few times */
>                                                 retry_bucket = 1;
> -                                       else if (flocal <= in->size + orig_tries)
> +                                       else if (map->choose_local_fallback_tries > 0 &&
> +                                                flocal <= in->size + map->choose_local_fallback_tries)
>                                                 /* exhaustive bucket search */
>                                                 retry_bucket = 1;
> -                                       else if (ftotal < 20)
> +                                       else if (ftotal <= map->choose_total_tries)
>                                                 /* then retry descent */
>                                                 retry_descent = 1;
>                                         else
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 9600674..3124b71 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -135,6 +135,21 @@ bad:
>         return -EINVAL;
>  }
>
> +static int skip_name_map(void **p, void *end)
> +{
> +        int len;
> +        ceph_decode_32_safe(p, end, len ,bad);
> +        while (len--) {
> +                int strlen;
use u32 for strlen

> +                *p += sizeof(u32);
> +                ceph_decode_32_safe(p, end, strlen, bad);
> +                *p += strlen;
> +}
> +        return 0;
> +bad:
> +        return -EINVAL;
> +}
> +
>  static struct crush_map *crush_decode(void *pbyval, void *end)
>  {
>         struct crush_map *c;
> @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>         void **p = &pbyval;
>         void *start = pbyval;
>         u32 magic;
> +       u32 num_name_maps;
>
>         dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
>
> @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>         if (c == NULL)
>                 return ERR_PTR(-ENOMEM);
>
> +        /* set tunables to default values */
> +        c->choose_local_tries = 2;
> +        c->choose_local_fallback_tries = 5;
> +        c->choose_total_tries = 19;
> +
>         ceph_decode_need(p, end, 4*sizeof(u32), bad);
>         magic = ceph_decode_32(p);
>         if (magic != CRUSH_MAGIC) {
> @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>         }
>
>         /* ignore trailing name maps. */
> +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> +                err = skip_name_map(p, end);
> +                if (err < 0)
> +                        goto done;
> +        }
> +
> +        /* tunables */
> +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> +        c->choose_local_tries = ceph_decode_32(p);
> +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> +        c->choose_total_tries = ceph_decode_32(p);
> +        dout("crush decode tunable choose_local_tries = %d",
> +             c->choose_local_tries);
> +        dout("crush decode tunable choose_local_fallback_tries = %d",
> +             c->choose_local_fallback_tries);
> +        dout("crush decode tunable choose_total_tries = %d",
> +             c->choose_total_tries);
>
> +done:
>         dout("crush_decode success\n");
>         return c;
>
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder July 30, 2012, 6:36 p.m. UTC | #2
On 07/20/2012 07:41 PM, Sage Weil wrote:
> From: caleb miles <caleb.miles@inktank.com>
> 
> The server side recently added support for tuning some magic
> crush variables. Decode these variables if they are present, or use the
> default values if they are not present.
> 
> Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> 
> Signed-off-by: caleb miles <caleb.miles@inktank.com>
> Reviewed-by: Sage Weil <sage@inktank.com>

You know I prefer symbolic constant definitions instead of
bald numeric constants in the middle of code.  I.e.:

    #define CRUSH_LOCAL_TRIES_DEFAULT		2
    #define CRUSH_LOCAL_FALLBACK_TRIES_DEFAULT	5
    #define CRUSH_TOTAL_TRIES_DEFAULT		19

But it's OK with me if this gets committed as-is.

Reviewed-by: Alex Elder <elder@inktank.com>

> ---
>  include/linux/ceph/ceph_features.h |    4 ++-
>  include/linux/crush/crush.h        |    8 +++++++
>  net/ceph/crush/mapper.c            |   13 ++++++-----
>  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 57 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> index 342f93d..df25dcf 100644
> --- a/include/linux/ceph/ceph_features.h
> +++ b/include/linux/ceph/ceph_features.h
> @@ -12,12 +12,14 @@
>  #define CEPH_FEATURE_MONNAMES       (1<<5)
>  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
>  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
>  
>  /*
>   * Features supported.
>   */
>  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> -	(CEPH_FEATURE_NOSRCADDR)
> +	(CEPH_FEATURE_NOSRCADDR |	 \
> +	 CEPH_FEATURE_CRUSH_TUNABLES)
>  
>  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
>  	(CEPH_FEATURE_NOSRCADDR)
> diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> index 7c47508..25baa28 100644
> --- a/include/linux/crush/crush.h
> +++ b/include/linux/crush/crush.h
> @@ -154,6 +154,14 @@ struct crush_map {
>  	__s32 max_buckets;
>  	__u32 max_rules;
>  	__s32 max_devices;
> +
> +	/* choose local retries before re-descent */
> +	__u32 choose_local_tries;
> +	/* choose local attempts using a fallback permutation before
> +	 * re-descent */
> +	__u32 choose_local_fallback_tries;
> +	/* choose attempts before giving up */ 
> +	__u32 choose_total_tries;
>  };
>  
>  
> diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> index d7edc24..35fce75 100644
> --- a/net/ceph/crush/mapper.c
> +++ b/net/ceph/crush/mapper.c
> @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
>  	int item = 0;
>  	int itemtype;
>  	int collide, reject;
> -	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
>  
>  	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
>  		bucket->id, x, outpos, numrep);
> @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
>  					reject = 1;
>  					goto reject;
>  				}
> -				if (flocal >= (in->size>>1) &&
> -				    flocal > orig_tries)
> +				if (map->choose_local_fallback_tries > 0 &&
> +				    flocal >= (in->size>>1) &&
> +				    flocal > map->choose_local_fallback_tries)
>  					item = bucket_perm_choose(in, x, r);
>  				else
>  					item = crush_bucket_choose(in, x, r);
> @@ -422,13 +422,14 @@ reject:
>  					ftotal++;
>  					flocal++;
>  
> -					if (collide && flocal < 3)
> +					if (collide && flocal <= map->choose_local_tries)
>  						/* retry locally a few times */
>  						retry_bucket = 1;
> -					else if (flocal <= in->size + orig_tries)
> +					else if (map->choose_local_fallback_tries > 0 &&
> +						 flocal <= in->size + map->choose_local_fallback_tries)
>  						/* exhaustive bucket search */
>  						retry_bucket = 1;
> -					else if (ftotal < 20)
> +					else if (ftotal <= map->choose_total_tries)
>  						/* then retry descent */
>  						retry_descent = 1;
>  					else
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 9600674..3124b71 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -135,6 +135,21 @@ bad:
>  	return -EINVAL;
>  }
>  
> +static int skip_name_map(void **p, void *end)
> +{
> +        int len;
> +        ceph_decode_32_safe(p, end, len ,bad);
> +        while (len--) {
> +                int strlen;
> +                *p += sizeof(u32);
> +                ceph_decode_32_safe(p, end, strlen, bad);
> +                *p += strlen;
> +}
> +        return 0;
> +bad:
> +        return -EINVAL;
> +}
> +
>  static struct crush_map *crush_decode(void *pbyval, void *end)
>  {
>  	struct crush_map *c;
> @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	void **p = &pbyval;
>  	void *start = pbyval;
>  	u32 magic;
> +	u32 num_name_maps;
>  
>  	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
>  
> @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	if (c == NULL)
>  		return ERR_PTR(-ENOMEM);
>  
> +        /* set tunables to default values */
> +        c->choose_local_tries = 2;
> +        c->choose_local_fallback_tries = 5;
> +        c->choose_total_tries = 19;
> +
>  	ceph_decode_need(p, end, 4*sizeof(u32), bad);
>  	magic = ceph_decode_32(p);
>  	if (magic != CRUSH_MAGIC) {
> @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	}
>  
>  	/* ignore trailing name maps. */
> +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> +                err = skip_name_map(p, end);
> +                if (err < 0)
> +                        goto done;
> +        }
> +
> +        /* tunables */
> +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> +        c->choose_local_tries = ceph_decode_32(p);
> +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> +        c->choose_total_tries = ceph_decode_32(p);
> +        dout("crush decode tunable choose_local_tries = %d",
> +             c->choose_local_tries);
> +        dout("crush decode tunable choose_local_fallback_tries = %d",
> +             c->choose_local_fallback_tries);
> +        dout("crush decode tunable choose_total_tries = %d",
> +             c->choose_total_tries);
>  
> +done:
>  	dout("crush_decode success\n");
>  	return c;
>  
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil July 30, 2012, 11:14 p.m. UTC | #3
On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> > From: caleb miles <caleb.miles@inktank.com>
> >
> > The server side recently added support for tuning some magic
> > crush variables. Decode these variables if they are present, or use the
> > default values if they are not present.
> >
> > Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> >
> > Signed-off-by: caleb miles <caleb.miles@inktank.com>
> > Reviewed-by: Sage Weil <sage@inktank.com>
> > ---
> >  include/linux/ceph/ceph_features.h |    4 ++-
> >  include/linux/crush/crush.h        |    8 +++++++
> >  net/ceph/crush/mapper.c            |   13 ++++++-----
> >  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
> >  4 files changed, 57 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> > index 342f93d..df25dcf 100644
> > --- a/include/linux/ceph/ceph_features.h
> > +++ b/include/linux/ceph/ceph_features.h
> > @@ -12,12 +12,14 @@
> >  #define CEPH_FEATURE_MONNAMES       (1<<5)
> >  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> >  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
> 
> any reason why this is 18 and not 8?

9-17 are used.. just not implemented/used by the kernel code.

> 
> >
> >  /*
> >   * Features supported.
> >   */
> >  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> > -       (CEPH_FEATURE_NOSRCADDR)
> > +       (CEPH_FEATURE_NOSRCADDR |        \
> > +        CEPH_FEATURE_CRUSH_TUNABLES)
> >
> >  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
> >         (CEPH_FEATURE_NOSRCADDR)
> > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> > index 7c47508..25baa28 100644
> > --- a/include/linux/crush/crush.h
> > +++ b/include/linux/crush/crush.h
> > @@ -154,6 +154,14 @@ struct crush_map {
> >         __s32 max_buckets;
> >         __u32 max_rules;
> >         __s32 max_devices;
> > +
> > +       /* choose local retries before re-descent */
> > +       __u32 choose_local_tries;
> > +       /* choose local attempts using a fallback permutation before
> > +        * re-descent */
> > +       __u32 choose_local_fallback_tries;
> > +       /* choose attempts before giving up */
> > +       __u32 choose_total_tries;
> >  };
> >
> >
> > diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> > index d7edc24..35fce75 100644
> > --- a/net/ceph/crush/mapper.c
> > +++ b/net/ceph/crush/mapper.c
> > @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
> >         int item = 0;
> >         int itemtype;
> >         int collide, reject;
> > -       const unsigned int orig_tries = 5; /* attempts before we fall back to search */
> >
> >         dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
> >                 bucket->id, x, outpos, numrep);
> > @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
> >                                         reject = 1;
> >                                         goto reject;
> >                                 }
> > -                               if (flocal >= (in->size>>1) &&
> > -                                   flocal > orig_tries)
> > +                               if (map->choose_local_fallback_tries > 0 &&
> > +                                   flocal >= (in->size>>1) &&
> > +                                   flocal > map->choose_local_fallback_tries)
> 
> is flocal right here or should it be ftotal?
> 
> >                                         item = bucket_perm_choose(in, x, r);
> >                                 else
> >                                         item = crush_bucket_choose(in, x, r);
> > @@ -422,13 +422,14 @@ reject:
> >                                         ftotal++;
> >                                         flocal++;
> >
> > -                                       if (collide && flocal < 3)
> > +                                       if (collide && flocal <= map->choose_local_tries)
> >                                                 /* retry locally a few times */
> >                                                 retry_bucket = 1;
> > -                                       else if (flocal <= in->size + orig_tries)
> > +                                       else if (map->choose_local_fallback_tries > 0 &&
> > +                                                flocal <= in->size + map->choose_local_fallback_tries)
> >                                                 /* exhaustive bucket search */
> >                                                 retry_bucket = 1;
> > -                                       else if (ftotal < 20)
> > +                                       else if (ftotal <= map->choose_total_tries)
> >                                                 /* then retry descent */
> >                                                 retry_descent = 1;
> >                                         else
> > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> > index 9600674..3124b71 100644
> > --- a/net/ceph/osdmap.c
> > +++ b/net/ceph/osdmap.c
> > @@ -135,6 +135,21 @@ bad:
> >         return -EINVAL;
> >  }
> >
> > +static int skip_name_map(void **p, void *end)
> > +{
> > +        int len;
> > +        ceph_decode_32_safe(p, end, len ,bad);
> > +        while (len--) {
> > +                int strlen;
> use u32 for strlen
> 
> > +                *p += sizeof(u32);
> > +                ceph_decode_32_safe(p, end, strlen, bad);
> > +                *p += strlen;
> > +}
> > +        return 0;
> > +bad:
> > +        return -EINVAL;
> > +}
> > +
> >  static struct crush_map *crush_decode(void *pbyval, void *end)
> >  {
> >         struct crush_map *c;
> > @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         void **p = &pbyval;
> >         void *start = pbyval;
> >         u32 magic;
> > +       u32 num_name_maps;
> >
> >         dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
> >
> > @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         if (c == NULL)
> >                 return ERR_PTR(-ENOMEM);
> >
> > +        /* set tunables to default values */
> > +        c->choose_local_tries = 2;
> > +        c->choose_local_fallback_tries = 5;
> > +        c->choose_total_tries = 19;
> > +
> >         ceph_decode_need(p, end, 4*sizeof(u32), bad);
> >         magic = ceph_decode_32(p);
> >         if (magic != CRUSH_MAGIC) {
> > @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         }
> >
> >         /* ignore trailing name maps. */
> > +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> > +                err = skip_name_map(p, end);
> > +                if (err < 0)
> > +                        goto done;
> > +        }
> > +
> > +        /* tunables */
> > +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> > +        c->choose_local_tries = ceph_decode_32(p);
> > +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> > +        c->choose_total_tries = ceph_decode_32(p);
> > +        dout("crush decode tunable choose_local_tries = %d",
> > +             c->choose_local_tries);
> > +        dout("crush decode tunable choose_local_fallback_tries = %d",
> > +             c->choose_local_fallback_tries);
> > +        dout("crush decode tunable choose_total_tries = %d",
> > +             c->choose_total_tries);
> >
> > +done:
> >         dout("crush_decode success\n");
> >         return c;
> >
> > --
> > 1.7.9
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yehuda Sadeh July 30, 2012, 11:45 p.m. UTC | #4
On Mon, Jul 30, 2012 at 4:14 PM, Sage Weil <sage@inktank.com> wrote:
>
> On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> > On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> > > From: caleb miles <caleb.miles@inktank.com>
> > >
> > > The server side recently added support for tuning some magic
> > > crush variables. Decode these variables if they are present, or use
> > > the
> > > default values if they are not present.
> > >
> > > Corresponds to ceph.git commit
> > > 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> > >
> > > Signed-off-by: caleb miles <caleb.miles@inktank.com>
> > > Reviewed-by: Sage Weil <sage@inktank.com>
> > > ---
> > >  include/linux/ceph/ceph_features.h |    4 ++-
> > >  include/linux/crush/crush.h        |    8 +++++++
> > >  net/ceph/crush/mapper.c            |   13 ++++++-----
> > >  net/ceph/osdmap.c                  |   39
> > > ++++++++++++++++++++++++++++++++++++
> > >  4 files changed, 57 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/include/linux/ceph/ceph_features.h
> > > b/include/linux/ceph/ceph_features.h
> > > index 342f93d..df25dcf 100644
> > > --- a/include/linux/ceph/ceph_features.h
> > > +++ b/include/linux/ceph/ceph_features.h
> > > @@ -12,12 +12,14 @@
> > >  #define CEPH_FEATURE_MONNAMES       (1<<5)
> > >  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> > >  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> > > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
> >
> > any reason why this is 18 and not 8?
>
> 9-17 are used.. just not implemented/used by the kernel code.
>

Maybe mention it somewhere in a comment, to avoid future confusion?
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 342f93d..df25dcf 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,12 +12,14 @@ 
 #define CEPH_FEATURE_MONNAMES       (1<<5)
 #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
+#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
 
 /*
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
-	(CEPH_FEATURE_NOSRCADDR)
+	(CEPH_FEATURE_NOSRCADDR |	 \
+	 CEPH_FEATURE_CRUSH_TUNABLES)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR)
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 7c47508..25baa28 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -154,6 +154,14 @@  struct crush_map {
 	__s32 max_buckets;
 	__u32 max_rules;
 	__s32 max_devices;
+
+	/* choose local retries before re-descent */
+	__u32 choose_local_tries;
+	/* choose local attempts using a fallback permutation before
+	 * re-descent */
+	__u32 choose_local_fallback_tries;
+	/* choose attempts before giving up */ 
+	__u32 choose_total_tries;
 };
 
 
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index d7edc24..35fce75 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,6 @@  static int crush_choose(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
-	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 		bucket->id, x, outpos, numrep);
@@ -351,8 +350,9 @@  static int crush_choose(const struct crush_map *map,
 					reject = 1;
 					goto reject;
 				}
-				if (flocal >= (in->size>>1) &&
-				    flocal > orig_tries)
+				if (map->choose_local_fallback_tries > 0 &&
+				    flocal >= (in->size>>1) &&
+				    flocal > map->choose_local_fallback_tries)
 					item = bucket_perm_choose(in, x, r);
 				else
 					item = crush_bucket_choose(in, x, r);
@@ -422,13 +422,14 @@  reject:
 					ftotal++;
 					flocal++;
 
-					if (collide && flocal < 3)
+					if (collide && flocal <= map->choose_local_tries)
 						/* retry locally a few times */
 						retry_bucket = 1;
-					else if (flocal <= in->size + orig_tries)
+					else if (map->choose_local_fallback_tries > 0 &&
+						 flocal <= in->size + map->choose_local_fallback_tries)
 						/* exhaustive bucket search */
 						retry_bucket = 1;
-					else if (ftotal < 20)
+					else if (ftotal <= map->choose_total_tries)
 						/* then retry descent */
 						retry_descent = 1;
 					else
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9600674..3124b71 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -135,6 +135,21 @@  bad:
 	return -EINVAL;
 }
 
+static int skip_name_map(void **p, void *end)
+{
+        int len;
+        ceph_decode_32_safe(p, end, len ,bad);
+        while (len--) {
+                int strlen;
+                *p += sizeof(u32);
+                ceph_decode_32_safe(p, end, strlen, bad);
+                *p += strlen;
+}
+        return 0;
+bad:
+        return -EINVAL;
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
 	struct crush_map *c;
@@ -143,6 +158,7 @@  static struct crush_map *crush_decode(void *pbyval, void *end)
 	void **p = &pbyval;
 	void *start = pbyval;
 	u32 magic;
+	u32 num_name_maps;
 
 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -150,6 +166,11 @@  static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+        /* set tunables to default values */
+        c->choose_local_tries = 2;
+        c->choose_local_fallback_tries = 5;
+        c->choose_total_tries = 19;
+
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
 	magic = ceph_decode_32(p);
 	if (magic != CRUSH_MAGIC) {
@@ -297,7 +318,25 @@  static struct crush_map *crush_decode(void *pbyval, void *end)
 	}
 
 	/* ignore trailing name maps. */
+        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
+                err = skip_name_map(p, end);
+                if (err < 0)
+                        goto done;
+        }
+
+        /* tunables */
+        ceph_decode_need(p, end, 3*sizeof(u32), done);
+        c->choose_local_tries = ceph_decode_32(p);
+        c->choose_local_fallback_tries =  ceph_decode_32(p);
+        c->choose_total_tries = ceph_decode_32(p);
+        dout("crush decode tunable choose_local_tries = %d",
+             c->choose_local_tries);
+        dout("crush decode tunable choose_local_fallback_tries = %d",
+             c->choose_local_fallback_tries);
+        dout("crush decode tunable choose_total_tries = %d",
+             c->choose_total_tries);
 
+done:
 	dout("crush_decode success\n");
 	return c;