
[3/5] libceph: crush_location infrastructure

Message ID 20200529151952.15184-4-idryomov@gmail.com (mailing list archive)
State New, archived
Series libceph: support for replica reads

Commit Message

Ilya Dryomov May 29, 2020, 3:19 p.m. UTC
Allow expressing the client's location in terms of the CRUSH hierarchy
as a set of (bucket type name, bucket name) pairs.  The userspace
syntax "crush_location = key1=value1 key2=value2" is incompatible with
mount options and needed adaptation:

- ':' separator
- one key:value pair per crush_location option
- crush_location options are combined together

So for:

  crush_location = host=foo rack=bar

one would write:

  crush_location=host:foo,crush_location=rack:bar

As in userspace, "multipath" locations are supported, so indicating
locality for parallel hierarchies is possible:

  crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar
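
For example, a complete mount command line using this syntax (the
monitor address, mount point and user name are hypothetical) might
look like:

  mount -t ceph 192.168.0.1:/ /mnt/cephfs -o name=admin,crush_location=host:foo,crush_location=rack:bar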

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |   1 +
 include/linux/ceph/osdmap.h  |  16 ++++-
 net/ceph/ceph_common.c       |  25 ++++++++
 net/ceph/osdmap.c            | 114 +++++++++++++++++++++++++++++++++++
 4 files changed, 155 insertions(+), 1 deletion(-)

Comments

Jeff Layton May 29, 2020, 5:27 p.m. UTC | #1
On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote:
> Allow expressing the client's location in terms of the CRUSH hierarchy
> as a set of (bucket type name, bucket name) pairs.  The userspace
> syntax "crush_location = key1=value1 key2=value2" is incompatible with
> mount options and needed adaptation:
> 
> - ':' separator
> - one key:value pair per crush_location option
> - crush_location options are combined together
> 
> So for:
> 
>   crush_location = host=foo rack=bar
> 
> one would write:
> 
>   crush_location=host:foo,crush_location=rack:bar
> 
> As in userspace, "multipath" locations are supported, so indicating
> locality for parallel hierarchies is possible:
> 
>   crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar
> 

Blech, that syntax is hideous. It's also problematic in that the options
are additive -- you can't override an option that was given earlier
(e.g. in fstab), or in a shell script.

Is it not possible to do something with a single crush_location= option?
Maybe:

    crush_location=rack:foo1/rack:foo2/datacenter:bar

It's still ugly with the embedded '=' signs, but it would at least make
it so that the options aren't additive.
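
For instance (illustrative option strings only): with the additive
form, a location inherited from fstab can only be combined with, or
rejected as an exact duplicate of, a later one, whereas a single
option could follow the usual last-one-wins convention:

    crush_location=rack:foo,crush_location=rack:bar           # additive: both pairs end up set
    crush_location=rack:foo/dc:bar,crush_location=rack:baz    # single option: the later value could win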


Ilya Dryomov May 29, 2020, 6:38 p.m. UTC | #2
On Fri, May 29, 2020 at 7:27 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> [...]
>
> Blech, that syntax is hideous. It's also problematic in that the options
> are additive -- you can't override an option that was given earlier
> (e.g. in fstab), or in a shell script.
>
> Is it not possible to do something with a single crush_location= option?
> Maybe:
>
>     crush_location=rack:foo1/rack:foo2/datacenter:bar
>
> It's still ugly with the embedded '=' signs, but it would at least make
> it so that the options aren't additive.

I suppose we could do something like that at the cost of more
parsing boilerplate, but I'm not sure additive options are that
hideous.  I don't think additive options are unprecedented and
more importantly I think many simple boolean and integer options
are not properly overridable even in major filesystems.

What embedded '=' signs are you referring to?  I see ':' and '/'
in your suggested syntax.

Thanks,

                Ilya
Jeff Layton May 29, 2020, 7:10 p.m. UTC | #3
On Fri, 2020-05-29 at 20:38 +0200, Ilya Dryomov wrote:
> [...]
>
> I suppose we could do something like that at the cost of more
> parsing boilerplate, but I'm not sure additive options are that
> hideous.  I don't think additive options are unprecedented and
> more importantly I think many simple boolean and integer options
> are not properly overridable even in major filesystems.
> 

That is the long-standing convention though. There are reasons to
deviate from it, but I don't see it here. Plus, I think the syntax I
proposed above is more readable (and compact) as well.

It would mean a bit more parsing code though, granted.

> What embedded '=' signs are you referring to?  I see ':' and '/'
> in your suggested syntax.
> 

Sorry, yeah... I had originally done one that had '=' chars in it, but
converted it to the above. Please disregard that paragraph.
Ilya Dryomov May 29, 2020, 8:42 p.m. UTC | #4
On Fri, May 29, 2020 at 9:10 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Fri, 2020-05-29 at 20:38 +0200, Ilya Dryomov wrote:
> > [...]
> >
> > I suppose we could do something like that at the cost of more
> > parsing boilerplate, but I'm not sure additive options are that
> > hideous.  I don't think additive options are unprecedented and
> > more importantly I think many simple boolean and integer options
> > are not properly overridable even in major filesystems.
> >
>
> That is the long-standing convention though. There are reasons to
> deviate from it, but I don't see it here. Plus, I think the syntax I
> proposed above is more readable (and compact) as well.
>
> It would mean a bit more parsing code though, granted.

One of the reasons I did it this way is that crush_location is
inherently additive.  I don't have a strong opinion on this though
so let's adhere to the convention.

I'll implement the suggested syntax and repost.

Thanks,

                Ilya
Ilya Dryomov May 30, 2020, 3:47 p.m. UTC | #5
On Fri, May 29, 2020 at 10:42 PM Ilya Dryomov <idryomov@gmail.com> wrote:
>
> [...]
>
> One of the reasons I did it this way is that crush_location is
> inherently additive.  I don't have a strong opinion on this though
> so let's adhere to the convention.
>
> I'll implement the suggested syntax and repost.

I went with '|' instead of '/' for the separator to try to stress
the additivity (in the OR sense).  '/' makes it look like a path to
the root of the tree, which it really isn't.
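
Roughly, the extra parsing just splits the single option on '|' and
feeds each pair to the existing ceph_parse_crush_loc().  A minimal
sketch (the helper name is illustrative, not necessarily what I'll
post):

static int parse_crush_location_list(const char *str, struct rb_root *locs)
{
	char *copy, *p, *pair;
	int err = 0;

	/* strsep() modifies the string, so work on a copy */
	copy = kstrdup(str, GFP_KERNEL);
	if (!copy)
		return -ENOMEM;

	p = copy;
	while ((pair = strsep(&p, "|")) != NULL) {
		if (!*pair)
			err = -EINVAL;	/* empty element, e.g. "a:b||c:d" */
		else
			err = ceph_parse_crush_loc(pair, locs);
		if (err)
			break;
	}

	kfree(copy);
	return err;
}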

Thanks,

                Ilya

Patch

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 4b5a47bcaba4..4733959f1ec7 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -64,6 +64,7 @@ struct ceph_options {
 	int num_mon;
 	char *name;
 	struct ceph_crypto_key *key;
+	struct rb_root crush_locs;
 };
 
 /*
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5e601975745f..ef8619ad1401 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 			      const struct ceph_pg *raw_pgid);
 
+struct crush_loc {
+	char *cl_type_name;
+	char *cl_name;
+};
+
+struct crush_loc_node {
+	struct rb_node cl_node;
+	struct crush_loc cl_loc;  /* pointers into cl_data */
+	char cl_data[];
+};
+
+int ceph_parse_crush_loc(const char *str, struct rb_root *locs);
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
+void ceph_clear_crush_locs(struct rb_root *locs);
+
 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
 						    u64 id);
-
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
 u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a0e97f6c1072..6d495685ee03 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt,
 		}
 	}
 
+	ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
+	if (ret)
+		return ret;
+
 	/* any matching mon ip implies a match */
 	for (i = 0; i < opt1->num_mon; i++) {
 		if (ceph_monmap_contains(client->monc.monmap,
@@ -260,6 +264,7 @@ enum {
 	Opt_secret,
 	Opt_key,
 	Opt_ip,
+	Opt_crush_location,
 	/* string args above */
 	Opt_share,
 	Opt_crc,
@@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
 	fsparam_flag_no ("cephx_sign_messages",		Opt_cephx_sign_messages),
 	fsparam_flag_no ("crc",				Opt_crc),
+	fsparam_string	("crush_location",		Opt_crush_location),
 	fsparam_string	("fsid",			Opt_fsid),
 	fsparam_string	("ip",				Opt_ip),
 	fsparam_string	("key",				Opt_key),
@@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void)
 	if (!opt)
 		return NULL;
 
+	opt->crush_locs = RB_ROOT;
 	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
 				GFP_KERNEL);
 	if (!opt->mon_addr) {
@@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt)
 	if (!opt)
 		return;
 
+	ceph_clear_crush_locs(&opt->crush_locs);
 	kfree(opt->name);
 	if (opt->key) {
 		ceph_crypto_key_destroy(opt->key);
@@ -454,6 +462,14 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 		if (!opt->key)
 			return -ENOMEM;
 		return get_secret(opt->key, param->string, &log);
+	case Opt_crush_location:
+		err = ceph_parse_crush_loc(param->string, &opt->crush_locs);
+		if (err) {
+			error_plog(&log, "Failed to parse crush location: %d",
+				   err);
+			return err;
+		}
+		break;
 
 	case Opt_osdtimeout:
 		warn_plog(&log, "Ignoring osdtimeout");
@@ -536,6 +552,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 {
 	struct ceph_options *opt = client->options;
 	size_t pos = m->count;
+	struct rb_node *n;
 
 	if (opt->name) {
 		seq_puts(m, "name=");
@@ -545,6 +562,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 	if (opt->key)
 		seq_puts(m, "secret=<hidden>,");
 
+	for (n = rb_first(&opt->crush_locs); n; n = rb_next(n)) {
+		struct crush_loc_node *loc =
+		    rb_entry(n, struct crush_loc_node, cl_node);
+
+		seq_printf(m, "crush_location=%s:%s,",
+			   loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
+	}
+
 	if (opt->flags & CEPH_OPT_FSID)
 		seq_printf(m, "fsid=%pU,", &opt->fsid);
 	if (opt->flags & CEPH_OPT_NOSHARE)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index e74130876d3a..995cdb8b559e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2715,3 +2715,117 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 	return acting.primary;
 }
 EXPORT_SYMBOL(ceph_pg_to_acting_primary);
+
+static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
+					      size_t name_len)
+{
+	struct crush_loc_node *loc;
+
+	loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
+	if (!loc)
+		return NULL;
+
+	RB_CLEAR_NODE(&loc->cl_node);
+	return loc;
+}
+
+static void free_crush_loc(struct crush_loc_node *loc)
+{
+	WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
+
+	kfree(loc);
+}
+
+static int crush_loc_compare(const struct crush_loc *loc1,
+			     const struct crush_loc *loc2)
+{
+	return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
+	       strcmp(loc1->cl_name, loc2->cl_name);
+}
+
+DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
+		 RB_BYPTR, const struct crush_loc *, cl_node)
+
+/*
+ * A <bucket type name>:<bucket name> pair, e.g. "zone:us-east".
+ */
+int ceph_parse_crush_loc(const char *str, struct rb_root *locs)
+{
+	struct crush_loc_node *loc;
+	const char *type_name, *name;
+	size_t type_name_len, name_len;
+
+	type_name = str;
+	str = strchrnul(str, ':');
+	if (*str == '\0')
+		return -EINVAL;  /* no ':' */
+
+	type_name_len = str - type_name;
+	if (type_name_len == 0)
+		return -EINVAL;
+
+	name = ++str;
+	str = strchrnul(str, ':');
+	if (*str != '\0')
+		return -EINVAL;  /* another ':' */
+
+	name_len = str - name;
+	if (name_len == 0)
+		return -EINVAL;
+
+	loc = alloc_crush_loc(type_name_len, name_len);
+	if (!loc)
+		return -ENOMEM;
+
+	loc->cl_loc.cl_type_name = loc->cl_data;
+	memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
+	loc->cl_loc.cl_type_name[type_name_len] = '\0';
+
+	loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
+	memcpy(loc->cl_loc.cl_name, name, name_len);
+	loc->cl_loc.cl_name[name_len] = '\0';
+
+	if (!__insert_crush_loc(locs, loc)) {
+		free_crush_loc(loc);
+		return -EEXIST;
+	}
+
+	dout("%s type_name '%s' name '%s'\n", __func__,
+	     loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
+	return 0;
+}
+
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
+{
+	struct rb_node *n1 = rb_first(locs1);
+	struct rb_node *n2 = rb_first(locs2);
+	int ret;
+
+	for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
+		struct crush_loc_node *loc1 =
+		    rb_entry(n1, struct crush_loc_node, cl_node);
+		struct crush_loc_node *loc2 =
+		    rb_entry(n2, struct crush_loc_node, cl_node);
+
+		ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
+		if (ret)
+			return ret;
+	}
+
+	if (!n1 && n2)
+		return -1;
+	if (n1 && !n2)
+		return 1;
+	return 0;
+}
+
+void ceph_clear_crush_locs(struct rb_root *locs)
+{
+	while (!RB_EMPTY_ROOT(locs)) {
+		struct crush_loc_node *loc =
+		    rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
+
+		erase_crush_loc(locs, loc);
+		free_crush_loc(loc);
+	}
+}