diff mbox series

[v2,2/5] libceph: decode CRUSH device/bucket types and names

Message ID 20200530153439.31312-3-idryomov@gmail.com (mailing list archive)
State New, archived
Headers show
Series libceph: support for replica reads | expand

Commit Message

Ilya Dryomov May 30, 2020, 3:34 p.m. UTC
These would be matched with the provided client location to calculate
the locality value.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/crush/crush.h |  6 +++
 net/ceph/crush/crush.c      |  3 ++
 net/ceph/osdmap.c           | 85 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 92 insertions(+), 2 deletions(-)

Comments

Jeff Layton June 1, 2020, 10:49 a.m. UTC | #1
On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> These would be matched with the provided client location to calculate
> the locality value.
> 
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
>  include/linux/crush/crush.h |  6 +++
>  net/ceph/crush/crush.c      |  3 ++
>  net/ceph/osdmap.c           | 85 ++++++++++++++++++++++++++++++++++++-
>  3 files changed, 92 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> index 38b0e4d50ed9..29b0de2e202b 100644
> --- a/include/linux/crush/crush.h
> +++ b/include/linux/crush/crush.h
> @@ -301,6 +301,12 @@ struct crush_map {
>  
>  	__u32 *choose_tries;
>  #else
> +	/* device/bucket type id -> type name (CrushWrapper::type_map) */
> +	struct rb_root type_names;
> +
> +	/* device/bucket id -> name (CrushWrapper::name_map) */
> +	struct rb_root names;
> +

I'm not as familiar with the OSD client-side code, but I don't see any
mention of locking here. What protects these new rbtrees? From reading
over the code, I'd assume the osdc->lock, but it might be nice to have a
comment here that makes that clear.

>  	/* CrushWrapper::choose_args */
>  	struct rb_root choose_args;
>  #endif
> diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
> index 3d70244bc1b6..2e6b29fa8518 100644
> --- a/net/ceph/crush/crush.c
> +++ b/net/ceph/crush/crush.c
> @@ -2,6 +2,7 @@
>  #ifdef __KERNEL__
>  # include <linux/slab.h>
>  # include <linux/crush/crush.h>
> +void clear_crush_names(struct rb_root *root);
>  void clear_choose_args(struct crush_map *c);
>  #else
>  # include "crush_compat.h"
> @@ -130,6 +131,8 @@ void crush_destroy(struct crush_map *map)
>  #ifndef __KERNEL__
>  	kfree(map->choose_tries);
>  #else
> +	clear_crush_names(&map->type_names);
> +	clear_crush_names(&map->names);
>  	clear_choose_args(map);
>  #endif
>  	kfree(map);
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 5d00ce2b5339..e74130876d3a 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -138,6 +138,79 @@ static int crush_decode_straw2_bucket(void **p, void *end,
>  	return -EINVAL;
>  }
>  
> +struct crush_name_node {
> +	struct rb_node cn_node;
> +	int cn_id;
> +	char cn_name[];
> +};
> +
> +static struct crush_name_node *alloc_crush_name(size_t name_len)
> +{
> +	struct crush_name_node *cn;
> +
> +	cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
> +	if (!cn)
> +		return NULL;
> +
> +	RB_CLEAR_NODE(&cn->cn_node);
> +	return cn;
> +}
> +
> +static void free_crush_name(struct crush_name_node *cn)
> +{
> +	WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
> +
> +	kfree(cn);
> +}
> +
> +DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
> +
> +static int decode_crush_names(void **p, void *end, struct rb_root *root)
> +{
> +	u32 n;
> +
> +	ceph_decode_32_safe(p, end, n, e_inval);
> +	while (n--) {
> +		struct crush_name_node *cn;
> +		int id;
> +		u32 name_len;
> +
> +		ceph_decode_32_safe(p, end, id, e_inval);
> +		ceph_decode_32_safe(p, end, name_len, e_inval);
> +		ceph_decode_need(p, end, name_len, e_inval);
> +
> +		cn = alloc_crush_name(name_len);
> +		if (!cn)
> +			return -ENOMEM;
> +
> +		cn->cn_id = id;
> +		memcpy(cn->cn_name, *p, name_len);
> +		cn->cn_name[name_len] = '\0';
> +		*p += name_len;
> +
> +		if (!__insert_crush_name(root, cn)) {
> +			free_crush_name(cn);
> +			return -EEXIST;
> +		}
> +	}
> +
> +	return 0;
> +
> +e_inval:
> +	return -EINVAL;
> +}
> +
> +void clear_crush_names(struct rb_root *root)
> +{
> +	while (!RB_EMPTY_ROOT(root)) {
> +		struct crush_name_node *cn =
> +		    rb_entry(rb_first(root), struct crush_name_node, cn_node);
> +
> +		erase_crush_name(root, cn);
> +		free_crush_name(cn);
> +	}
> +}
> +
>  static struct crush_choose_arg_map *alloc_choose_arg_map(void)
>  {
>  	struct crush_choose_arg_map *arg_map;
> @@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	if (c == NULL)
>  		return ERR_PTR(-ENOMEM);
>  
> +	c->type_names = RB_ROOT;
> +	c->names = RB_ROOT;
>  	c->choose_args = RB_ROOT;
>  
>          /* set tunables to default values */
> @@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  		}
>  	}
>  
> -	ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
> -	ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
> +	err = decode_crush_names(p, end, &c->type_names);
> +	if (err)
> +		goto fail;
> +
> +	err = decode_crush_names(p, end, &c->names);
> +	if (err)
> +		goto fail;
> +
>  	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
>  
>          /* tunables */
Ilya Dryomov June 1, 2020, 11:14 a.m. UTC | #2
On Mon, Jun 1, 2020 at 12:49 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> > These would be matched with the provided client location to calculate
> > the locality value.
> >
> > Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> > ---
> >  include/linux/crush/crush.h |  6 +++
> >  net/ceph/crush/crush.c      |  3 ++
> >  net/ceph/osdmap.c           | 85 ++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 92 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> > index 38b0e4d50ed9..29b0de2e202b 100644
> > --- a/include/linux/crush/crush.h
> > +++ b/include/linux/crush/crush.h
> > @@ -301,6 +301,12 @@ struct crush_map {
> >
> >       __u32 *choose_tries;
> >  #else
> > +     /* device/bucket type id -> type name (CrushWrapper::type_map) */
> > +     struct rb_root type_names;
> > +
> > +     /* device/bucket id -> name (CrushWrapper::name_map) */
> > +     struct rb_root names;
> > +
>
> I'm not as familiar with the OSD client-side code, but I don't see any
> mention of locking here. What protects these new rbtrees? From reading
> over the code, I'd assume the osdc->lock, but it might be nice to have a
> comment here that makes that clear.

Yes, everything osdmap and CRUSH (except for the actual invocation) is
protected by osdc->lock.  But these trees don't even need that -- they
are static and could be easily replaced with sorted arrays to save two
pointers per name.

Thanks,

                Ilya
diff mbox series

Patch

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 38b0e4d50ed9..29b0de2e202b 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -301,6 +301,12 @@  struct crush_map {
 
 	__u32 *choose_tries;
 #else
+	/* device/bucket type id -> type name (CrushWrapper::type_map) */
+	struct rb_root type_names;
+
+	/* device/bucket id -> name (CrushWrapper::name_map) */
+	struct rb_root names;
+
 	/* CrushWrapper::choose_args */
 	struct rb_root choose_args;
 #endif
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 3d70244bc1b6..2e6b29fa8518 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -2,6 +2,7 @@ 
 #ifdef __KERNEL__
 # include <linux/slab.h>
 # include <linux/crush/crush.h>
+void clear_crush_names(struct rb_root *root);
 void clear_choose_args(struct crush_map *c);
 #else
 # include "crush_compat.h"
@@ -130,6 +131,8 @@  void crush_destroy(struct crush_map *map)
 #ifndef __KERNEL__
 	kfree(map->choose_tries);
 #else
+	clear_crush_names(&map->type_names);
+	clear_crush_names(&map->names);
 	clear_choose_args(map);
 #endif
 	kfree(map);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 5d00ce2b5339..e74130876d3a 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -138,6 +138,79 @@  static int crush_decode_straw2_bucket(void **p, void *end,
 	return -EINVAL;
 }
 
+struct crush_name_node {
+	struct rb_node cn_node;
+	int cn_id;
+	char cn_name[];
+};
+
+static struct crush_name_node *alloc_crush_name(size_t name_len)
+{
+	struct crush_name_node *cn;
+
+	cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
+	if (!cn)
+		return NULL;
+
+	RB_CLEAR_NODE(&cn->cn_node);
+	return cn;
+}
+
+static void free_crush_name(struct crush_name_node *cn)
+{
+	WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
+
+	kfree(cn);
+}
+
+DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
+
+static int decode_crush_names(void **p, void *end, struct rb_root *root)
+{
+	u32 n;
+
+	ceph_decode_32_safe(p, end, n, e_inval);
+	while (n--) {
+		struct crush_name_node *cn;
+		int id;
+		u32 name_len;
+
+		ceph_decode_32_safe(p, end, id, e_inval);
+		ceph_decode_32_safe(p, end, name_len, e_inval);
+		ceph_decode_need(p, end, name_len, e_inval);
+
+		cn = alloc_crush_name(name_len);
+		if (!cn)
+			return -ENOMEM;
+
+		cn->cn_id = id;
+		memcpy(cn->cn_name, *p, name_len);
+		cn->cn_name[name_len] = '\0';
+		*p += name_len;
+
+		if (!__insert_crush_name(root, cn)) {
+			free_crush_name(cn);
+			return -EEXIST;
+		}
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+void clear_crush_names(struct rb_root *root)
+{
+	while (!RB_EMPTY_ROOT(root)) {
+		struct crush_name_node *cn =
+		    rb_entry(rb_first(root), struct crush_name_node, cn_node);
+
+		erase_crush_name(root, cn);
+		free_crush_name(cn);
+	}
+}
+
 static struct crush_choose_arg_map *alloc_choose_arg_map(void)
 {
 	struct crush_choose_arg_map *arg_map;
@@ -354,6 +427,8 @@  static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	c->type_names = RB_ROOT;
+	c->names = RB_ROOT;
 	c->choose_args = RB_ROOT;
 
         /* set tunables to default values */
@@ -510,8 +585,14 @@  static struct crush_map *crush_decode(void *pbyval, void *end)
 		}
 	}
 
-	ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
-	ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+	err = decode_crush_names(p, end, &c->type_names);
+	if (err)
+		goto fail;
+
+	err = decode_crush_names(p, end, &c->names);
+	if (err)
+		goto fail;
+
 	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
 
         /* tunables */