
[v2,2/6] libceph: introduce reference counted string

Message ID 1454742006-85706-3-git-send-email-zyan@redhat.com (mailing list archive)
State New, archived

Commit Message

Yan, Zheng Feb. 6, 2016, 7 a.m. UTC
This data structure is for storing namespace strings. It allows a namespace
string to be shared by cephfs inodes with the same layout.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 include/linux/ceph/libceph.h      |   1 +
 include/linux/ceph/string_table.h |  58 ++++++++++++++++++
 net/ceph/Makefile                 |   2 +-
 net/ceph/ceph_common.c            |   1 +
 net/ceph/string_table.c           | 121 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/ceph/string_table.h
 create mode 100644 net/ceph/string_table.c

Comments

Ilya Dryomov March 22, 2016, 6:05 a.m. UTC | #1
On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@redhat.com> wrote:
> This data structure is for storing namespace strings. It allows a namespace
> string to be shared by cephfs inodes with the same layout.
>
> Signed-off-by: Yan, Zheng <zyan@redhat.com>
> ---
>  include/linux/ceph/libceph.h      |   1 +
>  include/linux/ceph/string_table.h |  58 ++++++++++++++++++
>  net/ceph/Makefile                 |   2 +-
>  net/ceph/ceph_common.c            |   1 +
>  net/ceph/string_table.c           | 121 ++++++++++++++++++++++++++++++++++++++
>  5 files changed, 182 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/ceph/string_table.h
>  create mode 100644 net/ceph/string_table.c
>
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index e7975e4..8e9868d 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -21,6 +21,7 @@
>  #include <linux/ceph/mon_client.h>
>  #include <linux/ceph/osd_client.h>
>  #include <linux/ceph/ceph_fs.h>
> +#include <linux/ceph/string_table.h>
>
>  /*
>   * mount options
> diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
> new file mode 100644
> index 0000000..427b6f9
> --- /dev/null
> +++ b/include/linux/ceph/string_table.h
> @@ -0,0 +1,58 @@
> +#ifndef _FS_CEPH_STRING_TABLE_H
> +#define _FS_CEPH_STRING_TABLE_H
> +
> +#include <linux/types.h>
> +#include <linux/kref.h>
> +#include <linux/rbtree.h>
> +#include <linux/rcupdate.h>
> +
> +struct ceph_string {
> +       struct kref kref;
> +       union {
> +               struct rb_node node;
> +               struct rcu_head rcu;
> +       };
> +       size_t len;
> +       char str[];
> +};
> +
> +extern void ceph_release_string(struct kref *ref);
> +extern struct ceph_string *ceph_find_or_create_string(const char *str,
> +                                                     size_t len);
> +extern void ceph_string_cleanup(void);
> +
> +static inline void ceph_get_string(struct ceph_string *str)
> +{
> +       kref_get(&str->kref);
> +}
> +
> +static inline void ceph_put_string(struct ceph_string *str)
> +{
> +       if (!str)
> +               return;
> +       kref_put(&str->kref, ceph_release_string);
> +}
> +
> +static inline int ceph_compare_string(struct ceph_string *cs,
> +                                     const char* str, size_t len)
> +{
> +       size_t cs_len = cs ? cs->len : 0;
> +       if (cs_len != len)
> +               return cs_len - len;
> +       if (len == 0)
> +               return 0;
> +       return strncmp(cs->str, str, len);
> +}
> +
> +#define ceph_try_get_string(x)                                 \
> +({                                                             \
> +       struct ceph_string *___str;                             \
> +       rcu_read_lock();                                        \
> +       ___str = rcu_dereference(x);                            \
> +       if (___str && !kref_get_unless_zero(&___str->kref))     \
> +               ___str = NULL;                                  \
> +       rcu_read_unlock();                                      \
> +       (___str);                                               \
> +})
> +
> +#endif
> diff --git a/net/ceph/Makefile b/net/ceph/Makefile
> index 958d9856..84cbed6 100644
> --- a/net/ceph/Makefile
> +++ b/net/ceph/Makefile
> @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
>         crypto.o armor.o \
>         auth_x.o \
>         ceph_fs.o ceph_strings.o ceph_hash.o \
> -       pagevec.o snapshot.o
> +       pagevec.o snapshot.o string_table.o
>
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index dcc18c6..baca649 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -751,6 +751,7 @@ static void __exit exit_ceph_lib(void)
>         ceph_msgr_exit();
>         ceph_crypto_shutdown();
>         ceph_debugfs_cleanup();
> +       ceph_string_cleanup();
>  }
>
>  module_init(init_ceph_lib);
> diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
> new file mode 100644
> index 0000000..bb49bd7
> --- /dev/null
> +++ b/net/ceph/string_table.c
> @@ -0,0 +1,121 @@
> +#include <linux/slab.h>
> +#include <linux/gfp.h>
> +#include <linux/string.h>
> +#include <linux/spinlock.h>
> +#include <linux/ceph/string_table.h>
> +
> +static DEFINE_SPINLOCK(string_tree_lock);
> +static struct rb_root string_tree = RB_ROOT;
> +
> +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
> +{
> +       struct ceph_string *cs, *exist;
> +       struct rb_node **p, *parent;
> +       int ret;
> +
> +       exist = NULL;
> +       spin_lock(&string_tree_lock);
> +       p = &string_tree.rb_node;
> +       while (*p) {
> +               exist = rb_entry(*p, struct ceph_string, node);
> +               ret = ceph_compare_string(exist, str, len);
> +               if (ret > 0)
> +                       p = &(*p)->rb_left;
> +               else if (ret < 0)
> +                       p = &(*p)->rb_right;
> +               else
> +                       break;
> +               exist = NULL;
> +       }
> +       if (exist && !kref_get_unless_zero(&exist->kref)) {
> +               rb_erase(&exist->node, &string_tree);
> +               RB_CLEAR_NODE(&exist->node);
> +               exist = NULL;
> +       }
> +       spin_unlock(&string_tree_lock);
> +       if (exist)
> +               return exist;
> +
> +       cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
> +       if (!cs)
> +               return NULL;
> +
> +       kref_init(&cs->kref);
> +       cs->len = len;
> +       memcpy(cs->str, str, len);
> +       cs->str[len] = 0;
> +
> +retry:
> +       exist = NULL;
> +       parent = NULL;
> +       p = &string_tree.rb_node;
> +       spin_lock(&string_tree_lock);
> +       while (*p) {
> +               parent = *p;
> +               exist = rb_entry(*p, struct ceph_string, node);
> +               ret = ceph_compare_string(exist, str, len);
> +               if (ret > 0)
> +                       p = &(*p)->rb_left;
> +               else if (ret < 0)
> +                       p = &(*p)->rb_right;
> +               else
> +                       break;
> +               exist = NULL;
> +       }
> +       ret = 0;
> +       if (!exist) {
> +               rb_link_node(&cs->node, parent, p);
> +               rb_insert_color(&cs->node, &string_tree);
> +       } else if (!kref_get_unless_zero(&exist->kref)) {
> +               rb_erase(&exist->node, &string_tree);
> +               RB_CLEAR_NODE(&exist->node);
> +               ret = -EAGAIN;
> +       }
> +       spin_unlock(&string_tree_lock);
> +       if (ret == -EAGAIN)
> +               goto retry;
> +
> +       if (exist) {
> +               kfree(cs);
> +               cs = exist;
> +       }
> +
> +       return cs;
> +}
> +EXPORT_SYMBOL(ceph_find_or_create_string);
> +
> +static void ceph_free_string(struct rcu_head *head)
> +{
> +       struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
> +       kfree(cs);
> +}
> +
> +void ceph_release_string(struct kref *ref)
> +{
> +       struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
> +
> +       spin_lock(&string_tree_lock);
> +       if (!RB_EMPTY_NODE(&cs->node)) {
> +               rb_erase(&cs->node, &string_tree);
> +               RB_CLEAR_NODE(&cs->node);
> +       }
> +       spin_unlock(&string_tree_lock);
> +
> +       call_rcu(&cs->rcu, ceph_free_string);
> +}
> +EXPORT_SYMBOL(ceph_release_string);
> +
> +void ceph_string_cleanup(void)
> +{
> +       struct rb_node *p;
> +       struct ceph_string *cs;
> +       if (RB_EMPTY_ROOT(&string_tree))
> +               return;
> +
> +       pr_err("libceph: detect string leaks\n");
> +       while ((p = rb_first(&string_tree))) {
> +               cs = rb_entry(p, struct ceph_string, node);
> +               rb_erase(p, &string_tree);
> +               kfree(cs);
> +       }
> +}
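For context, a minimal usage sketch of the API this patch adds (the caller is
hypothetical; later patches in the series are what actually wire this into
cephfs layouts):

        struct ceph_string *a, *b;

        /* Two lookups for the same name return the same refcounted object
         * (assuming the GFP_NOFS allocations succeed). */
        a = ceph_find_or_create_string("foo_ns", 6);
        b = ceph_find_or_create_string("foo_ns", 6);
        /* a == b; the second call only bumped the kref */

        ceph_put_string(b);
        ceph_put_string(a);     /* last put unlinks from the tree, frees via RCU */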

Hi Zheng,

A one-and-a-half-line commit message and an equally short cover letter
for a series such as this aren't enough.  I *happen* to know that the
basic use case for namespaces in cephfs is going to be restricting
users to different parts of the directory tree, with the enforcement
happening in ceph on the server side, as opposed to in ceph on the
client side, but I would appreciate some details on what the actual
namespace names are going to be, whether it's user input or not,
whether there are plans to use namespaces for anything else, etc.

I don't like the idea of sharing namespace name strings between libceph
and ceph modules, especially with the strings themselves hosted in
libceph.  rbd has no use for namespaces, so libceph can live with
namespace names embedded into ceph_osd_request by value or with
a simple non-owning pointer, leaving reference counting to the outside
modules, if one of the use cases is "one namespace with a long name for
the entire directory tree" or something close to it.

I think the sharing infrastructure should be moved into cephfs, and
probably changed to share entire file layouts along the way.  I don't
know this code well enough to be sure, but it seems that by sharing
file layouts and making ci->i_layout an owning pointer you might be
able to piggy back on i_ceph_lock and considerably simplify the whole
thing by dropping RCU and eliminating numerous get/put calls.
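As a rough illustration only (hypothetical names, not code from this series),
a shared layout hanging off the ceph inode could look something like the
following, with the tree hosted in fs/ceph and lookups/unlinks done under
i_ceph_lock:

#include <linux/kref.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/ceph/ceph_fs.h>

struct ceph_shared_layout {
        struct kref kref;
        struct rb_node node;            /* tree of layouts, hosted in fs/ceph */
        struct ceph_file_layout layout; /* striping params plus the namespace
                                         * this series adds */
};

static void ceph_shared_layout_release(struct kref *kref)
{
        /* no unlinking here: the layout is dropped from the tree when a cap
         * grant/revoke installs a new one, under i_ceph_lock */
        kfree(container_of(kref, struct ceph_shared_layout, kref));
}

static inline struct ceph_shared_layout *
ceph_get_shared_layout(struct ceph_shared_layout *l)
{
        kref_get(&l->kref);             /* caller already holds a valid pointer */
        return l;
}

static inline void ceph_put_shared_layout(struct ceph_shared_layout *l)
{
        if (l)
                kref_put(&l->kref, ceph_shared_layout_release);
}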

Thanks,

                Ilya
Yan, Zheng March 22, 2016, 9:17 a.m. UTC | #2
> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> This data structure is for storing namespace strings. It allows a namespace
>> string to be shared by cephfs inodes with the same layout.

[ snip ]
> 
> Hi Zheng,
> 
> A one and a half line commit message and an equally short cover letter
> for a series such as this isn't enough.  I *happen* to know that the
> basic use case for namespaces in cephfs is going to be restricting
> users to different parts of the directory tree, with the enforcement
> happening in ceph on the server side, as opposed to in ceph on the
> client side, but I would appreciate some details on what the actual
> namespace names are going to be, whether it's user input or not,
> whether there are plans to use namespaces for anything else, etc.
> 

The namespace restriction you mentioned is for cephfs metadata. This namespace restricts users to different namespaces in the cephfs data pool. (At present, the only way to restrict data access is to create multiple cephfs data pools and restrict users to different data pools. Creating lots of pools is not efficient for the cluster.)


> I don't like the idea of sharing namespace name strings between libceph
> and ceph modules, especially with the strings themselves hosted in
> libceph.  rbd has no use for namespaces, so libceph can live with
> namespace names embedded into ceph_osd_request by value or with
> a simple non-owning pointer, leaving reference counting to the outside
> modules, if one of the use cases is "one namespace with a long name for
> the entire directory tree" or something close to it.
> 
> I think the sharing infrastructure should be moved into cephfs, and
> probably changed to share entire file layouts along the way.  I don't
> know this code well enough to be sure, but it seems that by sharing
> file layouts and making ci->i_layout an owning pointer you might be
> able to piggy back on i_ceph_lock and considerably simplify the whole
> thing by dropping RCU and eliminating numerous get/put calls.

RBD may use namespaces later.
http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support

The reason I use RCU here is that ci->i_layout.pool_ns can change at any time. For the same reason, using a non-owning pointer for the namespace or the entire layout is unfeasible. Using an embedded namespace is not elegant either: when allocating a ceph_osd_request, cephfs needs to lock i_ceph_lock, copy the namespace to a temporary buffer, unlock i_ceph_lock, and pass ci->i_layout and the temporary buffer to ceph_osdc_xxx_request().
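To make that concrete, here is a rough sketch of the copy-under-lock pattern
I want to avoid (the helper and the pool_ns_data/pool_ns_len fields are
hypothetical):

static void copy_pool_ns(struct ceph_inode_info *ci,
                         char *buf, size_t buflen, size_t *lenp)
{
        /* with an embedded, by-value namespace, every caller of
         * ceph_osdc_new_request() would have to do this copy first */
        spin_lock(&ci->i_ceph_lock);
        *lenp = min_t(size_t, ci->i_layout.pool_ns_len, buflen - 1);
        memcpy(buf, ci->i_layout.pool_ns_data, *lenp);
        buf[*lenp] = '\0';
        spin_unlock(&ci->i_ceph_lock);
}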

Regards
Yan, Zheng



> Thanks,
> 
>                Ilya

Ilya Dryomov March 22, 2016, 11 a.m. UTC | #3
On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:

[ snip ]

>> Hi Zheng,
>>
>> A one and a half line commit message and an equally short cover letter
>> for a series such as this isn't enough.  I *happen* to know that the
>> basic use case for namespaces in cephfs is going to be restricting
>> users to different parts of the directory tree, with the enforcement
>> happening in ceph on the server side, as opposed to in ceph on the
>> client side, but I would appreciate some details on what the actual
>> namespace names are going to be, whether it's user input or not,
>> whether there are plans to use namespaces for anything else, etc.
>>
>
> The namespace restriction you mentioned is for cephfs metadata. This
> namespace is restricting users to different namespaces in cephfs data
> pool. (At present, the only way to restrict data access is, creating
> multiple cephfs data pools, restrict users to different data pool.
> Creating lots of pools is not efficient for the cluster)

What about the namespace names, who is generating them, how long are
they going to be?  Please describe in detail how this is going to work
for both data and metadata pools.

>
>
>> I don't like the idea of sharing namespace name strings between libceph
>> and ceph modules, especially with the strings themselves hosted in
>> libceph.  rbd has no use for namespaces, so libceph can live with
>> namespace names embedded into ceph_osd_request by value or with
>> a simple non-owning pointer, leaving reference counting to the outside
>> modules, if one of the use cases is "one namespace with a long name for
>> the entire directory tree" or something close to it.
>>
>> I think the sharing infrastructure should be moved into cephfs, and
>> probably changed to share entire file layouts along the way.  I don't
>> know this code well enough to be sure, but it seems that by sharing
>> file layouts and making ci->i_layout an owning pointer you might be
>> able to piggy back on i_ceph_lock and considerably simplify the whole
>> thing by dropping RCU and eliminating numerous get/put calls.
>
> RBD may use namespace later.
> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support

Well, compared to cephfs, it's hard to call that "using" - in that
case, there will only ever be one namespace per image.  My point is
it's never going to use the string sharing infrastructure and is fine
with a non-owning pointer to a string in the file layout field of the
in-memory rbd image header.

>
> The reason I use RCU here is that ci->i_layout.pool_ns can change at
> any time. For the same reason, using non-owning pointer for namespace
> or entire layout is unfeasible. Using embedded namespace is not
> elegant either. When allocating ceph_osd_request, cephfs needs to
> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
> ceph_osdc_xxx_request().

Hmm, RCU doesn't protect you from pool_ns or other file layout fields
changing while the OSD request is in flight.  As used above, it allows
ceph_try_get_string() to not take any locks and that's it.

Why exactly can't file layouts be shared?  ci->i_layout would become
reference counted and you would give libceph a pointer to the pool_ns
string (or entire layout) after bumping it.  It doesn't matter if
pool_ns or the rest of the layout changes due to a cap grant or revoke
while libceph is servicing the OSD request: you would unlink it from
the tree but the bumped reference will keep the layout around, to be
put in the OSD request completion callback or so.  Layout lookup would
have to happen in exactly two places: when newing an inode and handling
cap grant/revoke, in other places you would simply bump the count on
the basis of already holding a valid pointer.  You wouldn't have to
unlink in the destructor, so no hassle with kref_get_unless_zero() and
no need for RCU, with i_ceph_lock providing the exclusion around the
tree.

Like I said, I'm not familiar with the code in question at the
necessary level of detail, so feel free to destroy this, but I don't
see a problem.

Thanks,

                Ilya
Yan, Zheng March 22, 2016, 1:57 p.m. UTC | #4
> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> [ snip ]
> 
>>> Hi Zheng,
>>> 
>>> A one and a half line commit message and an equally short cover letter
>>> for a series such as this isn't enough.  I *happen* to know that the
>>> basic use case for namespaces in cephfs is going to be restricting
>>> users to different parts of the directory tree, with the enforcement
>>> happening in ceph on the server side, as opposed to in ceph on the
>>> client side, but I would appreciate some details on what the actual
>>> namespace names are going to be, whether it's user input or not,
>>> whether there are plans to use namespaces for anything else, etc.
>>> 
>> 
>> The namespace restriction you mentioned is for cephfs metadata. This
>> namespace is restricting users to different namespaces in cephfs data
>> pool. (At present, the only way to restrict data access is, creating
>> multiple cephfs data pools, restrict users to different data pool.
>> Creating lots of pools is not efficient for the cluster)
> 
> What about the namespace names, who is generating them, how long are
> they going to be?  Please describe in detail how this is going to work
> for both data and metadata pools.

For example, to restrict user foo to directory /foo_dir

// create auth caps for user foo.
ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns'

// mount cephfs by user admin
mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx

// set the directory's layout.pool_namespace
setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir

The admin user chooses the namespace name. In most cases, the namespace name does not change.

> 
>> 
>> 
>>> I don't like the idea of sharing namespace name strings between libceph
>>> and ceph modules, especially with the strings themselves hosted in
>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>> namespace names embedded into ceph_osd_request by value or with
>>> a simple non-owning pointer, leaving reference counting to the outside
>>> modules, if one of the use cases is "one namespace with a long name for
>>> the entire directory tree" or something close to it.
>>> 
>>> I think the sharing infrastructure should be moved into cephfs, and
>>> probably changed to share entire file layouts along the way.  I don't
>>> know this code well enough to be sure, but it seems that by sharing
>>> file layouts and making ci->i_layout an owning pointer you might be
>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>> thing by dropping RCU and eliminating numerous get/put calls.
>> 
>> RBD may use namespace later.
>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
> 
> Well, compared to cephfs, it's hard to call that "using" - in that
> case, there will only ever be one namespace per image.  My point is
> it's never going to use the string sharing infrastructure and is fine
> with a non-owning pointer to a string in the file layout field of the
> in-memory rbd image header.
> 
>> 
>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>> any time. For the same reason, using non-owning pointer for namespace
>> or entire layout is unfeasible. Using embedded namespace is not
>> elegant either. When allocating ceph_osd_request, cephfs needs to
>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>> ceph_osdc_xxx_request().
> 
> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
> changing while the OSD request is in flight.  As used above, it allows
> ceph_try_get_string() to not take any locks and that's it.

Yes.  But not taking the lock simplifies the code a lot.  We don't need to lock/unlock i_ceph_lock each time i_layout is used.
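As an illustration, the lock-free path with ceph_try_get_string() from this
patch looks roughly like this (the caller is hypothetical, and it assumes a
later patch turns ci->i_layout.pool_ns into an RCU-managed pointer):

        struct ceph_string *ns;

        /* ceph_try_get_string() does rcu_read_lock() + kref_get_unless_zero()
         * internally, so we get a counted reference without touching
         * i_ceph_lock even if pool_ns is replaced concurrently.
         * NULL means "no namespace set". */
        ns = ceph_try_get_string(ci->i_layout.pool_ns);

        /* ... build and send the ceph_osd_request, using ns->str / ns->len
         * when ns is non-NULL ... */

        ceph_put_string(ns);            /* NULL-safe; drop the reference when done */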

> 
> Why exactly can't file layouts be shared?  ci->i_layout would become
> reference counted and you would give libceph a pointer to the pool_ns
> string (or entire layout) after bumping it.  It doesn't matter if
> pool_ns or the rest of the layout changes due to a cap grant or revoke
> while libceph is servicing the OSD request: you would unlink it from
> the tree but the bumped reference will keep the layout around, to be
> put in the OSD request completion callback or so.  Layout lookup would
> have to happen in exactly two places: when newing an inode and handling
> cap grant/revoke, in other places you would simply bump the count on
> the basis of already holding a valid pointer.  You wouldn't have to
> unlink in the destructor, so no hassle with kref_get_unless_zero() and
> no need for RCU, with i_ceph_lock providing the exclusion around the
> tree.

This means cephfs needs to set r_callback for every ceph_osd_request, and ceph_osd_request also needs a new field to store the layout pointer. I don’t think that’s better/simpler than a reference counted namespace string.

> 
> Like I said, I'm not familiar with the code in question at the
> necessary level of detail, so feel free to destroy this, but I don't
> see a problem.
> 
> Thanks,
> 
>                Ilya

Ilya Dryomov March 22, 2016, 4:13 p.m. UTC | #5
On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> [ snip ]
>>
>>>> Hi Zheng,
>>>>
>>>> A one and a half line commit message and an equally short cover letter
>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>> basic use case for namespaces in cephfs is going to be restricting
>>>> users to different parts of the directory tree, with the enforcement
>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>> client side, but I would appreciate some details on what the actual
>>>> namespace names are going to be, whether it's user input or not,
>>>> whether there are plans to use namespaces for anything else, etc.
>>>>
>>>
>>> The namespace restriction you mentioned is for cephfs metadata. This
>>> namespace is restricting users to different namespaces in cephfs data
>>> pool. (At present, the only way to restrict data access is, creating
>>> multiple cephfs data pools, restrict users to different data pool.
>>> Creating lots of pools is not efficient for the cluster)
>>
>> What about the namespace names, who is generating them, how long are
>> they going to be?  Please describe in detail how this is going to work
>> for both data and metadata pools.
>
> For example, to restrict user foo to directory /foo_dir
>
> // create auth caps for user foo.
> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>
> // mount cephfs by user admin
> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>
> // set directory’s layout.pool_namespace
> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>
> Admin user chooses namespace name. In most cases, namespace name does not change.

Good, I guess limiting it to 100 chars (or maybe even a smaller
number) is sensible then.

>
>>
>>>
>>>
>>>> I don't like the idea of sharing namespace name strings between libceph
>>>> and ceph modules, especially with the strings themselves hosted in
>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>> namespace names embedded into ceph_osd_request by value or with
>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>> modules, if one of the use cases is "one namespace with a long name for
>>>> the entire directory tree" or something close to it.
>>>>
>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>> probably changed to share entire file layouts along the way.  I don't
>>>> know this code well enough to be sure, but it seems that by sharing
>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>
>>> RBD may use namespace later.
>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>
>> Well, compared to cephfs, it's hard to call that "using" - in that
>> case, there will only ever be one namespace per image.  My point is
>> it's never going to use the string sharing infrastructure and is fine
>> with a non-owning pointer to a string in the file layout field of the
>> in-memory rbd image header.
>>
>>>
>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>> any time. For the same reason, using non-owning pointer for namespace
>>> or entire layout is unfeasible. Using embedded namespace is not
>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>> ceph_osdc_xxx_request().
>>
>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>> changing while the OSD request is in flight.  As used above, it allows
>> ceph_try_get_string() to not take any locks and that's it.
>
> Yes.  But not taking lock simplify the code a lot.  we don't need to
> lock/unlock i_ceph_lock each time i_layout is used.

I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
the code base a simplification, but, more importantly, is keeping the
pool_ns pointer valid really all you need?  Shouldn't there be some
kind of synchronization around "OK, I'm switching to a new layout for
this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
with two successive calls to ceph_osdc_new_request() potentially ending
up with two different namespaces, e.g. ceph_uninline_data().

>
>>
>> Why exactly can't file layouts be shared?  ci->i_layout would become
>> reference counted and you would give libceph a pointer to the pool_ns
>> string (or entire layout) after bumping it.  It doesn't matter if
>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>> while libceph is servicing the OSD request: you would unlink it from
>> the tree but the bumped reference will keep the layout around, to be
>> put in the OSD request completion callback or so.  Layout lookup would
>> have to happen in exactly two places: when newing an inode and handling
>> cap grant/revoke, in other places you would simply bump the count on
>> the basis of already holding a valid pointer.  You wouldn't have to
>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>> no need for RCU, with i_ceph_lock providing the exclusion around the
>> tree.
>
> This means cephfs needs to set r_callback for all ceph_osd_request,
> ceph_osd_request also needs a new field to store layout pointer.
> I don’t think it’s better/simpler than reference counted namespace
> string.

Not necessarily - you can put after ceph_osdc_wait_request() returns.
Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
and it'll probably be required that all OSD requests set one of the
callbacks, except for stateless fire-and-forget ones.

Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
me because a) it naturally hangs off of ceph inode and b) logically,
it is entire layouts and not just namespaces that are shared across the
directory tree.  If you think reference counted pool_ns strings are
better, I won't argue with that, but, with cephfs being the only user
of either solution, it'll have to live in fs/ceph.

Separately, I think passing non-owning pool_ns pointers into libceph is
worth exploring, but if that doesn't easily map onto cephfs lifetime or
ownership rules, we will go with embedding namespace names by value into
ceph_osd_request (or, rather, ceph_object_locator).
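For comparison, a rough sketch of the by-value option (an illustrative struct,
not the actual libceph ceph_object_locator definition):

/* Hypothetical by-value locator: the caller copies the namespace into the
 * request, bounded by the ~100 char limit discussed above, so libceph owns
 * its copy and no reference counting is shared across modules. */
#define CEPH_EXAMPLE_MAX_NS_LEN 100

struct ceph_example_object_locator {
        s64 pool;
        size_t pool_ns_len;
        char pool_ns[CEPH_EXAMPLE_MAX_NS_LEN + 1];
};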

Thanks,

                Ilya
Gregory Farnum March 22, 2016, 6:46 p.m. UTC | #6
On Tue, Mar 22, 2016 at 9:13 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>> the entire directory tree" or something close to it.
>>>>>
>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>
>>>> RBD may use namespace later.
>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>
>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>> case, there will only ever be one namespace per image.  My point is
>>> it's never going to use the string sharing infrastructure and is fine
>>> with a non-owning pointer to a string in the file layout field of the
>>> in-memory rbd image header.
>>>
>>>>
>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>> any time. For the same reason, using non-owning pointer for namespace
>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>> ceph_osdc_xxx_request().
>>>
>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>> changing while the OSD request is in flight.  As used above, it allows
>>> ceph_try_get_string() to not take any locks and that's it.
>>
>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>> lock/unlock i_ceph_lock each time i_layout is used.
>
> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
> the code base a simplification, but, more importantly, is keeping the
> pool_ns pointer valid really all you need?  Shouldn't there be some
> kind of synchronization around "OK, I'm switching to a new layout for
> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
> with two successive calls to ceph_osdc_new_request() potentially ending
> up with two different namespaces, e.g. ceph_uninline_data().

I haven't looked at the code at all, but this part is really confusing
me. Namespaces can't be changed arbitrarily, but when they are, a
subsequent write really needs to use the updated namespace! It needs
an atomic switch, not a lazy update. Is there some point at which
out-of-date users get updated prior to actually sending off OSD ops?
-Greg
Yan, Zheng March 23, 2016, 3:37 a.m. UTC | #7
> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> [ snip ]
>>> 
>>>>> Hi Zheng,
>>>>> 
>>>>> A one and a half line commit message and an equally short cover letter
>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>> users to different parts of the directory tree, with the enforcement
>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>> client side, but I would appreciate some details on what the actual
>>>>> namespace names are going to be, whether it's user input or not,
>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>> 
>>>> 
>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>> namespace is restricting users to different namespaces in cephfs data
>>>> pool. (At present, the only way to restrict data access is, creating
>>>> multiple cephfs data pools, restrict users to different data pool.
>>>> Creating lots of pools is not efficient for the cluster)
>>> 
>>> What about the namespace names, who is generating them, how long are
>>> they going to be?  Please describe in detail how this is going to work
>>> for both data and metadata pools.
>> 
>> For example, to restrict user foo to directory /foo_dir
>> 
>> // create auth caps for user foo.
>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>> 
>> // mount cephfs by user admin
>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>> 
>> // set directory’s layout.pool_namespace
>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>> 
>> Admin user chooses namespace name. In most cases, namespace name does not change.
> 
> Good, I guess limiting it to 100 chars (or maybe even a smaller
> number) is sensible then.
> 
>> 
>>> 
>>>> 
>>>> 
>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>> the entire directory tree" or something close to it.
>>>>> 
>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>> 
>>>> RBD may use namespace later.
>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>> 
>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>> case, there will only ever be one namespace per image.  My point is
>>> it's never going to use the string sharing infrastructure and is fine
>>> with a non-owning pointer to a string in the file layout field of the
>>> in-memory rbd image header.
>>> 
>>>> 
>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>> any time. For the same reason, using non-owning pointer for namespace
>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>> ceph_osdc_xxx_request().
>>> 
>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>> changing while the OSD request is in flight.  As used above, it allows
>>> ceph_try_get_string() to not take any locks and that's it.
>> 
>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>> lock/unlock i_ceph_lock each time i_layout is used.
> 
> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
> the code base a simplification, but, more importantly, is keeping the
> pool_ns pointer valid really all you need?  Shouldn't there be some
> kind of synchronization around "OK, I'm switching to a new layout for
> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
> with two successive calls to ceph_osdc_new_request() potentially ending
> up with two different namespaces, e.g. ceph_uninline_data().

There is synchronisation. When changing the file layout, the MDS revokes Frw caps from the client (block new read/write, for in-progress read/write). But this synchronisation is not completely reliable when the client session state is toggled between stale and active.

>>> 
>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>> reference counted and you would give libceph a pointer to the pool_ns
>>> string (or entire layout) after bumping it.  It doesn't matter if
>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>> while libceph is servicing the OSD request: you would unlink it from
>>> the tree but the bumped reference will keep the layout around, to be
>>> put in the OSD request completion callback or so.  Layout lookup would
>>> have to happen in exactly two places: when newing an inode and handling
>>> cap grant/revoke, in other places you would simply bump the count on
>>> the basis of already holding a valid pointer.  You wouldn't have to
>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>> tree.
>> 
>> This means cephfs needs to set r_callback for all ceph_osd_request,
>> ceph_osd_request also needs a new field to store layout pointer.
>> I don’t think it’s better/simpler than reference counted namespace
>> string.
> 
> Not necessarily - you can put after ceph_osdc_wait_request() returns.
> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
> and it'll probably be required that all OSD requests set one of the
> callbacks, except for stateless fire-and-forget ones.

For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?

> 
> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
> me because a) it naturally hangs off of ceph inode and b) logically,
> it is entire layouts and not just namespaces that are shared across the
> directory tree.  If you think reference counted pool_ns strings are
> better, I won't argue with that, but, with cephfs being the only user
> of either solution, it'll have to live in fs/ceph.

I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when the request finishes?

> Separately, I think passing non-owning pool_ns pointers into libceph is
> worth exploring, but if that doesn't easily map onto cephfs lifetime or
> ownership rules, we will go with embedding namespace names by value into
> ceph_osd_request (or, rather, ceph_object_locator).
> 

As I stated in the previous mail, an embedded namespace is a nightmare for cephfs. Every time the namespace is used, cephfs needs to lock i_ceph_lock and copy the namespace to a temporary buffer.

Regards
Yan, Zheng

> Thanks,
> 
>                Ilya

Ilya Dryomov March 23, 2016, 9:51 a.m. UTC | #8
On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>
>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> [ snip ]
>>>>
>>>>>> Hi Zheng,
>>>>>>
>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>> client side, but I would appreciate some details on what the actual
>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>
>>>>>
>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>> Creating lots of pools is not efficient for the cluster)
>>>>
>>>> What about the namespace names, who is generating them, how long are
>>>> they going to be?  Please describe in detail how this is going to work
>>>> for both data and metadata pools.
>>>
>>> For example, to restrict user foo to directory /foo_dir
>>>
>>> // create auth caps for user foo.
>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>
>>> // mount cephfs by user admin
>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>
>>> // set directory’s layout.pool_namespace
>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>
>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>
>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>> number) is sensible then.
>>
>>>
>>>>
>>>>>
>>>>>
>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>> the entire directory tree" or something close to it.
>>>>>>
>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>
>>>>> RBD may use namespace later.
>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>
>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>> case, there will only ever be one namespace per image.  My point is
>>>> it's never going to use the string sharing infrastructure and is fine
>>>> with a non-owning pointer to a string in the file layout field of the
>>>> in-memory rbd image header.
>>>>
>>>>>
>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>> ceph_osdc_xxx_request().
>>>>
>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>> changing while the OSD request is in flight.  As used above, it allows
>>>> ceph_try_get_string() to not take any locks and that's it.
>>>
>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>> lock/unlock i_ceph_lock each time i_layout is used.
>>
>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>> the code base a simplification, but, more importantly, is keeping the
>> pool_ns pointer valid really all you need?  Shouldn't there be some
>> kind of synchronization around "OK, I'm switching to a new layout for
>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>> with two successive calls to ceph_osdc_new_request() potentially ending
>> up with two different namespaces, e.g. ceph_uninline_data().
>
> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.

I have a bit of trouble parsing "block new read/write, for in-progress
read/write".  So the client will stop issuing requests as soon as it
learns that it no longer has a cap, but what happens with the in-flight
requests?

>
>>>>
>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>> while libceph is servicing the OSD request: you would unlink it from
>>>> the tree but the bumped reference will keep the layout around, to be
>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>> have to happen in exactly two places: when newing an inode and handling
>>>> cap grant/revoke, in other places you would simply bump the count on
>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>> tree.
>>>
>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>> ceph_osd_request also needs a new field to store layout pointer.
>>> I don’t think it’s better/simpler than reference counted namespace
>>> string.
>>
>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>> and it'll probably be required that all OSD requests set one of the
>> callbacks, except for stateless fire-and-forget ones.
>
> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>
>>
>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>> me because a) it naturally hangs off of ceph inode and b) logically,
>> it is entire layouts and not just namespaces that are shared across the
>> directory tree.  If you think reference counted pool_ns strings are
>> better, I won't argue with that, but, with cephfs being the only user
>> of either solution, it'll have to live in fs/ceph.
>
> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>
>> Separately, I think passing non-owning pool_ns pointers into libceph is
>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>> ownership rules, we will go with embedding namespace names by value into
>> ceph_osd_request (or, rather, ceph_object_locator).
>>
>
> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.

So you are maintaining that all that is needed is to keep the memory
valid and that there is no locking around installing a new namespace for
an inode.  I didn't realize that when I suggested layout sharing; it
makes layout sharing much less attractive.

Thanks,

                Ilya
Yan, Zheng March 23, 2016, 1:02 p.m. UTC | #9
> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> 
>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> [ snip ]
>>>>> 
>>>>>>> Hi Zheng,
>>>>>>> 
>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>> 
>>>>>> 
>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>> 
>>>>> What about the namespace names, who is generating them, how long are
>>>>> they going to be?  Please describe in detail how this is going to work
>>>>> for both data and metadata pools.
>>>> 
>>>> For example, to restrict user foo to directory /foo_dir
>>>> 
>>>> // create auth caps for user foo.
>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>> 
>>>> // mount cephfs by user admin
>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>> 
>>>> // set directory’s layout.pool_namespace
>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>> 
>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>> 
>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>> number) is sensible then.
>>> 
>>>> 
>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>> the entire directory tree" or something close to it.
>>>>>>> 
>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>> 
>>>>>> RBD may use namespace later.
>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>> 
>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>> case, there will only ever be one namespace per image.  My point is
>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>> in-memory rbd image header.
>>>>> 
>>>>>> 
>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>> ceph_osdc_xxx_request().
>>>>> 
>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>> 
>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>> 
>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>> the code base a simplification, but, more importantly, is keeping the
>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>> kind of synchronization around "OK, I'm switching to a new layout for
>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>> up with two different namespaces, e.g. ceph_uninline_data().
>> 
>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
> 
> I have a bit of trouble parsing "block new read/write, for in-progress
> read/write".  So the client will stop issuing requests as soon as it
> learns that it no longer has a cap, but what happens with the in-flight
> requests?

When the client knows the MDS is revoking Frw caps, it stops issuing new requests and waits for the in-flight ones. After all in-flight requests complete, the client releases the Frw caps to the MDS.
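
A minimal sketch of that ordering; all example_* helpers and the wait-queue field are purely illustrative, not the actual fs/ceph code:

/* sketch: client-side handling of an Frw revoke (names illustrative) */
static void example_handle_frw_revoke(struct ceph_inode_info *ci)
{
        /* 1. stop handing Frw out to new readers/writers */
        example_clear_issued(ci, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);

        /* 2. wait until every in-flight read/write drops its cap reference */
        wait_event(ci->i_cap_wq,
                   !example_cap_refs_held(ci, CEPH_CAP_FILE_RD |
                                               CEPH_CAP_FILE_WR));

        /* 3. only now acknowledge the revoke, letting the MDS apply the
         *    new layout/namespace */
        example_ack_cap_revoke(ci);
}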

> 
>> 
>>>>> 
>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>> tree.
>>>> 
>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>> I don’t think it’s better/simpler than reference counted namespace
>>>> string.
>>> 
>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>> and it'll probably be required that all OSD requests set one of the
>>> callbacks, except for stateless fire-and-forget ones.
>> 
>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>> 
>>> 
>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>> it is entire layouts and not just namespaces that are shared across the
>>> directory tree.  If you think reference counted pool_ns strings are
>>> better, I won't argue with that, but, with cephfs being the only user
>>> of either solution, it'll have to live in fs/ceph.
>> 
>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>> 
>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>> ownership rules, we will go with embedding namespace names by value into
>>> ceph_osd_request (or, rather, ceph_object_locator).
>>> 
>> 
>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
> 
> So you are maintaining that all that is needed is to keep the memory
> valid and there is no locking around installing a new namespace for an
> inode.  I didn't realize that when I suggested layout sharing, it makes
> it much less attractive.

Yes.  That’s the main reason I decided to use RCU.

Regards
Yan, Zheng

> 
> Thanks,
> 
>                Ilya

Douglas Fuller March 23, 2016, 3:39 p.m. UTC | #10
> +#define ceph_try_get_string(x)					\
> +({								\
> +	struct ceph_string *___str;				\
> +	rcu_read_lock();					\
> +	___str = rcu_dereference(x);				\
> +	if (___str && !kref_get_unless_zero(&___str->kref))	\
> +		___str = NULL;					\
> +	rcu_read_unlock();					\
> +	(___str);						\

Isn’t this an unsafe use of ___str? As I understand it, it could be updated after rcu_read_unlock().

> +})

Ilya Dryomov March 23, 2016, 7:27 p.m. UTC | #11
On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>
>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>
>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>
>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>
>>>>>> [ snip ]
>>>>>>
>>>>>>>> Hi Zheng,
>>>>>>>>
>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>
>>>>>>>
>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>
>>>>>> What about the namespace names, who is generating them, how long are
>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>> for both data and metadata pools.
>>>>>
>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>
>>>>> // create auth caps for user foo.
>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>
>>>>> // mount cephfs by user admin
>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>
>>>>> // set directory’s layout.pool_namespace
>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>
>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>
>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>> number) is sensible then.
>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>
>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>
>>>>>>> RBD may use namespace later.
>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>
>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>> in-memory rbd image header.
>>>>>>
>>>>>>>
>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>> ceph_osdc_xxx_request().
>>>>>>
>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>
>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>
>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>> the code base a simplification, but, more importantly, is keeping the
>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>
>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>
>> I have a bit of trouble parsing "block new read/write, for in-progress
>> read/write".  So the client will stop issuing requests as soon as it
>> learns that it no longer has a cap, but what happens with the in-flight
>> requests?
>
> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>
>>
>>>
>>>>>>
>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>> tree.
>>>>>
>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>> string.
>>>>
>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>> and it'll probably be required that all OSD requests set one of the
>>>> callbacks, except for stateless fire-and-forget ones.
>>>
>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>
>>>>
>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>> it is entire layouts and not just namespaces that are shared across the
>>>> directory tree.  If you think reference counted pool_ns strings are
>>>> better, I won't argue with that, but, with cephfs being the only user
>>>> of either solution, it'll have to live in fs/ceph.
>>>
>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>
>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>> ownership rules, we will go with embedding namespace names by value into
>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>
>>>
>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>
>> So you are maintaining that all that is needed is to keep the memory
>> valid and there is no locking around installing a new namespace for an
>> inode.  I didn't realize that when I suggested layout sharing, it makes
>> it much less attractive.
>
> Yes.  That’s the main reason I decided to use RCU.

For the record, I don't think it's a good design and I doubt the
implementation is going to work reliably, but that's your call.

Why would embedded namespaces in ceph_object_locator in libceph be
a nightmare for cephfs?  What do you refer to as a temporary buffer?
This kind of copying already occurs: you grab a ceph_string with
ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
the request message, as part of encoding.  How is grabbing ceph_string
before calling into libceph and explicitly copying into object locator
different?

Thanks,

                Ilya
Yan, Zheng March 24, 2016, 1:39 a.m. UTC | #12
> On Mar 23, 2016, at 23:39, Douglas Fuller <dfuller@redhat.com> wrote:
> 
> 
>> +#define ceph_try_get_string(x)					\
>> +({								\
>> +	struct ceph_string *___str;				\
>> +	rcu_read_lock();					\
>> +	___str = rcu_dereference(x);				\
>> +	if (___str && !kref_get_unless_zero(&___str->kref))	\
>> +		___str = NULL;					\
>> +	rcu_read_unlock();					\
>> +	(___str);						\
> 
> Isn’t this an unsafe use of __str? As I understand it, it could be updated after rcu_read_unlock().

Where does it get updated?
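
The key property of the macro quoted above is that a successful kref_get_unless_zero() inside the RCU read-side section pins the string, so after rcu_read_unlock() only the RCU-managed pointer can be replaced, never the object the caller got back. A minimal usage sketch (example_consume() is a hypothetical consumer):

static void example_use_pool_ns(struct ceph_inode_info *ci)
{
        struct ceph_string *ns;

        ns = ceph_try_get_string(ci->i_layout.pool_ns);  /* may return NULL */
        if (!ns)
                return;

        /*
         * Safe to dereference ns here even if another thread installs a
         * new pool_ns concurrently: we took our own reference inside the
         * RCU read-side section, so the object cannot be freed under us.
         */
        example_consume(ns->str, ns->len);

        ceph_put_string(ns);    /* drop our reference */
}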

Regards
Yan, Zheng

> 
>> +})
> 

Yan, Zheng March 24, 2016, 2:33 a.m. UTC | #13
> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> 
>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>> 
>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>> 
>>>>>>> [ snip ]
>>>>>>> 
>>>>>>>>> Hi Zheng,
>>>>>>>>> 
>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>> 
>>>>>>>> 
>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>> 
>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>> for both data and metadata pools.
>>>>>> 
>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>> 
>>>>>> // create auth caps for user foo.
>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>> 
>>>>>> // mount cephfs by user admin
>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>> 
>>>>>> // set directory’s layout.pool_namespace
>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>> 
>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>> 
>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>> number) is sensible then.
>>>>> 
>>>>>> 
>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>> 
>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>> 
>>>>>>>> RBD may use namespace later.
>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>> 
>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>> in-memory rbd image header.
>>>>>>> 
>>>>>>>> 
>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>> ceph_osdc_xxx_request().
>>>>>>> 
>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>> 
>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>> 
>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>> 
>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>> 
>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>> read/write".  So the client will stop issuing requests as soon as it
>>> learns that it no longer has a cap, but what happens with the in-flight
>>> requests?
>> 
>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>> 
>>> 
>>>> 
>>>>>>> 
>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>> tree.
>>>>>> 
>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>> string.
>>>>> 
>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>> and it'll probably be required that all OSD requests set one of the
>>>>> callbacks, except for stateless fire-and-forget ones.
>>>> 
>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>> 
>>>>> 
>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>> of either solution, it'll have to live in fs/ceph.
>>>> 
>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>> 
>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>> 
>>>> 
>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>> 
>>> So you are maintaining that all that is needed is to keep the memory
>>> valid and there is no locking around installing a new namespace for an
>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>> it much less attractive.
>> 
>> Yes.  That’s the main reason I decided to use RCU.
> 
> For the record, I don't think it's a good design and I doubt the
> implementation is going to work reliably, but that's your call.
> 
> Why would embedded namespaces in ceph_object_locator in libceph be
> a nightmare for cephfs?  What do you refer to as a temporary buffer?
> This kind of copying already occurs: you grab a ceph_string with
> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
> the request message, as part of encoding.  How is grabbing ceph_string
> before calling into libceph and explicitly copying into object locator
> different?

I mean the case of not using a reference-counted ceph_string at all.


Yes, we can make the cephfs code manage the reference counting and call ceph_try_get_string() and ceph_put_string() each time we issue an osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for the ceph_osdc_xxx_request() functions, code that copies the namespace into an embedded buffer, and lots of ceph_try_get_string()/ceph_put_string() pairs in cephfs code).  All of this just removes one ceph_try_get_string() call and two ceph_put_string() calls in libceph. Is it worth the effort, and does it make the libceph code cleaner?

Regards
Yan, Zheng
Ilya Dryomov March 24, 2016, 10:26 a.m. UTC | #14
On Thu, Mar 24, 2016 at 3:33 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>
>>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>
>>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>
>>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>
>>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>
>>>>>>>> [ snip ]
>>>>>>>>
>>>>>>>>>> Hi Zheng,
>>>>>>>>>>
>>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>>>
>>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>>> for both data and metadata pools.
>>>>>>>
>>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>>>
>>>>>>> // create auth caps for user foo.
>>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>>>
>>>>>>> // mount cephfs by user admin
>>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>>>
>>>>>>> // set directory’s layout.pool_namespace
>>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>>>
>>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>>>
>>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>>> number) is sensible then.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>>>
>>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>>>
>>>>>>>>> RBD may use namespace later.
>>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>>>
>>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>>> in-memory rbd image header.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>>> ceph_osdc_xxx_request().
>>>>>>>>
>>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>>>
>>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>>>
>>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>>>
>>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>>>
>>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>>> read/write".  So the client will stop issuing requests as soon as it
>>>> learns that it no longer has a cap, but what happens with the in-flight
>>>> requests?
>>>
>>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>>>
>>>>
>>>>>
>>>>>>>>
>>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>>> tree.
>>>>>>>
>>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>>> string.
>>>>>>
>>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>>> and it'll probably be required that all OSD requests set one of the
>>>>>> callbacks, except for stateless fire-and-forget ones.
>>>>>
>>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>>>
>>>>>>
>>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>>> of either solution, it'll have to live in fs/ceph.
>>>>>
>>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>>>
>>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>>>
>>>>>
>>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>>>
>>>> So you are maintaining that all that is needed is to keep the memory
>>>> valid and there is no locking around installing a new namespace for an
>>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>>> it much less attractive.
>>>
>>> Yes.  That’s the main reason I decided to use RCU.
>>
>> For the record, I don't think it's a good design and I doubt the
>> implementation is going to work reliably, but that's your call.
>>
>> Why would embedded namespaces in ceph_object_locator in libceph be
>> a nightmare for cephfs?  What do you refer to as a temporary buffer?
>> This kind of copying already occurs: you grab a ceph_string with
>> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
>> the request message, as part of encoding.  How is grabbing ceph_string
>> before calling into libceph and explicitly copying into object locator
>> different?
>
> I mean not using reference-counted ceph_string.
>
>
> Yes, we can make cephfs code manage the reference counting, call ceph_try_get_string() and ceph_put_string() each time we issue osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for ceph_osdc_xxx_request() functions, code that copies namespace to embedded buffer, lots of ceph_try_get_string()/ceph_put_string() pair in cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() call in libceph. does it worth the effort, does it make libceph code cleaner?

It'll be printed into the request after it's allocated, just like OID
is now, so e.g. ceph_osdc_alloc_request() won't need any changes.  You
may need to patch ceph_osdc_new_request(), but that's a cephfs-specific
function and should probably be moved out of libceph.  You won't need
to add lots of get/put pairs in cephfs either - a single helper around
filling in an object locator would do it.
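
Such a helper could look roughly like the sketch below, assuming ceph_object_locator grew an embedded pool_ns[] buffer (which is exactly the by-value design under discussion); the helper name and that field are hypothetical:

static void example_fill_oloc(struct ceph_object_locator *oloc,
                              struct ceph_inode_info *ci)
{
        struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);

        if (ns) {
                /* copy by value so libceph never keeps the shared string */
                size_t len = min(ns->len, sizeof(oloc->pool_ns) - 1);

                memcpy(oloc->pool_ns, ns->str, len);
                oloc->pool_ns[len] = '\0';
                ceph_put_string(ns);
        } else {
                oloc->pool_ns[0] = '\0';
        }
}

Each OSD request site in fs/ceph would then call this once instead of open-coding a get/put pair.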

The two disadvantages (extra strcpy() and larger OSD requests) are
real, but so are the benefits of modularity and keeping object locator
a simple value type.  I'm just trying to explore all available options.

TBH I still can't wrap my head around lockless layout/namespace pointer
updates and how that can ever work reliably...

Thanks,

                Ilya
Ilya Dryomov March 24, 2016, 12:19 p.m. UTC | #15
On Thu, Mar 24, 2016 at 11:26 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Thu, Mar 24, 2016 at 3:33 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>
>>> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>
>>> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>
>>>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>
>>>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>
>>>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>
>>>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>
>>>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> [ snip ]
>>>>>>>>>
>>>>>>>>>>> Hi Zheng,
>>>>>>>>>>>
>>>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>>>>
>>>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>>>> for both data and metadata pools.
>>>>>>>>
>>>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>>>>
>>>>>>>> // create auth caps for user foo.
>>>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>>>>
>>>>>>>> // mount cephfs by user admin
>>>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>>>>
>>>>>>>> // set directory’s layout.pool_namespace
>>>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>>>>
>>>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>>>>
>>>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>>>> number) is sensible then.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>>>>
>>>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>>>>
>>>>>>>>>> RBD may use namespace later.
>>>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>>>>
>>>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>>>> in-memory rbd image header.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>>>> ceph_osdc_xxx_request().
>>>>>>>>>
>>>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>>>>
>>>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>>>>
>>>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>>>>
>>>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>>>>
>>>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>>>> read/write".  So the client will stop issuing requests as soon as it
>>>>> learns that it no longer has a cap, but what happens with the in-flight
>>>>> requests?
>>>>
>>>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>>>>
>>>>>
>>>>>>
>>>>>>>>>
>>>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>>>> tree.
>>>>>>>>
>>>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>>>> string.
>>>>>>>
>>>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>>>> and it'll probably be required that all OSD requests set one of the
>>>>>>> callbacks, except for stateless fire-and-forget ones.
>>>>>>
>>>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>>>>
>>>>>>>
>>>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>>>> of either solution, it'll have to live in fs/ceph.
>>>>>>
>>>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>>>>
>>>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>>>>
>>>>>>
>>>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>>>>
>>>>> So you are maintaining that all that is needed is to keep the memory
>>>>> valid and there is no locking around installing a new namespace for an
>>>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>>>> it much less attractive.
>>>>
>>>> Yes.  That’s the main reason I decided to use RCU.
>>>
>>> For the record, I don't think it's a good design and I doubt the
>>> implementation is going to work reliably, but that's your call.
>>>
>>> Why would embedded namespaces in ceph_object_locator in libceph be
>>> a nightmare for cephfs?  What do you refer to as a temporary buffer?
>>> This kind of copying already occurs: you grab a ceph_string with
>>> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
>>> the request message, as part of encoding.  How is grabbing ceph_string
>>> before calling into libceph and explicitly copying into object locator
>>> different?
>>
>> I mean not using reference-counted ceph_string.
>>
>>
>> Yes, we can make cephfs code manage the reference counting, call ceph_try_get_string() and ceph_put_string() each time we issue osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for ceph_osdc_xxx_request() functions, code that copies namespace to embedded buffer, lots of ceph_try_get_string()/ceph_put_string() pair in cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() call in libceph. does it worth the effort, does it make libceph code cleaner?
>
> It'll be printed into the request after it's allocated, just like OID
> is now, so e.g. ceph_osdc_alloc_request() won't need any changes.  You
> may need to patch ceph_osdc_new_request(), but that's a cephfs-specific
> function and should probably be moved out of libceph.  You won't need
> to add lots of get/put pairs in cephfs either - a single helper around
> filling in an object locator would do it.
>
> The two disadvantages (extra strcpy() and larger OSD requests) are
> real, but so are the benefits of modularity and keeping object locator
> a simple value type.  I'm just trying to explore all available options.
>
> TBH I still can't wrap my head around lockless layout/namespace pointer
> updates and how that can ever work reliably...

I also don't get the purpose of making it lockless.  Neither the MDS
client nor the data path is light on locks: there are more than half
a dozen of them, and i_ceph_lock in particular is taken dozens of times,
including in cap management code.  Doesn't that translate into "there is
no way to start an I/O operation w/o i_ceph_lock being involved"?

caps grant/revoke code also happens to be one of the two places where
the namespace is set.  The other is (re-)filling an inode struct and
both of those necessarily revolve around i_ceph_lock.  Could you lay
out the reasoning behind the current approach in more detail?

Thanks,

                Ilya
Yan, Zheng March 24, 2016, 1:18 p.m. UTC | #16
> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Thu, Mar 24, 2016 at 3:33 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> 
>>>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>> 
>>>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>> 
>>>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>>> 
>>>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>> 
>>>>>>>>> [ snip ]
>>>>>>>>> 
>>>>>>>>>>> Hi Zheng,
>>>>>>>>>>> 
>>>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>>>> namespace restricts users to different namespaces in the cephfs data
>>>>>>>>>> pool. (At present, the only way to restrict data access is to create
>>>>>>>>>> multiple cephfs data pools and restrict users to different data pools.
>>>>>>>>>> Creating lots of pools is not efficient for the cluster.)
>>>>>>>>> 
>>>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>>>> for both data and metadata pools.
>>>>>>>> 
>>>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>>>> 
>>>>>>>> // create auth caps for user foo.
>>>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns'
>>>>>>>> 
>>>>>>>> // mount cephfs by user admin
>>>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>>>> 
>>>>>>>> // set directory’s layout.pool_namespace
>>>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>>>> 
>>>>>>>> The admin user chooses the namespace name. In most cases, the namespace name does not change.
>>>>>>> 
>>>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>>>> number) is sensible then.
>>>>>>> 
>>>>>>>> 
>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>>>> 
>>>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>>>> 
>>>>>>>>>> RBD may use namespace later.
>>>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>>>> 
>>>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>>>> in-memory rbd image header.
>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>>>> ceph_osdc_xxx_request().
>>>>>>>>> 
>>>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>>>> 
>>>>>>>> Yes.  But not taking the lock simplifies the code a lot.  We don't need to
>>>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>>>> 
>>>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>>>> 
>>>>>> There is synchronisation. When changing a file layout, the MDS revokes Frw caps from the client (blocking new reads/writes and waiting for in-progress ones). But this synchronisation is not completely reliable when the client session state toggles between stale and active.
>>>>> 
>>>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>>>> read/write".  So the client will stop issuing requests as soon as it
>>>>> learns that it no longer has a cap, but what happens with the in-flight
>>>>> requests?
>>>> 
>>>> When the client knows the MDS is revoking Frw caps, it stops issuing new requests and waits for in-flight requests. After all in-flight requests complete, the client releases the Frw caps to the MDS.
>>>> 
>>>>> 
>>>>>> 
>>>>>>>>> 
>>>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>>>> tree.
>>>>>>>> 
>>>>>>>> This means cephfs needs to set r_callback for every ceph_osd_request, and
>>>>>>>> ceph_osd_request also needs a new field to store the layout pointer.
>>>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>>>> string.
>>>>>>> 
>>>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>>>> and it'll probably be required that all OSD requests set one of the
>>>>>>> callbacks, except for stateless fire-and-forget ones.
>>>>>> 
>>>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>>>> 
>>>>>>> 
>>>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>>>> of either solution, it'll have to live in fs/ceph.
>>>>>> 
>>>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>>>> 
>>>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>>>> 
>>>>>> 
>>>>>> As I stated in a previous mail, an embedded namespace is a nightmare for cephfs. Every time the namespace is used, cephfs needs to lock i_ceph_lock and copy the namespace to a temporary buffer.
>>>>> 
>>>>> So you are maintaining that all that is needed is to keep the memory
>>>>> valid and there is no locking around installing a new namespace for an
>>>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>>>> it much less attractive.
>>>> 
>>>> Yes.  That’s the main reason I decided to use RCU.
>>> 
>>> For the record, I don't think it's a good design and I doubt the
>>> implementation is going to work reliably, but that's your call.
>>> 
>>> Why would embedded namespaces in ceph_object_locator in libceph be
>>> a nightmare for cephfs?  What do you refer to as a temporary buffer?
>>> This kind of copying already occurs: you grab a ceph_string with
>>> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
>>> the request message, as part of encoding.  How is grabbing ceph_string
>>> before calling into libceph and explicitly copying into object locator
>>> different?
>> 
>> I mean not using reference-counted ceph_string.
>> 
>> 
>> Yes, we can make the cephfs code manage the reference counting and call ceph_try_get_string() and ceph_put_string() each time we issue an osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for the ceph_osdc_xxx_request() functions, code that copies the namespace into the embedded buffer, lots of ceph_try_get_string()/ceph_put_string() pairs in cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() calls in libceph. Is it worth the effort? Does it make libceph code cleaner?
> 
> It'll be printed into the request after it's allocated, just like OID
> is now, so e.g. ceph_osdc_alloc_request() won't need any changes.  You
> may need to patch ceph_osdc_new_request(), but that's a cephfs-specific
> function and should probably be moved out of libceph.  You won't need
> to add lots of get/put pairs in cephfs either - a single helper around
> filling in an object locator would do it.
> 
> The two disadvantages (extra strcpy() and larger OSD requests) are
> real, but so are the benefits of modularity and keeping object locator
> a simple value type.  I'm just trying to explore all available options.

I agree that it’s easy for us to explore both options if we move ceph_osdc_new_request(), ceph_osdc_readpages() and ceph_osdc_writepages() into the fs/ceph directory.
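
For illustration, a minimal sketch of the "single helper around filling in an object locator" idea from the quoted paragraph, assuming a hypothetical pool_ns[] buffer embedded by value in struct ceph_object_locator and an illustrative ci->i_layout.pool_id field (neither is part of the posted patch):

/* Sketch only: fill a by-value object locator from the inode's layout. */
static void ceph_oloc_from_inode(struct ceph_object_locator *oloc,
				 struct ceph_inode_info *ci)
{
	struct ceph_string *ns;

	oloc->pool = ci->i_layout.pool_id;	/* illustrative field name */

	/* Pin the string so its bytes stay valid while we copy them. */
	ns = ceph_try_get_string(ci->i_layout.pool_ns);
	if (ns) {
		strlcpy(oloc->pool_ns, ns->str, sizeof(oloc->pool_ns));
		ceph_put_string(ns);
	} else {
		oloc->pool_ns[0] = '\0';
	}
}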

> 
> TBH I still can't wrap my head around lockless layout/namespace pointer
> updates and how that can ever work reliably…

see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.

Regards
Yan, Zheng


> Thanks,
> 
>                Ilya

Ilya Dryomov March 24, 2016, 1:55 p.m. UTC | #17
On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:

[ snip ]

>> TBH I still can't wrap my head around lockless layout/namespace pointer
>> updates and how that can ever work reliably…
>
> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.

It's the higher-level cephfs code and installing new namespaces in the
middle of whatever else that might be going on that I'm worried about.
See my other mail, I'd like to hear what exactly is being achieved with
this approach and have you considered reusing i_ceph_lock (or others,
of which there are plenty).

Thanks,

                Ilya
Douglas Fuller March 24, 2016, 3:30 p.m. UTC | #18
> On Mar 23, 2016, at 7:39 PM, Yan, Zheng <zyan@redhat.com> wrote:
> 
> 
>> On Mar 23, 2016, at 23:39, Douglas Fuller <dfuller@redhat.com> wrote:
>> 
>> 
>>> +#define ceph_try_get_string(x)					\
>>> +({								\
>>> +	struct ceph_string *___str;				\
>>> +	rcu_read_lock();					\
>>> +	___str = rcu_dereference(x);				\
>>> +	if (___str && !kref_get_unless_zero(&___str->kref))	\
>>> +		___str = NULL;					\
>>> +	rcu_read_unlock();					\
>>> +	(___str);						\
>> 
>> Isn’t this an unsafe use of __str? As I understand it, it could be updated after rcu_read_unlock().
> 
> where does it get updated?

By a later RCU update. As I understand it, an rcu_dereference()d pointer is only guaranteed stable inside rcu_read_lock()/rcu_read_unlock().
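
For illustration, here is a function-form sketch of the ceph_try_get_string() macro with the lifetime reasoning spelled out in comments (the function name and the __rcu parameter are only for this sketch):

static struct ceph_string *ceph_try_get_string_sketch(struct ceph_string __rcu **strp)
{
	struct ceph_string *str;

	rcu_read_lock();
	str = rcu_dereference(*strp);		/* snapshot of the published pointer */
	if (str && !kref_get_unless_zero(&str->kref))
		str = NULL;			/* raced with the final put */
	rcu_read_unlock();

	/*
	 * The RCU-protected slot may indeed be repointed at any time after
	 * rcu_read_unlock(), but the object we took a kref on cannot go
	 * away: ceph_release_string() only runs when the refcount drops to
	 * zero, and it defers kfree() through call_rcu().  The caller must
	 * balance this with ceph_put_string().
	 */
	return str;
}
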

> 
> Regards
> Yan, Zheng
> 
>> 
>>> +})
>> 
> 

Yan, Zheng March 25, 2016, 3:05 a.m. UTC | #19
> On Mar 24, 2016, at 21:55, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> [ snip ]
> 
>>> TBH I still can't wrap my head around lockless layout/namespace pointer
>>> updates and how that can ever work reliably…
>> 
>> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
> 
> It's the higher-level cephfs code and installing new namespaces in the
> middle of whatever else that might be going on that I'm worried about.
> See my other mail, I'd like to hear what exactly is being achieved with
> this approach and have you considered reusing i_ceph_lock (or others,
> of which there are plenty).

Updating the namespace pointer is protected by i_ceph_lock. For read access to the namespace, i_ceph_lock does not provide any extra guarantee. This approach provides lockless access to the namespace, which is convenient for functions that know nothing about i_ceph_lock.
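
A minimal sketch of that update side, assuming pool_ns is an RCU-managed ceph_string pointer inside ci->i_layout and that the helper name is made up for illustration:

/*
 * Sketch only: install a new namespace; new_ns's reference (may be NULL)
 * is handed over to the inode.
 */
static void ceph_set_pool_ns(struct ceph_inode_info *ci,
			     struct ceph_string *new_ns)
{
	struct ceph_string *old_ns;

	spin_lock(&ci->i_ceph_lock);
	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
				lockdep_is_held(&ci->i_ceph_lock));
	/* Publish the new namespace for the lockless readers. */
	rcu_assign_pointer(ci->i_layout.pool_ns, new_ns);
	spin_unlock(&ci->i_ceph_lock);

	/* Drop the reference the inode held on the old string. */
	ceph_put_string(old_ns);
}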

Regards
Yan, Zheng

> Thanks,
> 
>                Ilya

Ilya Dryomov March 28, 2016, 7:37 p.m. UTC | #20
On Fri, Mar 25, 2016 at 4:05 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 24, 2016, at 21:55, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> [ snip ]
>>
>>>> TBH I still can't wrap my head around lockless layout/namespace pointer
>>>> updates and how that can ever work reliably…
>>>
>>> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
>>
>> It's the higher-level cephfs code and installing new namespaces in the
>> middle of whatever else that might be going on that I'm worried about.
>> See my other mail, I'd like to hear what exactly is being achieved with
>> this approach and have you considered reusing i_ceph_lock (or others,
>> of which there are plenty).
>
> Updating the namespace pointer is protected by i_ceph_lock. For read
> access to the namespace, i_ceph_lock does not provide any extra
> guarantee. This approach provides lockless access to the namespace,
> which is convenient for functions that know nothing about i_ceph_lock.

Could you name some of these functions?

Thanks,

                Ilya
Yan, Zheng March 29, 2016, 2:29 a.m. UTC | #21
> On Mar 29, 2016, at 03:37, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Fri, Mar 25, 2016 at 4:05 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 24, 2016, at 21:55, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> [ snip ]
>>> 
>>>>> TBH I still can't wrap my head around lockless layout/namespace pointer
>>>>> updates and how that can ever work reliably…
>>>> 
>>>> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
>>> 
>>> It's the higher-level cephfs code and installing new namespaces in the
>>> middle of whatever else that might be going on that I'm worried about.
>>> See my other mail, I'd like to hear what exactly is being achieved with
>>> this approach and have you considered reusing i_ceph_lock (or others,
>>> of which there are plenty).
>> 
>> Updating the namespace pointer is protected by i_ceph_lock. For read
>> access to the namespace, i_ceph_lock does not provide any extra
>> guarantee. This approach provides lockless access to the namespace,
>> which is convenient for functions that know nothing about i_ceph_lock.
> 
> Could you name some of these functions?

ceph_osdc_{new_request,readpages,writepages}
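
For illustration, the lockless read side in those functions boils down to something like the sketch below: pin the string with ceph_try_get_string(), copy it while encoding the request, then put it.  The layout->pool_ns field and the destination buffer are assumptions of this sketch:

static size_t copy_pool_ns(struct ceph_file_layout *layout,
			   char *dst, size_t dst_len)
{
	struct ceph_string *ns;
	size_t len = 0;

	/* No i_ceph_lock needed: only a reference on the string itself. */
	ns = ceph_try_get_string(layout->pool_ns);
	if (ns) {
		len = min(ns->len, dst_len);
		memcpy(dst, ns->str, len);
		ceph_put_string(ns);
	}
	return len;
}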

> 
> Thanks,
> 
>                Ilya

Yan, Zheng March 29, 2016, 9:56 a.m. UTC | #22
Here is code that shares i_layout and embeds the namespace in the osd_request.  But the code is more complex and less efficient. I strongly doubt it’s a good design.

https://github.com/ceph/ceph-client/commits/wip-file-layout2

Regards
Yan, Zheng

Ilya Dryomov March 29, 2016, 11:18 a.m. UTC | #23
On Tue, Mar 29, 2016 at 11:56 AM, Yan, Zheng <zyan@redhat.com> wrote:
> Here is code that shares i_layout and embeds the namespace in the osd_request.  But the code is more complex and less efficient. I strongly doubt it’s a good design.
>
> https://github.com/ceph/ceph-client/commits/wip-file-layout2

When I suggested it, I assumed namespace updates were more synchronous
than they actually are.  I said in one of my previous emails a few days
ago that it seemed less attractive, at least compared to the posted
patches, so I agree.

The string-sharing approach reaches deeply into osd_client though.
I have a *big* set of changes in the works for it - let me try to finish
it, prepare the ground for reference-counting for object locators and
ping you in a couple of weeks?  Discussing particulars like locking and
RCU usage should be easier then.

Thanks,

                Ilya
diff mbox

Patch

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index e7975e4..8e9868d 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -21,6 +21,7 @@ 
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
 
 /*
  * mount options
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644
index 0000000..427b6f9
--- /dev/null
+++ b/include/linux/ceph/string_table.h
@@ -0,0 +1,58 @@ 
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+	struct kref kref;
+	union {
+		struct rb_node node;
+		struct rcu_head rcu;
+	};
+	size_t len;
+	char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+						      size_t len);
+extern void ceph_string_cleanup(void);
+
+static inline void ceph_get_string(struct ceph_string *str)
+{
+	kref_get(&str->kref);
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+	if (!str)
+		return;
+	kref_put(&str->kref, ceph_release_string);
+}
+
+static inline int ceph_compare_string(struct ceph_string *cs,
+				      const char* str, size_t len)
+{
+	size_t cs_len = cs ? cs->len : 0;
+	if (cs_len != len)
+		return cs_len - len;
+	if (len == 0)
+		return 0;
+	return strncmp(cs->str, str, len);
+}
+
+#define ceph_try_get_string(x)					\
+({								\
+	struct ceph_string *___str;				\
+	rcu_read_lock();					\
+	___str = rcu_dereference(x);				\
+	if (___str && !kref_get_unless_zero(&___str->kref))	\
+		___str = NULL;					\
+	rcu_read_unlock();					\
+	(___str);						\
+})
+
+#endif
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 958d9856..84cbed6 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@  libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	crypto.o armor.o \
 	auth_x.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o \
-	pagevec.o snapshot.o
+	pagevec.o snapshot.o string_table.o
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6..baca649 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -751,6 +751,7 @@  static void __exit exit_ceph_lib(void)
 	ceph_msgr_exit();
 	ceph_crypto_shutdown();
 	ceph_debugfs_cleanup();
+	ceph_string_cleanup();
 }
 
 module_init(init_ceph_lib);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 0000000..bb49bd7
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,121 @@ 
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+	struct ceph_string *cs, *exist;
+	struct rb_node **p, *parent;
+	int ret;
+
+	exist = NULL;
+	spin_lock(&string_tree_lock);
+	p = &string_tree.rb_node;
+	while (*p) {
+		exist = rb_entry(*p, struct ceph_string, node);
+		ret = ceph_compare_string(exist, str, len);
+		if (ret > 0)
+			p = &(*p)->rb_left;
+		else if (ret < 0)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	if (exist && !kref_get_unless_zero(&exist->kref)) {
+		rb_erase(&exist->node, &string_tree);
+		RB_CLEAR_NODE(&exist->node);
+		exist = NULL;
+	}
+	spin_unlock(&string_tree_lock);
+	if (exist)
+		return exist;
+
+	cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+	if (!cs)
+		return NULL;
+
+	kref_init(&cs->kref);
+	cs->len = len;
+	memcpy(cs->str, str, len);
+	cs->str[len] = 0;
+
+retry:
+	exist = NULL;
+	parent = NULL;
+	p = &string_tree.rb_node;
+	spin_lock(&string_tree_lock);
+	while (*p) {
+		parent = *p;
+		exist = rb_entry(*p, struct ceph_string, node);
+		ret = ceph_compare_string(exist, str, len);
+		if (ret > 0)
+			p = &(*p)->rb_left;
+		else if (ret < 0)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	ret = 0;
+	if (!exist) {
+		rb_link_node(&cs->node, parent, p);
+		rb_insert_color(&cs->node, &string_tree);
+	} else if (!kref_get_unless_zero(&exist->kref)) {
+		rb_erase(&exist->node, &string_tree);
+		RB_CLEAR_NODE(&exist->node);
+		ret = -EAGAIN;
+	}
+	spin_unlock(&string_tree_lock);
+	if (ret == -EAGAIN)
+		goto retry;
+
+	if (exist) {
+		kfree(cs);
+		cs = exist;
+	}
+
+	return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+static void ceph_free_string(struct rcu_head *head)
+{
+	struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
+	kfree(cs);
+}
+
+void ceph_release_string(struct kref *ref)
+{
+	struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+	spin_lock(&string_tree_lock);
+	if (!RB_EMPTY_NODE(&cs->node)) {
+		rb_erase(&cs->node, &string_tree);
+		RB_CLEAR_NODE(&cs->node);
+	}
+	spin_unlock(&string_tree_lock);
+
+	call_rcu(&cs->rcu, ceph_free_string);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+void ceph_string_cleanup(void)
+{
+	struct rb_node *p;
+	struct ceph_string *cs;
+	if (RB_EMPTY_ROOT(&string_tree))
+		return;
+
+	pr_err("libceph: detect string leaks\n");
+	while ((p = rb_first(&string_tree))) {
+		cs = rb_entry(p, struct ceph_string, node);
+		rb_erase(p, &string_tree);
+		kfree(cs);
+	}
+}