
[v2,2/5] ceph: periodically send perf metrics to ceph

Message ID 1592481599-7851-3-git-send-email-xiubli@redhat.com (mailing list archive)
State New, archived
Series ceph: periodically send perf metrics to ceph

Commit Message

Xiubo Li June 18, 2020, 11:59 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

This will send the caps/read/write/metadata metrics to any available
MDS once per second by default, matching the userland client, or every
metric_send_interval seconds, where metric_send_interval is a module
parameter.

URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c         |   3 +
 fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
 fs/ceph/super.c              |  49 ++++++++++++++++
 fs/ceph/super.h              |   2 +
 include/linux/ceph/ceph_fs.h |   1 +
 6 files changed, 267 insertions(+)

Comments

Jeffrey Layton June 18, 2020, 2:42 p.m. UTC | #1
On Thu, 2020-06-18 at 07:59 -0400, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> This will send the caps/read/write/metadata metrics to any available
> MDS only once per second as default, which will be the same as the
> userland client, or every metric_send_interval seconds, which is a
> module parameter.
> 
> URL: https://tracker.ceph.com/issues/43215
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c         |   3 +
>  fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
>  fs/ceph/super.c              |  49 ++++++++++++++++
>  fs/ceph/super.h              |   2 +
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 267 insertions(+)
> 
> 

I think 3/5 needs to be moved ahead of this one or folded into it, as we'll
have a temporary regression otherwise.

> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index c9784eb1..5f409dd 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -27,6 +27,9 @@
>  #include <linux/ceph/auth.h>
>  #include <linux/ceph/debugfs.h>
>  
> +static DEFINE_MUTEX(ceph_fsc_lock);
> +static LIST_HEAD(ceph_fsc_list);
> +
>  /*
>   * Ceph superblock operations
>   *
> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>  	if (!fsc->wb_pagevec_pool)
>  		goto fail_cap_wq;
>  
> +	mutex_lock(&ceph_fsc_lock);
> +	list_add_tail(&fsc->list, &ceph_fsc_list);
> +	mutex_unlock(&ceph_fsc_lock);
> +
>  	return fsc;
>  
>  fail_cap_wq:
> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>  {
>  	dout("destroy_fs_client %p\n", fsc);
>  
> +	mutex_lock(&ceph_fsc_lock);
> +	list_del(&fsc->list);
> +	mutex_unlock(&ceph_fsc_lock);
> +
>  	ceph_mdsc_destroy(fsc);
>  	destroy_workqueue(fsc->inode_wq);
>  	destroy_workqueue(fsc->cap_wq);
> @@ -1282,6 +1293,44 @@ static void __exit exit_ceph(void)
>  	destroy_caches();
>  }
>  
> +static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
> +{
> +	struct ceph_fs_client *fsc;
> +	unsigned int interval;
> +	int ret;
> +
> +	ret = kstrtouint(val, 0, &interval);
> +	if (ret < 0) {
> +		pr_err("Failed to parse metric interval '%s'\n", val);
> +		return ret;
> +	}
> +
> +	if (interval > 5) {
> +		pr_err("Invalid metric interval %u\n", interval);
> +		return -EINVAL;
> +	}
> +

Why do we want to reject an interval larger than 5s? Is that problematic
for some reason? In any case, it would be good to replace this with a
#defined constant that describes what that value represents.
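
For instance, something along these lines would do (the constant name
here is only an illustration, not from the posted patch):

	#define CEPH_METRIC_MAX_SEND_INTERVAL	5	/* seconds */

	if (interval > CEPH_METRIC_MAX_SEND_INTERVAL) {
		pr_err("Invalid metric interval %u\n", interval);
		return -EINVAL;
	}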

> +	metric_send_interval = interval;
> +
> +	// wake up all the mds clients
> +	mutex_lock(&ceph_fsc_lock);
> +	list_for_each_entry(fsc, &ceph_fsc_list, list) {
> +		metric_schedule_delayed(&fsc->mdsc->metric);
> +	}
> +	mutex_unlock(&ceph_fsc_lock);
> +
> +	return 0;
> +}
> +
> +static const struct kernel_param_ops param_ops_metric_interval = {
> +	.set = param_set_metric_interval,
> +	.get = param_get_uint,
> +};
> +
> +unsigned int metric_send_interval = 1;
> +module_param_cb(metric_send_interval, &param_ops_metric_interval, &metric_send_interval, 0644);
> +MODULE_PARM_DESC(metric_send_interval, "Interval (in seconds) of sending perf metric to ceph cluster, valid values are 0~5, 0 means disabled (default: 1)");
> +
>  module_init(init_ceph);
>  module_exit(exit_ceph);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 5a6cdd3..05edc9a 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -101,6 +101,8 @@ struct ceph_mount_options {
>  struct ceph_fs_client {
>  	struct super_block *sb;
>  
> +	struct list_head list;
> +
>  	struct ceph_mount_options *mount_options;
>  	struct ceph_client *client;
>  
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index ebf5ba6..455e9b9 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>  #define CEPH_MSG_CLIENT_REQUEST         24
>  #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>  #define CEPH_MSG_CLIENT_REPLY           26
> +#define CEPH_MSG_CLIENT_METRICS         29
>  #define CEPH_MSG_CLIENT_CAPS            0x310
>  #define CEPH_MSG_CLIENT_LEASE           0x311
>  #define CEPH_MSG_CLIENT_SNAP            0x312

Thanks,
Xiubo Li June 19, 2020, 12:37 a.m. UTC | #2
On 2020/6/18 22:42, Jeff Layton wrote:
> On Thu, 2020-06-18 at 07:59 -0400, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> This will send the caps/read/write/metadata metrics to any available
>> MDS only once per second as default, which will be the same as the
>> userland client, or every metric_send_interval seconds, which is a
>> module parameter.
>>
>> URL: https://tracker.ceph.com/issues/43215
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c         |   3 +
>>   fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
>>   fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
>>   fs/ceph/super.c              |  49 ++++++++++++++++
>>   fs/ceph/super.h              |   2 +
>>   include/linux/ceph/ceph_fs.h |   1 +
>>   6 files changed, 267 insertions(+)
>>
>>
> I think 3/5 needs to moved ahead of this one or folded into it, as we'll
> have a temporary regression otherwise.
>
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index c9784eb1..5f409dd 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -27,6 +27,9 @@
>>   #include <linux/ceph/auth.h>
>>   #include <linux/ceph/debugfs.h>
>>   
>> +static DEFINE_MUTEX(ceph_fsc_lock);
>> +static LIST_HEAD(ceph_fsc_list);
>> +
>>   /*
>>    * Ceph superblock operations
>>    *
>> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>   	if (!fsc->wb_pagevec_pool)
>>   		goto fail_cap_wq;
>>   
>> +	mutex_lock(&ceph_fsc_lock);
>> +	list_add_tail(&fsc->list, &ceph_fsc_list);
>> +	mutex_unlock(&ceph_fsc_lock);
>> +
>>   	return fsc;
>>   
>>   fail_cap_wq:
>> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>>   {
>>   	dout("destroy_fs_client %p\n", fsc);
>>   
>> +	mutex_lock(&ceph_fsc_lock);
>> +	list_del(&fsc->list);
>> +	mutex_unlock(&ceph_fsc_lock);
>> +
>>   	ceph_mdsc_destroy(fsc);
>>   	destroy_workqueue(fsc->inode_wq);
>>   	destroy_workqueue(fsc->cap_wq);
>> @@ -1282,6 +1293,44 @@ static void __exit exit_ceph(void)
>>   	destroy_caches();
>>   }
>>   
>> +static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
>> +{
>> +	struct ceph_fs_client *fsc;
>> +	unsigned int interval;
>> +	int ret;
>> +
>> +	ret = kstrtouint(val, 0, &interval);
>> +	if (ret < 0) {
>> +		pr_err("Failed to parse metric interval '%s'\n", val);
>> +		return ret;
>> +	}
>> +
>> +	if (interval > 5) {
>> +		pr_err("Invalid metric interval %u\n", interval);
>> +		return -EINVAL;
>> +	}
>> +
> Why do we want to reject an interval larger than 5s? Is that problematic
> for some reason?

IMO, a larger interval doesn't make much sense; the interval is capped
at 5s to make sure that the ceph side can show the client's real
metrics in time. Is this okay? Or should we use a larger limit?


> In any case, it would be good to replace this with a
> #defined constant that describes what that value represents.

Sure, I will add a macro in the next version.

Thanks,

>> +	metric_send_interval = interval;
>> +
>> +	// wake up all the mds clients
>> +	mutex_lock(&ceph_fsc_lock);
>> +	list_for_each_entry(fsc, &ceph_fsc_list, list) {
>> +		metric_schedule_delayed(&fsc->mdsc->metric);
>> +	}
>> +	mutex_unlock(&ceph_fsc_lock);
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct kernel_param_ops param_ops_metric_interval = {
>> +	.set = param_set_metric_interval,
>> +	.get = param_get_uint,
>> +};
>> +
>> +unsigned int metric_send_interval = 1;
>> +module_param_cb(metric_send_interval, &param_ops_metric_interval, &metric_send_interval, 0644);
>> +MODULE_PARM_DESC(metric_send_interval, "Interval (in seconds) of sending perf metric to ceph cluster, valid values are 0~5, 0 means disabled (default: 1)");
>> +
>>   module_init(init_ceph);
>>   module_exit(exit_ceph);
>>   
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 5a6cdd3..05edc9a 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -101,6 +101,8 @@ struct ceph_mount_options {
>>   struct ceph_fs_client {
>>   	struct super_block *sb;
>>   
>> +	struct list_head list;
>> +
>>   	struct ceph_mount_options *mount_options;
>>   	struct ceph_client *client;
>>   
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index ebf5ba6..455e9b9 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>>   #define CEPH_MSG_CLIENT_REQUEST         24
>>   #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>>   #define CEPH_MSG_CLIENT_REPLY           26
>> +#define CEPH_MSG_CLIENT_METRICS         29
>>   #define CEPH_MSG_CLIENT_CAPS            0x310
>>   #define CEPH_MSG_CLIENT_LEASE           0x311
>>   #define CEPH_MSG_CLIENT_SNAP            0x312
> Thanks,
Ilya Dryomov June 19, 2020, 9:35 a.m. UTC | #3
On Fri, Jun 19, 2020 at 2:38 AM Xiubo Li <xiubli@redhat.com> wrote:
>
> On 2020/6/18 22:42, Jeff Layton wrote:
> > On Thu, 2020-06-18 at 07:59 -0400, xiubli@redhat.com wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> >> This will send the caps/read/write/metadata metrics to any available
> >> MDS only once per second as default, which will be the same as the
> >> userland client, or every metric_send_interval seconds, which is a
> >> module parameter.
> >>
> >> URL: https://tracker.ceph.com/issues/43215
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >>   fs/ceph/mds_client.c         |   3 +
> >>   fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
> >>   fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
> >>   fs/ceph/super.c              |  49 ++++++++++++++++
> >>   fs/ceph/super.h              |   2 +
> >>   include/linux/ceph/ceph_fs.h |   1 +
> >>   6 files changed, 267 insertions(+)
> >>
> >>
> > I think 3/5 needs to moved ahead of this one or folded into it, as we'll
> > have a temporary regression otherwise.
> >
> >> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> >> index c9784eb1..5f409dd 100644
> >> --- a/fs/ceph/super.c
> >> +++ b/fs/ceph/super.c
> >> @@ -27,6 +27,9 @@
> >>   #include <linux/ceph/auth.h>
> >>   #include <linux/ceph/debugfs.h>
> >>
> >> +static DEFINE_MUTEX(ceph_fsc_lock);
> >> +static LIST_HEAD(ceph_fsc_list);
> >> +
> >>   /*
> >>    * Ceph superblock operations
> >>    *
> >> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
> >>      if (!fsc->wb_pagevec_pool)
> >>              goto fail_cap_wq;
> >>
> >> +    mutex_lock(&ceph_fsc_lock);
> >> +    list_add_tail(&fsc->list, &ceph_fsc_list);
> >> +    mutex_unlock(&ceph_fsc_lock);
> >> +
> >>      return fsc;
> >>
> >>   fail_cap_wq:
> >> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
> >>   {
> >>      dout("destroy_fs_client %p\n", fsc);
> >>
> >> +    mutex_lock(&ceph_fsc_lock);
> >> +    list_del(&fsc->list);
> >> +    mutex_unlock(&ceph_fsc_lock);
> >> +
> >>      ceph_mdsc_destroy(fsc);
> >>      destroy_workqueue(fsc->inode_wq);
> >>      destroy_workqueue(fsc->cap_wq);
> >> @@ -1282,6 +1293,44 @@ static void __exit exit_ceph(void)
> >>      destroy_caches();
> >>   }
> >>
> >> +static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
> >> +{
> >> +    struct ceph_fs_client *fsc;
> >> +    unsigned int interval;
> >> +    int ret;
> >> +
> >> +    ret = kstrtouint(val, 0, &interval);
> >> +    if (ret < 0) {
> >> +            pr_err("Failed to parse metric interval '%s'\n", val);
> >> +            return ret;
> >> +    }
> >> +
> >> +    if (interval > 5) {
> >> +            pr_err("Invalid metric interval %u\n", interval);
> >> +            return -EINVAL;
> >> +    }
> >> +
> > Why do we want to reject an interval larger than 5s? Is that problematic
> > for some reason?
>
> IMO, a larger interval doesn't make much sense, to limit the interval
> value in 5s to make sure that the ceph side could show the client real
> metrics in time. Is this okay ? Or should we use a larger limit ?

I wonder if providing the option to tune the interval makes sense
at all then.  Since most clients will be sending their metrics every
second, the MDS may eventually start relying on that in some way.
Would a simple on/off switch, to be used if sending metrics causes
unforeseen trouble, work?

Thanks,

                Ilya
Jeffrey Layton June 19, 2020, 12:22 p.m. UTC | #4
On Fri, 2020-06-19 at 11:35 +0200, Ilya Dryomov wrote:
> On Fri, Jun 19, 2020 at 2:38 AM Xiubo Li <xiubli@redhat.com> wrote:
> > On 2020/6/18 22:42, Jeff Layton wrote:
> > > On Thu, 2020-06-18 at 07:59 -0400, xiubli@redhat.com wrote:
> > > > From: Xiubo Li <xiubli@redhat.com>
> > > > 
> > > > This will send the caps/read/write/metadata metrics to any available
> > > > MDS only once per second as default, which will be the same as the
> > > > userland client, or every metric_send_interval seconds, which is a
> > > > module parameter.
> > > > 
> > > > URL: https://tracker.ceph.com/issues/43215
> > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > ---
> > > >   fs/ceph/mds_client.c         |   3 +
> > > >   fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
> > > >   fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
> > > >   fs/ceph/super.c              |  49 ++++++++++++++++
> > > >   fs/ceph/super.h              |   2 +
> > > >   include/linux/ceph/ceph_fs.h |   1 +
> > > >   6 files changed, 267 insertions(+)
> > > > 
> > > > 
> > > I think 3/5 needs to moved ahead of this one or folded into it, as we'll
> > > have a temporary regression otherwise.
> > > 
> > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > index c9784eb1..5f409dd 100644
> > > > --- a/fs/ceph/super.c
> > > > +++ b/fs/ceph/super.c
> > > > @@ -27,6 +27,9 @@
> > > >   #include <linux/ceph/auth.h>
> > > >   #include <linux/ceph/debugfs.h>
> > > > 
> > > > +static DEFINE_MUTEX(ceph_fsc_lock);
> > > > +static LIST_HEAD(ceph_fsc_list);
> > > > +
> > > >   /*
> > > >    * Ceph superblock operations
> > > >    *
> > > > @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
> > > >      if (!fsc->wb_pagevec_pool)
> > > >              goto fail_cap_wq;
> > > > 
> > > > +    mutex_lock(&ceph_fsc_lock);
> > > > +    list_add_tail(&fsc->list, &ceph_fsc_list);
> > > > +    mutex_unlock(&ceph_fsc_lock);
> > > > +
> > > >      return fsc;
> > > > 
> > > >   fail_cap_wq:
> > > > @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
> > > >   {
> > > >      dout("destroy_fs_client %p\n", fsc);
> > > > 
> > > > +    mutex_lock(&ceph_fsc_lock);
> > > > +    list_del(&fsc->list);
> > > > +    mutex_unlock(&ceph_fsc_lock);
> > > > +
> > > >      ceph_mdsc_destroy(fsc);
> > > >      destroy_workqueue(fsc->inode_wq);
> > > >      destroy_workqueue(fsc->cap_wq);
> > > > @@ -1282,6 +1293,44 @@ static void __exit exit_ceph(void)
> > > >      destroy_caches();
> > > >   }
> > > > 
> > > > +static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
> > > > +{
> > > > +    struct ceph_fs_client *fsc;
> > > > +    unsigned int interval;
> > > > +    int ret;
> > > > +
> > > > +    ret = kstrtouint(val, 0, &interval);
> > > > +    if (ret < 0) {
> > > > +            pr_err("Failed to parse metric interval '%s'\n", val);
> > > > +            return ret;
> > > > +    }
> > > > +
> > > > +    if (interval > 5) {
> > > > +            pr_err("Invalid metric interval %u\n", interval);
> > > > +            return -EINVAL;
> > > > +    }
> > > > +
> > > Why do we want to reject an interval larger than 5s? Is that problematic
> > > for some reason?
> > 
> > IMO, a larger interval doesn't make much sense, to limit the interval
> > value in 5s to make sure that the ceph side could show the client real
> > metrics in time. Is this okay ? Or should we use a larger limit ?
> 

5s may seem like a lot now, but in 5-10 years we might be wishing we had
the ability to change this to something else (e.g.: maybe we find that
stats every 5s cause undue load on the MDS, and need to increase it).

> I wonder if providing the option to tune the interval makes sense
> at all then.  Since most clients will be sending their metrics every
> second, the MDS may eventually start relying on that in some way.
> Would a simple on/off switch, to be used if sending metrics causes
> unforeseen trouble, work?
> 

Yeah, I'm leery of grabbing values out of our nether regions like this,
and it doesn't seem like making this that configurable has many
benefits.

Perhaps the preferred interval should be something advertised by the MDS
in some way? New field in the mdsmap, perhaps?
Xiubo Li June 22, 2020, 12:54 a.m. UTC | #5
On 2020/6/19 17:35, Ilya Dryomov wrote:
> On Fri, Jun 19, 2020 at 2:38 AM Xiubo Li <xiubli@redhat.com> wrote:
>> On 2020/6/18 22:42, Jeff Layton wrote:
>>> On Thu, 2020-06-18 at 07:59 -0400, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> This will send the caps/read/write/metadata metrics to any available
>>>> MDS only once per second as default, which will be the same as the
>>>> userland client, or every metric_send_interval seconds, which is a
>>>> module parameter.
>>>>
>>>> URL: https://tracker.ceph.com/issues/43215
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/mds_client.c         |   3 +
>>>>    fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
>>>>    fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
>>>>    fs/ceph/super.c              |  49 ++++++++++++++++
>>>>    fs/ceph/super.h              |   2 +
>>>>    include/linux/ceph/ceph_fs.h |   1 +
>>>>    6 files changed, 267 insertions(+)
>>>>
>>>>
>>> I think 3/5 needs to moved ahead of this one or folded into it, as we'll
>>> have a temporary regression otherwise.
>>>
>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>> index c9784eb1..5f409dd 100644
>>>> --- a/fs/ceph/super.c
>>>> +++ b/fs/ceph/super.c
>>>> @@ -27,6 +27,9 @@
>>>>    #include <linux/ceph/auth.h>
>>>>    #include <linux/ceph/debugfs.h>
>>>>
>>>> +static DEFINE_MUTEX(ceph_fsc_lock);
>>>> +static LIST_HEAD(ceph_fsc_list);
>>>> +
>>>>    /*
>>>>     * Ceph superblock operations
>>>>     *
>>>> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>>>       if (!fsc->wb_pagevec_pool)
>>>>               goto fail_cap_wq;
>>>>
>>>> +    mutex_lock(&ceph_fsc_lock);
>>>> +    list_add_tail(&fsc->list, &ceph_fsc_list);
>>>> +    mutex_unlock(&ceph_fsc_lock);
>>>> +
>>>>       return fsc;
>>>>
>>>>    fail_cap_wq:
>>>> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>>>>    {
>>>>       dout("destroy_fs_client %p\n", fsc);
>>>>
>>>> +    mutex_lock(&ceph_fsc_lock);
>>>> +    list_del(&fsc->list);
>>>> +    mutex_unlock(&ceph_fsc_lock);
>>>> +
>>>>       ceph_mdsc_destroy(fsc);
>>>>       destroy_workqueue(fsc->inode_wq);
>>>>       destroy_workqueue(fsc->cap_wq);
>>>> @@ -1282,6 +1293,44 @@ static void __exit exit_ceph(void)
>>>>       destroy_caches();
>>>>    }
>>>>
>>>> +static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
>>>> +{
>>>> +    struct ceph_fs_client *fsc;
>>>> +    unsigned int interval;
>>>> +    int ret;
>>>> +
>>>> +    ret = kstrtouint(val, 0, &interval);
>>>> +    if (ret < 0) {
>>>> +            pr_err("Failed to parse metric interval '%s'\n", val);
>>>> +            return ret;
>>>> +    }
>>>> +
>>>> +    if (interval > 5) {
>>>> +            pr_err("Invalid metric interval %u\n", interval);
>>>> +            return -EINVAL;
>>>> +    }
>>>> +
>>> Why do we want to reject an interval larger than 5s? Is that problematic
>>> for some reason?
>> IMO, a larger interval doesn't make much sense, to limit the interval
>> value in 5s to make sure that the ceph side could show the client real
>> metrics in time. Is this okay ? Or should we use a larger limit ?
> I wonder if providing the option to tune the interval makes sense
> at all then.  Since most clients will be sending their metrics every
> second, the MDS may eventually start relying on that in some way.
> Would a simple on/off switch, to be used if sending metrics causes
> unforeseen trouble, work?

Hi Ilya,

Yeah, this sounds sensible.

Thanks

BRs

>
> Thanks,
>
>                  Ilya
>
Xiubo Li June 22, 2020, 1:19 a.m. UTC | #6
On 2020/6/19 20:22, Jeff Layton wrote:
> On Fri, 2020-06-19 at 11:35 +0200, Ilya Dryomov wrote:
>> On Fri, Jun 19, 2020 at 2:38 AM Xiubo Li <xiubli@redhat.com> wrote:
>>> On 2020/6/18 22:42, Jeff Layton wrote:
>>>> On Thu, 2020-06-18 at 07:59 -0400, xiubli@redhat.com wrote:
>>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>>
>>>>> This will send the caps/read/write/metadata metrics to any available
>>>>> MDS only once per second as default, which will be the same as the
>>>>> userland client, or every metric_send_interval seconds, which is a
>>>>> module parameter.
>>>>>
>>>>> URL: https://tracker.ceph.com/issues/43215
>>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>>> ---
>>>>>    fs/ceph/mds_client.c         |   3 +
>>>>>    fs/ceph/metric.c             | 134 +++++++++++++++++++++++++++++++++++++++++++
>>>>>    fs/ceph/metric.h             |  78 +++++++++++++++++++++++++
>>>>>    fs/ceph/super.c              |  49 ++++++++++++++++
>>>>>    fs/ceph/super.h              |   2 +
>>>>>    include/linux/ceph/ceph_fs.h |   1 +
>>>>>    6 files changed, 267 insertions(+)
>>>>>
>>>>>
>>>> I think 3/5 needs to moved ahead of this one or folded into it, as we'll
>>>> have a temporary regression otherwise.
>>>>
>>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>>> index c9784eb1..5f409dd 100644
>>>>> --- a/fs/ceph/super.c
>>>>> +++ b/fs/ceph/super.c
>>>>> @@ -27,6 +27,9 @@
>>>>>    #include <linux/ceph/auth.h>
>>>>>    #include <linux/ceph/debugfs.h>
>>>>>
>>>>> +static DEFINE_MUTEX(ceph_fsc_lock);
>>>>> +static LIST_HEAD(ceph_fsc_list);
>>>>> +
>>>>>    /*
>>>>>     * Ceph superblock operations
>>>>>     *
>>>>> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>>>>       if (!fsc->wb_pagevec_pool)
>>>>>               goto fail_cap_wq;
>>>>>
>>>>> +    mutex_lock(&ceph_fsc_lock);
>>>>> +    list_add_tail(&fsc->list, &ceph_fsc_list);
>>>>> +    mutex_unlock(&ceph_fsc_lock);
>>>>> +
>>>>>       return fsc;
>>>>>
>>>>>    fail_cap_wq:
>>>>> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>>>>>    {
>>>>>       dout("destroy_fs_client %p\n", fsc);
>>>>>
>>>>> +    mutex_lock(&ceph_fsc_lock);
>>>>> +    list_del(&fsc->list);
>>>>> +    mutex_unlock(&ceph_fsc_lock);
>>>>> +
>>>>>       ceph_mdsc_destroy(fsc);
>>>>>       destroy_workqueue(fsc->inode_wq);
>>>>>       destroy_workqueue(fsc->cap_wq);
>>>>> @@ -1282,6 +1293,44 @@ static void __exit exit_ceph(void)
>>>>>       destroy_caches();
>>>>>    }
>>>>>
>>>>> +static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
>>>>> +{
>>>>> +    struct ceph_fs_client *fsc;
>>>>> +    unsigned int interval;
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = kstrtouint(val, 0, &interval);
>>>>> +    if (ret < 0) {
>>>>> +            pr_err("Failed to parse metric interval '%s'\n", val);
>>>>> +            return ret;
>>>>> +    }
>>>>> +
>>>>> +    if (interval > 5) {
>>>>> +            pr_err("Invalid metric interval %u\n", interval);
>>>>> +            return -EINVAL;
>>>>> +    }
>>>>> +
>>>> Why do we want to reject an interval larger than 5s? Is that problematic
>>>> for some reason?
>>> IMO, a larger interval doesn't make much sense, to limit the interval
>>> value in 5s to make sure that the ceph side could show the client real
>>> metrics in time. Is this okay ? Or should we use a larger limit ?
> 5s may seem like a lot now, but in 5-10 years we might be wishing we had
> the ability to change this to something else (e.g.: maybe we find that
> stats every 5s cause undue load on the MDS, and need to increase it).
>
>> I wonder if providing the option to tune the interval makes sense
>> at all then.  Since most clients will be sending their metrics every
>> second, the MDS may eventually start relying on that in some way.
>> Would a simple on/off switch, to be used if sending metrics causes
>> unforeseen trouble, work?
>>
> Yeah, I'm leery of grabbing values out of our nether regions like this,
> and it doesn't seem like making this that configurable has many
> benefits.

Let's use an on/off switch here.

> Perhaps the preferred interval should be something advertised by the MDS
> in some way? New field in the mdsmap, perhaps?

For this, maybe we could in future add an MDS-side or global option in
ceph, or a parameter in the MDS perf query interface based on PR [1].

For now the on/off switch should be good enough.
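
Roughly something like the following (just a sketch; the parameter
name, default and description here are made up and not final):

	static bool disable_send_metrics;
	module_param(disable_send_metrics, bool, 0644);
	MODULE_PARM_DESC(disable_send_metrics, "Disable sending perf metrics to ceph cluster (default: off)");

A custom .set callback similar to param_set_metric_interval() would
still be needed so that turning the switch back on at runtime kicks
metric_schedule_delayed() for the existing mounts.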

[1] https://github.com/ceph/ceph/pull/29951

Thanks

BRs

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 608fb5c..f996363 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4625,6 +4625,7 @@  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	cancel_work_sync(&mdsc->cap_reclaim_work);
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
 
 	dout("stopped\n");
 }
@@ -4667,6 +4668,7 @@  static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
 	if (mdsc->mdsmap)
 		ceph_mdsmap_destroy(mdsc->mdsmap);
 	kfree(mdsc->sessions);
@@ -4824,6 +4826,7 @@  void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	mutex_unlock(&mdsc->mutex);
 	schedule_delayed(mdsc);
+	metric_schedule_delayed(&mdsc->metric);
 	return;
 
 bad_unlock:
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 9217f35..27cb541 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -1,10 +1,142 @@ 
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/types.h>
 #include <linux/percpu_counter.h>
 #include <linux/math64.h>
 
 #include "metric.h"
+#include "mds_client.h"
+
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *s,
+				   u64 nr_caps)
+{
+	struct ceph_metric_head *head;
+	struct ceph_metric_cap *cap;
+	struct ceph_metric_read_latency *read;
+	struct ceph_metric_write_latency *write;
+	struct ceph_metric_metadata_latency *meta;
+	struct ceph_client_metric *m = &mdsc->metric;
+	struct ceph_msg *msg;
+	struct timespec64 ts;
+	s64 sum, total;
+	s32 items = 0;
+	s32 len;
+
+	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+	      + sizeof(*meta);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+	if (!msg) {
+		pr_err("send metrics to mds%d, failed to allocate message\n",
+		       s->s_mds);
+		return false;
+	}
+
+	head = msg->front.iov_base;
+
+	/* encode the cap metric */
+	cap = (struct ceph_metric_cap *)(head + 1);
+	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+	cap->ver = 1;
+	cap->compat = 1;
+	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
+	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+	cap->total = cpu_to_le64(nr_caps);
+	items++;
+
+	/* encode the read latency metric */
+	read = (struct ceph_metric_read_latency *)(cap + 1);
+	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+	read->ver = 1;
+	read->compat = 1;
+	read->data_len = cpu_to_le32(sizeof(*read) - 10);
+	total = m->total_reads;
+	sum = m->read_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	read->sec = cpu_to_le32(ts.tv_sec);
+	read->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	/* encode the write latency metric */
+	write = (struct ceph_metric_write_latency *)(read + 1);
+	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+	write->ver = 1;
+	write->compat = 1;
+	write->data_len = cpu_to_le32(sizeof(*write) - 10);
+	total = m->total_writes;
+	sum = m->write_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	write->sec = cpu_to_le32(ts.tv_sec);
+	write->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	/* encode the metadata latency metric */
+	meta = (struct ceph_metric_metadata_latency *)(write + 1);
+	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+	meta->ver = 1;
+	meta->compat = 1;
+	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+	total = m->total_metadatas;
+	sum = m->metadata_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	meta->sec = cpu_to_le32(ts.tv_sec);
+	meta->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	put_unaligned_le32(items, &head->num);
+	msg->front.iov_len = cpu_to_le32(len);
+	msg->hdr.version = cpu_to_le16(1);
+	msg->hdr.compat_version = cpu_to_le16(1);
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	dout("client%llu send metrics to mds%d\n",
+	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
+	ceph_con_send(&s->s_con, msg);
+
+	return true;
+}
+
+static void metric_delayed_work(struct work_struct *work)
+{
+	struct ceph_client_metric *m =
+		container_of(work, struct ceph_client_metric, delayed_work.work);
+	struct ceph_mds_client *mdsc =
+		container_of(m, struct ceph_mds_client, metric);
+	struct ceph_mds_session *s;
+	u64 nr_caps = 0;
+	bool ret;
+	int i;
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		nr_caps += s->s_nr_caps;
+		ceph_put_mds_session(s);
+	}
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		if (!check_session_state(mdsc, s)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		/* Only send the metric once in any available session */
+		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
+		ceph_put_mds_session(s);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	metric_schedule_delayed(&mdsc->metric);
+}
 
 int ceph_metric_init(struct ceph_client_metric *m)
 {
@@ -51,6 +183,8 @@  int ceph_metric_init(struct ceph_client_metric *m)
 	m->total_metadatas = 0;
 	m->metadata_latency_sum = 0;
 
+	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
+
 	return 0;
 
 err_i_caps_mis:
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index ccd8128..2af9e0b 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -6,6 +6,71 @@ 
 #include <linux/percpu_counter.h>
 #include <linux/ktime.h>
 
+extern unsigned int metric_send_interval;
+
+enum ceph_metric_type {
+	CLIENT_METRIC_TYPE_CAP_INFO,
+	CLIENT_METRIC_TYPE_READ_LATENCY,
+	CLIENT_METRIC_TYPE_WRITE_LATENCY,
+	CLIENT_METRIC_TYPE_METADATA_LATENCY,
+	CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/* metric caps header */
+struct ceph_metric_cap {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __packed;
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __packed;
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __packed;
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __packed;
+
+struct ceph_metric_head {
+	__le32 num;	/* the number of metrics that will be sent */
+} __packed;
+
 /* This is the global metrics */
 struct ceph_client_metric {
 	atomic64_t            total_dentries;
@@ -35,8 +100,21 @@  struct ceph_client_metric {
 	ktime_t metadata_latency_sq_sum;
 	ktime_t metadata_latency_min;
 	ktime_t metadata_latency_max;
+
+	struct delayed_work delayed_work;  /* delayed work */
 };
 
+static inline void metric_schedule_delayed(struct ceph_client_metric *m)
+{
+	/* per second as default */
+	unsigned int hz = round_jiffies_relative(HZ * metric_send_interval);
+
+	if (!metric_send_interval)
+		return;
+
+	schedule_delayed_work(&m->delayed_work, hz);
+}
+
 extern int ceph_metric_init(struct ceph_client_metric *m);
 extern void ceph_metric_destroy(struct ceph_client_metric *m);
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c9784eb1..5f409dd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,9 @@ 
 #include <linux/ceph/auth.h>
 #include <linux/ceph/debugfs.h>
 
+static DEFINE_MUTEX(ceph_fsc_lock);
+static LIST_HEAD(ceph_fsc_list);
+
 /*
  * Ceph superblock operations
  *
@@ -691,6 +694,10 @@  static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 	if (!fsc->wb_pagevec_pool)
 		goto fail_cap_wq;
 
+	mutex_lock(&ceph_fsc_lock);
+	list_add_tail(&fsc->list, &ceph_fsc_list);
+	mutex_unlock(&ceph_fsc_lock);
+
 	return fsc;
 
 fail_cap_wq:
@@ -717,6 +724,10 @@  static void destroy_fs_client(struct ceph_fs_client *fsc)
 {
 	dout("destroy_fs_client %p\n", fsc);
 
+	mutex_lock(&ceph_fsc_lock);
+	list_del(&fsc->list);
+	mutex_unlock(&ceph_fsc_lock);
+
 	ceph_mdsc_destroy(fsc);
 	destroy_workqueue(fsc->inode_wq);
 	destroy_workqueue(fsc->cap_wq);
@@ -1282,6 +1293,44 @@  static void __exit exit_ceph(void)
 	destroy_caches();
 }
 
+static int param_set_metric_interval(const char *val, const struct kernel_param *kp)
+{
+	struct ceph_fs_client *fsc;
+	unsigned int interval;
+	int ret;
+
+	ret = kstrtouint(val, 0, &interval);
+	if (ret < 0) {
+		pr_err("Failed to parse metric interval '%s'\n", val);
+		return ret;
+	}
+
+	if (interval > 5) {
+		pr_err("Invalid metric interval %u\n", interval);
+		return -EINVAL;
+	}
+
+	metric_send_interval = interval;
+
+	// wake up all the mds clients
+	mutex_lock(&ceph_fsc_lock);
+	list_for_each_entry(fsc, &ceph_fsc_list, list) {
+		metric_schedule_delayed(&fsc->mdsc->metric);
+	}
+	mutex_unlock(&ceph_fsc_lock);
+
+	return 0;
+}
+
+static const struct kernel_param_ops param_ops_metric_interval = {
+	.set = param_set_metric_interval,
+	.get = param_get_uint,
+};
+
+unsigned int metric_send_interval = 1;
+module_param_cb(metric_send_interval, &param_ops_metric_interval, &metric_send_interval, 0644);
+MODULE_PARM_DESC(metric_send_interval, "Interval (in seconds) of sending perf metric to ceph cluster, valid values are 0~5, 0 means disabled (default: 1)");
+
 module_init(init_ceph);
 module_exit(exit_ceph);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 5a6cdd3..05edc9a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -101,6 +101,8 @@  struct ceph_mount_options {
 struct ceph_fs_client {
 	struct super_block *sb;
 
+	struct list_head list;
+
 	struct ceph_mount_options *mount_options;
 	struct ceph_client *client;
 
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ebf5ba6..455e9b9 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -130,6 +130,7 @@  struct ceph_dir_layout {
 #define CEPH_MSG_CLIENT_REQUEST         24
 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
 #define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_METRICS         29
 #define CEPH_MSG_CLIENT_CAPS            0x310
 #define CEPH_MSG_CLIENT_LEASE           0x311
 #define CEPH_MSG_CLIENT_SNAP            0x312