diff mbox series

[v6,6/9] ceph: periodically send perf metrics to ceph

Message ID 20200210053407.37237-7-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series ceph: add perf metrics support | expand

Commit Message

Xiubo Li Feb. 10, 2020, 5:34 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

Add metric_send_interval module parameter support, the default value
is 0, which means disabled. If non-zero, it will enable the transmission
of the metrics to the ceph cluster periodically, once per
metric_send_interval seconds.

This will send the caps, dentry lease and read/write/metadata perf
metrics to any available MDS only once per metric_send_interval
seconds.

URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c         | 235 +++++++++++++++++++++++++++++++----
 fs/ceph/mds_client.h         |   2 +
 fs/ceph/metric.h             |  76 +++++++++++
 fs/ceph/super.c              |   4 +
 fs/ceph/super.h              |   1 +
 include/linux/ceph/ceph_fs.h |   1 +
 6 files changed, 294 insertions(+), 25 deletions(-)

Comments

Ilya Dryomov Feb. 10, 2020, 3:34 p.m. UTC | #1
On Mon, Feb 10, 2020 at 6:34 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> Add metric_send_interval module parameter support, the default valume
> is 0, means disabled. If none zero it will enable the transmission of
> the metrics to the ceph cluster periodically per metric_send_interval
> seconds.
>
> This will send the caps, dentry lease and read/write/metadata perf
> metrics to any available MDS only once per metric_send_interval
> seconds.
>
> URL: https://tracker.ceph.com/issues/43215
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c         | 235 +++++++++++++++++++++++++++++++----
>  fs/ceph/mds_client.h         |   2 +
>  fs/ceph/metric.h             |  76 +++++++++++
>  fs/ceph/super.c              |   4 +
>  fs/ceph/super.h              |   1 +
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 294 insertions(+), 25 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index d414eded6810..f9a6f95c7941 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>         ceph_force_reconnect(fsc->sb);
>  }
>
> -/*
> - * delayed work -- periodically trim expired leases, renew caps with mds
> - */
> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> +                                  struct ceph_mds_session *s,
> +                                  u64 nr_caps)
> +{
> +       struct ceph_metric_head *head;
> +       struct ceph_metric_cap *cap;
> +       struct ceph_metric_dentry_lease *lease;
> +       struct ceph_metric_read_latency *read;
> +       struct ceph_metric_write_latency *write;
> +       struct ceph_metric_metadata_latency *meta;
> +       struct ceph_msg *msg;
> +       struct timespec64 ts;
> +       s64 sum, total;
> +       s32 items = 0;
> +       s32 len;
> +
> +       if (!mdsc || !s)
> +               return false;
> +
> +       len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read)
> +             + sizeof(*write) + sizeof(*meta);
> +
> +       msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> +       if (!msg) {
> +               pr_err("send metrics to mds%d, failed to allocate message\n",
> +                      s->s_mds);
> +               return false;
> +       }
> +
> +       head = msg->front.iov_base;
> +
> +       /* encode the cap metric */
> +       cap = (struct ceph_metric_cap *)(head + 1);
> +       cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> +       cap->ver = 1;
> +       cap->compat = 1;
> +       cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> +       cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
> +       cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
> +       cap->total = cpu_to_le64(nr_caps);
> +       items++;
> +
> +       dout("cap metric hit %lld, mis %lld, total caps %lld",
> +            le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
> +            le64_to_cpu(cap->total));
> +
> +       /* encode the read latency metric */
> +       read = (struct ceph_metric_read_latency *)(cap + 1);
> +       read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> +       read->ver = 1;
> +       read->compat = 1;
> +       read->data_len = cpu_to_le32(sizeof(*read) - 10);
> +       total = percpu_counter_sum(&mdsc->metric.total_reads),
> +       sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
> +       jiffies_to_timespec64(sum, &ts);
> +       read->sec = cpu_to_le32(ts.tv_sec);
> +       read->nsec = cpu_to_le32(ts.tv_nsec);
> +       items++;
> +       dout("read latency metric total %lld, sum lat %lld", total, sum);
> +
> +       /* encode the write latency metric */
> +       write = (struct ceph_metric_write_latency *)(read + 1);
> +       write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> +       write->ver = 1;
> +       write->compat = 1;
> +       write->data_len = cpu_to_le32(sizeof(*write) - 10);
> +       total = percpu_counter_sum(&mdsc->metric.total_writes),
> +       sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
> +       jiffies_to_timespec64(sum, &ts);
> +       write->sec = cpu_to_le32(ts.tv_sec);
> +       write->nsec = cpu_to_le32(ts.tv_nsec);
> +       items++;
> +       dout("write latency metric total %lld, sum lat %lld", total, sum);
> +
> +       /* encode the metadata latency metric */
> +       meta = (struct ceph_metric_metadata_latency *)(write + 1);
> +       meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> +       meta->ver = 1;
> +       meta->compat = 1;
> +       meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> +       total = percpu_counter_sum(&mdsc->metric.total_metadatas),
> +       sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
> +       jiffies_to_timespec64(sum, &ts);
> +       meta->sec = cpu_to_le32(ts.tv_sec);
> +       meta->nsec = cpu_to_le32(ts.tv_nsec);
> +       items++;
> +       dout("metadata latency metric total %lld, sum lat %lld", total, sum);
> +
> +       /* encode the dentry lease metric */
> +       lease = (struct ceph_metric_dentry_lease *)(meta + 1);
> +       lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
> +       lease->ver = 1;
> +       lease->compat = 1;
> +       lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
> +       lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
> +       lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
> +       lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
> +       items++;
> +       dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
> +            le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
> +            le64_to_cpu(lease->total));
> +
> +       put_unaligned_le32(items, &head->num);
> +       msg->front.iov_len = cpu_to_le32(len);
> +       msg->hdr.version = cpu_to_le16(1);
> +       msg->hdr.compat_version = cpu_to_le16(1);
> +       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> +       dout("send metrics to mds%d %p\n", s->s_mds, msg);
> +       ceph_con_send(&s->s_con, msg);
> +
> +       return true;
> +}
> +
> +#define CEPH_WORK_DELAY_DEF 5
> +static void __schedule_delayed(struct delayed_work *work, int delay)
> +{
> +       unsigned int hz = round_jiffies_relative(HZ * delay);
> +
> +       schedule_delayed_work(work, hz);
> +}
> +
>  static void schedule_delayed(struct ceph_mds_client *mdsc)
>  {
> -       int delay = 5;
> -       unsigned hz = round_jiffies_relative(HZ * delay);
> -       schedule_delayed_work(&mdsc->delayed_work, hz);
> +       __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF);
> +}
> +
> +static void metric_schedule_delayed(struct ceph_mds_client *mdsc)
> +{
> +       /* delay CEPH_WORK_DELAY_DEF seconds when idle */
> +       int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF;
> +
> +       __schedule_delayed(&mdsc->metric_delayed_work, delay);
> +}
> +
> +static bool check_session_state(struct ceph_mds_client *mdsc,
> +                               struct ceph_mds_session *s)
> +{
> +       if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> +               dout("resending session close request for mds%d\n",
> +                               s->s_mds);
> +               request_close_session(mdsc, s);
> +               return false;
> +       }
> +       if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
> +               if (s->s_state == CEPH_MDS_SESSION_OPEN) {
> +                       s->s_state = CEPH_MDS_SESSION_HUNG;
> +                       pr_info("mds%d hung\n", s->s_mds);
> +               }
> +       }
> +       if (s->s_state == CEPH_MDS_SESSION_NEW ||
> +           s->s_state == CEPH_MDS_SESSION_RESTARTING ||
> +           s->s_state == CEPH_MDS_SESSION_REJECTED)
> +               /* this mds is failed or recovering, just wait */
> +               return false;
> +
> +       return true;
>  }
>
> +/*
> + * delayed work -- periodically trim expired leases, renew caps with mds
> + */
>  static void delayed_work(struct work_struct *work)
>  {
>         int i;
> @@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work)
>                 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
>                 if (!s)
>                         continue;
> -               if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> -                       dout("resending session close request for mds%d\n",
> -                            s->s_mds);
> -                       request_close_session(mdsc, s);
> -                       ceph_put_mds_session(s);
> -                       continue;
> -               }
> -               if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
> -                       if (s->s_state == CEPH_MDS_SESSION_OPEN) {
> -                               s->s_state = CEPH_MDS_SESSION_HUNG;
> -                               pr_info("mds%d hung\n", s->s_mds);
> -                       }
> -               }
> -               if (s->s_state == CEPH_MDS_SESSION_NEW ||
> -                   s->s_state == CEPH_MDS_SESSION_RESTARTING ||
> -                   s->s_state == CEPH_MDS_SESSION_REJECTED) {
> -                       /* this mds is failed or recovering, just wait */
> +
> +               if (!check_session_state(mdsc, s)) {
>                         ceph_put_mds_session(s);
>                         continue;
>                 }
> @@ -4164,8 +4300,53 @@ static void delayed_work(struct work_struct *work)
>         schedule_delayed(mdsc);
>  }
>
> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
> +static void metric_delayed_work(struct work_struct *work)
> +{
> +       struct ceph_mds_client *mdsc =
> +               container_of(work, struct ceph_mds_client, metric_delayed_work.work);
> +       struct ceph_mds_session *s;
> +       u64 nr_caps = 0;
> +       bool ret;
> +       int i;
> +
> +       if (!metric_send_interval)
> +               goto idle;
> +
> +       dout("mdsc metric_delayed_work\n");
> +
> +       mutex_lock(&mdsc->mutex);
> +       for (i = 0; i < mdsc->max_sessions; i++) {
> +               s = __ceph_lookup_mds_session(mdsc, i);
> +               if (!s)
> +                       continue;
> +               nr_caps += s->s_nr_caps;
> +               ceph_put_mds_session(s);
> +       }
> +
> +       for (i = 0; i < mdsc->max_sessions; i++) {
> +               s = __ceph_lookup_mds_session(mdsc, i);
> +               if (!s)
> +                       continue;
> +               if (!check_session_state(mdsc, s)) {
> +                       ceph_put_mds_session(s);
> +                       continue;
> +               }
> +
> +               /* Only send the metric once in any available session */
> +               ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
> +               ceph_put_mds_session(s);
> +               if (ret)
> +                       break;
> +       }
> +       mutex_unlock(&mdsc->mutex);
> +
> +idle:
> +       metric_schedule_delayed(mdsc);

Looks like this will schedule metric_delayed_work() every 5 seconds
even if metric_send_interval = 0 (i.e. sending is disabled).  What is
the reason for that?

Thanks,

                Ilya
Xiubo Li Feb. 11, 2020, 1:29 a.m. UTC | #2
On 2020/2/10 23:34, Ilya Dryomov wrote:
> On Mon, Feb 10, 2020 at 6:34 AM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Add metric_send_interval module parameter support, the default valume
>> is 0, means disabled. If none zero it will enable the transmission of
>> the metrics to the ceph cluster periodically per metric_send_interval
>> seconds.
>>
>> This will send the caps, dentry lease and read/write/metadata perf
>> metrics to any available MDS only once per metric_send_interval
>> seconds.
>>
>> URL: https://tracker.ceph.com/issues/43215
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c         | 235 +++++++++++++++++++++++++++++++----
>>   fs/ceph/mds_client.h         |   2 +
>>   fs/ceph/metric.h             |  76 +++++++++++
>>   fs/ceph/super.c              |   4 +
>>   fs/ceph/super.h              |   1 +
>>   include/linux/ceph/ceph_fs.h |   1 +
>>   6 files changed, 294 insertions(+), 25 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index d414eded6810..f9a6f95c7941 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>>          ceph_force_reconnect(fsc->sb);
>>   }
>>
>> -/*
>> - * delayed work -- periodically trim expired leases, renew caps with mds
>> - */
>> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
>> +                                  struct ceph_mds_session *s,
>> +                                  u64 nr_caps)
>> +{
>> +       struct ceph_metric_head *head;
>> +       struct ceph_metric_cap *cap;
>> +       struct ceph_metric_dentry_lease *lease;
>> +       struct ceph_metric_read_latency *read;
>> +       struct ceph_metric_write_latency *write;
>> +       struct ceph_metric_metadata_latency *meta;
>> +       struct ceph_msg *msg;
>> +       struct timespec64 ts;
>> +       s64 sum, total;
>> +       s32 items = 0;
>> +       s32 len;
>> +
>> +       if (!mdsc || !s)
>> +               return false;
>> +
>> +       len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read)
>> +             + sizeof(*write) + sizeof(*meta);
>> +
>> +       msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
>> +       if (!msg) {
>> +               pr_err("send metrics to mds%d, failed to allocate message\n",
>> +                      s->s_mds);
>> +               return false;
>> +       }
>> +
>> +       head = msg->front.iov_base;
>> +
>> +       /* encode the cap metric */
>> +       cap = (struct ceph_metric_cap *)(head + 1);
>> +       cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
>> +       cap->ver = 1;
>> +       cap->compat = 1;
>> +       cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
>> +       cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
>> +       cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
>> +       cap->total = cpu_to_le64(nr_caps);
>> +       items++;
>> +
>> +       dout("cap metric hit %lld, mis %lld, total caps %lld",
>> +            le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
>> +            le64_to_cpu(cap->total));
>> +
>> +       /* encode the read latency metric */
>> +       read = (struct ceph_metric_read_latency *)(cap + 1);
>> +       read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
>> +       read->ver = 1;
>> +       read->compat = 1;
>> +       read->data_len = cpu_to_le32(sizeof(*read) - 10);
>> +       total = percpu_counter_sum(&mdsc->metric.total_reads),
>> +       sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
>> +       jiffies_to_timespec64(sum, &ts);
>> +       read->sec = cpu_to_le32(ts.tv_sec);
>> +       read->nsec = cpu_to_le32(ts.tv_nsec);
>> +       items++;
>> +       dout("read latency metric total %lld, sum lat %lld", total, sum);
>> +
>> +       /* encode the write latency metric */
>> +       write = (struct ceph_metric_write_latency *)(read + 1);
>> +       write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
>> +       write->ver = 1;
>> +       write->compat = 1;
>> +       write->data_len = cpu_to_le32(sizeof(*write) - 10);
>> +       total = percpu_counter_sum(&mdsc->metric.total_writes),
>> +       sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
>> +       jiffies_to_timespec64(sum, &ts);
>> +       write->sec = cpu_to_le32(ts.tv_sec);
>> +       write->nsec = cpu_to_le32(ts.tv_nsec);
>> +       items++;
>> +       dout("write latency metric total %lld, sum lat %lld", total, sum);
>> +
>> +       /* encode the metadata latency metric */
>> +       meta = (struct ceph_metric_metadata_latency *)(write + 1);
>> +       meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
>> +       meta->ver = 1;
>> +       meta->compat = 1;
>> +       meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
>> +       total = percpu_counter_sum(&mdsc->metric.total_metadatas),
>> +       sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
>> +       jiffies_to_timespec64(sum, &ts);
>> +       meta->sec = cpu_to_le32(ts.tv_sec);
>> +       meta->nsec = cpu_to_le32(ts.tv_nsec);
>> +       items++;
>> +       dout("metadata latency metric total %lld, sum lat %lld", total, sum);
>> +
>> +       /* encode the dentry lease metric */
>> +       lease = (struct ceph_metric_dentry_lease *)(meta + 1);
>> +       lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
>> +       lease->ver = 1;
>> +       lease->compat = 1;
>> +       lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
>> +       lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
>> +       lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
>> +       lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
>> +       items++;
>> +       dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
>> +            le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
>> +            le64_to_cpu(lease->total));
>> +
>> +       put_unaligned_le32(items, &head->num);
>> +       msg->front.iov_len = cpu_to_le32(len);
>> +       msg->hdr.version = cpu_to_le16(1);
>> +       msg->hdr.compat_version = cpu_to_le16(1);
>> +       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>> +       dout("send metrics to mds%d %p\n", s->s_mds, msg);
>> +       ceph_con_send(&s->s_con, msg);
>> +
>> +       return true;
>> +}
>> +
>> +#define CEPH_WORK_DELAY_DEF 5
>> +static void __schedule_delayed(struct delayed_work *work, int delay)
>> +{
>> +       unsigned int hz = round_jiffies_relative(HZ * delay);
>> +
>> +       schedule_delayed_work(work, hz);
>> +}
>> +
>>   static void schedule_delayed(struct ceph_mds_client *mdsc)
>>   {
>> -       int delay = 5;
>> -       unsigned hz = round_jiffies_relative(HZ * delay);
>> -       schedule_delayed_work(&mdsc->delayed_work, hz);
>> +       __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF);
>> +}
>> +
>> +static void metric_schedule_delayed(struct ceph_mds_client *mdsc)
>> +{
>> +       /* delay CEPH_WORK_DELAY_DEF seconds when idle */
>> +       int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF;
>> +
>> +       __schedule_delayed(&mdsc->metric_delayed_work, delay);
>> +}
>> +
>> +static bool check_session_state(struct ceph_mds_client *mdsc,
>> +                               struct ceph_mds_session *s)
>> +{
>> +       if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
>> +               dout("resending session close request for mds%d\n",
>> +                               s->s_mds);
>> +               request_close_session(mdsc, s);
>> +               return false;
>> +       }
>> +       if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
>> +               if (s->s_state == CEPH_MDS_SESSION_OPEN) {
>> +                       s->s_state = CEPH_MDS_SESSION_HUNG;
>> +                       pr_info("mds%d hung\n", s->s_mds);
>> +               }
>> +       }
>> +       if (s->s_state == CEPH_MDS_SESSION_NEW ||
>> +           s->s_state == CEPH_MDS_SESSION_RESTARTING ||
>> +           s->s_state == CEPH_MDS_SESSION_REJECTED)
>> +               /* this mds is failed or recovering, just wait */
>> +               return false;
>> +
>> +       return true;
>>   }
>>
>> +/*
>> + * delayed work -- periodically trim expired leases, renew caps with mds
>> + */
>>   static void delayed_work(struct work_struct *work)
>>   {
>>          int i;
>> @@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work)
>>                  struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
>>                  if (!s)
>>                          continue;
>> -               if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
>> -                       dout("resending session close request for mds%d\n",
>> -                            s->s_mds);
>> -                       request_close_session(mdsc, s);
>> -                       ceph_put_mds_session(s);
>> -                       continue;
>> -               }
>> -               if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
>> -                       if (s->s_state == CEPH_MDS_SESSION_OPEN) {
>> -                               s->s_state = CEPH_MDS_SESSION_HUNG;
>> -                               pr_info("mds%d hung\n", s->s_mds);
>> -                       }
>> -               }
>> -               if (s->s_state == CEPH_MDS_SESSION_NEW ||
>> -                   s->s_state == CEPH_MDS_SESSION_RESTARTING ||
>> -                   s->s_state == CEPH_MDS_SESSION_REJECTED) {
>> -                       /* this mds is failed or recovering, just wait */
>> +
>> +               if (!check_session_state(mdsc, s)) {
>>                          ceph_put_mds_session(s);
>>                          continue;
>>                  }
>> @@ -4164,8 +4300,53 @@ static void delayed_work(struct work_struct *work)
>>          schedule_delayed(mdsc);
>>   }
>>
>> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
>> +static void metric_delayed_work(struct work_struct *work)
>> +{
>> +       struct ceph_mds_client *mdsc =
>> +               container_of(work, struct ceph_mds_client, metric_delayed_work.work);
>> +       struct ceph_mds_session *s;
>> +       u64 nr_caps = 0;
>> +       bool ret;
>> +       int i;
>> +
>> +       if (!metric_send_interval)
>> +               goto idle;
>> +
>> +       dout("mdsc metric_delayed_work\n");
>> +
>> +       mutex_lock(&mdsc->mutex);
>> +       for (i = 0; i < mdsc->max_sessions; i++) {
>> +               s = __ceph_lookup_mds_session(mdsc, i);
>> +               if (!s)
>> +                       continue;
>> +               nr_caps += s->s_nr_caps;
>> +               ceph_put_mds_session(s);
>> +       }
>> +
>> +       for (i = 0; i < mdsc->max_sessions; i++) {
>> +               s = __ceph_lookup_mds_session(mdsc, i);
>> +               if (!s)
>> +                       continue;
>> +               if (!check_session_state(mdsc, s)) {
>> +                       ceph_put_mds_session(s);
>> +                       continue;
>> +               }
>> +
>> +               /* Only send the metric once in any available session */
>> +               ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
>> +               ceph_put_mds_session(s);
>> +               if (ret)
>> +                       break;
>> +       }
>> +       mutex_unlock(&mdsc->mutex);
>> +
>> +idle:
>> +       metric_schedule_delayed(mdsc);
> Looks like this will schedule metric_delayed_work() every 5 seconds
> even if metric_send_interval = 0 (i.e. sending is disabled).  What is
> the reason for that?

Hi Ilya,

Previously I folded the metric_delayed_work() into delayed_work(). But
for this version, since the interval is settable, it is hard to calculate
the next schedule delay for that.

When idle, it just loops every 5 seconds. I thought that although this is
not a very graceful approach, it won't introduce too much overhead. If we
do not like this, let's switch it to a completion.

Thanks,


> Thanks,
>
>                  Ilya
>
Ilya Dryomov Feb. 11, 2020, 5:42 p.m. UTC | #3
On Tue, Feb 11, 2020 at 2:30 AM Xiubo Li <xiubli@redhat.com> wrote:
>
> On 2020/2/10 23:34, Ilya Dryomov wrote:
> > On Mon, Feb 10, 2020 at 6:34 AM <xiubli@redhat.com> wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> >> Add metric_send_interval module parameter support, the default valume
> >> is 0, means disabled. If none zero it will enable the transmission of
> >> the metrics to the ceph cluster periodically per metric_send_interval
> >> seconds.
> >>
> >> This will send the caps, dentry lease and read/write/metadata perf
> >> metrics to any available MDS only once per metric_send_interval
> >> seconds.
> >>
> >> URL: https://tracker.ceph.com/issues/43215
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >>   fs/ceph/mds_client.c         | 235 +++++++++++++++++++++++++++++++----
> >>   fs/ceph/mds_client.h         |   2 +
> >>   fs/ceph/metric.h             |  76 +++++++++++
> >>   fs/ceph/super.c              |   4 +
> >>   fs/ceph/super.h              |   1 +
> >>   include/linux/ceph/ceph_fs.h |   1 +
> >>   6 files changed, 294 insertions(+), 25 deletions(-)
> >>
> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> index d414eded6810..f9a6f95c7941 100644
> >> --- a/fs/ceph/mds_client.c
> >> +++ b/fs/ceph/mds_client.c
> >> @@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
> >>          ceph_force_reconnect(fsc->sb);
> >>   }
> >>
> >> -/*
> >> - * delayed work -- periodically trim expired leases, renew caps with mds
> >> - */
> >> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> >> +                                  struct ceph_mds_session *s,
> >> +                                  u64 nr_caps)
> >> +{
> >> +       struct ceph_metric_head *head;
> >> +       struct ceph_metric_cap *cap;
> >> +       struct ceph_metric_dentry_lease *lease;
> >> +       struct ceph_metric_read_latency *read;
> >> +       struct ceph_metric_write_latency *write;
> >> +       struct ceph_metric_metadata_latency *meta;
> >> +       struct ceph_msg *msg;
> >> +       struct timespec64 ts;
> >> +       s64 sum, total;
> >> +       s32 items = 0;
> >> +       s32 len;
> >> +
> >> +       if (!mdsc || !s)
> >> +               return false;
> >> +
> >> +       len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read)
> >> +             + sizeof(*write) + sizeof(*meta);
> >> +
> >> +       msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> >> +       if (!msg) {
> >> +               pr_err("send metrics to mds%d, failed to allocate message\n",
> >> +                      s->s_mds);
> >> +               return false;
> >> +       }
> >> +
> >> +       head = msg->front.iov_base;
> >> +
> >> +       /* encode the cap metric */
> >> +       cap = (struct ceph_metric_cap *)(head + 1);
> >> +       cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> >> +       cap->ver = 1;
> >> +       cap->compat = 1;
> >> +       cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> >> +       cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
> >> +       cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
> >> +       cap->total = cpu_to_le64(nr_caps);
> >> +       items++;
> >> +
> >> +       dout("cap metric hit %lld, mis %lld, total caps %lld",
> >> +            le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
> >> +            le64_to_cpu(cap->total));
> >> +
> >> +       /* encode the read latency metric */
> >> +       read = (struct ceph_metric_read_latency *)(cap + 1);
> >> +       read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> >> +       read->ver = 1;
> >> +       read->compat = 1;
> >> +       read->data_len = cpu_to_le32(sizeof(*read) - 10);
> >> +       total = percpu_counter_sum(&mdsc->metric.total_reads),
> >> +       sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
> >> +       jiffies_to_timespec64(sum, &ts);
> >> +       read->sec = cpu_to_le32(ts.tv_sec);
> >> +       read->nsec = cpu_to_le32(ts.tv_nsec);
> >> +       items++;
> >> +       dout("read latency metric total %lld, sum lat %lld", total, sum);
> >> +
> >> +       /* encode the write latency metric */
> >> +       write = (struct ceph_metric_write_latency *)(read + 1);
> >> +       write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> >> +       write->ver = 1;
> >> +       write->compat = 1;
> >> +       write->data_len = cpu_to_le32(sizeof(*write) - 10);
> >> +       total = percpu_counter_sum(&mdsc->metric.total_writes),
> >> +       sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
> >> +       jiffies_to_timespec64(sum, &ts);
> >> +       write->sec = cpu_to_le32(ts.tv_sec);
> >> +       write->nsec = cpu_to_le32(ts.tv_nsec);
> >> +       items++;
> >> +       dout("write latency metric total %lld, sum lat %lld", total, sum);
> >> +
> >> +       /* encode the metadata latency metric */
> >> +       meta = (struct ceph_metric_metadata_latency *)(write + 1);
> >> +       meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> >> +       meta->ver = 1;
> >> +       meta->compat = 1;
> >> +       meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> >> +       total = percpu_counter_sum(&mdsc->metric.total_metadatas),
> >> +       sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
> >> +       jiffies_to_timespec64(sum, &ts);
> >> +       meta->sec = cpu_to_le32(ts.tv_sec);
> >> +       meta->nsec = cpu_to_le32(ts.tv_nsec);
> >> +       items++;
> >> +       dout("metadata latency metric total %lld, sum lat %lld", total, sum);
> >> +
> >> +       /* encode the dentry lease metric */
> >> +       lease = (struct ceph_metric_dentry_lease *)(meta + 1);
> >> +       lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
> >> +       lease->ver = 1;
> >> +       lease->compat = 1;
> >> +       lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
> >> +       lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
> >> +       lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
> >> +       lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
> >> +       items++;
> >> +       dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
> >> +            le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
> >> +            le64_to_cpu(lease->total));
> >> +
> >> +       put_unaligned_le32(items, &head->num);
> >> +       msg->front.iov_len = cpu_to_le32(len);
> >> +       msg->hdr.version = cpu_to_le16(1);
> >> +       msg->hdr.compat_version = cpu_to_le16(1);
> >> +       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> >> +       dout("send metrics to mds%d %p\n", s->s_mds, msg);
> >> +       ceph_con_send(&s->s_con, msg);
> >> +
> >> +       return true;
> >> +}
> >> +
> >> +#define CEPH_WORK_DELAY_DEF 5
> >> +static void __schedule_delayed(struct delayed_work *work, int delay)
> >> +{
> >> +       unsigned int hz = round_jiffies_relative(HZ * delay);
> >> +
> >> +       schedule_delayed_work(work, hz);
> >> +}
> >> +
> >>   static void schedule_delayed(struct ceph_mds_client *mdsc)
> >>   {
> >> -       int delay = 5;
> >> -       unsigned hz = round_jiffies_relative(HZ * delay);
> >> -       schedule_delayed_work(&mdsc->delayed_work, hz);
> >> +       __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF);
> >> +}
> >> +
> >> +static void metric_schedule_delayed(struct ceph_mds_client *mdsc)
> >> +{
> >> +       /* delay CEPH_WORK_DELAY_DEF seconds when idle */
> >> +       int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF;
> >> +
> >> +       __schedule_delayed(&mdsc->metric_delayed_work, delay);
> >> +}
> >> +
> >> +static bool check_session_state(struct ceph_mds_client *mdsc,
> >> +                               struct ceph_mds_session *s)
> >> +{
> >> +       if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> >> +               dout("resending session close request for mds%d\n",
> >> +                               s->s_mds);
> >> +               request_close_session(mdsc, s);
> >> +               return false;
> >> +       }
> >> +       if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
> >> +               if (s->s_state == CEPH_MDS_SESSION_OPEN) {
> >> +                       s->s_state = CEPH_MDS_SESSION_HUNG;
> >> +                       pr_info("mds%d hung\n", s->s_mds);
> >> +               }
> >> +       }
> >> +       if (s->s_state == CEPH_MDS_SESSION_NEW ||
> >> +           s->s_state == CEPH_MDS_SESSION_RESTARTING ||
> >> +           s->s_state == CEPH_MDS_SESSION_REJECTED)
> >> +               /* this mds is failed or recovering, just wait */
> >> +               return false;
> >> +
> >> +       return true;
> >>   }
> >>
> >> +/*
> >> + * delayed work -- periodically trim expired leases, renew caps with mds
> >> + */
> >>   static void delayed_work(struct work_struct *work)
> >>   {
> >>          int i;
> >> @@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work)
> >>                  struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
> >>                  if (!s)
> >>                          continue;
> >> -               if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> >> -                       dout("resending session close request for mds%d\n",
> >> -                            s->s_mds);
> >> -                       request_close_session(mdsc, s);
> >> -                       ceph_put_mds_session(s);
> >> -                       continue;
> >> -               }
> >> -               if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
> >> -                       if (s->s_state == CEPH_MDS_SESSION_OPEN) {
> >> -                               s->s_state = CEPH_MDS_SESSION_HUNG;
> >> -                               pr_info("mds%d hung\n", s->s_mds);
> >> -                       }
> >> -               }
> >> -               if (s->s_state == CEPH_MDS_SESSION_NEW ||
> >> -                   s->s_state == CEPH_MDS_SESSION_RESTARTING ||
> >> -                   s->s_state == CEPH_MDS_SESSION_REJECTED) {
> >> -                       /* this mds is failed or recovering, just wait */
> >> +
> >> +               if (!check_session_state(mdsc, s)) {
> >>                          ceph_put_mds_session(s);
> >>                          continue;
> >>                  }
> >> @@ -4164,8 +4300,53 @@ static void delayed_work(struct work_struct *work)
> >>          schedule_delayed(mdsc);
> >>   }
> >>
> >> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
> >> +static void metric_delayed_work(struct work_struct *work)
> >> +{
> >> +       struct ceph_mds_client *mdsc =
> >> +               container_of(work, struct ceph_mds_client, metric_delayed_work.work);
> >> +       struct ceph_mds_session *s;
> >> +       u64 nr_caps = 0;
> >> +       bool ret;
> >> +       int i;
> >> +
> >> +       if (!metric_send_interval)
> >> +               goto idle;
> >> +
> >> +       dout("mdsc metric_delayed_work\n");
> >> +
> >> +       mutex_lock(&mdsc->mutex);
> >> +       for (i = 0; i < mdsc->max_sessions; i++) {
> >> +               s = __ceph_lookup_mds_session(mdsc, i);
> >> +               if (!s)
> >> +                       continue;
> >> +               nr_caps += s->s_nr_caps;
> >> +               ceph_put_mds_session(s);
> >> +       }
> >> +
> >> +       for (i = 0; i < mdsc->max_sessions; i++) {
> >> +               s = __ceph_lookup_mds_session(mdsc, i);
> >> +               if (!s)
> >> +                       continue;
> >> +               if (!check_session_state(mdsc, s)) {
> >> +                       ceph_put_mds_session(s);
> >> +                       continue;
> >> +               }
> >> +
> >> +               /* Only send the metric once in any available session */
> >> +               ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
> >> +               ceph_put_mds_session(s);
> >> +               if (ret)
> >> +                       break;
> >> +       }
> >> +       mutex_unlock(&mdsc->mutex);
> >> +
> >> +idle:
> >> +       metric_schedule_delayed(mdsc);
> > Looks like this will schedule metric_delayed_work() every 5 seconds
> > even if metric_send_interval = 0 (i.e. sending is disabled).  What is
> > the reason for that?
>
> Hi Ilya,
>
> Before, I had folded the metric_delayed_work() into delayed_work(). But
> for this version, since the interval is settable, it is hard to calculate
> the next schedule delay for that.
>
> When it is idle it just loops every 5 seconds; I thought that although
> this is not a very graceful approach, it won't introduce too much
> overhead. If we do not like this, let's switch it to a completion.

Take a look at module_param_cb macro.  I think you can provide a
setter and schedule the first work / modify the delay from there.

That said, I'm not sure making the interval configurable is a good
idea.  I'm not saying you need to change anything -- just that if it
was me, I would send these metrics once per tick (i.e. delayed_work)
with an on/off switch and no other tunables.

Thanks,

                Ilya
Xiubo Li Feb. 12, 2020, 8:38 a.m. UTC | #4
On 2020/2/12 1:42, Ilya Dryomov wrote:
> On Tue, Feb 11, 2020 at 2:30 AM Xiubo Li <xiubli@redhat.com> wrote:
>> On 2020/2/10 23:34, Ilya Dryomov wrote:
>>> On Mon, Feb 10, 2020 at 6:34 AM <xiubli@redhat.com> wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> Add metric_send_interval module parameter support; the default value
>>>> is 0, which means disabled. If non-zero, it will enable the
>>>> transmission of the metrics to the ceph cluster periodically, once per
>>>> metric_send_interval seconds.
>>>>
>>>> This will send the caps, dentry lease and read/write/metadata perf
>>>> metrics to any available MDS only once per metric_send_interval
>>>> seconds.
>>>>
>>>> URL: https://tracker.ceph.com/issues/43215
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/mds_client.c         | 235 +++++++++++++++++++++++++++++++----
>>>>    fs/ceph/mds_client.h         |   2 +
>>>>    fs/ceph/metric.h             |  76 +++++++++++
>>>>    fs/ceph/super.c              |   4 +
>>>>    fs/ceph/super.h              |   1 +
>>>>    include/linux/ceph/ceph_fs.h |   1 +
>>>>    6 files changed, 294 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index d414eded6810..f9a6f95c7941 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>>>>           ceph_force_reconnect(fsc->sb);
>>>>    }
>>>>
>>>> -/*
>>>> - * delayed work -- periodically trim expired leases, renew caps with mds
>>>> - */
>>>> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
>>>> +                                  struct ceph_mds_session *s,
>>>> +                                  u64 nr_caps)
>>>> +{
>>>> +       struct ceph_metric_head *head;
>>>> +       struct ceph_metric_cap *cap;
>>>> +       struct ceph_metric_dentry_lease *lease;
>>>> +       struct ceph_metric_read_latency *read;
>>>> +       struct ceph_metric_write_latency *write;
>>>> +       struct ceph_metric_metadata_latency *meta;
>>>> +       struct ceph_msg *msg;
>>>> +       struct timespec64 ts;
>>>> +       s64 sum, total;
>>>> +       s32 items = 0;
>>>> +       s32 len;
>>>> +
>>>> +       if (!mdsc || !s)
>>>> +               return false;
>>>> +
>>>> +       len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read)
>>>> +             + sizeof(*write) + sizeof(*meta);
>>>> +
>>>> +       msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
>>>> +       if (!msg) {
>>>> +               pr_err("send metrics to mds%d, failed to allocate message\n",
>>>> +                      s->s_mds);
>>>> +               return false;
>>>> +       }
>>>> +
>>>> +       head = msg->front.iov_base;
>>>> +
>>>> +       /* encode the cap metric */
>>>> +       cap = (struct ceph_metric_cap *)(head + 1);
>>>> +       cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
>>>> +       cap->ver = 1;
>>>> +       cap->compat = 1;
>>>> +       cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
>>>> +       cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
>>>> +       cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
>>>> +       cap->total = cpu_to_le64(nr_caps);
>>>> +       items++;
>>>> +
>>>> +       dout("cap metric hit %lld, mis %lld, total caps %lld",
>>>> +            le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
>>>> +            le64_to_cpu(cap->total));
>>>> +
>>>> +       /* encode the read latency metric */
>>>> +       read = (struct ceph_metric_read_latency *)(cap + 1);
>>>> +       read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
>>>> +       read->ver = 1;
>>>> +       read->compat = 1;
>>>> +       read->data_len = cpu_to_le32(sizeof(*read) - 10);
>>>> +       total = percpu_counter_sum(&mdsc->metric.total_reads),
>>>> +       sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
>>>> +       jiffies_to_timespec64(sum, &ts);
>>>> +       read->sec = cpu_to_le32(ts.tv_sec);
>>>> +       read->nsec = cpu_to_le32(ts.tv_nsec);
>>>> +       items++;
>>>> +       dout("read latency metric total %lld, sum lat %lld", total, sum);
>>>> +
>>>> +       /* encode the write latency metric */
>>>> +       write = (struct ceph_metric_write_latency *)(read + 1);
>>>> +       write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
>>>> +       write->ver = 1;
>>>> +       write->compat = 1;
>>>> +       write->data_len = cpu_to_le32(sizeof(*write) - 10);
>>>> +       total = percpu_counter_sum(&mdsc->metric.total_writes),
>>>> +       sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
>>>> +       jiffies_to_timespec64(sum, &ts);
>>>> +       write->sec = cpu_to_le32(ts.tv_sec);
>>>> +       write->nsec = cpu_to_le32(ts.tv_nsec);
>>>> +       items++;
>>>> +       dout("write latency metric total %lld, sum lat %lld", total, sum);
>>>> +
>>>> +       /* encode the metadata latency metric */
>>>> +       meta = (struct ceph_metric_metadata_latency *)(write + 1);
>>>> +       meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
>>>> +       meta->ver = 1;
>>>> +       meta->compat = 1;
>>>> +       meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
>>>> +       total = percpu_counter_sum(&mdsc->metric.total_metadatas),
>>>> +       sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
>>>> +       jiffies_to_timespec64(sum, &ts);
>>>> +       meta->sec = cpu_to_le32(ts.tv_sec);
>>>> +       meta->nsec = cpu_to_le32(ts.tv_nsec);
>>>> +       items++;
>>>> +       dout("metadata latency metric total %lld, sum lat %lld", total, sum);
>>>> +
>>>> +       /* encode the dentry lease metric */
>>>> +       lease = (struct ceph_metric_dentry_lease *)(meta + 1);
>>>> +       lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
>>>> +       lease->ver = 1;
>>>> +       lease->compat = 1;
>>>> +       lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
>>>> +       lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
>>>> +       lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
>>>> +       lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
>>>> +       items++;
>>>> +       dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
>>>> +            le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
>>>> +            le64_to_cpu(lease->total));
>>>> +
>>>> +       put_unaligned_le32(items, &head->num);
>>>> +       msg->front.iov_len = cpu_to_le32(len);
>>>> +       msg->hdr.version = cpu_to_le16(1);
>>>> +       msg->hdr.compat_version = cpu_to_le16(1);
>>>> +       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>>>> +       dout("send metrics to mds%d %p\n", s->s_mds, msg);
>>>> +       ceph_con_send(&s->s_con, msg);
>>>> +
>>>> +       return true;
>>>> +}
>>>> +
>>>> +#define CEPH_WORK_DELAY_DEF 5
>>>> +static void __schedule_delayed(struct delayed_work *work, int delay)
>>>> +{
>>>> +       unsigned int hz = round_jiffies_relative(HZ * delay);
>>>> +
>>>> +       schedule_delayed_work(work, hz);
>>>> +}
>>>> +
>>>>    static void schedule_delayed(struct ceph_mds_client *mdsc)
>>>>    {
>>>> -       int delay = 5;
>>>> -       unsigned hz = round_jiffies_relative(HZ * delay);
>>>> -       schedule_delayed_work(&mdsc->delayed_work, hz);
>>>> +       __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF);
>>>> +}
>>>> +
>>>> +static void metric_schedule_delayed(struct ceph_mds_client *mdsc)
>>>> +{
>>>> +       /* delay CEPH_WORK_DELAY_DEF seconds when idle */
>>>> +       int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF;
>>>> +
>>>> +       __schedule_delayed(&mdsc->metric_delayed_work, delay);
>>>> +}
>>>> +
>>>> +static bool check_session_state(struct ceph_mds_client *mdsc,
>>>> +                               struct ceph_mds_session *s)
>>>> +{
>>>> +       if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
>>>> +               dout("resending session close request for mds%d\n",
>>>> +                               s->s_mds);
>>>> +               request_close_session(mdsc, s);
>>>> +               return false;
>>>> +       }
>>>> +       if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
>>>> +               if (s->s_state == CEPH_MDS_SESSION_OPEN) {
>>>> +                       s->s_state = CEPH_MDS_SESSION_HUNG;
>>>> +                       pr_info("mds%d hung\n", s->s_mds);
>>>> +               }
>>>> +       }
>>>> +       if (s->s_state == CEPH_MDS_SESSION_NEW ||
>>>> +           s->s_state == CEPH_MDS_SESSION_RESTARTING ||
>>>> +           s->s_state == CEPH_MDS_SESSION_REJECTED)
>>>> +               /* this mds is failed or recovering, just wait */
>>>> +               return false;
>>>> +
>>>> +       return true;
>>>>    }
>>>>
>>>> +/*
>>>> + * delayed work -- periodically trim expired leases, renew caps with mds
>>>> + */
>>>>    static void delayed_work(struct work_struct *work)
>>>>    {
>>>>           int i;
>>>> @@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work)
>>>>                   struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
>>>>                   if (!s)
>>>>                           continue;
>>>> -               if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
>>>> -                       dout("resending session close request for mds%d\n",
>>>> -                            s->s_mds);
>>>> -                       request_close_session(mdsc, s);
>>>> -                       ceph_put_mds_session(s);
>>>> -                       continue;
>>>> -               }
>>>> -               if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
>>>> -                       if (s->s_state == CEPH_MDS_SESSION_OPEN) {
>>>> -                               s->s_state = CEPH_MDS_SESSION_HUNG;
>>>> -                               pr_info("mds%d hung\n", s->s_mds);
>>>> -                       }
>>>> -               }
>>>> -               if (s->s_state == CEPH_MDS_SESSION_NEW ||
>>>> -                   s->s_state == CEPH_MDS_SESSION_RESTARTING ||
>>>> -                   s->s_state == CEPH_MDS_SESSION_REJECTED) {
>>>> -                       /* this mds is failed or recovering, just wait */
>>>> +
>>>> +               if (!check_session_state(mdsc, s)) {
>>>>                           ceph_put_mds_session(s);
>>>>                           continue;
>>>>                   }
>>>> @@ -4164,8 +4300,53 @@ static void delayed_work(struct work_struct *work)
>>>>           schedule_delayed(mdsc);
>>>>    }
>>>>
>>>> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
>>>> +static void metric_delayed_work(struct work_struct *work)
>>>> +{
>>>> +       struct ceph_mds_client *mdsc =
>>>> +               container_of(work, struct ceph_mds_client, metric_delayed_work.work);
>>>> +       struct ceph_mds_session *s;
>>>> +       u64 nr_caps = 0;
>>>> +       bool ret;
>>>> +       int i;
>>>> +
>>>> +       if (!metric_send_interval)
>>>> +               goto idle;
>>>> +
>>>> +       dout("mdsc metric_delayed_work\n");
>>>> +
>>>> +       mutex_lock(&mdsc->mutex);
>>>> +       for (i = 0; i < mdsc->max_sessions; i++) {
>>>> +               s = __ceph_lookup_mds_session(mdsc, i);
>>>> +               if (!s)
>>>> +                       continue;
>>>> +               nr_caps += s->s_nr_caps;
>>>> +               ceph_put_mds_session(s);
>>>> +       }
>>>> +
>>>> +       for (i = 0; i < mdsc->max_sessions; i++) {
>>>> +               s = __ceph_lookup_mds_session(mdsc, i);
>>>> +               if (!s)
>>>> +                       continue;
>>>> +               if (!check_session_state(mdsc, s)) {
>>>> +                       ceph_put_mds_session(s);
>>>> +                       continue;
>>>> +               }
>>>> +
>>>> +               /* Only send the metric once in any available session */
>>>> +               ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
>>>> +               ceph_put_mds_session(s);
>>>> +               if (ret)
>>>> +                       break;
>>>> +       }
>>>> +       mutex_unlock(&mdsc->mutex);
>>>> +
>>>> +idle:
>>>> +       metric_schedule_delayed(mdsc);
>>> Looks like this will schedule metric_delayed_work() every 5 seconds
>>> even if metric_send_interval = 0 (i.e. sending is disabled).  What is
>>> the reason for that?
>> Hi Ilya,
>>
>> Before I folded the metric_delayed_work() into delayed_work(). But for
>> the this version since the interval is settable, so it hard to calculate
>> the next schedule delay for that.
>>
>> When it is idle just looping every 5 seconds, I thought though this is
>> not a very graceful approach it won't introduce too much overload. If we
>> do not like this, let's switch it to a completion.
> Take a look at module_param_cb macro.  I think you can provide a
> setter and schedule the first work / modify the delay from there.

Hi Ilya,

Yeah, this is what I was trying to switch to.

> That said, I'm not sure making the interval configurable is a good
> idea.  I'm not saying you need to change anything -- just that if it
> was me, I would send these metrics once per tick (i.e. delayed_work)
> with an on/off switch and no other tunables.

Currently I still can't be sure whether sending once per second would
introduce any potential overhead in some cases, but for now I can't
foresee that it will.

Thanks,

Xiubo

>
> Thanks,
>
>                  Ilya
>
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d414eded6810..f9a6f95c7941 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4085,16 +4085,167 @@  static void maybe_recover_session(struct ceph_mds_client *mdsc)
 	ceph_force_reconnect(fsc->sb);
 }
 
-/*
- * delayed work -- periodically trim expired leases, renew caps with mds
- */
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *s,
+				   u64 nr_caps)
+{
+	struct ceph_metric_head *head;
+	struct ceph_metric_cap *cap;
+	struct ceph_metric_dentry_lease *lease;
+	struct ceph_metric_read_latency *read;
+	struct ceph_metric_write_latency *write;
+	struct ceph_metric_metadata_latency *meta;
+	struct ceph_msg *msg;
+	struct timespec64 ts;
+	s64 sum, total;
+	s32 items = 0;
+	s32 len;
+
+	if (!mdsc || !s)
+		return false;
+
+	len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read)
+	      + sizeof(*write) + sizeof(*meta);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+	if (!msg) {
+		pr_err("send metrics to mds%d, failed to allocate message\n",
+		       s->s_mds);
+		return false;
+	}
+
+	head = msg->front.iov_base;
+
+	/* encode the cap metric */
+	cap = (struct ceph_metric_cap *)(head + 1);
+	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+	cap->ver = 1;
+	cap->compat = 1;
+	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
+	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+	cap->total = cpu_to_le64(nr_caps);
+	items++;
+
+	dout("cap metric hit %lld, mis %lld, total caps %lld",
+	     le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
+	     le64_to_cpu(cap->total));
+
+	/* encode the read latency metric */
+	read = (struct ceph_metric_read_latency *)(cap + 1);
+	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+	read->ver = 1;
+	read->compat = 1;
+	read->data_len = cpu_to_le32(sizeof(*read) - 10);
+	total = percpu_counter_sum(&mdsc->metric.total_reads);
+	sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
+	jiffies_to_timespec64(sum, &ts);
+	read->sec = cpu_to_le32(ts.tv_sec);
+	read->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+	dout("read latency metric total %lld, sum lat %lld", total, sum);
+
+	/* encode the write latency metric */
+	write = (struct ceph_metric_write_latency *)(read + 1);
+	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+	write->ver = 1;
+	write->compat = 1;
+	write->data_len = cpu_to_le32(sizeof(*write) - 10);
+	total = percpu_counter_sum(&mdsc->metric.total_writes);
+	sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
+	jiffies_to_timespec64(sum, &ts);
+	write->sec = cpu_to_le32(ts.tv_sec);
+	write->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+	dout("write latency metric total %lld, sum lat %lld", total, sum);
+
+	/* encode the metadata latency metric */
+	meta = (struct ceph_metric_metadata_latency *)(write + 1);
+	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+	meta->ver = 1;
+	meta->compat = 1;
+	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+	total = percpu_counter_sum(&mdsc->metric.total_metadatas);
+	sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
+	jiffies_to_timespec64(sum, &ts);
+	meta->sec = cpu_to_le32(ts.tv_sec);
+	meta->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+	dout("metadata latency metric total %lld, sum lat %lld", total, sum);
+
+	/* encode the dentry lease metric */
+	lease = (struct ceph_metric_dentry_lease *)(meta + 1);
+	lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+	lease->ver = 1;
+	lease->compat = 1;
+	lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
+	lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
+	lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
+	lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
+	items++;
+	dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
+	     le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
+	     le64_to_cpu(lease->total));
+
+	put_unaligned_le32(items, &head->num);
+	msg->front.iov_len = len;
+	msg->hdr.version = cpu_to_le16(1);
+	msg->hdr.compat_version = cpu_to_le16(1);
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	dout("send metrics to mds%d %p\n", s->s_mds, msg);
+	ceph_con_send(&s->s_con, msg);
+
+	return true;
+}
+
+#define CEPH_WORK_DELAY_DEF 5
+static void __schedule_delayed(struct delayed_work *work, int delay)
+{
+	unsigned int hz = round_jiffies_relative(HZ * delay);
+
+	schedule_delayed_work(work, hz);
+}
+
 static void schedule_delayed(struct ceph_mds_client *mdsc)
 {
-	int delay = 5;
-	unsigned hz = round_jiffies_relative(HZ * delay);
-	schedule_delayed_work(&mdsc->delayed_work, hz);
+	__schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF);
+}
+
+static void metric_schedule_delayed(struct ceph_mds_client *mdsc)
+{
+	/* delay CEPH_WORK_DELAY_DEF seconds when idle */
+	int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF;
+
+	__schedule_delayed(&mdsc->metric_delayed_work, delay);
+}
+
+static bool check_session_state(struct ceph_mds_client *mdsc,
+				struct ceph_mds_session *s)
+{
+	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+		dout("resending session close request for mds%d\n",
+				s->s_mds);
+		request_close_session(mdsc, s);
+		return false;
+	}
+	if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+		if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+			s->s_state = CEPH_MDS_SESSION_HUNG;
+			pr_info("mds%d hung\n", s->s_mds);
+		}
+	}
+	if (s->s_state == CEPH_MDS_SESSION_NEW ||
+	    s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+	    s->s_state == CEPH_MDS_SESSION_REJECTED)
+		/* this mds is failed or recovering, just wait */
+		return false;
+
+	return true;
 }
 
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
 static void delayed_work(struct work_struct *work)
 {
 	int i;
@@ -4116,23 +4267,8 @@  static void delayed_work(struct work_struct *work)
 		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
 		if (!s)
 			continue;
-		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
-			dout("resending session close request for mds%d\n",
-			     s->s_mds);
-			request_close_session(mdsc, s);
-			ceph_put_mds_session(s);
-			continue;
-		}
-		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
-			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
-				s->s_state = CEPH_MDS_SESSION_HUNG;
-				pr_info("mds%d hung\n", s->s_mds);
-			}
-		}
-		if (s->s_state == CEPH_MDS_SESSION_NEW ||
-		    s->s_state == CEPH_MDS_SESSION_RESTARTING ||
-		    s->s_state == CEPH_MDS_SESSION_REJECTED) {
-			/* this mds is failed or recovering, just wait */
+
+		if (!check_session_state(mdsc, s)) {
 			ceph_put_mds_session(s);
 			continue;
 		}
@@ -4164,8 +4300,53 @@  static void delayed_work(struct work_struct *work)
 	schedule_delayed(mdsc);
 }
 
-static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
+static void metric_delayed_work(struct work_struct *work)
+{
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, metric_delayed_work.work);
+	struct ceph_mds_session *s;
+	u64 nr_caps = 0;
+	bool ret;
+	int i;
+
+	if (!metric_send_interval)
+		goto idle;
+
+	dout("mdsc metric_delayed_work\n");
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		nr_caps += s->s_nr_caps;
+		ceph_put_mds_session(s);
+	}
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		if (!check_session_state(mdsc, s)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		/* Only send the metric once in any available session */
+		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
+		ceph_put_mds_session(s);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&mdsc->mutex);
+
+idle:
+	metric_schedule_delayed(mdsc);
+}
+
+static int ceph_mdsc_metric_init(struct ceph_mds_client *mdsc)
 {
+	struct ceph_client_metric *metric = &mdsc->metric;
 	int ret;
 
 	if (!metric)
@@ -4289,7 +4470,8 @@  int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
 	atomic_set(&mdsc->cap_reclaim_pending, 0);
-	err = ceph_mdsc_metric_init(&mdsc->metric);
+	INIT_DELAYED_WORK(&mdsc->metric_delayed_work, metric_delayed_work);
+	err = ceph_mdsc_metric_init(mdsc);
 	if (err)
 		goto err_mdsmap;
 
@@ -4511,6 +4693,7 @@  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	cancel_work_sync(&mdsc->cap_reclaim_work);
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	cancel_delayed_work_sync(&mdsc->metric_delayed_work); /* cancel timer */
 
 	dout("stopped\n");
 }
@@ -4553,6 +4736,7 @@  static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	cancel_delayed_work_sync(&mdsc->metric_delayed_work); /* cancel timer */
 	if (mdsc->mdsmap)
 		ceph_mdsmap_destroy(mdsc->mdsmap);
 	kfree(mdsc->sessions);
@@ -4719,6 +4903,7 @@  void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	mutex_unlock(&mdsc->mutex);
 	schedule_delayed(mdsc);
+	metric_schedule_delayed(mdsc);
 	return;
 
 bad_unlock:
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 674fc7725913..c13910da07c4 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -448,7 +448,9 @@  struct ceph_mds_client {
 	struct list_head  dentry_leases;     /* fifo list */
 	struct list_head  dentry_dir_leases; /* lru list */
 
+	/* metrics */
 	struct ceph_client_metric metric;
+	struct delayed_work	  metric_delayed_work;  /* delayed work */
 
 	spinlock_t		snapid_map_lock;
 	struct rb_root		snapid_map_tree;
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 9de8beb436c7..224e92a70d88 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -4,6 +4,82 @@ 
 
 #include <linux/ceph/osd_client.h>
 
+enum ceph_metric_type {
+	CLIENT_METRIC_TYPE_CAP_INFO,
+	CLIENT_METRIC_TYPE_READ_LATENCY,
+	CLIENT_METRIC_TYPE_WRITE_LATENCY,
+	CLIENT_METRIC_TYPE_METADATA_LATENCY,
+	CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/* metric caps header */
+struct ceph_metric_cap {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __attribute__ ((packed));
+
+/* metric dentry lease header */
+struct ceph_metric_dentry_lease {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __attribute__ ((packed));
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((packed));
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((packed));
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((packed));
+
+struct ceph_metric_head {
+	__le32 num;	/* the number of metrics that will be sent */
+} __attribute__ ((packed));
+
 /* This is the global metrics */
 struct ceph_client_metric {
 	atomic64_t            total_dentries;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 196d547c7054..5fef4f59e13e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1315,6 +1315,10 @@  bool enable_async_dirops;
 module_param(enable_async_dirops, bool, 0644);
 MODULE_PARM_DESC(enable_async_dirops, "Asynchronous directory operations enabled");
 
+unsigned int metric_send_interval;
+module_param(metric_send_interval, uint, 0644);
+MODULE_PARM_DESC(metric_send_interval, "Interval (in seconds) of sending perf metric to ceph cluster (default: 0)");
+
 module_init(init_ceph);
 module_exit(exit_ceph);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 44b9a971ec9a..7eda7acc859a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -73,6 +73,7 @@ 
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
 
 extern bool enable_async_dirops;
+extern unsigned int metric_send_interval;
 
 struct ceph_mount_options {
 	unsigned int flags;
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index a099f60feb7b..6028d3e865e4 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -130,6 +130,7 @@  struct ceph_dir_layout {
 #define CEPH_MSG_CLIENT_REQUEST         24
 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
 #define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_METRICS         29
 #define CEPH_MSG_CLIENT_CAPS            0x310
 #define CEPH_MSG_CLIENT_LEASE           0x311
 #define CEPH_MSG_CLIENT_SNAP            0x312