[resend,v5,08/11] ceph: periodically send perf metrics to MDS
diff mbox series

Message ID 20200129082715.5285-9-xiubli@redhat.com
State New
Headers show
Series
  • ceph: add perf metrics support
Related show

Commit Message

Xiubo Li Jan. 29, 2020, 8:27 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

Add enable/disable sending metrics to MDS debugfs and disabled as
default, if it's enabled the kclient will send metrics every
second.

This will send global dentry lease hit/miss and read/write/metadata
latency metrics and each session's caps hit/miss metric to MDS.

Each time, the global metrics are sent only once, via any available
session.

URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/debugfs.c            |  44 +++++++-
 fs/ceph/mds_client.c         | 201 ++++++++++++++++++++++++++++++++---
 fs/ceph/mds_client.h         |   3 +
 fs/ceph/metric.h             |  76 +++++++++++++
 fs/ceph/super.h              |   1 +
 include/linux/ceph/ceph_fs.h |   1 +
 6 files changed, 307 insertions(+), 19 deletions(-)

Comments

Jeff Layton Feb. 5, 2020, 9:43 p.m. UTC | #1
On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> Add enable/disable sending metrics to MDS debugfs and disabled as
> default, if it's enabled the kclient will send metrics every
> second.
> 
> This will send global dentry lease hit/miss and read/write/metadata
> latency metrics and each session's caps hit/miss metric to MDS.
> 
> Every time only sends the global metrics once via any availible
> session.
> 
> URL: https://tracker.ceph.com/issues/43215
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/debugfs.c            |  44 +++++++-
>  fs/ceph/mds_client.c         | 201 ++++++++++++++++++++++++++++++++---
>  fs/ceph/mds_client.h         |   3 +
>  fs/ceph/metric.h             |  76 +++++++++++++
>  fs/ceph/super.h              |   1 +
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 307 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 7fd031c18309..8aae7ecea54a 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -124,6 +124,40 @@ static int mdsc_show(struct seq_file *s, void *p)
>  	return 0;
>  }
>  
> +/*
> + * metrics debugfs
> + */
> +static int sending_metrics_set(void *data, u64 val)
> +{
> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +
> +	if (val > 1) {
> +		pr_err("Invalid sending metrics set value %llu\n", val);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&mdsc->mutex);
> +	mdsc->sending_metrics = (unsigned int)val;

Shouldn't that be a bool cast? Do we even need a cast there?

> +	mutex_unlock(&mdsc->mutex);
> +
> +	return 0;
> +}
> +
> +static int sending_metrics_get(void *data, u64 *val)
> +{
> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +
> +	mutex_lock(&mdsc->mutex);
> +	*val = (u64)mdsc->sending_metrics;
> +	mutex_unlock(&mdsc->mutex);
> +
> +	return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
> +			sending_metrics_set, "%llu\n");
> +

I'd like to hear more about how we expect users to use this facility.
This debugfs file doesn't seem consistent with the rest of the UI, and I
imagine if the box reboots you'd have to (manually) re-enable it after
mount, right? Maybe this should be a mount option instead?


>  static int metric_show(struct seq_file *s, void *p)
>  {
>  	struct ceph_fs_client *fsc = s->private;
> @@ -302,11 +336,9 @@ static int congestion_kb_get(void *data, u64 *val)
>  	*val = (u64)fsc->mount_options->congestion_kb;
>  	return 0;
>  }
> -
>  DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
>  			congestion_kb_set, "%llu\n");
>  
> -
>  void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
>  {
>  	dout("ceph_fs_debugfs_cleanup\n");
> @@ -316,6 +348,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
>  	debugfs_remove(fsc->debugfs_mds_sessions);
>  	debugfs_remove(fsc->debugfs_caps);
>  	debugfs_remove(fsc->debugfs_metric);
> +	debugfs_remove(fsc->debugfs_sending_metrics);
>  	debugfs_remove(fsc->debugfs_mdsc);
>  }
>  
> @@ -356,6 +389,13 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
>  						fsc,
>  						&mdsc_show_fops);
>  
> +	fsc->debugfs_sending_metrics =
> +			debugfs_create_file("sending_metrics",
> +					    0600,
> +					    fsc->client->debugfs_dir,
> +					    fsc,
> +					    &sending_metrics_fops);
> +
>  	fsc->debugfs_metric = debugfs_create_file("metrics",
>  						  0400,
>  						  fsc->client->debugfs_dir,
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 92a933810a79..d765804dc855 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4104,13 +4104,156 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>  	ceph_force_reconnect(fsc->sb);
>  }
>  
> +/*
> + * called under s_mutex
> + */
> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> +				   struct ceph_mds_session *s,
> +				   bool skip_global)
> +{
> +	struct ceph_metric_head *head;
> +	struct ceph_metric_cap *cap;
> +	struct ceph_metric_dentry_lease *lease;
> +	struct ceph_metric_read_latency *read;
> +	struct ceph_metric_write_latency *write;
> +	struct ceph_metric_metadata_latency *meta;
> +	struct ceph_msg *msg;
> +	struct timespec64 ts;
> +	s32 len = sizeof(*head) + sizeof(*cap);
> +	s64 sum, total, avg;
> +	s32 items = 0;
> +
> +	if (!mdsc || !s)
> +		return false;
> +
> +	if (!skip_global) {
> +		len += sizeof(*lease);
> +		len += sizeof(*read);
> +		len += sizeof(*write);
> +		len += sizeof(*meta);
> +	}
> +
> +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> +	if (!msg) {
> +		pr_err("send metrics to mds%d, failed to allocate message\n",
> +		       s->s_mds);
> +		return false;
> +	}
> +
> +	head = msg->front.iov_base;
> +
> +	/* encode the cap metric */
> +	cap = (struct ceph_metric_cap *)(head + 1);
> +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> +	cap->ver = 1;
> +	cap->campat = 1;
> +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> +	cap->hit = cpu_to_le64(percpu_counter_sum(&s->i_caps_hit));
> +	cap->mis = cpu_to_le64(percpu_counter_sum(&s->i_caps_mis));
> +	cap->total = cpu_to_le64(s->s_nr_caps);
> +	items++;
> +
> +	dout("cap metric hit %lld, mis %lld, total caps %lld",
> +	     le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
> +	     le64_to_cpu(cap->total));
> +
> +	/* only send the global once */
> +	if (skip_global)
> +		goto skip_global;
> +
> +	/* encode the dentry lease metric */
> +	lease = (struct ceph_metric_dentry_lease *)(cap + 1);
> +	lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
> +	lease->ver = 1;
> +	lease->campat = 1;
> +	lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
> +	lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
> +	lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
> +	lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
> +	items++;
> +
> +	dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
> +	     le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
> +	     le64_to_cpu(lease->total));
> +
> +	/* encode the read latency metric */
> +	read = (struct ceph_metric_read_latency *)(lease + 1);
> +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> +	read->ver = 1;
> +	read->campat = 1;
> +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_reads),
> +	sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	read->sec = cpu_to_le32(ts.tv_sec);
> +	read->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("read latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +	/* encode the write latency metric */
> +	write = (struct ceph_metric_write_latency *)(read + 1);
> +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> +	write->ver = 1;
> +	write->campat = 1;
> +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_writes),
> +	sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	write->sec = cpu_to_le32(ts.tv_sec);
> +	write->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("write latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +	/* encode the metadata latency metric */
> +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
> +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> +	meta->ver = 1;
> +	meta->campat = 1;
> +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_metadatas),
> +	sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	meta->sec = cpu_to_le32(ts.tv_sec);
> +	meta->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("metadata latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +skip_global:
> +	put_unaligned_le32(items, &head->num);
> +	msg->front.iov_len = cpu_to_le32(len);
> +	msg->hdr.version = cpu_to_le16(1);
> +	msg->hdr.compat_version = cpu_to_le16(1);
> +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> +	dout("send metrics to mds%d %p\n", s->s_mds, msg);
> +	ceph_con_send(&s->s_con, msg);
> +
> +	return true;
> +}
> +
>  /*
>   * delayed work -- periodically trim expired leases, renew caps with mds
>   */
> +#define CEPH_WORK_DELAY_DEF 5
>  static void schedule_delayed(struct ceph_mds_client *mdsc)
>  {
> -	int delay = 5;
> -	unsigned hz = round_jiffies_relative(HZ * delay);
> +	unsigned int hz;
> +	int delay = CEPH_WORK_DELAY_DEF;
> +
> +	mutex_lock(&mdsc->mutex);
> +	if (mdsc->sending_metrics)
> +		delay = 1;
> +	mutex_unlock(&mdsc->mutex);
> +

The mdsc->mutex is dropped in the callers a little before this is
called, so this is a little too mutex-thrashy. I think you'd be better
off changing this function to be called with the mutex still held.

> +	hz = round_jiffies_relative(HZ * delay);
>  	schedule_delayed_work(&mdsc->delayed_work, hz);
>  }
>  
> @@ -4121,18 +4264,28 @@ static void delayed_work(struct work_struct *work)
>  		container_of(work, struct ceph_mds_client, delayed_work.work);
>  	int renew_interval;
>  	int renew_caps;
> +	bool metric_only;
> +	bool sending_metrics;
> +	bool g_skip = false;
>  
>  	dout("mdsc delayed_work\n");
>  
>  	mutex_lock(&mdsc->mutex);
> -	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> -	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> -				   mdsc->last_renew_caps);
> -	if (renew_caps)
> -		mdsc->last_renew_caps = jiffies;
> +	sending_metrics = !!mdsc->sending_metrics;
> +	metric_only = mdsc->sending_metrics &&
> +		(mdsc->ticks++ % CEPH_WORK_DELAY_DEF);
> +
> +	if (!metric_only) {
> +		renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> +		renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> +					   mdsc->last_renew_caps);
> +		if (renew_caps)
> +			mdsc->last_renew_caps = jiffies;
> +	}
>  
>  	for (i = 0; i < mdsc->max_sessions; i++) {
>  		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
> +
>  		if (!s)
>  			continue;
>  		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> @@ -4158,13 +4311,20 @@ static void delayed_work(struct work_struct *work)
>  		mutex_unlock(&mdsc->mutex);
>  
>  		mutex_lock(&s->s_mutex);
> -		if (renew_caps)
> -			send_renew_caps(mdsc, s);
> -		else
> -			ceph_con_keepalive(&s->s_con);
> -		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
> -		    s->s_state == CEPH_MDS_SESSION_HUNG)
> -			ceph_send_cap_releases(mdsc, s);
> +
> +		if (sending_metrics)
> +			g_skip = ceph_mdsc_send_metrics(mdsc, s, g_skip);
> +
> +		if (!metric_only) {
> +			if (renew_caps)
> +				send_renew_caps(mdsc, s);
> +			else
> +				ceph_con_keepalive(&s->s_con);
> +			if (s->s_state == CEPH_MDS_SESSION_OPEN ||
> +					s->s_state == CEPH_MDS_SESSION_HUNG)
> +				ceph_send_cap_releases(mdsc, s);
> +		}
> +
>  		mutex_unlock(&s->s_mutex);
>  		ceph_put_mds_session(s);
>  
> @@ -4172,6 +4332,9 @@ static void delayed_work(struct work_struct *work)
>  	}
>  	mutex_unlock(&mdsc->mutex);
>  
> +	if (metric_only)
> +		goto delay_work;
> +
>  	ceph_check_delayed_caps(mdsc);
>  
>  	ceph_queue_cap_reclaim_work(mdsc);
> @@ -4180,11 +4343,13 @@ static void delayed_work(struct work_struct *work)
>  
>  	maybe_recover_session(mdsc);
>  
> +delay_work:
>  	schedule_delayed(mdsc);
>  }
>  
> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
> +static int ceph_mdsc_metric_init(struct ceph_mds_client *mdsc)
>  {
> +	struct ceph_client_metric *metric = &mdsc->metric;
>  	int ret;
>  
>  	if (!metric)
> @@ -4222,7 +4387,9 @@ static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
>  	if (ret)
>  		goto err_metadata_latency_sum;
>  
> -	return ret;
> +	mdsc->sending_metrics = 0;
> +	mdsc->ticks = 0;
> +	return 0;
>  err_metadata_latency_sum:
>  	percpu_counter_destroy(&metric->total_metadatas);
>  err_total_metadatas:
> @@ -4294,7 +4461,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>  	init_waitqueue_head(&mdsc->cap_flushing_wq);
>  	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
>  	atomic_set(&mdsc->cap_reclaim_pending, 0);
> -	err = ceph_mdsc_metric_init(&mdsc->metric);
> +	err = ceph_mdsc_metric_init(mdsc);
>  	if (err)
>  		goto err_mdsmap;
>  
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 574d4e5a5de2..a0ece55d987c 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -451,6 +451,9 @@ struct ceph_mds_client {
>  	struct list_head  dentry_leases;     /* fifo list */
>  	struct list_head  dentry_dir_leases; /* lru list */
>  
> +	/* metrics */
> +	unsigned int		  sending_metrics;
> +	unsigned int		  ticks;
>  	struct ceph_client_metric metric;
>  
>  	spinlock_t		snapid_map_lock;
> diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
> index 3cda616ba594..352eb753ce25 100644
> --- a/fs/ceph/metric.h
> +++ b/fs/ceph/metric.h
> @@ -4,6 +4,82 @@
>  
>  #include <linux/ceph/osd_client.h>
>  
> +enum ceph_metric_type {
> +	CLIENT_METRIC_TYPE_CAP_INFO,
> +	CLIENT_METRIC_TYPE_READ_LATENCY,
> +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
> +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
> +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +
> +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +};
> +
> +/* metric caps header */
> +struct ceph_metric_cap {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;

I think you meant "compat" here.

> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __attribute__ ((packed));
> +
> +/* metric dentry lease header */
> +struct ceph_metric_dentry_lease {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __attribute__ ((packed));
> +
> +/* metric read latency header */
> +struct ceph_metric_read_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +/* metric write latency header */
> +struct ceph_metric_write_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +/* metric metadata latency header */
> +struct ceph_metric_metadata_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +struct ceph_metric_head {
> +	__le32 num;	/* the number of metrics will be sent */

"the number of metrics that will be sent"

> +} __attribute__ ((packed));
> +
>  /* This is the global metrics */
>  struct ceph_client_metric {
>  	atomic64_t		total_dentries;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 3f4829222528..a91431e9bdf7 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -128,6 +128,7 @@ struct ceph_fs_client {
>  	struct dentry *debugfs_congestion_kb;
>  	struct dentry *debugfs_bdi;
>  	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
> +	struct dentry *debugfs_sending_metrics;
>  	struct dentry *debugfs_metric;
>  	struct dentry *debugfs_mds_sessions;
>  #endif
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index a099f60feb7b..6028d3e865e4 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>  #define CEPH_MSG_CLIENT_REQUEST         24
>  #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>  #define CEPH_MSG_CLIENT_REPLY           26
> +#define CEPH_MSG_CLIENT_METRICS         29
>  #define CEPH_MSG_CLIENT_CAPS            0x310
>  #define CEPH_MSG_CLIENT_LEASE           0x311
>  #define CEPH_MSG_CLIENT_SNAP            0x312
Xiubo Li Feb. 6, 2020, 2:36 a.m. UTC | #2
On 2020/2/6 5:43, Jeff Layton wrote:
> On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
[...]
>>   
>> +/*
>> + * metrics debugfs
>> + */
>> +static int sending_metrics_set(void *data, u64 val)
>> +{
>> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>> +
>> +	if (val > 1) {
>> +		pr_err("Invalid sending metrics set value %llu\n", val);
>> +		return -EINVAL;
>> +	}
>> +
>> +	mutex_lock(&mdsc->mutex);
>> +	mdsc->sending_metrics = (unsigned int)val;
> Shouldn't that be a bool cast? Do we even need a cast there?
Will switch sending_metrics to bool type instead.
>> +	mutex_unlock(&mdsc->mutex);
>> +
>> +	return 0;
>> +}
>> +
>> +static int sending_metrics_get(void *data, u64 *val)
>> +{
>> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>> +
>> +	mutex_lock(&mdsc->mutex);
>> +	*val = (u64)mdsc->sending_metrics;
>> +	mutex_unlock(&mdsc->mutex);
>> +
>> +	return 0;
>> +}
>> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
>> +			sending_metrics_set, "%llu\n");
>> +
> I'd like to hear more about how we expect users to use this facility.
> This debugfs file doesn't seem consistent with the rest of the UI, and I
> imagine if the box reboots you'd have to (manually) re-enable it after
> mount, right? Maybe this should be a mount option instead?

A mount option means we must unmount the filesystem to disable it.

I was thinking that with the debugfs file we can do debugging or tuning
even in production setups at any time; usually this should be disabled,
since it will send the metrics every second.

Or we could merge the "sending_metric" to "metrics" UI, just writing 
"enable"/"disable" to enable/disable sending the metrics to ceph, and 
just like the "reset" does to clean the metrics.

Then the "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be 
writable with:

"reset"  --> to clean and reset the metrics counters

"enable" --> enable sending metrics to ceph cluster

"disable" --> disable sending metrics to ceph cluster

Will this be better ?


[...]
>   /*
>    * delayed work -- periodically trim expired leases, renew caps with mds
>    */
> +#define CEPH_WORK_DELAY_DEF 5
>   static void schedule_delayed(struct ceph_mds_client *mdsc)
>   {
> -	int delay = 5;
> -	unsigned hz = round_jiffies_relative(HZ * delay);
> +	unsigned int hz;
> +	int delay = CEPH_WORK_DELAY_DEF;
> +
> +	mutex_lock(&mdsc->mutex);
> +	if (mdsc->sending_metrics)
> +		delay = 1;
> +	mutex_unlock(&mdsc->mutex);
> +
> The mdsc->mutex is dropped in the callers a little before this is
> called, so this is a little too mutex-thrashy. I think you'd be better
> off changing this function to be called with the mutex still held.

Will fix it.


[...]
>> +/* metric caps header */
>> +struct ceph_metric_cap {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8  ver;
>> +	__u8  campat;
> I think you meant "compat" here.

Will fix it.


[...]
>> +/* metric metadata latency header */
>> +struct ceph_metric_metadata_latency {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8  ver;
>> +	__u8  campat;
>> +
>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>> +	__le32 sec;
>> +	__le32 nsec;
>> +} __attribute__ ((packed));
>> +
>> +struct ceph_metric_head {
>> +	__le32 num;	/* the number of metrics will be sent */
> "the number of metrics that will be sent"

Will fix it.

Thanks,

>> +} __attribute__ ((packed));
>> +
>>   /* This is the global metrics */
>>   struct ceph_client_metric {
>>   	atomic64_t		total_dentries;
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 3f4829222528..a91431e9bdf7 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -128,6 +128,7 @@ struct ceph_fs_client {
>>   	struct dentry *debugfs_congestion_kb;
>>   	struct dentry *debugfs_bdi;
>>   	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
>> +	struct dentry *debugfs_sending_metrics;
>>   	struct dentry *debugfs_metric;
>>   	struct dentry *debugfs_mds_sessions;
>>   #endif
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index a099f60feb7b..6028d3e865e4 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>>   #define CEPH_MSG_CLIENT_REQUEST         24
>>   #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>>   #define CEPH_MSG_CLIENT_REPLY           26
>> +#define CEPH_MSG_CLIENT_METRICS         29
>>   #define CEPH_MSG_CLIENT_CAPS            0x310
>>   #define CEPH_MSG_CLIENT_LEASE           0x311
>>   #define CEPH_MSG_CLIENT_SNAP            0x312
Jeff Layton Feb. 6, 2020, 11:31 a.m. UTC | #3
On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
> On 2020/2/6 5:43, Jeff Layton wrote:
> > On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
> [...]
> > > +
> > > +static int sending_metrics_get(void *data, u64 *val)
> > > +{
> > > +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> > > +	struct ceph_mds_client *mdsc = fsc->mdsc;
> > > +
> > > +	mutex_lock(&mdsc->mutex);
> > > +	*val = (u64)mdsc->sending_metrics;
> > > +	mutex_unlock(&mdsc->mutex);
> > > +
> > > +	return 0;
> > > +}
> > > +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
> > > +			sending_metrics_set, "%llu\n");
> > > +
> > I'd like to hear more about how we expect users to use this facility.
> > This debugfs file doesn't seem consistent with the rest of the UI, and I
> > imagine if the box reboots you'd have to (manually) re-enable it after
> > mount, right? Maybe this should be a mount option instead?
> 
> A mount option means we must do the unmounting to disable it.
> 

Technically, no. You could wire it up so that you could enable and
disable it via -o remount. For example:

    # mount -o remount,metrics=disabled

Another option might be a module parameter if this is something that you
really want to be global (and not per-mount or per-session).

> I was thinking with the debugfs file we can do the debug or tuning even 
> in the product setups at any time, usually this should be disabled since 
> it will send it per second.
> 

Meh, one frame per second doesn't seem like it'll add much overhead.

Also, why one update per second? Should that interval be tunable?

> Or we could merge the "sending_metric" to "metrics" UI, just writing 
> "enable"/"disable" to enable/disable sending the metrics to ceph, and 
> just like the "reset" does to clean the metrics.
> 
> Then the "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be 
> writable with:
> 
> "reset"  --> to clean and reset the metrics counters
> 
> "enable" --> enable sending metrics to ceph cluster
> 
> "disable" --> disable sending metrics to ceph cluster
> 
> Will this be better ?
> 

I guess it's not clear to me how you intend for this to be used.

A debugfs switch means that this is being enabled and disabled on a per-
session basis. Is the user supposed to turn this on for all, or just one
session? How do they know?

Is this something we expect people to just turn on briefly when they are
experiencing a problem, or is this something that we expect to be turned
on and left on for long periods of time?

If it's the latter then setting up a mount in /etc/fstab is not going to
be sufficient for an admin. She'll have to write a script or something
that goes in after the mount and enables this by writing to debugfs
after rebooting. Yuck.
Xiubo Li Feb. 6, 2020, 12:26 p.m. UTC | #4
On 2020/2/6 19:31, Jeff Layton wrote:
> On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
>> On 2020/2/6 5:43, Jeff Layton wrote:
>>> On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
>> [...]
>>>> +
>>>> +static int sending_metrics_get(void *data, u64 *val)
>>>> +{
>>>> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
>>>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>>>> +
>>>> +	mutex_lock(&mdsc->mutex);
>>>> +	*val = (u64)mdsc->sending_metrics;
>>>> +	mutex_unlock(&mdsc->mutex);
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
>>>> +			sending_metrics_set, "%llu\n");
>>>> +
>>> I'd like to hear more about how we expect users to use this facility.
>>> This debugfs file doesn't seem consistent with the rest of the UI, and I
>>> imagine if the box reboots you'd have to (manually) re-enable it after
>>> mount, right? Maybe this should be a mount option instead?
>> A mount option means we must do the unmounting to disable it.
>>
> Technically, no. You could wire it up so that you could enable and
> disable it via -o remount. For example:
>
>      # mount -o remount,metrics=disabled

Yeah, this is cool.

>
> Another option might be a module parameter if this is something that you
> really want to be global (and not per-mount or per-session).
>
>> I was thinking with the debugfs file we can do the debug or tuning even
>> in the product setups at any time, usually this should be disabled since
>> it will send it per second.
>>
> Meh, one frame per second doesn't seem like it'll add much overhead.
Okay.
>
> Also, why one update per second? Should that interval be tunable?

Per second just keeps it the same as the fuse client.


>> Or we could merge the "sending_metric" to "metrics" UI, just writing
>> "enable"/"disable" to enable/disable sending the metrics to ceph, and
>> just like the "reset" does to clean the metrics.
>>
>> Then the "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be
>> writable with:
>>
>> "reset"  --> to clean and reset the metrics counters
>>
>> "enable" --> enable sending metrics to ceph cluster
>>
>> "disable" --> disable sending metrics to ceph cluster
>>
>> Will this be better ?
>>
> I guess it's not clear to me how you intend for this to be used.
>
> A debugfs switch means that this is being enabled and disabled on a per-
> session basis. Is the user supposed to turn this on for all, or just one
> session? How do they know?

Not for all, just per-superblock.

>
> Is this something we expect people to just turn on briefly when they are
> experiencing a problem, or is this something that we expect to be turned
> on and left on for long periods of time?

If this won't add much overhead even per second, let's keep sending the 
metrics to ceph always and the mount option for this switch is not 
needed any more.

And there is already a switch on the ceph side to enable/disable showing
the metrics; if we add another switch per client here, it will also be
yucky for admins.

Let's make the update interval tunable, with one second as the default.
Maybe we should make this a global UI for all clients?

Is this okay ?

Thanks.


> If it's the latter then setting up a mount in /etc/fstab is not going to
> be sufficient for an admin. She'll have to write a script or something
> that goes in after the mount and enables this by writing to debugfs
> after rebooting. Yuck.
>
Jeff Layton Feb. 6, 2020, 3:21 p.m. UTC | #5
On Thu, 2020-02-06 at 20:26 +0800, Xiubo Li wrote:
> On 2020/2/6 19:31, Jeff Layton wrote:
> > On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
> > > On 2020/2/6 5:43, Jeff Layton wrote:
> > > > On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
> > > [...]
> > > > > +
> > > > > +static int sending_metrics_get(void *data, u64 *val)
> > > > > +{
> > > > > +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> > > > > +	struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > +
> > > > > +	mutex_lock(&mdsc->mutex);
> > > > > +	*val = (u64)mdsc->sending_metrics;
> > > > > +	mutex_unlock(&mdsc->mutex);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
> > > > > +			sending_metrics_set, "%llu\n");
> > > > > +
> > > > I'd like to hear more about how we expect users to use this facility.
> > > > This debugfs file doesn't seem consistent with the rest of the UI, and I
> > > > imagine if the box reboots you'd have to (manually) re-enable it after
> > > > mount, right? Maybe this should be a mount option instead?
> > > A mount option means we must do the unmounting to disable it.
> > > 
> > Technically, no. You could wire it up so that you could enable and
> > disable it via -o remount. For example:
> > 
> >      # mount -o remount,metrics=disabled
> 
> Yeah, this is cool.
> 
> > Another option might be a module parameter if this is something that you
> > really want to be global (and not per-mount or per-session).
> > 
> > > I was thinking with the debugfs file we can do the debug or tuning even
> > > in the product setups at any time, usually this should be disabled since
> > > it will send it per second.
> > > 
> > Meh, one frame per second doesn't seem like it'll add much overhead.
> Okay.
> > Also, why one update per second? Should that interval be tunable?
> 
> Per second just keep it the same with the fuse client.
> 

Ok.

> 
> > > Or we could merge the "sending_metric" to "metrics" UI, just writing
> > > "enable"/"disable" to enable/disable sending the metrics to ceph, and
> > > just like the "reset" does to clean the metrics.
> > > 
> > > Then the "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be
> > > writable with:
> > > 
> > > "reset"  --> to clean and reset the metrics counters
> > > 
> > > "enable" --> enable sending metrics to ceph cluster
> > > 
> > > "disable" --> disable sending metrics to ceph cluster
> > > 
> > > Will this be better ?
> > > 
> > I guess it's not clear to me how you intend for this to be used.
> > 
> > A debugfs switch means that this is being enabled and disabled on a per-
> > session basis. Is the user supposed to turn this on for all, or just one
> > session? How do they know?
> 
> Not for all, just per-superblock.
> 

If it's per-superblock, then a debugfs-based switch seems particularly
ill-suited for this, as that's really a per-session interface.

> > Is this something we expect people to just turn on briefly when they are
> > experiencing a problem, or is this something that we expect to be turned
> > on and left on for long periods of time?
> 
> If this won't add much overhead even per second, let's keep sending the 
> metrics to ceph always and the mount option for this switch is not 
> needed any more.
> 

Note that I don't really _know_ that it won't be a problem, just that it
doesn't sound too bad. I think we probably will want some mechanism to
enable/disable this until we have some experience with it in the field.

> And there is already a switch to enable/disable showing the metrics on
> the ceph side; if we add another per-client switch here, it will also be
> confusing for admins.
> 
> Let's make the update interval tunable, defaulting to one second. Maybe 
> we should make this a global UI for all clients?
> 

If you want a global setting for the interval that would take effect on
all ceph mounts, then maybe a "metric_send_interval" module parameter
would be best. Make it an unsigned int, and allow the admin to set it to
0 to turn off stats transmission in the client.

We have a well-defined interface for setting module parameters on most
distros (via /etc/modprobe.d/), so that would be better than monkeying
around with debugfs here, IMO.

As to the default, it might be best to have this default to 0 initially.
Once we have more experience with it we could make it default to 1 in a
later release.
Xiubo Li Feb. 7, 2020, 12:37 a.m. UTC | #6
On 2020/2/6 23:21, Jeff Layton wrote:
> On Thu, 2020-02-06 at 20:26 +0800, Xiubo Li wrote:
>> On 2020/2/6 19:31, Jeff Layton wrote:
>>> On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
>>>> On 2020/2/6 5:43, Jeff Layton wrote:
>>>>> On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
>>>> [...]
>>>>>> +
>>>>>> +static int sending_metrics_get(void *data, u64 *val)
>>>>>> +{
>>>>>> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
>>>>>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>>>>>> +
>>>>>> +	mutex_lock(&mdsc->mutex);
>>>>>> +	*val = (u64)mdsc->sending_metrics;
>>>>>> +	mutex_unlock(&mdsc->mutex);
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
>>>>>> +			sending_metrics_set, "%llu\n");
>>>>>> +
>>>>> I'd like to hear more about how we expect users to use this facility.
>>>>> This debugfs file doesn't seem consistent with the rest of the UI, and I
>>>>> imagine if the box reboots you'd have to (manually) re-enable it after
>>>>> mount, right? Maybe this should be a mount option instead?
>>>> A mount option means we must do the unmounting to disable it.
>>>>
>>> Technically, no. You could wire it up so that you could enable and
>>> disable it via -o remount. For example:
>>>
>>>       # mount -o remount,metrics=disabled
>> Yeah, this is cool.
>>
>>> Another option might be a module parameter if this is something that you
>>> really want to be global (and not per-mount or per-session).
>>>
>>>> I was thinking with the debugfs file we can do the debug or tuning even
>>>> in the product setups at any time, usually this should be disabled since
>>>> it will send it per second.
>>>>
>>> Meh, one frame per second doesn't seem like it'll add much overhead.
>> Okay.
>>> Also, why one update per second? Should that interval be tunable?
>> Per second just keeps it consistent with the fuse client.
>>
> Ok.
>
>>>> Or we could merge the "sending_metric" to "metrics" UI, just writing
>>>> "enable"/"disable" to enable/disable sending the metrics to ceph, and
>>>> just like the "reset" does to clean the metrics.
>>>>
>>>> Then the "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be
>>>> writable with:
>>>>
>>>> "reset"  --> to clean and reset the metrics counters
>>>>
>>>> "enable" --> enable sending metrics to ceph cluster
>>>>
>>>> "disable" --> disable sending metrics to ceph cluster
>>>>
>>>> Will this be better ?
>>>>
>>> I guess it's not clear to me how you intend for this to be used.
>>>
>>> A debugfs switch means that this is being enabled and disabled on a per-
>>> session basis. Is the user supposed to turn this on for all, or just one
>>> session? How do they know?
>> Not for all, just per-superblock.
>>
> If it's per-superblock, then a debugfs-based switch seems particularly
> ill-suited for this, as that's really a per-session interface.
>
>>> Is this something we expect people to just turn on briefly when they are
>>> experiencing a problem, or is this something that we expect to be turned
>>> on and left on for long periods of time?
>> If this won't add much overhead even per second, let's keep sending the
>> metrics to ceph always and the mount option for this switch is not
>> needed any more.
>>
> Note that I don't really _know_ that it won't be a problem, just that it
> doesn't sound too bad. I think we probably will want some mechanism to
> enable/disable this until we have some experience with it in the field.
>
>> And there is already a switch to enable/disable showing the metrics on
>> the ceph side; if we add another per-client switch here, it will also be
>> confusing for admins.
>>
>> Let's make the update interval tunable, defaulting to one second. Maybe
>> we should make this a global UI for all clients?
>>
> If you want a global setting for the interval that would take effect on
> all ceph mounts, then maybe a "metric_send_interval" module parameter
> would be best. Make it an unsigned int, and allow the admin to set it to
> 0 to turn off stats transmission in the client.
>
> We have a well-defined interface for setting module parameters on most
> distros (via /etc/modprobe.d/), so that would be better than monkeying
> around with debugfs here, IMO.
>
> As to the default, it might be best to have this default to 0 initially.
> Once we have more experience with it we could make it default to 1 in a
> later release.

Yeah, this makes sense.

Let's switch to the module parameter "metric_send_interval"; at the same 
time it will also act as a switch.

0 means off, >0 will be the interval.

Thanks,

Patch
diff mbox series

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7fd031c18309..8aae7ecea54a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -124,6 +124,40 @@  static int mdsc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+/*
+ * metrics debugfs
+ */
+static int sending_metrics_set(void *data, u64 val)
+{
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	if (val > 1) {
+		pr_err("Invalid sending metrics set value %llu\n", val);
+		return -EINVAL;
+	}
+
+	mutex_lock(&mdsc->mutex);
+	mdsc->sending_metrics = (unsigned int)val;
+	mutex_unlock(&mdsc->mutex);
+
+	return 0;
+}
+
+static int sending_metrics_get(void *data, u64 *val)
+{
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	mutex_lock(&mdsc->mutex);
+	*val = (u64)mdsc->sending_metrics;
+	mutex_unlock(&mdsc->mutex);
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
+			sending_metrics_set, "%llu\n");
+
 static int metric_show(struct seq_file *s, void *p)
 {
 	struct ceph_fs_client *fsc = s->private;
@@ -302,11 +336,9 @@  static int congestion_kb_get(void *data, u64 *val)
 	*val = (u64)fsc->mount_options->congestion_kb;
 	return 0;
 }
-
 DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
 			congestion_kb_set, "%llu\n");
 
-
 void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 	dout("ceph_fs_debugfs_cleanup\n");
@@ -316,6 +348,7 @@  void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 	debugfs_remove(fsc->debugfs_mds_sessions);
 	debugfs_remove(fsc->debugfs_caps);
 	debugfs_remove(fsc->debugfs_metric);
+	debugfs_remove(fsc->debugfs_sending_metrics);
 	debugfs_remove(fsc->debugfs_mdsc);
 }
 
@@ -356,6 +389,13 @@  void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 						fsc,
 						&mdsc_show_fops);
 
+	fsc->debugfs_sending_metrics =
+			debugfs_create_file("sending_metrics",
+					    0600,
+					    fsc->client->debugfs_dir,
+					    fsc,
+					    &sending_metrics_fops);
+
 	fsc->debugfs_metric = debugfs_create_file("metrics",
 						  0400,
 						  fsc->client->debugfs_dir,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 92a933810a79..d765804dc855 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4104,13 +4104,156 @@  static void maybe_recover_session(struct ceph_mds_client *mdsc)
 	ceph_force_reconnect(fsc->sb);
 }
 
+/*
+ * called under s_mutex
+ */
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *s,
+				   bool skip_global)
+{
+	struct ceph_metric_head *head;
+	struct ceph_metric_cap *cap;
+	struct ceph_metric_dentry_lease *lease;
+	struct ceph_metric_read_latency *read;
+	struct ceph_metric_write_latency *write;
+	struct ceph_metric_metadata_latency *meta;
+	struct ceph_msg *msg;
+	struct timespec64 ts;
+	s32 len = sizeof(*head) + sizeof(*cap);
+	s64 sum, total, avg;
+	s32 items = 0;
+
+	if (!mdsc || !s)
+		return false;
+
+	if (!skip_global) {
+		len += sizeof(*lease);
+		len += sizeof(*read);
+		len += sizeof(*write);
+		len += sizeof(*meta);
+	}
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+	if (!msg) {
+		pr_err("send metrics to mds%d, failed to allocate message\n",
+		       s->s_mds);
+		return false;
+	}
+
+	head = msg->front.iov_base;
+
+	/* encode the cap metric */
+	cap = (struct ceph_metric_cap *)(head + 1);
+	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+	cap->ver = 1;
+	cap->campat = 1;
+	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+	cap->hit = cpu_to_le64(percpu_counter_sum(&s->i_caps_hit));
+	cap->mis = cpu_to_le64(percpu_counter_sum(&s->i_caps_mis));
+	cap->total = cpu_to_le64(s->s_nr_caps);
+	items++;
+
+	dout("cap metric hit %lld, mis %lld, total caps %lld",
+	     le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
+	     le64_to_cpu(cap->total));
+
+	/* only send the global once */
+	if (skip_global)
+		goto skip_global;
+
+	/* encode the dentry lease metric */
+	lease = (struct ceph_metric_dentry_lease *)(cap + 1);
+	lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+	lease->ver = 1;
+	lease->campat = 1;
+	lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
+	lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
+	lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
+	lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
+	items++;
+
+	dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
+	     le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
+	     le64_to_cpu(lease->total));
+
+	/* encode the read latency metric */
+	read = (struct ceph_metric_read_latency *)(lease + 1);
+	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+	read->ver = 1;
+	read->campat = 1;
+	read->data_len = cpu_to_le32(sizeof(*read) - 10);
+	total = percpu_counter_sum(&mdsc->metric.total_reads),
+	sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
+	avg = total ? sum / total : 0;
+	ts = ns_to_timespec64(avg);
+	read->sec = cpu_to_le32(ts.tv_sec);
+	read->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	dout("read latency metric total %lld, sum lat %lld, avg lat %lld",
+	     total, sum, avg);
+
+	/* encode the write latency metric */
+	write = (struct ceph_metric_write_latency *)(read + 1);
+	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+	write->ver = 1;
+	write->campat = 1;
+	write->data_len = cpu_to_le32(sizeof(*write) - 10);
+	total = percpu_counter_sum(&mdsc->metric.total_writes),
+	sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
+	avg = total ? sum / total : 0;
+	ts = ns_to_timespec64(avg);
+	write->sec = cpu_to_le32(ts.tv_sec);
+	write->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	dout("write latency metric total %lld, sum lat %lld, avg lat %lld",
+	     total, sum, avg);
+
+	/* encode the metadata latency metric */
+	meta = (struct ceph_metric_metadata_latency *)(write + 1);
+	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+	meta->ver = 1;
+	meta->campat = 1;
+	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+	total = percpu_counter_sum(&mdsc->metric.total_metadatas),
+	sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
+	avg = total ? sum / total : 0;
+	ts = ns_to_timespec64(avg);
+	meta->sec = cpu_to_le32(ts.tv_sec);
+	meta->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	dout("metadata latency metric total %lld, sum lat %lld, avg lat %lld",
+	     total, sum, avg);
+
+skip_global:
+	put_unaligned_le32(items, &head->num);
+	msg->front.iov_len = cpu_to_le32(len);
+	msg->hdr.version = cpu_to_le16(1);
+	msg->hdr.compat_version = cpu_to_le16(1);
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	dout("send metrics to mds%d %p\n", s->s_mds, msg);
+	ceph_con_send(&s->s_con, msg);
+
+	return true;
+}
+
 /*
  * delayed work -- periodically trim expired leases, renew caps with mds
  */
+#define CEPH_WORK_DELAY_DEF 5
 static void schedule_delayed(struct ceph_mds_client *mdsc)
 {
-	int delay = 5;
-	unsigned hz = round_jiffies_relative(HZ * delay);
+	unsigned int hz;
+	int delay = CEPH_WORK_DELAY_DEF;
+
+	mutex_lock(&mdsc->mutex);
+	if (mdsc->sending_metrics)
+		delay = 1;
+	mutex_unlock(&mdsc->mutex);
+
+	hz = round_jiffies_relative(HZ * delay);
 	schedule_delayed_work(&mdsc->delayed_work, hz);
 }
 
@@ -4121,18 +4264,28 @@  static void delayed_work(struct work_struct *work)
 		container_of(work, struct ceph_mds_client, delayed_work.work);
 	int renew_interval;
 	int renew_caps;
+	bool metric_only;
+	bool sending_metrics;
+	bool g_skip = false;
 
 	dout("mdsc delayed_work\n");
 
 	mutex_lock(&mdsc->mutex);
-	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
-	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
-				   mdsc->last_renew_caps);
-	if (renew_caps)
-		mdsc->last_renew_caps = jiffies;
+	sending_metrics = !!mdsc->sending_metrics;
+	metric_only = mdsc->sending_metrics &&
+		(mdsc->ticks++ % CEPH_WORK_DELAY_DEF);
+
+	if (!metric_only) {
+		renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+		renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+					   mdsc->last_renew_caps);
+		if (renew_caps)
+			mdsc->last_renew_caps = jiffies;
+	}
 
 	for (i = 0; i < mdsc->max_sessions; i++) {
 		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+
 		if (!s)
 			continue;
 		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -4158,13 +4311,20 @@  static void delayed_work(struct work_struct *work)
 		mutex_unlock(&mdsc->mutex);
 
 		mutex_lock(&s->s_mutex);
-		if (renew_caps)
-			send_renew_caps(mdsc, s);
-		else
-			ceph_con_keepalive(&s->s_con);
-		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
-		    s->s_state == CEPH_MDS_SESSION_HUNG)
-			ceph_send_cap_releases(mdsc, s);
+
+		if (sending_metrics)
+			g_skip = ceph_mdsc_send_metrics(mdsc, s, g_skip);
+
+		if (!metric_only) {
+			if (renew_caps)
+				send_renew_caps(mdsc, s);
+			else
+				ceph_con_keepalive(&s->s_con);
+			if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+					s->s_state == CEPH_MDS_SESSION_HUNG)
+				ceph_send_cap_releases(mdsc, s);
+		}
+
 		mutex_unlock(&s->s_mutex);
 		ceph_put_mds_session(s);
 
@@ -4172,6 +4332,9 @@  static void delayed_work(struct work_struct *work)
 	}
 	mutex_unlock(&mdsc->mutex);
 
+	if (metric_only)
+		goto delay_work;
+
 	ceph_check_delayed_caps(mdsc);
 
 	ceph_queue_cap_reclaim_work(mdsc);
@@ -4180,11 +4343,13 @@  static void delayed_work(struct work_struct *work)
 
 	maybe_recover_session(mdsc);
 
+delay_work:
 	schedule_delayed(mdsc);
 }
 
-static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
+static int ceph_mdsc_metric_init(struct ceph_mds_client *mdsc)
 {
+	struct ceph_client_metric *metric = &mdsc->metric;
 	int ret;
 
 	if (!metric)
@@ -4222,7 +4387,9 @@  static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
 	if (ret)
 		goto err_metadata_latency_sum;
 
-	return ret;
+	mdsc->sending_metrics = 0;
+	mdsc->ticks = 0;
+	return 0;
 err_metadata_latency_sum:
 	percpu_counter_destroy(&metric->total_metadatas);
 err_total_metadatas:
@@ -4294,7 +4461,7 @@  int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
 	atomic_set(&mdsc->cap_reclaim_pending, 0);
-	err = ceph_mdsc_metric_init(&mdsc->metric);
+	err = ceph_mdsc_metric_init(mdsc);
 	if (err)
 		goto err_mdsmap;
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 574d4e5a5de2..a0ece55d987c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -451,6 +451,9 @@  struct ceph_mds_client {
 	struct list_head  dentry_leases;     /* fifo list */
 	struct list_head  dentry_dir_leases; /* lru list */
 
+	/* metrics */
+	unsigned int		  sending_metrics;
+	unsigned int		  ticks;
 	struct ceph_client_metric metric;
 
 	spinlock_t		snapid_map_lock;
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 3cda616ba594..352eb753ce25 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -4,6 +4,82 @@ 
 
 #include <linux/ceph/osd_client.h>
 
+enum ceph_metric_type {
+	CLIENT_METRIC_TYPE_CAP_INFO,
+	CLIENT_METRIC_TYPE_READ_LATENCY,
+	CLIENT_METRIC_TYPE_WRITE_LATENCY,
+	CLIENT_METRIC_TYPE_METADATA_LATENCY,
+	CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/* metric caps header */
+struct ceph_metric_cap {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  campat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __attribute__ ((packed));
+
+/* metric dentry lease header */
+struct ceph_metric_dentry_lease {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  campat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __attribute__ ((packed));
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  campat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((packed));
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  campat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((packed));
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  campat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((packed));
+
+struct ceph_metric_head {
+	__le32 num;	/* the number of metrics will be sent */
+} __attribute__ ((packed));
+
 /* This is the global metrics */
 struct ceph_client_metric {
 	atomic64_t		total_dentries;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3f4829222528..a91431e9bdf7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -128,6 +128,7 @@  struct ceph_fs_client {
 	struct dentry *debugfs_congestion_kb;
 	struct dentry *debugfs_bdi;
 	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+	struct dentry *debugfs_sending_metrics;
 	struct dentry *debugfs_metric;
 	struct dentry *debugfs_mds_sessions;
 #endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index a099f60feb7b..6028d3e865e4 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -130,6 +130,7 @@  struct ceph_dir_layout {
 #define CEPH_MSG_CLIENT_REQUEST         24
 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
 #define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_METRICS         29
 #define CEPH_MSG_CLIENT_CAPS            0x310
 #define CEPH_MSG_CLIENT_LEASE           0x311
 #define CEPH_MSG_CLIENT_SNAP            0x312