[v3] libceph: add osd op counter metric support

Message ID 20201110141937.414301-1-xiubli@redhat.com (mailing list archive)
State New, archived
Series [v3] libceph: add osd op counter metric support

Commit Message

Xiubo Li Nov. 10, 2020, 2:19 p.m. UTC
From: Xiubo Li <xiubli@redhat.com>

The logic is the same as in osdc/Objecter.cc in userspace ceph.

URL: https://tracker.ceph.com/issues/48053
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---

V3:
- Fix a typo when OR'ing the _WRITE flag

 include/linux/ceph/osd_client.h |  9 ++++++
 net/ceph/debugfs.c              | 13 ++++++++
 net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)

Comments

Jeffrey Layton Nov. 10, 2020, 3 p.m. UTC | #1
On Tue, 2020-11-10 at 22:19 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> The logic is the same with osdc/Objecter.cc in ceph in user space.
> 
> URL: https://tracker.ceph.com/issues/48053
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> 
> V3:
> - typo fixing about oring the _WRITE
> 
>  include/linux/ceph/osd_client.h |  9 ++++++
>  net/ceph/debugfs.c              | 13 ++++++++
>  net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>  3 files changed, 78 insertions(+)
> 
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 83fa08a06507..24301513b186 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>  	struct ceph_hobject_id *end;
>  };
>  
> 
> +struct ceph_osd_metric {
> +	struct percpu_counter op_ops;
> +	struct percpu_counter op_rmw;
> +	struct percpu_counter op_r;
> +	struct percpu_counter op_w;
> +};
> +
>  #define CEPH_LINGER_ID_START	0xffff000000000000ULL
>  
> 
>  struct ceph_osd_client {
> @@ -371,6 +378,8 @@ struct ceph_osd_client {
>  	struct ceph_msgpool	msgpool_op;
>  	struct ceph_msgpool	msgpool_op_reply;
>  
> 
> +	struct ceph_osd_metric  metric;
> +
>  	struct workqueue_struct	*notify_wq;
>  	struct workqueue_struct	*completion_wq;
>  };
> diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> index 2110439f8a24..af90019386ab 100644
> --- a/net/ceph/debugfs.c
> +++ b/net/ceph/debugfs.c
> @@ -339,6 +339,16 @@ static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
>  	mutex_unlock(&osd->lock);
>  }
>  
> 
> +static void dump_op_metric(struct seq_file *s, struct ceph_osd_client *osdc)
> +{
> +	struct ceph_osd_metric *m = &osdc->metric;
> +
> +	seq_printf(s, "  op_ops: %lld\n", percpu_counter_sum(&m->op_ops));
> +	seq_printf(s, "  op_rmw: %lld\n", percpu_counter_sum(&m->op_rmw));
> +	seq_printf(s, "  op_r:   %lld\n", percpu_counter_sum(&m->op_r));
> +	seq_printf(s, "  op_w:   %lld\n", percpu_counter_sum(&m->op_w));
> +}
> +
>  static int osdc_show(struct seq_file *s, void *pp)
>  {
>  	struct ceph_client *client = s->private;
> @@ -372,6 +382,9 @@ static int osdc_show(struct seq_file *s, void *pp)
>  	}
>  
> 
>  	up_read(&osdc->lock);
> +
> +	seq_puts(s, "OP METRIC:\n");
> +	dump_op_metric(s, osdc);
>  	return 0;
>  }
>  
> 
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 7901ab6c79fd..0e6e127dd669 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -2424,6 +2424,21 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
>  	goto again;
>  }
>  
> 
> +static void osd_acount_op_metric(struct ceph_osd_request *req)
> +{
> +	struct ceph_osd_metric *m = &req->r_osdc->metric;
> +
> +	percpu_counter_inc(&m->op_ops);
> +
> +	if ((req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))
> +	    == (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))
> +		percpu_counter_inc(&m->op_rmw);
> +	if (req->r_flags & CEPH_OSD_FLAG_READ)
> +		percpu_counter_inc(&m->op_r);
> +	else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
> +		percpu_counter_inc(&m->op_w);
> +}
> +
>  static void account_request(struct ceph_osd_request *req)
>  {
>  	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
> @@ -2434,6 +2449,8 @@ static void account_request(struct ceph_osd_request *req)
>  
> 
>  	req->r_start_stamp = jiffies;
>  	req->r_start_latency = ktime_get();
> +
> +	osd_acount_op_metric(req);
>  }
>  
> 
>  static void submit_request(struct ceph_osd_request *req, bool wrlocked)
> @@ -5205,6 +5222,39 @@ void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
>  	up_write(&osdc->lock);
>  }
>  
> 
> +static void ceph_metric_destroy(struct ceph_osd_metric *m)
> +{
> +	percpu_counter_destroy(&m->op_ops);
> +	percpu_counter_destroy(&m->op_rmw);
> +	percpu_counter_destroy(&m->op_r);
> +	percpu_counter_destroy(&m->op_w);
> +}
> +
> +static int ceph_metric_init(struct ceph_osd_metric *m)
> +{
> +	int ret;
> +
> +	memset(m, 0, sizeof(*m));
> +
> +	ret = percpu_counter_init(&m->op_ops, 0, GFP_NOIO);
> +	if (ret)
> +		return ret;
> +	ret = percpu_counter_init(&m->op_rmw, 0, GFP_NOIO);
> +	if (ret)
> +		goto err;
> +	ret = percpu_counter_init(&m->op_r, 0, GFP_NOIO);
> +	if (ret)
> +		goto err;
> +	ret = percpu_counter_init(&m->op_w, 0, GFP_NOIO);
> +	if (ret)
> +		goto err;
> +	return 0;
> +
> +err:
> +	ceph_metric_destroy(m);
> +	return ret;
> +}
> +
>  /*
>   * init, shutdown
>   */
> @@ -5257,6 +5307,9 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>  	if (!osdc->completion_wq)
>  		goto out_notify_wq;
>  
> 
> +	if (ceph_metric_init(&osdc->metric) < 0)
> +		goto out_completion_wq;
> +
>  	schedule_delayed_work(&osdc->timeout_work,
>  			      osdc->client->options->osd_keepalive_timeout);
>  	schedule_delayed_work(&osdc->osds_timeout_work,
> @@ -5264,6 +5317,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>  
> 
>  	return 0;
>  
> 
> +out_completion_wq:
> +	destroy_workqueue(osdc->completion_wq);
>  out_notify_wq:
>  	destroy_workqueue(osdc->notify_wq);
>  out_msgpool_reply:
> @@ -5302,6 +5357,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
>  	WARN_ON(atomic_read(&osdc->num_requests));
>  	WARN_ON(atomic_read(&osdc->num_homeless));
>  
> 
> +	ceph_metric_destroy(&osdc->metric);
>  	ceph_osdmap_destroy(osdc->osdmap);
>  	mempool_destroy(osdc->req_mempool);
>  	ceph_msgpool_destroy(&osdc->msgpool_op);

Looks sane. Merged into testing.
Ilya Dryomov Nov. 10, 2020, 3:44 p.m. UTC | #2
On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> The logic is the same with osdc/Objecter.cc in ceph in user space.
>
> URL: https://tracker.ceph.com/issues/48053
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>
> V3:
> - typo fixing about oring the _WRITE
>
>  include/linux/ceph/osd_client.h |  9 ++++++
>  net/ceph/debugfs.c              | 13 ++++++++
>  net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>  3 files changed, 78 insertions(+)
>
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 83fa08a06507..24301513b186 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>         struct ceph_hobject_id *end;
>  };
>
> +struct ceph_osd_metric {
> +       struct percpu_counter op_ops;
> +       struct percpu_counter op_rmw;
> +       struct percpu_counter op_r;
> +       struct percpu_counter op_w;
> +};

OK, so only reads and writes are really needed.  Why not expose them
through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
want to display them?  Exposing latency information without exposing
overall counts seems rather weird to me anyway.

The fundamental problem is that debugfs output format is not stable.
The tracker mentions test_readahead -- updating some teuthology test
cases from time to time is not a big deal, but if a user facing tool
such as "fs top" starts relying on these, it would be bad.

Thanks,

                Ilya
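
One possible shape for exposing the counts through the existing fs/ceph
metrics code instead, as a sketch only -- the struct, field, and helper
names below are invented for illustration and are not the real fs/ceph
API; only the percpu_counter calls are the actual kernel interface:

/*
 * Illustrative only: op counters kept alongside the cephfs metrics
 * (fs/ceph/metric.[ch]) instead of in libceph.  Names are made up.
 */
#include <linux/percpu_counter.h>

struct ceph_fs_op_metric {
	struct percpu_counter total_read_ops;	/* assumed new field */
	struct percpu_counter total_write_ops;	/* assumed new field */
};

/* Would be called from the cephfs read/write paths, once per OSD op. */
static inline void ceph_fs_count_read_op(struct ceph_fs_op_metric *m)
{
	percpu_counter_inc(&m->total_read_ops);
}

static inline void ceph_fs_count_write_op(struct ceph_fs_op_metric *m)
{
	percpu_counter_inc(&m->total_write_ops);
}
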
Jeffrey Layton Nov. 10, 2020, 5:11 p.m. UTC | #3
On Tue, 2020-11-10 at 16:44 +0100, Ilya Dryomov wrote:
> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
> > 
> > From: Xiubo Li <xiubli@redhat.com>
> > 
> > The logic is the same with osdc/Objecter.cc in ceph in user space.
> > 
> > URL: https://tracker.ceph.com/issues/48053
> > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > ---
> > 
> > V3:
> > - typo fixing about oring the _WRITE
> > 
> >  include/linux/ceph/osd_client.h |  9 ++++++
> >  net/ceph/debugfs.c              | 13 ++++++++
> >  net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
> >  3 files changed, 78 insertions(+)
> > 
> > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > index 83fa08a06507..24301513b186 100644
> > --- a/include/linux/ceph/osd_client.h
> > +++ b/include/linux/ceph/osd_client.h
> > @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
> >         struct ceph_hobject_id *end;
> >  };
> > 
> > +struct ceph_osd_metric {
> > +       struct percpu_counter op_ops;
> > +       struct percpu_counter op_rmw;
> > +       struct percpu_counter op_r;
> > +       struct percpu_counter op_w;
> > +};
> 
> OK, so only reads and writes are really needed.  Why not expose them
> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> want to display them?  Exposing latency information without exposing
> overall counts seems rather weird to me anyway.
> 
> The fundamental problem is that debugfs output format is not stable.
> The tracker mentions test_readahead -- updating some teuthology test
> cases from time to time is not a big deal, but if a user facing tool
> such as "fs top" starts relying on these, it would be bad.
> 
> Thanks,
> 
>                 Ilya

Those are all good points. The tracker is light on details. I had
assumed that you'd also be uploading this to the MDS in a later patch.
Is that also planned?

I'll also add that it might be nice to keep stats on copy_from2 as
well, since we do have a copy_file_range operation in cephfs.
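
A possible shape for that, as an untested sketch: walk the ops in the
request and bump a new counter when a CEPH_OSD_OP_COPY_FROM2 op is
present.  The op_copyfrom field and the function name are hypothetical
additions; r_ops[], r_num_ops and CEPH_OSD_OP_COPY_FROM2 come from the
existing osd_client/rados definitions.

/*
 * Sketch, untested: also count copy_from2 operations.
 * "op_copyfrom" would be a new percpu_counter in struct ceph_osd_metric.
 */
static void osd_account_copyfrom(struct ceph_osd_request *req)
{
	struct ceph_osd_metric *m = &req->r_osdc->metric;
	unsigned int i;

	for (i = 0; i < req->r_num_ops; i++) {
		if (req->r_ops[i].op == CEPH_OSD_OP_COPY_FROM2) {
			percpu_counter_inc(&m->op_copyfrom);
			break;
		}
	}
}
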
Xiubo Li Nov. 11, 2020, 1:32 a.m. UTC | #4
On 2020/11/10 23:44, Ilya Dryomov wrote:
> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> The logic is the same with osdc/Objecter.cc in ceph in user space.
>>
>> URL: https://tracker.ceph.com/issues/48053
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>
>> V3:
>> - typo fixing about oring the _WRITE
>>
>>   include/linux/ceph/osd_client.h |  9 ++++++
>>   net/ceph/debugfs.c              | 13 ++++++++
>>   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>>   3 files changed, 78 insertions(+)
>>
>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> index 83fa08a06507..24301513b186 100644
>> --- a/include/linux/ceph/osd_client.h
>> +++ b/include/linux/ceph/osd_client.h
>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>>          struct ceph_hobject_id *end;
>>   };
>>
>> +struct ceph_osd_metric {
>> +       struct percpu_counter op_ops;
>> +       struct percpu_counter op_rmw;
>> +       struct percpu_counter op_r;
>> +       struct percpu_counter op_w;
>> +};
> OK, so only reads and writes are really needed.  Why not expose them
> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> want to display them?  Exposing latency information without exposing
> overall counts seems rather weird to me anyway.

Okay, I just thought that in the future this may also be needed by rbd :-)


> The fundamental problem is that debugfs output format is not stable.
> The tracker mentions test_readahead -- updating some teuthology test
> cases from time to time is not a big deal, but if a user facing tool
> such as "fs top" starts relying on these, it would be bad.

No problem, let me move it to the existing fs metric framework.

Thanks Ilya.

BRs

> Thanks,
>
>                  Ilya
>
Xiubo Li Nov. 11, 2020, 1:37 a.m. UTC | #5
On 2020/11/11 1:11, Jeff Layton wrote:
> On Tue, 2020-11-10 at 16:44 +0100, Ilya Dryomov wrote:
>> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> The logic is the same with osdc/Objecter.cc in ceph in user space.
>>>
>>> URL: https://tracker.ceph.com/issues/48053
>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>> ---
>>>
>>> V3:
>>> - typo fixing about oring the _WRITE
>>>
>>>   include/linux/ceph/osd_client.h |  9 ++++++
>>>   net/ceph/debugfs.c              | 13 ++++++++
>>>   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>>>   3 files changed, 78 insertions(+)
>>>
>>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>>> index 83fa08a06507..24301513b186 100644
>>> --- a/include/linux/ceph/osd_client.h
>>> +++ b/include/linux/ceph/osd_client.h
>>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>>>          struct ceph_hobject_id *end;
>>>   };
>>>
>>> +struct ceph_osd_metric {
>>> +       struct percpu_counter op_ops;
>>> +       struct percpu_counter op_rmw;
>>> +       struct percpu_counter op_r;
>>> +       struct percpu_counter op_w;
>>> +};
>> OK, so only reads and writes are really needed.  Why not expose them
>> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
>> want to display them?  Exposing latency information without exposing
>> overall counts seems rather weird to me anyway.
>>
>> The fundamental problem is that debugfs output format is not stable.
>> The tracker mentions test_readahead -- updating some teuthology test
>> cases from time to time is not a big deal, but if a user facing tool
>> such as "fs top" starts relying on these, it would be bad.
>>
>> Thanks,
>>
>>                  Ilya
> Those are all good points. The tracker is light on details. I had
> assumed that you'd also be uploading this to the MDS in a later patch.
> Is that also planned?

Yeah, this is on my todo list.


> I'll also add that it might be nice to keeps stats on copy_from2 as
> well, since we do have a copy_file_range operation in cephfs.

Makes sense, and I will add it.

Thanks

BRs
Patrick Donnelly Nov. 11, 2020, 3:18 p.m. UTC | #6
On Tue, Nov 10, 2020 at 7:45 AM Ilya Dryomov <idryomov@gmail.com> wrote:
>
> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
> >
> > From: Xiubo Li <xiubli@redhat.com>
> >
> > The logic is the same with osdc/Objecter.cc in ceph in user space.
> >
> > URL: https://tracker.ceph.com/issues/48053
> > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > ---
> >
> > V3:
> > - typo fixing about oring the _WRITE
> >
> >  include/linux/ceph/osd_client.h |  9 ++++++
> >  net/ceph/debugfs.c              | 13 ++++++++
> >  net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
> >  3 files changed, 78 insertions(+)
> >
> > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > index 83fa08a06507..24301513b186 100644
> > --- a/include/linux/ceph/osd_client.h
> > +++ b/include/linux/ceph/osd_client.h
> > @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
> >         struct ceph_hobject_id *end;
> >  };
> >
> > +struct ceph_osd_metric {
> > +       struct percpu_counter op_ops;
> > +       struct percpu_counter op_rmw;
> > +       struct percpu_counter op_r;
> > +       struct percpu_counter op_w;
> > +};
>
> OK, so only reads and writes are really needed.  Why not expose them
> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> want to display them?  Exposing latency information without exposing
> overall counts seems rather weird to me anyway.

`fs top` may want to eventually display this information but the
intention was to have a "perf dump"-like debugfs file that has
information about the number of osd op reads/writes. We need that for
this test:

https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_readahead.py#L20

Pulling the information out through `fs top` is not a direction I'd like to go.

> The fundamental problem is that debugfs output format is not stable.
> The tracker mentions test_readahead -- updating some teuthology test
> cases from time to time is not a big deal, but if a user facing tool
> such as "fs top" starts relying on these, it would be bad.

`fs top` will not rely on debugfs files.
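
For reference, a minimal userspace sketch of how such a test could
scrape the proposed counters from the per-client osdc debugfs file.
The default path is a placeholder; the real directory is
/sys/kernel/debug/ceph/<fsid>.client<id>/.

/*
 * Read the "OP METRIC:" counters that dump_op_metric() would add to the
 * osdc debugfs file, i.e. lines like
 *   op_r:   42
 *   op_w:   7
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] :
		"/sys/kernel/debug/ceph/FSID.clientID/osdc"; /* placeholder */
	char line[256];
	long long op_r = -1, op_w = -1;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* non-matching lines leave the values untouched */
		sscanf(line, " op_r: %lld", &op_r);
		sscanf(line, " op_w: %lld", &op_w);
	}
	fclose(f);
	printf("op_r=%lld op_w=%lld\n", op_r, op_w);
	return 0;
}
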
Xiubo Li Nov. 12, 2020, 2:30 a.m. UTC | #7
On 2020/11/11 23:18, Patrick Donnelly wrote:
> On Tue, Nov 10, 2020 at 7:45 AM Ilya Dryomov <idryomov@gmail.com> wrote:
>> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> The logic is the same with osdc/Objecter.cc in ceph in user space.
>>>
>>> URL: https://tracker.ceph.com/issues/48053
>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>> ---
>>>
>>> V3:
>>> - typo fixing about oring the _WRITE
>>>
>>>   include/linux/ceph/osd_client.h |  9 ++++++
>>>   net/ceph/debugfs.c              | 13 ++++++++
>>>   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>>>   3 files changed, 78 insertions(+)
>>>
>>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>>> index 83fa08a06507..24301513b186 100644
>>> --- a/include/linux/ceph/osd_client.h
>>> +++ b/include/linux/ceph/osd_client.h
>>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>>>          struct ceph_hobject_id *end;
>>>   };
>>>
>>> +struct ceph_osd_metric {
>>> +       struct percpu_counter op_ops;
>>> +       struct percpu_counter op_rmw;
>>> +       struct percpu_counter op_r;
>>> +       struct percpu_counter op_w;
>>> +};
>> OK, so only reads and writes are really needed.  Why not expose them
>> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
>> want to display them?  Exposing latency information without exposing
>> overall counts seems rather weird to me anyway.
> `fs top` may want to eventually display this information but the
> intention was to have a "perf dump"-like debugfs file that has
> information about the number of osd op reads/writes. We need that for
> this test:
>
> https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_readahead.py#L20
>
> Pulling the information out through `fs top` is not a direction I'd like to go.
>
>> The fundamental problem is that debugfs output format is not stable.
>> The tracker mentions test_readahead -- updating some teuthology test
>> cases from time to time is not a big deal, but if a user facing tool
>> such as "fs top" starts relying on these, it would be bad.
> `fs top` will not rely on debugfs files.
>
There is one bug in "test_readahead.py", which I have fixed in [1]. I 
think the existing cephfs metric framework is enough for us to 
support the readahead qa test for the kclient.

[1] https://github.com/ceph/ceph/pull/38016

Thanks

BRs
Ilya Dryomov Nov. 12, 2020, 10:10 a.m. UTC | #8
On Thu, Nov 12, 2020 at 3:30 AM Xiubo Li <xiubli@redhat.com> wrote:
>
> On 2020/11/11 23:18, Patrick Donnelly wrote:
> > On Tue, Nov 10, 2020 at 7:45 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> >> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
> >>> From: Xiubo Li <xiubli@redhat.com>
> >>>
> >>> The logic is the same with osdc/Objecter.cc in ceph in user space.
> >>>
> >>> URL: https://tracker.ceph.com/issues/48053
> >>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >>> ---
> >>>
> >>> V3:
> >>> - typo fixing about oring the _WRITE
> >>>
> >>>   include/linux/ceph/osd_client.h |  9 ++++++
> >>>   net/ceph/debugfs.c              | 13 ++++++++
> >>>   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
> >>>   3 files changed, 78 insertions(+)
> >>>
> >>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> >>> index 83fa08a06507..24301513b186 100644
> >>> --- a/include/linux/ceph/osd_client.h
> >>> +++ b/include/linux/ceph/osd_client.h
> >>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
> >>>          struct ceph_hobject_id *end;
> >>>   };
> >>>
> >>> +struct ceph_osd_metric {
> >>> +       struct percpu_counter op_ops;
> >>> +       struct percpu_counter op_rmw;
> >>> +       struct percpu_counter op_r;
> >>> +       struct percpu_counter op_w;
> >>> +};
> >> OK, so only reads and writes are really needed.  Why not expose them
> >> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> >> want to display them?  Exposing latency information without exposing
> >> overall counts seems rather weird to me anyway.
> > `fs top` may want to eventually display this information but the
> > intention was to have a "perf dump"-like debugfs file that has
> > information about the number of osd op reads/writes. We need that for
> > this test:
> >
> > https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_readahead.py#L20
> >
> > Pulling the information out through `fs top` is not a direction I'd like to go.
> >
> >> The fundamental problem is that debugfs output format is not stable.
> >> The tracker mentions test_readahead -- updating some teuthology test
> >> cases from time to time is not a big deal, but if a user facing tool
> >> such as "fs top" starts relying on these, it would be bad.
> > `fs top` will not rely on debugfs files.
> >
> There has one bug in the "test_readahead.py", I have fixed it in [1]. I
> think the existing cephfs metric framework is far enough for us to
> support the readahead qa test for kclient.
>
> [1] https://github.com/ceph/ceph/pull/38016

Yeah, it already provides a debugfs file, so the test wouldn't need
"fs top" to pull that counter out.

Thanks,

                Ilya
Jeffrey Layton April 26, 2021, 5:56 p.m. UTC | #9
On Wed, 2020-11-11 at 09:32 +0800, Xiubo Li wrote:
> On 2020/11/10 23:44, Ilya Dryomov wrote:
> > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > The logic is the same with osdc/Objecter.cc in ceph in user space.
> > > 
> > > URL: https://tracker.ceph.com/issues/48053
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > > 
> > > V3:
> > > - typo fixing about oring the _WRITE
> > > 
> > >   include/linux/ceph/osd_client.h |  9 ++++++
> > >   net/ceph/debugfs.c              | 13 ++++++++
> > >   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
> > >   3 files changed, 78 insertions(+)
> > > 
> > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > > index 83fa08a06507..24301513b186 100644
> > > --- a/include/linux/ceph/osd_client.h
> > > +++ b/include/linux/ceph/osd_client.h
> > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
> > >          struct ceph_hobject_id *end;
> > >   };
> > > 
> > > +struct ceph_osd_metric {
> > > +       struct percpu_counter op_ops;
> > > +       struct percpu_counter op_rmw;
> > > +       struct percpu_counter op_r;
> > > +       struct percpu_counter op_w;
> > > +};
> > OK, so only reads and writes are really needed.  Why not expose them
> > through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> > want to display them?  Exposing latency information without exposing
> > overall counts seems rather weird to me anyway.
> 
> Okay, I just thought in future this may also be needed by rbd :-)
> 
> 
> > The fundamental problem is that debugfs output format is not stable.
> > The tracker mentions test_readahead -- updating some teuthology test
> > cases from time to time is not a big deal, but if a user facing tool
> > such as "fs top" starts relying on these, it would be bad.
> 
> No problem, let me move it to fs existing metric framework.
> 

Hi Xiubo/Ilya/Patrick :

Mea culpa... I had intended to drop this patch from the testing branch
after this discussion, but got sidetracked and forgot to do so. I've now
done that, though.

Cheers,
Jeff
Ilya Dryomov April 26, 2021, 8:33 p.m. UTC | #10
On Mon, Apr 26, 2021 at 7:56 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, 2020-11-11 at 09:32 +0800, Xiubo Li wrote:
> > On 2020/11/10 23:44, Ilya Dryomov wrote:
> > > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
> > > > From: Xiubo Li <xiubli@redhat.com>
> > > >
> > > > The logic is the same with osdc/Objecter.cc in ceph in user space.
> > > >
> > > > URL: https://tracker.ceph.com/issues/48053
> > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > ---
> > > >
> > > > V3:
> > > > - typo fixing about oring the _WRITE
> > > >
> > > >   include/linux/ceph/osd_client.h |  9 ++++++
> > > >   net/ceph/debugfs.c              | 13 ++++++++
> > > >   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
> > > >   3 files changed, 78 insertions(+)
> > > >
> > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > > > index 83fa08a06507..24301513b186 100644
> > > > --- a/include/linux/ceph/osd_client.h
> > > > +++ b/include/linux/ceph/osd_client.h
> > > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
> > > >          struct ceph_hobject_id *end;
> > > >   };
> > > >
> > > > +struct ceph_osd_metric {
> > > > +       struct percpu_counter op_ops;
> > > > +       struct percpu_counter op_rmw;
> > > > +       struct percpu_counter op_r;
> > > > +       struct percpu_counter op_w;
> > > > +};
> > > OK, so only reads and writes are really needed.  Why not expose them
> > > through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> > > want to display them?  Exposing latency information without exposing
> > > overall counts seems rather weird to me anyway.
> >
> > Okay, I just thought in future this may also be needed by rbd :-)
> >
> >
> > > The fundamental problem is that debugfs output format is not stable.
> > > The tracker mentions test_readahead -- updating some teuthology test
> > > cases from time to time is not a big deal, but if a user facing tool
> > > such as "fs top" starts relying on these, it would be bad.
> >
> > No problem, let me move it to fs existing metric framework.
> >
>
> Hi Xiubo/Ilya/Patrick :
>
> Mea culpa...I had intended to drop this patch from testing branch after
> this discussion, but got sidetracked and forgot to do so. I've now done
> that though.

On the subject of metrics, I think Xiubo's I/O size metrics patches
need a look -- he reposted the two that were skipped a while ago.

Thanks,

                Ilya
Xiubo Li April 27, 2021, 4:37 a.m. UTC | #11
On 2020/11/11 1:11, Jeff Layton wrote:
> On Tue, 2020-11-10 at 16:44 +0100, Ilya Dryomov wrote:
>> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> The logic is the same with osdc/Objecter.cc in ceph in user space.
>>>
>>> URL: https://tracker.ceph.com/issues/48053
>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>> ---
>>>
>>> V3:
>>> - typo fixing about oring the _WRITE
>>>
>>>   include/linux/ceph/osd_client.h |  9 ++++++
>>>   net/ceph/debugfs.c              | 13 ++++++++
>>>   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>>>   3 files changed, 78 insertions(+)
>>>
>>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>>> index 83fa08a06507..24301513b186 100644
>>> --- a/include/linux/ceph/osd_client.h
>>> +++ b/include/linux/ceph/osd_client.h
>>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>>>          struct ceph_hobject_id *end;
>>>   };
>>>
>>> +struct ceph_osd_metric {
>>> +       struct percpu_counter op_ops;
>>> +       struct percpu_counter op_rmw;
>>> +       struct percpu_counter op_r;
>>> +       struct percpu_counter op_w;
>>> +};
>> OK, so only reads and writes are really needed.  Why not expose them
>> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
>> want to display them?  Exposing latency information without exposing
>> overall counts seems rather weird to me anyway.
>>
>> The fundamental problem is that debugfs output format is not stable.
>> The tracker mentions test_readahead -- updating some teuthology test
>> cases from time to time is not a big deal, but if a user facing tool
>> such as "fs top" starts relying on these, it would be bad.
>>
>> Thanks,
>>
>>                  Ilya
> Those are all good points. The tracker is light on details. I had
> assumed that you'd also be uploading this to the MDS in a later patch.
> Is that also planned?

Yeah, it is; the next plan is to send these metrics to the MDS.

> I'll also add that it might be nice to keeps stats on copy_from2 as
> well, since we do have a copy_file_range operation in cephfs.

Okay, will check it.
Xiubo Li April 27, 2021, 4:40 a.m. UTC | #12
On 2020/11/10 23:44, Ilya Dryomov wrote:
> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> The logic is the same with osdc/Objecter.cc in ceph in user space.
>>
>> URL: https://tracker.ceph.com/issues/48053
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>
>> V3:
>> - typo fixing about oring the _WRITE
>>
>>   include/linux/ceph/osd_client.h |  9 ++++++
>>   net/ceph/debugfs.c              | 13 ++++++++
>>   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
>>   3 files changed, 78 insertions(+)
>>
>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> index 83fa08a06507..24301513b186 100644
>> --- a/include/linux/ceph/osd_client.h
>> +++ b/include/linux/ceph/osd_client.h
>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
>>          struct ceph_hobject_id *end;
>>   };
>>
>> +struct ceph_osd_metric {
>> +       struct percpu_counter op_ops;
>> +       struct percpu_counter op_rmw;
>> +       struct percpu_counter op_r;
>> +       struct percpu_counter op_w;
>> +};
> OK, so only reads and writes are really needed.  Why not expose them
> through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> want to display them?  Exposing latency information without exposing
> overall counts seems rather weird to me anyway.
>
> The fundamental problem is that debugfs output format is not stable.
> The tracker mentions test_readahead -- updating some teuthology test
> cases from time to time is not a big deal, but if a user facing tool
> such as "fs top" starts relying on these, it would be bad.

Sorry, I am not sure whether I had already replied to this; I may have 
missed this whole thread.

As Patrick mentioned, fs top won't rely on it; it will collect the 
metrics from the MDS instead.

Thanks.

> Thanks,
>
>                  Ilya
>
Jeffrey Layton April 27, 2021, 6:42 p.m. UTC | #13
On Mon, 2021-04-26 at 22:33 +0200, Ilya Dryomov wrote:
> On Mon, Apr 26, 2021 at 7:56 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Wed, 2020-11-11 at 09:32 +0800, Xiubo Li wrote:
> > > On 2020/11/10 23:44, Ilya Dryomov wrote:
> > > > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote:
> > > > > From: Xiubo Li <xiubli@redhat.com>
> > > > > 
> > > > > The logic is the same with osdc/Objecter.cc in ceph in user space.
> > > > > 
> > > > > URL: https://tracker.ceph.com/issues/48053
> > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > > ---
> > > > > 
> > > > > V3:
> > > > > - typo fixing about oring the _WRITE
> > > > > 
> > > > >   include/linux/ceph/osd_client.h |  9 ++++++
> > > > >   net/ceph/debugfs.c              | 13 ++++++++
> > > > >   net/ceph/osd_client.c           | 56 +++++++++++++++++++++++++++++++++
> > > > >   3 files changed, 78 insertions(+)
> > > > > 
> > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > > > > index 83fa08a06507..24301513b186 100644
> > > > > --- a/include/linux/ceph/osd_client.h
> > > > > +++ b/include/linux/ceph/osd_client.h
> > > > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff {
> > > > >          struct ceph_hobject_id *end;
> > > > >   };
> > > > > 
> > > > > +struct ceph_osd_metric {
> > > > > +       struct percpu_counter op_ops;
> > > > > +       struct percpu_counter op_rmw;
> > > > > +       struct percpu_counter op_r;
> > > > > +       struct percpu_counter op_w;
> > > > > +};
> > > > OK, so only reads and writes are really needed.  Why not expose them
> > > > through the existing metrics framework in fs/ceph?  Wouldn't "fs top"
> > > > want to display them?  Exposing latency information without exposing
> > > > overall counts seems rather weird to me anyway.
> > > 
> > > Okay, I just thought in future this may also be needed by rbd :-)
> > > 
> > > 
> > > > The fundamental problem is that debugfs output format is not stable.
> > > > The tracker mentions test_readahead -- updating some teuthology test
> > > > cases from time to time is not a big deal, but if a user facing tool
> > > > such as "fs top" starts relying on these, it would be bad.
> > > 
> > > No problem, let me move it to fs existing metric framework.
> > > 
> > 
> > Hi Xiubo/Ilya/Patrick :
> > 
> > Mea culpa...I had intended to drop this patch from testing branch after
> > this discussion, but got sidetracked and forgot to do so. I've now done
> > that though.
> 
> On the subject of metrics, I think Xiubo's I/O size metrics patches
> need a look -- he reposted the two that were skipped a while ago.
> 

Thanks for reminding me. I saw that he sent those when I was OOTO, and I
forgot to revisit them. In the future, if I do that, ping me about them
and I'll try to get to them sooner.

Patch

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 83fa08a06507..24301513b186 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -339,6 +339,13 @@  struct ceph_osd_backoff {
 	struct ceph_hobject_id *end;
 };
 
+struct ceph_osd_metric {
+	struct percpu_counter op_ops;
+	struct percpu_counter op_rmw;
+	struct percpu_counter op_r;
+	struct percpu_counter op_w;
+};
+
 #define CEPH_LINGER_ID_START	0xffff000000000000ULL
 
 struct ceph_osd_client {
@@ -371,6 +378,8 @@  struct ceph_osd_client {
 	struct ceph_msgpool	msgpool_op;
 	struct ceph_msgpool	msgpool_op_reply;
 
+	struct ceph_osd_metric  metric;
+
 	struct workqueue_struct	*notify_wq;
 	struct workqueue_struct	*completion_wq;
 };
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 2110439f8a24..af90019386ab 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -339,6 +339,16 @@  static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
 	mutex_unlock(&osd->lock);
 }
 
+static void dump_op_metric(struct seq_file *s, struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_metric *m = &osdc->metric;
+
+	seq_printf(s, "  op_ops: %lld\n", percpu_counter_sum(&m->op_ops));
+	seq_printf(s, "  op_rmw: %lld\n", percpu_counter_sum(&m->op_rmw));
+	seq_printf(s, "  op_r:   %lld\n", percpu_counter_sum(&m->op_r));
+	seq_printf(s, "  op_w:   %lld\n", percpu_counter_sum(&m->op_w));
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
@@ -372,6 +382,9 @@  static int osdc_show(struct seq_file *s, void *pp)
 	}
 
 	up_read(&osdc->lock);
+
+	seq_puts(s, "OP METRIC:\n");
+	dump_op_metric(s, osdc);
 	return 0;
 }
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7901ab6c79fd..0e6e127dd669 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2424,6 +2424,21 @@  static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	goto again;
 }
 
+static void osd_acount_op_metric(struct ceph_osd_request *req)
+{
+	struct ceph_osd_metric *m = &req->r_osdc->metric;
+
+	percpu_counter_inc(&m->op_ops);
+
+	if ((req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))
+	    == (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))
+		percpu_counter_inc(&m->op_rmw);
+	if (req->r_flags & CEPH_OSD_FLAG_READ)
+		percpu_counter_inc(&m->op_r);
+	else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		percpu_counter_inc(&m->op_w);
+}
+
 static void account_request(struct ceph_osd_request *req)
 {
 	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
@@ -2434,6 +2449,8 @@  static void account_request(struct ceph_osd_request *req)
 
 	req->r_start_stamp = jiffies;
 	req->r_start_latency = ktime_get();
+
+	osd_acount_op_metric(req);
 }
 
 static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -5205,6 +5222,39 @@  void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
 	up_write(&osdc->lock);
 }
 
+static void ceph_metric_destroy(struct ceph_osd_metric *m)
+{
+	percpu_counter_destroy(&m->op_ops);
+	percpu_counter_destroy(&m->op_rmw);
+	percpu_counter_destroy(&m->op_r);
+	percpu_counter_destroy(&m->op_w);
+}
+
+static int ceph_metric_init(struct ceph_osd_metric *m)
+{
+	int ret;
+
+	memset(m, 0, sizeof(*m));
+
+	ret = percpu_counter_init(&m->op_ops, 0, GFP_NOIO);
+	if (ret)
+		return ret;
+	ret = percpu_counter_init(&m->op_rmw, 0, GFP_NOIO);
+	if (ret)
+		goto err;
+	ret = percpu_counter_init(&m->op_r, 0, GFP_NOIO);
+	if (ret)
+		goto err;
+	ret = percpu_counter_init(&m->op_w, 0, GFP_NOIO);
+	if (ret)
+		goto err;
+	return 0;
+
+err:
+	ceph_metric_destroy(m);
+	return ret;
+}
+
 /*
  * init, shutdown
  */
@@ -5257,6 +5307,9 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->completion_wq)
 		goto out_notify_wq;
 
+	if (ceph_metric_init(&osdc->metric) < 0)
+		goto out_completion_wq;
+
 	schedule_delayed_work(&osdc->timeout_work,
 			      osdc->client->options->osd_keepalive_timeout);
 	schedule_delayed_work(&osdc->osds_timeout_work,
@@ -5264,6 +5317,8 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
 	return 0;
 
+out_completion_wq:
+	destroy_workqueue(osdc->completion_wq);
 out_notify_wq:
 	destroy_workqueue(osdc->notify_wq);
 out_msgpool_reply:
@@ -5302,6 +5357,7 @@  void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	WARN_ON(atomic_read(&osdc->num_requests));
 	WARN_ON(atomic_read(&osdc->num_homeless));
 
+	ceph_metric_destroy(&osdc->metric);
 	ceph_osdmap_destroy(osdc->osdmap);
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
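
A small standalone C program mirroring the classification done by
osd_acount_op_metric() above: a request with both flags set counts as an
rmw (and, per the patch, also as a read); otherwise it counts as a plain
read or write.  The flag values are local stand-ins for
CEPH_OSD_FLAG_READ/CEPH_OSD_FLAG_WRITE; only the bit logic matters here.

#include <stdio.h>

#define FLAG_READ  0x1
#define FLAG_WRITE 0x2

struct op_metric {
	long long op_ops, op_rmw, op_r, op_w;
};

static void account_op(struct op_metric *m, unsigned int flags)
{
	m->op_ops++;

	/* both flags set: read-modify-write */
	if ((flags & (FLAG_READ | FLAG_WRITE)) == (FLAG_READ | FLAG_WRITE))
		m->op_rmw++;

	/* an rmw request also increments op_r, as in the patch */
	if (flags & FLAG_READ)
		m->op_r++;
	else if (flags & FLAG_WRITE)
		m->op_w++;
}

int main(void)
{
	struct op_metric m = { 0 };

	account_op(&m, FLAG_READ);			/* plain read  */
	account_op(&m, FLAG_WRITE);			/* plain write */
	account_op(&m, FLAG_READ | FLAG_WRITE);		/* rmw         */

	/* prints op_ops: 3, op_rmw: 1, op_r: 2, op_w: 1 */
	printf("op_ops: %lld\nop_rmw: %lld\nop_r:   %lld\nop_w:   %lld\n",
	       m.op_ops, m.op_rmw, m.op_r, m.op_w);
	return 0;
}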