| Message ID | 20201110141937.414301-1-xiubli@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | [v3] libceph: add osd op counter metric support |
On Tue, 2020-11-10 at 22:19 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > URL: https://tracker.ceph.com/issues/48053 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > > V3: > - typo fixing about oring the _WRITE > > include/linux/ceph/osd_client.h | 9 ++++++ > net/ceph/debugfs.c | 13 ++++++++ > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > 3 files changed, 78 insertions(+) > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > index 83fa08a06507..24301513b186 100644 > --- a/include/linux/ceph/osd_client.h > +++ b/include/linux/ceph/osd_client.h > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > struct ceph_hobject_id *end; > }; > > > +struct ceph_osd_metric { > + struct percpu_counter op_ops; > + struct percpu_counter op_rmw; > + struct percpu_counter op_r; > + struct percpu_counter op_w; > +}; > + > #define CEPH_LINGER_ID_START 0xffff000000000000ULL > > > struct ceph_osd_client { > @@ -371,6 +378,8 @@ struct ceph_osd_client { > struct ceph_msgpool msgpool_op; > struct ceph_msgpool msgpool_op_reply; > > > + struct ceph_osd_metric metric; > + > struct workqueue_struct *notify_wq; > struct workqueue_struct *completion_wq; > }; > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c > index 2110439f8a24..af90019386ab 100644 > --- a/net/ceph/debugfs.c > +++ b/net/ceph/debugfs.c > @@ -339,6 +339,16 @@ static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd) > mutex_unlock(&osd->lock); > } > > > +static void dump_op_metric(struct seq_file *s, struct ceph_osd_client *osdc) > +{ > + struct ceph_osd_metric *m = &osdc->metric; > + > + seq_printf(s, " op_ops: %lld\n", percpu_counter_sum(&m->op_ops)); > + seq_printf(s, " op_rmw: %lld\n", percpu_counter_sum(&m->op_rmw)); > + seq_printf(s, " op_r: %lld\n", percpu_counter_sum(&m->op_r)); > + seq_printf(s, " op_w: %lld\n", percpu_counter_sum(&m->op_w)); > +} > + > static int osdc_show(struct seq_file *s, void *pp) > { > struct ceph_client *client = s->private; > @@ -372,6 +382,9 @@ static int osdc_show(struct seq_file *s, void *pp) > } > > > up_read(&osdc->lock); > + > + seq_puts(s, "OP METRIC:\n"); > + dump_op_metric(s, osdc); > return 0; > } > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > index 7901ab6c79fd..0e6e127dd669 100644 > --- a/net/ceph/osd_client.c > +++ b/net/ceph/osd_client.c > @@ -2424,6 +2424,21 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) > goto again; > } > > > +static void osd_acount_op_metric(struct ceph_osd_request *req) > +{ > + struct ceph_osd_metric *m = &req->r_osdc->metric; > + > + percpu_counter_inc(&m->op_ops); > + > + if ((req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)) > + == (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)) > + percpu_counter_inc(&m->op_rmw); > + if (req->r_flags & CEPH_OSD_FLAG_READ) > + percpu_counter_inc(&m->op_r); > + else if (req->r_flags & CEPH_OSD_FLAG_WRITE) > + percpu_counter_inc(&m->op_w); > +} > + > static void account_request(struct ceph_osd_request *req) > { > WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); > @@ -2434,6 +2449,8 @@ static void account_request(struct ceph_osd_request *req) > > > req->r_start_stamp = jiffies; > req->r_start_latency = ktime_get(); > + > + osd_acount_op_metric(req); > } > > > static void submit_request(struct ceph_osd_request *req, bool wrlocked) > @@ -5205,6 +5222,39 @@ void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) > 
up_write(&osdc->lock); > } > > > +static void ceph_metric_destroy(struct ceph_osd_metric *m) > +{ > + percpu_counter_destroy(&m->op_ops); > + percpu_counter_destroy(&m->op_rmw); > + percpu_counter_destroy(&m->op_r); > + percpu_counter_destroy(&m->op_w); > +} > + > +static int ceph_metric_init(struct ceph_osd_metric *m) > +{ > + int ret; > + > + memset(m, 0, sizeof(*m)); > + > + ret = percpu_counter_init(&m->op_ops, 0, GFP_NOIO); > + if (ret) > + return ret; > + ret = percpu_counter_init(&m->op_rmw, 0, GFP_NOIO); > + if (ret) > + goto err; > + ret = percpu_counter_init(&m->op_r, 0, GFP_NOIO); > + if (ret) > + goto err; > + ret = percpu_counter_init(&m->op_w, 0, GFP_NOIO); > + if (ret) > + goto err; > + return 0; > + > +err: > + ceph_metric_destroy(m); > + return ret; > +} > + > /* > * init, shutdown > */ > @@ -5257,6 +5307,9 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) > if (!osdc->completion_wq) > goto out_notify_wq; > > > + if (ceph_metric_init(&osdc->metric) < 0) > + goto out_completion_wq; > + > schedule_delayed_work(&osdc->timeout_work, > osdc->client->options->osd_keepalive_timeout); > schedule_delayed_work(&osdc->osds_timeout_work, > @@ -5264,6 +5317,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) > > > return 0; > > > +out_completion_wq: > + destroy_workqueue(osdc->completion_wq); > out_notify_wq: > destroy_workqueue(osdc->notify_wq); > out_msgpool_reply: > @@ -5302,6 +5357,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) > WARN_ON(atomic_read(&osdc->num_requests)); > WARN_ON(atomic_read(&osdc->num_homeless)); > > > + ceph_metric_destroy(&osdc->metric); > ceph_osdmap_destroy(osdc->osdmap); > mempool_destroy(osdc->req_mempool); > ceph_msgpool_destroy(&osdc->msgpool_op); Looks sane. Merged into testing.
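A note for readers on the rmw accounting in the patch above: a request only counts as read-modify-write when both the READ and the WRITE flag bits are set on r_flags, which is why the code masks with the combined READ|WRITE value and then compares the result against that same mask rather than just testing the masked value for non-zero. A minimal standalone illustration of the difference between the two tests (the flag values are made-up stand-ins; the real CEPH_OSD_FLAG_READ/CEPH_OSD_FLAG_WRITE bits live in include/linux/ceph/rados.h):

#include <stdio.h>

/* Demo stand-ins for CEPH_OSD_FLAG_READ / CEPH_OSD_FLAG_WRITE. */
#define DEMO_FLAG_READ  0x1
#define DEMO_FLAG_WRITE 0x2

/* Correct rmw test: true only when BOTH bits are set. */
static int is_rmw(unsigned int flags)
{
	return (flags & (DEMO_FLAG_READ | DEMO_FLAG_WRITE)) ==
	       (DEMO_FLAG_READ | DEMO_FLAG_WRITE);
}

/* Tempting but wrong: non-zero whenever EITHER bit is set, so plain
 * reads and plain writes would be miscounted as rmw. */
static int either_bit_set(unsigned int flags)
{
	return !!(flags & (DEMO_FLAG_READ | DEMO_FLAG_WRITE));
}

int main(void)
{
	unsigned int read_only = DEMO_FLAG_READ;
	unsigned int rmw = DEMO_FLAG_READ | DEMO_FLAG_WRITE;

	printf("read-only: is_rmw=%d either=%d\n",
	       is_rmw(read_only), either_bit_set(read_only));
	printf("rmw:       is_rmw=%d either=%d\n",
	       is_rmw(rmw), either_bit_set(rmw));
	return 0;
}

Compiled with any C compiler, this prints is_rmw=0 for the read-only case and is_rmw=1 only for the combined case, matching how osd_acount_op_metric() classifies requests.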
On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > > From: Xiubo Li <xiubli@redhat.com> > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > URL: https://tracker.ceph.com/issues/48053 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > > V3: > - typo fixing about oring the _WRITE > > include/linux/ceph/osd_client.h | 9 ++++++ > net/ceph/debugfs.c | 13 ++++++++ > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > 3 files changed, 78 insertions(+) > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > index 83fa08a06507..24301513b186 100644 > --- a/include/linux/ceph/osd_client.h > +++ b/include/linux/ceph/osd_client.h > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > struct ceph_hobject_id *end; > }; > > +struct ceph_osd_metric { > + struct percpu_counter op_ops; > + struct percpu_counter op_rmw; > + struct percpu_counter op_r; > + struct percpu_counter op_w; > +}; OK, so only reads and writes are really needed. Why not expose them through the existing metrics framework in fs/ceph? Wouldn't "fs top" want to display them? Exposing latency information without exposing overall counts seems rather weird to me anyway. The fundamental problem is that debugfs output format is not stable. The tracker mentions test_readahead -- updating some teuthology test cases from time to time is not a big deal, but if a user facing tool such as "fs top" starts relying on these, it would be bad. Thanks, Ilya
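For context on the framework Ilya refers to: fs/ceph already keeps per-mount read/write latency metrics and exposes them through its own debugfs file, and it is also the layer the thread later discusses reporting metrics to the MDS from. A minimal sketch of what plain operation counters on that side could look like — the structure and function names below are illustrative assumptions, not the actual fs/ceph code:

#include <linux/types.h>
#include <linux/gfp.h>
#include <linux/percpu_counter.h>

/* Hypothetical cephfs-side counters, mirroring what the libceph patch
 * does but hung off the filesystem's metrics instead of the osd_client. */
struct ceph_fs_op_metric {
	struct percpu_counter read_ops;		/* assumed name */
	struct percpu_counter write_ops;	/* assumed name */
};

static int ceph_fs_op_metric_init(struct ceph_fs_op_metric *m)
{
	int ret;

	ret = percpu_counter_init(&m->read_ops, 0, GFP_KERNEL);
	if (ret)
		return ret;
	ret = percpu_counter_init(&m->write_ops, 0, GFP_KERNEL);
	if (ret)
		percpu_counter_destroy(&m->read_ops);
	return ret;
}

/* Called from the cephfs read/write paths, next to where the existing
 * latency metrics are updated. */
static inline void ceph_fs_count_op(struct ceph_fs_op_metric *m, bool is_write)
{
	percpu_counter_inc(is_write ? &m->write_ops : &m->read_ops);
}

The trade-off that comes up later in the thread still applies: counters kept in the osd_client would benefit rbd as well, while counters kept in fs/ceph line up with the metrics the filesystem already tracks and reports.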
On Tue, 2020-11-10 at 16:44 +0100, Ilya Dryomov wrote: > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > > > > From: Xiubo Li <xiubli@redhat.com> > > > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > > > URL: https://tracker.ceph.com/issues/48053 > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > --- > > > > V3: > > - typo fixing about oring the _WRITE > > > > include/linux/ceph/osd_client.h | 9 ++++++ > > net/ceph/debugfs.c | 13 ++++++++ > > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > > 3 files changed, 78 insertions(+) > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > index 83fa08a06507..24301513b186 100644 > > --- a/include/linux/ceph/osd_client.h > > +++ b/include/linux/ceph/osd_client.h > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > > struct ceph_hobject_id *end; > > }; > > > > +struct ceph_osd_metric { > > + struct percpu_counter op_ops; > > + struct percpu_counter op_rmw; > > + struct percpu_counter op_r; > > + struct percpu_counter op_w; > > +}; > > OK, so only reads and writes are really needed. Why not expose them > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > want to display them? Exposing latency information without exposing > overall counts seems rather weird to me anyway. > > The fundamental problem is that debugfs output format is not stable. > The tracker mentions test_readahead -- updating some teuthology test > cases from time to time is not a big deal, but if a user facing tool > such as "fs top" starts relying on these, it would be bad. > > Thanks, > > Ilya Those are all good points. The tracker is light on details. I had assumed that you'd also be uploading this to the MDS in a later patch. Is that also planned? I'll also add that it might be nice to keeps stats on copy_from2 as well, since we do have a copy_file_range operation in cephfs.
On 2020/11/10 23:44, Ilya Dryomov wrote: > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> The logic is the same with osdc/Objecter.cc in ceph in user space. >> >> URL: https://tracker.ceph.com/issues/48053 >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> >> V3: >> - typo fixing about oring the _WRITE >> >> include/linux/ceph/osd_client.h | 9 ++++++ >> net/ceph/debugfs.c | 13 ++++++++ >> net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ >> 3 files changed, 78 insertions(+) >> >> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >> index 83fa08a06507..24301513b186 100644 >> --- a/include/linux/ceph/osd_client.h >> +++ b/include/linux/ceph/osd_client.h >> @@ -339,6 +339,13 @@ struct ceph_osd_backoff { >> struct ceph_hobject_id *end; >> }; >> >> +struct ceph_osd_metric { >> + struct percpu_counter op_ops; >> + struct percpu_counter op_rmw; >> + struct percpu_counter op_r; >> + struct percpu_counter op_w; >> +}; > OK, so only reads and writes are really needed. Why not expose them > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > want to display them? Exposing latency information without exposing > overall counts seems rather weird to me anyway. Okay, I just thought in future this may also be needed by rbd :-) > The fundamental problem is that debugfs output format is not stable. > The tracker mentions test_readahead -- updating some teuthology test > cases from time to time is not a big deal, but if a user facing tool > such as "fs top" starts relying on these, it would be bad. No problem, let me move it to fs existing metric framework. Thanks Ilya. BRs > Thanks, > > Ilya >
On 2020/11/11 1:11, Jeff Layton wrote: > On Tue, 2020-11-10 at 16:44 +0100, Ilya Dryomov wrote: >> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: >>> From: Xiubo Li <xiubli@redhat.com> >>> >>> The logic is the same with osdc/Objecter.cc in ceph in user space. >>> >>> URL: https://tracker.ceph.com/issues/48053 >>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>> --- >>> >>> V3: >>> - typo fixing about oring the _WRITE >>> >>> include/linux/ceph/osd_client.h | 9 ++++++ >>> net/ceph/debugfs.c | 13 ++++++++ >>> net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ >>> 3 files changed, 78 insertions(+) >>> >>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >>> index 83fa08a06507..24301513b186 100644 >>> --- a/include/linux/ceph/osd_client.h >>> +++ b/include/linux/ceph/osd_client.h >>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff { >>> struct ceph_hobject_id *end; >>> }; >>> >>> +struct ceph_osd_metric { >>> + struct percpu_counter op_ops; >>> + struct percpu_counter op_rmw; >>> + struct percpu_counter op_r; >>> + struct percpu_counter op_w; >>> +}; >> OK, so only reads and writes are really needed. Why not expose them >> through the existing metrics framework in fs/ceph? Wouldn't "fs top" >> want to display them? Exposing latency information without exposing >> overall counts seems rather weird to me anyway. >> >> The fundamental problem is that debugfs output format is not stable. >> The tracker mentions test_readahead -- updating some teuthology test >> cases from time to time is not a big deal, but if a user facing tool >> such as "fs top" starts relying on these, it would be bad. >> >> Thanks, >> >> Ilya > Those are all good points. The tracker is light on details. I had > assumed that you'd also be uploading this to the MDS in a later patch. > Is that also planned? Yeah, this is on my todo list. > I'll also add that it might be nice to keeps stats on copy_from2 as > well, since we do have a copy_file_range operation in cephfs. Make sense and I will add it. Thanks BRs
On Tue, Nov 10, 2020 at 7:45 AM Ilya Dryomov <idryomov@gmail.com> wrote: > > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > > > > From: Xiubo Li <xiubli@redhat.com> > > > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > > > URL: https://tracker.ceph.com/issues/48053 > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > --- > > > > V3: > > - typo fixing about oring the _WRITE > > > > include/linux/ceph/osd_client.h | 9 ++++++ > > net/ceph/debugfs.c | 13 ++++++++ > > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > > 3 files changed, 78 insertions(+) > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > index 83fa08a06507..24301513b186 100644 > > --- a/include/linux/ceph/osd_client.h > > +++ b/include/linux/ceph/osd_client.h > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > > struct ceph_hobject_id *end; > > }; > > > > +struct ceph_osd_metric { > > + struct percpu_counter op_ops; > > + struct percpu_counter op_rmw; > > + struct percpu_counter op_r; > > + struct percpu_counter op_w; > > +}; > > OK, so only reads and writes are really needed. Why not expose them > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > want to display them? Exposing latency information without exposing > overall counts seems rather weird to me anyway. `fs top` may want to eventually display this information but the intention was to have a "perf dump"-like debugfs file that has information about the number of osd op reads/writes. We need that for this test: https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_readahead.py#L20 Pulling the information out through `fs top` is not a direction I'd like to go. > The fundamental problem is that debugfs output format is not stable. > The tracker mentions test_readahead -- updating some teuthology test > cases from time to time is not a big deal, but if a user facing tool > such as "fs top" starts relying on these, it would be bad. `fs top` will not rely on debugfs files.
On 2020/11/11 23:18, Patrick Donnelly wrote: > On Tue, Nov 10, 2020 at 7:45 AM Ilya Dryomov <idryomov@gmail.com> wrote: >> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: >>> From: Xiubo Li <xiubli@redhat.com> >>> >>> The logic is the same with osdc/Objecter.cc in ceph in user space. >>> >>> URL: https://tracker.ceph.com/issues/48053 >>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>> --- >>> >>> V3: >>> - typo fixing about oring the _WRITE >>> >>> include/linux/ceph/osd_client.h | 9 ++++++ >>> net/ceph/debugfs.c | 13 ++++++++ >>> net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ >>> 3 files changed, 78 insertions(+) >>> >>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >>> index 83fa08a06507..24301513b186 100644 >>> --- a/include/linux/ceph/osd_client.h >>> +++ b/include/linux/ceph/osd_client.h >>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff { >>> struct ceph_hobject_id *end; >>> }; >>> >>> +struct ceph_osd_metric { >>> + struct percpu_counter op_ops; >>> + struct percpu_counter op_rmw; >>> + struct percpu_counter op_r; >>> + struct percpu_counter op_w; >>> +}; >> OK, so only reads and writes are really needed. Why not expose them >> through the existing metrics framework in fs/ceph? Wouldn't "fs top" >> want to display them? Exposing latency information without exposing >> overall counts seems rather weird to me anyway. > `fs top` may want to eventually display this information but the > intention was to have a "perf dump"-like debugfs file that has > information about the number of osd op reads/writes. We need that for > this test: > > https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_readahead.py#L20 > > Pulling the information out through `fs top` is not a direction I'd like to go. > >> The fundamental problem is that debugfs output format is not stable. >> The tracker mentions test_readahead -- updating some teuthology test >> cases from time to time is not a big deal, but if a user facing tool >> such as "fs top" starts relying on these, it would be bad. > `fs top` will not rely on debugfs files. > There has one bug in the "test_readahead.py", I have fixed it in [1]. I think the existing cephfs metric framework is far enough for us to support the readahead qa test for kclient. [1] https://github.com/ceph/ceph/pull/38016 Thanks BRs
On Thu, Nov 12, 2020 at 3:30 AM Xiubo Li <xiubli@redhat.com> wrote: > > On 2020/11/11 23:18, Patrick Donnelly wrote: > > On Tue, Nov 10, 2020 at 7:45 AM Ilya Dryomov <idryomov@gmail.com> wrote: > >> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > >>> From: Xiubo Li <xiubli@redhat.com> > >>> > >>> The logic is the same with osdc/Objecter.cc in ceph in user space. > >>> > >>> URL: https://tracker.ceph.com/issues/48053 > >>> Signed-off-by: Xiubo Li <xiubli@redhat.com> > >>> --- > >>> > >>> V3: > >>> - typo fixing about oring the _WRITE > >>> > >>> include/linux/ceph/osd_client.h | 9 ++++++ > >>> net/ceph/debugfs.c | 13 ++++++++ > >>> net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > >>> 3 files changed, 78 insertions(+) > >>> > >>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > >>> index 83fa08a06507..24301513b186 100644 > >>> --- a/include/linux/ceph/osd_client.h > >>> +++ b/include/linux/ceph/osd_client.h > >>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > >>> struct ceph_hobject_id *end; > >>> }; > >>> > >>> +struct ceph_osd_metric { > >>> + struct percpu_counter op_ops; > >>> + struct percpu_counter op_rmw; > >>> + struct percpu_counter op_r; > >>> + struct percpu_counter op_w; > >>> +}; > >> OK, so only reads and writes are really needed. Why not expose them > >> through the existing metrics framework in fs/ceph? Wouldn't "fs top" > >> want to display them? Exposing latency information without exposing > >> overall counts seems rather weird to me anyway. > > `fs top` may want to eventually display this information but the > > intention was to have a "perf dump"-like debugfs file that has > > information about the number of osd op reads/writes. We need that for > > this test: > > > > https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_readahead.py#L20 > > > > Pulling the information out through `fs top` is not a direction I'd like to go. > > > >> The fundamental problem is that debugfs output format is not stable. > >> The tracker mentions test_readahead -- updating some teuthology test > >> cases from time to time is not a big deal, but if a user facing tool > >> such as "fs top" starts relying on these, it would be bad. > > `fs top` will not rely on debugfs files. > > > There has one bug in the "test_readahead.py", I have fixed it in [1]. I > think the existing cephfs metric framework is far enough for us to > support the readahead qa test for kclient. > > [1] https://github.com/ceph/ceph/pull/38016 Yeah, it already provides a debugfs file, so the test wouldn't need "fs top" to pull that counter out. Thanks, Ilya
On Wed, 2020-11-11 at 09:32 +0800, Xiubo Li wrote: > On 2020/11/10 23:44, Ilya Dryomov wrote: > > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > > > > > URL: https://tracker.ceph.com/issues/48053 > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > --- > > > > > > V3: > > > - typo fixing about oring the _WRITE > > > > > > include/linux/ceph/osd_client.h | 9 ++++++ > > > net/ceph/debugfs.c | 13 ++++++++ > > > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > > > 3 files changed, 78 insertions(+) > > > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > > index 83fa08a06507..24301513b186 100644 > > > --- a/include/linux/ceph/osd_client.h > > > +++ b/include/linux/ceph/osd_client.h > > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > > > struct ceph_hobject_id *end; > > > }; > > > > > > +struct ceph_osd_metric { > > > + struct percpu_counter op_ops; > > > + struct percpu_counter op_rmw; > > > + struct percpu_counter op_r; > > > + struct percpu_counter op_w; > > > +}; > > OK, so only reads and writes are really needed. Why not expose them > > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > > want to display them? Exposing latency information without exposing > > overall counts seems rather weird to me anyway. > > Okay, I just thought in future this may also be needed by rbd :-) > > > > The fundamental problem is that debugfs output format is not stable. > > The tracker mentions test_readahead -- updating some teuthology test > > cases from time to time is not a big deal, but if a user facing tool > > such as "fs top" starts relying on these, it would be bad. > > No problem, let me move it to fs existing metric framework. > Hi Xiubo/Ilya/Patrick : Mea culpa...I had intended to drop this patch from testing branch after this discussion, but got sidetracked and forgot to do so. I've now done that though. Cheers, Jeff
On Mon, Apr 26, 2021 at 7:56 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Wed, 2020-11-11 at 09:32 +0800, Xiubo Li wrote: > > On 2020/11/10 23:44, Ilya Dryomov wrote: > > > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > > > > > > > URL: https://tracker.ceph.com/issues/48053 > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > > --- > > > > > > > > V3: > > > > - typo fixing about oring the _WRITE > > > > > > > > include/linux/ceph/osd_client.h | 9 ++++++ > > > > net/ceph/debugfs.c | 13 ++++++++ > > > > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > > > > 3 files changed, 78 insertions(+) > > > > > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > > > index 83fa08a06507..24301513b186 100644 > > > > --- a/include/linux/ceph/osd_client.h > > > > +++ b/include/linux/ceph/osd_client.h > > > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > > > > struct ceph_hobject_id *end; > > > > }; > > > > > > > > +struct ceph_osd_metric { > > > > + struct percpu_counter op_ops; > > > > + struct percpu_counter op_rmw; > > > > + struct percpu_counter op_r; > > > > + struct percpu_counter op_w; > > > > +}; > > > OK, so only reads and writes are really needed. Why not expose them > > > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > > > want to display them? Exposing latency information without exposing > > > overall counts seems rather weird to me anyway. > > > > Okay, I just thought in future this may also be needed by rbd :-) > > > > > > > The fundamental problem is that debugfs output format is not stable. > > > The tracker mentions test_readahead -- updating some teuthology test > > > cases from time to time is not a big deal, but if a user facing tool > > > such as "fs top" starts relying on these, it would be bad. > > > > No problem, let me move it to fs existing metric framework. > > > > Hi Xiubo/Ilya/Patrick : > > Mea culpa...I had intended to drop this patch from testing branch after > this discussion, but got sidetracked and forgot to do so. I've now done > that though. On the subject of metrics, I think Xiubo's I/O size metrics patches need a look -- he reposted the two that were skipped a while ago. Thanks, Ilya
On 2020/11/11 1:11, Jeff Layton wrote: > On Tue, 2020-11-10 at 16:44 +0100, Ilya Dryomov wrote: >> On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: >>> From: Xiubo Li <xiubli@redhat.com> >>> >>> The logic is the same with osdc/Objecter.cc in ceph in user space. >>> >>> URL: https://tracker.ceph.com/issues/48053 >>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>> --- >>> >>> V3: >>> - typo fixing about oring the _WRITE >>> >>> include/linux/ceph/osd_client.h | 9 ++++++ >>> net/ceph/debugfs.c | 13 ++++++++ >>> net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ >>> 3 files changed, 78 insertions(+) >>> >>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >>> index 83fa08a06507..24301513b186 100644 >>> --- a/include/linux/ceph/osd_client.h >>> +++ b/include/linux/ceph/osd_client.h >>> @@ -339,6 +339,13 @@ struct ceph_osd_backoff { >>> struct ceph_hobject_id *end; >>> }; >>> >>> +struct ceph_osd_metric { >>> + struct percpu_counter op_ops; >>> + struct percpu_counter op_rmw; >>> + struct percpu_counter op_r; >>> + struct percpu_counter op_w; >>> +}; >> OK, so only reads and writes are really needed. Why not expose them >> through the existing metrics framework in fs/ceph? Wouldn't "fs top" >> want to display them? Exposing latency information without exposing >> overall counts seems rather weird to me anyway. >> >> The fundamental problem is that debugfs output format is not stable. >> The tracker mentions test_readahead -- updating some teuthology test >> cases from time to time is not a big deal, but if a user facing tool >> such as "fs top" starts relying on these, it would be bad. >> >> Thanks, >> >> Ilya > Those are all good points. The tracker is light on details. I had > assumed that you'd also be uploading this to the MDS in a later patch. > Is that also planned? Yeah, it is, the next plan is to send these metrics to MDS. > I'll also add that it might be nice to keeps stats on copy_from2 as > well, since we do have a copy_file_range operation in cephfs. Okay, will check it.
On 2020/11/10 23:44, Ilya Dryomov wrote: > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> The logic is the same with osdc/Objecter.cc in ceph in user space. >> >> URL: https://tracker.ceph.com/issues/48053 >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> >> V3: >> - typo fixing about oring the _WRITE >> >> include/linux/ceph/osd_client.h | 9 ++++++ >> net/ceph/debugfs.c | 13 ++++++++ >> net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ >> 3 files changed, 78 insertions(+) >> >> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >> index 83fa08a06507..24301513b186 100644 >> --- a/include/linux/ceph/osd_client.h >> +++ b/include/linux/ceph/osd_client.h >> @@ -339,6 +339,13 @@ struct ceph_osd_backoff { >> struct ceph_hobject_id *end; >> }; >> >> +struct ceph_osd_metric { >> + struct percpu_counter op_ops; >> + struct percpu_counter op_rmw; >> + struct percpu_counter op_r; >> + struct percpu_counter op_w; >> +}; > OK, so only reads and writes are really needed. Why not expose them > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > want to display them? Exposing latency information without exposing > overall counts seems rather weird to me anyway. > > The fundamental problem is that debugfs output format is not stable. > The tracker mentions test_readahead -- updating some teuthology test > cases from time to time is not a big deal, but if a user facing tool > such as "fs top" starts relying on these, it would be bad. Sorry, I am not sure whether I already replied to this; I may have missed this whole thread. As Patrick mentioned, "fs top" won't rely on it; it will collect the metrics from the MDS instead. Thanks. > Thanks, > > Ilya >
On Mon, 2021-04-26 at 22:33 +0200, Ilya Dryomov wrote: > On Mon, Apr 26, 2021 at 7:56 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Wed, 2020-11-11 at 09:32 +0800, Xiubo Li wrote: > > > On 2020/11/10 23:44, Ilya Dryomov wrote: > > > > On Tue, Nov 10, 2020 at 3:19 PM <xiubli@redhat.com> wrote: > > > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > > > > > The logic is the same with osdc/Objecter.cc in ceph in user space. > > > > > > > > > > URL: https://tracker.ceph.com/issues/48053 > > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > > > --- > > > > > > > > > > V3: > > > > > - typo fixing about oring the _WRITE > > > > > > > > > > include/linux/ceph/osd_client.h | 9 ++++++ > > > > > net/ceph/debugfs.c | 13 ++++++++ > > > > > net/ceph/osd_client.c | 56 +++++++++++++++++++++++++++++++++ > > > > > 3 files changed, 78 insertions(+) > > > > > > > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > > > > index 83fa08a06507..24301513b186 100644 > > > > > --- a/include/linux/ceph/osd_client.h > > > > > +++ b/include/linux/ceph/osd_client.h > > > > > @@ -339,6 +339,13 @@ struct ceph_osd_backoff { > > > > > struct ceph_hobject_id *end; > > > > > }; > > > > > > > > > > +struct ceph_osd_metric { > > > > > + struct percpu_counter op_ops; > > > > > + struct percpu_counter op_rmw; > > > > > + struct percpu_counter op_r; > > > > > + struct percpu_counter op_w; > > > > > +}; > > > > OK, so only reads and writes are really needed. Why not expose them > > > > through the existing metrics framework in fs/ceph? Wouldn't "fs top" > > > > want to display them? Exposing latency information without exposing > > > > overall counts seems rather weird to me anyway. > > > > > > Okay, I just thought in future this may also be needed by rbd :-) > > > > > > > > > > The fundamental problem is that debugfs output format is not stable. > > > > The tracker mentions test_readahead -- updating some teuthology test > > > > cases from time to time is not a big deal, but if a user facing tool > > > > such as "fs top" starts relying on these, it would be bad. > > > > > > No problem, let me move it to fs existing metric framework. > > > > > > > Hi Xiubo/Ilya/Patrick : > > > > Mea culpa...I had intended to drop this patch from testing branch after > > this discussion, but got sidetracked and forgot to do so. I've now done > > that though. > > On the subject of metrics, I think Xiubo's I/O size metrics patches > need a look -- he reposted the two that were skipped a while ago. > Thanks for reminding me. I saw that he sent those when I was OOTO, and I forgot to revisit them. In the future, if I do that, ping me about them and I'll try to get to them sooner.
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 83fa08a06507..24301513b186 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -339,6 +339,13 @@ struct ceph_osd_backoff {
 	struct ceph_hobject_id *end;
 };
 
+struct ceph_osd_metric {
+	struct percpu_counter op_ops;
+	struct percpu_counter op_rmw;
+	struct percpu_counter op_r;
+	struct percpu_counter op_w;
+};
+
 #define CEPH_LINGER_ID_START	0xffff000000000000ULL
 
 struct ceph_osd_client {
@@ -371,6 +378,8 @@ struct ceph_osd_client {
 	struct ceph_msgpool	msgpool_op;
 	struct ceph_msgpool	msgpool_op_reply;
 
+	struct ceph_osd_metric	metric;
+
 	struct workqueue_struct	*notify_wq;
 	struct workqueue_struct	*completion_wq;
 };
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 2110439f8a24..af90019386ab 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -339,6 +339,16 @@ static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
 	mutex_unlock(&osd->lock);
 }
 
+static void dump_op_metric(struct seq_file *s, struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_metric *m = &osdc->metric;
+
+	seq_printf(s, " op_ops: %lld\n", percpu_counter_sum(&m->op_ops));
+	seq_printf(s, " op_rmw: %lld\n", percpu_counter_sum(&m->op_rmw));
+	seq_printf(s, " op_r: %lld\n", percpu_counter_sum(&m->op_r));
+	seq_printf(s, " op_w: %lld\n", percpu_counter_sum(&m->op_w));
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
@@ -372,6 +382,9 @@ static int osdc_show(struct seq_file *s, void *pp)
 	}
 
 	up_read(&osdc->lock);
+
+	seq_puts(s, "OP METRIC:\n");
+	dump_op_metric(s, osdc);
 	return 0;
 }
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7901ab6c79fd..0e6e127dd669 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2424,6 +2424,21 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	goto again;
 }
 
+static void osd_acount_op_metric(struct ceph_osd_request *req)
+{
+	struct ceph_osd_metric *m = &req->r_osdc->metric;
+
+	percpu_counter_inc(&m->op_ops);
+
+	if ((req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))
+	    == (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))
+		percpu_counter_inc(&m->op_rmw);
+	if (req->r_flags & CEPH_OSD_FLAG_READ)
+		percpu_counter_inc(&m->op_r);
+	else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		percpu_counter_inc(&m->op_w);
+}
+
 static void account_request(struct ceph_osd_request *req)
 {
 	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
@@ -2434,6 +2449,8 @@ static void account_request(struct ceph_osd_request *req)
 
 	req->r_start_stamp = jiffies;
 	req->r_start_latency = ktime_get();
+
+	osd_acount_op_metric(req);
 }
 
 static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -5205,6 +5222,39 @@ void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
 	up_write(&osdc->lock);
 }
 
+static void ceph_metric_destroy(struct ceph_osd_metric *m)
+{
+	percpu_counter_destroy(&m->op_ops);
+	percpu_counter_destroy(&m->op_rmw);
+	percpu_counter_destroy(&m->op_r);
+	percpu_counter_destroy(&m->op_w);
+}
+
+static int ceph_metric_init(struct ceph_osd_metric *m)
+{
+	int ret;
+
+	memset(m, 0, sizeof(*m));
+
+	ret = percpu_counter_init(&m->op_ops, 0, GFP_NOIO);
+	if (ret)
+		return ret;
+	ret = percpu_counter_init(&m->op_rmw, 0, GFP_NOIO);
+	if (ret)
+		goto err;
+	ret = percpu_counter_init(&m->op_r, 0, GFP_NOIO);
+	if (ret)
+		goto err;
+	ret = percpu_counter_init(&m->op_w, 0, GFP_NOIO);
+	if (ret)
+		goto err;
+	return 0;
+
+err:
+	ceph_metric_destroy(m);
+	return ret;
+}
+
 /*
  * init, shutdown
  */
@@ -5257,6 +5307,9 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->completion_wq)
 		goto out_notify_wq;
 
+	if (ceph_metric_init(&osdc->metric) < 0)
+		goto out_completion_wq;
+
 	schedule_delayed_work(&osdc->timeout_work,
 			      osdc->client->options->osd_keepalive_timeout);
 	schedule_delayed_work(&osdc->osds_timeout_work,
@@ -5264,6 +5317,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
 	return 0;
 
+out_completion_wq:
+	destroy_workqueue(osdc->completion_wq);
 out_notify_wq:
 	destroy_workqueue(osdc->notify_wq);
 out_msgpool_reply:
@@ -5302,6 +5357,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	WARN_ON(atomic_read(&osdc->num_requests));
 	WARN_ON(atomic_read(&osdc->num_homeless));
 
+	ceph_metric_destroy(&osdc->metric);
 	ceph_osdmap_destroy(osdc->osdmap);
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
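With the patch applied, the new counters show up at the end of the per-client osdc file under /sys/kernel/debug/ceph/. A rough user-space sketch of how a test could read op_r and op_w back out — the FSID.clientID path component below is a placeholder that has to be discovered at run time (the directory is named after the cluster fsid and the client's global id), and reading debugfs normally requires root:

#include <stdio.h>

/* Placeholder: the real directory is /sys/kernel/debug/ceph/<fsid>.client<id>/ */
#define OSDC_PATH "/sys/kernel/debug/ceph/FSID.clientID/osdc"

int main(void)
{
	char line[256];
	long long op_r = -1, op_w = -1;
	FILE *f = fopen(OSDC_PATH, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* The OP METRIC section added by the patch prints one
	 * "name: value" pair per line; pick out the read/write counts. */
	while (fgets(line, sizeof(line), f)) {
		sscanf(line, " op_r: %lld", &op_r);
		sscanf(line, " op_w: %lld", &op_w);
	}
	fclose(f);
	printf("op_r=%lld op_w=%lld\n", op_r, op_w);
	return 0;
}

This is essentially what the test_readahead check discussed above needs: sample the counter, trigger the I/O under test, sample it again, and compare the delta.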