[v5,2/6] libceph: abort already submitted but abortable requests when map or pool goes full
diff mbox

Message ID 20170225174323.20289-3-jlayton@redhat.com
State New
Headers show

Commit Message

Jeff Layton Feb. 25, 2017, 5:43 p.m. UTC
When a Ceph volume hits capacity, a flag is set in the OSD map to
indicate that, and a new map is sprayed around the cluster. With cephfs
we want it to shut down any abortable requests that are in progress with
an -ENOSPC error as they'd just hang otherwise.

Add a new ceph_osdc_abort_on_full helper function to handle this. It
will first check whether there is an out-of-space condition in the
cluster. It will then walk the tree and abort any request that has
r_abort_on_full set with an ENOSPC error. Call this new function
directly whenever we get a new OSD map.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

Comments

Ilya Dryomov March 28, 2017, 2:34 p.m. UTC | #1
On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
> When a Ceph volume hits capacity, a flag is set in the OSD map to
> indicate that, and a new map is sprayed around the cluster. With cephfs
> we want it to shut down any abortable requests that are in progress with
> an -ENOSPC error as they'd just hang otherwise.
>
> Add a new ceph_osdc_abort_on_full helper function to handle this. It
> will first check whether there is an out-of-space condition in the
> cluster. It will then walk the tree and abort any request that has
> r_abort_on_full set with an ENOSPC error. Call this new function
> directly whenever we get a new OSD map.
>
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 42 insertions(+)
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 8fc0ccc7126f..b286ff6dec29 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
>         ceph_osdc_put_request(req);
>  }
>
> +/*
> + * Drop all pending requests that are stalled waiting on a full condition to
> + * clear, and complete them with ENOSPC as the return code.
> + */
> +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
> +{
> +       struct ceph_osd_request *req;
> +       struct ceph_osd *osd;

These variables could have narrower scope.

> +       struct rb_node *m, *n;
> +       u32 latest_epoch = 0;
> +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);

This variable is redundant.

> +
> +       dout("enter abort_on_full\n");
> +
> +       if (!osdmap_full && !have_pool_full(osdc))
> +               goto out;
> +
> +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
> +               osd = rb_entry(n, struct ceph_osd, o_node);
> +               mutex_lock(&osd->lock);

No need to take osd->lock here as osdc->lock is held for write.

> +               m = rb_first(&osd->o_requests);
> +               while (m) {
> +                       req = rb_entry(m, struct ceph_osd_request, r_node);
> +                       m = rb_next(m);
> +
> +                       if (req->r_abort_on_full &&
> +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {

Over 80 lines and I think we need target_oloc instead of base_oloc
here.

> +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);

Is replay_version used this way in the userspace client?  It is an ack
vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
Objecter, so something is wrong here.

> +
> +                               dout("%s: abort tid=%llu flags 0x%x\n", __func__, req->r_tid, req->r_flags);

Over 80 lines and probably unnecessary -- complete_request() has
a similar dout.

> +                               complete_request(req, -ENOSPC);

This should be abort_request() (newly introduced in 4.11).

> +                               if (cur_epoch > latest_epoch)
> +                                       latest_epoch = cur_epoch;
> +                       }
> +               }
> +               mutex_unlock(&osd->lock);
> +       }
> +out:
> +       dout("return abort_on_full latest_epoch=%u\n", latest_epoch);
> +}
> +
>  static void cancel_map_check(struct ceph_osd_request *req)
>  {
>         struct ceph_osd_client *osdc = req->r_osdc;
> @@ -3232,6 +3273,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
>
>         ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
>                           osdc->osdmap->epoch);
> +       ceph_osdc_abort_on_full(osdc);

Nit: swap ceph_osdc_abort_on_full() and ceph_monc_got_map() so that
ceph_osdc_abort_on_full() follows kick_requests().  No real reason
other than for "I processed this map" notification to go last, after
the map is fully processed.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton March 28, 2017, 8:44 p.m. UTC | #2
On Tue, 2017-03-28 at 16:34 +0200, Ilya Dryomov wrote:
> On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > When a Ceph volume hits capacity, a flag is set in the OSD map to
> > indicate that, and a new map is sprayed around the cluster. With cephfs
> > we want it to shut down any abortable requests that are in progress with
> > an -ENOSPC error as they'd just hang otherwise.
> > 
> > Add a new ceph_osdc_abort_on_full helper function to handle this. It
> > will first check whether there is an out-of-space condition in the
> > cluster. It will then walk the tree and abort any request that has
> > r_abort_on_full set with an ENOSPC error. Call this new function
> > directly whenever we get a new OSD map.
> > 
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> >  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 42 insertions(+)
> > 
> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > index 8fc0ccc7126f..b286ff6dec29 100644
> > --- a/net/ceph/osd_client.c
> > +++ b/net/ceph/osd_client.c
> > @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
> >         ceph_osdc_put_request(req);
> >  }
> > 
> > +/*
> > + * Drop all pending requests that are stalled waiting on a full condition to
> > + * clear, and complete them with ENOSPC as the return code.
> > + */
> > +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
> > +{
> > +       struct ceph_osd_request *req;
> > +       struct ceph_osd *osd;
> 
> These variables could have narrower scope.
> 
> > +       struct rb_node *m, *n;
> > +       u32 latest_epoch = 0;
> > +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
> 
> This variable is redundant.
> 
> > +
> > +       dout("enter abort_on_full\n");
> > +
> > +       if (!osdmap_full && !have_pool_full(osdc))
> > +               goto out;
> > +
> > +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
> > +               osd = rb_entry(n, struct ceph_osd, o_node);
> > +               mutex_lock(&osd->lock);
> 
> No need to take osd->lock here as osdc->lock is held for write.
> 

Could you explain how this locking works?

It appeared to me that o_requests was protected by osd->lock based on
when link_request is called in __submit_request. If the osdc->lock is
sufficient, then why take the osd->lock before grabbing the tid and
linking it into the tree?

> > +               m = rb_first(&osd->o_requests);
> > +               while (m) {
> > +                       req = rb_entry(m, struct ceph_osd_request, r_node);
> > +                       m = rb_next(m);
> > +
> > +                       if (req->r_abort_on_full &&
> > +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
> 
> Over 80 lines and I think we need target_oloc instead of base_oloc
> here.
> 
> > +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
> 
> Is replay_version used this way in the userspace client?  It is an ack
> vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
> Objecter, so something is wrong here.
> 
> > +
> > +                               dout("%s: abort tid=%llu flags 0x%x\n", __func__, req->r_tid, req->r_flags);
> 
> Over 80 lines and probably unnecessary -- complete_request() has
> a similar dout.
> 
> > +                               complete_request(req, -ENOSPC);
> 
> This should be abort_request() (newly introduced in 4.11).
> 

Fixed most of the above, but could you explain this bit?

abort_request() just calls cancel_map_check() and then does a
complete_request(). Why do we want to cancel the map check here?

> > +                               if (cur_epoch > latest_epoch)
> > +                                       latest_epoch = cur_epoch;
> > +                       }
> > +               }
> > +               mutex_unlock(&osd->lock);
> > +       }
> > +out:
> > +       dout("return abort_on_full latest_epoch=%u\n", latest_epoch);
> > +}
> > +
> >  static void cancel_map_check(struct ceph_osd_request *req)
> >  {
> >         struct ceph_osd_client *osdc = req->r_osdc;
> > @@ -3232,6 +3273,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
> > 
> >         ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
> >                           osdc->osdmap->epoch);
> > +       ceph_osdc_abort_on_full(osdc);
> 
> Nit: swap ceph_osdc_abort_on_full() and ceph_monc_got_map() so that
> ceph_osdc_abort_on_full() follows kick_requests().  No real reason
> other than for "I processed this map" notification to go last, after
> the map is fully processed.
> 

Sure, np.

Thanks,
Jeff Layton March 29, 2017, 2:01 p.m. UTC | #3
On Tue, 2017-03-28 at 16:34 +0200, Ilya Dryomov wrote:
> On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > When a Ceph volume hits capacity, a flag is set in the OSD map to
> > indicate that, and a new map is sprayed around the cluster. With cephfs
> > we want it to shut down any abortable requests that are in progress with
> > an -ENOSPC error as they'd just hang otherwise.
> > 
> > Add a new ceph_osdc_abort_on_full helper function to handle this. It
> > will first check whether there is an out-of-space condition in the
> > cluster. It will then walk the tree and abort any request that has
> > r_abort_on_full set with an ENOSPC error. Call this new function
> > directly whenever we get a new OSD map.
> > 
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> >  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 42 insertions(+)
> > 
> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > index 8fc0ccc7126f..b286ff6dec29 100644
> > --- a/net/ceph/osd_client.c
> > +++ b/net/ceph/osd_client.c
> > @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
> >         ceph_osdc_put_request(req);
> >  }
> > 
> > +/*
> > + * Drop all pending requests that are stalled waiting on a full condition to
> > + * clear, and complete them with ENOSPC as the return code.
> > + */
> > +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
> > +{
> > +       struct ceph_osd_request *req;
> > +       struct ceph_osd *osd;
> 
> These variables could have narrower scope.
> 
> > +       struct rb_node *m, *n;
> > +       u32 latest_epoch = 0;
> > +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
> 
> This variable is redundant.
> 
> > +
> > +       dout("enter abort_on_full\n");
> > +
> > +       if (!osdmap_full && !have_pool_full(osdc))
> > +               goto out;
> > +
> > +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
> > +               osd = rb_entry(n, struct ceph_osd, o_node);
> > +               mutex_lock(&osd->lock);
> 
> No need to take osd->lock here as osdc->lock is held for write.
> 
> > +               m = rb_first(&osd->o_requests);
> > +               while (m) {
> > +                       req = rb_entry(m, struct ceph_osd_request, r_node);
> > +                       m = rb_next(m);
> > +
> > +                       if (req->r_abort_on_full &&
> > +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
> 
> Over 80 lines and I think we need target_oloc instead of base_oloc
> here.
> 
> > +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
> 
> Is replay_version used this way in the userspace client?  It is an ack
> vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
> Objecter, so something is wrong here.
> 

Yeah I see what you mean now. This does look to be entirely vestigial
in the current code.

Basically, what we want is to scan the list of outstanding requests for
this OSD, and abort any that predate the current map epoch. I suppose
this means that we need to record the current map epoch in the request
somewhere. ISTR looking and seeing that it was recorded here, but I
don't think that's the case anymore.

Since that field is never set to anything now, what I'll probably do is
just remove it, and replace it with an epoch field that we record at
the time that the request is marshaled.

> > +
> > +                               dout("%s: abort tid=%llu flags 0x%x\n", __func__, req->r_tid, req->r_flags);
> 
> Over 80 lines and probably unnecessary -- complete_request() has
> a similar dout.
> 
> > +                               complete_request(req, -ENOSPC);
> 
> This should be abort_request() (newly introduced in 4.11).
> 
> > +                               if (cur_epoch > latest_epoch)
> > +                                       latest_epoch = cur_epoch;
> > +                       }
> > +               }
> > +               mutex_unlock(&osd->lock);
> > +       }
> > +out:
> > +       dout("return abort_on_full latest_epoch=%u\n", latest_epoch);
> > +}
> > +
> >  static void cancel_map_check(struct ceph_osd_request *req)
> >  {
> >         struct ceph_osd_client *osdc = req->r_osdc;
> > @@ -3232,6 +3273,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
> > 
> >         ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
> >                           osdc->osdmap->epoch);
> > +       ceph_osdc_abort_on_full(osdc);
> 
> Nit: swap ceph_osdc_abort_on_full() and ceph_monc_got_map() so that
> ceph_osdc_abort_on_full() follows kick_requests().  No real reason
> other than for "I processed this map" notification to go last, after
> the map is fully processed.
> 
> Thanks,
> 
>                 Ilya
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilya Dryomov March 29, 2017, 2:14 p.m. UTC | #4
On Wed, Mar 29, 2017 at 4:01 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Tue, 2017-03-28 at 16:34 +0200, Ilya Dryomov wrote:
>> On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> > When a Ceph volume hits capacity, a flag is set in the OSD map to
>> > indicate that, and a new map is sprayed around the cluster. With cephfs
>> > we want it to shut down any abortable requests that are in progress with
>> > an -ENOSPC error as they'd just hang otherwise.
>> >
>> > Add a new ceph_osdc_abort_on_full helper function to handle this. It
>> > will first check whether there is an out-of-space condition in the
>> > cluster. It will then walk the tree and abort any request that has
>> > r_abort_on_full set with an ENOSPC error. Call this new function
>> > directly whenever we get a new OSD map.
>> >
>> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > ---
>> >  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>> >  1 file changed, 42 insertions(+)
>> >
>> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> > index 8fc0ccc7126f..b286ff6dec29 100644
>> > --- a/net/ceph/osd_client.c
>> > +++ b/net/ceph/osd_client.c
>> > @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
>> >         ceph_osdc_put_request(req);
>> >  }
>> >
>> > +/*
>> > + * Drop all pending requests that are stalled waiting on a full condition to
>> > + * clear, and complete them with ENOSPC as the return code.
>> > + */
>> > +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
>> > +{
>> > +       struct ceph_osd_request *req;
>> > +       struct ceph_osd *osd;
>>
>> These variables could have narrower scope.
>>
>> > +       struct rb_node *m, *n;
>> > +       u32 latest_epoch = 0;
>> > +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
>>
>> This variable is redundant.
>>
>> > +
>> > +       dout("enter abort_on_full\n");
>> > +
>> > +       if (!osdmap_full && !have_pool_full(osdc))
>> > +               goto out;
>> > +
>> > +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
>> > +               osd = rb_entry(n, struct ceph_osd, o_node);
>> > +               mutex_lock(&osd->lock);
>>
>> No need to take osd->lock here as osdc->lock is held for write.
>>
>> > +               m = rb_first(&osd->o_requests);
>> > +               while (m) {
>> > +                       req = rb_entry(m, struct ceph_osd_request, r_node);
>> > +                       m = rb_next(m);
>> > +
>> > +                       if (req->r_abort_on_full &&
>> > +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
>>
>> Over 80 lines and I think we need target_oloc instead of base_oloc
>> here.
>>
>> > +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
>>
>> Is replay_version used this way in the userspace client?  It is an ack
>> vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
>> Objecter, so something is wrong here.
>>
>
> Yeah I see what you mean now. This does look to be entirely vestigial
> in the current code.
>
> Basically, what we want is to scan the list of outstanding requests for
> this OSD, and abort any that predate the current map epoch. I suppose
> this means that we need to record the current map epoch in the request
> somewhere. ISTR looking and seeing that it was recorded here, but I
> don't think that's the case anymore.

Why only those that predate the current epoch and not simply all of
those marked as r_abort_on_full?  I mean, this is triggered by a pool
or an entire cluster going full, right?

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton March 29, 2017, 4:39 p.m. UTC | #5
On Wed, 2017-03-29 at 16:14 +0200, Ilya Dryomov wrote:
> On Wed, Mar 29, 2017 at 4:01 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > On Tue, 2017-03-28 at 16:34 +0200, Ilya Dryomov wrote:
> > > On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > > > When a Ceph volume hits capacity, a flag is set in the OSD map to
> > > > indicate that, and a new map is sprayed around the cluster. With cephfs
> > > > we want it to shut down any abortable requests that are in progress with
> > > > an -ENOSPC error as they'd just hang otherwise.
> > > > 
> > > > Add a new ceph_osdc_abort_on_full helper function to handle this. It
> > > > will first check whether there is an out-of-space condition in the
> > > > cluster. It will then walk the tree and abort any request that has
> > > > r_abort_on_full set with an ENOSPC error. Call this new function
> > > > directly whenever we get a new OSD map.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > ---
> > > >  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 42 insertions(+)
> > > > 
> > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > > > index 8fc0ccc7126f..b286ff6dec29 100644
> > > > --- a/net/ceph/osd_client.c
> > > > +++ b/net/ceph/osd_client.c
> > > > @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
> > > >         ceph_osdc_put_request(req);
> > > >  }
> > > > 
> > > > +/*
> > > > + * Drop all pending requests that are stalled waiting on a full condition to
> > > > + * clear, and complete them with ENOSPC as the return code.
> > > > + */
> > > > +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
> > > > +{
> > > > +       struct ceph_osd_request *req;
> > > > +       struct ceph_osd *osd;
> > > 
> > > These variables could have narrower scope.
> > > 
> > > > +       struct rb_node *m, *n;
> > > > +       u32 latest_epoch = 0;
> > > > +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
> > > 
> > > This variable is redundant.
> > > 
> > > > +
> > > > +       dout("enter abort_on_full\n");
> > > > +
> > > > +       if (!osdmap_full && !have_pool_full(osdc))
> > > > +               goto out;
> > > > +
> > > > +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
> > > > +               osd = rb_entry(n, struct ceph_osd, o_node);
> > > > +               mutex_lock(&osd->lock);
> > > 
> > > No need to take osd->lock here as osdc->lock is held for write.
> > > 
> > > > +               m = rb_first(&osd->o_requests);
> > > > +               while (m) {
> > > > +                       req = rb_entry(m, struct ceph_osd_request, r_node);
> > > > +                       m = rb_next(m);
> > > > +
> > > > +                       if (req->r_abort_on_full &&
> > > > +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
> > > 
> > > Over 80 lines and I think we need target_oloc instead of base_oloc
> > > here.
> > > 
> > > > +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
> > > 
> > > Is replay_version used this way in the userspace client?  It is an ack
> > > vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
> > > Objecter, so something is wrong here.
> > > 
> > 
> > Yeah I see what you mean now. This does look to be entirely vestigial
> > in the current code.
> > 
> > Basically, what we want is to scan the list of outstanding requests for
> > this OSD, and abort any that predate the current map epoch. I suppose
> > this means that we need to record the current map epoch in the request
> > somewhere. ISTR looking and seeing that it was recorded here, but I
> > don't think that's the case anymore.
> 
> Why only those that predate the current epoch and not simply all of
> those marked as r_abort_on_full?  I mean, this is triggered by a pool
> or an entire cluster going full, right?
> 
> 

My mistake, you're right. We cancel based on r_abort_on_full and the
pool/OSD full flags.

We do need to determine the latest cancelled map epoch though, so that
we can update the epoch_barrier properly when this occurs. What I'm
doing now is just recording the current map epoch before we link a
request into the tree and using that to determine this.
Ilya Dryomov March 29, 2017, 4:52 p.m. UTC | #6
On Tue, Mar 28, 2017 at 10:44 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Tue, 2017-03-28 at 16:34 +0200, Ilya Dryomov wrote:
>> On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> > When a Ceph volume hits capacity, a flag is set in the OSD map to
>> > indicate that, and a new map is sprayed around the cluster. With cephfs
>> > we want it to shut down any abortable requests that are in progress with
>> > an -ENOSPC error as they'd just hang otherwise.
>> >
>> > Add a new ceph_osdc_abort_on_full helper function to handle this. It
>> > will first check whether there is an out-of-space condition in the
>> > cluster. It will then walk the tree and abort any request that has
>> > r_abort_on_full set with an ENOSPC error. Call this new function
>> > directly whenever we get a new OSD map.
>> >
>> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > ---
>> >  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>> >  1 file changed, 42 insertions(+)
>> >
>> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> > index 8fc0ccc7126f..b286ff6dec29 100644
>> > --- a/net/ceph/osd_client.c
>> > +++ b/net/ceph/osd_client.c
>> > @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
>> >         ceph_osdc_put_request(req);
>> >  }
>> >
>> > +/*
>> > + * Drop all pending requests that are stalled waiting on a full condition to
>> > + * clear, and complete them with ENOSPC as the return code.
>> > + */
>> > +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
>> > +{
>> > +       struct ceph_osd_request *req;
>> > +       struct ceph_osd *osd;
>>
>> These variables could have narrower scope.
>>
>> > +       struct rb_node *m, *n;
>> > +       u32 latest_epoch = 0;
>> > +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
>>
>> This variable is redundant.
>>
>> > +
>> > +       dout("enter abort_on_full\n");
>> > +
>> > +       if (!osdmap_full && !have_pool_full(osdc))
>> > +               goto out;
>> > +
>> > +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
>> > +               osd = rb_entry(n, struct ceph_osd, o_node);
>> > +               mutex_lock(&osd->lock);
>>
>> No need to take osd->lock here as osdc->lock is held for write.
>>
>
> Could you explain how this locking works?
>
> It appeared to me that o_requests was protected by osd->lock based on
> when link_request is called in __submit_request. If the osdc->lock is
> sufficient, then why take the osd->lock before grabbing the tid and
> linking it into the tree?

osd->lock protects the individual ceph_osd trees and is always taken
along with osdc->lock.  osdc->lock is the per-ceph_osd_client lock -- if you
have it down for write, osd->lock becomes unnecessary.

__submit_request() is called with osdc->lock down for read.

>
>> > +               m = rb_first(&osd->o_requests);
>> > +               while (m) {
>> > +                       req = rb_entry(m, struct ceph_osd_request, r_node);
>> > +                       m = rb_next(m);
>> > +
>> > +                       if (req->r_abort_on_full &&
>> > +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
>>
>> Over 80 lines and I think we need target_oloc instead of base_oloc
>> here.
>>
>> > +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
>>
>> Is replay_version used this way in the userspace client?  It is an ack
>> vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
>> Objecter, so something is wrong here.
>>
>> > +
>> > +                               dout("%s: abort tid=%llu flags 0x%x\n", __func__, req->r_tid, req->r_flags);
>>
>> Over 80 lines and probably unnecessary -- complete_request() has
>> a similar dout.
>>
>> > +                               complete_request(req, -ENOSPC);
>>
>> This should be abort_request() (newly introduced in 4.11).
>>
>
> Fixed most of the above, but could you explain this bit?
>
> abort_request() just calls cancel_map_check() and then does a
> complete_request(). Why do we want to cancel the map check here?

Because the request in question could be in the map check tree.
ceph_osdc_abort_on_full() is asynchronous with respect to the rest of
request processing code, much like the recently merged timeout stuff.

complete_request() is for handle_reply() and map check code itself.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilya Dryomov March 29, 2017, 4:56 p.m. UTC | #7
On Wed, Mar 29, 2017 at 6:39 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Wed, 2017-03-29 at 16:14 +0200, Ilya Dryomov wrote:
>> On Wed, Mar 29, 2017 at 4:01 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> > On Tue, 2017-03-28 at 16:34 +0200, Ilya Dryomov wrote:
>> > > On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> > > > When a Ceph volume hits capacity, a flag is set in the OSD map to
>> > > > indicate that, and a new map is sprayed around the cluster. With cephfs
>> > > > we want it to shut down any abortable requests that are in progress with
>> > > > an -ENOSPC error as they'd just hang otherwise.
>> > > >
>> > > > Add a new ceph_osdc_abort_on_full helper function to handle this. It
>> > > > will first check whether there is an out-of-space condition in the
>> > > > cluster. It will then walk the tree and abort any request that has
>> > > > r_abort_on_full set with an ENOSPC error. Call this new function
>> > > > directly whenever we get a new OSD map.
>> > > >
>> > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > > ---
>> > > >  net/ceph/osd_client.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>> > > >  1 file changed, 42 insertions(+)
>> > > >
>> > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> > > > index 8fc0ccc7126f..b286ff6dec29 100644
>> > > > --- a/net/ceph/osd_client.c
>> > > > +++ b/net/ceph/osd_client.c
>> > > > @@ -1770,6 +1770,47 @@ static void complete_request(struct ceph_osd_request *req, int err)
>> > > >         ceph_osdc_put_request(req);
>> > > >  }
>> > > >
>> > > > +/*
>> > > > + * Drop all pending requests that are stalled waiting on a full condition to
>> > > > + * clear, and complete them with ENOSPC as the return code.
>> > > > + */
>> > > > +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
>> > > > +{
>> > > > +       struct ceph_osd_request *req;
>> > > > +       struct ceph_osd *osd;
>> > >
>> > > These variables could have narrower scope.
>> > >
>> > > > +       struct rb_node *m, *n;
>> > > > +       u32 latest_epoch = 0;
>> > > > +       bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
>> > >
>> > > This variable is redundant.
>> > >
>> > > > +
>> > > > +       dout("enter abort_on_full\n");
>> > > > +
>> > > > +       if (!osdmap_full && !have_pool_full(osdc))
>> > > > +               goto out;
>> > > > +
>> > > > +       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
>> > > > +               osd = rb_entry(n, struct ceph_osd, o_node);
>> > > > +               mutex_lock(&osd->lock);
>> > >
>> > > No need to take osd->lock here as osdc->lock is held for write.
>> > >
>> > > > +               m = rb_first(&osd->o_requests);
>> > > > +               while (m) {
>> > > > +                       req = rb_entry(m, struct ceph_osd_request, r_node);
>> > > > +                       m = rb_next(m);
>> > > > +
>> > > > +                       if (req->r_abort_on_full &&
>> > > > +                           (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
>> > >
>> > > Over 80 lines and I think we need target_oloc instead of base_oloc
>> > > here.
>> > >
>> > > > +                               u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
>> > >
>> > > Is replay_version used this way in the userspace client?  It is an ack
>> > > vs commit leftover and AFAICT it is never set (i.e. always 0'0) in newer
>> > > Objecter, so something is wrong here.
>> > >
>> >
>> > Yeah I see what you mean now. This does look to be entirely vestigial
>> > in the current code.
>> >
>> > Basically, what we want is to scan the list of outstanding requests for
>> > this OSD, and abort any that predate the current map epoch. I suppose
>> > this means that we need to record the current map epoch in the request
>> > somewhere. ISTR looking and seeing that it was recorded here, but I
>> > don't think that's the case anymore.
>>
>> Why only those that predate the current epoch and not simply all of
>> those marked as r_abort_on_full?  I mean, this is triggered by a pool
>> or an entire cluster going full, right?
>>
>>
>
> My mistake, you're right. We cancel based on r_abort_on_full and the
> pool/OSD full flags.
>
> We do need to determine the latest cancelled map epoch though, so that
> we can update the epoch_barrier properly when this occurs. What I'm
> doing now is just recording the current map epoch before we link a
> request into the tree and using that to determine this.

I think you can get away with returning the current epoch.  It's always
going to be the latest, by definition ;)

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 8fc0ccc7126f..b286ff6dec29 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1770,6 +1770,47 @@  static void complete_request(struct ceph_osd_request *req, int err)
 	ceph_osdc_put_request(req);
 }
 
+/*
+ * Drop all pending requests that are stalled waiting on a full condition to
+ * clear, and complete them with ENOSPC as the return code.
+ */
+static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req;
+	struct ceph_osd *osd;
+	struct rb_node *m, *n;
+	u32 latest_epoch = 0;
+	bool osdmap_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+
+	dout("enter abort_on_full\n");
+
+	if (!osdmap_full && !have_pool_full(osdc))
+		goto out;
+
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		osd = rb_entry(n, struct ceph_osd, o_node);
+		mutex_lock(&osd->lock);
+		m = rb_first(&osd->o_requests);
+		while (m) {
+			req = rb_entry(m, struct ceph_osd_request, r_node);
+			m = rb_next(m);
+
+			if (req->r_abort_on_full &&
+			    (osdmap_full || pool_full(osdc, req->r_t.base_oloc.pool))) {
+				u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
+
+				dout("%s: abort tid=%llu flags 0x%x\n", __func__, req->r_tid, req->r_flags);
+				complete_request(req, -ENOSPC);
+				if (cur_epoch > latest_epoch)
+					latest_epoch = cur_epoch;
+			}
+		}
+		mutex_unlock(&osd->lock);
+	}
+out:
+	dout("return abort_on_full latest_epoch=%u\n", latest_epoch);
+}
+
 static void cancel_map_check(struct ceph_osd_request *req)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
@@ -3232,6 +3273,7 @@  void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
 			  osdc->osdmap->epoch);
+	ceph_osdc_abort_on_full(osdc);
 	up_write(&osdc->lock);
 	wake_up_all(&osdc->client->auth_wq);
 	return;