
[-next,v2] md: synchronize flush io with array reconfiguration

Message ID 20231125065419.3518254-1-yukuai1@huaweicloud.com (mailing list archive)
State New, archived
Series [-next,v2] md: synchronize flush io with array reconfiguration

Commit Message

Yu Kuai Nov. 25, 2023, 6:54 a.m. UTC
From: Yu Kuai <yukuai3@huawei.com>

Currently, RCU is used to protect the rdev iteration in submit_flushes():

submit_flushes			remove_and_add_spares
				synchronize_rcu
				pers->hot_remove_disk()
 rcu_read_lock()
 rdev_for_each_rcu
  if (rdev->raid_disk >= 0)
				rdev->raid_disk = -1;
   atomic_inc(&rdev->nr_pending)
   rcu_read_unlock()
   bi = bio_alloc_bioset()
   bi->bi_end_io = md_end_flush
   bi->private = rdev
   submit_bio
   // issue io for removed rdev

Fix this problem by grabbing 'active_io' before iterating rdevs, making
sure that remove_and_add_spares() won't run concurrently with
submit_flushes().

Fixes: a2826aa92e2e ("md: support barrier requests on all personalities.")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
Changes in v2:
 - Add WARN_ON() in case md_flush_request() is not called from
   md_handle_request() in the future.

 drivers/md/md.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)
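
For orientation, here is a minimal userspace model of the 'active_io'
gating the fix relies on. This is an editorial sketch, not kernel code:
the percpu_ref is approximated by a single atomic counter, suspend()
stands in for mddev_suspend()/percpu_ref_kill(), and the busy-wait
replaces the real wait queue. The point it illustrates is that a flush
holds a reference from submission until md_end_flush(), so array
reconfiguration cannot begin while a flush may still be iterating
rdevs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of mddev->active_io: like a percpu_ref, it starts at 1. */
static atomic_int active_io = 1;

/* ~ percpu_ref_tryget(): succeeds only while the count is non-zero. */
static bool ref_tryget(void)
{
	int old = atomic_load(&active_io);

	while (old > 0)
		if (atomic_compare_exchange_weak(&active_io, &old, old + 1))
			return true;
	return false;
}

/* ~ percpu_ref_put() */
static void ref_put(void)
{
	atomic_fetch_sub(&active_io, 1);
}

/* ~ mddev_suspend(): drop the initial reference, then wait until every
 * in-flight holder (e.g. a pending flush) has dropped its own. */
static void suspend(void)
{
	ref_put();
	while (atomic_load(&active_io) > 0)
		;	/* the real code sleeps on a wait queue instead */
}

int main(void)
{
	/* Flush path: hold a reference across the whole flush sequence,
	 * ~ md_flush_request() ... md_end_flush(). */
	if (ref_tryget()) {
		/* ... iterate rdevs and submit flush bios here ... */
		ref_put();
	}

	suspend();
	puts("suspended: no flush can be in flight");
	return 0;
}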

Comments

Song Liu Nov. 27, 2023, 10:16 p.m. UTC | #1
On Fri, Nov 24, 2023 at 10:54 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> [...]
> @@ -626,6 +624,18 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
>         /* new request after previous flush is completed */
>         if (ktime_after(req_start, mddev->prev_flush_start)) {
>                 WARN_ON(mddev->flush_bio);
> [...]
> +               if (WARN_ON(percpu_ref_tryget(&mddev->active_io)))

This should be "if (!WARN_ON(..))", right?

Song

Song Liu Nov. 27, 2023, 11:32 p.m. UTC | #2
On Mon, Nov 27, 2023 at 2:16 PM Song Liu <song@kernel.org> wrote:
>
> [...]
> > +               if (WARN_ON(percpu_ref_tryget(&mddev->active_io)))
>
> This should be "if (!WARN_ON(..))", right?
> > +                       percpu_ref_get(&mddev->active_io);

Actually, we can just use percpu_ref_get(), no?

Thanks,
Song
Yu Kuai Nov. 28, 2023, 2:12 a.m. UTC | #3
Hi,

On 2023/11/28 7:32, Song Liu wrote:
> On Mon, Nov 27, 2023 at 2:16 PM Song Liu <song@kernel.org> wrote:
>>
>> [...]
>>> +               if (WARN_ON(percpu_ref_tryget(&mddev->active_io)))
>>
>> This should be "if (!WARN_ON(..))", right?

Sorry for the mistake, this actually should be:

if (WARN_ON(!percpu_ref_tryget(...)))
>>> +                       percpu_ref_get(&mddev->active_io);
> 
> Actually, we can just use percpu_ref_get(), no?

Yes, we can, but if someone else doesn't call md_flush_request() under
md_handle_request() in the future, there will be a problem that
percpu_ref_get() can't catch. Do you think it makes sense to prevent
such a case?

Thanks,
Kuai
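
For reference, with this correction the check in md_flush_request()
would read as follows (a sketch against the patch above, not
necessarily the final committed form):

/* 'active_io' is expected to be held by md_handle_request(); warn,
 * and fall back to a plain get, if the tryget unexpectedly fails. */
if (WARN_ON(!percpu_ref_tryget(&mddev->active_io)))
	percpu_ref_get(&mddev->active_io);

It is this tryget-plus-fallback-get combination that Song questions in
the next message.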

Song Liu Nov. 28, 2023, 3:44 a.m. UTC | #4
On Mon, Nov 27, 2023 at 6:12 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> [...]
> > Actually, we can just use percpu_ref_get(), no?
>
> Yes, we can, but if someone else doesn't call md_flush_request() under
> md_handle_request() in the future, there will be a problem that
> percpu_ref_get() can't catch. Do you think it makes sense to prevent
> such a case?

This combination is really weird:

+               if (WARN_ON(percpu_ref_tryget(&mddev->active_io)))
+                       percpu_ref_get(&mddev->active_io);

We can use percpu_ref_get() here, and add
WARN_ON(percpu_ref_is_zero()) earlier in the function. Does this
make sense?

Thanks,
Song
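
For illustration, the shape Song is suggesting might look something
like this (an editorial sketch against the patch below, not the code
that was eventually applied):

bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	/* ... */

	/* Catch callers that bypass md_handle_request(), where
	 * 'active_io' is already held. */
	WARN_ON(percpu_ref_is_zero(&mddev->active_io));

	/* ... */

	/* new request after previous flush is completed */
	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		/* 'active_io' is known non-zero, so a plain get
		 * is sufficient here. */
		percpu_ref_get(&mddev->active_io);
		mddev->flush_bio = bio;
		bio = NULL;
	}

	/* ... */
}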

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 86efc9c2ae56..2ffedc39edd6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -538,6 +538,9 @@ static void md_end_flush(struct bio *bio)
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		/* The pair is percpu_ref_tryget() from md_flush_request() */
+		percpu_ref_put(&mddev->active_io);
+
 		/* The pre-request flush has finished */
 		queue_work(md_wq, &mddev->flush_work);
 	}
@@ -557,12 +560,8 @@ static void submit_flushes(struct work_struct *ws)
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
-			/* Take two references, one is dropped
-			 * when request finishes, one after
-			 * we reclaim rcu_read_lock
-			 */
 			struct bio *bi;
-			atomic_inc(&rdev->nr_pending);
+
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc_bioset(rdev->bdev, 0,
@@ -573,7 +572,6 @@ static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&mddev->flush_pending);
 			submit_bio(bi);
 			rcu_read_lock();
-			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 	if (atomic_dec_and_test(&mddev->flush_pending))
@@ -626,6 +624,18 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
 	/* new request after previous flush is completed */
 	if (ktime_after(req_start, mddev->prev_flush_start)) {
 		WARN_ON(mddev->flush_bio);
+		/*
+		 * Grab a reference to make sure mddev_suspend() will wait for
+		 * this flush to be done.
+		 *
+		 * md_flush_request() is called under md_handle_request() and
+		 * 'active_io' is already grabbed, hence percpu_ref_tryget()
+		 * won't fail; percpu_ref_tryget_live() can't be used because
+		 * percpu_ref_kill() can be called by mddev_suspend()
+		 * concurrently.
+		 */
+		if (WARN_ON(percpu_ref_tryget(&mddev->active_io)))
+			percpu_ref_get(&mddev->active_io);
 		mddev->flush_bio = bio;
 		bio = NULL;
 	}
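
Taken together, the hunks above pair the new references like this
(schematic; 'active_io' is already held by md_handle_request() while
md_flush_request() runs, per the comment in the last hunk):

md_flush_request()
	percpu_ref_tryget(&mddev->active_io);	/* pin the array */
	mddev->flush_bio = bio;
	/* flush_work -> submit_flushes() may now iterate rdevs safely:
	 * mddev_suspend() cannot complete while the pin is held, so
	 * remove_and_add_spares() cannot run concurrently. */

md_end_flush()		/* completion of the last flush bio */
	percpu_ref_put(&mddev->active_io);	/* unpin; suspend may proceed */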