Message ID | 20240826074843.1575099-1-yukuai1@huaweicloud.com (mailing list archive) |
---|---|
State | Superseded |
Series | [md-6.12] md: remove flush handling |
Dear Kuai,


Thank you for your patch.


On 26.08.24 09:48, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> For flush request, md has a special flush handling to merge concurrent
> flush request into single one, however, the whole mechanism is based on
> a disk level spin_lock 'mddev->lock'. And fsync can be called quite
> often in some user cases, for consequence, spin lock from IO fast path can
> cause performance degration.
> 
> Fortunately, the block layer already have flush handling to merge

s/have/has/

> concurrent flush request, and it only acquire hctx level spin lock(see

1. acquire*s*
2. Please add a space before the (.

> details in blk-flush.c).
> 
> This patch remove the flush handling in md, and convert to use general
> block layer flush handling in underlying disks.

remove*s*, convert*s*

> Flush test for 4 nvme raid10:
> start 128 threads to do fsync 100000 times, on arm64, see how long it
> takes.

Please share the script, so it’s easier to reproduce?

> Test result: about 10 times faster for high concurrency.
> Before this patch: 50943374 microseconds
> After this patch: 5096347 microseconds
> 
> BTW, this patch can fix the same problem as commit 611d5cbc0b35 ("md: fix
> deadlock between mddev_suspend and flush bio").

So, should that be reverted? (Cc: +Li Nan)

> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/md.c | 137 ++++--------------------------------------------
>  drivers/md/md.h |  10 ----
>  2 files changed, 11 insertions(+), 136 deletions(-)

[…]

Code removal is always nice to see.


Kind regards,

Paul
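For context on the mechanism being discussed: the new md_flush_request()
chains one preflush bio per rdev to the incoming bio with bio_chain(), so
the incoming bio can only complete once every chained preflush has
completed. Below is a rough userspace sketch of that completion model;
fake_bio and fake_bio_endio are made-up stand-ins for the kernel's
struct bio and bio_endio(), not kernel API.

/*
 * Userspace sketch of the bio_chain() completion model: the parent
 * completes only when its remaining count drops to zero.
 */
#include <stdio.h>
#include <stdatomic.h>

struct fake_bio {
	atomic_int remaining;	/* submitter + unfinished chained children */
};

static void fake_bio_endio(struct fake_bio *bio)
{
	/* Only the last completion actually finishes the parent. */
	if (atomic_fetch_sub(&bio->remaining, 1) == 1)
		printf("parent flush bio completed\n");
}

int main(void)
{
	/* One reference held by the submitter, like a freshly made bio. */
	struct fake_bio parent = { .remaining = 1 };
	int nr_rdevs = 4;

	/* bio_chain(new, bio) conceptually bumps the parent's count... */
	for (int i = 0; i < nr_rdevs; i++)
		atomic_fetch_add(&parent.remaining, 1);

	/* ...and each chained preflush completing drops one reference. */
	for (int i = 0; i < nr_rdevs; i++)
		fake_bio_endio(&parent);

	/* The submitter's own bio_endio() drops the final reference. */
	fake_bio_endio(&parent);
	return 0;
}

In the kernel, bio_chain(new, bio) implements this by incrementing the
parent's __bi_remaining count and routing the child's completion back to
the parent, which is why no md-private lock or counter is needed anymore.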
[Add one more finding]

On 26.08.24 10:19, Paul Menzel wrote:
> Dear Kuai,
> 
> 
> Thank you for your patch.
> 
> 
> On 26.08.24 09:48, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> For flush request, md has a special flush handling to merge concurrent
>> flush request into single one, however, the whole mechanism is based on
>> a disk level spin_lock 'mddev->lock'. And fsync can be called quite
>> often in some user cases, for consequence, spin lock from IO fast path
>> can
>> cause performance degration.

degradation

[…]
Hi,

On 2024/08/26 16:19, Paul Menzel wrote:
> Dear Kuai,
> 
> 
> Thank you for your patch.
> 
> 
> On 26.08.24 09:48, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> For flush request, md has a special flush handling to merge concurrent
>> flush request into single one, however, the whole mechanism is based on
>> a disk level spin_lock 'mddev->lock'. And fsync can be called quite
>> often in some user cases, for consequence, spin lock from IO fast path
>> can
>> cause performance degration.
>>
>> Fortunately, the block layer already have flush handling to merge
> 
> s/have/has/
> 
>> concurrent flush request, and it only acquire hctx level spin lock(see
> 
> 1. acquire*s*
> 2. Please add a space before the (.
> 
>> details in blk-flush.c).
>>
>> This patch remove the flush handling in md, and convert to use general
>> block layer flush handling in underlying disks.
> 
> remove*s*, convert*s*
> 
>> Flush test for 4 nvme raid10:
>> start 128 threads to do fsync 100000 times, on arm64, see how long it
>> takes.
> 
> Please share the script, so it’s easier to reproduce?

The script is simple, I can add it in the next version.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>

#define THREADS 128
#define FSYNC_COUNT 100000

void* thread_func(void* arg) {
	int fd = *(int*)arg;

	for (int i = 0; i < FSYNC_COUNT; i++) {
		fsync(fd);
	}

	return NULL;
}

int main() {
	int fd = open("/dev/md0", O_RDWR);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	pthread_t threads[THREADS];
	struct timeval start, end;

	gettimeofday(&start, NULL);

	for (int i = 0; i < THREADS; i++) {
		pthread_create(&threads[i], NULL, thread_func, &fd);
	}

	for (int i = 0; i < THREADS; i++) {
		pthread_join(threads[i], NULL);
	}

	gettimeofday(&end, NULL);

	close(fd);

	long long elapsed = (end.tv_sec - start.tv_sec) * 1000000LL +
			    (end.tv_usec - start.tv_usec);
	printf("Elapsed time: %lld microseconds\n", elapsed);

	return 0;
}

> 
>> Test result: about 10 times faster for high concurrency.
>> Before this patch: 50943374 microseconds
>> After this patch: 5096347 microseconds
>>
>> BTW, this patch can fix the same problem as commit 611d5cbc0b35 ("md: fix
>> deadlock between mddev_suspend and flush bio").
> 
> So, should that be reverted? (Cc: +Li Nan)

I put a particular emphasis on this because 611d5cbc0b35 is the fix patch
for CVE-2024-43855, and this can help people fix the CVE in older kernel
versions if they care, like us. Revert is not necessary, I think.

Thanks,
Kuai

[…]
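A note for anyone reproducing the numbers: the script above hard-codes
/dev/md0 and needs permission to open it. Assuming it is saved as
fsync_test.c (the file name here is only an example), it can be built
and run like this:

	gcc -O2 -pthread fsync_test.c -o fsync_test
	sudo ./fsync_test

fsync() on a block device issues a cache flush even when no data has
been written, so the loop exercises exactly the flush path this patch
changes.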
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a38981de8901..4d675f7cc2a7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -546,137 +546,23 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
 	return 0;
 }
 
-/*
- * Generic flush handling for md
- */
-
-static void md_end_flush(struct bio *bio)
-{
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
-
-	bio_put(bio);
-
-	rdev_dec_pending(rdev, mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct bio *new;
 
-	mddev->start_flush = ktime_get_boottime();
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (rdev->raid_disk >= 0 &&
-		    !test_bit(Faulty, &rdev->flags)) {
-			struct bio *bi;
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-			bi = bio_alloc_bioset(rdev->bdev, 0,
-					      REQ_OP_WRITE | REQ_PREFLUSH,
-					      GFP_NOIO, &mddev->bio_set);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
-			atomic_inc(&mddev->flush_pending);
-			submit_bio(bi);
-			rcu_read_lock();
-		}
-	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	spin_lock_irq(&mddev->lock);
-	mddev->prev_flush_start = mddev->start_flush;
-	mddev->flush_bio = NULL;
-	spin_unlock_irq(&mddev->lock);
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0) {
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	} else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-
-		/*
-		 * make_requst() will never return error here, it only
-		 * returns error in raid5_make_request() by dm-raid.
-		 * Since dm always splits data and flush operation into
-		 * two separate io, io size of flush submitted by dm
-		 * always is 0, make_request() will not be called here.
-		 */
-		if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
-			bio_io_error(bio);
-	}
-
-	/* The pair is percpu_ref_get() from md_flush_request() */
-	percpu_ref_put(&mddev->active_io);
-}
+	rdev_for_each(rdev, mddev) {
+		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+			continue;
 
-/*
- * Manages consolidation of flushes and submitting any flushes needed for
- * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
- * being finished in another context.  Returns false if the flushing is
- * complete but still needs the I/O portion of the bio to be processed.
- */
-bool md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	ktime_t req_start = ktime_get_boottime();
-	spin_lock_irq(&mddev->lock);
-	/* flush requests wait until ongoing flush completes,
-	 * hence coalescing all the pending requests.
-	 */
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio ||
-			    ktime_before(req_start, mddev->prev_flush_start),
-			    mddev->lock);
-	/* new request after previous flush is completed */
-	if (ktime_after(req_start, mddev->prev_flush_start)) {
-		WARN_ON(mddev->flush_bio);
-		/*
-		 * Grab a reference to make sure mddev_suspend() will wait for
-		 * this flush to be done.
-		 *
-		 * md_flush_reqeust() is called under md_handle_request() and
-		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
-		 * won't pass, percpu_ref_tryget_live() can't be used because
-		 * percpu_ref_kill() can be called by mddev_suspend()
-		 * concurrently.
-		 */
-		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
-		percpu_ref_get(&mddev->active_io);
-		mddev->flush_bio = bio;
-		spin_unlock_irq(&mddev->lock);
-		INIT_WORK(&mddev->flush_work, submit_flushes);
-		queue_work(md_wq, &mddev->flush_work);
-		return true;
+		new = bio_alloc_bioset(rdev->bdev, 0,
+				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
+				       &mddev->bio_set);
+		bio_chain(new, bio);
+		submit_bio(new);
 	}
 
-	/* flush was performed for some other bio while we waited. */
-	spin_unlock_irq(&mddev->lock);
-	if (bio->bi_iter.bi_size == 0) {
-		/* pure flush without data - all done */
+	if (bio_sectors(bio) == 0) {
 		bio_endio(bio);
 		return true;
 	}
@@ -763,7 +649,6 @@ int mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->sync_seq, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c6a5f41adca..5d2e6bd58e4d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -572,16 +572,6 @@ struct mddev {
 	 */
 	struct bio_set			io_clone_set;
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
-						* flush was started.
-						*/
-	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	mempool_t *serial_info_pool;
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);