diff mbox

[PATCH/rfc] dm: revise 'rescue' strategy for md->bs allocations

Message ID 878ti0fefq.fsf@notabene.neil.brown.name (mailing list archive)
State Accepted, archived
Delegated to: Mike Snitzer
Headers show

Commit Message

NeilBrown Aug. 31, 2017, 6:18 a.m. UTC
The md->bs bioset currently needs the BIOSET_NEED_RESCUER flag
which I hope to deprecate.  It is currently needed because
__split_and_process_bio() can potentially allocate from the
bioset multiple times within the one make_request_fn context.
Subsequent allocations can deadlock waiting for the bio returned
by the first allocation to complete - but it is stuck on the
current->bio_list list.

The bioset is also used in setup_clone() in dm-rq.c, but there a
non-blocking (GFP_ATOMIC) allocation is used, so there is no risk of
deadlock.

dm also provides rescuing for ->map() calls.  If a map call ever blocks,
any bios already submitted with generic_make_request() will be passed to
the bioset rescuer thread.

This patch unifies these two separate needs for rescuing bios into a
single mechanism, and uses the md->wq work queue to do the rescuing.

A blk_plug_cb is added to 'struct clone_info' so that it is available
throughout the __split_and_process_bio() process.
Prior to allocating from the bioset, or calling ->map(),
dm_offload_check() is called to ensure that the blk_plug_cb is active.
If either of these operations schedules, the callback is called,
and any queued bios get passed to the wq.

Note that only current->bio_list[0] is offloaded.  current->bio_list[1]
contains bios that were scheduled *before* the current one started, so
they must have been submitted from higher up the stack, and we cannot be
waiting for them here.  Also, we now rescue *all* bios on the list as
there is nothing to be gained by being more selective.

This allows us to remove BIOSET_NEED_RESCUER when allocating this
bioset.
It also keeps all the mechanics of rescuing inside dm, so dm can
be in full control.

Signed-off-by: NeilBrown <neilb@suse.com>
---

Hi,
 I have only tested this lightly as I don't have any test infrastructure
 for dm and don't know how to reproduce the expected deadlocks.
 I'm keen to know your thoughts on this approach if you find a few spare
 moments to have a look.

Thanks,
NeilBrown

 
 drivers/md/dm-core.h |  1 +
 drivers/md/dm.c      | 99 ++++++++++++++++++++++++++--------------------------
 2 files changed, 51 insertions(+), 49 deletions(-)

Comments

Mikulas Patocka Sept. 1, 2017, 2:38 a.m. UTC | #1
On Thu, 31 Aug 2017, NeilBrown wrote:

> 
> The md->bs bioset currently needs the BIOSET_NEED_RESCUER flag
> which I hope to deprecate.  It is currently needed because
> __split_and_process_bio() can potentially allocate from the
> bioset multiple times within the one make_request_fn context.
> Subsequent allocations can deadlock waiting for the bio returned
> by the first allocation to complete - but it is stuck on the
> current->bio_list list.
> 
> The bioset is also used in setup_clone() in dm-rq.c, but there a
> non-blocking (GFP_ATOMIC) allocation is used, so there is no risk of
> deadlock.
> 
> dm also provide rescuing for ->map() calls.  If a map call ever blocks,
> any bios already submitted with generic_make_request() will be passed to
> the bioset rescuer thread.
> 
> This patch unifies these two separate needs for rescuing bios into a
> single mechanism, and use the md->wq work queue to do the rescuing.
> 
> A blk_plug_cb is added to 'struct clone_info' so that it is available
> throughout the __split_and_process_bio() process.
> Prior to allocating from the bioset, or calling ->map(),
> dm_offload_check() is called to ensure that the blk_plug_cb is active.
> If either of these operation schedules, the callback is called,
> and any queued bios get passed to the wq.
> 
> Note that only current->bio_list[0] is offloaded.  current->bio_list[1]
> contains bios that were scheduled *before* the current one started, so

These bios need to be offloaded too, otherwise you re-introduce this 
deadlock: https://www.redhat.com/archives/dm-devel/2014-May/msg00089.html

Mikulas

> they must have been submitted from higher up the stack, and we cannot be
> waiting for them here.  Also, we now rescue *all* bios on the list as
> there is nothing to be gained by being more selective.
>
> This allows us to remove BIOSET_NEED_RESCUER when allocating this
> bioset.
> It also keeps all the mechanics of rescuing inside dm, so dm can
> be in full control.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
> 
> Hi,
>  I have only tested this lightly as I don't have any test infrastructure
>  for dm and don't know how to reproduce the expected deadlocks.
>  I'm keen to know your thoughts on this approach if you find a few spare
>  moment to have a look.
> 
> Thanks,
> NeilBrown
> 
>  
>  drivers/md/dm-core.h |  1 +
>  drivers/md/dm.c      | 99 ++++++++++++++++++++++++++--------------------------
>  2 files changed, 51 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index 24eddbdf2ab4..cb060dd6d3ca 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -71,6 +71,7 @@ struct mapped_device {
>  	struct work_struct work;
>  	spinlock_t deferred_lock;
>  	struct bio_list deferred;
> +	struct bio_list rescued;
>  
>  	/*
>  	 * Event handling.
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 2edbcc2d7d3f..774dd71cb49a 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1134,65 +1134,63 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
>  }
>  EXPORT_SYMBOL_GPL(dm_remap_zone_report);
>  
> +struct clone_info {
> +	struct mapped_device *md;
> +	struct dm_table *map;
> +	struct bio *bio;
> +	struct dm_io *io;
> +	sector_t sector;
> +	unsigned sector_count;
> +	struct blk_plug plug;
> +	struct blk_plug_cb cb;
> +};
> +
>  /*
>   * Flush current->bio_list when the target map method blocks.
>   * This fixes deadlocks in snapshot and possibly in other targets.
>   */
> -struct dm_offload {
> -	struct blk_plug plug;
> -	struct blk_plug_cb cb;
> -};
>  
>  static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
>  {
> -	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
> +	struct clone_info *ci = container_of(cb, struct clone_info, cb);
>  	struct bio_list list;
> -	struct bio *bio;
> -	int i;
>  
> -	INIT_LIST_HEAD(&o->cb.list);
> +	INIT_LIST_HEAD(&cb->list);
>  
> -	if (unlikely(!current->bio_list))
> +	if (unlikely(!current->bio_list || bio_list_empty(&current->bio_list[0])))
>  		return;
>  
> -	for (i = 0; i < 2; i++) {
> -		list = current->bio_list[i];
> -		bio_list_init(&current->bio_list[i]);
> -
> -		while ((bio = bio_list_pop(&list))) {
> -			struct bio_set *bs = bio->bi_pool;
> -			if (unlikely(!bs) || bs == fs_bio_set ||
> -			    !bs->rescue_workqueue) {
> -				bio_list_add(&current->bio_list[i], bio);
> -				continue;
> -			}
> +	list = current->bio_list[0];
> +	bio_list_init(&current->bio_list[0]);
> +	spin_lock(&ci->md->deferred_lock);
> +	bio_list_merge(&ci->md->rescued, &list);
> +	spin_unlock(&ci->md->deferred_lock);
> +	queue_work(ci->md->wq, &ci->md->work);
> +}
>  
> -			spin_lock(&bs->rescue_lock);
> -			bio_list_add(&bs->rescue_list, bio);
> -			queue_work(bs->rescue_workqueue, &bs->rescue_work);
> -			spin_unlock(&bs->rescue_lock);
> -		}
> -	}
> +static void dm_offload_start(struct clone_info *ci)
> +{
> +	blk_start_plug(&ci->plug);
> +	INIT_LIST_HEAD(&ci->cb.list);
> +	ci->cb.callback = flush_current_bio_list;
>  }
>  
> -static void dm_offload_start(struct dm_offload *o)
> +static void dm_offload_end(struct clone_info *ci)
>  {
> -	blk_start_plug(&o->plug);
> -	o->cb.callback = flush_current_bio_list;
> -	list_add(&o->cb.list, &current->plug->cb_list);
> +	list_del(&ci->cb.list);
> +	blk_finish_plug(&ci->plug);
>  }
>  
> -static void dm_offload_end(struct dm_offload *o)
> +static void dm_offload_check(struct clone_info *ci)
>  {
> -	list_del(&o->cb.list);
> -	blk_finish_plug(&o->plug);
> +	if (list_empty(&ci->cb.list))
> +		list_add(&ci->cb.list, &current->plug->cb_list);
>  }
>  
> -static void __map_bio(struct dm_target_io *tio)
> +static void __map_bio(struct clone_info *ci, struct dm_target_io *tio)
>  {
>  	int r;
>  	sector_t sector;
> -	struct dm_offload o;
>  	struct bio *clone = &tio->clone;
>  	struct dm_target *ti = tio->ti;
>  
> @@ -1206,9 +1204,8 @@ static void __map_bio(struct dm_target_io *tio)
>  	atomic_inc(&tio->io->io_count);
>  	sector = clone->bi_iter.bi_sector;
>  
> -	dm_offload_start(&o);
> +	dm_offload_check(ci);
>  	r = ti->type->map(ti, clone);
> -	dm_offload_end(&o);
>  
>  	switch (r) {
>  	case DM_MAPIO_SUBMITTED:
> @@ -1233,15 +1230,6 @@ static void __map_bio(struct dm_target_io *tio)
>  	}
>  }
>  
> -struct clone_info {
> -	struct mapped_device *md;
> -	struct dm_table *map;
> -	struct bio *bio;
> -	struct dm_io *io;
> -	sector_t sector;
> -	unsigned sector_count;
> -};
> -
>  static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
>  {
>  	bio->bi_iter.bi_sector = sector;
> @@ -1291,6 +1279,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
>  	struct dm_target_io *tio;
>  	struct bio *clone;
>  
> +	dm_offload_check(ci);
>  	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
>  	tio = container_of(clone, struct dm_target_io, clone);
>  
> @@ -1314,7 +1303,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci,
>  	if (len)
>  		bio_setup_sector(clone, ci->sector, *len);
>  
> -	__map_bio(tio);
> +	__map_bio(ci, tio);
>  }
>  
>  static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
> @@ -1361,7 +1350,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
>  			free_tio(tio);
>  			break;
>  		}
> -		__map_bio(tio);
> +		__map_bio(ci, tio);
>  	}
>  
>  	return r;
> @@ -1493,6 +1482,7 @@ static void __split_and_process_bio(struct mapped_device *md,
>  		bio_io_error(bio);
>  		return;
>  	}
> +	dm_offload_start(&ci);
>  
>  	ci.map = map;
>  	ci.md = md;
> @@ -1521,6 +1511,7 @@ static void __split_and_process_bio(struct mapped_device *md,
>  		while (ci.sector_count && !error)
>  			error = __split_and_process_non_flush(&ci);
>  	}
> +	dm_offload_end(&ci);
>  
>  	/* drop the extra reference count */
>  	dec_pending(ci.io, error);
> @@ -2248,6 +2239,16 @@ static void dm_wq_work(struct work_struct *work)
>  	int srcu_idx;
>  	struct dm_table *map;
>  
> +	if (!bio_list_empty(&md->rescued)) {
> +		struct bio_list list;
> +		spin_lock_irq(&md->deferred_lock);
> +		list = md->deferred;
> +		bio_list_init(&md->deferred);
> +		spin_unlock_irq(&md->deferred_lock);
> +		while ((c = bio_list_pop(&md->deferred)) != NULL)
> +			generic_make_request(c);
> +	}
> +
>  	map = dm_get_live_table(md, &srcu_idx);
>  
>  	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
> @@ -2796,7 +2797,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
>  		BUG();
>  	}
>  
> -	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
> +	pools->bs = bioset_create(pool_size, front_pad, 0);
>  	if (!pools->bs)
>  		goto out;
>  
> -- 
> 2.14.0.rc0.dirty
> 
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
NeilBrown Sept. 1, 2017, 3:48 a.m. UTC | #2
On Thu, Aug 31 2017, Mikulas Patocka wrote:

>> 
>> Note that only current->bio_list[0] is offloaded.  current->bio_list[1]
>> contains bios that were scheduled *before* the current one started, so
>
> These bios need to be offloaded too, otherwise you re-introduce this 
> deadlock: https://www.redhat.com/archives/dm-devel/2014-May/msg00089.html

Thanks for the link.
In the example the bio that is stuck was created in step 4.  The call
to generic_make_request() will have placed it on current->bio_list[0].
The top-level generic_make_request call by Process A is still running,
so nothing will have moved the bio to ->bio_list[1].  That only happens
after the ->make_request_fn completes, which must be after step 7.
So the problem bio is on ->bio_list[0] and the code in my patch will
pass it to a workqueue for handling.

So I don't think the deadlock would be reintroduced.  Can you see
something that I am missing?

Thanks,
NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Mikulas Patocka Sept. 2, 2017, 1:34 a.m. UTC | #3
On Fri, 1 Sep 2017, NeilBrown wrote:

> On Thu, Aug 31 2017, Mikulas Patocka wrote:
> 
> >> 
> >> Note that only current->bio_list[0] is offloaded.  current->bio_list[1]
> >> contains bios that were scheduled *before* the current one started, so
> >
> > These bios need to be offloaded too, otherwise you re-introduce this 
> > deadlock: https://www.redhat.com/archives/dm-devel/2014-May/msg00089.html
> 
> Thanks for the link.
> In the example the bio that is stuck was created in step 4.  The call
> to generic_make_request() will have placed it on current->bio_list[0].
> The top-level generic_make_request call by Process A is still running,
> so nothing will have moved the bio to ->bio_list[1].  That only happens
> after the ->make_request_fn completes, which must be after step 7.
> So the problem bio is on ->bio_list[0] and the code in my patch will
> pass it to a workqueue for handling.
> 
> So I don't think the deadlock would be reintroduced.  Can you see
> something that I am missing?
> 
> Thanks,
> NeilBrown

Offloading only current->bio_list[0] will work in a simple case described 
above, but not in the general case.

For example, suppose that we have a dm device where the first part is 
linear and the second part is snapshot.

* We receive bio A that crosses the linear/snapshot boundary
* DM core creates bio B, submits it to the linear target and adds it to 
current->bio_list[0]
* DM core creates bio C, submits it to the snapshot target, the snapshot 
target calls track_chunk for this bio and appends the bio to 
current->bio_list[0]
* Now, we return back to generic_make_request
* We pop bio B from current->bio_list[0]
* We execute bio_list_on_stack[1] = bio_list_on_stack[0], that moves bio C 
to bio_list_on_stack[1] - and now we lose any possibility to offload bio C 
to the rescue thread
* The kcopyd thread for the snapshot takes the snapshot lock and waits for 
bio C to finish
* We process bio B - and if processing bio B reaches something that takes 
the snapshot lock (for example an origin target for the snapshot), a 
deadlock will happen. The deadlock could be avoided by offloading bio C to 
the rescue thread, but bio C is already on bio_list_on_stack[1] and so it 
won't be offloaded

Mikulas

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
NeilBrown Sept. 4, 2017, 12:12 a.m. UTC | #4
On Fri, Sep 01 2017, Mikulas Patocka wrote:

> On Fri, 1 Sep 2017, NeilBrown wrote:
>
>> On Thu, Aug 31 2017, Mikulas Patocka wrote:
>> 
>> >> 
>> >> Note that only current->bio_list[0] is offloaded.  current->bio_list[1]
>> >> contains bios that were scheduled *before* the current one started, so
>> >
>> > These bios need to be offloaded too, otherwise you re-introduce this 
>> > deadlock: https://www.redhat.com/archives/dm-devel/2014-May/msg00089.html
>> 
>> Thanks for the link.
>> In the example the bio that is stuck was created in step 4.  The call
>> to generic_make_request() will have placed it on current->bio_list[0].
>> The top-level generic_make_request call by Process A is still running,
>> so nothing will have moved the bio to ->bio_list[1].  That only happens
>> after the ->make_request_fn completes, which must be after step 7.
>> So the problem bio is on ->bio_list[0] and the code in my patch will
>> pass it to a workqueue for handling.
>> 
>> So I don't think the deadlock would be reintroduced.  Can you see
>> something that I am missing?
>> 
>> Thanks,
>> NeilBrown
>
> Offloading only current->bio_list[0] will work in a simple case described 
> above, but not in the general case.
>
> For example, suppose that we have a dm device where the first part is 
> linear and the second part is snapshot.
>
> * We receive bio A that crosses the linear/snapshot boundary
> * DM core creates bio B, submits it to the linear target and adds it to 
> current->bio_list[0]
> * DM core creates bio C, submits it to the snapshot target, the snapshot 
> target calls track_chunk for this bio and appends the bio to 
> current->bio_list[0]
> * Now, we return back to generic_make_request
> * We pop bio B from current->bio_list[0]
> * We execute bio_list_on_stack[1] = bio_list_on_stack[0], that moves bio C 
> to bio_list_on_stack[1] - and now we lose any possibility to offload bio C 
> to the rescue thread
> * The kcopyd thread for the snapshot takes the snapshot lock and waits for 
> bio C to finish

Thanks for the explanation.
I cannot find this last step in the code.  "waits for BIO C to finish"
is presumably the called to __check_for_conflicting_io().  This is
called from kcopyd in merge_callback() -> snapshot_merge_next_chunks().
What lock is held at that time?



> * We process bio B - and if processing bio B reaches something that takes 
> the snapshot lock (for example an origin target for the snapshot), a 
> deadlock will happen. The deadlock could be avoided by offloading bio C to 
> the rescue thread, but bio C is already on bio_list_on_stack[1] and so it 
> won't be offloaded

I think the core issue behind this deadlock is that the same volume
can appear twice in the top-level device, in different regions.  An
outstanding request to one region can then interfere with a new request
to a different region.  That seems like a reasonable thing to do.
I cannot immediately see a generic way to handle this case that I am
happy with.  I'll keep hunting.

Thanks,
NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Mikulas Patocka Sept. 4, 2017, 4:54 p.m. UTC | #5
On Mon, 4 Sep 2017, NeilBrown wrote:

> On Fri, Sep 01 2017, Mikulas Patocka wrote:
> 
> > On Fri, 1 Sep 2017, NeilBrown wrote:
> >
> >> On Thu, Aug 31 2017, Mikulas Patocka wrote:
> >> 
> >> >> 
> >> >> Note that only current->bio_list[0] is offloaded.  current->bio_list[1]
> >> >> contains bios that were scheduled *before* the current one started, so
> >> >
> >> > These bios need to be offloaded too, otherwise you re-introduce this 
> >> > deadlock: https://www.redhat.com/archives/dm-devel/2014-May/msg00089.html
> >> 
> >> Thanks for the link.
> >> In the example the bio that is stuck was created in step 4.  The call
> >> to generic_make_request() will have placed it on current->bio_list[0].
> >> The top-level generic_make_request call by Process A is still running,
> >> so nothing will have moved the bio to ->bio_list[1].  That only happens
> >> after the ->make_request_fn completes, which must be after step 7.
> >> So the problem bio is on ->bio_list[0] and the code in my patch will
> >> pass it to a workqueue for handling.
> >> 
> >> So I don't think the deadlock would be reintroduced.  Can you see
> >> something that I am missing?
> >> 
> >> Thanks,
> >> NeilBrown
> >
> > Offloading only current->bio_list[0] will work in a simple case described 
> > above, but not in the general case.
> >
> > For example, suppose that we have a dm device where the first part is 
> > linear and the second part is snapshot.
> >
> > * We receive bio A that crosses the linear/snapshot boundary
> > * DM core creates bio B, submits it to the linear target and adds it to 
> > current->bio_list[0]
> > * DM core creates bio C, submits it to the snapshot target, the snapshot 
> > target calls track_chunk for this bio and appends the bio to 
> > current->bio_list[0]
> > * Now, we return back to generic_make_request
> > * We pop bio B from current->bio_list[0]
> > * We execute bio_list_on_stack[1] = bio_list_on_stack[0], that moves bio C 
> > to bio_list_on_stack[1] - and now we lose any possibility to offload bio C 
> > to the rescue thread
> > * The kcopyd thread for the snapshot takes the snapshot lock and waits for 
> > bio C to finish
> 
> Thanks for the explanation.
> I cannot find this last step in the code.  "waits for BIO C to finish"
> is presumably the called to __check_for_conflicting_io().  This is
> called from kcopyd in merge_callback() -> snapshot_merge_next_chunks().
> What lock is held at that time?

The function pending_complete is called from the kcopyd callback. It takes 
"down_write(&s->lock)" and calls __check_for_conflicting_io which waits 
for I/Os to finish.

> > * We process bio B - and if processing bio B reaches something that takes 
> > the snapshot lock (for example an origin target for the snapshot), a 
> > deadlock will happen. The deadlock could be avoided by offloading bio C to 
> > the rescue thread, but bio C is already on bio_list_on_stack[1] and so it 
> > won't be offloaded
> 
> I think the core issue behind this deadlock is that the same volume
> can appear twice in the top-level device, in different regions.  An
> outstanding request to one region can then interfere with a new request
> to a different region.  That seems like a reasonable thing to do.
> I cannot immediately see a generic way to handle this case that I am
> happy with.  I'll keep hunting.

You can have a snapshot and an origin for the same device in the same 
tree - it is not common, but it is possible.

Offloading bios queued on current->bio_list avoids the deadlock, but your 
patch breaks it by offloading only the first list and not the second.

> Thanks,
> NeilBrown

Mikulas

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
diff mbox

Patch

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 24eddbdf2ab4..cb060dd6d3ca 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -71,6 +71,7 @@  struct mapped_device {
 	struct work_struct work;
 	spinlock_t deferred_lock;
 	struct bio_list deferred;
+	struct bio_list rescued;
 
 	/*
 	 * Event handling.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2edbcc2d7d3f..774dd71cb49a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1134,65 +1134,63 @@  void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
 }
 EXPORT_SYMBOL_GPL(dm_remap_zone_report);
 
+struct clone_info {
+	struct mapped_device *md;
+	struct dm_table *map;
+	struct bio *bio;
+	struct dm_io *io;
+	sector_t sector;
+	unsigned sector_count;
+	struct blk_plug plug;
+	struct blk_plug_cb cb;
+};
+
 /*
  * Flush current->bio_list when the target map method blocks.
  * This fixes deadlocks in snapshot and possibly in other targets.
  */
-struct dm_offload {
-	struct blk_plug plug;
-	struct blk_plug_cb cb;
-};
 
 static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 {
-	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
+	struct clone_info *ci = container_of(cb, struct clone_info, cb);
 	struct bio_list list;
-	struct bio *bio;
-	int i;
 
-	INIT_LIST_HEAD(&o->cb.list);
+	INIT_LIST_HEAD(&cb->list);
 
-	if (unlikely(!current->bio_list))
+	if (unlikely(!current->bio_list || bio_list_empty(&current->bio_list[0])))
 		return;
 
-	for (i = 0; i < 2; i++) {
-		list = current->bio_list[i];
-		bio_list_init(&current->bio_list[i]);
-
-		while ((bio = bio_list_pop(&list))) {
-			struct bio_set *bs = bio->bi_pool;
-			if (unlikely(!bs) || bs == fs_bio_set ||
-			    !bs->rescue_workqueue) {
-				bio_list_add(&current->bio_list[i], bio);
-				continue;
-			}
+	list = current->bio_list[0];
+	bio_list_init(&current->bio_list[0]);
+	spin_lock(&ci->md->deferred_lock);
+	bio_list_merge(&ci->md->rescued, &list);
+	spin_unlock(&ci->md->deferred_lock);
+	queue_work(ci->md->wq, &ci->md->work);
+}
 
-			spin_lock(&bs->rescue_lock);
-			bio_list_add(&bs->rescue_list, bio);
-			queue_work(bs->rescue_workqueue, &bs->rescue_work);
-			spin_unlock(&bs->rescue_lock);
-		}
-	}
+static void dm_offload_start(struct clone_info *ci)
+{
+	blk_start_plug(&ci->plug);
+	INIT_LIST_HEAD(&ci->cb.list);
+	ci->cb.callback = flush_current_bio_list;
 }
 
-static void dm_offload_start(struct dm_offload *o)
+static void dm_offload_end(struct clone_info *ci)
 {
-	blk_start_plug(&o->plug);
-	o->cb.callback = flush_current_bio_list;
-	list_add(&o->cb.list, &current->plug->cb_list);
+	list_del(&ci->cb.list);
+	blk_finish_plug(&ci->plug);
 }
 
-static void dm_offload_end(struct dm_offload *o)
+static void dm_offload_check(struct clone_info *ci)
 {
-	list_del(&o->cb.list);
-	blk_finish_plug(&o->plug);
+	if (list_empty(&ci->cb.list))
+		list_add(&ci->cb.list, &current->plug->cb_list);
 }
 
-static void __map_bio(struct dm_target_io *tio)
+static void __map_bio(struct clone_info *ci, struct dm_target_io *tio)
 {
 	int r;
 	sector_t sector;
-	struct dm_offload o;
 	struct bio *clone = &tio->clone;
 	struct dm_target *ti = tio->ti;
 
@@ -1206,9 +1204,8 @@  static void __map_bio(struct dm_target_io *tio)
 	atomic_inc(&tio->io->io_count);
 	sector = clone->bi_iter.bi_sector;
 
-	dm_offload_start(&o);
+	dm_offload_check(ci);
 	r = ti->type->map(ti, clone);
-	dm_offload_end(&o);
 
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
@@ -1233,15 +1230,6 @@  static void __map_bio(struct dm_target_io *tio)
 	}
 }
 
-struct clone_info {
-	struct mapped_device *md;
-	struct dm_table *map;
-	struct bio *bio;
-	struct dm_io *io;
-	sector_t sector;
-	unsigned sector_count;
-};
-
 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
 {
 	bio->bi_iter.bi_sector = sector;
@@ -1291,6 +1279,7 @@  static struct dm_target_io *alloc_tio(struct clone_info *ci,
 	struct dm_target_io *tio;
 	struct bio *clone;
 
+	dm_offload_check(ci);
 	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
 	tio = container_of(clone, struct dm_target_io, clone);
 
@@ -1314,7 +1303,7 @@  static void __clone_and_map_simple_bio(struct clone_info *ci,
 	if (len)
 		bio_setup_sector(clone, ci->sector, *len);
 
-	__map_bio(tio);
+	__map_bio(ci, tio);
 }
 
 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
@@ -1361,7 +1350,7 @@  static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
 			free_tio(tio);
 			break;
 		}
-		__map_bio(tio);
+		__map_bio(ci, tio);
 	}
 
 	return r;
@@ -1493,6 +1482,7 @@  static void __split_and_process_bio(struct mapped_device *md,
 		bio_io_error(bio);
 		return;
 	}
+	dm_offload_start(&ci);
 
 	ci.map = map;
 	ci.md = md;
@@ -1521,6 +1511,7 @@  static void __split_and_process_bio(struct mapped_device *md,
 		while (ci.sector_count && !error)
 			error = __split_and_process_non_flush(&ci);
 	}
+	dm_offload_end(&ci);
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
@@ -2248,6 +2239,16 @@  static void dm_wq_work(struct work_struct *work)
 	int srcu_idx;
 	struct dm_table *map;
 
+	if (!bio_list_empty(&md->rescued)) {
+		struct bio_list list;
+		spin_lock_irq(&md->deferred_lock);
+		list = md->deferred;
+		bio_list_init(&md->deferred);
+		spin_unlock_irq(&md->deferred_lock);
+		while ((c = bio_list_pop(&md->deferred)) != NULL)
+			generic_make_request(c);
+	}
+
 	map = dm_get_live_table(md, &srcu_idx);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
@@ -2796,7 +2797,7 @@  struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 		BUG();
 	}
 
-	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
+	pools->bs = bioset_create(pool_size, front_pad, 0);
 	if (!pools->bs)
 		goto out;