
[RFC,4/4] dm: support bio polling

Message ID: 20210616130533.754248-5-ming.lei@redhat.com (mailing list archive)
State: Superseded, archived
Delegated to: Mike Snitzer
Series: block/dm: support bio polling

Commit Message

Ming Lei June 16, 2021, 1:05 p.m. UTC
Support bio(REQ_POLLED) polling in the following approach:

1) set up one list in the 'struct dm_io' instance and add every 'struct
dm_target_io' instance cloned for the current dm bio to this list;
store this dm_io in bio->bi_bio_drv_data

2) hold one refcnt on io->io_count after submitting this dm bio with
REQ_POLLED

3) implement the .poll_bio() callback: iterate over the list set up in 1)
and poll on each ->clone of the 'dm_target_io' instances; call
dec_pending() once all target ios are done in .poll_bio()

4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
which is based on Jeffle's previous patch.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/md/dm-table.c | 24 ++++++++++++++++++
 drivers/md/dm.c       | 59 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 4 deletions(-)

Comments

Mike Snitzer June 16, 2021, 4:05 p.m. UTC | #1
On Wed, Jun 16 2021 at  9:05P -0400,
Ming Lei <ming.lei@redhat.com> wrote:

> Support bio(REQ_POLLED) polling in the following approach:
> 
> 1) set up one list in the 'struct dm_io' instance and add every 'struct
> dm_target_io' instance cloned for the current dm bio to this list;
> store this dm_io in bio->bi_bio_drv_data
> 
> 2) hold one refcnt on io->io_count after submitting this dm bio with
> REQ_POLLED
> 
> 3) implement the .poll_bio() callback: iterate over the list set up in 1)
> and poll on each ->clone of the 'dm_target_io' instances; call
> dec_pending() once all target ios are done in .poll_bio()
> 
> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> which is based on Jeffle's previous patch.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Thanks for refreshing this DM bio polling support Ming.

In general I'm really happy to see polling switch over to using bios,
nice job Christoph! Are you hoping for all of this to land in time for
the 5.14 merge window?

Once Ming responds to my review inlined below, and I've added my
Acked-by to his set, would you be willing to fold it in at the end of
your patchset so that I don't need to rebase on block to get these
changes in, etc?

Mike

> ---
>  drivers/md/dm-table.c | 24 ++++++++++++++++++
>  drivers/md/dm.c       | 59 ++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 79 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index ee47a332b462..b14b379442d2 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
>  	return &t->targets[(KEYS_PER_NODE * n) + k];
>  }
>  
> +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
> +				   sector_t start, sector_t len, void *data)
> +{
> +	return !blk_queue_poll(bdev_get_queue(dev->bdev));
> +}
> +
>  /*
>   * type->iterate_devices() should be called when the sanity check needs to
>   * iterate and check all underlying data devices. iterate_devices() will
> @@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
>  	return 0;
>  }
>  
> +static int dm_table_supports_poll(struct dm_table *t)
> +{
> +	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
> +}
> +
>  /*
>   * Check whether a table has no data devices attached using each
>   * target's iterate_devices method.
> @@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
>  
>  	dm_update_keyslot_manager(q, t);
>  	blk_queue_update_readahead(q);
> +
> +	/*
> +	 * The check for a request-based device is left to
> +	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> +	 * For bio-based devices, only set QUEUE_FLAG_POLL when all underlying
> +	 * devices support polling.
> +	 */
> +	if (__table_type_bio_based(t->type)) {
> +		if (dm_table_supports_poll(t))
> +			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> +		else
> +			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> +	}
>  }
>  
>  unsigned int dm_table_get_num_targets(struct dm_table *t)
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 363f12a285ce..0a0e4a38f435 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -84,6 +84,7 @@ struct dm_target_io {
>  	struct dm_target *ti;
>  	unsigned target_bio_nr;
>  	unsigned *len_ptr;
> +	struct list_head list;
>  	bool inside_dm_io;
>  	struct bio clone;
>  };
> @@ -99,6 +100,7 @@ struct dm_io {
>  	blk_status_t status;
>  	atomic_t io_count;
>  	struct bio *orig_bio;
> +	struct list_head poll_head;
>  	unsigned long start_time;
>  	spinlock_t endio_lock;
>  	struct dm_stats_aux stats_aux;
> @@ -655,6 +657,11 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
>  	io->md = md;
>  	spin_lock_init(&io->endio_lock);
>  
> +	if (bio->bi_opf & REQ_POLLED) {
> +		bio->bi_bio_drv_data = io;
> +		INIT_LIST_HEAD(&io->poll_head);
> +	}
> +
>  	start_io_acct(io);
>  
>  	return io;
> @@ -692,6 +699,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
>  
>  static void free_tio(struct dm_target_io *tio)
>  {
> +	list_del_init(&tio->list);
> +
>  	if (tio->inside_dm_io)
>  		return;
>  	bio_put(&tio->clone);
> @@ -936,10 +945,15 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
>  		io_error = io->status;
>  		bio = io->orig_bio;
>  		end_io_acct(io);
> +
>  		free_io(md, io);
>  
> -		if (io_error == BLK_STS_DM_REQUEUE)
> +		if (io_error == BLK_STS_DM_REQUEUE) {
> +			/* not poll any more in case of requeue */
> +			if (bio->bi_opf & REQ_POLLED)
> +				bio->bi_opf &= ~REQ_POLLED;
>  			return;
> +		}
>  
>  		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
>  			/*
> @@ -1043,7 +1057,9 @@ static void clone_endio(struct bio *bio)
>  		up(&md->swap_bios_semaphore);
>  	}
>  
> -	free_tio(tio);
> +	/* Cloned bios submitted as POLLED are all freed after the dm_io is done */
> +	if (list_empty(&tio->list))
> +		free_tio(tio);
>  	dec_pending(io, error);
>  }
>  
> @@ -1300,6 +1316,11 @@ static void __map_bio(struct dm_target_io *tio)
>  	struct dm_io *io = tio->io;
>  	struct dm_target *ti = tio->ti;
>  
> +	if (clone->bi_opf & REQ_POLLED)
> +		list_add_tail(&tio->list, &io->poll_head);
> +	else
> +		INIT_LIST_HEAD(&tio->list);
> +

Why not INIT_LIST_HEAD() at the end of alloc_tio()? Shouldn't that be done
even if you have this else clause here, because you can clear REQ_POLLED
on BLK_STS_DM_REQUEUE? (Otherwise you're calling list_add_tail() on a list
that was never initialized with INIT_LIST_HEAD().)
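
A minimal sketch of that suggestion, for illustration only (not part of the
posted patch; alloc_tio()'s existing body is elided, and its signature is
completed here from the hunk context above):

static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
				      unsigned target_bio_nr, gfp_t gfp_mask)
{
	struct dm_target_io *tio;

	/* ... existing allocation of tio / tio->clone ... */

	/*
	 * Always start with an empty node so list_empty() in clone_endio()
	 * and list_del_init() in free_tio() are safe even when REQ_POLLED
	 * is never set, or gets cleared later.
	 */
	INIT_LIST_HEAD(&tio->list);

	return tio;
}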

>  	clone->bi_end_io = clone_endio;
>  
>  	/*
> @@ -1666,8 +1687,9 @@ static void __split_and_process_bio(struct mapped_device *md,
>  		}
>  	}
>  
> -	/* drop the extra reference count */
> -	dec_pending(ci.io, errno_to_blk_status(error));
> +	/* drop the extra reference count for non-POLLED bio */
> +	if (!(bio->bi_opf & REQ_POLLED))
> +		dec_pending(ci.io, errno_to_blk_status(error));
>  }
>  
>  static void dm_submit_bio(struct bio *bio)
> @@ -1707,6 +1729,34 @@ static void dm_submit_bio(struct bio *bio)
>  	dm_put_live_table(md, srcu_idx);
>  }
>  
> +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> +{
> +	struct dm_io *io = bio->bi_bio_drv_data;
> +	struct dm_target_io *tio;
> +
> +	if (!(bio->bi_opf & REQ_POLLED) || !io)
> +		return 0;

Should this be a WARN_ON()? I cannot see why this would ever happen
other than due to a bug. Or is there some race that makes it more likely?
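
(Purely illustrative of the question, not a change proposed in the thread --
roughly something like the check below. As Ming's reply further down explains,
REQ_POLLED can legitimately be cleared, so a warning here may be too strong:)

	if (WARN_ON_ONCE(!(bio->bi_opf & REQ_POLLED) || !io))
		return 0;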

> +	list_for_each_entry(tio, &io->poll_head, list)
> +		bio_poll(&tio->clone, flags);
> +
> +	/* bio_poll holds the last reference */
> +	if (atomic_read(&io->io_count) == 1) {
> +		/* free all target IOs submitted as POLLED */
> +		while (!list_empty(&io->poll_head)) {
> +			struct dm_target_io *tio =
> +				list_entry(io->poll_head.next,
> +					struct dm_target_io, list);
> +			free_tio(tio);
> +		}
> +		bio->bi_bio_drv_data = NULL;
> +		dec_pending(io, 0);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  /*-----------------------------------------------------------------
>   * An IDR is used to keep track of allocated minor numbers.
>   *---------------------------------------------------------------*/
> @@ -3121,6 +3171,7 @@ static const struct pr_ops dm_pr_ops = {
>  
>  static const struct block_device_operations dm_blk_dops = {
>  	.submit_bio = dm_submit_bio,
> +	.poll_bio = dm_poll_bio,
>  	.open = dm_blk_open,
>  	.release = dm_blk_close,
>  	.ioctl = dm_blk_ioctl,
> -- 
> 2.31.1
> 

Ming Lei June 17, 2021, 2:14 a.m. UTC | #2
On Wed, Jun 16, 2021 at 12:05:13PM -0400, Mike Snitzer wrote:
> On Wed, Jun 16 2021 at  9:05P -0400,
> Ming Lei <ming.lei@redhat.com> wrote:
> 
> > Support bio(REQ_POLLED) polling in the following approach:
> > 
> > 1) set up one list in the 'struct dm_io' instance and add every 'struct
> > dm_target_io' instance cloned for the current dm bio to this list;
> > store this dm_io in bio->bi_bio_drv_data
> > 
> > 2) hold one refcnt on io->io_count after submitting this dm bio with
> > REQ_POLLED
> > 
> > 3) implement the .poll_bio() callback: iterate over the list set up in 1)
> > and poll on each ->clone of the 'dm_target_io' instances; call
> > dec_pending() once all target ios are done in .poll_bio()
> > 
> > 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> > which is based on Jeffle's previous patch.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> 
> Thanks for refreshing this DM bio polling support Ming.
> 
> In general I'm really happy to see polling switch over to using bios,
> nice job Christoph! Are you hoping for all of this to land in time for
> the 5.14 merge window?
> 
> Once Ming responds to my review inlined below, and I've added my
> Acked-by to his set, would you be willing to fold it in at the end of
> your patchset so that I don't need to rebase on block to get these
> changes in, etc?
> 
> Mike
> 
> > ---
> >  drivers/md/dm-table.c | 24 ++++++++++++++++++
> >  drivers/md/dm.c       | 59 ++++++++++++++++++++++++++++++++++++++++---
> >  2 files changed, 79 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > index ee47a332b462..b14b379442d2 100644
> > --- a/drivers/md/dm-table.c
> > +++ b/drivers/md/dm-table.c
> > @@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
> >  	return &t->targets[(KEYS_PER_NODE * n) + k];
> >  }
> >  
> > +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
> > +				   sector_t start, sector_t len, void *data)
> > +{
> > +	return !blk_queue_poll(bdev_get_queue(dev->bdev));
> > +}
> > +
> >  /*
> >   * type->iterate_devices() should be called when the sanity check needs to
> >   * iterate and check all underlying data devices. iterate_devices() will
> > @@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
> >  	return 0;
> >  }
> >  
> > +static int dm_table_supports_poll(struct dm_table *t)
> > +{
> > +	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
> > +}
> > +
> >  /*
> >   * Check whether a table has no data devices attached using each
> >   * target's iterate_devices method.
> > @@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> >  
> >  	dm_update_keyslot_manager(q, t);
> >  	blk_queue_update_readahead(q);
> > +
> > +	/*
> > +	 * The check for a request-based device is left to
> > +	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> > +	 * For bio-based devices, only set QUEUE_FLAG_POLL when all underlying
> > +	 * devices support polling.
> > +	 */
> > +	if (__table_type_bio_based(t->type)) {
> > +		if (dm_table_supports_poll(t))
> > +			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> > +		else
> > +			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> > +	}
> >  }
> >  
> >  unsigned int dm_table_get_num_targets(struct dm_table *t)
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index 363f12a285ce..0a0e4a38f435 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> > @@ -84,6 +84,7 @@ struct dm_target_io {
> >  	struct dm_target *ti;
> >  	unsigned target_bio_nr;
> >  	unsigned *len_ptr;
> > +	struct list_head list;
> >  	bool inside_dm_io;
> >  	struct bio clone;
> >  };
> > @@ -99,6 +100,7 @@ struct dm_io {
> >  	blk_status_t status;
> >  	atomic_t io_count;
> >  	struct bio *orig_bio;
> > +	struct list_head poll_head;
> >  	unsigned long start_time;
> >  	spinlock_t endio_lock;
> >  	struct dm_stats_aux stats_aux;
> > @@ -655,6 +657,11 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
> >  	io->md = md;
> >  	spin_lock_init(&io->endio_lock);
> >  
> > +	if (bio->bi_opf & REQ_POLLED) {
> > +		bio->bi_bio_drv_data = io;
> > +		INIT_LIST_HEAD(&io->poll_head);
> > +	}
> > +
> >  	start_io_acct(io);
> >  
> >  	return io;
> > @@ -692,6 +699,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
> >  
> >  static void free_tio(struct dm_target_io *tio)
> >  {
> > +	list_del_init(&tio->list);
> > +
> >  	if (tio->inside_dm_io)
> >  		return;
> >  	bio_put(&tio->clone);
> > @@ -936,10 +945,15 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
> >  		io_error = io->status;
> >  		bio = io->orig_bio;
> >  		end_io_acct(io);
> > +
> >  		free_io(md, io);
> >  
> > -		if (io_error == BLK_STS_DM_REQUEUE)
> > +		if (io_error == BLK_STS_DM_REQUEUE) {
> > +			/* not poll any more in case of requeue */
> > +			if (bio->bi_opf & REQ_POLLED)
> > +				bio->bi_opf &= ~REQ_POLLED;
> >  			return;
> > +		}
> >  
> >  		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
> >  			/*
> > @@ -1043,7 +1057,9 @@ static void clone_endio(struct bio *bio)
> >  		up(&md->swap_bios_semaphore);
> >  	}
> >  
> > -	free_tio(tio);
> > +	/* Cloned bios submitted as POLLED are all freed after the dm_io is done */
> > +	if (list_empty(&tio->list))
> > +		free_tio(tio);
> >  	dec_pending(io, error);
> >  }
> >  
> > @@ -1300,6 +1316,11 @@ static void __map_bio(struct dm_target_io *tio)
> >  	struct dm_io *io = tio->io;
> >  	struct dm_target *ti = tio->ti;
> >  
> > +	if (clone->bi_opf & REQ_POLLED)
> > +		list_add_tail(&tio->list, &io->poll_head);
> > +	else
> > +		INIT_LIST_HEAD(&tio->list);
> > +
> 
> Why not INIT_LIST_HEAD() at the end of alloc_tio()? Shouldn't that be done
> even if you have this else clause here, because you can clear REQ_POLLED
> on BLK_STS_DM_REQUEUE? (Otherwise you're calling list_add_tail() on a list
> that was never initialized with INIT_LIST_HEAD().)

It is fine to add an uninitialized list node via list_add_tail(); only the
list head itself has to be initialized.
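
For reference, this is because list_add_tail() only dereferences the head and
unconditionally overwrites the new node's pointers (simplified from
include/linux/list.h; debug checks and WRITE_ONCE() omitted):

static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;
	new->next = next;	/* the new node's fields are simply overwritten */
	new->prev = prev;
	prev->next = new;
}

static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
	/* only 'head' must already be a properly initialized list */
	__list_add(new, head->prev, head);
}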

> 
> >  	clone->bi_end_io = clone_endio;
> >  
> >  	/*
> > @@ -1666,8 +1687,9 @@ static void __split_and_process_bio(struct mapped_device *md,
> >  		}
> >  	}
> >  
> > -	/* drop the extra reference count */
> > -	dec_pending(ci.io, errno_to_blk_status(error));
> > +	/* drop the extra reference count for non-POLLED bio */
> > +	if (!(bio->bi_opf & REQ_POLLED))
> > +		dec_pending(ci.io, errno_to_blk_status(error));
> >  }
> >  
> >  static void dm_submit_bio(struct bio *bio)
> > @@ -1707,6 +1729,34 @@ static void dm_submit_bio(struct bio *bio)
> >  	dm_put_live_table(md, srcu_idx);
> >  }
> >  
> > +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> > +{
> > +	struct dm_io *io = bio->bi_bio_drv_data;
> > +	struct dm_target_io *tio;
> > +
> > +	if (!(bio->bi_opf & REQ_POLLED) || !io)
> > +		return 0;
> 
> Should this be a WARN_ON()? I cannot see why this would ever happen
> other than due to a bug. Or is there some race that makes it more likely?

REQ_POLLED can be cleared in case of requeue or by blk_queue_split(), while
the upper layer may still keep polling, so we need to simply return for
bios that will be completed via IRQ.
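
In other words, the early return in dm_poll_bio() only reports that nothing
was polled, so such bios fall back to the normal IRQ completion path. The
same check from the patch, annotated for clarity (the rest of the function
is elided):

static int dm_poll_bio(struct bio *bio, unsigned int flags)
{
	struct dm_io *io = bio->bi_bio_drv_data;

	/*
	 * REQ_POLLED may have been dropped on BLK_STS_DM_REQUEUE (see
	 * dec_pending()) or by blk_queue_split(), while the caller may
	 * still invoke ->poll_bio(); report that nothing was completed
	 * and let the bio finish via IRQ.
	 */
	if (!(bio->bi_opf & REQ_POLLED) || !io)
		return 0;

	/* ... poll the cloned target bios as in the patch ... */
	return 0;
}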


Thanks,
Ming

Christoph Hellwig June 18, 2021, 2:33 p.m. UTC | #3
On Wed, Jun 16, 2021 at 12:05:13PM -0400, Mike Snitzer wrote:
> In general I'm really happy to see polling switch over to using bios,
> nice job Christoph! Are you hoping for all of this to land in time for
> the 5.14 merge window?

Yes, although time is running out.

> Once Ming responds to my review inlined below, and I've added my
> Acked-by to his set, would you be willing to fold it in at the end of
> your patchset so that I don't need to rebase on block to get these
> changes in, etc?

In principle yes, but I need to take a look first.


Patch

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ee47a332b462..b14b379442d2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1491,6 +1491,12 @@  struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
+				   sector_t start, sector_t len, void *data)
+{
+	return !blk_queue_poll(bdev_get_queue(dev->bdev));
+}
+
 /*
  * type->iterate_devices() should be called when the sanity check needs to
  * iterate and check all underlying data devices. iterate_devices() will
@@ -1541,6 +1547,11 @@  static int count_device(struct dm_target *ti, struct dm_dev *dev,
 	return 0;
 }
 
+static int dm_table_supports_poll(struct dm_table *t)
+{
+	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
+}
+
 /*
  * Check whether a table has no data devices attached using each
  * target's iterate_devices method.
@@ -2078,6 +2089,19 @@  void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
 	dm_update_keyslot_manager(q, t);
 	blk_queue_update_readahead(q);
+
+	/*
+	 * The check for a request-based device is left to
+	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
+	 * For bio-based devices, only set QUEUE_FLAG_POLL when all underlying
+	 * devices support polling.
+	 */
+	if (__table_type_bio_based(t->type)) {
+		if (dm_table_supports_poll(t))
+			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+		else
+			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+	}
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 363f12a285ce..0a0e4a38f435 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -84,6 +84,7 @@  struct dm_target_io {
 	struct dm_target *ti;
 	unsigned target_bio_nr;
 	unsigned *len_ptr;
+	struct list_head list;
 	bool inside_dm_io;
 	struct bio clone;
 };
@@ -99,6 +100,7 @@  struct dm_io {
 	blk_status_t status;
 	atomic_t io_count;
 	struct bio *orig_bio;
+	struct list_head poll_head;
 	unsigned long start_time;
 	spinlock_t endio_lock;
 	struct dm_stats_aux stats_aux;
@@ -655,6 +657,11 @@  static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	io->md = md;
 	spin_lock_init(&io->endio_lock);
 
+	if (bio->bi_opf & REQ_POLLED) {
+		bio->bi_bio_drv_data = io;
+		INIT_LIST_HEAD(&io->poll_head);
+	}
+
 	start_io_acct(io);
 
 	return io;
@@ -692,6 +699,8 @@  static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
 
 static void free_tio(struct dm_target_io *tio)
 {
+	list_del_init(&tio->list);
+
 	if (tio->inside_dm_io)
 		return;
 	bio_put(&tio->clone);
@@ -936,10 +945,15 @@  static void dec_pending(struct dm_io *io, blk_status_t error)
 		io_error = io->status;
 		bio = io->orig_bio;
 		end_io_acct(io);
+
 		free_io(md, io);
 
-		if (io_error == BLK_STS_DM_REQUEUE)
+		if (io_error == BLK_STS_DM_REQUEUE) {
+			/* not poll any more in case of requeue */
+			if (bio->bi_opf & REQ_POLLED)
+				bio->bi_opf &= ~REQ_POLLED;
 			return;
+		}
 
 		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 			/*
@@ -1043,7 +1057,9 @@  static void clone_endio(struct bio *bio)
 		up(&md->swap_bios_semaphore);
 	}
 
-	free_tio(tio);
+	/* Cloned bios submitted as POLLED are all freed after the dm_io is done */
+	if (list_empty(&tio->list))
+		free_tio(tio);
 	dec_pending(io, error);
 }
 
@@ -1300,6 +1316,11 @@  static void __map_bio(struct dm_target_io *tio)
 	struct dm_io *io = tio->io;
 	struct dm_target *ti = tio->ti;
 
+	if (clone->bi_opf & REQ_POLLED)
+		list_add_tail(&tio->list, &io->poll_head);
+	else
+		INIT_LIST_HEAD(&tio->list);
+
 	clone->bi_end_io = clone_endio;
 
 	/*
@@ -1666,8 +1687,9 @@  static void __split_and_process_bio(struct mapped_device *md,
 		}
 	}
 
-	/* drop the extra reference count */
-	dec_pending(ci.io, errno_to_blk_status(error));
+	/* drop the extra reference count for non-POLLED bio */
+	if (!(bio->bi_opf & REQ_POLLED))
+		dec_pending(ci.io, errno_to_blk_status(error));
 }
 
 static void dm_submit_bio(struct bio *bio)
@@ -1707,6 +1729,34 @@  static void dm_submit_bio(struct bio *bio)
 	dm_put_live_table(md, srcu_idx);
 }
 
+static int dm_poll_bio(struct bio *bio, unsigned int flags)
+{
+	struct dm_io *io = bio->bi_bio_drv_data;
+	struct dm_target_io *tio;
+
+	if (!(bio->bi_opf & REQ_POLLED) || !io)
+		return 0;
+
+	list_for_each_entry(tio, &io->poll_head, list)
+		bio_poll(&tio->clone, flags);
+
+	/* bio_poll holds the last reference */
+	if (atomic_read(&io->io_count) == 1) {
+		/* free all target IOs submitted as POLLED */
+		while (!list_empty(&io->poll_head)) {
+			struct dm_target_io *tio =
+				list_entry(io->poll_head.next,
+					struct dm_target_io, list);
+			free_tio(tio);
+		}
+		bio->bi_bio_drv_data = NULL;
+		dec_pending(io, 0);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
@@ -3121,6 +3171,7 @@  static const struct pr_ops dm_pr_ops = {
 
 static const struct block_device_operations dm_blk_dops = {
 	.submit_bio = dm_submit_bio,
+	.poll_bio = dm_poll_bio,
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,