diff mbox series

[RFC,V2,3/3] dm: support bio polling

Message ID 20210617103549.930311-4-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series block/dm: support bio polling | expand

Commit Message

Ming Lei June 17, 2021, 10:35 a.m. UTC
Support bio(REQ_POLLED) polling in the following approach:

1) only support io polling on normal READ/WRITE, and other abnormal IOs
still fallback on IRQ mode, so the target io is exactly inside the dm
io.

2) hold one refcnt on io->io_count after submitting this dm bio with
REQ_POLLED

3) support dm native bio splitting, any dm io instance associated with
current bio will be added into one list which head is bio->bi_end_io
which will be recovered before ending this bio

4) implement .poll_bio() callback, call bio_poll() on the single target
bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
dec_pending() after the target io is done in .poll_bio()

4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
which is based on Jeffle's previous patch.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/md/dm-table.c |  24 +++++++++
 drivers/md/dm.c       | 111 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 132 insertions(+), 3 deletions(-)

Comments

Ming Lei June 17, 2021, 11:08 p.m. UTC | #1
On Thu, Jun 17, 2021 at 06:35:49PM +0800, Ming Lei wrote:
> Support bio(REQ_POLLED) polling in the following approach:
> 
> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> still fallback on IRQ mode, so the target io is exactly inside the dm
> io.
> 
> 2) hold one refcnt on io->io_count after submitting this dm bio with
> REQ_POLLED
> 
> 3) support dm native bio splitting, any dm io instance associated with
> current bio will be added into one list which head is bio->bi_end_io
> which will be recovered before ending this bio
> 
> 4) implement .poll_bio() callback, call bio_poll() on the single target
> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> dec_pending() after the target io is done in .poll_bio()
> 
> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> which is based on Jeffle's previous patch.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

...

> @@ -938,8 +945,12 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
>  		end_io_acct(io);
>  		free_io(md, io);
>  
> -		if (io_error == BLK_STS_DM_REQUEUE)
> +		if (io_error == BLK_STS_DM_REQUEUE) {
> +			/* not poll any more in case of requeue */
> +			if (bio->bi_opf & REQ_POLLED)
> +				bio->bi_opf &= ~REQ_POLLED;

It becomes not necessary to clear REQ_POLLED before requeuing since
every dm_io is added into the hlist_head which is reused from
bio->bi_end_io, so all dm-io(include the one to be requeued) will be
polled.

Thanks,
Ming
Jeffle Xu June 18, 2021, 8:19 a.m. UTC | #2
On 6/17/21 6:35 PM, Ming Lei wrote:
> Support bio(REQ_POLLED) polling in the following approach:
> 
> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> still fallback on IRQ mode, so the target io is exactly inside the dm
> io.
> 
> 2) hold one refcnt on io->io_count after submitting this dm bio with
> REQ_POLLED
> 
> 3) support dm native bio splitting, any dm io instance associated with
> current bio will be added into one list which head is bio->bi_end_io
> which will be recovered before ending this bio
> 
> 4) implement .poll_bio() callback, call bio_poll() on the single target
> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> dec_pending() after the target io is done in .poll_bio()
> 
> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> which is based on Jeffle's previous patch.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  drivers/md/dm-table.c |  24 +++++++++
>  drivers/md/dm.c       | 111 ++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 132 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index ee47a332b462..b14b379442d2 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
>  	return &t->targets[(KEYS_PER_NODE * n) + k];
>  }
>  
> +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
> +				   sector_t start, sector_t len, void *data)
> +{
> +	return !blk_queue_poll(bdev_get_queue(dev->bdev));
> +}
> +
>  /*
>   * type->iterate_devices() should be called when the sanity check needs to
>   * iterate and check all underlying data devices. iterate_devices() will
> @@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
>  	return 0;
>  }
>  
> +static int dm_table_supports_poll(struct dm_table *t)
> +{
> +	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
> +}
> +
>  /*
>   * Check whether a table has no data devices attached using each
>   * target's iterate_devices method.
> @@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
>  
>  	dm_update_keyslot_manager(q, t);
>  	blk_queue_update_readahead(q);
> +
> +	/*
> +	 * Check for request-based device is remained to
> +	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> +	 * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> +	 * devices supporting polling.
> +	 */
> +	if (__table_type_bio_based(t->type)) {
> +		if (dm_table_supports_poll(t))
> +			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> +		else
> +			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> +	}
>  }
>  
>  unsigned int dm_table_get_num_targets(struct dm_table *t)
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 363f12a285ce..9745c3deacc3 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -39,6 +39,8 @@
>  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
>  #define DM_COOKIE_LENGTH 24
>  
> +#define REQ_SAVED_END_IO             REQ_DRV
> +
>  static const char *_name = DM_NAME;
>  
>  static unsigned int major = 0;
> @@ -97,8 +99,11 @@ struct dm_io {
>  	unsigned magic;
>  	struct mapped_device *md;
>  	blk_status_t status;
> +	bool	submit_as_polled;
>  	atomic_t io_count;
>  	struct bio *orig_bio;
> +	void	*saved_bio_end_io;
> +	struct hlist_node  node;
>  	unsigned long start_time;
>  	spinlock_t endio_lock;
>  	struct dm_stats_aux stats_aux;
> @@ -687,6 +692,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
>  	tio->ti = ti;
>  	tio->target_bio_nr = target_bio_nr;
>  
> +	WARN_ON_ONCE(ci->io->submit_as_polled && !tio->inside_dm_io);
> +
>  	return tio;
>  }
>  
> @@ -938,8 +945,12 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
>  		end_io_acct(io);
>  		free_io(md, io);
>  
> -		if (io_error == BLK_STS_DM_REQUEUE)
> +		if (io_error == BLK_STS_DM_REQUEUE) {
> +			/* not poll any more in case of requeue */
> +			if (bio->bi_opf & REQ_POLLED)
> +				bio->bi_opf &= ~REQ_POLLED;
>  			return;
> +		}

Actually I'm not sure why REQ_POLLED should be cleared here.

Besides, there's one corner case where bio is submitted when dm device
is suspended.

```
static blk_qc_t dm_submit_bio(struct bio *bio)

	/* If suspended, queue this IO for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else if (bio->bi_opf & REQ_RAHEAD)
			bio_io_error(bio);
		else
			queue_io(md, bio);
		goto out;
	}
```

When the dm device is suspended with DMF_BLOCK_IO_FOR_SUSPEND, the
submitted bio is inserted into @md->deferred list by calling queue_io().
Then the following bio_poll() calling will trigger the
'WARN_ON_ONCE(!saved_bi_end_io)' in dm_poll_bio(), if the previously
submitted bio is still buffered in @md->deferred list.

```
submit_bio()
  dm_submit_bio
    queue_io

bio_poll
  dm_poll_bio
```


>  
>  		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
>  			/*
> @@ -1590,6 +1601,12 @@ static int __split_and_process_non_flush(struct clone_info *ci)
>  	if (__process_abnormal_io(ci, ti, &r))
>  		return r;
>  
> +	/*
> +	 * Only support bio polling for normal IO, and the target io is
> +	 * exactly inside the dm io instance
> +	 */
> +	ci->io->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
> +
>  	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
>  
>  	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
> @@ -1608,6 +1625,22 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
>  	ci->map = map;
>  	ci->io = alloc_io(md, bio);
>  	ci->sector = bio->bi_iter.bi_sector;
> +
> +	if (bio->bi_opf & REQ_POLLED) {
> +		INIT_HLIST_NODE(&ci->io->node);
> +
> +		/*
> +		 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
> +		 * for storing dm_io list
> +		 */
> +		if (bio->bi_opf & REQ_SAVED_END_IO) {
> +			ci->io->saved_bio_end_io = NULL;
> +		} else {
> +			ci->io->saved_bio_end_io = bio->bi_end_io;
> +			INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
> +			bio->bi_opf |= REQ_SAVED_END_IO;
> +		}
> +	}
>  }
>  
>  #define __dm_part_stat_sub(part, field, subnd)	\
> @@ -1666,8 +1699,19 @@ static void __split_and_process_bio(struct mapped_device *md,
>  		}
>  	}
>  
> -	/* drop the extra reference count */
> -	dec_pending(ci.io, errno_to_blk_status(error));
> +	/*
> +	 * Drop the extra reference count for non-POLLED bio, and hold one
> +	 * reference for POLLED bio, which will be released in dm_poll_bio
> +	 *
> +	 * Add every dm_io instance into the hlist_head which is stored in
> +	 * bio->bi_end_io, so that dm_poll_bio can poll them all.
> +	 */
> +	if (!ci.io->submit_as_polled)
> +		dec_pending(ci.io, errno_to_blk_status(error));
> +	else
> +		hlist_add_head(&ci.io->node,
> +				(struct hlist_head *)&bio->bi_end_io);
> +
>  }
>  
>  static void dm_submit_bio(struct bio *bio)
> @@ -1707,6 +1751,66 @@ static void dm_submit_bio(struct bio *bio)
>  	dm_put_live_table(md, srcu_idx);
>  }
>  
> +static int dm_poll_dm_io(struct dm_io *io, unsigned int flags)
> +{
> +	if (!io || !io->submit_as_polled)
> +		return 0;

Wondering if '!io->submit_as_polled' is necessary here. dm_io will be
added into the hash list only when 'io->submit_as_polled' is true in
__split_and_process_bio.


> +
> +	WARN_ON_ONCE(!io->tio.inside_dm_io);
> +	bio_poll(&io->tio.clone, flags);
> +
> +	/* bio_poll holds the last reference */
> +	if (atomic_read(&io->io_count) == 1)
> +		return 1;
> +	return 0;
> +}
> +
> +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> +{
> +	struct dm_io *io;
> +	void *saved_bi_end_io = NULL;
> +	struct hlist_head tmp = HLIST_HEAD_INIT;
> +	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
> +	struct hlist_node *next;
> +
> +	/*
> +	 * This bio can be submitted from FS as POLLED so that FS may keep
> +	 * polling even though the flag is cleared by bio splitting or
> +	 * requeue, so return immediately.
> +	 */
> +	if (!(bio->bi_opf & REQ_POLLED))
> +		return 0;
> +
> +	hlist_move_list(head, &tmp);
> +
> +	hlist_for_each_entry(io, &tmp, node) {
> +		if (io->saved_bio_end_io && !saved_bi_end_io) {

Seems that checking if saved_bi_end_io NULL here is not necessary, since
you will exit thte loop once saved_bi_end_io is assigned.



> +			saved_bi_end_io = io->saved_bio_end_io;
> +			break;
> +		}
> +	}
> +
> +	/* restore .bi_end_io before completing dm io */
> +	WARN_ON_ONCE(!saved_bi_end_io);

Abnormal bio flagged with REQ_POLLED (is this possible?) can still be
called with dm_poll_bio(), and will trigger this warning.


> +	bio->bi_end_io = saved_bi_end_io;
> +
> +	hlist_for_each_entry_safe(io, next, &tmp, node) {
> +		if (dm_poll_dm_io(io, flags)) {
> +			hlist_del_init(&io->node);
> +			dec_pending(io, 0);
> +		}
> +	}
> +
> +	/* Not done, make sure at least one dm_io stores the .bi_end_io*/
> +	if (!hlist_empty(&tmp)) {
> +		io = hlist_entry(tmp.first, struct dm_io, node);
> +		io->saved_bio_end_io = saved_bi_end_io;
> +		hlist_move_list(&tmp, head);
> +		return 0;> +	}
> +	return 1;
> +}
> +
>  /*-----------------------------------------------------------------
>   * An IDR is used to keep track of allocated minor numbers.
>   *---------------------------------------------------------------*/
> @@ -3121,6 +3225,7 @@ static const struct pr_ops dm_pr_ops = {
>  
>  static const struct block_device_operations dm_blk_dops = {
>  	.submit_bio = dm_submit_bio,
> +	.poll_bio = dm_poll_bio,
>  	.open = dm_blk_open,
>  	.release = dm_blk_close,
>  	.ioctl = dm_blk_ioctl,
>
Ming Lei June 18, 2021, 1:29 p.m. UTC | #3
Hello Jeffle,

On Fri, Jun 18, 2021 at 04:19:10PM +0800, JeffleXu wrote:
> 
> 
> On 6/17/21 6:35 PM, Ming Lei wrote:
> > Support bio(REQ_POLLED) polling in the following approach:
> > 
> > 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> > still fallback on IRQ mode, so the target io is exactly inside the dm
> > io.
> > 
> > 2) hold one refcnt on io->io_count after submitting this dm bio with
> > REQ_POLLED
> > 
> > 3) support dm native bio splitting, any dm io instance associated with
> > current bio will be added into one list which head is bio->bi_end_io
> > which will be recovered before ending this bio
> > 
> > 4) implement .poll_bio() callback, call bio_poll() on the single target
> > bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> > dec_pending() after the target io is done in .poll_bio()
> > 
> > 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> > which is based on Jeffle's previous patch.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  drivers/md/dm-table.c |  24 +++++++++
> >  drivers/md/dm.c       | 111 ++++++++++++++++++++++++++++++++++++++++--
> >  2 files changed, 132 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > index ee47a332b462..b14b379442d2 100644
> > --- a/drivers/md/dm-table.c
> > +++ b/drivers/md/dm-table.c
> > @@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
> >  	return &t->targets[(KEYS_PER_NODE * n) + k];
> >  }
> >  
> > +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
> > +				   sector_t start, sector_t len, void *data)
> > +{
> > +	return !blk_queue_poll(bdev_get_queue(dev->bdev));
> > +}
> > +
> >  /*
> >   * type->iterate_devices() should be called when the sanity check needs to
> >   * iterate and check all underlying data devices. iterate_devices() will
> > @@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
> >  	return 0;
> >  }
> >  
> > +static int dm_table_supports_poll(struct dm_table *t)
> > +{
> > +	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
> > +}
> > +
> >  /*
> >   * Check whether a table has no data devices attached using each
> >   * target's iterate_devices method.
> > @@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> >  
> >  	dm_update_keyslot_manager(q, t);
> >  	blk_queue_update_readahead(q);
> > +
> > +	/*
> > +	 * Check for request-based device is remained to
> > +	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> > +	 * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> > +	 * devices supporting polling.
> > +	 */
> > +	if (__table_type_bio_based(t->type)) {
> > +		if (dm_table_supports_poll(t))
> > +			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> > +		else
> > +			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> > +	}
> >  }
> >  
> >  unsigned int dm_table_get_num_targets(struct dm_table *t)
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index 363f12a285ce..9745c3deacc3 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> > @@ -39,6 +39,8 @@
> >  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
> >  #define DM_COOKIE_LENGTH 24
> >  
> > +#define REQ_SAVED_END_IO             REQ_DRV
> > +
> >  static const char *_name = DM_NAME;
> >  
> >  static unsigned int major = 0;
> > @@ -97,8 +99,11 @@ struct dm_io {
> >  	unsigned magic;
> >  	struct mapped_device *md;
> >  	blk_status_t status;
> > +	bool	submit_as_polled;
> >  	atomic_t io_count;
> >  	struct bio *orig_bio;
> > +	void	*saved_bio_end_io;
> > +	struct hlist_node  node;
> >  	unsigned long start_time;
> >  	spinlock_t endio_lock;
> >  	struct dm_stats_aux stats_aux;
> > @@ -687,6 +692,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
> >  	tio->ti = ti;
> >  	tio->target_bio_nr = target_bio_nr;
> >  
> > +	WARN_ON_ONCE(ci->io->submit_as_polled && !tio->inside_dm_io);
> > +
> >  	return tio;
> >  }
> >  
> > @@ -938,8 +945,12 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
> >  		end_io_acct(io);
> >  		free_io(md, io);
> >  
> > -		if (io_error == BLK_STS_DM_REQUEUE)
> > +		if (io_error == BLK_STS_DM_REQUEUE) {
> > +			/* not poll any more in case of requeue */
> > +			if (bio->bi_opf & REQ_POLLED)
> > +				bio->bi_opf &= ~REQ_POLLED;
> >  			return;
> > +		}
> 
> Actually I'm not sure why REQ_POLLED should be cleared here.

It this is one dm native split bio(the one returned from bio_split() in 
__split_and_process_bio()), we will never get chance to poll it after it
is requeued.

> 
> Besides, there's one corner case where bio is submitted when dm device
> is suspended.
> 
> ```
> static blk_qc_t dm_submit_bio(struct bio *bio)
> 
> 	/* If suspended, queue this IO for later */
> 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
> 		if (bio->bi_opf & REQ_NOWAIT)
> 			bio_wouldblock_error(bio);
> 		else if (bio->bi_opf & REQ_RAHEAD)
> 			bio_io_error(bio);
> 		else
> 			queue_io(md, bio);
> 		goto out;
> 	}
> ```
> 
> When the dm device is suspended with DMF_BLOCK_IO_FOR_SUSPEND, the
> submitted bio is inserted into @md->deferred list by calling queue_io().
> Then the following bio_poll() calling will trigger the
> 'WARN_ON_ONCE(!saved_bi_end_io)' in dm_poll_bio(), if the previously
> submitted bio is still buffered in @md->deferred list.

Good catch, bio_poll() shouldn't call on bio which isn't submitted
really, we can clear REQ_POLLED for avoiding the race. 

> 
> ```
> submit_bio()
>   dm_submit_bio
>     queue_io
> 
> bio_poll
>   dm_poll_bio
> ```
> 
> 
> >  
> >  		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
> >  			/*
> > @@ -1590,6 +1601,12 @@ static int __split_and_process_non_flush(struct clone_info *ci)
> >  	if (__process_abnormal_io(ci, ti, &r))
> >  		return r;
> >  
> > +	/*
> > +	 * Only support bio polling for normal IO, and the target io is
> > +	 * exactly inside the dm io instance
> > +	 */
> > +	ci->io->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
> > +
> >  	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
> >  
> >  	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
> > @@ -1608,6 +1625,22 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
> >  	ci->map = map;
> >  	ci->io = alloc_io(md, bio);
> >  	ci->sector = bio->bi_iter.bi_sector;
> > +
> > +	if (bio->bi_opf & REQ_POLLED) {
> > +		INIT_HLIST_NODE(&ci->io->node);
> > +
> > +		/*
> > +		 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
> > +		 * for storing dm_io list
> > +		 */
> > +		if (bio->bi_opf & REQ_SAVED_END_IO) {
> > +			ci->io->saved_bio_end_io = NULL;
> > +		} else {
> > +			ci->io->saved_bio_end_io = bio->bi_end_io;
> > +			INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
> > +			bio->bi_opf |= REQ_SAVED_END_IO;
> > +		}
> > +	}
> >  }
> >  
> >  #define __dm_part_stat_sub(part, field, subnd)	\
> > @@ -1666,8 +1699,19 @@ static void __split_and_process_bio(struct mapped_device *md,
> >  		}
> >  	}
> >  
> > -	/* drop the extra reference count */
> > -	dec_pending(ci.io, errno_to_blk_status(error));
> > +	/*
> > +	 * Drop the extra reference count for non-POLLED bio, and hold one
> > +	 * reference for POLLED bio, which will be released in dm_poll_bio
> > +	 *
> > +	 * Add every dm_io instance into the hlist_head which is stored in
> > +	 * bio->bi_end_io, so that dm_poll_bio can poll them all.
> > +	 */
> > +	if (!ci.io->submit_as_polled)
> > +		dec_pending(ci.io, errno_to_blk_status(error));
> > +	else
> > +		hlist_add_head(&ci.io->node,
> > +				(struct hlist_head *)&bio->bi_end_io);
> > +
> >  }
> >  
> >  static void dm_submit_bio(struct bio *bio)
> > @@ -1707,6 +1751,66 @@ static void dm_submit_bio(struct bio *bio)
> >  	dm_put_live_table(md, srcu_idx);
> >  }
> >  
> > +static int dm_poll_dm_io(struct dm_io *io, unsigned int flags)
> > +{
> > +	if (!io || !io->submit_as_polled)
> > +		return 0;
> 
> Wondering if '!io->submit_as_polled' is necessary here. dm_io will be
> added into the hash list only when 'io->submit_as_polled' is true in
> __split_and_process_bio.

OK, we can kill the above check.

> 
> 
> > +
> > +	WARN_ON_ONCE(!io->tio.inside_dm_io);
> > +	bio_poll(&io->tio.clone, flags);
> > +
> > +	/* bio_poll holds the last reference */
> > +	if (atomic_read(&io->io_count) == 1)
> > +		return 1;
> > +	return 0;
> > +}
> > +
> > +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> > +{
> > +	struct dm_io *io;
> > +	void *saved_bi_end_io = NULL;
> > +	struct hlist_head tmp = HLIST_HEAD_INIT;
> > +	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
> > +	struct hlist_node *next;
> > +
> > +	/*
> > +	 * This bio can be submitted from FS as POLLED so that FS may keep
> > +	 * polling even though the flag is cleared by bio splitting or
> > +	 * requeue, so return immediately.
> > +	 */
> > +	if (!(bio->bi_opf & REQ_POLLED))
> > +		return 0;
> > +
> > +	hlist_move_list(head, &tmp);
> > +
> > +	hlist_for_each_entry(io, &tmp, node) {
> > +		if (io->saved_bio_end_io && !saved_bi_end_io) {
> 
> Seems that checking if saved_bi_end_io NULL here is not necessary, since
> you will exit thte loop once saved_bi_end_io is assigned.

Yeah, indeed.

> 
> 
> 
> > +			saved_bi_end_io = io->saved_bio_end_io;
> > +			break;
> > +		}
> > +	}
> > +
> > +	/* restore .bi_end_io before completing dm io */
> > +	WARN_ON_ONCE(!saved_bi_end_io);
> 
> Abnormal bio flagged with REQ_POLLED (is this possible?) can still be
> called with dm_poll_bio(), and will trigger this warning.

OK, I think we need to return 0 from dm_poll_bio() immediately if
the hlist stored in bio->bi_end_io is empty.


Thanks, 
Ming
Ming Lei June 18, 2021, 2:39 p.m. UTC | #4
From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 16 Jun 2021 16:13:46 +0800
Subject: [RFC PATCH V3 3/3] dm: support bio polling

Support bio(REQ_POLLED) polling in the following approach:

1) only support io polling on normal READ/WRITE, and other abnormal IOs
still fallback on IRQ mode, so the target io is exactly inside the dm
io.

2) hold one refcnt on io->io_count after submitting this dm bio with
REQ_POLLED

3) support dm native bio splitting, any dm io instance associated with
current bio will be added into one list which head is bio->bi_end_io
which will be recovered before ending this bio

4) implement .poll_bio() callback, call bio_poll() on the single target
bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
dec_pending() after the target io is done in .poll_bio()

4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
which is based on Jeffle's previous patch.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
V3:
	- covers all comments from Jeffle
	- fix corner cases when polling on abnormal ios

 drivers/md/dm-table.c |  24 ++++++++
 drivers/md/dm.c       | 127 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ee47a332b462..b14b379442d2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
+				   sector_t start, sector_t len, void *data)
+{
+	return !blk_queue_poll(bdev_get_queue(dev->bdev));
+}
+
 /*
  * type->iterate_devices() should be called when the sanity check needs to
  * iterate and check all underlying data devices. iterate_devices() will
@@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
 	return 0;
 }
 
+static int dm_table_supports_poll(struct dm_table *t)
+{
+	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
+}
+
 /*
  * Check whether a table has no data devices attached using each
  * target's iterate_devices method.
@@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
 	dm_update_keyslot_manager(q, t);
 	blk_queue_update_readahead(q);
+
+	/*
+	 * Check for request-based device is remained to
+	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
+	 * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
+	 * devices supporting polling.
+	 */
+	if (__table_type_bio_based(t->type)) {
+		if (dm_table_supports_poll(t))
+			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+		else
+			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+	}
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 363f12a285ce..df4a6a999014 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -39,6 +39,8 @@
 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
 #define DM_COOKIE_LENGTH 24
 
+#define REQ_SAVED_END_IO             REQ_DRV
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -72,6 +74,7 @@ struct clone_info {
 	struct dm_io *io;
 	sector_t sector;
 	unsigned sector_count;
+	bool	submit_as_polled;
 };
 
 /*
@@ -99,6 +102,8 @@ struct dm_io {
 	blk_status_t status;
 	atomic_t io_count;
 	struct bio *orig_bio;
+	void	*saved_bio_end_io;
+	struct hlist_node  node;
 	unsigned long start_time;
 	spinlock_t endio_lock;
 	struct dm_stats_aux stats_aux;
@@ -687,6 +692,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
 	tio->ti = ti;
 	tio->target_bio_nr = target_bio_nr;
 
+	WARN_ON_ONCE(ci->submit_as_polled && !tio->inside_dm_io);
+
 	return tio;
 }
 
@@ -938,8 +945,14 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
 		end_io_acct(io);
 		free_io(md, io);
 
-		if (io_error == BLK_STS_DM_REQUEUE)
+		if (io_error == BLK_STS_DM_REQUEUE) {
+			/*
+			 * Upper layer won't help us poll split bio, so
+			 * clear REQ_POLLED in case of requeue
+			 */
+			bio->bi_opf &= ~REQ_POLLED;
 			return;
+		}
 
 		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 			/*
@@ -1574,6 +1587,32 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 	return true;
 }
 
+static void dm_setup_polled_io(struct clone_info *ci)
+{
+	struct bio *bio = ci->bio;
+
+	/*
+	 * Only support bio polling for normal IO, and the target io is
+	 * exactly inside the dm io instance
+	 */
+	ci->submit_as_polled = !!(bio->bi_opf & REQ_POLLED);
+	if (!ci->submit_as_polled)
+		return;
+
+	INIT_HLIST_NODE(&ci->io->node);
+	/*
+	 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
+	 * for storing dm_io list
+	 */
+	if (bio->bi_opf & REQ_SAVED_END_IO) {
+		ci->io->saved_bio_end_io = NULL;
+	} else {
+		ci->io->saved_bio_end_io = bio->bi_end_io;
+		INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
+		bio->bi_opf |= REQ_SAVED_END_IO;
+	}
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1590,6 +1629,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 	if (__process_abnormal_io(ci, ti, &r))
 		return r;
 
+	dm_setup_polled_io(ci);
+
 	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
 
 	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
@@ -1666,8 +1707,18 @@ static void __split_and_process_bio(struct mapped_device *md,
 		}
 	}
 
-	/* drop the extra reference count */
-	dec_pending(ci.io, errno_to_blk_status(error));
+	/*
+	 * Drop the extra reference count for non-POLLED bio, and hold one
+	 * reference for POLLED bio, which will be released in dm_poll_bio
+	 *
+	 * Add every dm_io instance into the hlist_head which is stored in
+	 * bio->bi_end_io, so that dm_poll_bio can poll them all.
+	 */
+	if (!ci.submit_as_polled)
+		dec_pending(ci.io, errno_to_blk_status(error));
+	else
+		hlist_add_head(&ci.io->node,
+				(struct hlist_head *)&bio->bi_end_io);
 }
 
 static void dm_submit_bio(struct bio *bio)
@@ -1690,8 +1741,11 @@ static void dm_submit_bio(struct bio *bio)
 			bio_wouldblock_error(bio);
 		else if (bio->bi_opf & REQ_RAHEAD)
 			bio_io_error(bio);
-		else
+		else {
+			/* Not ready for poll */
+			bio->bi_opf &= ~REQ_POLLED;
 			queue_io(md, bio);
+		}
 		goto out;
 	}
 
@@ -1707,6 +1761,70 @@ static void dm_submit_bio(struct bio *bio)
 	dm_put_live_table(md, srcu_idx);
 }
 
+static bool dm_poll_dm_io(struct dm_io *io, unsigned int flags)
+{
+	WARN_ON_ONCE(!io->tio.inside_dm_io);
+
+	bio_poll(&io->tio.clone, flags);
+
+	/* bio_poll holds the last reference */
+	return atomic_read(&io->io_count) == 1;
+}
+
+static int dm_poll_bio(struct bio *bio, unsigned int flags)
+{
+	struct dm_io *io;
+	void *saved_bi_end_io = NULL;
+	struct hlist_head tmp = HLIST_HEAD_INIT;
+	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
+	struct hlist_node *next;
+
+	/*
+	 * This bio can be submitted from FS as POLLED so that FS may keep
+	 * polling even though the flag is cleared by bio splitting or
+	 * requeue, so return immediately.
+	 */
+	if (!(bio->bi_opf & REQ_POLLED))
+		return 0;
+
+	/* We only poll normal bio which was marked as REQ_SAVED_END_IO */
+	if (!(bio->bi_opf & REQ_SAVED_END_IO))
+		return 0;
+
+	WARN_ON_ONCE(hlist_empty(head));
+
+	hlist_move_list(head, &tmp);
+
+	hlist_for_each_entry(io, &tmp, node) {
+		if (io->saved_bio_end_io) {
+			saved_bi_end_io = io->saved_bio_end_io;
+			break;
+		}
+	}
+
+	/* restore .bi_end_io before completing dm io */
+	WARN_ON_ONCE(!saved_bi_end_io);
+	bio->bi_opf &= ~REQ_SAVED_END_IO;
+	bio->bi_end_io = saved_bi_end_io;
+
+	hlist_for_each_entry_safe(io, next, &tmp, node) {
+		if (dm_poll_dm_io(io, flags)) {
+			hlist_del_init(&io->node);
+			dec_pending(io, 0);
+		}
+	}
+
+	/* Not done, make sure at least one dm_io stores the .bi_end_io*/
+	if (!hlist_empty(&tmp)) {
+		io = hlist_entry(tmp.first, struct dm_io, node);
+		io->saved_bio_end_io = saved_bi_end_io;
+		bio->bi_opf |= REQ_SAVED_END_IO;
+		hlist_move_list(&tmp, head);
+		return 0;
+	}
+	return 1;
+}
+
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
@@ -3121,6 +3239,7 @@ static const struct pr_ops dm_pr_ops = {
 
 static const struct block_device_operations dm_blk_dops = {
 	.submit_bio = dm_submit_bio,
+	.poll_bio = dm_poll_bio,
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,
Mike Snitzer June 18, 2021, 8:56 p.m. UTC | #5
[you really should've changed the subject of this email to
"[RFC PATCH V3 3/3] dm: support bio polling"]

On Fri, Jun 18 2021 at 10:39P -0400,
Ming Lei <ming.lei@redhat.com> wrote:

> From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
> From: Ming Lei <ming.lei@redhat.com>
> Date: Wed, 16 Jun 2021 16:13:46 +0800
> Subject: [RFC PATCH V3 3/3] dm: support bio polling
> 
> Support bio(REQ_POLLED) polling in the following approach:
> 
> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> still fallback on IRQ mode, so the target io is exactly inside the dm
> io.
> 
> 2) hold one refcnt on io->io_count after submitting this dm bio with
> REQ_POLLED
> 
> 3) support dm native bio splitting, any dm io instance associated with
> current bio will be added into one list which head is bio->bi_end_io
> which will be recovered before ending this bio
> 
> 4) implement .poll_bio() callback, call bio_poll() on the single target
> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> dec_pending() after the target io is done in .poll_bio()
> 
> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> which is based on Jeffle's previous patch.

^ nit: two "4)", last should be 5.

> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> V3:
> 	- covers all comments from Jeffle

Would really appreciate it if Jeffle could test these changes like he
did previous dm IO polling patchsets he implemented.  Jeffle?

> 	- fix corner cases when polling on abnormal ios
> 
>  drivers/md/dm-table.c |  24 ++++++++
>  drivers/md/dm.c       | 127 ++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 147 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index ee47a332b462..b14b379442d2 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
>  	return &t->targets[(KEYS_PER_NODE * n) + k];
>  }
>  
> +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
> +				   sector_t start, sector_t len, void *data)
> +{
> +	return !blk_queue_poll(bdev_get_queue(dev->bdev));
> +}
> +
>  /*
>   * type->iterate_devices() should be called when the sanity check needs to
>   * iterate and check all underlying data devices. iterate_devices() will
> @@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
>  	return 0;
>  }
>  
> +static int dm_table_supports_poll(struct dm_table *t)
> +{
> +	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
> +}
> +
>  /*
>   * Check whether a table has no data devices attached using each
>   * target's iterate_devices method.
> @@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
>  
>  	dm_update_keyslot_manager(q, t);
>  	blk_queue_update_readahead(q);
> +
> +	/*
> +	 * Check for request-based device is remained to
> +	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> +	 * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> +	 * devices supporting polling.
> +	 */
> +	if (__table_type_bio_based(t->type)) {
> +		if (dm_table_supports_poll(t))
> +			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> +		else
> +			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> +	}
>  }
>  
>  unsigned int dm_table_get_num_targets(struct dm_table *t)
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 363f12a285ce..df4a6a999014 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -39,6 +39,8 @@
>  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
>  #define DM_COOKIE_LENGTH 24
>  
> +#define REQ_SAVED_END_IO             REQ_DRV
> +
>  static const char *_name = DM_NAME;
>  
>  static unsigned int major = 0;
> @@ -72,6 +74,7 @@ struct clone_info {
>  	struct dm_io *io;
>  	sector_t sector;
>  	unsigned sector_count;
> +	bool	submit_as_polled;
>  };
>  
>  /*
> @@ -99,6 +102,8 @@ struct dm_io {
>  	blk_status_t status;
>  	atomic_t io_count;
>  	struct bio *orig_bio;
> +	void	*saved_bio_end_io;
> +	struct hlist_node  node;
>  	unsigned long start_time;
>  	spinlock_t endio_lock;
>  	struct dm_stats_aux stats_aux;

I'd need to check these changes with pahole, but have you looked
closely at whether these new members would be better placed (e.g. is
there an existing hole? does the new 'struct hlist_node' span a
cacheline? etc).

Also, a zoned dm-5.14 change moved this struct out to
drivers/md/dm-core.h, see:
https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-5.14&id=e2118b3c3d94289852417f70ec128c25f4833aad

So seems no matter what we'll have a merge conflict.  But that's OK,
I'll just let Linus know when I send the linux-dm.git pull request for
the 5.14 merge window (assuming hch's bio polling changes land in time
with Jens).

> @@ -687,6 +692,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
>  	tio->ti = ti;
>  	tio->target_bio_nr = target_bio_nr;
>  
> +	WARN_ON_ONCE(ci->submit_as_polled && !tio->inside_dm_io);
> +
>  	return tio;
>  }
>  
> @@ -938,8 +945,14 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
>  		end_io_acct(io);
>  		free_io(md, io);
>  
> -		if (io_error == BLK_STS_DM_REQUEUE)
> +		if (io_error == BLK_STS_DM_REQUEUE) {
> +			/*
> +			 * Upper layer won't help us poll split bio, so
> +			 * clear REQ_POLLED in case of requeue
> +			 */

This comment isn't very clear. Meaning block core cannot handle
preserving old bio (which is poll cookie) across requeue?

> +			bio->bi_opf &= ~REQ_POLLED;
>  			return;
> +		}
>  
>  		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
>  			/*
> @@ -1574,6 +1587,32 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
>  	return true;
>  }
>  
> +static void dm_setup_polled_io(struct clone_info *ci)
> +{
> +	struct bio *bio = ci->bio;
> +
> +	/*
> +	 * Only support bio polling for normal IO, and the target io is
> +	 * exactly inside the dm io instance
> +	 */

This comment should be clearer with:

> +	/*
> +	 * Only support bio polling for normal IO that also
> +	 * hasn't been split or cloned further.
> +	 */

That way DM's use of 'inside_dm_io' flag isn't a factor.
(it just so happens 'inside_dm_io' reflects IO is first DM clone bio).

But wouldn't early return if !ci->io->tio.inside_dm_io also make sense
here?  That way you could get rid of the WARN_ON_ONCE in dm_poll_dm_io?

> +	ci->submit_as_polled = !!(bio->bi_opf & REQ_POLLED);
> +	if (!ci->submit_as_polled)
> +		return;
> +
> +	INIT_HLIST_NODE(&ci->io->node);
> +	/*
> +	 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
> +	 * for storing dm_io list
> +	 */
> +	if (bio->bi_opf & REQ_SAVED_END_IO) {
> +		ci->io->saved_bio_end_io = NULL;
> +	} else {
> +		ci->io->saved_bio_end_io = bio->bi_end_io;
> +		INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
> +		bio->bi_opf |= REQ_SAVED_END_IO;
> +	}
> +}
> +
>  /*
>   * Select the correct strategy for processing a non-flush bio.
>   */
> @@ -1590,6 +1629,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
>  	if (__process_abnormal_io(ci, ti, &r))
>  		return r;
>  
> +	dm_setup_polled_io(ci);
> +
>  	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
>  
>  	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
> @@ -1666,8 +1707,18 @@ static void __split_and_process_bio(struct mapped_device *md,
>  		}
>  	}
>  
> -	/* drop the extra reference count */
> -	dec_pending(ci.io, errno_to_blk_status(error));
> +	/*
> +	 * Drop the extra reference count for non-POLLED bio, and hold one
> +	 * reference for POLLED bio, which will be released in dm_poll_bio
> +	 *
> +	 * Add every dm_io instance into the hlist_head which is stored in
> +	 * bio->bi_end_io, so that dm_poll_bio can poll them all.
> +	 */
> +	if (!ci.submit_as_polled)
> +		dec_pending(ci.io, errno_to_blk_status(error));
> +	else
> +		hlist_add_head(&ci.io->node,
> +				(struct hlist_head *)&bio->bi_end_io);
>  }
>  
>  static void dm_submit_bio(struct bio *bio)
> @@ -1690,8 +1741,11 @@ static void dm_submit_bio(struct bio *bio)
>  			bio_wouldblock_error(bio);
>  		else if (bio->bi_opf & REQ_RAHEAD)
>  			bio_io_error(bio);
> -		else
> +		else {
> +			/* Not ready for poll */
> +			bio->bi_opf &= ~REQ_POLLED;
>  			queue_io(md, bio);
> +		}

"Not ready for poll" isn't really a useful comment.  Maybe?:
/* Cannot support polling once IO is queued */

But should you just bio_wouldblock_error() the IO if REQ_POLLED set?
Same as done for REQ_NOWAIT?

(thought I've seen a comparable response to REQ_POLLED before but not
finding one now that I look, maybe it was in Jeffle's earlier patchset?).

But all said, I'm missing why this particular instance of queue_io()
is a problem relative to polling.  The bio will just block waiting to
be processed, if blocking is a problem then it really does seem like
bio_wouldblock_error() is appropriate.

And what about dm_io_dec_pending() should its use of queue_io() also
clear REQ_POLLED (so push clearing REQ_POLLED into queue_io?)?
Or is flush-with-data (via REQ_PREFLUSH) and REQ_POLLING mutually
exclussive?

>  		goto out;
>  	}
>  
> @@ -1707,6 +1761,70 @@ static void dm_submit_bio(struct bio *bio)
>  	dm_put_live_table(md, srcu_idx);
>  }
>  
> +static bool dm_poll_dm_io(struct dm_io *io, unsigned int flags)
> +{
> +	WARN_ON_ONCE(!io->tio.inside_dm_io);
> +
> +	bio_poll(&io->tio.clone, flags);
> +
> +	/* bio_poll holds the last reference */
> +	return atomic_read(&io->io_count) == 1;
> +}
> +
> +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> +{
> +	struct dm_io *io;
> +	void *saved_bi_end_io = NULL;
> +	struct hlist_head tmp = HLIST_HEAD_INIT;
> +	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
> +	struct hlist_node *next;
> +
> +	/*
> +	 * This bio can be submitted from FS as POLLED so that FS may keep
> +	 * polling even though the flag is cleared by bio splitting or
> +	 * requeue, so return immediately.
> +	 */
> +	if (!(bio->bi_opf & REQ_POLLED))
> +		return 0;
> +
> +	/* We only poll normal bio which was marked as REQ_SAVED_END_IO */
> +	if (!(bio->bi_opf & REQ_SAVED_END_IO))
> +		return 0;
> +
> +	WARN_ON_ONCE(hlist_empty(head));
> +
> +	hlist_move_list(head, &tmp);
> +
> +	hlist_for_each_entry(io, &tmp, node) {
> +		if (io->saved_bio_end_io) {
> +			saved_bi_end_io = io->saved_bio_end_io;
> +			break;
> +		}
> +	}
> +
> +	/* restore .bi_end_io before completing dm io */
> +	WARN_ON_ONCE(!saved_bi_end_io);
> +	bio->bi_opf &= ~REQ_SAVED_END_IO;
> +	bio->bi_end_io = saved_bi_end_io;
> +
> +	hlist_for_each_entry_safe(io, next, &tmp, node) {
> +		if (dm_poll_dm_io(io, flags)) {
> +			hlist_del_init(&io->node);
> +			dec_pending(io, 0);
> +		}
> +	}
> +
> +	/* Not done, make sure at least one dm_io stores the .bi_end_io*/
> +	if (!hlist_empty(&tmp)) {
> +		io = hlist_entry(tmp.first, struct dm_io, node);
> +		io->saved_bio_end_io = saved_bi_end_io;
> +		bio->bi_opf |= REQ_SAVED_END_IO;
> +		hlist_move_list(&tmp, head);
> +		return 0;
> +	}
> +	return 1;
> +}
> +
>  /*-----------------------------------------------------------------
>   * An IDR is used to keep track of allocated minor numbers.
>   *---------------------------------------------------------------*/
> @@ -3121,6 +3239,7 @@ static const struct pr_ops dm_pr_ops = {
>  
>  static const struct block_device_operations dm_blk_dops = {
>  	.submit_bio = dm_submit_bio,
> +	.poll_bio = dm_poll_bio,
>  	.open = dm_blk_open,
>  	.release = dm_blk_close,
>  	.ioctl = dm_blk_ioctl,
> -- 
> 2.31.1
>
Ming Lei June 19, 2021, 12:27 a.m. UTC | #6
On Fri, Jun 18, 2021 at 04:56:45PM -0400, Mike Snitzer wrote:
> [you really should've changed the subject of this email to
> "[RFC PATCH V3 3/3] dm: support bio polling"]
> 
> On Fri, Jun 18 2021 at 10:39P -0400,
> Ming Lei <ming.lei@redhat.com> wrote:
> 
> > From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
> > From: Ming Lei <ming.lei@redhat.com>
> > Date: Wed, 16 Jun 2021 16:13:46 +0800
> > Subject: [RFC PATCH V3 3/3] dm: support bio polling
> > 
> > Support bio(REQ_POLLED) polling in the following approach:
> > 
> > 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> > still fallback on IRQ mode, so the target io is exactly inside the dm
> > io.
> > 
> > 2) hold one refcnt on io->io_count after submitting this dm bio with
> > REQ_POLLED
> > 
> > 3) support dm native bio splitting, any dm io instance associated with
> > current bio will be added into one list which head is bio->bi_end_io
> > which will be recovered before ending this bio
> > 
> > 4) implement .poll_bio() callback, call bio_poll() on the single target
> > bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> > dec_pending() after the target io is done in .poll_bio()
> > 
> > 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> > which is based on Jeffle's previous patch.
> 
> ^ nit: two "4)", last should be 5.
> 
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > V3:
> > 	- covers all comments from Jeffle
> 
> Would really appreciate it if Jeffle could test these changes like he
> did previous dm IO polling patchsets he implemented.  Jeffle?

Yeah, I am looking forward to Jeffle's test too, :-)

> 
> > 	- fix corner cases when polling on abnormal ios
> > 
> >  drivers/md/dm-table.c |  24 ++++++++
> >  drivers/md/dm.c       | 127 ++++++++++++++++++++++++++++++++++++++++--
> >  2 files changed, 147 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > index ee47a332b462..b14b379442d2 100644
> > --- a/drivers/md/dm-table.c
> > +++ b/drivers/md/dm-table.c
> > @@ -1491,6 +1491,12 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
> >  	return &t->targets[(KEYS_PER_NODE * n) + k];
> >  }
> >  
> > +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
> > +				   sector_t start, sector_t len, void *data)
> > +{
> > +	return !blk_queue_poll(bdev_get_queue(dev->bdev));
> > +}
> > +
> >  /*
> >   * type->iterate_devices() should be called when the sanity check needs to
> >   * iterate and check all underlying data devices. iterate_devices() will
> > @@ -1541,6 +1547,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
> >  	return 0;
> >  }
> >  
> > +static int dm_table_supports_poll(struct dm_table *t)
> > +{
> > +	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
> > +}
> > +
> >  /*
> >   * Check whether a table has no data devices attached using each
> >   * target's iterate_devices method.
> > @@ -2078,6 +2089,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> >  
> >  	dm_update_keyslot_manager(q, t);
> >  	blk_queue_update_readahead(q);
> > +
> > +	/*
> > +	 * Check for request-based device is remained to
> > +	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> > +	 * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> > +	 * devices supporting polling.
> > +	 */
> > +	if (__table_type_bio_based(t->type)) {
> > +		if (dm_table_supports_poll(t))
> > +			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> > +		else
> > +			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> > +	}
> >  }
> >  
> >  unsigned int dm_table_get_num_targets(struct dm_table *t)
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index 363f12a285ce..df4a6a999014 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> > @@ -39,6 +39,8 @@
> >  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
> >  #define DM_COOKIE_LENGTH 24
> >  
> > +#define REQ_SAVED_END_IO             REQ_DRV
> > +
> >  static const char *_name = DM_NAME;
> >  
> >  static unsigned int major = 0;
> > @@ -72,6 +74,7 @@ struct clone_info {
> >  	struct dm_io *io;
> >  	sector_t sector;
> >  	unsigned sector_count;
> > +	bool	submit_as_polled;
> >  };
> >  
> >  /*
> > @@ -99,6 +102,8 @@ struct dm_io {
> >  	blk_status_t status;
> >  	atomic_t io_count;
> >  	struct bio *orig_bio;
> > +	void	*saved_bio_end_io;
> > +	struct hlist_node  node;
> >  	unsigned long start_time;
> >  	spinlock_t endio_lock;
> >  	struct dm_stats_aux stats_aux;
> 
> I'd need to check these changes with pahole, but have you looked
> closely at whether these new members would be better placed (e.g. is
> there an existing hole? does the new 'struct hlist_node' span a
> cacheline? etc).

'hlist_node' won't span a cacheline, and there isn't hole in 'dm_io'
available for the new two fields too.

But looks we have to add one extra cacheline for holding the two new
fields. Originally all fields except for the last one(dm_target_io)
can be held in single cacheline.

> 
> Also, a zoned dm-5.14 change moved this struct out to
> drivers/md/dm-core.h, see:
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-5.14&id=e2118b3c3d94289852417f70ec128c25f4833aad
> 
> So seems no matter what we'll have a merge conflict.  But that's OK,
> I'll just let Linus know when I send the linux-dm.git pull request for
> the 5.14 merge window (assuming hch's bio polling changes land in time
> with Jens).
> 
> > @@ -687,6 +692,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
> >  	tio->ti = ti;
> >  	tio->target_bio_nr = target_bio_nr;
> >  
> > +	WARN_ON_ONCE(ci->submit_as_polled && !tio->inside_dm_io);
> > +
> >  	return tio;
> >  }
> >  
> > @@ -938,8 +945,14 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
> >  		end_io_acct(io);
> >  		free_io(md, io);
> >  
> > -		if (io_error == BLK_STS_DM_REQUEUE)
> > +		if (io_error == BLK_STS_DM_REQUEUE) {
> > +			/*
> > +			 * Upper layer won't help us poll split bio, so
> > +			 * clear REQ_POLLED in case of requeue
> > +			 */
> 
> This comment isn't very clear. Meaning block core cannot handle
> preserving old bio (which is poll cookie) across requeue?

bio->bi_cookie isn't used by dm queue yet, and we simply clear
POLLED if it isn't submitted directly or needs requeue. So in future,
bio->bi_cookie can be reused for other purpose.

The upper layer(FS) can only call bio_poll() on the bio which is submitted
via submit_bio() from upper layer code. And bio_poll() won't be called on
split bio originated from block layer or dm driver.

The above commit means that this bio may be split bio from bio_split()
in __split_and_process_bio(). And if it is requeued, we will handle it
in 'IRQ' mode, so it can be completed.

> 
> > +			bio->bi_opf &= ~REQ_POLLED;
> >  			return;
> > +		}
> >  
> >  		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
> >  			/*
> > @@ -1574,6 +1587,32 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
> >  	return true;
> >  }
> >  
> > +static void dm_setup_polled_io(struct clone_info *ci)
> > +{
> > +	struct bio *bio = ci->bio;
> > +
> > +	/*
> > +	 * Only support bio polling for normal IO, and the target io is
> > +	 * exactly inside the dm io instance
> > +	 */
> 
> This comment should be clearer with:
> 
> > +	/*
> > +	 * Only support bio polling for normal IO that also
> > +	 * hasn't been split or cloned further.
> > +	 */

This bio could have been split in __split_and_process_bio() already and
re-submitted via submit_bio_noacct(), and we still can support poll on it
because it is always the bio originated from FS layer. All split bios(dm io)
from this bio are added into hlist which head is stored in this FS bio's bi_end_io().

> 
> That way DM's use of 'inside_dm_io' flag isn't a factor.
> (it just so happens 'inside_dm_io' reflects IO is first DM clone bio).
> 
> But wouldn't early return if !ci->io->tio.inside_dm_io also make sense
> here?  That way you could get rid of the WARN_ON_ONCE in dm_poll_dm_io?

inside_dm_io is always true here, and it will be updated in alloc_tio()
which isn't run yet.

> 
> > +	ci->submit_as_polled = !!(bio->bi_opf & REQ_POLLED);
> > +	if (!ci->submit_as_polled)
> > +		return;
> > +
> > +	INIT_HLIST_NODE(&ci->io->node);
> > +	/*
> > +	 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
> > +	 * for storing dm_io list
> > +	 */
> > +	if (bio->bi_opf & REQ_SAVED_END_IO) {
> > +		ci->io->saved_bio_end_io = NULL;
> > +	} else {
> > +		ci->io->saved_bio_end_io = bio->bi_end_io;
> > +		INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
> > +		bio->bi_opf |= REQ_SAVED_END_IO;
> > +	}
> > +}
> > +
> >  /*
> >   * Select the correct strategy for processing a non-flush bio.
> >   */
> > @@ -1590,6 +1629,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
> >  	if (__process_abnormal_io(ci, ti, &r))
> >  		return r;
> >  
> > +	dm_setup_polled_io(ci);
> > +
> >  	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
> >  
> >  	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
> > @@ -1666,8 +1707,18 @@ static void __split_and_process_bio(struct mapped_device *md,
> >  		}
> >  	}
> >  
> > -	/* drop the extra reference count */
> > -	dec_pending(ci.io, errno_to_blk_status(error));
> > +	/*
> > +	 * Drop the extra reference count for non-POLLED bio, and hold one
> > +	 * reference for POLLED bio, which will be released in dm_poll_bio
> > +	 *
> > +	 * Add every dm_io instance into the hlist_head which is stored in
> > +	 * bio->bi_end_io, so that dm_poll_bio can poll them all.
> > +	 */
> > +	if (!ci.submit_as_polled)
> > +		dec_pending(ci.io, errno_to_blk_status(error));
> > +	else
> > +		hlist_add_head(&ci.io->node,
> > +				(struct hlist_head *)&bio->bi_end_io);
> >  }
> >  
> >  static void dm_submit_bio(struct bio *bio)
> > @@ -1690,8 +1741,11 @@ static void dm_submit_bio(struct bio *bio)
> >  			bio_wouldblock_error(bio);
> >  		else if (bio->bi_opf & REQ_RAHEAD)
> >  			bio_io_error(bio);
> > -		else
> > +		else {
> > +			/* Not ready for poll */
> > +			bio->bi_opf &= ~REQ_POLLED;
> >  			queue_io(md, bio);
> > +		}
> 
> "Not ready for poll" isn't really a useful comment.  Maybe?:
> /* Cannot support polling once IO is queued */

We setup dm bio polling mechanism only after __split_and_process_bio()
is called, so 'not ready for poll'.

> 
> But should you just bio_wouldblock_error() the IO if REQ_POLLED set?
> Same as done for REQ_NOWAIT?

I guess we can't, because the two flags are independent.
> 
> (thought I've seen a comparable response to REQ_POLLED before but not
> finding one now that I look, maybe it was in Jeffle's earlier patchset?).
> 
> But all said, I'm missing why this particular instance of queue_io()
> is a problem relative to polling.  The bio will just block waiting to
> be processed, if blocking is a problem then it really does seem like
> bio_wouldblock_error() is appropriate.

It was cleared because I thought we can't call dm_poll_bio() for this
bio queued via queue_io() here, because we don't call dm_setup_polled_io
yet.

But looks we needn't to clear REQ_POLLED here, since REQ_SAVED_END_IO
isn't set for this bio, and it won't be polled in dm_poll_bio()
really until they are submitted finally.

> 
> And what about dm_io_dec_pending() should its use of queue_io() also
> clear REQ_POLLED (so push clearing REQ_POLLED into queue_io?)?
> Or is flush-with-data (via REQ_PREFLUSH) and REQ_POLLING mutually
> exclussive?

We only poll normal bio, and REQ_SAVED_END_IO isn't set for flush &
other abnormal bios, so they won't be polled via dm_poll_bio() really
and still rely on underlying's interrupt handler to complete.


Thanks, 
Ming
Jeffle Xu June 21, 2021, 1:32 a.m. UTC | #7
On 6/19/21 4:56 AM, Mike Snitzer wrote:
> [you really should've changed the subject of this email to
> "[RFC PATCH V3 3/3] dm: support bio polling"]
> 
> On Fri, Jun 18 2021 at 10:39P -0400,
> Ming Lei <ming.lei@redhat.com> wrote:
> 
>> From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
>> From: Ming Lei <ming.lei@redhat.com>
>> Date: Wed, 16 Jun 2021 16:13:46 +0800
>> Subject: [RFC PATCH V3 3/3] dm: support bio polling
>>
>> Support bio(REQ_POLLED) polling in the following approach:
>>
>> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
>> still fallback on IRQ mode, so the target io is exactly inside the dm
>> io.
>>
>> 2) hold one refcnt on io->io_count after submitting this dm bio with
>> REQ_POLLED
>>
>> 3) support dm native bio splitting, any dm io instance associated with
>> current bio will be added into one list which head is bio->bi_end_io
>> which will be recovered before ending this bio
>>
>> 4) implement .poll_bio() callback, call bio_poll() on the single target
>> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
>> dec_pending() after the target io is done in .poll_bio()
>>
>> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
>> which is based on Jeffle's previous patch.
> 
> ^ nit: two "4)", last should be 5.
> 
>>
>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> ---
>> V3:
>> 	- covers all comments from Jeffle
> 
> Would really appreciate it if Jeffle could test these changes like he
> did previous dm IO polling patchsets he implemented.  Jeffle?

My pleasure. I would test it today and post the test result as soon as
possible.
Christoph Hellwig June 21, 2021, 7:36 a.m. UTC | #8
On Thu, Jun 17, 2021 at 06:35:49PM +0800, Ming Lei wrote:
> +	/*
> +	 * Only support bio polling for normal IO, and the target io is
> +	 * exactly inside the dm io instance
> +	 */
> +	ci->io->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);

Nit: the !! is not needed.

> @@ -1608,6 +1625,22 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
>  	ci->map = map;
>  	ci->io = alloc_io(md, bio);
>  	ci->sector = bio->bi_iter.bi_sector;
> +
> +	if (bio->bi_opf & REQ_POLLED) {
> +		INIT_HLIST_NODE(&ci->io->node);
> +
> +		/*
> +		 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
> +		 * for storing dm_io list
> +		 */
> +		if (bio->bi_opf & REQ_SAVED_END_IO) {
> +			ci->io->saved_bio_end_io = NULL;

So if it already was saved the list gets cleared here?  Can you explain
this logic a little more?

> +		} else {
> +			ci->io->saved_bio_end_io = bio->bi_end_io;
> +			INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);

I think you want to hide these casts in helpers that clearly document
why this is safe rather than sprinkling the casts all over the code.
I also wonder if there is any better way to structur this.

> +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> +{
> +	struct dm_io *io;
> +	void *saved_bi_end_io = NULL;
> +	struct hlist_head tmp = HLIST_HEAD_INIT;
> +	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
> +	struct hlist_node *next;
> +
> +	/*
> +	 * This bio can be submitted from FS as POLLED so that FS may keep
> +	 * polling even though the flag is cleared by bio splitting or
> +	 * requeue, so return immediately.
> +	 */
> +	if (!(bio->bi_opf & REQ_POLLED))
> +		return 0;

I can't really parse the comment, can you explain this a little more?
But if we need this check, shouldn't it move to bio_poll()?

> +	hlist_for_each_entry(io, &tmp, node) {
> +		if (io->saved_bio_end_io && !saved_bi_end_io) {
> +			saved_bi_end_io = io->saved_bio_end_io;
> +			break;
> +		}
> +	}

So it seems like you don't use bi_cookie at all.  Why not turn
bi_cookie into a union to stash the hlist_head and use that?
Ming Lei June 21, 2021, 9:09 a.m. UTC | #9
On Mon, Jun 21, 2021 at 09:36:56AM +0200, Christoph Hellwig wrote:
> On Thu, Jun 17, 2021 at 06:35:49PM +0800, Ming Lei wrote:
> > +	/*
> > +	 * Only support bio polling for normal IO, and the target io is
> > +	 * exactly inside the dm io instance
> > +	 */
> > +	ci->io->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
> 
> Nit: the !! is not needed.

OK.

> 
> > @@ -1608,6 +1625,22 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
> >  	ci->map = map;
> >  	ci->io = alloc_io(md, bio);
> >  	ci->sector = bio->bi_iter.bi_sector;
> > +
> > +	if (bio->bi_opf & REQ_POLLED) {
> > +		INIT_HLIST_NODE(&ci->io->node);
> > +
> > +		/*
> > +		 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
> > +		 * for storing dm_io list
> > +		 */
> > +		if (bio->bi_opf & REQ_SAVED_END_IO) {
> > +			ci->io->saved_bio_end_io = NULL;
> 
> So if it already was saved the list gets cleared here?  Can you explain
> this logic a little more?

Inside dm_poll_bio() we recognize non-NULL ->saved_bio_end_io as
valid, so it has to be initialized it here.

> 
> > +		} else {
> > +			ci->io->saved_bio_end_io = bio->bi_end_io;
> > +			INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
> 
> I think you want to hide these casts in helpers that clearly document
> why this is safe rather than sprinkling the casts all over the code.
> I also wonder if there is any better way to structur this.

OK, I will add a helper of dm_get_bio_hlist_head() with comment.

> 
> > +static int dm_poll_bio(struct bio *bio, unsigned int flags)
> > +{
> > +	struct dm_io *io;
> > +	void *saved_bi_end_io = NULL;
> > +	struct hlist_head tmp = HLIST_HEAD_INIT;
> > +	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
> > +	struct hlist_node *next;
> > +
> > +	/*
> > +	 * This bio can be submitted from FS as POLLED so that FS may keep
> > +	 * polling even though the flag is cleared by bio splitting or
> > +	 * requeue, so return immediately.
> > +	 */
> > +	if (!(bio->bi_opf & REQ_POLLED))
> > +		return 0;
> 
> I can't really parse the comment, can you explain this a little more?
> But if we need this check, shouldn't it move to bio_poll()?

Upper layer keeps to poll one bio with POLLED, but the flag can be
cleared by driver or block layer. Once it is cleared, we should return
immediately.

Yeah, we can move it to bio_poll().

> 
> > +	hlist_for_each_entry(io, &tmp, node) {
> > +		if (io->saved_bio_end_io && !saved_bi_end_io) {
> > +			saved_bi_end_io = io->saved_bio_end_io;
> > +			break;
> > +		}
> > +	}
> 
> So it seems like you don't use bi_cookie at all.  Why not turn
> bi_cookie into a union to stash the hlist_head and use that?

hlist_head is 'void *', but ->bi_cookie is 'unsigned int'.


Thanks,
Ming
Jeffle Xu June 21, 2021, 11:33 a.m. UTC | #10
On 6/18/21 10:39 PM, Ming Lei wrote:
> From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
> From: Ming Lei <ming.lei@redhat.com>
> Date: Wed, 16 Jun 2021 16:13:46 +0800
> Subject: [RFC PATCH V3 3/3] dm: support bio polling
> 
> Support bio(REQ_POLLED) polling in the following approach:
> 
> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> still fallback on IRQ mode, so the target io is exactly inside the dm
> io.
> 
> 2) hold one refcnt on io->io_count after submitting this dm bio with
> REQ_POLLED
> 
> 3) support dm native bio splitting, any dm io instance associated with
> current bio will be added into one list which head is bio->bi_end_io
> which will be recovered before ending this bio
> 
> 4) implement .poll_bio() callback, call bio_poll() on the single target
> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> dec_pending() after the target io is done in .poll_bio()
> 
> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> which is based on Jeffle's previous patch.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> V3:
> 	- covers all comments from Jeffle
> 	- fix corner cases when polling on abnormal ios
> 
...

One bug and one performance issue, though I haven't investigated deep
for both.


kernel base: based on Jens' for-next, applying Christoph and Leiming's
patchset.


1. One bug when there's DM device stack, e.g., dm-linear upon another
dm-linear. Can be reproduced by following steps:

```
$ sudo dmsetup create tmpdev --table '0 2097152 linear /dev/nvme0n1 0'

$ cat tmp.table
0 2097152 linear /dev/mapper/tmpdev 0
2097152 2097152 linear /dev/nvme0n1 0

$ cat tmp.table | dmsetup create testdev

$ fio -name=test -ioengine=io_uring -iodepth=128 -numjobs=1 -thread
-rw=randread -direct=1 -bs=4k -time_based -runtime=10 -cpus_allowed=6
-filename=/dev/mapper/testdev -hipri=1
```


BUG: unable to handle page fault for address: ffffffffc01a6208
#PF: supervisor write access in kernel mode
#PF: error_code(0x0003) - permissions violation
PGD 39740c067 P4D 39740c067 PUD 39740e067 PMD 1035db067 PTE 1ddf6f061
Oops: 0003 [#1] SMP PTI
CPU: 6 PID: 5899 Comm: fio Tainted: G S
5.13.0-0.1.git.81bcdc3.al7.x86_64 #1
Hardware name: Inventec     K900G3-10G/B900G3, BIOS A2.20 06/23/2017
RIP: 0010:dm_submit_bio+0x171/0x3e0 [dm_mod]
Code: 08 85 c0 0f 84 78 01 00 00 80 7c 24 2c 00 0f 84 b8 00 00 00 48 8b
53 38 48 8b 44 24 18 48 85 d2 48 8d 48 28 48 89 50 28 74 04 <48> 89 4a
08 48 89 4b 38 48 83 c3 38 48 89 58 30 41 f7 c5 fe ff ff
RSP: 0018:ffff9e5c45e1b9a0 EFLAGS: 00010286
RAX: ffff8ab59fd50140 RBX: ffff8ab59fd50088 RCX: ffff8ab59fd50168
RDX: ffffffffc01a6200 RSI: 0000000000052f08 RDI: 0000000000000000
RBP: ffff8ab59fd501c8 R08: 0000000000000000 R09: 0000000000000000
R10: ffff9e5c45e1b950 R11: 0000000000000007 R12: ffff8ab4c2bc2000
R13: 0000000000000000 R14: ffff8ab4c2bc2548 R15: ffff8ab59fd50140
FS:  00007f555de42700(0000) GS:ffff8af33f180000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffc01a6208 CR3: 0000000124990005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 submit_bio_noacct+0x144/0x3f0
 ? submit_bio+0x42/0x120
 submit_bio+0x42/0x120
 blkdev_direct_IO+0x454/0x4b0
 ? io_resubmit_prep+0x40/0x40
 ? __fsnotify_parent+0xff/0x350
 ? __fsnotify_parent+0x10f/0x350
 ? generic_file_read_iter+0x83/0x150
 generic_file_read_iter+0x83/0x150
 blkdev_read_iter+0x41/0x50
 io_read+0xe9/0x420
 ? __cond_resched+0x16/0x40
 ? __kmalloc_node+0x16e/0x4e0
 ? memcg_alloc_page_obj_cgroups+0x32/0x90
 ? io_issue_sqe+0x7e8/0x1260
 io_issue_sqe+0x7e8/0x1260
 ? io_submit_sqes+0x47b/0x1420
 __io_queue_sqe+0x56/0x380
 ? io_submit_sqes+0x120a/0x1420
 io_submit_sqes+0x120a/0x1420
 ? __x64_sys_io_uring_enter+0x1d2/0x3e0
 __x64_sys_io_uring_enter+0x1d2/0x3e0
 ? exit_to_user_mode_prepare+0x4c/0x210
 do_syscall_64+0x36/0x70
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f55d3cb1b59
Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89
f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01
f0 ff ff 73 01 c3 48 8b 0d ff e2 2b 00 f7 d8 64 89 01 48
RSP: 002b:00007f555de41b18 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f55d3cb1b59
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000005
RBP: 00007f557ce81000 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000246 R12: 0000000001276000
R13: 0000000000000001 R14: 0000000000000000 R15: 00000000012c8328




2. Performance Issue

I test both on x86 (with only one NVMe) and aarch64 (with multiple NVMes).

The result (IOPS) on x86 is as expected:

Type 	  |IRQ   | Polling
--------- | ---- | ----
dm-linear | 239k | 357k

- dm-linear built upon one NVMe,bs=4k, iopoll=1, iodepth=128,
numjobs=1, direct, randread, ioengine=io_uring



While the result on aarch64 is a little confusing.

Type 	      |IRQ   | Polling
------------- | ---- | ----
dm-linear [1] | 208k | 230k
dm-linear [2] | 637k | 691k
dm-stripe     | 310k | 354k

- dm-linear [1] built upon *one* NVMe,bs=4k, iopoll=1, iodepth=128,
*numjobs=1*, direct, randread, ioengine=io_uring
- dm-linear [2] built upon *three* NVMes,bs=4k, iopoll=1, iodepth=128,
*numjobs=3*, direct, randread, ioengine=io_uring
- dm-stripe built upon *three* NVMes,chunk_size=4k, bs=12k, iopoll=1,
iodepth=128, numjobs=3, direct, randread, ioengine=io_uring


Following is the corresponding test result of Leiming's last
implementation for bio-based polling on aarch64.
IRQ	IOPOLL	ratio
dm-linear [2]	639K	835K	~30%
dm-stripe 	314K	408K	~30%
Ming Lei June 21, 2021, 2:04 p.m. UTC | #11
On Mon, Jun 21, 2021 at 07:33:34PM +0800, JeffleXu wrote:
> 
> 
> On 6/18/21 10:39 PM, Ming Lei wrote:
> > From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
> > From: Ming Lei <ming.lei@redhat.com>
> > Date: Wed, 16 Jun 2021 16:13:46 +0800
> > Subject: [RFC PATCH V3 3/3] dm: support bio polling
> > 
> > Support bio(REQ_POLLED) polling in the following approach:
> > 
> > 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> > still fallback on IRQ mode, so the target io is exactly inside the dm
> > io.
> > 
> > 2) hold one refcnt on io->io_count after submitting this dm bio with
> > REQ_POLLED
> > 
> > 3) support dm native bio splitting, any dm io instance associated with
> > current bio will be added into one list which head is bio->bi_end_io
> > which will be recovered before ending this bio
> > 
> > 4) implement .poll_bio() callback, call bio_poll() on the single target
> > bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> > dec_pending() after the target io is done in .poll_bio()
> > 
> > 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> > which is based on Jeffle's previous patch.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > V3:
> > 	- covers all comments from Jeffle
> > 	- fix corner cases when polling on abnormal ios
> > 
> ...
> 
> One bug and one performance issue, though I haven't investigated deep
> for both.
> 
> 
> kernel base: based on Jens' for-next, applying Christoph and Leiming's
> patchset.
> 
> 
> 1. One bug when there's DM device stack, e.g., dm-linear upon another
> dm-linear. Can be reproduced by following steps:
> 
> ```
> $ sudo dmsetup create tmpdev --table '0 2097152 linear /dev/nvme0n1 0'
> 
> $ cat tmp.table
> 0 2097152 linear /dev/mapper/tmpdev 0
> 2097152 2097152 linear /dev/nvme0n1 0
> 
> $ cat tmp.table | dmsetup create testdev
> 
> $ fio -name=test -ioengine=io_uring -iodepth=128 -numjobs=1 -thread
> -rw=randread -direct=1 -bs=4k -time_based -runtime=10 -cpus_allowed=6
> -filename=/dev/mapper/testdev -hipri=1
> ```
> 
> 
> BUG: unable to handle page fault for address: ffffffffc01a6208
> #PF: supervisor write access in kernel mode
> #PF: error_code(0x0003) - permissions violation
> PGD 39740c067 P4D 39740c067 PUD 39740e067 PMD 1035db067 PTE 1ddf6f061
> Oops: 0003 [#1] SMP PTI
> CPU: 6 PID: 5899 Comm: fio Tainted: G S
> 5.13.0-0.1.git.81bcdc3.al7.x86_64 #1
> Hardware name: Inventec     K900G3-10G/B900G3, BIOS A2.20 06/23/2017
> RIP: 0010:dm_submit_bio+0x171/0x3e0 [dm_mod]

It has been fixed in my local repo:

@@ -1608,6 +1649,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
        ci->map = map;
        ci->io = alloc_io(md, bio);
        ci->sector = bio->bi_iter.bi_sector;
+       ci->submit_as_polled = false;

> 
> 
> 2. Performance Issue
> 
> I test both on x86 (with only one NVMe) and aarch64 (with multiple NVMes).
> 
> The result (IOPS) on x86 is as expected:
> 
> Type 	  |IRQ   | Polling
> --------- | ---- | ----
> dm-linear | 239k | 357k
> 
> - dm-linear built upon one NVMe,bs=4k, iopoll=1, iodepth=128,
> numjobs=1, direct, randread, ioengine=io_uring

This data looks good.

> 
> 
> 
> While the result on aarch64 is a little confusing.
> 
> Type 	      |IRQ   | Polling
> ------------- | ---- | ----
> dm-linear [1] | 208k | 230k
> dm-linear [2] | 637k | 691k
> dm-stripe     | 310k | 354k
> 
> - dm-linear [1] built upon *one* NVMe,bs=4k, iopoll=1, iodepth=128,
> *numjobs=1*, direct, randread, ioengine=io_uring
> - dm-linear [2] built upon *three* NVMes,bs=4k, iopoll=1, iodepth=128,
> *numjobs=3*, direct, randread, ioengine=io_uring
> - dm-stripe built upon *three* NVMes,chunk_size=4k, bs=12k, iopoll=1,
> iodepth=128, numjobs=3, direct, randread, ioengine=io_uring
> 
> 
> Following is the corresponding test result of Leiming's last
> implementation for bio-based polling on aarch64.
> IRQ	IOPOLL	ratio
> dm-linear [2]	639K	835K	~30%
> dm-stripe 	314K	408K	~30%

The previous version polls one hw queue once if bios are submitted to
same hw queue. We might improve it in future.


Thanks,
Ming
Jeffle Xu June 22, 2021, 2:26 a.m. UTC | #12
On 6/21/21 10:04 PM, Ming Lei wrote:
> On Mon, Jun 21, 2021 at 07:33:34PM +0800, JeffleXu wrote:
>>
>>
>> On 6/18/21 10:39 PM, Ming Lei wrote:
>>> From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
>>> From: Ming Lei <ming.lei@redhat.com>
>>> Date: Wed, 16 Jun 2021 16:13:46 +0800
>>> Subject: [RFC PATCH V3 3/3] dm: support bio polling
>>>
>>> Support bio(REQ_POLLED) polling in the following approach:
>>>
>>> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
>>> still fallback on IRQ mode, so the target io is exactly inside the dm
>>> io.
>>>
>>> 2) hold one refcnt on io->io_count after submitting this dm bio with
>>> REQ_POLLED
>>>
>>> 3) support dm native bio splitting, any dm io instance associated with
>>> current bio will be added into one list which head is bio->bi_end_io
>>> which will be recovered before ending this bio
>>>
>>> 4) implement .poll_bio() callback, call bio_poll() on the single target
>>> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
>>> dec_pending() after the target io is done in .poll_bio()
>>>
>>> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
>>> which is based on Jeffle's previous patch.
>>>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>> ---
>>> V3:
>>> 	- covers all comments from Jeffle
>>> 	- fix corner cases when polling on abnormal ios
>>>
>> ...
>>
>> One bug and one performance issue, though I haven't investigated deep
>> for both.
>>
>>
>> kernel base: based on Jens' for-next, applying Christoph and Leiming's
>> patchset.
>>
>>
>> 1. One bug when there's DM device stack, e.g., dm-linear upon another
>> dm-linear. Can be reproduced by following steps:
>>
>> ```
>> $ sudo dmsetup create tmpdev --table '0 2097152 linear /dev/nvme0n1 0'
>>
>> $ cat tmp.table
>> 0 2097152 linear /dev/mapper/tmpdev 0
>> 2097152 2097152 linear /dev/nvme0n1 0
>>
>> $ cat tmp.table | dmsetup create testdev
>>
>> $ fio -name=test -ioengine=io_uring -iodepth=128 -numjobs=1 -thread
>> -rw=randread -direct=1 -bs=4k -time_based -runtime=10 -cpus_allowed=6
>> -filename=/dev/mapper/testdev -hipri=1
>> ```
>>
>>
>> BUG: unable to handle page fault for address: ffffffffc01a6208
>> #PF: supervisor write access in kernel mode
>> #PF: error_code(0x0003) - permissions violation
>> PGD 39740c067 P4D 39740c067 PUD 39740e067 PMD 1035db067 PTE 1ddf6f061
>> Oops: 0003 [#1] SMP PTI
>> CPU: 6 PID: 5899 Comm: fio Tainted: G S
>> 5.13.0-0.1.git.81bcdc3.al7.x86_64 #1
>> Hardware name: Inventec     K900G3-10G/B900G3, BIOS A2.20 06/23/2017
>> RIP: 0010:dm_submit_bio+0x171/0x3e0 [dm_mod]
> 
> It has been fixed in my local repo:
> 
> @@ -1608,6 +1649,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
>         ci->map = map;
>         ci->io = alloc_io(md, bio);
>         ci->sector = bio->bi_iter.bi_sector;
> +       ci->submit_as_polled = false;
> 

It doesn't work in my test environment. Actually the following fix
should be applied.


@@ -1390,6 +1403,8 @@ static int clone_bio(struct dm_target_io *tio,
struct bio *bio,
        if (bio_integrity(bio))
                bio_integrity_trim(clone);

+       clone->bi_opf &= ~REQ_SAVED_END_IO;
+
        return 0;
 }


The rationale is that, REQ_SAVED_END_IO should be cleared once the bio
*passes through* the device stack layer. Or the cloned bio for next
layer will inherit REQ_SAVED_END_IO flag, in which case
'cloned_bio->bi_end_io' (actually acts as the hlist head) won't be
initialized in dm_setup_polled_io(), and thus it gets crashed when
trying to insert into this hash list in __split_and_process_bio().
Ming Lei June 22, 2021, 2:45 a.m. UTC | #13
On Tue, Jun 22, 2021 at 10:26:15AM +0800, JeffleXu wrote:
> 
> 
> On 6/21/21 10:04 PM, Ming Lei wrote:
> > On Mon, Jun 21, 2021 at 07:33:34PM +0800, JeffleXu wrote:
> >>
> >>
> >> On 6/18/21 10:39 PM, Ming Lei wrote:
> >>> From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
> >>> From: Ming Lei <ming.lei@redhat.com>
> >>> Date: Wed, 16 Jun 2021 16:13:46 +0800
> >>> Subject: [RFC PATCH V3 3/3] dm: support bio polling
> >>>
> >>> Support bio(REQ_POLLED) polling in the following approach:
> >>>
> >>> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> >>> still fallback on IRQ mode, so the target io is exactly inside the dm
> >>> io.
> >>>
> >>> 2) hold one refcnt on io->io_count after submitting this dm bio with
> >>> REQ_POLLED
> >>>
> >>> 3) support dm native bio splitting, any dm io instance associated with
> >>> current bio will be added into one list which head is bio->bi_end_io
> >>> which will be recovered before ending this bio
> >>>
> >>> 4) implement .poll_bio() callback, call bio_poll() on the single target
> >>> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> >>> dec_pending() after the target io is done in .poll_bio()
> >>>
> >>> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> >>> which is based on Jeffle's previous patch.
> >>>
> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>> ---
> >>> V3:
> >>> 	- covers all comments from Jeffle
> >>> 	- fix corner cases when polling on abnormal ios
> >>>
> >> ...
> >>
> >> One bug and one performance issue, though I haven't investigated deep
> >> for both.
> >>
> >>
> >> kernel base: based on Jens' for-next, applying Christoph and Leiming's
> >> patchset.
> >>
> >>
> >> 1. One bug when there's DM device stack, e.g., dm-linear upon another
> >> dm-linear. Can be reproduced by following steps:
> >>
> >> ```
> >> $ sudo dmsetup create tmpdev --table '0 2097152 linear /dev/nvme0n1 0'
> >>
> >> $ cat tmp.table
> >> 0 2097152 linear /dev/mapper/tmpdev 0
> >> 2097152 2097152 linear /dev/nvme0n1 0
> >>
> >> $ cat tmp.table | dmsetup create testdev
> >>
> >> $ fio -name=test -ioengine=io_uring -iodepth=128 -numjobs=1 -thread
> >> -rw=randread -direct=1 -bs=4k -time_based -runtime=10 -cpus_allowed=6
> >> -filename=/dev/mapper/testdev -hipri=1
> >> ```
> >>
> >>
> >> BUG: unable to handle page fault for address: ffffffffc01a6208
> >> #PF: supervisor write access in kernel mode
> >> #PF: error_code(0x0003) - permissions violation
> >> PGD 39740c067 P4D 39740c067 PUD 39740e067 PMD 1035db067 PTE 1ddf6f061
> >> Oops: 0003 [#1] SMP PTI
> >> CPU: 6 PID: 5899 Comm: fio Tainted: G S
> >> 5.13.0-0.1.git.81bcdc3.al7.x86_64 #1
> >> Hardware name: Inventec     K900G3-10G/B900G3, BIOS A2.20 06/23/2017
> >> RIP: 0010:dm_submit_bio+0x171/0x3e0 [dm_mod]
> > 
> > It has been fixed in my local repo:
> > 
> > @@ -1608,6 +1649,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
> >         ci->map = map;
> >         ci->io = alloc_io(md, bio);
> >         ci->sector = bio->bi_iter.bi_sector;
> > +       ci->submit_as_polled = false;
> > 
> 
> It doesn't work in my test environment. Actually the following fix
> should be applied.
> 
> 
> @@ -1390,6 +1403,8 @@ static int clone_bio(struct dm_target_io *tio,
> struct bio *bio,
>         if (bio_integrity(bio))
>                 bio_integrity_trim(clone);
> 
> +       clone->bi_opf &= ~REQ_SAVED_END_IO;
> +

This change is good, but it shouldn't fix the panic except for nested
device map, I will fold into V3.

>         return 0;
>  }
> 
> 
> The rationale is that, REQ_SAVED_END_IO should be cleared once the bio
> *passes through* the device stack layer. Or the cloned bio for next
> layer will inherit REQ_SAVED_END_IO flag, in which case
> 'cloned_bio->bi_end_io' (actually acts as the hlist head) won't be
> initialized in dm_setup_polled_io(), and thus it gets crashed when
> trying to insert into this hash list in __split_and_process_bio().

'cloned_bio' can't reach dm_submit_bio() if it isn't one DM bio.


Thanks,
Ming
Jeffle Xu June 22, 2021, 7:45 a.m. UTC | #14
On 6/22/21 10:45 AM, Ming Lei wrote:
> On Tue, Jun 22, 2021 at 10:26:15AM +0800, JeffleXu wrote:
>>
>>
>> On 6/21/21 10:04 PM, Ming Lei wrote:
>>> On Mon, Jun 21, 2021 at 07:33:34PM +0800, JeffleXu wrote:
>>>>
>>>>
>>>> On 6/18/21 10:39 PM, Ming Lei wrote:
>>>>> From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
>>>>> From: Ming Lei <ming.lei@redhat.com>
>>>>> Date: Wed, 16 Jun 2021 16:13:46 +0800
>>>>> Subject: [RFC PATCH V3 3/3] dm: support bio polling
>>>>>
>>>>> Support bio(REQ_POLLED) polling in the following approach:
>>>>>
>>>>> 1) only support io polling on normal READ/WRITE, and other abnormal IOs
>>>>> still fallback on IRQ mode, so the target io is exactly inside the dm
>>>>> io.
>>>>>
>>>>> 2) hold one refcnt on io->io_count after submitting this dm bio with
>>>>> REQ_POLLED
>>>>>
>>>>> 3) support dm native bio splitting, any dm io instance associated with
>>>>> current bio will be added into one list which head is bio->bi_end_io
>>>>> which will be recovered before ending this bio
>>>>>
>>>>> 4) implement .poll_bio() callback, call bio_poll() on the single target
>>>>> bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
>>>>> dec_pending() after the target io is done in .poll_bio()
>>>>>
>>>>> 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
>>>>> which is based on Jeffle's previous patch.
>>>>>
>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>>> ---
>>>>> V3:
>>>>> 	- covers all comments from Jeffle
>>>>> 	- fix corner cases when polling on abnormal ios
>>>>>
>>>> ...
>>>>
>>>> One bug and one performance issue, though I haven't investigated deep
>>>> for both.
>>>>
>>>>
>>>> kernel base: based on Jens' for-next, applying Christoph and Leiming's
>>>> patchset.
>>>>
>>>>
>>>> 1. One bug when there's DM device stack, e.g., dm-linear upon another
>>>> dm-linear. Can be reproduced by following steps:
>>>>
>>>> ```
>>>> $ sudo dmsetup create tmpdev --table '0 2097152 linear /dev/nvme0n1 0'
>>>>
>>>> $ cat tmp.table
>>>> 0 2097152 linear /dev/mapper/tmpdev 0
>>>> 2097152 2097152 linear /dev/nvme0n1 0
>>>>
>>>> $ cat tmp.table | dmsetup create testdev
>>>>
>>>> $ fio -name=test -ioengine=io_uring -iodepth=128 -numjobs=1 -thread
>>>> -rw=randread -direct=1 -bs=4k -time_based -runtime=10 -cpus_allowed=6
>>>> -filename=/dev/mapper/testdev -hipri=1
>>>> ```
>>>>
>>>>
>>>> BUG: unable to handle page fault for address: ffffffffc01a6208
>>>> #PF: supervisor write access in kernel mode
>>>> #PF: error_code(0x0003) - permissions violation
>>>> PGD 39740c067 P4D 39740c067 PUD 39740e067 PMD 1035db067 PTE 1ddf6f061
>>>> Oops: 0003 [#1] SMP PTI
>>>> CPU: 6 PID: 5899 Comm: fio Tainted: G S
>>>> 5.13.0-0.1.git.81bcdc3.al7.x86_64 #1
>>>> Hardware name: Inventec     K900G3-10G/B900G3, BIOS A2.20 06/23/2017
>>>> RIP: 0010:dm_submit_bio+0x171/0x3e0 [dm_mod]
>>>
>>> It has been fixed in my local repo:
>>>
>>> @@ -1608,6 +1649,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
>>>         ci->map = map;
>>>         ci->io = alloc_io(md, bio);
>>>         ci->sector = bio->bi_iter.bi_sector;
>>> +       ci->submit_as_polled = false;
>>>
>>
>> It doesn't work in my test environment. Actually the following fix
>> should be applied.
>>
>>
>> @@ -1390,6 +1403,8 @@ static int clone_bio(struct dm_target_io *tio,
>> struct bio *bio,
>>         if (bio_integrity(bio))
>>                 bio_integrity_trim(clone);
>>
>> +       clone->bi_opf &= ~REQ_SAVED_END_IO;
>> +
> 
> This change is good, but it shouldn't fix the panic except for nested
> device map, I will fold into V3.

The panic I posted exactly happen for nested device map.

>>
>> The rationale is that, REQ_SAVED_END_IO should be cleared once the bio
>> *passes through* the device stack layer. Or the cloned bio for next
>> layer will inherit REQ_SAVED_END_IO flag, in which case
>> 'cloned_bio->bi_end_io' (actually acts as the hlist head) won't be
>> initialized in dm_setup_polled_io(), and thus it gets crashed when
>> trying to insert into this hash list in __split_and_process_bio().
> 
> 'cloned_bio' can't reach dm_submit_bio() if it isn't one DM bio.
> 

'cloned_bio' actually refers to dm_io.tio.clone, i.e., the cloned bio
used to submit to the device of the next level.

	dm1
	/\
     dm2  NVMe1
     /\
 NVMe2 NVMe3

For the above example, 'cloned_bio' refers to dm_io.tio.clone, where
this dm_io is to be submitted to dm2.


			  @bi_private
		split bio ------------------> original bio (for dm1)
		   ^			       ^
		   | @orig_bio		       | @orig_bio
		   |			       |
		dm_io(for dm2)         	   dm_io(for NVME1)
		struct dm_target_io tio
		struct bio clone
	(...following omitted for NVMe2 and NVMe3)

I mean, for above 'struct bio clone', REQ_SAVED_END_IO shall be cleared.
Ming Lei June 30, 2021, 8:30 a.m. UTC | #15
On Mon, Jun 21, 2021 at 07:33:34PM +0800, JeffleXu wrote:
> 
> 
> On 6/18/21 10:39 PM, Ming Lei wrote:
> > From 47e523b9ee988317369eaadb96826323cd86819e Mon Sep 17 00:00:00 2001
> > From: Ming Lei <ming.lei@redhat.com>
> > Date: Wed, 16 Jun 2021 16:13:46 +0800
> > Subject: [RFC PATCH V3 3/3] dm: support bio polling
> > 
> > Support bio(REQ_POLLED) polling in the following approach:
> > 
> > 1) only support io polling on normal READ/WRITE, and other abnormal IOs
> > still fallback on IRQ mode, so the target io is exactly inside the dm
> > io.
> > 
> > 2) hold one refcnt on io->io_count after submitting this dm bio with
> > REQ_POLLED
> > 
> > 3) support dm native bio splitting, any dm io instance associated with
> > current bio will be added into one list which head is bio->bi_end_io
> > which will be recovered before ending this bio
> > 
> > 4) implement .poll_bio() callback, call bio_poll() on the single target
> > bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call
> > dec_pending() after the target io is done in .poll_bio()
> > 
> > 4) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL,
> > which is based on Jeffle's previous patch.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > V3:
> > 	- covers all comments from Jeffle
> > 	- fix corner cases when polling on abnormal ios
> > 
> ...
> 
> One bug and one performance issue, though I haven't investigated deep
> for both.
> 
> 
> kernel base: based on Jens' for-next, applying Christoph and Leiming's
> patchset.
> 
> 
> 1. One bug when there's DM device stack, e.g., dm-linear upon another
> dm-linear. Can be reproduced by following steps:
> 
> ```
> $ sudo dmsetup create tmpdev --table '0 2097152 linear /dev/nvme0n1 0'
> 
> $ cat tmp.table
> 0 2097152 linear /dev/mapper/tmpdev 0
> 2097152 2097152 linear /dev/nvme0n1 0
> 
> $ cat tmp.table | dmsetup create testdev
> 
> $ fio -name=test -ioengine=io_uring -iodepth=128 -numjobs=1 -thread
> -rw=randread -direct=1 -bs=4k -time_based -runtime=10 -cpus_allowed=6
> -filename=/dev/mapper/testdev -hipri=1
> ```
> 
> 
> BUG: unable to handle page fault for address: ffffffffc01a6208
> #PF: supervisor write access in kernel mode
> #PF: error_code(0x0003) - permissions violation
> PGD 39740c067 P4D 39740c067 PUD 39740e067 PMD 1035db067 PTE 1ddf6f061
> Oops: 0003 [#1] SMP PTI
> CPU: 6 PID: 5899 Comm: fio Tainted: G S
> 5.13.0-0.1.git.81bcdc3.al7.x86_64 #1
> Hardware name: Inventec     K900G3-10G/B900G3, BIOS A2.20 06/23/2017
> RIP: 0010:dm_submit_bio+0x171/0x3e0 [dm_mod]
> Code: 08 85 c0 0f 84 78 01 00 00 80 7c 24 2c 00 0f 84 b8 00 00 00 48 8b
> 53 38 48 8b 44 24 18 48 85 d2 48 8d 48 28 48 89 50 28 74 04 <48> 89 4a
> 08 48 89 4b 38 48 83 c3 38 48 89 58 30 41 f7 c5 fe ff ff
> RSP: 0018:ffff9e5c45e1b9a0 EFLAGS: 00010286
> RAX: ffff8ab59fd50140 RBX: ffff8ab59fd50088 RCX: ffff8ab59fd50168
> RDX: ffffffffc01a6200 RSI: 0000000000052f08 RDI: 0000000000000000
> RBP: ffff8ab59fd501c8 R08: 0000000000000000 R09: 0000000000000000
> R10: ffff9e5c45e1b950 R11: 0000000000000007 R12: ffff8ab4c2bc2000
> R13: 0000000000000000 R14: ffff8ab4c2bc2548 R15: ffff8ab59fd50140
> FS:  00007f555de42700(0000) GS:ffff8af33f180000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: ffffffffc01a6208 CR3: 0000000124990005 CR4: 00000000003706e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  submit_bio_noacct+0x144/0x3f0
>  ? submit_bio+0x42/0x120
>  submit_bio+0x42/0x120
>  blkdev_direct_IO+0x454/0x4b0
>  ? io_resubmit_prep+0x40/0x40
>  ? __fsnotify_parent+0xff/0x350
>  ? __fsnotify_parent+0x10f/0x350
>  ? generic_file_read_iter+0x83/0x150
>  generic_file_read_iter+0x83/0x150
>  blkdev_read_iter+0x41/0x50
>  io_read+0xe9/0x420
>  ? __cond_resched+0x16/0x40
>  ? __kmalloc_node+0x16e/0x4e0
>  ? memcg_alloc_page_obj_cgroups+0x32/0x90
>  ? io_issue_sqe+0x7e8/0x1260
>  io_issue_sqe+0x7e8/0x1260
>  ? io_submit_sqes+0x47b/0x1420
>  __io_queue_sqe+0x56/0x380
>  ? io_submit_sqes+0x120a/0x1420
>  io_submit_sqes+0x120a/0x1420
>  ? __x64_sys_io_uring_enter+0x1d2/0x3e0
>  __x64_sys_io_uring_enter+0x1d2/0x3e0
>  ? exit_to_user_mode_prepare+0x4c/0x210
>  do_syscall_64+0x36/0x70
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7f55d3cb1b59
> Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89
> f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01
> f0 ff ff 73 01 c3 48 8b 0d ff e2 2b 00 f7 d8 64 89 01 48
> RSP: 002b:00007f555de41b18 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
> RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f55d3cb1b59
> RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000005
> RBP: 00007f557ce81000 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000001 R11: 0000000000000246 R12: 0000000001276000
> R13: 0000000000000001 R14: 0000000000000000 R15: 00000000012c8328
> 
> 
> 
> 
> 2. Performance Issue
> 
> I test both on x86 (with only one NVMe) and aarch64 (with multiple NVMes).
> 
> The result (IOPS) on x86 is as expected:
> 
> Type 	  |IRQ   | Polling
> --------- | ---- | ----
> dm-linear | 239k | 357k
> 
> - dm-linear built upon one NVMe,bs=4k, iopoll=1, iodepth=128,
> numjobs=1, direct, randread, ioengine=io_uring
> 
> 
> 
> While the result on aarch64 is a little confusing.
> 
> Type 	      |IRQ   | Polling
> ------------- | ---- | ----
> dm-linear [1] | 208k | 230k
> dm-linear [2] | 637k | 691k
> dm-stripe     | 310k | 354k
> 
> - dm-linear [1] built upon *one* NVMe,bs=4k, iopoll=1, iodepth=128,
> *numjobs=1*, direct, randread, ioengine=io_uring
> - dm-linear [2] built upon *three* NVMes,bs=4k, iopoll=1, iodepth=128,
> *numjobs=3*, direct, randread, ioengine=io_uring
> - dm-stripe built upon *three* NVMes,chunk_size=4k, bs=12k, iopoll=1,
> iodepth=128, numjobs=3, direct, randread, ioengine=io_uring

Today I found the following patch makes a big difference on aarch64
nvme io polling, and you can try that in your test:

https://lore.kernel.org/linux-block/YNwfY6kExqJM65+L@T590/T/#m934fabf588d709109fd99040a3e26d7a9838db1f
https://lore.kernel.org/linux-block/YNwfY6kExqJM65+L@T590/T/#m4504b01d06566b2080216640625fac5fdd3929e5

BTW, my test machine is ampere(160cores, dual numa nodes).


Thanks,
Ming
diff mbox series

Patch

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ee47a332b462..b14b379442d2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1491,6 +1491,12 @@  struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
+				   sector_t start, sector_t len, void *data)
+{
+	return !blk_queue_poll(bdev_get_queue(dev->bdev));
+}
+
 /*
  * type->iterate_devices() should be called when the sanity check needs to
  * iterate and check all underlying data devices. iterate_devices() will
@@ -1541,6 +1547,11 @@  static int count_device(struct dm_target *ti, struct dm_dev *dev,
 	return 0;
 }
 
+static int dm_table_supports_poll(struct dm_table *t)
+{
+	return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
+}
+
 /*
  * Check whether a table has no data devices attached using each
  * target's iterate_devices method.
@@ -2078,6 +2089,19 @@  void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
 	dm_update_keyslot_manager(q, t);
 	blk_queue_update_readahead(q);
+
+	/*
+	 * Check for request-based device is remained to
+	 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
+	 * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
+	 * devices supporting polling.
+	 */
+	if (__table_type_bio_based(t->type)) {
+		if (dm_table_supports_poll(t))
+			blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+		else
+			blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+	}
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 363f12a285ce..9745c3deacc3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -39,6 +39,8 @@ 
 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
 #define DM_COOKIE_LENGTH 24
 
+#define REQ_SAVED_END_IO             REQ_DRV
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -97,8 +99,11 @@  struct dm_io {
 	unsigned magic;
 	struct mapped_device *md;
 	blk_status_t status;
+	bool	submit_as_polled;
 	atomic_t io_count;
 	struct bio *orig_bio;
+	void	*saved_bio_end_io;
+	struct hlist_node  node;
 	unsigned long start_time;
 	spinlock_t endio_lock;
 	struct dm_stats_aux stats_aux;
@@ -687,6 +692,8 @@  static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
 	tio->ti = ti;
 	tio->target_bio_nr = target_bio_nr;
 
+	WARN_ON_ONCE(ci->io->submit_as_polled && !tio->inside_dm_io);
+
 	return tio;
 }
 
@@ -938,8 +945,12 @@  static void dec_pending(struct dm_io *io, blk_status_t error)
 		end_io_acct(io);
 		free_io(md, io);
 
-		if (io_error == BLK_STS_DM_REQUEUE)
+		if (io_error == BLK_STS_DM_REQUEUE) {
+			/* not poll any more in case of requeue */
+			if (bio->bi_opf & REQ_POLLED)
+				bio->bi_opf &= ~REQ_POLLED;
 			return;
+		}
 
 		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 			/*
@@ -1590,6 +1601,12 @@  static int __split_and_process_non_flush(struct clone_info *ci)
 	if (__process_abnormal_io(ci, ti, &r))
 		return r;
 
+	/*
+	 * Only support bio polling for normal IO, and the target io is
+	 * exactly inside the dm io instance
+	 */
+	ci->io->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
+
 	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
 
 	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
@@ -1608,6 +1625,22 @@  static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
 	ci->map = map;
 	ci->io = alloc_io(md, bio);
 	ci->sector = bio->bi_iter.bi_sector;
+
+	if (bio->bi_opf & REQ_POLLED) {
+		INIT_HLIST_NODE(&ci->io->node);
+
+		/*
+		 * Save .bi_end_io into dm_io, so that we can reuse .bi_end_io
+		 * for storing dm_io list
+		 */
+		if (bio->bi_opf & REQ_SAVED_END_IO) {
+			ci->io->saved_bio_end_io = NULL;
+		} else {
+			ci->io->saved_bio_end_io = bio->bi_end_io;
+			INIT_HLIST_HEAD((struct hlist_head *)&bio->bi_end_io);
+			bio->bi_opf |= REQ_SAVED_END_IO;
+		}
+	}
 }
 
 #define __dm_part_stat_sub(part, field, subnd)	\
@@ -1666,8 +1699,19 @@  static void __split_and_process_bio(struct mapped_device *md,
 		}
 	}
 
-	/* drop the extra reference count */
-	dec_pending(ci.io, errno_to_blk_status(error));
+	/*
+	 * Drop the extra reference count for non-POLLED bio, and hold one
+	 * reference for POLLED bio, which will be released in dm_poll_bio
+	 *
+	 * Add every dm_io instance into the hlist_head which is stored in
+	 * bio->bi_end_io, so that dm_poll_bio can poll them all.
+	 */
+	if (!ci.io->submit_as_polled)
+		dec_pending(ci.io, errno_to_blk_status(error));
+	else
+		hlist_add_head(&ci.io->node,
+				(struct hlist_head *)&bio->bi_end_io);
+
 }
 
 static void dm_submit_bio(struct bio *bio)
@@ -1707,6 +1751,66 @@  static void dm_submit_bio(struct bio *bio)
 	dm_put_live_table(md, srcu_idx);
 }
 
+static int dm_poll_dm_io(struct dm_io *io, unsigned int flags)
+{
+	if (!io || !io->submit_as_polled)
+		return 0;
+
+	WARN_ON_ONCE(!io->tio.inside_dm_io);
+	bio_poll(&io->tio.clone, flags);
+
+	/* bio_poll holds the last reference */
+	if (atomic_read(&io->io_count) == 1)
+		return 1;
+	return 0;
+}
+
+static int dm_poll_bio(struct bio *bio, unsigned int flags)
+{
+	struct dm_io *io;
+	void *saved_bi_end_io = NULL;
+	struct hlist_head tmp = HLIST_HEAD_INIT;
+	struct hlist_head *head = (struct hlist_head *)&bio->bi_end_io;
+	struct hlist_node *next;
+
+	/*
+	 * This bio can be submitted from FS as POLLED so that FS may keep
+	 * polling even though the flag is cleared by bio splitting or
+	 * requeue, so return immediately.
+	 */
+	if (!(bio->bi_opf & REQ_POLLED))
+		return 0;
+
+	hlist_move_list(head, &tmp);
+
+	hlist_for_each_entry(io, &tmp, node) {
+		if (io->saved_bio_end_io && !saved_bi_end_io) {
+			saved_bi_end_io = io->saved_bio_end_io;
+			break;
+		}
+	}
+
+	/* restore .bi_end_io before completing dm io */
+	WARN_ON_ONCE(!saved_bi_end_io);
+	bio->bi_end_io = saved_bi_end_io;
+
+	hlist_for_each_entry_safe(io, next, &tmp, node) {
+		if (dm_poll_dm_io(io, flags)) {
+			hlist_del_init(&io->node);
+			dec_pending(io, 0);
+		}
+	}
+
+	/* Not done, make sure at least one dm_io stores the .bi_end_io*/
+	if (!hlist_empty(&tmp)) {
+		io = hlist_entry(tmp.first, struct dm_io, node);
+		io->saved_bio_end_io = saved_bi_end_io;
+		hlist_move_list(&tmp, head);
+		return 0;
+	}
+	return 1;
+}
+
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
@@ -3121,6 +3225,7 @@  static const struct pr_ops dm_pr_ops = {
 
 static const struct block_device_operations dm_blk_dops = {
 	.submit_bio = dm_submit_bio,
+	.poll_bio = dm_poll_bio,
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,