diff mbox series

[RFC,6/7] block: track cookies of split bios for bio-based device

Message ID 20201223112624.78955-7-jefflexu@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series dm: add support of iopoll | expand

Commit Message

Jingbo Xu Dec. 23, 2020, 11:26 a.m. UTC
This is actually the core change for supporting iopoll on bio-based devices.

A list is maintained in the top bio (the original bio submitted to dm
device), which is used to maintain all valid cookies of split bios. The
IO polling routine will actually iterate this list and poll on
corresponding hardware queues of the underlying mq devices.

Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
---
 block/bio.c               |  8 ++++
 block/blk-core.c          | 84 ++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h | 39 ++++++++++++++++++
 3 files changed, 129 insertions(+), 2 deletions(-)

Comments

Mike Snitzer Jan. 7, 2021, 10:18 p.m. UTC | #1
On Wed, Dec 23 2020 at  6:26am -0500,
Jeffle Xu <jefflexu@linux.alibaba.com> wrote:

> This is actuaaly the core when supporting iopoll for bio-based device.
> 
> A list is maintained in the top bio (the original bio submitted to dm
> device), which is used to maintain all valid cookies of split bios. The
> IO polling routine will actually iterate this list and poll on
> corresponding hardware queues of the underlying mq devices.
> 
> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>

Like I said in response to patch 4 in this series: please fold patch 4
into this patch and _really_ improve this patch header.

In particular, the (ab)use of bio_inc_remaining() needs to be documented in
this patch header very well.

But its use could easily be why you're seeing a performance hit (coupled
with the extra spinlock locking and list management used).  Just added
latency and contention across CPUs.

> ---
>  block/bio.c               |  8 ++++
>  block/blk-core.c          | 84 ++++++++++++++++++++++++++++++++++++++-
>  include/linux/blk_types.h | 39 ++++++++++++++++++
>  3 files changed, 129 insertions(+), 2 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 1f2cc1fbe283..ca6d1a7ee196 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -284,6 +284,10 @@ void bio_init(struct bio *bio, struct bio_vec *table,
>  
>  	bio->bi_io_vec = table;
>  	bio->bi_max_vecs = max_vecs;
> +
> +	INIT_LIST_HEAD(&bio->bi_plist);
> +	INIT_LIST_HEAD(&bio->bi_pnode);
> +	spin_lock_init(&bio->bi_plock);
>  }
>  EXPORT_SYMBOL(bio_init);
>  
> @@ -689,6 +693,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
>  	bio->bi_write_hint = bio_src->bi_write_hint;
>  	bio->bi_iter = bio_src->bi_iter;
>  	bio->bi_io_vec = bio_src->bi_io_vec;
> +	bio->bi_root = bio_src->bi_root;
>  
>  	bio_clone_blkg_association(bio, bio_src);
>  	blkcg_bio_issue_init(bio);
> @@ -1425,6 +1430,8 @@ void bio_endio(struct bio *bio)
>  	if (bio->bi_disk)
>  		rq_qos_done_bio(bio->bi_disk->queue, bio);
>  
> +	bio_del_poll_list(bio);
> +
>  	/*
>  	 * Need to have a real endio function for chained bios, otherwise
>  	 * various corner cases will break (like stacking block devices that
> @@ -1446,6 +1453,7 @@ void bio_endio(struct bio *bio)
>  	blk_throtl_bio_endio(bio);
>  	/* release cgroup info */
>  	bio_uninit(bio);
> +
>  	if (bio->bi_end_io)
>  		bio->bi_end_io(bio);
>  }
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 2f5c51ce32e3..5a332af01939 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -960,12 +960,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  {
>  	struct bio_list bio_list_on_stack[2];
>  	blk_qc_t ret = BLK_QC_T_NONE;
> +	bool iopoll;
> +	struct bio *root;
>  
>  	BUG_ON(bio->bi_next);
>  
>  	bio_list_init(&bio_list_on_stack[0]);
>  	current->bio_list = bio_list_on_stack;
>  
> +	iopoll = test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags);
> +	iopoll = iopoll && (bio->bi_opf & REQ_HIPRI);
> +
> +	if (iopoll) {
> +		bio->bi_root = root = bio;
> +		/*
> +		 * We need to pin root bio here since there's a reference from
> +		 * the returned cookie. bio_get() is not enough since the whole
> +		 * bio and the corresponding kiocb/dio may have already
> +		 * completed and thus won't call blk_poll() at all, in which
> +		 * case the pairing bio_put() in blk_bio_poll() won't be called.
> +		 * The side effect of bio_inc_remaining() is that, the whole bio
> +		 * won't complete until blk_poll() called.
> +		 */
> +		bio_inc_remaining(root);
> +	}
> +
>  	do {
>  		struct request_queue *q = bio->bi_disk->queue;
>  		struct bio_list lower, same;
> @@ -979,7 +998,18 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>  		bio_list_init(&bio_list_on_stack[0]);
>  
> -		ret = __submit_bio(bio);
> +		if (iopoll) {
> +			/* See the comments of above bio_inc_remaining(). */
> +			bio_inc_remaining(bio);
> +			bio->bi_cookie = __submit_bio(bio);
> +
> +			if (blk_qc_t_valid(bio->bi_cookie))
> +				bio_add_poll_list(bio);
> +
> +			bio_endio(bio);
> +		} else {
> +			ret = __submit_bio(bio);
> +		}
>  
>  		/*
>  		 * Sort new bios into those for a lower level and those for the
> @@ -1002,7 +1032,11 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
>  
>  	current->bio_list = NULL;
> -	return ret;
> +
> +	if (iopoll)
> +		return (blk_qc_t)root;
> +
> +	return BLK_QC_T_NONE;
>  }
>  
>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> @@ -1131,6 +1165,52 @@ blk_qc_t submit_bio(struct bio *bio)
>  }
>  EXPORT_SYMBOL(submit_bio);
>  
> +int blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> +{
> +	int ret = 0;
> +	struct bio *bio, *root = (struct bio*)cookie;
> +
> +	if (list_empty(&root->bi_plist)) {
> +		bio_endio(root);
> +		return 1;
> +	}
> +
> +	spin_lock(&root->bi_plock);
> +	bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
> +
> +	while (bio) {
> +		struct request_queue *q = bio->bi_disk->queue;
> +		blk_qc_t cookie = bio->bi_cookie;
> +
> +		spin_unlock(&root->bi_plock);
> +		BUG_ON(!blk_qc_t_valid(cookie));
> +
> +		ret += blk_mq_poll(q, cookie);

Not yet clear to me how you _know_ this q is blk-mq...
What about a deep stack of bio-based DM devices?

Or are you confining bio-based DM IO polling support to bio-based
stacked directly on blk-mq? (patch 7 likely shows that to be the case).

If so, I'm not liking that at all.  So if your implementation doesn't
support arbitrary bio-based IO stacks then this bio-based IO polling
support really has limited utility still.

Helps explain how you got away with having bio-based DM always returning
BLK_QC_T_NONE in patch 5 though... feels far too simplistic.  Patch 5+6
together are implicitly ignoring the complexity that results from
arbitrary bio-based DM stacking.

Or am I missing something?

Mike


> +
> +		spin_lock(&root->bi_plock);
> +		/*
> +		 * One blk_mq_poll() call could complete multiple bios, and
> +		 * thus multiple bios could be removed from root->bi_plock
> +		 * list.
> +		 */
> +		bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
> +	}
> +
> +	spin_unlock(&root->bi_plock);
> +
> +	if (list_empty(&root->bi_plist)) {
> +		bio_endio(root);
> +		/*
> +		 * 'ret' may be 0 here. root->bi_plist may be empty once we
> +		 * acquire the list spinlock.
> +		 */
> +		ret = max(ret, 1);
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(blk_bio_poll);
> +
>  static bool blk_poll_hybrid(struct request_queue *q, blk_qc_t cookie)
>  {
>  	struct blk_mq_hw_ctx *hctx;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 2e05244fc16d..2cf5d8f0ea34 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -277,6 +277,12 @@ struct bio {
>  
>  	struct bio_set		*bi_pool;
>  
> +	struct bio		*bi_root;	/* original bio of submit_bio() */
> +	struct list_head        bi_plist;
> +	struct list_head        bi_pnode;
> +	struct spinlock         bi_plock;
> +	blk_qc_t		bi_cookie;
> +
>  	/*
>  	 * We can inline a number of vecs at the end of the bio, to avoid
>  	 * double allocations for a small number of bio_vecs. This member
> @@ -557,6 +563,39 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
>  	return (cookie & BLK_QC_T_INTERNAL) != 0;
>  }
>  
> +static inline void bio_add_poll_list(struct bio *bio)
> +{
> +	struct bio *root = bio->bi_root;
> +
> +	/*
> +	 * The spin_lock() variant is enough since bios in root->bi_plist are
> +	 * all enqueued into polling mode hardware queue, thus the list_del()
> +	 * operation is handled only in process context.
> +	 */
> +	spin_lock(&root->bi_plock);
> +	list_add_tail(&bio->bi_pnode, &root->bi_plist);
> +	spin_unlock(&root->bi_plock);
> +}
> +
> +static inline void bio_del_poll_list(struct bio *bio)
> +{
> +	struct bio *root = bio->bi_root;
> +
> +	/*
> +	 * bios in mq routine: @bi_root is NULL, @bi_cookie is 0;
> +	 * bios in bio-based routine: @bi_root is non-NULL, @bi_cookie is valid
> +	 * (including 0) for those in root->bi_plist, invalid for the
> +	 * remaining.
> +	 */
> +	if (bio->bi_root && blk_qc_t_valid(bio->bi_cookie)) {
> +		spin_lock(&root->bi_plock);
> +		list_del(&bio->bi_pnode);
> +		spin_unlock(&root->bi_plock);
> +	}
> +}
> +
> +int blk_bio_poll(struct request_queue *q, blk_qc_t cookie);
> +
>  struct blk_rq_stat {
>  	u64 mean;
>  	u64 min;
> -- 
> 2.27.0
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
Jingbo Xu Jan. 8, 2021, 3:08 a.m. UTC | #2
Thanks for reviewing.


On 1/8/21 6:18 AM, Mike Snitzer wrote:
> On Wed, Dec 23 2020 at  6:26am -0500,
> Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
> 
>> This is actuaaly the core when supporting iopoll for bio-based device.
>>
>> A list is maintained in the top bio (the original bio submitted to dm
>> device), which is used to maintain all valid cookies of split bios. The
>> IO polling routine will actually iterate this list and poll on
>> corresponding hardware queues of the underlying mq devices.
>>
>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> 
> Like I said in response to patch 4 in this series: please fold patch 4
> into this patch and _really_ improve this patch header.
> 
> In particular, the (ab)use of bio_inc_remaining() needs be documented in
> this patch header very well.
> 
> But its use could easily be why you're seeing a performance hit (coupled
> with the extra spinlock locking and list management used).  Just added
> latency and contention across CPUs.

Indeed bio_inc_remaining() is abused here and the code seems quite hacky
here.

Actually I'm considering implementing the split bio tracking mechanism in
the recursive way you previously suggested. That is, the split bios could be
maintained in an array, which is allocated with 'struct dm_io'. This way
the overhead of spinlock protecting the &root->bi_plist may be omitted
here. Also the lifetime management may be simplified somehow. But the
block core needs to fetch the per-bio private data now, just like what
you had ever suggested before.

What do you think, Mike?


Besides, the lifetime management is quite annoying to me. As long as the
tracking object (representing a valid split bio) is dynamically
allocated, no matter if it's embedded directly in 'struct bio' (in this
patch), or allocated with 'struct dm_io', the lifetime management of the
tracking object comes in. Here the so called tracking object is
something like

struct node {
    struct blk_mq_hw_ctx *hctx;
    blk_qc_t cookie;
};


Actually currently the tracking objects are all allocated with 'struct
bio', then the lifetime management of the tracking objects is actually
equivalent to lifetime management of bio. Since the returned cookie is
actually a pointer to the bio, the refcount of this bio must be
incremented, since we release a reference to this bio through the
returned cookie, in which case the abuse of the refcount trick seems
unavoidable? Unless we allocate the tracking object individually, then
the returned cookie is actually pointing to the tracking object, and the
refcount is individually maintained for the tracking object.


> 
>> ---
>>  block/bio.c               |  8 ++++
>>  block/blk-core.c          | 84 ++++++++++++++++++++++++++++++++++++++-
>>  include/linux/blk_types.h | 39 ++++++++++++++++++
>>  3 files changed, 129 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/bio.c b/block/bio.c
>> index 1f2cc1fbe283..ca6d1a7ee196 100644
>> --- a/block/bio.c
>> +++ b/block/bio.c
>> @@ -284,6 +284,10 @@ void bio_init(struct bio *bio, struct bio_vec *table,
>>  
>>  	bio->bi_io_vec = table;
>>  	bio->bi_max_vecs = max_vecs;
>> +
>> +	INIT_LIST_HEAD(&bio->bi_plist);
>> +	INIT_LIST_HEAD(&bio->bi_pnode);
>> +	spin_lock_init(&bio->bi_plock);
>>  }
>>  EXPORT_SYMBOL(bio_init);
>>  
>> @@ -689,6 +693,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
>>  	bio->bi_write_hint = bio_src->bi_write_hint;
>>  	bio->bi_iter = bio_src->bi_iter;
>>  	bio->bi_io_vec = bio_src->bi_io_vec;
>> +	bio->bi_root = bio_src->bi_root;
>>  
>>  	bio_clone_blkg_association(bio, bio_src);
>>  	blkcg_bio_issue_init(bio);
>> @@ -1425,6 +1430,8 @@ void bio_endio(struct bio *bio)
>>  	if (bio->bi_disk)
>>  		rq_qos_done_bio(bio->bi_disk->queue, bio);
>>  
>> +	bio_del_poll_list(bio);
>> +
>>  	/*
>>  	 * Need to have a real endio function for chained bios, otherwise
>>  	 * various corner cases will break (like stacking block devices that
>> @@ -1446,6 +1453,7 @@ void bio_endio(struct bio *bio)
>>  	blk_throtl_bio_endio(bio);
>>  	/* release cgroup info */
>>  	bio_uninit(bio);
>> +
>>  	if (bio->bi_end_io)
>>  		bio->bi_end_io(bio);
>>  }
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 2f5c51ce32e3..5a332af01939 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -960,12 +960,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>  {
>>  	struct bio_list bio_list_on_stack[2];
>>  	blk_qc_t ret = BLK_QC_T_NONE;
>> +	bool iopoll;
>> +	struct bio *root;
>>  
>>  	BUG_ON(bio->bi_next);
>>  
>>  	bio_list_init(&bio_list_on_stack[0]);
>>  	current->bio_list = bio_list_on_stack;
>>  
>> +	iopoll = test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags);
>> +	iopoll = iopoll && (bio->bi_opf & REQ_HIPRI);
>> +
>> +	if (iopoll) {
>> +		bio->bi_root = root = bio;
>> +		/*
>> +		 * We need to pin root bio here since there's a reference from
>> +		 * the returned cookie. bio_get() is not enough since the whole
>> +		 * bio and the corresponding kiocb/dio may have already
>> +		 * completed and thus won't call blk_poll() at all, in which
>> +		 * case the pairing bio_put() in blk_bio_poll() won't be called.
>> +		 * The side effect of bio_inc_remaining() is that, the whole bio
>> +		 * won't complete until blk_poll() called.
>> +		 */
>> +		bio_inc_remaining(root);
>> +	}
>> +
>>  	do {
>>  		struct request_queue *q = bio->bi_disk->queue;
>>  		struct bio_list lower, same;
>> @@ -979,7 +998,18 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>>  		bio_list_init(&bio_list_on_stack[0]);
>>  
>> -		ret = __submit_bio(bio);
>> +		if (iopoll) {
>> +			/* See the comments of above bio_inc_remaining(). */
>> +			bio_inc_remaining(bio);
>> +			bio->bi_cookie = __submit_bio(bio);
>> +
>> +			if (blk_qc_t_valid(bio->bi_cookie))
>> +				bio_add_poll_list(bio);
>> +
>> +			bio_endio(bio);
>> +		} else {
>> +			ret = __submit_bio(bio);
>> +		}
>>  
>>  		/*
>>  		 * Sort new bios into those for a lower level and those for the
>> @@ -1002,7 +1032,11 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>  	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
>>  
>>  	current->bio_list = NULL;
>> -	return ret;
>> +
>> +	if (iopoll)
>> +		return (blk_qc_t)root;
>> +
>> +	return BLK_QC_T_NONE;
>>  }
>>  
>>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>> @@ -1131,6 +1165,52 @@ blk_qc_t submit_bio(struct bio *bio)
>>  }
>>  EXPORT_SYMBOL(submit_bio);
>>  
>> +int blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
>> +{
>> +	int ret = 0;
>> +	struct bio *bio, *root = (struct bio*)cookie;
>> +
>> +	if (list_empty(&root->bi_plist)) {
>> +		bio_endio(root);
>> +		return 1;
>> +	}
>> +
>> +	spin_lock(&root->bi_plock);
>> +	bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
>> +
>> +	while (bio) {
>> +		struct request_queue *q = bio->bi_disk->queue;
>> +		blk_qc_t cookie = bio->bi_cookie;
>> +
>> +		spin_unlock(&root->bi_plock);
>> +		BUG_ON(!blk_qc_t_valid(cookie));
>> +
>> +		ret += blk_mq_poll(q, cookie);
> 
> Not yet clear to me how you _know_ this q is blk-mq...
> What about a deep stack of bio-based DM devices?
> 

This design works in arbitrary bio-based DM stacking.


> Or are you confining bio-based DM IO polling support to bio-based
> stacked directly on blk-mq? (patch 7 likely shows that to be the case).
> 

patch 7 works in arbitrary bio-based DM stacking. Please see the reply
for patch 7 for details.


> If so, I'm not liking that at all.  So if your implementation doesn't
> support arbitrary bio-based IO stacks then this bio-based IO polling
> support really has limited utility still.
> 
> Helps explin how you got away with having bio-based DM always returning
> BLK_QC_T_NONE in patch 5 though... feels far too simplistic.  Patch 5+6
> together are implicitly ignoring the complexity that results from
> arbitrary bio-based DM stacking.
> 
> Or am I missing something?

The magic is in patch 5. Bios submitted directly to DM device won't be
enqueue into this &root->bi_plist list, since all bios submitted
directly to DM device will return BLK_QC_T_NONE since patch 5, and
__submit_bio_noacct() only enqueues split bios with valid cookie into
&root->bi_plist list. Thus only bios submitted to mq device will be
enqueued into this &root->bi_plist list.

Following is the related logic (the blk_qc_t_valid() part).


>> -		ret = __submit_bio(bio);
>> +		if (iopoll) {
>> +			/* See the comments of above bio_inc_remaining(). */
>> +			bio_inc_remaining(bio);
>> +			bio->bi_cookie = __submit_bio(bio);
>> +
>> +			if (blk_qc_t_valid(bio->bi_cookie))
>> +				bio_add_poll_list(bio);
>> +
>> +			bio_endio(bio);
>> +		} else {
>> +			ret = __submit_bio(bio);
>> +		}



Suppose we have the following device stack hierarchy, that is, dm0 is
stacked on dm1, while dm1 is stacked on nvme0 and nvme1.

    dm0
    dm1
nvme0  nvme1


Then the bio graph is like:


                                   +------------+
                                   |bio0(to dm0)|
                                   +------------+
                                         ^
                                         | orig_bio
                                   +--------------------+
                                   |struct dm_io of bio0|
+--------------------+ bi_private  ----------------------
|bio3(to dm1)        |------------>|bio1(to dm1)        |
+--------------------+             +--------------------+
        ^                                ^
        | ->orig_bio                     | ->orig_bio
+--------------------+             +--------------------+
|struct dm_io        |             |struct dm_io        |
----------------------             ----------------------
|bio2(to nvme0)      |             |bio4(to nvme1)      |
+--------------------+             +--------------------+

In this way, bio 0/1/3 will return BLK_QC_T_NONE and won't be enqueued
into &root->bi_plist list, while bio 2/4 will be enqueued if they return
valid cookie.

> 
> 
>> +
>> +		spin_lock(&root->bi_plock);
>> +		/*
>> +		 * One blk_mq_poll() call could complete multiple bios, and
>> +		 * thus multiple bios could be removed from root->bi_plock
>> +		 * list.
>> +		 */
>> +		bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
>> +	}
>> +
>> +	spin_unlock(&root->bi_plock);
>> +
>> +	if (list_empty(&root->bi_plist)) {
>> +		bio_endio(root);
>> +		/*
>> +		 * 'ret' may be 0 here. root->bi_plist may be empty once we
>> +		 * acquire the list spinlock.
>> +		 */
>> +		ret = max(ret, 1);
>> +	}
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(blk_bio_poll);
>> +
>>  static bool blk_poll_hybrid(struct request_queue *q, blk_qc_t cookie)
>>  {
>>  	struct blk_mq_hw_ctx *hctx;
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index 2e05244fc16d..2cf5d8f0ea34 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -277,6 +277,12 @@ struct bio {
>>  
>>  	struct bio_set		*bi_pool;
>>  
>> +	struct bio		*bi_root;	/* original bio of submit_bio() */
>> +	struct list_head        bi_plist;
>> +	struct list_head        bi_pnode;
>> +	struct spinlock         bi_plock;
>> +	blk_qc_t		bi_cookie;
>> +
>>  	/*
>>  	 * We can inline a number of vecs at the end of the bio, to avoid
>>  	 * double allocations for a small number of bio_vecs. This member
>> @@ -557,6 +563,39 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
>>  	return (cookie & BLK_QC_T_INTERNAL) != 0;
>>  }
>>  
>> +static inline void bio_add_poll_list(struct bio *bio)
>> +{
>> +	struct bio *root = bio->bi_root;
>> +
>> +	/*
>> +	 * The spin_lock() variant is enough since bios in root->bi_plist are
>> +	 * all enqueued into polling mode hardware queue, thus the list_del()
>> +	 * operation is handled only in process context.
>> +	 */
>> +	spin_lock(&root->bi_plock);
>> +	list_add_tail(&bio->bi_pnode, &root->bi_plist);
>> +	spin_unlock(&root->bi_plock);
>> +}
>> +
>> +static inline void bio_del_poll_list(struct bio *bio)
>> +{
>> +	struct bio *root = bio->bi_root;
>> +
>> +	/*
>> +	 * bios in mq routine: @bi_root is NULL, @bi_cookie is 0;
>> +	 * bios in bio-based routine: @bi_root is non-NULL, @bi_cookie is valid
>> +	 * (including 0) for those in root->bi_plist, invalid for the
>> +	 * remaining.
>> +	 */
>> +	if (bio->bi_root && blk_qc_t_valid(bio->bi_cookie)) {
>> +		spin_lock(&root->bi_plock);
>> +		list_del(&bio->bi_pnode);
>> +		spin_unlock(&root->bi_plock);
>> +	}
>> +}
>> +
>> +int blk_bio_poll(struct request_queue *q, blk_qc_t cookie);
>> +
>>  struct blk_rq_stat {
>>  	u64 mean;
>>  	u64 min;
>> -- 
>> 2.27.0
>>
>> --
>> dm-devel mailing list
>> dm-devel@redhat.com
>> https://www.redhat.com/mailman/listinfo/dm-devel
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
>
Mike Snitzer Jan. 8, 2021, 5:26 p.m. UTC | #3
On Thu, Jan 07 2021 at 10:08pm -0500,
JeffleXu <jefflexu@linux.alibaba.com> wrote:

> Thanks for reviewing.
> 
> 
> On 1/8/21 6:18 AM, Mike Snitzer wrote:
> > On Wed, Dec 23 2020 at  6:26am -0500,
> > Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
> > 
> >> This is actuaaly the core when supporting iopoll for bio-based device.
> >>
> >> A list is maintained in the top bio (the original bio submitted to dm
> >> device), which is used to maintain all valid cookies of split bios. The
> >> IO polling routine will actually iterate this list and poll on
> >> corresponding hardware queues of the underlying mq devices.
> >>
> >> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> > 
> > Like I said in response to patch 4 in this series: please fold patch 4
> > into this patch and _really_ improve this patch header.
> > 
> > In particular, the (ab)use of bio_inc_remaining() needs be documented in
> > this patch header very well.
> > 
> > But its use could easily be why you're seeing a performance hit (coupled
> > with the extra spinlock locking and list management used).  Just added
> > latency and contention across CPUs.
> 
> Indeed bio_inc_remaining() is abused here and the code seems quite hacky
> here.
> 
> Actually I'm regarding implementing the split bio tracking mechanism in
> a recursive way you had ever suggested. That is, the split bios could be
> maintained in an array, which is allocated with 'struct dm_io'. This way
> the overhead of spinlock protecting the &root->bi_plist may be omitted
> here. Also the lifetime management may be simplified somehow. But the
> block core needs to fetch the per-bio private data now, just like what
> you had ever suggested before.
> 
> How do you think, Mike?

Yes, using per-bio-data is a requirement (we cannot bloat 'struct bio').

As for using an array, how would you index the array?  blk-mq is able to
use an array (with cookie to hctx index translation) because there are a
finite number of fixed hctx for the life of the device.  But with
stacked bio-based DM devices, each DM table associated with a DM device
can change via table reload.  Any reloads should flush outstanding IO,
but there are cases where no flushing occurs (e.g. dm-multipath when no
paths are available, _but_ in that instance, there wouldn't be any
mapping that results in a blk-mq hctx endpoint).

All the DM edge cases aside, you need to ensure that the lifetime of the
per-bio-data that holds the 'struct node' (that you correctly detailed
needing below) doesn't somehow get used _after_ the hctx and/or cookie
are no longer valid.  So to start we'll need some BUG_ON() to validate
the lifetime is correct.

> Besides the lifetime management is quite annoying to me. As long as the
> tracking object representing a valid split bio) is dynamically
> allocated, no matter it's embedded directly in 'struct bio' (in this
> patch), or allocated with 'struct dm_io', the lifetime management of the
> tracking object comes in. Here the so called tracking object is
> something like
> 
> struct node {
>     struct blk_mq_hw_ctx *hctx;
>     blk_qc_t cookie;
> };

Needs a better name, think I had 'struct dm_poll_data'
 
> Actually currently the tracking objects are all allocated with 'struct
> bio', then the lifetime management of the tracking objects is actually
> equivalent to lifetime management of bio. Since the returned cookie is
> actually a pointer to the bio, the refcount of this bio must be
> incremented, since we release a reference to this bio through the
> returned cookie, in which case the abuse of the refcount trick seems
> unavoidable? Unless we allocate the tracking object individually, then
> the returned cookie is actually pointing to the tracking object, and the
> refcount is individually maintained for the tracking object.

The refcounting and lifetime of the per-bio-data should all work as is.
Would hope you can avoid the extra bio_inc_remaining()... that infrastructure
is way too tightly coupled to bio_chain()'ing, etc.

The challenge you have is the array that would point at these various
per-bio-data needs to be rooted somewhere (you put it in the topmost
original bio with the current patchset).  But why not manage that as
part of 'struct mapped_device'?  It'd need proper management at DM table
reload boundaries and such but it seems like the most logical place to
put the array.  But again, this array needs to be dynamic.. so thinking
further, maybe a better model would be to have a fixed array in 'struct
dm_table' for each hctx associated with a blk_mq _data_ device directly
used/managed by that dm_table?

And ideally, access to these arrays should be as lockless as possible
(rcu, or whatever) so that scaling to many cpus isn't a problem.

> >> ---
> >>  block/bio.c               |  8 ++++
> >>  block/blk-core.c          | 84 ++++++++++++++++++++++++++++++++++++++-
> >>  include/linux/blk_types.h | 39 ++++++++++++++++++
> >>  3 files changed, 129 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/block/bio.c b/block/bio.c
> >> index 1f2cc1fbe283..ca6d1a7ee196 100644
> >> --- a/block/bio.c
> >> +++ b/block/bio.c
> >> @@ -284,6 +284,10 @@ void bio_init(struct bio *bio, struct bio_vec *table,
> >>  
> >>  	bio->bi_io_vec = table;
> >>  	bio->bi_max_vecs = max_vecs;
> >> +
> >> +	INIT_LIST_HEAD(&bio->bi_plist);
> >> +	INIT_LIST_HEAD(&bio->bi_pnode);
> >> +	spin_lock_init(&bio->bi_plock);
> >>  }
> >>  EXPORT_SYMBOL(bio_init);
> >>  
> >> @@ -689,6 +693,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
> >>  	bio->bi_write_hint = bio_src->bi_write_hint;
> >>  	bio->bi_iter = bio_src->bi_iter;
> >>  	bio->bi_io_vec = bio_src->bi_io_vec;
> >> +	bio->bi_root = bio_src->bi_root;
> >>  
> >>  	bio_clone_blkg_association(bio, bio_src);
> >>  	blkcg_bio_issue_init(bio);
> >> @@ -1425,6 +1430,8 @@ void bio_endio(struct bio *bio)
> >>  	if (bio->bi_disk)
> >>  		rq_qos_done_bio(bio->bi_disk->queue, bio);
> >>  
> >> +	bio_del_poll_list(bio);
> >> +
> >>  	/*
> >>  	 * Need to have a real endio function for chained bios, otherwise
> >>  	 * various corner cases will break (like stacking block devices that
> >> @@ -1446,6 +1453,7 @@ void bio_endio(struct bio *bio)
> >>  	blk_throtl_bio_endio(bio);
> >>  	/* release cgroup info */
> >>  	bio_uninit(bio);
> >> +
> >>  	if (bio->bi_end_io)
> >>  		bio->bi_end_io(bio);
> >>  }
> >> diff --git a/block/blk-core.c b/block/blk-core.c
> >> index 2f5c51ce32e3..5a332af01939 100644
> >> --- a/block/blk-core.c
> >> +++ b/block/blk-core.c
> >> @@ -960,12 +960,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>  {
> >>  	struct bio_list bio_list_on_stack[2];
> >>  	blk_qc_t ret = BLK_QC_T_NONE;
> >> +	bool iopoll;
> >> +	struct bio *root;
> >>  
> >>  	BUG_ON(bio->bi_next);
> >>  
> >>  	bio_list_init(&bio_list_on_stack[0]);
> >>  	current->bio_list = bio_list_on_stack;
> >>  
> >> +	iopoll = test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags);
> >> +	iopoll = iopoll && (bio->bi_opf & REQ_HIPRI);
> >> +
> >> +	if (iopoll) {
> >> +		bio->bi_root = root = bio;
> >> +		/*
> >> +		 * We need to pin root bio here since there's a reference from
> >> +		 * the returned cookie. bio_get() is not enough since the whole
> >> +		 * bio and the corresponding kiocb/dio may have already
> >> +		 * completed and thus won't call blk_poll() at all, in which
> >> +		 * case the pairing bio_put() in blk_bio_poll() won't be called.
> >> +		 * The side effect of bio_inc_remaining() is that, the whole bio
> >> +		 * won't complete until blk_poll() called.
> >> +		 */
> >> +		bio_inc_remaining(root);
> >> +	}
> >> +
> >>  	do {
> >>  		struct request_queue *q = bio->bi_disk->queue;
> >>  		struct bio_list lower, same;
> >> @@ -979,7 +998,18 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>  		bio_list_on_stack[1] = bio_list_on_stack[0];
> >>  		bio_list_init(&bio_list_on_stack[0]);
> >>  
> >> -		ret = __submit_bio(bio);
> >> +		if (iopoll) {
> >> +			/* See the comments of above bio_inc_remaining(). */
> >> +			bio_inc_remaining(bio);
> >> +			bio->bi_cookie = __submit_bio(bio);
> >> +
> >> +			if (blk_qc_t_valid(bio->bi_cookie))
> >> +				bio_add_poll_list(bio);
> >> +
> >> +			bio_endio(bio);
> >> +		} else {
> >> +			ret = __submit_bio(bio);
> >> +		}
> >>  
> >>  		/*
> >>  		 * Sort new bios into those for a lower level and those for the
> >> @@ -1002,7 +1032,11 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>  	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
> >>  
> >>  	current->bio_list = NULL;
> >> -	return ret;
> >> +
> >> +	if (iopoll)
> >> +		return (blk_qc_t)root;
> >> +
> >> +	return BLK_QC_T_NONE;
> >>  }
> >>  
> >>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> >> @@ -1131,6 +1165,52 @@ blk_qc_t submit_bio(struct bio *bio)
> >>  }
> >>  EXPORT_SYMBOL(submit_bio);
> >>  
> >> +int blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> >> +{
> >> +	int ret = 0;
> >> +	struct bio *bio, *root = (struct bio*)cookie;
> >> +
> >> +	if (list_empty(&root->bi_plist)) {
> >> +		bio_endio(root);
> >> +		return 1;
> >> +	}
> >> +
> >> +	spin_lock(&root->bi_plock);
> >> +	bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
> >> +
> >> +	while (bio) {
> >> +		struct request_queue *q = bio->bi_disk->queue;
> >> +		blk_qc_t cookie = bio->bi_cookie;
> >> +
> >> +		spin_unlock(&root->bi_plock);
> >> +		BUG_ON(!blk_qc_t_valid(cookie));
> >> +
> >> +		ret += blk_mq_poll(q, cookie);
> > 
> > Not yet clear to me how you _know_ this q is blk-mq...
> > What about a deep stack of bio-based DM devices?
> > 
> 
> This design works in arbitrary bio-based DM stacking.
> 
> 
> > Or are you confining bio-based DM IO polling support to bio-based
> > stacked directly on blk-mq? (patch 7 likely shows that to be the case).
> > 
> 
> patch 7 works in arbitrary bio-based DM stacking. Please see the reply
> for patch 7 for details.

OK, I see.  Definitely need to capture that aspect of the design in the
relevant patch header(s).

And likely a block comment in blk_bio_poll().

> > If so, I'm not liking that at all.  So if your implementation doesn't
> > support arbitrary bio-based IO stacks then this bio-based IO polling
> > support really has limited utility still.
> > 
> > Helps explain how you got away with having bio-based DM always returning
> > BLK_QC_T_NONE in patch 5 though... feels far too simplistic.  Patch 5+6
> > together are implicitly ignoring the complexity that results from
> > arbitrary bio-based DM stacking.
> > 
> > Or am I missing something?
> 
> The magic is in patch 5. Bios submitted directly to DM device won't be
> enqueue into this &root->bi_plist list, since all bios submitted
> directly to DM device will return BLK_QC_T_NONE since patch 5, and
> __submit_bio_noacct() only enqueues split bios with valid cookie into
> &root->bi_plist list. Thus only bios submitted to mq device will be
> enqueued into this &root->bi_plist list.
> 
> Following is the related logic (the blk_qc_t_valid() part).
> 
> 
> >> -		ret = __submit_bio(bio);
> >> +		if (iopoll) {
> >> +			/* See the comments of above bio_inc_remaining(). */
> >> +			bio_inc_remaining(bio);
> >> +			bio->bi_cookie = __submit_bio(bio);
> >> +
> >> +			if (blk_qc_t_valid(bio->bi_cookie))
> >> +				bio_add_poll_list(bio);
> >> +
> >> +			bio_endio(bio);
> >> +		} else {
> >> +			ret = __submit_bio(bio);
> >> +		}
> 
> 
> 
> Suppose we have the following device stack hierarchy, that is, dm0 is
> stacked on dm1, while dm1 is stacked on nvme0 and nvme1.
> 
>     dm0
>     dm1
> nvme0  nvme1
> 
> 
> Then the bio graph is like:
> 
> 
>                                    +------------+
>                                    |bio0(to dm0)|
>                                    +------------+
>                                          ^
>                                          | orig_bio
>                                    +--------------------+
>                                    |struct dm_io of bio0|
> +--------------------+ bi_private  ----------------------
> |bio3(to dm1)        |------------>|bio1(to dm1)        |
> +--------------------+             +--------------------+
>         ^                                ^
>         | ->orig_bio                     | ->orig_bio
> +--------------------+             +--------------------+
> |struct dm_io        |             |struct dm_io        |
> ----------------------             ----------------------
> |bio2(to nvme0)      |             |bio4(to nvme1)      |
> +--------------------+             +--------------------+
> 
> In this way, bio 0/1/3 will return BLK_QC_T_NONE and won't be enqueued
> into &root->bi_plist list, while bio 2/4 will be enqueued if they return
> valid cookie.

Yes, useful insight, thanks.

Mike
Jingbo Xu Jan. 12, 2021, 5:46 a.m. UTC | #4
On 1/9/21 1:26 AM, Mike Snitzer wrote:
> On Thu, Jan 07 2021 at 10:08pm -0500,
> JeffleXu <jefflexu@linux.alibaba.com> wrote:
> 
>> Thanks for reviewing.
>>
>>
>> On 1/8/21 6:18 AM, Mike Snitzer wrote:
>>> On Wed, Dec 23 2020 at  6:26am -0500,
>>> Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
>>>
>>>> This is actually the core when supporting iopoll for bio-based device.
>>>>
>>>> A list is maintained in the top bio (the original bio submitted to dm
>>>> device), which is used to maintain all valid cookies of split bios. The
>>>> IO polling routine will actually iterate this list and poll on
>>>> corresponding hardware queues of the underlying mq devices.
>>>>
>>>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
>>>
>>> Like I said in response to patch 4 in this series: please fold patch 4
>>> into this patch and _really_ improve this patch header.
>>>
>>> In particular, the (ab)use of bio_inc_remaining() needs be documented in
>>> this patch header very well.
>>>
>>> But its use could easily be why you're seeing a performance hit (coupled
>>> with the extra spinlock locking and list management used).  Just added
>>> latency and contention across CPUs.
>>
>> Indeed bio_inc_remaining() is abused here and the code seems quite hacky
>> here.
>>
>> Actually I'm considering implementing the split bio tracking mechanism in
>> a recursive way you had ever suggested. That is, the split bios could be
>> maintained in an array, which is allocated with 'struct dm_io'. This way
>> the overhead of spinlock protecting the &root->bi_plist may be omitted
>> here. Also the lifetime management may be simplified somehow. But the
>> block core needs to fetch the per-bio private data now, just like what
>> you had ever suggested before.
>>
>> How do you think, Mike?
> 
> Yes, using per-bio-data is a requirement (we cannot bloat 'struct bio').

Agreed. Then MD will need some refactor to support IO polling, if
possible, since just like I mentioned in patch 0 before, MD doesn't
allocate extra clone bio, and just re-uses the original bio structure.


> 
> As for using an array, how would you index the array?  

The 'array' here is not an array of 'struct blk_mq_hw_ctx *' maintained
in struct dm_table as you mentioned. Actually what I mean is to maintain
an array of struct dm_poll_data (or something like that, e.g. just
struct blk_mq_hw_ctx *) in per-bio private data. The size of the array
just equals the number of the target devices.

For example, for the following device stack,

>>
>> Suppose we have the following device stack hierarchy, that is, dm0 is
>> stacked on dm1, while dm1 is stacked on nvme0 and nvme1.
>>
>>     dm0
>>     dm1
>> nvme0  nvme1
>>
>>
>> Then the bio graph is like:
>>
>>
>>                                    +------------+
>>                                    |bio0(to dm0)|
>>                                    +------------+
>>                                          ^
>>                                          | orig_bio
>>                                    +--------------------+
>>                                    |struct dm_io A      |
>> +--------------------+ bi_private  ----------------------
>> |bio3(to dm1)        |------------>|bio1(to dm1)        |
>> +--------------------+             +--------------------+
>>         ^                                ^
>>         | ->orig_bio                     | ->orig_bio
>> +--------------------+             +--------------------+
>> |struct dm_io        |             |struct dm_io B      |
>> ----------------------             ----------------------
>> |bio2(to nvme0)      |             |bio4(to nvme1)      |
>> +--------------------+             +--------------------+
>>

An array of struct blk_mq_hw_ctx * is maintained in struct dm_io B.


struct blk_mq_hw_ctx * hctxs[2];

The array size is two since dm1 maps to two target devices (i.e. nvme0
and nvme1). Then hctxs[0] points to the hw queue of nvme0, while
hctxs[1] points to the hw queue of nvme1.


This mechanism supports arbitrary device stacking. Similarly, an array
of struct blk_mq_hw_ctx * is maintained in struct dm_io A. The array
size is one since dm0 only maps to one target device (i.e. dm1). In this
case, hctx[0] points to the struct dm_io of the next level, i.e. struct
dm_io B.


But I'm afraid the implementation of this style may be more complex.


>> struct node {
>>     struct blk_mq_hw_ctx *hctx;
>>     blk_qc_t cookie;
>> };
> 
> Needs a better name, think I had 'struct dm_poll_data'

Sure, the name here is just for example.


>  
>> Actually currently the tracking objects are all allocated with 'struct
>> bio', then the lifetime management of the tracking objects is actually
>> equivalent to lifetime management of bio. Since the returned cookie is
>> actually a pointer to the bio, the refcount of this bio must be
>> incremented, since we release a reference to this bio through the
>> returned cookie, in which case the abuse of the refcount trick seems
>> unavoidable? Unless we allocate the tracking object individually, then
>> the returned cookie is actually pointing to the tracking object, and the
>> refcount is individually maintained for the tracking object.
> 
> The refcounting and lifetime of the per-bio-data should all work as is.
> > Would hope you can avoid extra bio_inc_remaining().. that infrastructure
> is way too tightly coupled to bio_chain()'ing, etc.
> 
> The challenge you have is the array that would point at these various
> per-bio-data needs to be rooted somewhere (you put it in the topmost
> original bio with the current patchset).  But why not manage that as
> part of 'struct mapped_device'?  It'd need proper management at DM table
> reload boundaries and such but it seems like the most logical place to
> put the array.  But again, this array needs to be dynamic.. so thinking
> further, maybe a better model would be to have a fixed array in 'struct
> dm_table' for each hctx associated with a blk_mq _data_ device directly
> used/managed by that dm_table?

It seems that you are referring 'array' here as an array of 'struct
blk_mq_hw_ctx *'? Such as

struct dm_table {
    ...
    struct blk_mq_hw_ctx *hctxs[];
};

Certainly with this we can replace the original 'struct blk_mq_hw_ctx *'
pointer in 'struct dm_poll_data' with the index into this array, such as

struct dm_poll_data {
     int hctx_index; /* index into dm_table->hctxs[] */
     blk_qc_t cookie;
};


But I doubt this makes much sense. The core difficulty here is
maintaining a list (or dynamic sized array) to track all split bios.
With the array of 'struct blk_mq_hw_ctx *' maintained in struct
dm_table, we still need some **per-bio** structure (e.g., &bio->bi_plist
in current patch set) to track these split bios.
Jingbo Xu Jan. 12, 2021, 7:11 a.m. UTC | #5
Actually I'm thinking why the first RFC version [1] I implemented can't
work.

Indeed iterating all hw queues of all target devices may not work well
as for performance. Then since we have an array of struct blk_mq_hw_ctx
* in struct dm_table now, we can also maintain a bitmap or something to
track if currently there's polling bio enqueued into these hw queues.

```
struct dm_table {
    ...
    struct blk_mq_hw_ctx *hctxs[];
};
```

Then when doing polling for one specific bio, we could iterate and poll
on all hw queues that are flagged pending (there are polling bios
enqueued into these hw queues).


However this may also increase the competition. Consider the following
device stacking,

```
       dm 0
nvme0  nvme1 nvme2
```

If bio 0 is actually enqueue into nvme0, and bio 1 is enqueued into
nvme1, then polling of bio 0 will poll on both nvme0 and nvme1, the same
for bio 1, which will increase the competition.

Then we can maintain a second bitmap tracking if there are polling
instances polling on these hw queues. Polling instance will skip these
hw queues flagged as busy (there's polling instance polling on this hw
queue currently).


By this way we don't need to implement the complex split bio tracking
mechanism anymore.

How about this approach, Mike?


[1] https://www.spinics.net/lists/dm-devel/msg43307.html


On 1/9/21 1:26 AM, Mike Snitzer wrote:
> On Thu, Jan 07 2021 at 10:08pm -0500,
> JeffleXu <jefflexu@linux.alibaba.com> wrote:
> 
>> Thanks for reviewing.
>>
>>
>> On 1/8/21 6:18 AM, Mike Snitzer wrote:
>>> On Wed, Dec 23 2020 at  6:26am -0500,
>>> Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
>>>
>>>> This is actually the core when supporting iopoll for bio-based device.
>>>>
>>>> A list is maintained in the top bio (the original bio submitted to dm
>>>> device), which is used to maintain all valid cookies of split bios. The
>>>> IO polling routine will actually iterate this list and poll on
>>>> corresponding hardware queues of the underlying mq devices.
>>>>
>>>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
>>>
>>> Like I said in response to patch 4 in this series: please fold patch 4
>>> into this patch and _really_ improve this patch header.
>>>
>>> In particular, the (ab)use of bio_inc_remaining() needs be documented in
>>> this patch header very well.
>>>
>>> But its use could easily be why you're seeing a performance hit (coupled
>>> with the extra spinlock locking and list management used).  Just added
>>> latency and contention across CPUs.
>>
>> Indeed bio_inc_remaining() is abused here and the code seems quite hacky
>> here.
>>
>> Actually I'm considering implementing the split bio tracking mechanism in
>> a recursive way you had ever suggested. That is, the split bios could be
>> maintained in an array, which is allocated with 'struct dm_io'. This way
>> the overhead of spinlock protecting the &root->bi_plist may be omitted
>> here. Also the lifetime management may be simplified somehow. But the
>> block core needs to fetch the per-bio private data now, just like what
>> you had ever suggested before.
>>
>> How do you think, Mike?
> 
> Yes, using per-bio-data is a requirement (we cannot bloat 'struct bio').
> 
> As for using an array, how would you index the array?  blk-mq is able to
> use an array (with cookie to hctx index translation) because there are a
> finite number of fixed hctx for the life of the device.  But with
> stacked bio-based DM devices, each DM table associated with a DM device
> can change via table reload.  Any reloads should flush outstanding IO,
> but there are cases where no flushing occurs (e.g. dm-multipath when no
> paths are available, _but_ in that instance, there wouldn't be any
> mapping that results in a blk-mq hctx endpoint).
> 
> All the DM edge cases aside, you need to ensure that the lifetime of the
> per-bio-data that holds the 'struct node' (that you correctly detailed
> needing below) doesn't somehow get used _after_ the hctx and/or cookie
> are no longer valid.  So to start we'll need some BUG_ON() to validate
> the lifetime is correct.
> 
>> Besides the lifetime management is quite annoying to me. As long as the
>> tracking object representing a valid split bio) is dynamically
>> allocated, no matter it's embedded directly in 'struct bio' (in this
>> patch), or allocated with 'struct dm_io', the lifetime management of the
>> tracking object comes in. Here the so called tracking object is
>> something like
>>
>> struct node {
>>     struct blk_mq_hw_ctx *hctx;
>>     blk_qc_t cookie;
>> };
> 
> Needs a better name, think I had 'struct dm_poll_data'
>  
>> Actually currently the tracking objects are all allocated with 'struct
>> bio', then the lifetime management of the tracking objects is actually
>> equivalent to lifetime management of bio. Since the returned cookie is
>> actually a pointer to the bio, the refcount of this bio must be
>> incremented, since we release a reference to this bio through the
>> returned cookie, in which case the abuse of the refcount trick seems
>> unavoidable? Unless we allocate the tracking object individually, then
>> the returned cookie is actually pointing to the tracking object, and the
>> refcount is individually maintained for the tracking object.
> 
> The refcounting and lifetime of the per-bio-data should all work as is.
> Would hope you can avoid extra bio_inc_remaining().. that infrastructure
> is way too tightly coupled to bio_chain()'ing, etc.
> 
> The challenge you have is the array that would point at these various
> per-bio-data needs to be rooted somewhere (you put it in the topmost
> original bio with the current patchset).  But why not manage that as
> part of 'struct mapped_device'?  It'd need proper management at DM table
> reload boundaries and such but it seems like the most logical place to
> put the array.  But again, this array needs to be dynamic.. so thinking
> further, maybe a better model would be to have a fixed array in 'struct
> dm_table' for each hctx associated with a blk_mq _data_ device directly
> used/managed by that dm_table?
> 
> And ideally, access to these arrays should be as lockless as possible
> (rcu, or whatever) so that scaling to many cpus isn't a problem.
> 
>>>> ---
>>>>  block/bio.c               |  8 ++++
>>>>  block/blk-core.c          | 84 ++++++++++++++++++++++++++++++++++++++-
>>>>  include/linux/blk_types.h | 39 ++++++++++++++++++
>>>>  3 files changed, 129 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/block/bio.c b/block/bio.c
>>>> index 1f2cc1fbe283..ca6d1a7ee196 100644
>>>> --- a/block/bio.c
>>>> +++ b/block/bio.c
>>>> @@ -284,6 +284,10 @@ void bio_init(struct bio *bio, struct bio_vec *table,
>>>>  
>>>>  	bio->bi_io_vec = table;
>>>>  	bio->bi_max_vecs = max_vecs;
>>>> +
>>>> +	INIT_LIST_HEAD(&bio->bi_plist);
>>>> +	INIT_LIST_HEAD(&bio->bi_pnode);
>>>> +	spin_lock_init(&bio->bi_plock);
>>>>  }
>>>>  EXPORT_SYMBOL(bio_init);
>>>>  
>>>> @@ -689,6 +693,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
>>>>  	bio->bi_write_hint = bio_src->bi_write_hint;
>>>>  	bio->bi_iter = bio_src->bi_iter;
>>>>  	bio->bi_io_vec = bio_src->bi_io_vec;
>>>> +	bio->bi_root = bio_src->bi_root;
>>>>  
>>>>  	bio_clone_blkg_association(bio, bio_src);
>>>>  	blkcg_bio_issue_init(bio);
>>>> @@ -1425,6 +1430,8 @@ void bio_endio(struct bio *bio)
>>>>  	if (bio->bi_disk)
>>>>  		rq_qos_done_bio(bio->bi_disk->queue, bio);
>>>>  
>>>> +	bio_del_poll_list(bio);
>>>> +
>>>>  	/*
>>>>  	 * Need to have a real endio function for chained bios, otherwise
>>>>  	 * various corner cases will break (like stacking block devices that
>>>> @@ -1446,6 +1453,7 @@ void bio_endio(struct bio *bio)
>>>>  	blk_throtl_bio_endio(bio);
>>>>  	/* release cgroup info */
>>>>  	bio_uninit(bio);
>>>> +
>>>>  	if (bio->bi_end_io)
>>>>  		bio->bi_end_io(bio);
>>>>  }
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index 2f5c51ce32e3..5a332af01939 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -960,12 +960,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>>>  {
>>>>  	struct bio_list bio_list_on_stack[2];
>>>>  	blk_qc_t ret = BLK_QC_T_NONE;
>>>> +	bool iopoll;
>>>> +	struct bio *root;
>>>>  
>>>>  	BUG_ON(bio->bi_next);
>>>>  
>>>>  	bio_list_init(&bio_list_on_stack[0]);
>>>>  	current->bio_list = bio_list_on_stack;
>>>>  
>>>> +	iopoll = test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags);
>>>> +	iopoll = iopoll && (bio->bi_opf & REQ_HIPRI);
>>>> +
>>>> +	if (iopoll) {
>>>> +		bio->bi_root = root = bio;
>>>> +		/*
>>>> +		 * We need to pin root bio here since there's a reference from
>>>> +		 * the returned cookie. bio_get() is not enough since the whole
>>>> +		 * bio and the corresponding kiocb/dio may have already
>>>> +		 * completed and thus won't call blk_poll() at all, in which
>>>> +		 * case the pairing bio_put() in blk_bio_poll() won't be called.
>>>> +		 * The side effect of bio_inc_remaining() is that, the whole bio
>>>> +		 * won't complete until blk_poll() called.
>>>> +		 */
>>>> +		bio_inc_remaining(root);
>>>> +	}
>>>> +
>>>>  	do {
>>>>  		struct request_queue *q = bio->bi_disk->queue;
>>>>  		struct bio_list lower, same;
>>>> @@ -979,7 +998,18 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>>>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>>>>  		bio_list_init(&bio_list_on_stack[0]);
>>>>  
>>>> -		ret = __submit_bio(bio);
>>>> +		if (iopoll) {
>>>> +			/* See the comments of above bio_inc_remaining(). */
>>>> +			bio_inc_remaining(bio);
>>>> +			bio->bi_cookie = __submit_bio(bio);
>>>> +
>>>> +			if (blk_qc_t_valid(bio->bi_cookie))
>>>> +				bio_add_poll_list(bio);
>>>> +
>>>> +			bio_endio(bio);
>>>> +		} else {
>>>> +			ret = __submit_bio(bio);
>>>> +		}
>>>>  
>>>>  		/*
>>>>  		 * Sort new bios into those for a lower level and those for the
>>>> @@ -1002,7 +1032,11 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>>>  	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
>>>>  
>>>>  	current->bio_list = NULL;
>>>> -	return ret;
>>>> +
>>>> +	if (iopoll)
>>>> +		return (blk_qc_t)root;
>>>> +
>>>> +	return BLK_QC_T_NONE;
>>>>  }
>>>>  
>>>>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>>>> @@ -1131,6 +1165,52 @@ blk_qc_t submit_bio(struct bio *bio)
>>>>  }
>>>>  EXPORT_SYMBOL(submit_bio);
>>>>  
>>>> +int blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
>>>> +{
>>>> +	int ret = 0;
>>>> +	struct bio *bio, *root = (struct bio*)cookie;
>>>> +
>>>> +	if (list_empty(&root->bi_plist)) {
>>>> +		bio_endio(root);
>>>> +		return 1;
>>>> +	}
>>>> +
>>>> +	spin_lock(&root->bi_plock);
>>>> +	bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
>>>> +
>>>> +	while (bio) {
>>>> +		struct request_queue *q = bio->bi_disk->queue;
>>>> +		blk_qc_t cookie = bio->bi_cookie;
>>>> +
>>>> +		spin_unlock(&root->bi_plock);
>>>> +		BUG_ON(!blk_qc_t_valid(cookie));
>>>> +
>>>> +		ret += blk_mq_poll(q, cookie);
>>>
>>> Not yet clear to me how you _know_ this q is blk-mq...
>>> What about a deep stack of bio-based DM devices?
>>>
>>
>> This design works in arbitrary bio-based DM stacking.
>>
>>
>>> Or are you confining bio-based DM IO polling support to bio-based
>>> stacked directly on blk-mq? (patch 7 likely shows that to be the case).
>>>
>>
>> patch 7 works in arbitrary bio-based DM stacking. Please see the reply
>> for patch 7 for details.
> 
> OK, I see.  Definitely need to capture that aspect of the design in the
> relevant patch header(s).
> 
> And likely a block comment in blk_bio_poll().
> 
>>> If so, I'm not liking that at all.  So if your implementation doesn't
>>> support arbitrary bio-based IO stacks then this bio-based IO polling
>>> support really has limited utility still.
>>>
>>> Helps explain how you got away with having bio-based DM always returning
>>> BLK_QC_T_NONE in patch 5 though... feels far too simplistic.  Patch 5+6
>>> together are implicitly ignoring the complexity that results from
>>> arbitrary bio-based DM stacking.
>>>
>>> Or am I missing something?
>>
>> The magic is in patch 5. Bios submitted directly to DM device won't be
>> enqueue into this &root->bi_plist list, since all bios submitted
>> directly to DM device will return BLK_QC_T_NONE since patch 5, and
>> __submit_bio_noacct() only enqueues split bios with valid cookie into
>> &root->bi_plist list. Thus only bios submitted to mq device will be
>> enqueued into this &root->bi_plist list.
>>
>> Following is the related logic (the blk_qc_t_valid() part).
>>
>>
>>>> -		ret = __submit_bio(bio);
>>>> +		if (iopoll) {
>>>> +			/* See the comments of above bio_inc_remaining(). */
>>>> +			bio_inc_remaining(bio);
>>>> +			bio->bi_cookie = __submit_bio(bio);
>>>> +
>>>> +			if (blk_qc_t_valid(bio->bi_cookie))
>>>> +				bio_add_poll_list(bio);
>>>> +
>>>> +			bio_endio(bio);
>>>> +		} else {
>>>> +			ret = __submit_bio(bio);
>>>> +		}
>>
>>
>>
>> Suppose we have the following device stack hierarchy, that is, dm0 is
>> stacked on dm1, while dm1 is stacked on nvme0 and nvme1.
>>
>>     dm0
>>     dm1
>> nvme0  nvme1
>>
>>
>> Then the bio graph is like:
>>
>>
>>                                    +------------+
>>                                    |bio0(to dm0)|
>>                                    +------------+
>>                                          ^
>>                                          | orig_bio
>>                                    +--------------------+
>>                                    |struct dm_io of bio0|
>> +--------------------+ bi_private  ----------------------
>> |bio3(to dm1)        |------------>|bio1(to dm1)        |
>> +--------------------+             +--------------------+
>>         ^                                ^
>>         | ->orig_bio                     | ->orig_bio
>> +--------------------+             +--------------------+
>> |struct dm_io        |             |struct dm_io        |
>> ----------------------             ----------------------
>> |bio2(to nvme0)      |             |bio4(to nvme1)      |
>> +--------------------+             +--------------------+
>>
>> In this way, bio 0/1/3 will return BLK_QC_T_NONE and won't be enqueued
>> into &root->bi_plist list, while bio 2/4 will be enqueued if they return
>> valid cookie.
> 
> Yes, useful insight, thanks.
> 
> Mike
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
>
Mike Snitzer Jan. 12, 2021, 4:13 p.m. UTC | #6
On Tue, Jan 12 2021 at 12:46am -0500,
JeffleXu <jefflexu@linux.alibaba.com> wrote:

> 
> 
> On 1/9/21 1:26 AM, Mike Snitzer wrote:
> > On Thu, Jan 07 2021 at 10:08pm -0500,
> > JeffleXu <jefflexu@linux.alibaba.com> wrote:
> > 
> >> Thanks for reviewing.
> >>
> >>
> >> On 1/8/21 6:18 AM, Mike Snitzer wrote:
> >>> On Wed, Dec 23 2020 at  6:26am -0500,
> >>> Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
> >>>
> >>>> This is actually the core when supporting iopoll for bio-based device.
> >>>>
> >>>> A list is maintained in the top bio (the original bio submitted to dm
> >>>> device), which is used to maintain all valid cookies of split bios. The
> >>>> IO polling routine will actually iterate this list and poll on
> >>>> corresponding hardware queues of the underlying mq devices.
> >>>>
> >>>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> >>>
> >>> Like I said in response to patch 4 in this series: please fold patch 4
> >>> into this patch and _really_ improve this patch header.
> >>>
> >>> In particular, the (ab)use of bio_inc_remaining() needs be documented in
> >>> this patch header very well.
> >>>
> >>> But its use could easily be why you're seeing a performance hit (coupled
> >>> with the extra spinlock locking and list management used).  Just added
> >>> latency and contention across CPUs.
> >>
> >> Indeed bio_inc_remaining() is abused here and the code seems quite hacky
> >> here.
> >>
> >> Actually I'm considering implementing the split bio tracking mechanism in
> >> a recursive way you had ever suggested. That is, the split bios could be
> >> maintained in an array, which is allocated with 'struct dm_io'. This way
> >> the overhead of spinlock protecting the &root->bi_plist may be omitted
> >> here. Also the lifetime management may be simplified somehow. But the
> >> block core needs to fetch the per-bio private data now, just like what
> >> you had ever suggested before.
> >>
> >> How do you think, Mike?
> > 
> > Yes, using per-bio-data is a requirement (we cannot bloat 'struct bio').
> 
> Agreed. Then MD will need some refactor to support IO polling, if
> possible, since just like I mentioned in patch 0 before, MD doesn't
> allocate extra clone bio, and just re-uses the original bio structure.
> 
> 
> > 
> > As for using an array, how would you index the array?  
> 
> The 'array' here is not an array of 'struct blk_mq_hw_ctx *' maintained
> in struct dm_table as you mentioned. Actually what I mean is to maintain
> an array of struct dm_poll_data (or something like that, e.g. just
> struct blk_mq_hw_ctx *) in per-bio private data. The size of the array
> just equals the number of the target devices.
> 
> For example, for the following device stack,
> 
> >>
> >> Suppose we have the following device stack hierarchy, that is, dm0 is
> >> stacked on dm1, while dm1 is stacked on nvme0 and nvme1.
> >>
> >>     dm0
> >>     dm1
> >> nvme0  nvme1
> >>
> >>
> >> Then the bio graph is like:
> >>
> >>
> >>                                    +------------+
> >>                                    |bio0(to dm0)|
> >>                                    +------------+
> >>                                          ^
> >>                                          | orig_bio
> >>                                    +--------------------+
> >>                                    |struct dm_io A      |
> >> +--------------------+ bi_private  ----------------------
> >> |bio3(to dm1)        |------------>|bio1(to dm1)        |
> >> +--------------------+             +--------------------+
> >>         ^                                ^
> >>         | ->orig_bio                     | ->orig_bio
> >> +--------------------+             +--------------------+
> >> |struct dm_io        |             |struct dm_io B      |
> >> ----------------------             ----------------------
> >> |bio2(to nvme0)      |             |bio4(to nvme1)      |
> >> +--------------------+             +--------------------+
> >>
> 
> An array of struct blk_mq_hw_ctx * is maintained in struct dm_io B.
> 
> 
> struct blk_mq_hw_ctx * hctxs[2];
> 
> The array size is two since dm1 maps to two target devices (i.e. nvme0
> and nvme1). Then hctxs[0] points to the hw queue of nvme0, while
> hctxs[1] points to the hw queue of nvme1.

Both nvme0 and nvme1 may have multiple hctxs.  Not sure why you're
thinking there is just one per device?

> 
> 
> This mechanism supports arbitrary device stacking. Similarly, an array
> of struct blk_mq_hw_ctx * is maintained in struct dm_io A. The array
> size is one since dm0 only maps to one target device (i.e. dm1). In this
> case, hctx[0] points to the struct dm_io of the next level, i.e. struct
> dm_io B.
> 
> 
> But I'm afraid the implementation of this style may be more complex.

We are running the risk of talking in circles about this design...


> >> struct node {
> >>     struct blk_mq_hw_ctx *hctx;
> >>     blk_qc_t cookie;
> >> };
> > 
> > Needs a better name, think I had 'struct dm_poll_data'
> 
> Sure, the name here is just for example.
> 
> 
> >  
> >> Actually currently the tracking objects are all allocated with 'struct
> >> bio', then the lifetime management of the tracking objects is actually
> >> equivalent to lifetime management of bio. Since the returned cookie is
> >> actually a pointer to the bio, the refcount of this bio must be
> >> incremented, since we release a reference to this bio through the
> >> returned cookie, in which case the abuse of the refcount trick seems
> >> unavoidable? Unless we allocate the tracking object individually, then
> >> the returned cookie is actually pointing to the tracking object, and the
> >> refcount is individually maintained for the tracking object.
> > 
> > The refcounting and lifetime of the per-bio-data should all work as is.
> > Would hope you can avoid extra bio_inc_remaining().. that infrastructure
> > is way too tightly coupled to bio_chain()'ing, etc.
> > 
> > The challenge you have is the array that would point at these various
> > per-bio-data needs to be rooted somewhere (you put it in the topmost
> > original bio with the current patchset).  But why not manage that as
> > part of 'struct mapped_device'?  It'd need proper management at DM table
> > reload boundaries and such but it seems like the most logical place to
> > put the array.  But again, this array needs to be dynamic.. so thinking
> > further, maybe a better model would be to have a fixed array in 'struct
> > dm_table' for each hctx associated with a blk_mq _data_ device directly
> > used/managed by that dm_table?
> 
> It seems that you are referring 'array' here as an array of 'struct
> blk_mq_hw_ctx *'? Such as
> 
> struct dm_table {
>     ...
>     struct blk_mq_hw_ctx *hctxs[];
> };
> 
> Certainly with this we can replace the original 'struct blk_mq_hw_ctx *'
> pointer in 'struct dm_poll_data' with the index into this array, such as
> 
> struct dm_poll_data {
>      int hctx_index; /* index into dm_table->hctxs[] */
>      blk_qc_t cookie;
> };

You seized on my mentioning blk-mq's array of hctx too literally.  I was
illustrating that blk-mq's cookie is converted to an index into that
array.

But for this DM bio-polling application we'd need to map the blk-mq
returned cookie to a request_queue.  Hence the original 2 members of
dm_poll_data needing to be 'struct request_queue *' and blk_qc_t.

> But I doubt this makes much sense. The core difficulty here is
> maintaining a list (or dynamic sized array) to track all split bios.
> With the array of 'struct blk_mq_hw_ctx *' maintained in struct
> dm_table, we still need some **per-bio** structure (e.g., &bio->bi_plist
> in current patch set) to track these split bios.

One primary goal of all of this design is to achieve bio-polling cleanly
(without extra locking, without block core data structure bloat, etc).
I know you share that goal.  But we need to nail down the core data
structures and what needs tracking at scale and then associate them with
DM's associated objects with consideration for object lifetime.

My suggestion was to anchor your core data structures (e.g. 'struct
dm_poll_data' array, etc) to 'struct dm_table'.  I suggested that
because the dm_table is what dm_get_device()s each underlying _data_
device (a subset of all devices in a dm_table, as iterated through
.iterate_devices).  But a DM 'struct mapped_device' has 2 potential
dm_tables, active and inactive slots, that would imply some complexity
in handing off any outstanding bio's associated 'struct dm_poll_data'
array on DM table reload.

Anyway, you seem to be gravitating to a more simplistic approach of a
single array of 'struct dm_poll_data' for each DM device (regardless of
how arbitrarily deep that DM device stack is, the topmost DM device
would accumulate the list of 'struct dm_poll_data'?).

I'm now questioning the need for any high-level data structure to track
all N of the 'struct dm_poll_data' that may result from a given bio (as
it is split to multiple blk-mq hctxs across multiple blk-mq devices).
Each 'struct dm_poll_data', that will be returned to block core and
stored in struct kiocb's ki_cookie, would have an object lifetime that
matches the original DM bio clone's per-bio-data that the 'struct
dm_poll_data' was part of; then we just need to cast that ki_cookie's
blk_qc_t as 'struct dm_poll_data' and call blk_poll().

The hardest part is to ensure that all the disparate 'struct
dm_poll_data' (and associated clone bios) aren't free'd until the
_original_ bio completes.  That would create quite some back-pressure
with more potential to exhaust system resources -- because then the
catalyst for dropping reference counts on these clone bios would then
need to be tied to the blk_bio_poll() interface... which feels "wrong"
(e.g. it ushers in the (ab)use of bio_inc_remaining you had in your most
recent patchset).

All said, maybe post a v2 that takes the incremental steps of:
1) using DM per-bio-data for 'struct dm_poll_data'
2) simplify blk_bio_poll() to call into DM to translate provided
   blk_qc_t (from struct kiocb's ki_cookie) to request_queue and
   blk_qc_t.
   - this eliminates any need for extra list processing
3) keep your (ab)use of bio_inc_remaining() to allow for exploring this

?

Thanks,
Mike
Jingbo Xu Jan. 14, 2021, 9:16 a.m. UTC | #7
On 1/13/21 12:13 AM, Mike Snitzer wrote:
> On Tue, Jan 12 2021 at 12:46am -0500,
> JeffleXu <jefflexu@linux.alibaba.com> wrote:
> 
>>
>>
>> On 1/9/21 1:26 AM, Mike Snitzer wrote:
>>> On Thu, Jan 07 2021 at 10:08pm -0500,
>>> JeffleXu <jefflexu@linux.alibaba.com> wrote:
>>>
>>>> Thanks for reviewing.
>>>>
>>>>
>>>> On 1/8/21 6:18 AM, Mike Snitzer wrote:
>>>>> On Wed, Dec 23 2020 at  6:26am -0500,
>>>>> Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
>>>>>
>>>>>> This is actuaaly the core when supporting iopoll for bio-based device.
>>>>>>
>>>>>> A list is maintained in the top bio (the original bio submitted to dm
>>>>>> device), which is used to maintain all valid cookies of split bios. The
>>>>>> IO polling routine will actually iterate this list and poll on
>>>>>> corresponding hardware queues of the underlying mq devices.
>>>>>>
>>>>>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
>>>>>
>>>>> Like I said in response to patch 4 in this series: please fold patch 4
>>>>> into this patch and _really_ improve this patch header.
>>>>>
>>>>> In particular, the (ab)use of bio_inc_remaining() needs be documented in
>>>>> this patch header very well.
>>>>>
>>>>> But its use could easily be why you're seeing a performance hit (coupled
>>>>> with the extra spinlock locking and list management used).  Just added
>>>>> latency and contention across CPUs.
>>>>
>>>> Indeed bio_inc_remaining() is abused here and the code seems quite hacky
>>>> here.
>>>>
>>>> Actually I'm regarding implementing the split bio tracking mechanism in
>>>> a recursive way you had ever suggested. That is, the split bios could be
>>>> maintained in an array, which is allocated with 'struct dm_io'. This way
>>>> the overhead of spinlock protecting the &root->bi_plist may be omitted
>>>> here. Also the lifetime management may be simplified somehow. But the
>>>> block core needs to fetch the per-bio private data now, just like what
>>>> you had ever suggested before.
>>>>
>>>> How do you think, Mike?
>>>
>>> Yes, using per-bio-data is a requirement (we cannot bloat 'struct bio').
>>
>> Agreed. Then MD will need some refactor to support IO polling, if
>> possible, since just like I mentioned in patch 0 before, MD doesn't
>> allocate extra clone bio, and just re-uses the original bio structure.
>>
>>
>>>
>>> As for using an array, how would you index the array?  
>>
>> The 'array' here is not an array of 'struct blk_mq_hw_ctx *' maintained
>> in struct dm_table as you mentioned. Actually what I mean is to maintain
>> an array of struct dm_poll_data (or something like that, e.g. just
>> struct blk_mq_hw_ctx *) in per-bio private data. The size of the array
>> just equals the number of the target devices.
>>
>> For example, for the following device stack,
>>
>>>>
>>>> Suppose we have the following device stack hierarchy, that is, dm0 is
>>>> stacked on dm1, while dm1 is stacked on nvme0 and nvme1.
>>>>
>>>>     dm0
>>>>     dm1
>>>> nvme0  nvme1
>>>>
>>>>
>>>> Then the bio graph is like:
>>>>
>>>>
>>>>                                    +------------+
>>>>                                    |bio0(to dm0)|
>>>>                                    +------------+
>>>>                                          ^
>>>>                                          | orig_bio
>>>>                                    +--------------------+
>>>>                                    |struct dm_io A      |
>>>> +--------------------+ bi_private  ----------------------
>>>> |bio3(to dm1)        |------------>|bio1(to dm1)        |
>>>> +--------------------+             +--------------------+
>>>>         ^                                ^
>>>>         | ->orig_bio                     | ->orig_bio
>>>> +--------------------+             +--------------------+
>>>> |struct dm_io        |             |struct dm_io B      |
>>>> ----------------------             ----------------------
>>>> |bio2(to nvme0)      |             |bio4(to nvme1)      |
>>>> +--------------------+             +--------------------+
>>>>
>>
>> An array of struct blk_mq_hw_ctx * is maintained in struct dm_io B.
>>
>>
>> struct blk_mq_hw_ctx * hctxs[2];
>>
>> The array size is two since dm1 maps to two target devices (i.e. nvme0
>> and nvme1). Then hctxs[0] points to the hw queue of nvme0, while
>> hctxs[1] points to the hw queue of nvme1.
> 
> Both nvme0 and nvme1 may have multiple hctxs.  Not sure why you're
> thinking there is just one per device?
> 
>>
>>
>> This mechanism supports arbitrary device stacking. Similarly, an array
>> of struct blk_mq_hw_ctx * is maintained in struct dm_io A. The array
>> size is one since dm0 only maps to one target device (i.e. dm1). In this
>> case, hctx[0] points to the struct dm_io of the next level, i.e. struct
>> dm_io B.
>>
>>
>> But I'm afraid the implementation of this style may be more complex.
> 
> We are running the risk of talking in circles about this design...

Sorry for the inconvenience. I have started working on the next version,
but I do want to clarify some design issues first.

> 
> 
>>>> struct node {
>>>>     struct blk_mq_hw_ctx *hctx;
>>>>     blk_qc_t cookie;
>>>> };
>>>
>>> Needs a better name, think I had 'struct dm_poll_data'
>>
>> Sure, the name here is just for example.
>>
>>
>>>  
>>>> Actually currently the tracking objects are all allocated with 'struct
>>>> bio', then the lifetime management of the tracking objects is actually
>>>> equivalent to lifetime management of bio. Since the returned cookie is
>>>> actually a pointer to the bio, the refcount of this bio must be
>>>> incremented, since we release a reference to this bio through the
>>>> returned cookie, in which case the abuse of the refcount trick seems
>>>> unavoidable? Unless we allocate the tracking object individually, then
>>>> the returned cookie is actually pointing to the tracking object, and the
>>>> refcount is individually maintained for the tracking object.
>>>
>>> The refcounting and lifetime of the per-bio-data should all work as is.
>>> Would hope you can avoid extra bio_inc_remaining().. that infratsructure
>>> is way too tightly coupled to bio_chain()'ing, etc.
>>>
>>> The challenge you have is the array that would point at these various
>>> per-bio-data needs to be rooted somewhere (you put it in the topmost
>>> original bio with the current patchset).  But why not manage that as
>>> part of 'struct mapped_device'?  It'd need proper management at DM table
>>> reload boundaries and such but it seems like the most logical place to
>>> put the array.  But again, this array needs to be dynamic.. so thinking
>>> further, maybe a better model would be to have a fixed array in 'struct
>>> dm_table' for each hctx associated with a blk_mq _data_ device directly
>>> used/managed by that dm_table?
>>

Confusion also stated in the following comment. How 'struct
dm_poll_data' could be involved with 'struct dm_table' or 'struct
mapped_device'. In the current patchset, every bio needs to maintain one
list to track all its 'struct dm_poll_data' structures. Then how to
maintain this per-bio information in one single 'struct dm_table' or
'struct mapped_device'?


>> It seems that you are referring 'array' here as an array of 'struct
>> blk_mq_hw_ctx *'? Such as
>>
>> struct dm_table {
>>     ...
>>     struct blk_mq_hw_ctx *hctxs[];
>> };
>>
>> Certainly with this we can replace the original 'struct blk_mq_hw_ctx *'
>> pointer in 'struct dm_poll_data' with the index into this array, such as
>>
>> struct dm_poll_data {
>>      int hctx_index; /* index into dm_table->hctxs[] */
>>      blk_qc_t cookie;
>> };
> 
> You seized on my mentioning blk-mq's array of hctx too literally.  I was
> illustrating that blk-mq's cookie is converted to an index into that
> array.
> 
> But for this DM bio-polling application we'd need to map the blk-mq
> returned cookie to a request_queue.  Hence the original 2 members of
> dm_poll_data needing to be 'struct request_queue *' and blk_qc_t.
> 
>> But I'm doubted if this makes much sense. The core difficulty here is
>> maintaining a list (or dynamic sized array) to track all split bios.
>> With the array of 'struct blk_mq_hw_ctx *' maintained in struct
>> dm_table, we still need some **per-bio** structure (e.g., &bio->bi_plist
>> in current patch set) to track these split bios.
> 
> One primary goal of all of this design is to achieve bio-polling cleanly
> (without extra locking, without block core data structure bloat, etc).
> I know you share that goal.  But we need to nail down the core data
> structures and what needs tracking at scale and then associate them with
> DM's associated objects with consideration for object lifetime.
> 
> My suggestion was to anchor your core data structures (e.g. 'struct
> dm_poll_data' array, etc) to 'struct dm_table'.  I suggested that
> because the dm_table is what dm_get_device()s each underlying _data_
> device (a subset of all devices in a dm_table, as iterated through
> .iterate_devices).  But a DM 'struct mapped_device' has 2 potential
> dm_tables, active and inactive slots, that would imply some complexity
> in handing off any outstanding bio's associated 'struct dm_poll_data'
> array on DM table reload.

1) If 'struct dm_poll_data' resides in per-bio-data, then how do you
**link** or **associate** all the 'struct dm_poll_data' structures from
one original bio? Do we link them by the internal relationship between
bio/dm_io/dm_target_io, or some other auxiliary data structure?

2) I get confused how 'struct dm_poll_data' could be involved with
'struct dm_table'. Is there an array of 'struct dm_poll_data' or 'struct
dm_poll_data *' maintained in 'struct dm_table'? If this is the case,
then the size of the array may be incredibly large, or expanded/shrunk
frequently, since one dm_table could correspond to millions bios.



> 
> Anyway, you seem to be gravitating to a more simplistic approach of a
> single array of 'struct dm_poll_data' for each DM device (regardless of
> how arbitrarily deep that DM device stack is, the topmost DM device
> would accumulate the list of 'struct dm_poll_data'?).

I'm open to this. At least you don't need to care the lifetime of other
disparate 'struct dm_poll_data's, if all 'struct dm_poll_data's are
accumulated in one (e.g., the topmost) place.


> 
> I'm now questioning the need for any high-level data structure to track
> all N of the 'struct dm_poll_data' that may result from a given bio (as
> it is split to multiple blk-mq hctxs across multiple blk-mq devices).
> Each 'struct dm_poll_data', that will be returned to block core and
> stored in struct kiocb's ki_cookie, would have an object lifetime that
> matches the original DM bio clone's per-bio-data that the 'struct
> dm_poll_data' was part of; then we just need to cast that ki_cookie's
> blk_qc_t as 'struct dm_poll_data' and call blk_poll().
> 
> The hardest part is to ensure that all the disparate 'struct
> dm_poll_data' (and associated clone bios) aren't free'd until the
> _original_ bio completes.  That would create quite some back-pressure
> with more potential to exhaust system resources -- because then the
> cataylst for dropping reference counts on these clone bios would then
> need to be tied to the blk_bio_poll() interface... which feels "wrong"
> (e.g. it ushers in the (ab)use of bio_inc_remaining you had in your most
> recent patchset).
> 
> All said, maybe post a v2 that takes the incremental steps of:
> 1) using DM per-bio-data for 'struct dm_poll_data'
> 2) simplify blk_bio_poll() to call into DM to translate provided
>    blk_qc_t (from struct kiocb's ki_cookie) to request_queue and
>    blk_qc_t.
>    - this eliminates any need for extra list processing
> 3) keep your (ab)use of bio_inc_remaining() to allow for exploring this
Mike Snitzer Jan. 14, 2021, 2:30 p.m. UTC | #8
On Thu, Jan 14 2021 at  4:16am -0500,
JeffleXu <jefflexu@linux.alibaba.com> wrote:

> 
> 
> On 1/13/21 12:13 AM, Mike Snitzer wrote:
> > On Tue, Jan 12 2021 at 12:46am -0500,
> > JeffleXu <jefflexu@linux.alibaba.com> wrote:
> > 
> >>
> >>
> >> On 1/9/21 1:26 AM, Mike Snitzer wrote:
> >>> On Thu, Jan 07 2021 at 10:08pm -0500,
> >>> JeffleXu <jefflexu@linux.alibaba.com> wrote:
> >>>
> >>>> Thanks for reviewing.
> >>>>
> >>>>
> >>>> On 1/8/21 6:18 AM, Mike Snitzer wrote:
> >>>>> On Wed, Dec 23 2020 at  6:26am -0500,
> >>>>> Jeffle Xu <jefflexu@linux.alibaba.com> wrote:
> >>>>>
> >>>>>> This is actuaaly the core when supporting iopoll for bio-based device.
> >>>>>>
> >>>>>> A list is maintained in the top bio (the original bio submitted to dm
> >>>>>> device), which is used to maintain all valid cookies of split bios. The
> >>>>>> IO polling routine will actually iterate this list and poll on
> >>>>>> corresponding hardware queues of the underlying mq devices.
> >>>>>>
> >>>>>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> >>>>>
> >>>>> Like I said in response to patch 4 in this series: please fold patch 4
> >>>>> into this patch and _really_ improve this patch header.
> >>>>>
> >>>>> In particular, the (ab)use of bio_inc_remaining() needs be documented in
> >>>>> this patch header very well.
> >>>>>
> >>>>> But its use could easily be why you're seeing a performance hit (coupled
> >>>>> with the extra spinlock locking and list management used).  Just added
> >>>>> latency and contention across CPUs.
> >>>>
> >>>> Indeed bio_inc_remaining() is abused here and the code seems quite hacky
> >>>> here.
> >>>>
> >>>> Actually I'm regarding implementing the split bio tracking mechanism in
> >>>> a recursive way you had ever suggested. That is, the split bios could be
> >>>> maintained in an array, which is allocated with 'struct dm_io'. This way
> >>>> the overhead of spinlock protecting the &root->bi_plist may be omitted
> >>>> here. Also the lifetime management may be simplified somehow. But the
> >>>> block core needs to fetch the per-bio private data now, just like what
> >>>> you had ever suggested before.
> >>>>
> >>>> How do you think, Mike?
> >>>
> >>> Yes, using per-bio-data is a requirement (we cannot bloat 'struct bio').
> >>
> >> Agreed. Then MD will need some refactor to support IO polling, if
> >> possible, since just like I mentioned in patch 0 before, MD doesn't
> >> allocate extra clone bio, and just re-uses the original bio structure.
> >>
> >>
> >>>
> >>> As for using an array, how would you index the array?  
> >>
> >> The 'array' here is not an array of 'struct blk_mq_hw_ctx *' maintained
> >> in struct dm_table as you mentioned. Actually what I mean is to maintain
> >> an array of struct dm_poll_data (or something like that, e.g. just
> >> struct blk_mq_hw_ctx *) in per-bio private data. The size of the array
> >> just equals the number of the target devices.
> >>
> >> For example, for the following device stack,
> >>
> >>>>
> >>>> Suppose we have the following device stack hierarchy, that is, dm0 is
> >>>> stacked on dm1, while dm1 is stacked on nvme0 and nvme1.
> >>>>
> >>>>     dm0
> >>>>     dm1
> >>>> nvme0  nvme1
> >>>>
> >>>>
> >>>> Then the bio graph is like:
> >>>>
> >>>>
> >>>>                                    +------------+
> >>>>                                    |bio0(to dm0)|
> >>>>                                    +------------+
> >>>>                                          ^
> >>>>                                          | orig_bio
> >>>>                                    +--------------------+
> >>>>                                    |struct dm_io A      |
> >>>> +--------------------+ bi_private  ----------------------
> >>>> |bio3(to dm1)        |------------>|bio1(to dm1)        |
> >>>> +--------------------+             +--------------------+
> >>>>         ^                                ^
> >>>>         | ->orig_bio                     | ->orig_bio
> >>>> +--------------------+             +--------------------+
> >>>> |struct dm_io        |             |struct dm_io B      |
> >>>> ----------------------             ----------------------
> >>>> |bio2(to nvme0)      |             |bio4(to nvme1)      |
> >>>> +--------------------+             +--------------------+
> >>>>
> >>
> >> An array of struct blk_mq_hw_ctx * is maintained in struct dm_io B.
> >>
> >>
> >> struct blk_mq_hw_ctx * hctxs[2];
> >>
> >> The array size is two since dm1 maps to two target devices (i.e. nvme0
> >> and nvme1). Then hctxs[0] points to the hw queue of nvme0, while
> >> hctxs[1] points to the hw queue of nvme1.
> > 
> > Both nvme0 and nvme1 may have multiple hctxs.  Not sure why you're
> > thinking there is just one per device?
> > 
> >>
> >>
> >> This mechanism supports arbitrary device stacking. Similarly, an array
> >> of struct blk_mq_hw_ctx * is maintained in struct dm_io A. The array
> >> size is one since dm0 only maps to one target device (i.e. dm1). In this
> >> case, hctx[0] points to the struct dm_io of the next level, i.e. struct
> >> dm_io B.
> >>
> >>
> >> But I'm afraid the implementation of this style may be more complex.
> > 
> > We are running the risk of talking in circles about this design...
> 
> Sorry for the inconvenience. I have started working on the next version,
> but I do want to clarify some design issues first.
> 
> > 
> > 
> >>>> struct node {
> >>>>     struct blk_mq_hw_ctx *hctx;
> >>>>     blk_qc_t cookie;
> >>>> };
> >>>
> >>> Needs a better name, think I had 'struct dm_poll_data'
> >>
> >> Sure, the name here is just for example.
> >>
> >>
> >>>  
> >>>> Actually currently the tracking objects are all allocated with 'struct
> >>>> bio', then the lifetime management of the tracking objects is actually
> >>>> equivalent to lifetime management of bio. Since the returned cookie is
> >>>> actually a pointer to the bio, the refcount of this bio must be
> >>>> incremented, since we release a reference to this bio through the
> >>>> returned cookie, in which case the abuse of the refcount trick seems
> >>>> unavoidable? Unless we allocate the tracking object individually, then
> >>>> the returned cookie is actually pointing to the tracking object, and the
> >>>> refcount is individually maintained for the tracking object.
> >>>
> >>> The refcounting and lifetime of the per-bio-data should all work as is.
> >>> Would hope you can avoid extra bio_inc_remaining().. that infratsructure
> >>> is way too tightly coupled to bio_chain()'ing, etc.
> >>>
> >>> The challenge you have is the array that would point at these various
> >>> per-bio-data needs to be rooted somewhere (you put it in the topmost
> >>> original bio with the current patchset).  But why not manage that as
> >>> part of 'struct mapped_device'?  It'd need proper management at DM table
> >>> reload boundaries and such but it seems like the most logical place to
> >>> put the array.  But again, this array needs to be dynamic.. so thinking
> >>> further, maybe a better model would be to have a fixed array in 'struct
> >>> dm_table' for each hctx associated with a blk_mq _data_ device directly
> >>> used/managed by that dm_table?
> >>
> 
> Confusion also stated in the following comment. How 'struct
> dm_poll_data' could be involved with 'struct dm_table' or 'struct
> mapped_device'. In the current patchset, every bio need to maintain one
> list to track all its 'struct dm_poll_data' structures. Then how to
> maintain this per-bio information in one single 'struct dm_table' or
> 'struct mapped_device'?
> 
> 
> >> It seems that you are referring 'array' here as an array of 'struct
> >> blk_mq_hw_ctx *'? Such as
> >>
> >> struct dm_table {
> >>     ...
> >>     struct blk_mq_hw_ctx *hctxs[];
> >> };
> >>
> >> Certainly with this we can replace the original 'struct blk_mq_hw_ctx *'
> >> pointer in 'struct dm_poll_data' with the index into this array, such as
> >>
> >> struct dm_poll_data {
> >>      int hctx_index; /* index into dm_table->hctxs[] */
> >>      blk_qc_t cookie;
> >> };
> > 
> > You seized on my mentioning blk-mq's array of hctx too literally.  I was
> > illustrating that blk-mq's cookie is converted to an index into that
> > array.
> > 
> > But for this DM bio-polling application we'd need to map the blk-mq
> > returned cookie to a request_queue.  Hence the original 2 members of
> > dm_poll_data needing to be 'struct request_queue *' and blk_qc_t.
> > 
> >> But I'm doubted if this makes much sense. The core difficulty here is
> >> maintaining a list (or dynamic sized array) to track all split bios.
> >> With the array of 'struct blk_mq_hw_ctx *' maintained in struct
> >> dm_table, we still need some **per-bio** structure (e.g., &bio->bi_plist
> >> in current patch set) to track these split bios.
> > 
> > One primary goal of all of this design is to achieve bio-polling cleanly
> > (without extra locking, without block core data structure bloat, etc).
> > I know you share that goal.  But we need to nail down the core data
> > structures and what needs tracking at scale and then associate them with
> > DM's associated objects with consideration for object lifetime.
> > 
> > My suggestion was to anchor your core data structures (e.g. 'struct
> > dm_poll_data' array, etc) to 'struct dm_table'.  I suggested that
> > because the dm_table is what dm_get_device()s each underlying _data_
> > device (a subset of all devices in a dm_table, as iterated through
> > .iterate_devices).  But a DM 'struct mapped_device' has 2 potential
> > dm_tables, active and inactive slots, that would imply some complexity
> > in handing off any outstanding bio's associated 'struct dm_poll_data'
> > array on DM table reload.
> 
> 1) If 'struct dm_poll_data' resides in per-bio-data, then how do you
> **link** or **associate** all the 'struct dm_poll_data' structures from
> one original bio? Do we link them by the internal relationship between
> bio/dm_io/dm_target_io, or some other auxiliary data structure?
> 
> 2) I get confused how 'struct dm_poll_data' could be involved with
> 'struct dm_table'. Is there an array of 'struct dm_poll_data' or 'struct
> dm_poll_data *' maintained in 'struct dm_table'? If this is the case,
> then the size of the array may be incredible large, or expanded/shrank
> frequently, since one dm_table could correspond to millions bios.

My line of thinking didn't account for the fan-out of clone bios and
the 'struct dm_poll_data' associated with each needing to be tracked
with an auxiliary data structure.  I was just thinking in terms of a
single cookie for each bio.  That model works for blk-mq because a
request isn't ever split.

So I had a blind spot / hoped we could avoid the complexity but I was
mistaken.

> > Anyway, you seem to be gravitating to a more simplistic approach of a
> > single array of 'struct dm_poll_data' for each DM device (regardless of
> > how arbitrarily deep that DM device stack is, the topmost DM device
> > would accumulate the list of 'struct dm_poll_data'?).
> 
> I'm open to this. At least you don't need to care the lifetime of other
> disparate 'struct dm_poll_data's, if all 'struct dm_poll_data's are
> accumulated in one (e.g., the topmost) place.

Treating the entire IO stack as if it can all be accumulated/managed in
a single pool of objects is dangerous.  It ushers in serious lifetime
problems associated with completion of IO that must occur in order for
DM targets to work as designed.  Waiting for a chain of bios to complete
at various layers is fine.  But if that chain spans targets boundaries
I think we could easily introduce problems.

So not only am I struggling to see how we avoid a data structure to
track all split bios' dm_poll_data: I also don't yet see how we can
safely allow per-bio-data to linger waiting for blk_bio_poll() to
eventually reap bios whose completion has been delayed for IO polling's
benefit.

This IO polling model is really awkward to apply to bio-based IO.

Mike

> > I'm now questioning the need for any high-level data structure to track
> > all N of the 'struct dm_poll_data' that may result from a given bio (as
> > it is split to multiple blk-mq hctxs across multiple blk-mq devices).
> > Each 'struct dm_poll_data', that will be returned to block core and
> > stored in struct kiocb's ki_cookie, would have an object lifetime that
> > matches the original DM bio clone's per-bio-data that the 'struct
> > dm_poll_data' was part of; then we just need to cast that ki_cookie's
> > blk_qc_t as 'struct dm_poll_data' and call blk_poll().
> > 
> > The hardest part is to ensure that all the disparate 'struct
> > dm_poll_data' (and associated clone bios) aren't free'd until the
> > _original_ bio completes.  That would create quite some back-pressure
> > with more potential to exhaust system resources -- because then the
> > cataylst for dropping reference counts on these clone bios would then
> > need to be tied to the blk_bio_poll() interface... which feels "wrong"
> > (e.g. it ushers in the (ab)use of bio_inc_remaining you had in your most
> > recent patchset).
> > 
> > All said, maybe post a v2 that takes the incremental steps of:
> > 1) using DM per-bio-data for 'struct dm_poll_data'
> > 2) simplify blk_bio_poll() to call into DM to translate provided
> >    blk_qc_t (from struct kiocb's ki_cookie) to request_queue and
> >    blk_qc_t.
> >    - this eliminates any need for extra list processing
> > 3) keep your (ab)use of bio_inc_remaining() to allow for exploring this 
> 
> -- 
> Thanks,
> Jeffle
>
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index 1f2cc1fbe283..ca6d1a7ee196 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -284,6 +284,10 @@  void bio_init(struct bio *bio, struct bio_vec *table,
 
 	bio->bi_io_vec = table;
 	bio->bi_max_vecs = max_vecs;
+
+	INIT_LIST_HEAD(&bio->bi_plist);
+	INIT_LIST_HEAD(&bio->bi_pnode);
+	spin_lock_init(&bio->bi_plock);
 }
 EXPORT_SYMBOL(bio_init);
 
@@ -689,6 +693,7 @@  void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_write_hint = bio_src->bi_write_hint;
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
+	bio->bi_root = bio_src->bi_root;
 
 	bio_clone_blkg_association(bio, bio_src);
 	blkcg_bio_issue_init(bio);
@@ -1425,6 +1430,8 @@  void bio_endio(struct bio *bio)
 	if (bio->bi_disk)
 		rq_qos_done_bio(bio->bi_disk->queue, bio);
 
+	bio_del_poll_list(bio);
+
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
 	 * various corner cases will break (like stacking block devices that
@@ -1446,6 +1453,7 @@  void bio_endio(struct bio *bio)
 	blk_throtl_bio_endio(bio);
 	/* release cgroup info */
 	bio_uninit(bio);
+
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
diff --git a/block/blk-core.c b/block/blk-core.c
index 2f5c51ce32e3..5a332af01939 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -960,12 +960,31 @@  static blk_qc_t __submit_bio_noacct(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
+	bool iopoll;
+	struct bio *root;
 
 	BUG_ON(bio->bi_next);
 
 	bio_list_init(&bio_list_on_stack[0]);
 	current->bio_list = bio_list_on_stack;
 
+	iopoll = test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags);
+	iopoll = iopoll && (bio->bi_opf & REQ_HIPRI);
+
+	if (iopoll) {
+		bio->bi_root = root = bio;
+		/*
+		 * We need to pin root bio here since there's a reference from
+		 * the returned cookie. bio_get() is not enough since the whole
+		 * bio and the corresponding kiocb/dio may have already
+		 * completed and thus won't call blk_poll() at all, in which
+		 * case the pairing bio_put() in blk_bio_poll() won't be called.
+		 * The side effect of bio_inc_remaining() is that, the whole bio
+		 * won't complete until blk_poll() called.
+		 */
+		bio_inc_remaining(root);
+	}
+
 	do {
 		struct request_queue *q = bio->bi_disk->queue;
 		struct bio_list lower, same;
@@ -979,7 +998,18 @@  static blk_qc_t __submit_bio_noacct(struct bio *bio)
 		bio_list_on_stack[1] = bio_list_on_stack[0];
 		bio_list_init(&bio_list_on_stack[0]);
 
-		ret = __submit_bio(bio);
+		if (iopoll) {
+			/* See the comments of above bio_inc_remaining(). */
+			bio_inc_remaining(bio);
+			bio->bi_cookie = __submit_bio(bio);
+
+			if (blk_qc_t_valid(bio->bi_cookie))
+				bio_add_poll_list(bio);
+
+			bio_endio(bio);
+		} else {
+			ret = __submit_bio(bio);
+		}
 
 		/*
 		 * Sort new bios into those for a lower level and those for the
@@ -1002,7 +1032,11 @@  static blk_qc_t __submit_bio_noacct(struct bio *bio)
 	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
 
 	current->bio_list = NULL;
-	return ret;
+
+	if (iopoll)
+		return (blk_qc_t)root;
+
+	return BLK_QC_T_NONE;
 }
 
 static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
@@ -1131,6 +1165,52 @@  blk_qc_t submit_bio(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
+int blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
+{
+	int ret = 0;
+	struct bio *bio, *root = (struct bio*)cookie;
+
+	if (list_empty(&root->bi_plist)) {
+		bio_endio(root);
+		return 1;
+	}
+
+	spin_lock(&root->bi_plock);
+	bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
+
+	while (bio) {
+		struct request_queue *q = bio->bi_disk->queue;
+		blk_qc_t cookie = bio->bi_cookie;
+
+		spin_unlock(&root->bi_plock);
+		BUG_ON(!blk_qc_t_valid(cookie));
+
+		ret += blk_mq_poll(q, cookie);
+
+		spin_lock(&root->bi_plock);
+		/*
+		 * One blk_mq_poll() call could complete multiple bios, and
+		 * thus multiple bios could be removed from the root->bi_plist
+		 * list.
+		 */
+		bio = list_first_entry_or_null(&root->bi_plist, struct bio, bi_pnode);
+	}
+
+	spin_unlock(&root->bi_plock);
+
+	if (list_empty(&root->bi_plist)) {
+		bio_endio(root);
+		/*
+		 * 'ret' may still be 0 here: root->bi_plist may already have
+		 * been emptied by the time we acquired the list spinlock.
+		 */
+		ret = max(ret, 1);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(blk_bio_poll);
+
 static bool blk_poll_hybrid(struct request_queue *q, blk_qc_t cookie)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 2e05244fc16d..2cf5d8f0ea34 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -277,6 +277,12 @@  struct bio {
 
 	struct bio_set		*bi_pool;
 
+	struct bio		*bi_root;	/* original bio of submit_bio() */
+	struct list_head        bi_plist;
+	struct list_head        bi_pnode;
+	struct spinlock         bi_plock;
+	blk_qc_t		bi_cookie;
+
 	/*
 	 * We can inline a number of vecs at the end of the bio, to avoid
 	 * double allocations for a small number of bio_vecs. This member
@@ -557,6 +563,39 @@  static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
 	return (cookie & BLK_QC_T_INTERNAL) != 0;
 }
 
+static inline void bio_add_poll_list(struct bio *bio)
+{
+	struct bio *root = bio->bi_root;
+
+	/*
+	 * The plain spin_lock() variant suffices since bios on root->bi_plist
+	 * are all enqueued into polling-mode hardware queues, so the
+	 * list_del() operation runs only in process context.
+	 */
+	spin_lock(&root->bi_plock);
+	list_add_tail(&bio->bi_pnode, &root->bi_plist);
+	spin_unlock(&root->bi_plock);
+}
+
+static inline void bio_del_poll_list(struct bio *bio)
+{
+	struct bio *root = bio->bi_root;
+
+	/*
+	 * bios in the mq path: @bi_root is NULL, @bi_cookie is 0;
+	 * bios in the bio-based path: @bi_root is non-NULL, @bi_cookie is
+	 * valid (including 0) for those on root->bi_plist, invalid for the
+	 * rest.
+	 */
+	if (bio->bi_root && blk_qc_t_valid(bio->bi_cookie)) {
+		spin_lock(&root->bi_plock);
+		list_del(&bio->bi_pnode);
+		spin_unlock(&root->bi_plock);
+	}
+}
+
+int blk_bio_poll(struct request_queue *q, blk_qc_t cookie);
+
 struct blk_rq_stat {
 	u64 mean;
 	u64 min;