
[RFC,V2,09/13] block: use per-task poll context to implement bio based io poll

Message ID 20210318164827.1481133-10-ming.lei@redhat.com (mailing list archive)
State New, archived
Series: block: support bio based io polling

Commit Message

Ming Lei March 18, 2021, 4:48 p.m. UTC
Currently bio based IO polling has to poll all hw queues blindly, which is
very inefficient. The main reason is that the result of bio submission
can't be passed to the io poll task.

In the IO submission context, track the associated underlying bios in a
per-task submission queue, save the 'cookie' poll data in
bio->bi_iter.bi_private_data, and return current->pid to the caller of
submit_bio() for any FS-submitted IO on a bio based driver.

In the IO poll context, the passed cookie tells us the PID of the
submission context, so we can find the bios from that submission context.
Move the bios from the submission queue to the poll queue of the poll
context, and keep polling until they are ended. Remove a bio from the poll
queue once it is ended. Add BIO_DONE and BIO_END_BY_POLL for this purpose.

In the previous version a kfifo was used to implement the submission
queue, and Jeffle Xu found that kfifo can't scale well at high queue
depth. A bio is already close to two cachelines in size, so adding a new
field just to track bios via a linked list is likely not acceptable.
Instead, switch to a bio group list for tracking bios: reuse .bi_end_io to
link all bios sharing the same .bi_end_io (call it a bio group) into a
list, and recover the original .bi_end_io before really ending the bio;
BIO_END_BY_POLL guarantees the bio isn't ended before that recovery.
Usually .bi_end_io is the same for all bios in the same layer, so a very
limited number of groups, such as 32, is enough to fix the scalability
issue.

Submission usually shares its context with io poll. The per-task poll
context behaves like a stack variable, and moving data between the two
per-task queues is cheap.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/bio.c               |   5 ++
 block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
 block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
 block/blk.h               |   9 ++
 include/linux/blk_types.h |  16 +++-
 5 files changed, 348 insertions(+), 4 deletions(-)
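
To illustrate the grouping trick described above, a small stand-alone model
(not kernel code; all names below are illustrative, the real structures are
bio_grp_list and bio_grp_list_data in the patch quoted in the comments)
might look like this:

    /*
     * Stand-alone model of the "bio group" idea: while a bio is tracked for
     * polling, the ->bi_end_io slot is reused as a link pointer, and the
     * shared end_io value is remembered once per group so it can be
     * restored before the bio is really ended.
     */
    #include <stddef.h>

    struct bio {
            union {
                    void (*bi_end_io)(struct bio *);
                    struct bio *bi_poll;    /* link while the bio sits in a group */
            };
    };

    struct bio_grp {
            void *grp_data;         /* shared ->bi_end_io of this group, saved once */
            struct bio *head;       /* bios threaded through ->bi_poll */
    };

    #define NR_GRPS 32

    /* the union viewed as a plain pointer, used as the group key */
    static void *grp_key(struct bio *bio)
    {
            return bio->bi_poll;
    }

    /* add a bio to the group sharing its end_io, or open a new group */
    static int grp_add(struct bio_grp *grps, int *nr, struct bio *bio)
    {
            void *key = grp_key(bio);
            int i;

            for (i = 0; i < *nr; i++)
                    if (grps[i].grp_data == key)
                            goto link;
            if (i == NR_GRPS)
                    return 0;       /* no free group: fall back to non-polled IO */
            grps[i].grp_data = key;
            grps[i].head = NULL;
            (*nr)++;
    link:
            bio->bi_poll = grps[i].head;    /* ->bi_end_io is now free to reuse */
            grps[i].head = bio;
            return 1;
    }

    /* once a grouped bio is done, restore the original end_io and call it */
    static void grp_end_one(struct bio_grp *grp, struct bio *bio)
    {
            bio->bi_poll = grp->grp_data;   /* recover the saved ->bi_end_io */
            bio->bi_end_io(bio);
    }

In the patch itself this is what bio_grp_list_add() and the reap loop in
blk_bio_poll_and_end_io() do, with BIO_END_BY_POLL making sure bio_endio()
doesn't run the (reused) ->bi_end_io before it has been recovered.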

Comments

Mike Snitzer March 18, 2021, 5:26 p.m. UTC | #1
On Thu, Mar 18 2021 at 12:48pm -0400,
Ming Lei <ming.lei@redhat.com> wrote:

> Currently bio based IO poll needs to poll all hw queue blindly, this way
> is very inefficient, and the big reason is that we can't pass bio
> submission result to io poll task.
> 
> In IO submission context, track associated underlying bios by per-task
> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> and return current->pid to caller of submit_bio() for any bio based
> driver's IO, which is submitted from FS.
> 
> In IO poll context, the passed cookie tells us the PID of submission
> context, and we can find the bio from that submission context. Moving
> bio from submission queue to poll queue of the poll context, and keep
> polling until these bios are ended. Remove bio from poll queue if the
> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> 
> In previous version, kfifo is used to implement submission queue, and
> Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> So far bio's size is close to 2 cacheline size, and it may not be
> accepted to add new field into bio for solving the scalability issue by
> tracking bios via linked list, switch to bio group list for tracking bio,
> the idea is to reuse .bi_end_io for linking bios into a linked list for
> all sharing same .bi_end_io(call it bio group), which is recovered before
> really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> provide very limited groups, such as 32 for fixing the scalability issue.
> 
> Usually submission shares context with io poll. The per-task poll context
> is just like stack variable, and it is cheap to move data between the two
> per-task queues.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/bio.c               |   5 ++
>  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
>  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
>  block/blk.h               |   9 ++
>  include/linux/blk_types.h |  16 +++-
>  5 files changed, 348 insertions(+), 4 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 26b7f721cda8..04c043dc60fc 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
>   **/
>  void bio_endio(struct bio *bio)
>  {
> +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> +		bio_set_flag(bio, BIO_DONE);
> +		return;
> +	}
>  again:
>  	if (!bio_remaining_done(bio))
>  		return;
> diff --git a/block/blk-core.c b/block/blk-core.c
> index efc7a61a84b4..778d25a7e76c 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
>  		sizeof(struct bio_grp_list_data);
>  }
>  
> +static inline void *bio_grp_data(struct bio *bio)
> +{
> +	return bio->bi_poll;
> +}
> +
> +/* add bio into bio group list, return true if it is added */
> +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> +{
> +	int i;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = 0; i < list->nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (grp->grp_data == bio_grp_data(bio)) {
> +			__bio_grp_list_add(&grp->list, bio);
> +			return true;
> +		}
> +	}
> +
> +	if (i == list->max_nr_grps)
> +		return false;
> +
> +	/* create a new group */
> +	grp = &list->head[i];
> +	bio_list_init(&grp->list);
> +	grp->grp_data = bio_grp_data(bio);
> +	__bio_grp_list_add(&grp->list, bio);
> +	list->nr_grps++;
> +
> +	return true;
> +}
> +
> +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> +{
> +	int i;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = 0; i < list->max_nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (grp->grp_data == grp_data)
> +			return i;
> +	}
> +	for (i = 0; i < list->max_nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (bio_grp_list_grp_empty(grp))
> +			return i;
> +	}
> +	return -1;
> +}
> +
> +/* Move as many as possible groups from 'src' to 'dst' */
> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> +{
> +	int i, j, cnt = 0;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = src->nr_grps - 1; i >= 0; i--) {
> +		grp = &src->head[i];
> +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> +		if (j < 0)
> +			break;
> +		if (bio_grp_list_grp_empty(&dst->head[j]))
> +			dst->head[j].grp_data = grp->grp_data;
> +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> +		bio_list_init(&grp->list);
> +		cnt++;
> +	}
> +
> +	src->nr_grps -= cnt;
> +}
> +
>  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
>  {
>  	pc->sq = (void *)pc + sizeof(*pc);
> @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
>  		bio->bi_opf |= REQ_TAG;
>  }
>  
> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> +{
> +	struct blk_bio_poll_ctx *pc = ioc->data;
> +	unsigned int queued;
> +
> +	/*
> +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> +	 * and completion. However, bio crypt may update .bi_end_io during
> +	 * submitting, so simply not support bio based polling for this
> +	 * setting.
> +	 */
> +	if (likely(!bio_has_crypt_ctx(bio))) {
> +		/* track this bio via bio group list */
> +		spin_lock(&pc->sq_lock);
> +		queued = bio_grp_list_add(pc->sq, bio);
> +		spin_unlock(&pc->sq_lock);
> +	} else {
> +		queued = false;
> +	}
> +
> +	/*
> +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> +	 * and the bio is always completed from the pair poll context.
> +	 *
> +	 * One invariant is that if bio isn't completed, blk_poll() will
> +	 * be called by passing cookie returned from submitting this bio.
> +	 */
> +	if (!queued)
> +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> +	else
> +		bio_set_flag(bio, BIO_END_BY_POLL);
> +
> +	return queued;
> +}
> +
> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> +{
> +	bio->bi_iter.bi_private_data = cookie;
> +}
> +
>  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>  {
>  	struct block_device *bdev = bio->bi_bdev;
> @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
>   * bio_list_on_stack[1] contains bios that were submitted before the current
>   *	->submit_bio_bio, but that haven't been processed yet.
>   */
> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
>  {
>  	struct bio_list bio_list_on_stack[2];
>  	blk_qc_t ret = BLK_QC_T_NONE;
> @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>  		bio_list_init(&bio_list_on_stack[0]);
>  
> -		ret = __submit_bio(bio);
> +		if (ioc && queue_is_mq(q) &&
> +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> +
> +			ret = __submit_bio(bio);
> +			if (queued)
> +				blk_bio_poll_post_submit(bio, ret);
> +		} else {
> +			ret = __submit_bio(bio);
> +		}

So you're only supporting bio-based polling if the bio-based device is
stacked _directly_ on top of blk-mq?  That severely limits the utility of
bio-based IO polling support if such shallow stacking is required.

Mike
Mike Snitzer March 18, 2021, 5:38 p.m. UTC | #2
On Thu, Mar 18, 2021 at 1:26 PM Mike Snitzer <snitzer@redhat.com> wrote:
>
> On Thu, Mar 18 2021 at 12:48pm -0400,
> Ming Lei <ming.lei@redhat.com> wrote:
>
> > Currently bio based IO poll needs to poll all hw queue blindly, this way
> > is very inefficient, and the big reason is that we can't pass bio
> > submission result to io poll task.
> >
> > In IO submission context, track associated underlying bios by per-task
> > submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> > and return current->pid to caller of submit_bio() for any bio based
> > driver's IO, which is submitted from FS.
> >
> > In IO poll context, the passed cookie tells us the PID of submission
> > context, and we can find the bio from that submission context. Moving
> > bio from submission queue to poll queue of the poll context, and keep
> > polling until these bios are ended. Remove bio from poll queue if the
> > bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> >
> > In previous version, kfifo is used to implement submission queue, and
> > Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> > So far bio's size is close to 2 cacheline size, and it may not be
> > accepted to add new field into bio for solving the scalability issue by
> > tracking bios via linked list, switch to bio group list for tracking bio,
> > the idea is to reuse .bi_end_io for linking bios into a linked list for
> > all sharing same .bi_end_io(call it bio group), which is recovered before
> > really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> > Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> > provide very limited groups, such as 32 for fixing the scalability issue.
> >
> > Usually submission shares context with io poll. The per-task poll context
> > is just like stack variable, and it is cheap to move data between the two
> > per-task queues.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/bio.c               |   5 ++
> >  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
> >  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
> >  block/blk.h               |   9 ++
> >  include/linux/blk_types.h |  16 +++-
> >  5 files changed, 348 insertions(+), 4 deletions(-)
> >
> > diff --git a/block/bio.c b/block/bio.c
> > index 26b7f721cda8..04c043dc60fc 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
> >   **/
> >  void bio_endio(struct bio *bio)
> >  {
> > +     /* BIO_END_BY_POLL has to be set before calling submit_bio */
> > +     if (bio_flagged(bio, BIO_END_BY_POLL)) {
> > +             bio_set_flag(bio, BIO_DONE);
> > +             return;
> > +     }
> >  again:
> >       if (!bio_remaining_done(bio))
> >               return;
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index efc7a61a84b4..778d25a7e76c 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
> >               sizeof(struct bio_grp_list_data);
> >  }
> >
> > +static inline void *bio_grp_data(struct bio *bio)
> > +{
> > +     return bio->bi_poll;
> > +}
> > +
> > +/* add bio into bio group list, return true if it is added */
> > +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> > +{
> > +     int i;
> > +     struct bio_grp_list_data *grp;
> > +
> > +     for (i = 0; i < list->nr_grps; i++) {
> > +             grp = &list->head[i];
> > +             if (grp->grp_data == bio_grp_data(bio)) {
> > +                     __bio_grp_list_add(&grp->list, bio);
> > +                     return true;
> > +             }
> > +     }
> > +
> > +     if (i == list->max_nr_grps)
> > +             return false;
> > +
> > +     /* create a new group */
> > +     grp = &list->head[i];
> > +     bio_list_init(&grp->list);
> > +     grp->grp_data = bio_grp_data(bio);
> > +     __bio_grp_list_add(&grp->list, bio);
> > +     list->nr_grps++;
> > +
> > +     return true;
> > +}
> > +
> > +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> > +{
> > +     int i;
> > +     struct bio_grp_list_data *grp;
> > +
> > +     for (i = 0; i < list->max_nr_grps; i++) {
> > +             grp = &list->head[i];
> > +             if (grp->grp_data == grp_data)
> > +                     return i;
> > +     }
> > +     for (i = 0; i < list->max_nr_grps; i++) {
> > +             grp = &list->head[i];
> > +             if (bio_grp_list_grp_empty(grp))
> > +                     return i;
> > +     }
> > +     return -1;
> > +}
> > +
> > +/* Move as many as possible groups from 'src' to 'dst' */
> > +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> > +{
> > +     int i, j, cnt = 0;
> > +     struct bio_grp_list_data *grp;
> > +
> > +     for (i = src->nr_grps - 1; i >= 0; i--) {
> > +             grp = &src->head[i];
> > +             j = bio_grp_list_find_grp(dst, grp->grp_data);
> > +             if (j < 0)
> > +                     break;
> > +             if (bio_grp_list_grp_empty(&dst->head[j]))
> > +                     dst->head[j].grp_data = grp->grp_data;
> > +             __bio_grp_list_merge(&dst->head[j].list, &grp->list);
> > +             bio_list_init(&grp->list);
> > +             cnt++;
> > +     }
> > +
> > +     src->nr_grps -= cnt;
> > +}
> > +
> >  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
> >  {
> >       pc->sq = (void *)pc + sizeof(*pc);
> > @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
> >               bio->bi_opf |= REQ_TAG;
> >  }
> >
> > +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> > +{
> > +     struct blk_bio_poll_ctx *pc = ioc->data;
> > +     unsigned int queued;
> > +
> > +     /*
> > +      * We rely on immutable .bi_end_io between blk-mq bio submission
> > +      * and completion. However, bio crypt may update .bi_end_io during
> > +      * submitting, so simply not support bio based polling for this
> > +      * setting.
> > +      */
> > +     if (likely(!bio_has_crypt_ctx(bio))) {
> > +             /* track this bio via bio group list */
> > +             spin_lock(&pc->sq_lock);
> > +             queued = bio_grp_list_add(pc->sq, bio);
> > +             spin_unlock(&pc->sq_lock);
> > +     } else {
> > +             queued = false;
> > +     }
> > +
> > +     /*
> > +      * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> > +      * and the bio is always completed from the pair poll context.
> > +      *
> > +      * One invariant is that if bio isn't completed, blk_poll() will
> > +      * be called by passing cookie returned from submitting this bio.
> > +      */
> > +     if (!queued)
> > +             bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> > +     else
> > +             bio_set_flag(bio, BIO_END_BY_POLL);
> > +
> > +     return queued;
> > +}
> > +
> > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> > +{
> > +     bio->bi_iter.bi_private_data = cookie;
> > +}
> > +
> >  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
> >  {
> >       struct block_device *bdev = bio->bi_bdev;
> > @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
> >   * bio_list_on_stack[1] contains bios that were submitted before the current
> >   *   ->submit_bio_bio, but that haven't been processed yet.
> >   */
> > -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> > +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
> >  {
> >       struct bio_list bio_list_on_stack[2];
> >       blk_qc_t ret = BLK_QC_T_NONE;
> > @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >               bio_list_on_stack[1] = bio_list_on_stack[0];
> >               bio_list_init(&bio_list_on_stack[0]);
> >
> > -             ret = __submit_bio(bio);
> > +             if (ioc && queue_is_mq(q) &&
> > +                             (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> > +                     bool queued = blk_bio_poll_prep_submit(ioc, bio);
> > +
> > +                     ret = __submit_bio(bio);
> > +                     if (queued)
> > +                             blk_bio_poll_post_submit(bio, ret);
> > +             } else {
> > +                     ret = __submit_bio(bio);
> > +             }
>
> So you're only supporting bio-based polling if the bio-based device is
> stacked _directly_ ontop of blk-mq?  Severely limits the utility of
> bio-based IO polling support if such shallow stacking is required.

Sorry, I was too focused on this core change and didn't look far enough
to see that you're returning current->pid for all the intermediate
bio-based layers in an arbitrarily deep bio-based IO device stack.

I think I understand it now, but I _do_ think we need a better name
than "__submit_bio_noacct_int".

Mike
Ming Lei March 19, 2021, 12:30 a.m. UTC | #3
On Thu, Mar 18, 2021 at 01:26:22PM -0400, Mike Snitzer wrote:
> On Thu, Mar 18 2021 at 12:48pm -0400,
> Ming Lei <ming.lei@redhat.com> wrote:
> 
> > Currently bio based IO poll needs to poll all hw queue blindly, this way
> > is very inefficient, and the big reason is that we can't pass bio
> > submission result to io poll task.
> > 
> > In IO submission context, track associated underlying bios by per-task
> > submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> > and return current->pid to caller of submit_bio() for any bio based
> > driver's IO, which is submitted from FS.
> > 
> > In IO poll context, the passed cookie tells us the PID of submission
> > context, and we can find the bio from that submission context. Moving
> > bio from submission queue to poll queue of the poll context, and keep
> > polling until these bios are ended. Remove bio from poll queue if the
> > bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> > 
> > In previous version, kfifo is used to implement submission queue, and
> > Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> > So far bio's size is close to 2 cacheline size, and it may not be
> > accepted to add new field into bio for solving the scalability issue by
> > tracking bios via linked list, switch to bio group list for tracking bio,
> > the idea is to reuse .bi_end_io for linking bios into a linked list for
> > all sharing same .bi_end_io(call it bio group), which is recovered before
> > really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> > Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> > provide very limited groups, such as 32 for fixing the scalability issue.
> > 
> > Usually submission shares context with io poll. The per-task poll context
> > is just like stack variable, and it is cheap to move data between the two
> > per-task queues.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/bio.c               |   5 ++
> >  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
> >  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
> >  block/blk.h               |   9 ++
> >  include/linux/blk_types.h |  16 +++-
> >  5 files changed, 348 insertions(+), 4 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index 26b7f721cda8..04c043dc60fc 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
> >   **/
> >  void bio_endio(struct bio *bio)
> >  {
> > +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> > +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> > +		bio_set_flag(bio, BIO_DONE);
> > +		return;
> > +	}
> >  again:
> >  	if (!bio_remaining_done(bio))
> >  		return;
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index efc7a61a84b4..778d25a7e76c 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
> >  		sizeof(struct bio_grp_list_data);
> >  }
> >  
> > +static inline void *bio_grp_data(struct bio *bio)
> > +{
> > +	return bio->bi_poll;
> > +}
> > +
> > +/* add bio into bio group list, return true if it is added */
> > +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> > +{
> > +	int i;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = 0; i < list->nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (grp->grp_data == bio_grp_data(bio)) {
> > +			__bio_grp_list_add(&grp->list, bio);
> > +			return true;
> > +		}
> > +	}
> > +
> > +	if (i == list->max_nr_grps)
> > +		return false;
> > +
> > +	/* create a new group */
> > +	grp = &list->head[i];
> > +	bio_list_init(&grp->list);
> > +	grp->grp_data = bio_grp_data(bio);
> > +	__bio_grp_list_add(&grp->list, bio);
> > +	list->nr_grps++;
> > +
> > +	return true;
> > +}
> > +
> > +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> > +{
> > +	int i;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = 0; i < list->max_nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (grp->grp_data == grp_data)
> > +			return i;
> > +	}
> > +	for (i = 0; i < list->max_nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (bio_grp_list_grp_empty(grp))
> > +			return i;
> > +	}
> > +	return -1;
> > +}
> > +
> > +/* Move as many as possible groups from 'src' to 'dst' */
> > +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> > +{
> > +	int i, j, cnt = 0;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = src->nr_grps - 1; i >= 0; i--) {
> > +		grp = &src->head[i];
> > +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> > +		if (j < 0)
> > +			break;
> > +		if (bio_grp_list_grp_empty(&dst->head[j]))
> > +			dst->head[j].grp_data = grp->grp_data;
> > +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> > +		bio_list_init(&grp->list);
> > +		cnt++;
> > +	}
> > +
> > +	src->nr_grps -= cnt;
> > +}
> > +
> >  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
> >  {
> >  	pc->sq = (void *)pc + sizeof(*pc);
> > @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
> >  		bio->bi_opf |= REQ_TAG;
> >  }
> >  
> > +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> > +{
> > +	struct blk_bio_poll_ctx *pc = ioc->data;
> > +	unsigned int queued;
> > +
> > +	/*
> > +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> > +	 * and completion. However, bio crypt may update .bi_end_io during
> > +	 * submitting, so simply not support bio based polling for this
> > +	 * setting.
> > +	 */
> > +	if (likely(!bio_has_crypt_ctx(bio))) {
> > +		/* track this bio via bio group list */
> > +		spin_lock(&pc->sq_lock);
> > +		queued = bio_grp_list_add(pc->sq, bio);
> > +		spin_unlock(&pc->sq_lock);
> > +	} else {
> > +		queued = false;
> > +	}
> > +
> > +	/*
> > +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> > +	 * and the bio is always completed from the pair poll context.
> > +	 *
> > +	 * One invariant is that if bio isn't completed, blk_poll() will
> > +	 * be called by passing cookie returned from submitting this bio.
> > +	 */
> > +	if (!queued)
> > +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> > +	else
> > +		bio_set_flag(bio, BIO_END_BY_POLL);
> > +
> > +	return queued;
> > +}
> > +
> > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> > +{
> > +	bio->bi_iter.bi_private_data = cookie;
> > +}
> > +
> >  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
> >  {
> >  	struct block_device *bdev = bio->bi_bdev;
> > @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
> >   * bio_list_on_stack[1] contains bios that were submitted before the current
> >   *	->submit_bio_bio, but that haven't been processed yet.
> >   */
> > -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> > +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
> >  {
> >  	struct bio_list bio_list_on_stack[2];
> >  	blk_qc_t ret = BLK_QC_T_NONE;
> > @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >  		bio_list_on_stack[1] = bio_list_on_stack[0];
> >  		bio_list_init(&bio_list_on_stack[0]);
> >  
> > -		ret = __submit_bio(bio);
> > +		if (ioc && queue_is_mq(q) &&
> > +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> > +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> > +
> > +			ret = __submit_bio(bio);
> > +			if (queued)
> > +				blk_bio_poll_post_submit(bio, ret);
> > +		} else {
> > +			ret = __submit_bio(bio);
> > +		}
> 
> So you're only supporting bio-based polling if the bio-based device is
> stacked _directly_ ontop of blk-mq?  Severely limits the utility of
> bio-based IO polling support if such shallow stacking is required.

No, not only directly on top of blk-mq; it can be any descendant blk-mq
device. So far only blk-mq can provide direct polling support, see
blk_poll():

                ret = q->mq_ops->poll(hctx);

If no descendant blk-mq device is involved in the bio based device, we
can't support polling so far.


Thanks,
Ming
Jingbo Xu March 19, 2021, 9:38 a.m. UTC | #4
I'm thinking about how this mechanism could work with *original* bio-based
devices that are not built upon mq devices, such as nvdimm. This
mechanism (also including my original design) mainly focuses on virtual
devices built upon mq devices, i.e., md/dm.

If the original bio-based devices want to support IO polling in the
future, they should somehow be distinguished from md/dm.


On 3/19/21 12:48 AM, Ming Lei wrote:
> Currently bio based IO poll needs to poll all hw queue blindly, this way
> is very inefficient, and the big reason is that we can't pass bio
> submission result to io poll task.
> 
> In IO submission context, track associated underlying bios by per-task
> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> and return current->pid to caller of submit_bio() for any bio based
> driver's IO, which is submitted from FS.
> 
> In IO poll context, the passed cookie tells us the PID of submission
> context, and we can find the bio from that submission context. Moving
> bio from submission queue to poll queue of the poll context, and keep
> polling until these bios are ended. Remove bio from poll queue if the
> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> 
> In previous version, kfifo is used to implement submission queue, and
> Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> So far bio's size is close to 2 cacheline size, and it may not be
> accepted to add new field into bio for solving the scalability issue by
> tracking bios via linked list, switch to bio group list for tracking bio,
> the idea is to reuse .bi_end_io for linking bios into a linked list for
> all sharing same .bi_end_io(call it bio group), which is recovered before
> really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> provide very limited groups, such as 32 for fixing the scalability issue.
> 
> Usually submission shares context with io poll. The per-task poll context
> is just like stack variable, and it is cheap to move data between the two
> per-task queues.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/bio.c               |   5 ++
>  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
>  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
>  block/blk.h               |   9 ++
>  include/linux/blk_types.h |  16 +++-
>  5 files changed, 348 insertions(+), 4 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 26b7f721cda8..04c043dc60fc 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
>   **/
>  void bio_endio(struct bio *bio)
>  {
> +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> +		bio_set_flag(bio, BIO_DONE);
> +		return;
> +	}
>  again:
>  	if (!bio_remaining_done(bio))
>  		return;
> diff --git a/block/blk-core.c b/block/blk-core.c
> index efc7a61a84b4..778d25a7e76c 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
>  		sizeof(struct bio_grp_list_data);
>  }
>  
> +static inline void *bio_grp_data(struct bio *bio)
> +{
> +	return bio->bi_poll;
> +}
> +
> +/* add bio into bio group list, return true if it is added */
> +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> +{
> +	int i;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = 0; i < list->nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (grp->grp_data == bio_grp_data(bio)) {
> +			__bio_grp_list_add(&grp->list, bio);
> +			return true;
> +		}
> +	}
> +
> +	if (i == list->max_nr_grps)
> +		return false;
> +
> +	/* create a new group */
> +	grp = &list->head[i];
> +	bio_list_init(&grp->list);
> +	grp->grp_data = bio_grp_data(bio);
> +	__bio_grp_list_add(&grp->list, bio);
> +	list->nr_grps++;
> +
> +	return true;
> +}
> +
> +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> +{
> +	int i;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = 0; i < list->max_nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (grp->grp_data == grp_data)
> +			return i;
> +	}
> +	for (i = 0; i < list->max_nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (bio_grp_list_grp_empty(grp))
> +			return i;
> +	}
> +	return -1;
> +}
> +
> +/* Move as many as possible groups from 'src' to 'dst' */
> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> +{
> +	int i, j, cnt = 0;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = src->nr_grps - 1; i >= 0; i--) {
> +		grp = &src->head[i];
> +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> +		if (j < 0)
> +			break;
> +		if (bio_grp_list_grp_empty(&dst->head[j]))
> +			dst->head[j].grp_data = grp->grp_data;
> +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> +		bio_list_init(&grp->list);
> +		cnt++;
> +	}
> +
> +	src->nr_grps -= cnt;
> +}

Not sure why it's checked in reverse order (starting from 'nr_grps - 1').


> +
>  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
>  {
>  	pc->sq = (void *)pc + sizeof(*pc);
> @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
>  		bio->bi_opf |= REQ_TAG;
>  }
>  
> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> +{
> +	struct blk_bio_poll_ctx *pc = ioc->data;
> +	unsigned int queued;
> +
> +	/*
> +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> +	 * and completion. However, bio crypt may update .bi_end_io during
> +	 * submitting, so simply not support bio based polling for this
> +	 * setting.
> +	 */
> +	if (likely(!bio_has_crypt_ctx(bio))) {
> +		/* track this bio via bio group list */
> +		spin_lock(&pc->sq_lock);
> +		queued = bio_grp_list_add(pc->sq, bio);
> +		spin_unlock(&pc->sq_lock);
> +	} else {
> +		queued = false;
> +	}
> +
> +	/*
> +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> +	 * and the bio is always completed from the pair poll context.
> +	 *
> +	 * One invariant is that if bio isn't completed, blk_poll() will
> +	 * be called by passing cookie returned from submitting this bio.
> +	 */
> +	if (!queued)
> +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> +	else
> +		bio_set_flag(bio, BIO_END_BY_POLL);
> +
> +	return queued;
> +}
> +
> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> +{
> +	bio->bi_iter.bi_private_data = cookie;
> +}
> +
>  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>  {
>  	struct block_device *bdev = bio->bi_bdev;
> @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
>   * bio_list_on_stack[1] contains bios that were submitted before the current
>   *	->submit_bio_bio, but that haven't been processed yet.
>   */
> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
>  {
>  	struct bio_list bio_list_on_stack[2];
>  	blk_qc_t ret = BLK_QC_T_NONE;
> @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>  		bio_list_init(&bio_list_on_stack[0]);
>  
> -		ret = __submit_bio(bio);
> +		if (ioc && queue_is_mq(q) &&
> +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> +
> +			ret = __submit_bio(bio);
> +			if (queued)
> +				blk_bio_poll_post_submit(bio, ret);
> +		} else {
> +			ret = __submit_bio(bio);
> +		}

If the input @ioc is NULL, this will still return the cookie returned by
__submit_bio(), which will then call into blk_bio_poll(), which is not
expected. (It can pass the blk_queue_poll() check in blk_poll(): e.g., the
dm device itself is marked QUEUE_FLAG_POLL, but the IO submitting process
failed to allocate its io_context, and thus calls
__submit_bio_noacct_int() with @ioc == NULL.)


>  
>  		/*
>  		 * Sort new bios into those for a lower level and those for the
> @@ -1069,6 +1189,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  	return ret;
>  }
>  
> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
> +		struct io_context *ioc)
> +{
> +	struct blk_bio_poll_ctx *pc = ioc->data;
> +
> +	__submit_bio_noacct_int(bio, ioc);
> +
> +	/* bio submissions queued to per-task poll context */
> +	if (READ_ONCE(pc->sq->nr_grps))
> +		return current->pid;
> +
> +	/* swapper's pid is 0, but it can't submit poll IO for us */
> +	return 0;
> +}
> +
> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
> +{
> +	struct io_context *ioc = current->io_context;
> +
> +	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
> +		return __submit_bio_noacct_poll(bio, ioc);
> +


> +	return __submit_bio_noacct_int(bio, NULL);
> +}
> +
>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>  {
>  	struct bio_list bio_list[2] = { };
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 03f59915fe2c..f26950a51f4a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3865,14 +3865,185 @@ static inline int blk_mq_poll_hctx(struct request_queue *q,
>  	return ret;
>  }
>  
> +static blk_qc_t bio_get_poll_cookie(struct bio *bio)
> +{
> +	return bio->bi_iter.bi_private_data;
> +}
> +
> +static int blk_mq_poll_io(struct bio *bio)
> +{
> +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> +	blk_qc_t cookie = bio_get_poll_cookie(bio);
> +	int ret = 0;
> +
> +	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
> +		struct blk_mq_hw_ctx *hctx =
> +			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
> +
> +		ret += blk_mq_poll_hctx(q, hctx);
> +	}
> +	return ret;
> +}
> +
> +static int blk_bio_poll_and_end_io(struct request_queue *q,
> +		struct blk_bio_poll_ctx *poll_ctx)
> +{
> +	int ret = 0;
> +	int i;
> +
> +	/*
> +	 * Poll hw queue first.
> +	 *
> +	 * TODO: limit max poll times and make sure to not poll same
> +	 * hw queue one more time.
> +	 */
> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> +		struct bio *bio;
> +
> +		if (bio_grp_list_grp_empty(grp))
> +			continue;
> +
> +		for (bio = grp->list.head; bio; bio = bio->bi_poll)
> +			ret += blk_mq_poll_io(bio);
> +	}
> +
> +	/* reap bios */
> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> +		struct bio *bio;
> +		struct bio_list bl;
> +
> +		if (bio_grp_list_grp_empty(grp))
> +			continue;
> +
> +		bio_list_init(&bl);
> +
> +		while ((bio = __bio_grp_list_pop(&grp->list))) {
> +			if (bio_flagged(bio, BIO_DONE)) {
> +
> +				/* now recover original data */
> +				bio->bi_poll = grp->grp_data;
> +
> +				/* clear BIO_END_BY_POLL and end me really */
> +				bio_clear_flag(bio, BIO_END_BY_POLL);
> +				bio_endio(bio);
> +			} else {
> +				__bio_grp_list_add(&bl, bio);
> +			}
> +		}
> +		__bio_grp_list_merge(&grp->list, &bl);
> +	}
> +	return ret;
> +}
> +
> +static int __blk_bio_poll_io(struct request_queue *q,
> +		struct blk_bio_poll_ctx *submit_ctx,
> +		struct blk_bio_poll_ctx *poll_ctx)
> +{
> +	/*
> +	 * Move IO submission result from submission queue in submission
> +	 * context to poll queue of poll context.
> +	 */
> +	spin_lock(&submit_ctx->sq_lock);
> +	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
> +	spin_unlock(&submit_ctx->sq_lock);
> +
> +	return blk_bio_poll_and_end_io(q, poll_ctx);
> +}
> +
> +static int blk_bio_poll_io(struct request_queue *q,
> +		struct io_context *submit_ioc,
> +		struct io_context *poll_ioc)
> +{
> +	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
> +	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
> +	int ret;
> +
> +	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
> +		mutex_lock(&poll_ctx->pq_lock);

Why is a mutex used to protect pq here rather than a spinlock? Where will
the polling routine go to sleep?

Besides, how is the concurrent bio_list access to sq between the producer
(submission routine) and the consumer (polling routine) protected? As far
as I understand, pc->sq_lock prevents concurrent access from multiple
submission processes, while pc->pq_lock prevents concurrent access from
multiple polling processes.


> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> +		mutex_unlock(&poll_ctx->pq_lock);
> +	} else {
> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> +	}
> +	return ret;
> +}
> +
> +static bool blk_bio_ioc_valid(struct task_struct *t)
> +{
> +	if (!t)
> +		return false;
> +
> +	if (!t->io_context)
> +		return false;
> +
> +	if (!t->io_context->data)
> +		return false;
> +
> +	return true;
> +}
> +
> +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> +{
> +	struct io_context *poll_ioc = current->io_context;
> +	pid_t pid;
> +	struct task_struct *submit_task;
> +	int ret;
> +
> +	pid = (pid_t)cookie;
> +
> +	/* io poll often share io submission context */
> +	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
> +		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
> +

> +	submit_task = find_get_task_by_vpid(pid);

What if the process to which the returned cookie refers has exited?
find_get_task_by_vpid() will return NULL, so blk_poll() won't reap
anything, while there may still be bios in the poll context waiting to be
reaped.

Maybe we need to flush the poll context when a task detaches from the
io_context.


> +	if (likely(blk_bio_ioc_valid(submit_task)))
> +		ret = blk_bio_poll_io(q, submit_task->io_context,
> +				poll_ioc);

poll_ioc may be invalid in this case, since the previous
blk_create_io_context() in blk_bio_poll() may fail.



> +	else
> +		ret = 0;
> +
> +	put_task_struct(submit_task);
> +
> +	return ret;
> +}
> +
>  static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
>  {
> +	long state;
> +
> +	/* no need to poll */
> +	if (cookie == 0)
> +		return 0;
> +
>  	/*
>  	 * Create poll queue for storing poll bio and its cookie from
>  	 * submission queue
>  	 */
>  	blk_create_io_context(q, true);
>  
> +	state = current->state;
> +	do {
> +		int ret;
> +
> +		ret = __blk_bio_poll(q, cookie);
> +		if (ret > 0) {
> +			__set_current_state(TASK_RUNNING);
> +			return ret;
> +		}
> +
> +		if (signal_pending_state(state, current))
> +			__set_current_state(TASK_RUNNING);
> +
> +		if (current->state == TASK_RUNNING)
> +			return 1;
> +		if (ret < 0 || !spin)
> +			break;
> +		cpu_relax();
> +	} while (!need_resched());
> +
> +	__set_current_state(TASK_RUNNING);
>  	return 0;
>  }
>  
> @@ -3893,7 +4064,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
>  	struct blk_mq_hw_ctx *hctx;
>  	long state;
>  
> -	if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q))
> +	if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie)))
>  		return 0;
>  
>  	if (current->plug)
> diff --git a/block/blk.h b/block/blk.h
> index ae58a706327e..05b9f5eafdd1 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -403,4 +403,13 @@ static inline void blk_create_io_context(struct request_queue *q,
>  		bio_poll_ctx_alloc(ioc);
>  }
>  
> +BIO_LIST_HELPERS(__bio_grp_list, poll);
> +
> +static inline bool bio_grp_list_grp_empty(struct bio_grp_list_data *grp)
> +{
> +	return bio_list_empty(&grp->list);
> +}
> +
> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src);
> +
>  #endif /* BLK_INTERNAL_H */
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index a1bcade4bcc3..2d47679bac71 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -235,7 +235,18 @@ struct bio {
>  
>  	struct bvec_iter	bi_iter;
>  
> -	bio_end_io_t		*bi_end_io;
> +	union {
> +		bio_end_io_t		*bi_end_io;
> +		/*
> +		 * bio based io poll need to track bio via bio group list
> +		 * which groups bio by same .bi_end_io, and original
> +		 * .bi_end_io is save into the group head. Will recover
> +		 * .bi_end_io before end this bio really. BIO_END_BY_POLL
> +		 * will make sure that this bio won't be really ended
> +		 * before recovering .bi_end_io.
> +		 */
> +		struct bio		*bi_poll;
> +	};
>  
>  	void			*bi_private;
>  #ifdef CONFIG_BLK_CGROUP
> @@ -304,6 +315,9 @@ enum {
>  	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
>  	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
>  	BIO_REMAPPED,
> +	BIO_END_BY_POLL,	/* end by blk_bio_poll() explicitly */
> +	/* set when bio can be ended, used for bio with BIO_END_BY_POLL */
> +	BIO_DONE,
>  	BIO_FLAG_LAST
>  };
>  
>
Ming Lei March 19, 2021, 1:46 p.m. UTC | #5
On Fri, Mar 19, 2021 at 05:38:38PM +0800, JeffleXu wrote:
> I'm thinking how this mechanism could work with *original* bio-based
> devices that don't ne built upon mq devices, such as nvdimm. This

A non-mq device needs its driver to implement io polling by itself; the
block layer can't help there, and that isn't this patchset's job.

> mechanism (also including my original design) mainly focuses on virtual
> devices that built upon mq devices, i.e., md/dm.
> 
> As the original bio-based devices wants to support IO polling in the
> future, then they should be somehow distingushed from md/dm.
> 
> 
> On 3/19/21 12:48 AM, Ming Lei wrote:
> > Currently bio based IO poll needs to poll all hw queue blindly, this way
> > is very inefficient, and the big reason is that we can't pass bio
> > submission result to io poll task.
> > 
> > In IO submission context, track associated underlying bios by per-task
> > submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> > and return current->pid to caller of submit_bio() for any bio based
> > driver's IO, which is submitted from FS.
> > 
> > In IO poll context, the passed cookie tells us the PID of submission
> > context, and we can find the bio from that submission context. Moving
> > bio from submission queue to poll queue of the poll context, and keep
> > polling until these bios are ended. Remove bio from poll queue if the
> > bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> > 
> > In previous version, kfifo is used to implement submission queue, and
> > Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> > So far bio's size is close to 2 cacheline size, and it may not be
> > accepted to add new field into bio for solving the scalability issue by
> > tracking bios via linked list, switch to bio group list for tracking bio,
> > the idea is to reuse .bi_end_io for linking bios into a linked list for
> > all sharing same .bi_end_io(call it bio group), which is recovered before
> > really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> > Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> > provide very limited groups, such as 32 for fixing the scalability issue.
> > 
> > Usually submission shares context with io poll. The per-task poll context
> > is just like stack variable, and it is cheap to move data between the two
> > per-task queues.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/bio.c               |   5 ++
> >  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
> >  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
> >  block/blk.h               |   9 ++
> >  include/linux/blk_types.h |  16 +++-
> >  5 files changed, 348 insertions(+), 4 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index 26b7f721cda8..04c043dc60fc 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
> >   **/
> >  void bio_endio(struct bio *bio)
> >  {
> > +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> > +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> > +		bio_set_flag(bio, BIO_DONE);
> > +		return;
> > +	}
> >  again:
> >  	if (!bio_remaining_done(bio))
> >  		return;
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index efc7a61a84b4..778d25a7e76c 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
> >  		sizeof(struct bio_grp_list_data);
> >  }
> >  
> > +static inline void *bio_grp_data(struct bio *bio)
> > +{
> > +	return bio->bi_poll;
> > +}
> > +
> > +/* add bio into bio group list, return true if it is added */
> > +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> > +{
> > +	int i;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = 0; i < list->nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (grp->grp_data == bio_grp_data(bio)) {
> > +			__bio_grp_list_add(&grp->list, bio);
> > +			return true;
> > +		}
> > +	}
> > +
> > +	if (i == list->max_nr_grps)
> > +		return false;
> > +
> > +	/* create a new group */
> > +	grp = &list->head[i];
> > +	bio_list_init(&grp->list);
> > +	grp->grp_data = bio_grp_data(bio);
> > +	__bio_grp_list_add(&grp->list, bio);
> > +	list->nr_grps++;
> > +
> > +	return true;
> > +}
> > +
> > +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> > +{
> > +	int i;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = 0; i < list->max_nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (grp->grp_data == grp_data)
> > +			return i;
> > +	}
> > +	for (i = 0; i < list->max_nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (bio_grp_list_grp_empty(grp))
> > +			return i;
> > +	}
> > +	return -1;
> > +}
> > +
> > +/* Move as many as possible groups from 'src' to 'dst' */
> > +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> > +{
> > +	int i, j, cnt = 0;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = src->nr_grps - 1; i >= 0; i--) {
> > +		grp = &src->head[i];
> > +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> > +		if (j < 0)
> > +			break;
> > +		if (bio_grp_list_grp_empty(&dst->head[j]))
> > +			dst->head[j].grp_data = grp->grp_data;
> > +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> > +		bio_list_init(&grp->list);
> > +		cnt++;
> > +	}
> > +
> > +	src->nr_grps -= cnt;
> > +}
> 
> Not sure why it's checked in reverse order (starting from 'nr_grps - 1').

That way, for the bio group list on the submission side, only the first
.nr_grps groups contain bios.

> 
> 
> > +
> >  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
> >  {
> >  	pc->sq = (void *)pc + sizeof(*pc);
> > @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
> >  		bio->bi_opf |= REQ_TAG;
> >  }
> >  
> > +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> > +{
> > +	struct blk_bio_poll_ctx *pc = ioc->data;
> > +	unsigned int queued;
> > +
> > +	/*
> > +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> > +	 * and completion. However, bio crypt may update .bi_end_io during
> > +	 * submitting, so simply not support bio based polling for this
> > +	 * setting.
> > +	 */
> > +	if (likely(!bio_has_crypt_ctx(bio))) {
> > +		/* track this bio via bio group list */
> > +		spin_lock(&pc->sq_lock);
> > +		queued = bio_grp_list_add(pc->sq, bio);
> > +		spin_unlock(&pc->sq_lock);
> > +	} else {
> > +		queued = false;
> > +	}
> > +
> > +	/*
> > +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> > +	 * and the bio is always completed from the pair poll context.
> > +	 *
> > +	 * One invariant is that if bio isn't completed, blk_poll() will
> > +	 * be called by passing cookie returned from submitting this bio.
> > +	 */
> > +	if (!queued)
> > +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> > +	else
> > +		bio_set_flag(bio, BIO_END_BY_POLL);
> > +
> > +	return queued;
> > +}
> > +
> > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> > +{
> > +	bio->bi_iter.bi_private_data = cookie;
> > +}
> > +
> >  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
> >  {
> >  	struct block_device *bdev = bio->bi_bdev;
> > @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
> >   * bio_list_on_stack[1] contains bios that were submitted before the current
> >   *	->submit_bio_bio, but that haven't been processed yet.
> >   */
> > -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> > +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
> >  {
> >  	struct bio_list bio_list_on_stack[2];
> >  	blk_qc_t ret = BLK_QC_T_NONE;
> > @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >  		bio_list_on_stack[1] = bio_list_on_stack[0];
> >  		bio_list_init(&bio_list_on_stack[0]);
> >  
> > -		ret = __submit_bio(bio);
> > +		if (ioc && queue_is_mq(q) &&
> > +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> > +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> > +
> > +			ret = __submit_bio(bio);
> > +			if (queued)
> > +				blk_bio_poll_post_submit(bio, ret);
> > +		} else {
> > +			ret = __submit_bio(bio);
> > +		}
> 
> If input @ioc is NULL, it will still return cookie (returned by
> __submit_bio()), which will call into blk_bio_poll(), which is not
> expected. (It can pass blk_queue_poll() check in blk_poll(), e.g., dm
> device itself is marked as QUEUE_FLAG_POLL, but ->io_context of IO
> submitting process failed to allocate the io_context, and thus calls
> __submit_bio_noacct_int() with @ioc is NULL).

Good catch, it looks like the following change is needed:

diff --git a/block/blk-core.c b/block/blk-core.c
index 778d25a7e76c..dba12ba0fa48 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1211,7 +1211,8 @@ static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
 	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
 		return __submit_bio_noacct_poll(bio, ioc);
 
-	return __submit_bio_noacct_int(bio, NULL);
+	 __submit_bio_noacct_int(bio, NULL);
+	return 0;
 }
 
 static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)

> 
> 
> >  
> >  		/*
> >  		 * Sort new bios into those for a lower level and those for the
> > @@ -1069,6 +1189,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >  	return ret;
> >  }
> >  
> > +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
> > +		struct io_context *ioc)
> > +{
> > +	struct blk_bio_poll_ctx *pc = ioc->data;
> > +
> > +	__submit_bio_noacct_int(bio, ioc);
> > +
> > +	/* bio submissions queued to per-task poll context */
> > +	if (READ_ONCE(pc->sq->nr_grps))
> > +		return current->pid;
> > +
> > +	/* swapper's pid is 0, but it can't submit poll IO for us */
> > +	return 0;
> > +}
> > +
> > +static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
> > +{
> > +	struct io_context *ioc = current->io_context;
> > +
> > +	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
> > +		return __submit_bio_noacct_poll(bio, ioc);
> > +
> 
> 
> > +	return __submit_bio_noacct_int(bio, NULL);
> > +}
> > +
> >  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> >  {
> >  	struct bio_list bio_list[2] = { };
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 03f59915fe2c..f26950a51f4a 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -3865,14 +3865,185 @@ static inline int blk_mq_poll_hctx(struct request_queue *q,
> >  	return ret;
> >  }
> >  
> > +static blk_qc_t bio_get_poll_cookie(struct bio *bio)
> > +{
> > +	return bio->bi_iter.bi_private_data;
> > +}
> > +
> > +static int blk_mq_poll_io(struct bio *bio)
> > +{
> > +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> > +	blk_qc_t cookie = bio_get_poll_cookie(bio);
> > +	int ret = 0;
> > +
> > +	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
> > +		struct blk_mq_hw_ctx *hctx =
> > +			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
> > +
> > +		ret += blk_mq_poll_hctx(q, hctx);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static int blk_bio_poll_and_end_io(struct request_queue *q,
> > +		struct blk_bio_poll_ctx *poll_ctx)
> > +{
> > +	int ret = 0;
> > +	int i;
> > +
> > +	/*
> > +	 * Poll hw queue first.
> > +	 *
> > +	 * TODO: limit max poll times and make sure to not poll same
> > +	 * hw queue one more time.
> > +	 */
> > +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> > +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> > +		struct bio *bio;
> > +
> > +		if (bio_grp_list_grp_empty(grp))
> > +			continue;
> > +
> > +		for (bio = grp->list.head; bio; bio = bio->bi_poll)
> > +			ret += blk_mq_poll_io(bio);
> > +	}
> > +
> > +	/* reap bios */
> > +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> > +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> > +		struct bio *bio;
> > +		struct bio_list bl;
> > +
> > +		if (bio_grp_list_grp_empty(grp))
> > +			continue;
> > +
> > +		bio_list_init(&bl);
> > +
> > +		while ((bio = __bio_grp_list_pop(&grp->list))) {
> > +			if (bio_flagged(bio, BIO_DONE)) {
> > +
> > +				/* now recover original data */
> > +				bio->bi_poll = grp->grp_data;
> > +
> > +				/* clear BIO_END_BY_POLL and end me really */
> > +				bio_clear_flag(bio, BIO_END_BY_POLL);
> > +				bio_endio(bio);
> > +			} else {
> > +				__bio_grp_list_add(&bl, bio);
> > +			}
> > +		}
> > +		__bio_grp_list_merge(&grp->list, &bl);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static int __blk_bio_poll_io(struct request_queue *q,
> > +		struct blk_bio_poll_ctx *submit_ctx,
> > +		struct blk_bio_poll_ctx *poll_ctx)
> > +{
> > +	/*
> > +	 * Move IO submission result from submission queue in submission
> > +	 * context to poll queue of poll context.
> > +	 */
> > +	spin_lock(&submit_ctx->sq_lock);
> > +	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
> > +	spin_unlock(&submit_ctx->sq_lock);
> > +
> > +	return blk_bio_poll_and_end_io(q, poll_ctx);
> > +}
> > +
> > +static int blk_bio_poll_io(struct request_queue *q,
> > +		struct io_context *submit_ioc,
> > +		struct io_context *poll_ioc)
> > +{
> > +	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
> > +	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
> > +	int ret;
> > +
> > +	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
> > +		mutex_lock(&poll_ctx->pq_lock);
> 
> Why mutex is used to protect pq here rather than spinlock? Where will

spinlock should be fine.

> the polling routine go to sleep?

The current blk_poll() can indeed go to sleep.

> 
> Besides, how to protect the concurrent bio_list operation to sq between
> producer (submission routine) and consumer (polling routine)? As far as

submit_ctx->sq_lock is held in __blk_bio_poll_io() while moving bios
from sq to pq.

> I understand, pc->sq_lock is used to prevent concurrent access from
> multiple submission processes, while pc->pq_lock is used to prevent
> concurrent access from multiple polling processes.

Usually the poll context doesn't need any lock except for the shared io
context case, because blk_bio_poll_ctx->pq is only accessed from the
poll context, and it is still per-task.
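
Something like the following should work (just a sketch, assuming
blk_bio_poll_ctx->pq_lock is turned into a spinlock_t):

static int blk_bio_poll_io(struct request_queue *q,
		struct io_context *submit_ioc,
		struct io_context *poll_ioc)
{
	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
	int ret;

	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
		/* poll queue is shared with other pollers, serialize them */
		spin_lock(&poll_ctx->pq_lock);
		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
		spin_unlock(&poll_ctx->pq_lock);
	} else {
		/* pq is only touched by this task, no locking needed */
		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
	}
	return ret;
}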

> 
> 
> > +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> > +		mutex_unlock(&poll_ctx->pq_lock);
> > +	} else {
> > +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static bool blk_bio_ioc_valid(struct task_struct *t)
> > +{
> > +	if (!t)
> > +		return false;
> > +
> > +	if (!t->io_context)
> > +		return false;
> > +
> > +	if (!t->io_context->data)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> > +{
> > +	struct io_context *poll_ioc = current->io_context;
> > +	pid_t pid;
> > +	struct task_struct *submit_task;
> > +	int ret;
> > +
> > +	pid = (pid_t)cookie;
> > +
> > +	/* io poll often share io submission context */
> > +	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
> > +		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
> > +
> 
> > +	submit_task = find_get_task_by_vpid(pid);
> 
> What if the process to which the returned cookie refers has exited?
> find_get_task_by_vpid() will return NULL, thus blk_poll() won't help
> reap anything, while maybe there are still bios in the poll context,
> waiting to be reaped.
> 
> Maybe we need to flush the poll context when a task detaches from the
> io_context.

Yeah, I know about that issue, and just haven't addressed it at the RFC stage.

It can be handled in one of the following ways:

1) drain all bios in the submission context until they are completed
before the task exits, since that code won't sleep (rough sketch below).

OR

2) schedule a workqueue for completing all submitted bios.

If the poll context exits while there are still bios not completed: not
sure whether that can actually happen, because the current in-tree code
has the same issue.
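
A rough sketch of 1), reusing the helpers in this patch (the drain
function and its call site, e.g. exit_io_context(), are just for
illustration, and it glosses over where the request_queue comes from
and that the helpers live in different files today):

static bool bio_grp_list_empty(struct bio_grp_list *list)
{
	int i;

	for (i = 0; i < list->max_nr_grps; i++)
		if (!bio_grp_list_grp_empty(&list->head[i]))
			return false;
	return true;
}

static void blk_bio_poll_drain(struct request_queue *q,
		struct blk_bio_poll_ctx *pc)
{
	do {
		/* move everything queued so far into our own poll queue */
		spin_lock(&pc->sq_lock);
		bio_grp_list_move(pc->pq, pc->sq);
		spin_unlock(&pc->sq_lock);

		/* poll the hw queues and reap completed bios */
		blk_bio_poll_and_end_io(q, pc);
		cpu_relax();
	} while (READ_ONCE(pc->sq->nr_grps) || !bio_grp_list_empty(pc->pq));
}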

> 
> 
> > +	if (likely(blk_bio_ioc_valid(submit_task)))
> > +		ret = blk_bio_poll_io(q, submit_task->io_context,
> > +				poll_ioc);
> 
> poll_ioc may be invalid in this case, since the previous
> blk_create_io_context() in blk_bio_poll() may fail.

Yeah, it can be addressed by the above patch; 0 will be returned
in this case.


Thanks, 
Ming
Mike Snitzer March 19, 2021, 6:38 p.m. UTC | #6
On Thu, Mar 18 2021 at 12:48pm -0400,
Ming Lei <ming.lei@redhat.com> wrote:

> Currently bio based IO poll needs to poll all hw queue blindly, this way
> is very inefficient, and the big reason is that we can't pass bio
> submission result to io poll task.

This is awkward because bio-based IO polling doesn't exist upstream yet,
so this header should be covering your approach as a clean slate, e.g.:

The complexity associated with frequent bio splitting with bio-based
devices makes it difficult to implement IO polling efficiently because
the fan-out of underlying hw queues that need to be polled (as a
side-effect of bios being split) creates a need for more easily mapping
a group of bios to the hw queues that need to be polled.

> In IO submission context, track associated underlying bios by per-task
> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> and return current->pid to caller of submit_bio() for any bio based
> driver's IO, which is submitted from FS.
> 
> In IO poll context, the passed cookie tells us the PID of submission
> context, and we can find the bio from that submission context. Moving

Maybe be more precise by covering how all bios from that task's
submission context will be moved to the poll queue of the poll context?

> bio from submission queue to poll queue of the poll context, and keep
> polling until these bios are ended. Remove bio from poll queue if the
> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> 
> In previous version, kfifo is used to implement submission queue, and
> Jeffle Xu found that kfifo can't scale well in case of high queue depth.

Awkward to reference "previous version", maybe instead say:

It was found that kfifo doesn't scale well for a submission queue as
queue depth is increased, so a new mechanism for tracking bios is
needed.

> So far bio's size is close to 2 cacheline size, and it may not be
> accepted to add new field into bio for solving the scalability issue by
> tracking bios via linked list, switch to bio group list for tracking bio,
> the idea is to reuse .bi_end_io for linking bios into a linked list for
> all sharing same .bi_end_io(call it bio group), which is recovered before
> really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> provide very limited groups, such as 32 for fixing the scalability issue.
> 
> Usually submission shares context with io poll. The per-task poll context
> is just like stack variable, and it is cheap to move data between the two
> per-task queues.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/bio.c               |   5 ++
>  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
>  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
>  block/blk.h               |   9 ++
>  include/linux/blk_types.h |  16 +++-
>  5 files changed, 348 insertions(+), 4 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 26b7f721cda8..04c043dc60fc 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
>   **/
>  void bio_endio(struct bio *bio)
>  {
> +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> +		bio_set_flag(bio, BIO_DONE);
> +		return;
> +	}
>  again:
>  	if (!bio_remaining_done(bio))
>  		return;
> diff --git a/block/blk-core.c b/block/blk-core.c
> index efc7a61a84b4..778d25a7e76c 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
>  		sizeof(struct bio_grp_list_data);
>  }
>  
> +static inline void *bio_grp_data(struct bio *bio)
> +{
> +	return bio->bi_poll;
> +}
> +
> +/* add bio into bio group list, return true if it is added */
> +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> +{
> +	int i;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = 0; i < list->nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (grp->grp_data == bio_grp_data(bio)) {
> +			__bio_grp_list_add(&grp->list, bio);
> +			return true;
> +		}
> +	}
> +
> +	if (i == list->max_nr_grps)
> +		return false;
> +
> +	/* create a new group */
> +	grp = &list->head[i];
> +	bio_list_init(&grp->list);
> +	grp->grp_data = bio_grp_data(bio);
> +	__bio_grp_list_add(&grp->list, bio);
> +	list->nr_grps++;
> +
> +	return true;
> +}
> +
> +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> +{
> +	int i;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = 0; i < list->max_nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (grp->grp_data == grp_data)
> +			return i;
> +	}
> +	for (i = 0; i < list->max_nr_grps; i++) {
> +		grp = &list->head[i];
> +		if (bio_grp_list_grp_empty(grp))
> +			return i;
> +	}
> +	return -1;
> +}
> +
> +/* Move as many as possible groups from 'src' to 'dst' */
> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> +{
> +	int i, j, cnt = 0;
> +	struct bio_grp_list_data *grp;
> +
> +	for (i = src->nr_grps - 1; i >= 0; i--) {
> +		grp = &src->head[i];
> +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> +		if (j < 0)
> +			break;
> +		if (bio_grp_list_grp_empty(&dst->head[j]))
> +			dst->head[j].grp_data = grp->grp_data;
> +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> +		bio_list_init(&grp->list);
> +		cnt++;
> +	}
> +
> +	src->nr_grps -= cnt;
> +}
> +
>  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
>  {
>  	pc->sq = (void *)pc + sizeof(*pc);
> @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
>  		bio->bi_opf |= REQ_TAG;
>  }
>  
> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> +{
> +	struct blk_bio_poll_ctx *pc = ioc->data;
> +	unsigned int queued;
> +
> +	/*
> +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> +	 * and completion. However, bio crypt may update .bi_end_io during
> +	 * submitting, so simply not support bio based polling for this

s/submitting/submission/
s/not/don't/

> +	 * setting.
> +	 */
> +	if (likely(!bio_has_crypt_ctx(bio))) {
> +		/* track this bio via bio group list */
> +		spin_lock(&pc->sq_lock);
> +		queued = bio_grp_list_add(pc->sq, bio);
> +		spin_unlock(&pc->sq_lock);
> +	} else {
> +		queued = false;
> +	}
> +
> +	/*
> +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> +	 * and the bio is always completed from the pair poll context.

This reads awkwardly.. "pair poll context"?

> +	 *
> +	 * One invariant is that if bio isn't completed, blk_poll() will
> +	 * be called by passing cookie returned from submitting this bio.
> +	 */
> +	if (!queued)
> +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> +	else
> +		bio_set_flag(bio, BIO_END_BY_POLL);
> +
> +	return queued;
> +}
> +
> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> +{
> +	bio->bi_iter.bi_private_data = cookie;
> +}
> +
>  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>  {
>  	struct block_device *bdev = bio->bi_bdev;
> @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
>   * bio_list_on_stack[1] contains bios that were submitted before the current
>   *	->submit_bio_bio, but that haven't been processed yet.
>   */
> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
>  {
>  	struct bio_list bio_list_on_stack[2];
>  	blk_qc_t ret = BLK_QC_T_NONE;

I mentioned this in previous mail, but what is it you're trying to
convey with _int?

Think we need a better function name here.

> @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>  		bio_list_init(&bio_list_on_stack[0]);
>  
> -		ret = __submit_bio(bio);
> +		if (ioc && queue_is_mq(q) &&
> +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> +
> +			ret = __submit_bio(bio);
> +			if (queued)
> +				blk_bio_poll_post_submit(bio, ret);
> +		} else {
> +			ret = __submit_bio(bio);
> +		}
>  
>  		/*
>  		 * Sort new bios into those for a lower level and those for the
> @@ -1069,6 +1189,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  	return ret;
>  }
>  
> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
> +		struct io_context *ioc)
> +{
> +	struct blk_bio_poll_ctx *pc = ioc->data;
> +
> +	__submit_bio_noacct_int(bio, ioc);
> +
> +	/* bio submissions queued to per-task poll context */
> +	if (READ_ONCE(pc->sq->nr_grps))
> +		return current->pid;
> +
> +	/* swapper's pid is 0, but it can't submit poll IO for us */
> +	return 0;
> +}
> +
> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
> +{
> +	struct io_context *ioc = current->io_context;
> +
> +	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
> +		return __submit_bio_noacct_poll(bio, ioc);
> +
> +	return __submit_bio_noacct_int(bio, NULL);
> +}
> +
>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>  {
>  	struct bio_list bio_list[2] = { };
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 03f59915fe2c..f26950a51f4a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3865,14 +3865,185 @@ static inline int blk_mq_poll_hctx(struct request_queue *q,
>  	return ret;
>  }
>  
> +static blk_qc_t bio_get_poll_cookie(struct bio *bio)
> +{
> +	return bio->bi_iter.bi_private_data;
> +}
> +
> +static int blk_mq_poll_io(struct bio *bio)
> +{
> +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> +	blk_qc_t cookie = bio_get_poll_cookie(bio);
> +	int ret = 0;
> +
> +	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
> +		struct blk_mq_hw_ctx *hctx =
> +			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
> +
> +		ret += blk_mq_poll_hctx(q, hctx);
> +	}
> +	return ret;
> +}
> +
> +static int blk_bio_poll_and_end_io(struct request_queue *q,
> +		struct blk_bio_poll_ctx *poll_ctx)
> +{
> +	int ret = 0;
> +	int i;
> +
> +	/*
> +	 * Poll hw queue first.
> +	 *
> +	 * TODO: limit max poll times and make sure to not poll same
> +	 * hw queue one more time.
> +	 */
> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> +		struct bio *bio;
> +
> +		if (bio_grp_list_grp_empty(grp))
> +			continue;
> +
> +		for (bio = grp->list.head; bio; bio = bio->bi_poll)
> +			ret += blk_mq_poll_io(bio);
> +	}
> +
> +	/* reap bios */
> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> +		struct bio *bio;
> +		struct bio_list bl;
> +
> +		if (bio_grp_list_grp_empty(grp))
> +			continue;
> +
> +		bio_list_init(&bl);
> +
> +		while ((bio = __bio_grp_list_pop(&grp->list))) {
> +			if (bio_flagged(bio, BIO_DONE)) {
> +

Remove empty newline? ^

> +				/* now recover original data */
> +				bio->bi_poll = grp->grp_data;
> +
> +				/* clear BIO_END_BY_POLL and end me really */
> +				bio_clear_flag(bio, BIO_END_BY_POLL);
> +				bio_endio(bio);
> +			} else {
> +				__bio_grp_list_add(&bl, bio);
> +			}
> +		}
> +		__bio_grp_list_merge(&grp->list, &bl);
> +	}
> +	return ret;
> +}
> +
> +static int __blk_bio_poll_io(struct request_queue *q,
> +		struct blk_bio_poll_ctx *submit_ctx,
> +		struct blk_bio_poll_ctx *poll_ctx)
> +{
> +	/*
> +	 * Move IO submission result from submission queue in submission
> +	 * context to poll queue of poll context.
> +	 */
> +	spin_lock(&submit_ctx->sq_lock);
> +	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
> +	spin_unlock(&submit_ctx->sq_lock);
> +
> +	return blk_bio_poll_and_end_io(q, poll_ctx);
> +}
> +
> +static int blk_bio_poll_io(struct request_queue *q,
> +		struct io_context *submit_ioc,
> +		struct io_context *poll_ioc)
> +{
> +	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
> +	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
> +	int ret;
> +
> +	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
> +		mutex_lock(&poll_ctx->pq_lock);
> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> +		mutex_unlock(&poll_ctx->pq_lock);
> +	} else {
> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> +	}
> +	return ret;
> +}
> +
> +static bool blk_bio_ioc_valid(struct task_struct *t)
> +{
> +	if (!t)
> +		return false;
> +
> +	if (!t->io_context)
> +		return false;
> +
> +	if (!t->io_context->data)
> +		return false;
> +
> +	return true;
> +}
> +
> +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> +{
> +	struct io_context *poll_ioc = current->io_context;
> +	pid_t pid;
> +	struct task_struct *submit_task;
> +	int ret;
> +
> +	pid = (pid_t)cookie;
> +
> +	/* io poll often share io submission context */
> +	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
> +		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
> +
> +	submit_task = find_get_task_by_vpid(pid);
> +	if (likely(blk_bio_ioc_valid(submit_task)))
> +		ret = blk_bio_poll_io(q, submit_task->io_context,
> +				poll_ioc);

Style nit, but think it fine to put "poll_ioc);" on previous line.
Otherwise, best to add braces.

> +	else
> +		ret = 0;
> +
> +	put_task_struct(submit_task);
> +
> +	return ret;
> +}
> +
>  static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
>  {
> +	long state;
> +
> +	/* no need to poll */
> +	if (cookie == 0)
> +		return 0;
> +
>  	/*
>  	 * Create poll queue for storing poll bio and its cookie from
>  	 * submission queue
>  	 */
>  	blk_create_io_context(q, true);
>  
> +	state = current->state;
> +	do {
> +		int ret;
> +
> +		ret = __blk_bio_poll(q, cookie);
> +		if (ret > 0) {
> +			__set_current_state(TASK_RUNNING);
> +			return ret;
> +		}
> +
> +		if (signal_pending_state(state, current))
> +			__set_current_state(TASK_RUNNING);
> +
> +		if (current->state == TASK_RUNNING)
> +			return 1;
> +		if (ret < 0 || !spin)
> +			break;
> +		cpu_relax();
> +	} while (!need_resched());
> +
> +	__set_current_state(TASK_RUNNING);
>  	return 0;
>  }
>  
> @@ -3893,7 +4064,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
>  	struct blk_mq_hw_ctx *hctx;
>  	long state;
>  
> -	if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q))
> +	if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie)))
>  		return 0;
>  
>  	if (current->plug)
> diff --git a/block/blk.h b/block/blk.h
> index ae58a706327e..05b9f5eafdd1 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -403,4 +403,13 @@ static inline void blk_create_io_context(struct request_queue *q,
>  		bio_poll_ctx_alloc(ioc);
>  }
>  
> +BIO_LIST_HELPERS(__bio_grp_list, poll);

Why the leading double underscore?
Especially given the following 2 helpers don't have leading double underscore?

Whichever you decide, just looking for consistency.

> +
> +static inline bool bio_grp_list_grp_empty(struct bio_grp_list_data *grp)
> +{
> +	return bio_list_empty(&grp->list);
> +}
> +
> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src);
> +
>  #endif /* BLK_INTERNAL_H */
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index a1bcade4bcc3..2d47679bac71 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -235,7 +235,18 @@ struct bio {
>  
>  	struct bvec_iter	bi_iter;
>  
> -	bio_end_io_t		*bi_end_io;
> +	union {
> +		bio_end_io_t		*bi_end_io;
> +		/*
> +		 * bio based io poll need to track bio via bio group list

s/poll need/polling needs/

> +		 * which groups bio by same .bi_end_io, and original
> +		 * .bi_end_io is save into the group head. Will recover

s/save/saved/

> +		 * .bi_end_io before end this bio really. BIO_END_BY_POLL

s/before end this bio really/before really ending bio/

> +		 * will make sure that this bio won't be really ended

s/really//

> +		 * before recovering .bi_end_io.
> +		 */
> +		struct bio		*bi_poll;
> +	};
>  
>  	void			*bi_private;
>  #ifdef CONFIG_BLK_CGROUP
> @@ -304,6 +315,9 @@ enum {
>  	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
>  	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
>  	BIO_REMAPPED,
> +	BIO_END_BY_POLL,	/* end by blk_bio_poll() explicitly */
> +	/* set when bio can be ended, used for bio with BIO_END_BY_POLL */
> +	BIO_DONE,
>  	BIO_FLAG_LAST
>  };
>  
> -- 
> 2.29.2
>
Jingbo Xu March 20, 2021, 5:56 a.m. UTC | #7
On 3/19/21 9:46 PM, Ming Lei wrote:
> On Fri, Mar 19, 2021 at 05:38:38PM +0800, JeffleXu wrote:
>> I'm thinking how this mechanism could work with *original* bio-based
>> devices that aren't built upon mq devices, such as nvdimm. This
> 
> non-mq device needs driver to implement io polling by itself, block
> layer can't help it, and that can't be this patchset's job.
> 
>> mechanism (also including my original design) mainly focuses on virtual
>> devices that are built upon mq devices, i.e., md/dm.
>>
>> As the original bio-based devices want to support IO polling in the
>> future, they should be somehow distinguished from md/dm.
>>
>>
>> On 3/19/21 12:48 AM, Ming Lei wrote:
>>> Currently bio based IO poll needs to poll all hw queue blindly, this way
>>> is very inefficient, and the big reason is that we can't pass bio
>>> submission result to io poll task.
>>>
>>> In IO submission context, track associated underlying bios by per-task
>>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
>>> and return current->pid to caller of submit_bio() for any bio based
>>> driver's IO, which is submitted from FS.
>>>
>>> In IO poll context, the passed cookie tells us the PID of submission
>>> context, and we can find the bio from that submission context. Moving
>>> bio from submission queue to poll queue of the poll context, and keep
>>> polling until these bios are ended. Remove bio from poll queue if the
>>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
>>>
>>> In previous version, kfifo is used to implement submission queue, and
>>> Jeffle Xu found that kfifo can't scale well in case of high queue depth.
>>> So far bio's size is close to 2 cacheline size, and it may not be
>>> accepted to add new field into bio for solving the scalability issue by
>>> tracking bios via linked list, switch to bio group list for tracking bio,
>>> the idea is to reuse .bi_end_io for linking bios into a linked list for
>>> all sharing same .bi_end_io(call it bio group), which is recovered before
>>> really end bio, since BIO_END_BY_POLL is added for enhancing this point.
>>> Usually .bi_end_bio is same for all bios in same layer, so it is enough to
>>> provide very limited groups, such as 32 for fixing the scalability issue.
>>>
>>> Usually submission shares context with io poll. The per-task poll context
>>> is just like stack variable, and it is cheap to move data between the two
>>> per-task queues.
>>>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>> ---
>>>  block/bio.c               |   5 ++
>>>  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
>>>  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
>>>  block/blk.h               |   9 ++
>>>  include/linux/blk_types.h |  16 +++-
>>>  5 files changed, 348 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/block/bio.c b/block/bio.c
>>> index 26b7f721cda8..04c043dc60fc 100644
>>> --- a/block/bio.c
>>> +++ b/block/bio.c
>>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
>>>   **/
>>>  void bio_endio(struct bio *bio)
>>>  {
>>> +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
>>> +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
>>> +		bio_set_flag(bio, BIO_DONE);
>>> +		return;
>>> +	}
>>>  again:
>>>  	if (!bio_remaining_done(bio))
>>>  		return;
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index efc7a61a84b4..778d25a7e76c 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
>>>  		sizeof(struct bio_grp_list_data);
>>>  }
>>>  
>>> +static inline void *bio_grp_data(struct bio *bio)
>>> +{
>>> +	return bio->bi_poll;
>>> +}
>>> +
>>> +/* add bio into bio group list, return true if it is added */
>>> +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
>>> +{
>>> +	int i;
>>> +	struct bio_grp_list_data *grp;
>>> +
>>> +	for (i = 0; i < list->nr_grps; i++) {
>>> +		grp = &list->head[i];
>>> +		if (grp->grp_data == bio_grp_data(bio)) {
>>> +			__bio_grp_list_add(&grp->list, bio);
>>> +			return true;
>>> +		}
>>> +	}
>>> +
>>> +	if (i == list->max_nr_grps)
>>> +		return false;
>>> +
>>> +	/* create a new group */
>>> +	grp = &list->head[i];
>>> +	bio_list_init(&grp->list);
>>> +	grp->grp_data = bio_grp_data(bio);
>>> +	__bio_grp_list_add(&grp->list, bio);
>>> +	list->nr_grps++;
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
>>> +{
>>> +	int i;
>>> +	struct bio_grp_list_data *grp;
>>> +
>>> +	for (i = 0; i < list->max_nr_grps; i++) {
>>> +		grp = &list->head[i];
>>> +		if (grp->grp_data == grp_data)
>>> +			return i;
>>> +	}
>>> +	for (i = 0; i < list->max_nr_grps; i++) {
>>> +		grp = &list->head[i];
>>> +		if (bio_grp_list_grp_empty(grp))
>>> +			return i;
>>> +	}
>>> +	return -1;
>>> +}
>>> +
>>> +/* Move as many as possible groups from 'src' to 'dst' */
>>> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
>>> +{
>>> +	int i, j, cnt = 0;
>>> +	struct bio_grp_list_data *grp;
>>> +
>>> +	for (i = src->nr_grps - 1; i >= 0; i--) {
>>> +		grp = &src->head[i];
>>> +		j = bio_grp_list_find_grp(dst, grp->grp_data);
>>> +		if (j < 0)
>>> +			break;
>>> +		if (bio_grp_list_grp_empty(&dst->head[j]))
>>> +			dst->head[j].grp_data = grp->grp_data;
>>> +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
>>> +		bio_list_init(&grp->list);
>>> +		cnt++;
>>> +	}
>>> +
>>> +	src->nr_grps -= cnt;
>>> +}
>>
>> Not sure why it's checked in reverse order (starting from 'nr_grps - 1').
> 
> Then for bio group list in submission side, only first .nr_grps groups
> includes bios.
> 
>>
>>
>>> +
>>>  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
>>>  {
>>>  	pc->sq = (void *)pc + sizeof(*pc);
>>> @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
>>>  		bio->bi_opf |= REQ_TAG;
>>>  }
>>>  
>>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
>>> +{
>>> +	struct blk_bio_poll_ctx *pc = ioc->data;
>>> +	unsigned int queued;
>>> +
>>> +	/*
>>> +	 * We rely on immutable .bi_end_io between blk-mq bio submission
>>> +	 * and completion. However, bio crypt may update .bi_end_io during
>>> +	 * submitting, so simply not support bio based polling for this
>>> +	 * setting.
>>> +	 */
>>> +	if (likely(!bio_has_crypt_ctx(bio))) {
>>> +		/* track this bio via bio group list */
>>> +		spin_lock(&pc->sq_lock);
>>> +		queued = bio_grp_list_add(pc->sq, bio);
>>> +		spin_unlock(&pc->sq_lock);
>>> +	} else {
>>> +		queued = false;
>>> +	}
>>> +
>>> +	/*
>>> +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
>>> +	 * and the bio is always completed from the pair poll context.
>>> +	 *
>>> +	 * One invariant is that if bio isn't completed, blk_poll() will
>>> +	 * be called by passing cookie returned from submitting this bio.
>>> +	 */
>>> +	if (!queued)
>>> +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
>>> +	else
>>> +		bio_set_flag(bio, BIO_END_BY_POLL);
>>> +
>>> +	return queued;
>>> +}
>>> +
>>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
>>> +{
>>> +	bio->bi_iter.bi_private_data = cookie;
>>> +}
>>> +
>>>  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>>>  {
>>>  	struct block_device *bdev = bio->bi_bdev;
>>> @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
>>>   * bio_list_on_stack[1] contains bios that were submitted before the current
>>>   *	->submit_bio_bio, but that haven't been processed yet.
>>>   */
>>> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
>>>  {
>>>  	struct bio_list bio_list_on_stack[2];
>>>  	blk_qc_t ret = BLK_QC_T_NONE;
>>> @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>>>  		bio_list_init(&bio_list_on_stack[0]);
>>>  
>>> -		ret = __submit_bio(bio);
>>> +		if (ioc && queue_is_mq(q) &&
>>> +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
>>> +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
>>> +
>>> +			ret = __submit_bio(bio);
>>> +			if (queued)
>>> +				blk_bio_poll_post_submit(bio, ret);
>>> +		} else {
>>> +			ret = __submit_bio(bio);
>>> +		}
>>
>> If input @ioc is NULL, it will still return cookie (returned by
>> __submit_bio()), which will call into blk_bio_poll(), which is not
>> expected. (It can pass blk_queue_poll() check in blk_poll(), e.g., dm
>> device itself is marked as QUEUE_FLAG_POLL, but ->io_context of IO
>> submitting process failed to allocate the io_context, and thus calls
>> __submit_bio_noacct_int() with @ioc is NULL).
> 
> Good catch, looks the following change is needed:
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 778d25a7e76c..dba12ba0fa48 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1211,7 +1211,8 @@ static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
>  	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
>  		return __submit_bio_noacct_poll(bio, ioc);
>  
> -	return __submit_bio_noacct_int(bio, NULL);
> +	 __submit_bio_noacct_int(bio, NULL);
> +	return 0;
>  }

Looks good, as for now there is no original bio-based device supporting IO polling.


>  
>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> 
>>
>>
>>>  
>>>  		/*
>>>  		 * Sort new bios into those for a lower level and those for the
>>> @@ -1069,6 +1189,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>>  	return ret;
>>>  }
>>>  
>>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
>>> +		struct io_context *ioc)
>>> +{
>>> +	struct blk_bio_poll_ctx *pc = ioc->data;
>>> +
>>> +	__submit_bio_noacct_int(bio, ioc);
>>> +
>>> +	/* bio submissions queued to per-task poll context */
>>> +	if (READ_ONCE(pc->sq->nr_grps))
>>> +		return current->pid;
>>> +
>>> +	/* swapper's pid is 0, but it can't submit poll IO for us */
>>> +	return 0;
>>> +}
>>> +
>>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
>>> +{
>>> +	struct io_context *ioc = current->io_context;
>>> +
>>> +	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
>>> +		return __submit_bio_noacct_poll(bio, ioc);
>>> +
>>
>>
>>> +	return __submit_bio_noacct_int(bio, NULL);
>>> +}
>>> +
>>>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>>>  {
>>>  	struct bio_list bio_list[2] = { };
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 03f59915fe2c..f26950a51f4a 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -3865,14 +3865,185 @@ static inline int blk_mq_poll_hctx(struct request_queue *q,
>>>  	return ret;
>>>  }
>>>  
>>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio)
>>> +{
>>> +	return bio->bi_iter.bi_private_data;
>>> +}
>>> +
>>> +static int blk_mq_poll_io(struct bio *bio)
>>> +{
>>> +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
>>> +	blk_qc_t cookie = bio_get_poll_cookie(bio);
>>> +	int ret = 0;
>>> +
>>> +	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
>>> +		struct blk_mq_hw_ctx *hctx =
>>> +			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
>>> +
>>> +		ret += blk_mq_poll_hctx(q, hctx);
>>> +	}
>>> +	return ret;
>>> +}
>>> +
>>> +static int blk_bio_poll_and_end_io(struct request_queue *q,
>>> +		struct blk_bio_poll_ctx *poll_ctx)
>>> +{
>>> +	int ret = 0;
>>> +	int i;
>>> +
>>> +	/*
>>> +	 * Poll hw queue first.
>>> +	 *
>>> +	 * TODO: limit max poll times and make sure to not poll same
>>> +	 * hw queue one more time.
>>> +	 */
>>> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
>>> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
>>> +		struct bio *bio;
>>> +
>>> +		if (bio_grp_list_grp_empty(grp))
>>> +			continue;
>>> +
>>> +		for (bio = grp->list.head; bio; bio = bio->bi_poll)
>>> +			ret += blk_mq_poll_io(bio);
>>> +	}
>>> +
>>> +	/* reap bios */
>>> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
>>> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
>>> +		struct bio *bio;
>>> +		struct bio_list bl;
>>> +
>>> +		if (bio_grp_list_grp_empty(grp))
>>> +			continue;
>>> +
>>> +		bio_list_init(&bl);
>>> +
>>> +		while ((bio = __bio_grp_list_pop(&grp->list))) {
>>> +			if (bio_flagged(bio, BIO_DONE)) {
>>> +
>>> +				/* now recover original data */
>>> +				bio->bi_poll = grp->grp_data;
>>> +
>>> +				/* clear BIO_END_BY_POLL and end me really */
>>> +				bio_clear_flag(bio, BIO_END_BY_POLL);
>>> +				bio_endio(bio);
>>> +			} else {
>>> +				__bio_grp_list_add(&bl, bio);
>>> +			}
>>> +		}
>>> +		__bio_grp_list_merge(&grp->list, &bl);
>>> +	}
>>> +	return ret;
>>> +}
>>> +
>>> +static int __blk_bio_poll_io(struct request_queue *q,
>>> +		struct blk_bio_poll_ctx *submit_ctx,
>>> +		struct blk_bio_poll_ctx *poll_ctx)
>>> +{
>>> +	/*
>>> +	 * Move IO submission result from submission queue in submission
>>> +	 * context to poll queue of poll context.
>>> +	 */
>>> +	spin_lock(&submit_ctx->sq_lock);
>>> +	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
>>> +	spin_unlock(&submit_ctx->sq_lock);
>>> +
>>> +	return blk_bio_poll_and_end_io(q, poll_ctx);
>>> +}
>>> +
>>> +static int blk_bio_poll_io(struct request_queue *q,
>>> +		struct io_context *submit_ioc,
>>> +		struct io_context *poll_ioc)
>>> +{
>>> +	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
>>> +	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
>>> +	int ret;
>>> +
>>> +	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
>>> +		mutex_lock(&poll_ctx->pq_lock);
>>
>> Why mutex is used to protect pq here rather than spinlock? Where will
> 
> spinlock should be fine.
> 
>> the polling routine go to sleep?
> 
> The current blk_poll() can go to sleep really.

I know that hybrid polling can go to sleep. Apart from that, it seems no
other place can sleep?


> 
>>
>> Besides, how to protect the concurrent bio_list operation to sq between
>> producer (submission routine) and consumer (polling routine)? As far as
> 
> submit_ctx->sq_lock is held in __blk_bio_poll_io() for moving bios
> from sq to pq, see __blk_bio_poll_io().
> 
>> I understand, pc->sq_lock is used to prevent concurrent access from
>> multiple submission processes, while pc->pq_lock is used to prevent
>> concurrent access from multiple polling processes.
> 
> Usually poll context needn't any lock except for shared io context,
> because blk_bio_poll_ctx->pq is only accessed in poll context, and
> it is still per-task.
> 
>>
>>
>>> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
>>> +		mutex_unlock(&poll_ctx->pq_lock);
>>> +	} else {
>>> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
>>> +	}
>>> +	return ret;
>>> +}
>>> +
>>> +static bool blk_bio_ioc_valid(struct task_struct *t)
>>> +{
>>> +	if (!t)
>>> +		return false;
>>> +
>>> +	if (!t->io_context)
>>> +		return false;
>>> +
>>> +	if (!t->io_context->data)
>>> +		return false;
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
>>> +{
>>> +	struct io_context *poll_ioc = current->io_context;
>>> +	pid_t pid;
>>> +	struct task_struct *submit_task;
>>> +	int ret;
>>> +
>>> +	pid = (pid_t)cookie;
>>> +
>>> +	/* io poll often share io submission context */
>>> +	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
>>> +		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
>>> +
>>
>>> +	submit_task = find_get_task_by_vpid(pid);
>>
>> What if the process to which the returned cookie refers has exited?
>> find_get_task_by_vpid() will return NULL, thus blk_poll() won't help
>> reap anything, while maybe there are still bios in the poll context,
>> waiting to be reaped.
>>
>> Maybe we need to flush the poll context when a task detaches from the
>> io_context.
> 
> Yeah, I know that issue, and just not address it in RFC stage.
> 
> It can be handled in the following way:
> 
> 1) drain all bios in submission context until all bios are completed before
> it exits since the code won't sleep.
> 
> OR
> 
> 2) schedule wq for completing all submitted bios.
> 
> If poll context exits, and there is still bios not completed, not sure
> if it can happen because the current in-tree code has the same issue.
> 
>>
>>
>>> +	if (likely(blk_bio_ioc_valid(submit_task)))
>>> +		ret = blk_bio_poll_io(q, submit_task->io_context,
>>> +				poll_ioc);
>>
>> poll_ioc may be invalid in this case, since the previous
>> blk_create_io_context() in blk_bio_poll() may fail.
> 
> Yeah, it can be addressed by above patch, 0 will be returned
> for this case.
> 

Not really. I mean in the case of 'current->pid != pid', poll_ioc, i.e.
current->io_context, could be invalid: current->io_context is NULL,
or current->io_context->data is NULL. Maybe it should be fixed by

+	if (likely(blk_bio_ioc_valid(submit_task) && blk_bio_ioc_valid(current)))
+		ret = blk_bio_poll_io(q, submit_task->io_context,
+				poll_ioc);


Can't understand why the above patch you mentioned could fix this.
Sagi Grimberg March 23, 2021, 3:46 a.m. UTC | #8
> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> +{
> +	bio->bi_iter.bi_private_data = cookie;
> +}
> +

Hey Ming, thinking about nvme-mpath, I think this should be an exported
function for failover. nvme-mpath updates bio->bi_bdev when re-submitting
I/Os to an alternate path, so if this function is exported then nvme-mpath
could do as little as the below to allow polling?

--
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 92adebfaf86f..e562e296153b 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -345,6 +345,7 @@ static void nvme_requeue_work(struct work_struct *work)
         struct nvme_ns_head *head =
                 container_of(work, struct nvme_ns_head, requeue_work);
         struct bio *bio, *next;
+       blk_qc_t cookie;

         spin_lock_irq(&head->requeue_lock);
         next = bio_list_get(&head->requeue_list);
@@ -359,7 +360,8 @@ static void nvme_requeue_work(struct work_struct *work)
                  * path.
                  */
                 bio_set_dev(bio, head->disk->part0);
-               submit_bio_noacct(bio);
+               cookie = submit_bio_noacct(bio);
+               blk_bio_poll_post_submit(bio, cookie);
         }
  }
--

I/O failover will create a misalignment between the polling context cpu
and the submission cpu (running requeue_work), but I don't see anything
that would break here...
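
Exporting it should be trivial, something like the below (assuming it
stays where it is today and grows a declaration in include/linux/blkdev.h):

void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
{
	bio->bi_iter.bi_private_data = cookie;
}
EXPORT_SYMBOL_GPL(blk_bio_poll_post_submit);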

Thoughts?
Ming Lei March 23, 2021, 11:39 a.m. UTC | #9
On Sat, Mar 20, 2021 at 01:56:13PM +0800, JeffleXu wrote:
> 
> 
> On 3/19/21 9:46 PM, Ming Lei wrote:
> > On Fri, Mar 19, 2021 at 05:38:38PM +0800, JeffleXu wrote:
> >> I'm thinking how this mechanism could work with *original* bio-based
> >> devices that aren't built upon mq devices, such as nvdimm. This
> > 
> > non-mq device needs driver to implement io polling by itself, block
> > layer can't help it, and that can't be this patchset's job.
> > 
> >> mechanism (also including my original design) mainly focuses on virtual
> >> devices that are built upon mq devices, i.e., md/dm.
> >>
> >> As the original bio-based devices want to support IO polling in the
> >> future, they should be somehow distinguished from md/dm.
> >>
> >>
> >> On 3/19/21 12:48 AM, Ming Lei wrote:
> >>> Currently bio based IO poll needs to poll all hw queue blindly, this way
> >>> is very inefficient, and the big reason is that we can't pass bio
> >>> submission result to io poll task.
> >>>
> >>> In IO submission context, track associated underlying bios by per-task
> >>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> >>> and return current->pid to caller of submit_bio() for any bio based
> >>> driver's IO, which is submitted from FS.
> >>>
> >>> In IO poll context, the passed cookie tells us the PID of submission
> >>> context, and we can find the bio from that submission context. Moving
> >>> bio from submission queue to poll queue of the poll context, and keep
> >>> polling until these bios are ended. Remove bio from poll queue if the
> >>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> >>>
> >>> In previous version, kfifo is used to implement submission queue, and
> >>> Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> >>> So far bio's size is close to 2 cacheline size, and it may not be
> >>> accepted to add new field into bio for solving the scalability issue by
> >>> tracking bios via linked list, switch to bio group list for tracking bio,
> >>> the idea is to reuse .bi_end_io for linking bios into a linked list for
> >>> all sharing same .bi_end_io(call it bio group), which is recovered before
> >>> really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> >>> Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> >>> provide very limited groups, such as 32 for fixing the scalability issue.
> >>>
> >>> Usually submission shares context with io poll. The per-task poll context
> >>> is just like stack variable, and it is cheap to move data between the two
> >>> per-task queues.
> >>>
> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>> ---
> >>>  block/bio.c               |   5 ++
> >>>  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
> >>>  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
> >>>  block/blk.h               |   9 ++
> >>>  include/linux/blk_types.h |  16 +++-
> >>>  5 files changed, 348 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/block/bio.c b/block/bio.c
> >>> index 26b7f721cda8..04c043dc60fc 100644
> >>> --- a/block/bio.c
> >>> +++ b/block/bio.c
> >>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
> >>>   **/
> >>>  void bio_endio(struct bio *bio)
> >>>  {
> >>> +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> >>> +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> >>> +		bio_set_flag(bio, BIO_DONE);
> >>> +		return;
> >>> +	}
> >>>  again:
> >>>  	if (!bio_remaining_done(bio))
> >>>  		return;
> >>> diff --git a/block/blk-core.c b/block/blk-core.c
> >>> index efc7a61a84b4..778d25a7e76c 100644
> >>> --- a/block/blk-core.c
> >>> +++ b/block/blk-core.c
> >>> @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
> >>>  		sizeof(struct bio_grp_list_data);
> >>>  }
> >>>  
> >>> +static inline void *bio_grp_data(struct bio *bio)
> >>> +{
> >>> +	return bio->bi_poll;
> >>> +}
> >>> +
> >>> +/* add bio into bio group list, return true if it is added */
> >>> +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> >>> +{
> >>> +	int i;
> >>> +	struct bio_grp_list_data *grp;
> >>> +
> >>> +	for (i = 0; i < list->nr_grps; i++) {
> >>> +		grp = &list->head[i];
> >>> +		if (grp->grp_data == bio_grp_data(bio)) {
> >>> +			__bio_grp_list_add(&grp->list, bio);
> >>> +			return true;
> >>> +		}
> >>> +	}
> >>> +
> >>> +	if (i == list->max_nr_grps)
> >>> +		return false;
> >>> +
> >>> +	/* create a new group */
> >>> +	grp = &list->head[i];
> >>> +	bio_list_init(&grp->list);
> >>> +	grp->grp_data = bio_grp_data(bio);
> >>> +	__bio_grp_list_add(&grp->list, bio);
> >>> +	list->nr_grps++;
> >>> +
> >>> +	return true;
> >>> +}
> >>> +
> >>> +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> >>> +{
> >>> +	int i;
> >>> +	struct bio_grp_list_data *grp;
> >>> +
> >>> +	for (i = 0; i < list->max_nr_grps; i++) {
> >>> +		grp = &list->head[i];
> >>> +		if (grp->grp_data == grp_data)
> >>> +			return i;
> >>> +	}
> >>> +	for (i = 0; i < list->max_nr_grps; i++) {
> >>> +		grp = &list->head[i];
> >>> +		if (bio_grp_list_grp_empty(grp))
> >>> +			return i;
> >>> +	}
> >>> +	return -1;
> >>> +}
> >>> +
> >>> +/* Move as many as possible groups from 'src' to 'dst' */
> >>> +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> >>> +{
> >>> +	int i, j, cnt = 0;
> >>> +	struct bio_grp_list_data *grp;
> >>> +
> >>> +	for (i = src->nr_grps - 1; i >= 0; i--) {
> >>> +		grp = &src->head[i];
> >>> +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> >>> +		if (j < 0)
> >>> +			break;
> >>> +		if (bio_grp_list_grp_empty(&dst->head[j]))
> >>> +			dst->head[j].grp_data = grp->grp_data;
> >>> +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> >>> +		bio_list_init(&grp->list);
> >>> +		cnt++;
> >>> +	}
> >>> +
> >>> +	src->nr_grps -= cnt;
> >>> +}
> >>
> >> Not sure why it's checked in reverse order (starting from 'nr_grps - 1').
> > 
> > Then for bio group list in submission side, only first .nr_grps groups
> > includes bios.
> > 
> >>
> >>
> >>> +
> >>>  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
> >>>  {
> >>>  	pc->sq = (void *)pc + sizeof(*pc);
> >>> @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
> >>>  		bio->bi_opf |= REQ_TAG;
> >>>  }
> >>>  
> >>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> >>> +{
> >>> +	struct blk_bio_poll_ctx *pc = ioc->data;
> >>> +	unsigned int queued;
> >>> +
> >>> +	/*
> >>> +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> >>> +	 * and completion. However, bio crypt may update .bi_end_io during
> >>> +	 * submitting, so simply not support bio based polling for this
> >>> +	 * setting.
> >>> +	 */
> >>> +	if (likely(!bio_has_crypt_ctx(bio))) {
> >>> +		/* track this bio via bio group list */
> >>> +		spin_lock(&pc->sq_lock);
> >>> +		queued = bio_grp_list_add(pc->sq, bio);
> >>> +		spin_unlock(&pc->sq_lock);
> >>> +	} else {
> >>> +		queued = false;
> >>> +	}
> >>> +
> >>> +	/*
> >>> +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> >>> +	 * and the bio is always completed from the pair poll context.
> >>> +	 *
> >>> +	 * One invariant is that if bio isn't completed, blk_poll() will
> >>> +	 * be called by passing cookie returned from submitting this bio.
> >>> +	 */
> >>> +	if (!queued)
> >>> +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> >>> +	else
> >>> +		bio_set_flag(bio, BIO_END_BY_POLL);
> >>> +
> >>> +	return queued;
> >>> +}
> >>> +
> >>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> >>> +{
> >>> +	bio->bi_iter.bi_private_data = cookie;
> >>> +}
> >>> +
> >>>  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
> >>>  {
> >>>  	struct block_device *bdev = bio->bi_bdev;
> >>> @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
> >>>   * bio_list_on_stack[1] contains bios that were submitted before the current
> >>>   *	->submit_bio_bio, but that haven't been processed yet.
> >>>   */
> >>> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
> >>>  {
> >>>  	struct bio_list bio_list_on_stack[2];
> >>>  	blk_qc_t ret = BLK_QC_T_NONE;
> >>> @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>>  		bio_list_on_stack[1] = bio_list_on_stack[0];
> >>>  		bio_list_init(&bio_list_on_stack[0]);
> >>>  
> >>> -		ret = __submit_bio(bio);
> >>> +		if (ioc && queue_is_mq(q) &&
> >>> +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> >>> +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> >>> +
> >>> +			ret = __submit_bio(bio);
> >>> +			if (queued)
> >>> +				blk_bio_poll_post_submit(bio, ret);
> >>> +		} else {
> >>> +			ret = __submit_bio(bio);
> >>> +		}
> >>
> >> If input @ioc is NULL, it will still return cookie (returned by
> >> __submit_bio()), which will call into blk_bio_poll(), which is not
> >> expected. (It can pass blk_queue_poll() check in blk_poll(), e.g., dm
> >> device itself is marked as QUEUE_FLAG_POLL, but ->io_context of IO
> >> submitting process failed to allocate the io_context, and thus calls
> >> __submit_bio_noacct_int() with @ioc is NULL).
> > 
> > Good catch, looks the following change is needed:
> > 
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 778d25a7e76c..dba12ba0fa48 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -1211,7 +1211,8 @@ static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
> >  	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
> >  		return __submit_bio_noacct_poll(bio, ioc);
> >  
> > -	return __submit_bio_noacct_int(bio, NULL);
> > +	 __submit_bio_noacct_int(bio, NULL);
> > +	return 0;
> >  }
> 
> Looks good as far as now no original bio-based device supporting IO polling.
> 
> 
> >  
> >  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> > 
> >>
> >>
> >>>  
> >>>  		/*
> >>>  		 * Sort new bios into those for a lower level and those for the
> >>> @@ -1069,6 +1189,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>>  	return ret;
> >>>  }
> >>>  
> >>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
> >>> +		struct io_context *ioc)
> >>> +{
> >>> +	struct blk_bio_poll_ctx *pc = ioc->data;
> >>> +
> >>> +	__submit_bio_noacct_int(bio, ioc);
> >>> +
> >>> +	/* bio submissions queued to per-task poll context */
> >>> +	if (READ_ONCE(pc->sq->nr_grps))
> >>> +		return current->pid;
> >>> +
> >>> +	/* swapper's pid is 0, but it can't submit poll IO for us */
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>> +{
> >>> +	struct io_context *ioc = current->io_context;
> >>> +
> >>> +	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
> >>> +		return __submit_bio_noacct_poll(bio, ioc);
> >>> +
> >>
> >>
> >>> +	return __submit_bio_noacct_int(bio, NULL);
> >>> +}
> >>> +
> >>>  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> >>>  {
> >>>  	struct bio_list bio_list[2] = { };
> >>> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >>> index 03f59915fe2c..f26950a51f4a 100644
> >>> --- a/block/blk-mq.c
> >>> +++ b/block/blk-mq.c
> >>> @@ -3865,14 +3865,185 @@ static inline int blk_mq_poll_hctx(struct request_queue *q,
> >>>  	return ret;
> >>>  }
> >>>  
> >>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio)
> >>> +{
> >>> +	return bio->bi_iter.bi_private_data;
> >>> +}
> >>> +
> >>> +static int blk_mq_poll_io(struct bio *bio)
> >>> +{
> >>> +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> >>> +	blk_qc_t cookie = bio_get_poll_cookie(bio);
> >>> +	int ret = 0;
> >>> +
> >>> +	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
> >>> +		struct blk_mq_hw_ctx *hctx =
> >>> +			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
> >>> +
> >>> +		ret += blk_mq_poll_hctx(q, hctx);
> >>> +	}
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static int blk_bio_poll_and_end_io(struct request_queue *q,
> >>> +		struct blk_bio_poll_ctx *poll_ctx)
> >>> +{
> >>> +	int ret = 0;
> >>> +	int i;
> >>> +
> >>> +	/*
> >>> +	 * Poll hw queue first.
> >>> +	 *
> >>> +	 * TODO: limit max poll times and make sure to not poll same
> >>> +	 * hw queue one more time.
> >>> +	 */
> >>> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> >>> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> >>> +		struct bio *bio;
> >>> +
> >>> +		if (bio_grp_list_grp_empty(grp))
> >>> +			continue;
> >>> +
> >>> +		for (bio = grp->list.head; bio; bio = bio->bi_poll)
> >>> +			ret += blk_mq_poll_io(bio);
> >>> +	}
> >>> +
> >>> +	/* reap bios */
> >>> +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> >>> +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> >>> +		struct bio *bio;
> >>> +		struct bio_list bl;
> >>> +
> >>> +		if (bio_grp_list_grp_empty(grp))
> >>> +			continue;
> >>> +
> >>> +		bio_list_init(&bl);
> >>> +
> >>> +		while ((bio = __bio_grp_list_pop(&grp->list))) {
> >>> +			if (bio_flagged(bio, BIO_DONE)) {
> >>> +
> >>> +				/* now recover original data */
> >>> +				bio->bi_poll = grp->grp_data;
> >>> +
> >>> +				/* clear BIO_END_BY_POLL and end me really */
> >>> +				bio_clear_flag(bio, BIO_END_BY_POLL);
> >>> +				bio_endio(bio);
> >>> +			} else {
> >>> +				__bio_grp_list_add(&bl, bio);
> >>> +			}
> >>> +		}
> >>> +		__bio_grp_list_merge(&grp->list, &bl);
> >>> +	}
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static int __blk_bio_poll_io(struct request_queue *q,
> >>> +		struct blk_bio_poll_ctx *submit_ctx,
> >>> +		struct blk_bio_poll_ctx *poll_ctx)
> >>> +{
> >>> +	/*
> >>> +	 * Move IO submission result from submission queue in submission
> >>> +	 * context to poll queue of poll context.
> >>> +	 */
> >>> +	spin_lock(&submit_ctx->sq_lock);
> >>> +	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
> >>> +	spin_unlock(&submit_ctx->sq_lock);
> >>> +
> >>> +	return blk_bio_poll_and_end_io(q, poll_ctx);
> >>> +}
> >>> +
> >>> +static int blk_bio_poll_io(struct request_queue *q,
> >>> +		struct io_context *submit_ioc,
> >>> +		struct io_context *poll_ioc)
> >>> +{
> >>> +	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
> >>> +	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
> >>> +	int ret;
> >>> +
> >>> +	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
> >>> +		mutex_lock(&poll_ctx->pq_lock);
> >>
> >> Why mutex is used to protect pq here rather than spinlock? Where will
> > 
> > spinlock should be fine.
> > 
> >> the polling routine go to sleep?
> > 
> > The current blk_poll() can go to sleep really.
> 
> I know that hybrid polling can go to sleep. Except that, it seems no
> other place can sleep?
> 
> 
> > 
> >>
> >> Besides, how to protect the concurrent bio_list operation to sq between
> >> producer (submission routine) and consumer (polling routine)? As far as
> > 
> > submit_ctx->sq_lock is held in __blk_bio_poll_io() for moving bios
> > from sq to pq, see __blk_bio_poll_io().
> > 
> >> I understand, pc->sq_lock is used to prevent concurrent access from
> >> multiple submission processes, while pc->pq_lock is used to prevent
> >> concurrent access from multiple polling processes.
> > 
> > Usually poll context needn't any lock except for shared io context,
> > because blk_bio_poll_ctx->pq is only accessed in poll context, and
> > it is still per-task.
> > 
> >>
> >>
> >>> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> >>> +		mutex_unlock(&poll_ctx->pq_lock);
> >>> +	} else {
> >>> +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> >>> +	}
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static bool blk_bio_ioc_valid(struct task_struct *t)
> >>> +{
> >>> +	if (!t)
> >>> +		return false;
> >>> +
> >>> +	if (!t->io_context)
> >>> +		return false;
> >>> +
> >>> +	if (!t->io_context->data)
> >>> +		return false;
> >>> +
> >>> +	return true;
> >>> +}
> >>> +
> >>> +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> >>> +{
> >>> +	struct io_context *poll_ioc = current->io_context;
> >>> +	pid_t pid;
> >>> +	struct task_struct *submit_task;
> >>> +	int ret;
> >>> +
> >>> +	pid = (pid_t)cookie;
> >>> +
> >>> +	/* io poll often share io submission context */
> >>> +	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
> >>> +		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
> >>> +
> >>
> >>> +	submit_task = find_get_task_by_vpid(pid);
> >>
> >> What if the process to which the returned cookie refers has exited?
> >> find_get_task_by_vpid() will return NULL, thus blk_poll() won't help
> >> reap anything, while maybe there are still bios in the poll context,
> >> waiting to be reaped.
> >>
> >> Maybe we need to flush the poll context when a task detaches from the
> >> io_context.
> > 
> > Yeah, I know that issue, and just not address it in RFC stage.
> > 
> > It can be handled in the following way:
> > 
> > 1) drain all bios in submission context until all bios are completed before
> > it exits since the code won't sleep.
> > 
> > OR
> > 
> > 2) schedule wq for completing all submitted bios.
> > 
> > If poll context exits, and there is still bios not completed, not sure
> > if it can happen because the current in-tree code has the same issue.
> > 
> >>
> >>
> >>> +	if (likely(blk_bio_ioc_valid(submit_task)))
> >>> +		ret = blk_bio_poll_io(q, submit_task->io_context,
> >>> +				poll_ioc);
> >>
> >> poll_ioc may be invalid in this case, since the previous
> >> blk_create_io_context() in blk_bio_poll() may fail.
> > 
> > Yeah, it can be addressed by above patch, 0 will be returned
> > for this case.
> > 
> 
> Not really. I mean in the case of 'current->pid != pid', poll_ioc, i.e.,
> current->io_context could be invalid, i.e., current->io_context is NULL,
> or current->io_context->data is NULL. Maybe it should be fixed by
> 
> +	if (likely(blk_bio_ioc_valid(submit_task) && blk_bio_ioc_valid(current)))
> +		ret = blk_bio_poll_io(q, submit_task->io_context,
> +				poll_ioc);
> 
> 
> Can't understand why the above patch you mentioned could fix this.

I meant the patch in which 0 is returned from '__submit_bio_noacct()'.

For the submission context, the cookie is either zero or refers to a
valid submission context which can be used to fetch bios.

But the poll context could be invalid; in that case it is fine to poll
on the submission context directly, and I will handle that in V3.
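
Something like the following fragment of __blk_bio_poll() could do it
(just a rough sketch, not necessarily what V3 will end up doing):

	submit_task = find_get_task_by_vpid(pid);
	if (likely(blk_bio_ioc_valid(submit_task))) {
		/*
		 * If the poller has no valid io_context/poll data of its
		 * own, just poll and reap in the submitter's context.
		 */
		if (!blk_bio_ioc_valid(current))
			poll_ioc = submit_task->io_context;
		ret = blk_bio_poll_io(q, submit_task->io_context, poll_ioc);
	} else {
		ret = 0;
	}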


Thanks,
Ming
Ming Lei March 23, 2021, 11:55 a.m. UTC | #10
On Fri, Mar 19, 2021 at 02:38:11PM -0400, Mike Snitzer wrote:
> On Thu, Mar 18 2021 at 12:48pm -0400,
> Ming Lei <ming.lei@redhat.com> wrote:
> 
> > Currently bio based IO poll needs to poll all hw queue blindly, this way
> > is very inefficient, and the big reason is that we can't pass bio
> > submission result to io poll task.
> 
> This is awkward because bio-based IO polling doesn't exist upstream yet,
> so this header should be covering your approach as a clean slate, e.g.:
> 
> The complexity associated with frequent bio splitting with bio-based
> devices makes it difficult to implement IO polling efficiently because
> the fan-out of underlying hw queues that need to be polled (as a
> side-effect of bios being split) creates a need for more easily mapping
> a group of bios to the hw queues that need to be polled.
> 
> > In IO submission context, track associated underlying bios by per-task
> > submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data,
> > and return current->pid to caller of submit_bio() for any bio based
> > driver's IO, which is submitted from FS.
> > 
> > In IO poll context, the passed cookie tells us the PID of submission
> > context, and we can find the bio from that submission context. Moving
> 
> Maybe be more precise by covering how all bios from that task's
> submission context will be moved to poll queue of poll context?

Strictly speaking, adding a poll context isn't a must; with the bio
grouping list taken in V2 it is just more efficient, because bios can
then be polled locally on a per-task queue.
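
For reference, the part of this patch that makes the local case cheap is
simply moving the shared submission queue into the task-local poll queue
under sq_lock, after which polling and reaping need no lock in the
common single-task case:

static int __blk_bio_poll_io(struct request_queue *q,
		struct blk_bio_poll_ctx *submit_ctx,
		struct blk_bio_poll_ctx *poll_ctx)
{
	/* sq is shared with submitters, so it is the only thing locked here */
	spin_lock(&submit_ctx->sq_lock);
	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
	spin_unlock(&submit_ctx->sq_lock);

	/* pq is private to the poll context in the single-task case */
	return blk_bio_poll_and_end_io(q, poll_ctx);
}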

> 
> > bio from submission queue to poll queue of the poll context, and keep
> > polling until these bios are ended. Remove bio from poll queue if the
> > bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose.
> > 
> > In previous version, kfifo is used to implement submission queue, and
> > Jeffle Xu found that kfifo can't scale well in case of high queue depth.
> 
> Awkward to reference "previous version", maybe instead say:
> 
> It was found that kfifo doesn't scale well for a submission queue as
> queue depth is increased, so a new mechanism for tracking bios is
> needed.

OK.

> 
> > So far bio's size is close to 2 cacheline size, and it may not be
> > accepted to add new field into bio for solving the scalability issue by
> > tracking bios via linked list, switch to bio group list for tracking bio,
> > the idea is to reuse .bi_end_io for linking bios into a linked list for
> > all sharing same .bi_end_io(call it bio group), which is recovered before
> > really end bio, since BIO_END_BY_POLL is added for enhancing this point.
> > Usually .bi_end_bio is same for all bios in same layer, so it is enough to
> > provide very limited groups, such as 32 for fixing the scalability issue.
> > 
> > Usually submission shares context with io poll. The per-task poll context
> > is just like stack variable, and it is cheap to move data between the two
> > per-task queues.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/bio.c               |   5 ++
> >  block/blk-core.c          | 149 +++++++++++++++++++++++++++++++-
> >  block/blk-mq.c            | 173 +++++++++++++++++++++++++++++++++++++-
> >  block/blk.h               |   9 ++
> >  include/linux/blk_types.h |  16 +++-
> >  5 files changed, 348 insertions(+), 4 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index 26b7f721cda8..04c043dc60fc 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio)
> >   **/
> >  void bio_endio(struct bio *bio)
> >  {
> > +	/* BIO_END_BY_POLL has to be set before calling submit_bio */
> > +	if (bio_flagged(bio, BIO_END_BY_POLL)) {
> > +		bio_set_flag(bio, BIO_DONE);
> > +		return;
> > +	}
> >  again:
> >  	if (!bio_remaining_done(bio))
> >  		return;
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index efc7a61a84b4..778d25a7e76c 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -805,6 +805,77 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
> >  		sizeof(struct bio_grp_list_data);
> >  }
> >  
> > +static inline void *bio_grp_data(struct bio *bio)
> > +{
> > +	return bio->bi_poll;
> > +}
> > +
> > +/* add bio into bio group list, return true if it is added */
> > +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
> > +{
> > +	int i;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = 0; i < list->nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (grp->grp_data == bio_grp_data(bio)) {
> > +			__bio_grp_list_add(&grp->list, bio);
> > +			return true;
> > +		}
> > +	}
> > +
> > +	if (i == list->max_nr_grps)
> > +		return false;
> > +
> > +	/* create a new group */
> > +	grp = &list->head[i];
> > +	bio_list_init(&grp->list);
> > +	grp->grp_data = bio_grp_data(bio);
> > +	__bio_grp_list_add(&grp->list, bio);
> > +	list->nr_grps++;
> > +
> > +	return true;
> > +}
> > +
> > +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
> > +{
> > +	int i;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = 0; i < list->max_nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (grp->grp_data == grp_data)
> > +			return i;
> > +	}
> > +	for (i = 0; i < list->max_nr_grps; i++) {
> > +		grp = &list->head[i];
> > +		if (bio_grp_list_grp_empty(grp))
> > +			return i;
> > +	}
> > +	return -1;
> > +}
> > +
> > +/* Move as many as possible groups from 'src' to 'dst' */
> > +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
> > +{
> > +	int i, j, cnt = 0;
> > +	struct bio_grp_list_data *grp;
> > +
> > +	for (i = src->nr_grps - 1; i >= 0; i--) {
> > +		grp = &src->head[i];
> > +		j = bio_grp_list_find_grp(dst, grp->grp_data);
> > +		if (j < 0)
> > +			break;
> > +		if (bio_grp_list_grp_empty(&dst->head[j]))
> > +			dst->head[j].grp_data = grp->grp_data;
> > +		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
> > +		bio_list_init(&grp->list);
> > +		cnt++;
> > +	}
> > +
> > +	src->nr_grps -= cnt;
> > +}
> > +
> >  static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
> >  {
> >  	pc->sq = (void *)pc + sizeof(*pc);
> > @@ -866,6 +937,46 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q,
> >  		bio->bi_opf |= REQ_TAG;
> >  }
> >  
> > +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
> > +{
> > +	struct blk_bio_poll_ctx *pc = ioc->data;
> > +	unsigned int queued;
> > +
> > +	/*
> > +	 * We rely on immutable .bi_end_io between blk-mq bio submission
> > +	 * and completion. However, bio crypt may update .bi_end_io during
> > +	 * submitting, so simply not support bio based polling for this
> 
> s/submitting/submission/
> s/not/don't/
> 
> > +	 * setting.
> > +	 */
> > +	if (likely(!bio_has_crypt_ctx(bio))) {
> > +		/* track this bio via bio group list */
> > +		spin_lock(&pc->sq_lock);
> > +		queued = bio_grp_list_add(pc->sq, bio);
> > +		spin_unlock(&pc->sq_lock);
> > +	} else {
> > +		queued = false;
> > +	}
> > +
> > +	/*
> > +	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
> > +	 * and the bio is always completed from the pair poll context.
> 
> This reads awkwardly.. "pair poll context"?

Actually I meant the poll context in which blk_poll(cookie) is called,
where 'cookie' is exactly the value returned by submit_bio() for the bio
based bio. The underlying blk-mq bio completes the bio based driver's
bio, and it shares the same poll context with that bio.
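
In other words the expected caller side usage is roughly the following
(only a sketch, not part of this patch; the 'done' flag is made up here
and would be set from the bio's original ->bi_end_io):

static void poll_one_hipri_bio(struct block_device *bdev, struct bio *bio,
			       bool *done)
{
	/* bio->bi_opf is assumed to have REQ_HIPRI set already */
	/* for a bio based driver this returns the submitting task's pid */
	blk_qc_t cookie = submit_bio(bio);

	/* the same task keeps polling with that cookie until the bio ends */
	while (!READ_ONCE(*done))
		blk_poll(bdev_get_queue(bdev), cookie, true);
}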

> 
> > +	 *
> > +	 * One invariant is that if bio isn't completed, blk_poll() will
> > +	 * be called by passing cookie returned from submitting this bio.
> > +	 */
> > +	if (!queued)
> > +		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
> > +	else
> > +		bio_set_flag(bio, BIO_END_BY_POLL);
> > +
> > +	return queued;
> > +}
> > +
> > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> > +{
> > +	bio->bi_iter.bi_private_data = cookie;
> > +}
> > +
> >  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
> >  {
> >  	struct block_device *bdev = bio->bi_bdev;
> > @@ -1020,7 +1131,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
> >   * bio_list_on_stack[1] contains bios that were submitted before the current
> >   *	->submit_bio_bio, but that haven't been processed yet.
> >   */
> > -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> > +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
> >  {
> >  	struct bio_list bio_list_on_stack[2];
> >  	blk_qc_t ret = BLK_QC_T_NONE;
> 
> I mentioned this in previous mail, but what is it you're trying to
> convey with _int?
> 
> Think we need a better function name here.

I will think about a better name; I haven't come up with one yet.

> 
> > @@ -1043,7 +1154,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >  		bio_list_on_stack[1] = bio_list_on_stack[0];
> >  		bio_list_init(&bio_list_on_stack[0]);
> >  
> > -		ret = __submit_bio(bio);
> > +		if (ioc && queue_is_mq(q) &&
> > +				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
> > +			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> > +
> > +			ret = __submit_bio(bio);
> > +			if (queued)
> > +				blk_bio_poll_post_submit(bio, ret);
> > +		} else {
> > +			ret = __submit_bio(bio);
> > +		}
> >  
> >  		/*
> >  		 * Sort new bios into those for a lower level and those for the
> > @@ -1069,6 +1189,31 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >  	return ret;
> >  }
> >  
> > +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
> > +		struct io_context *ioc)
> > +{
> > +	struct blk_bio_poll_ctx *pc = ioc->data;
> > +
> > +	__submit_bio_noacct_int(bio, ioc);
> > +
> > +	/* bio submissions queued to per-task poll context */
> > +	if (READ_ONCE(pc->sq->nr_grps))
> > +		return current->pid;
> > +
> > +	/* swapper's pid is 0, but it can't submit poll IO for us */
> > +	return 0;
> > +}
> > +
> > +static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
> > +{
> > +	struct io_context *ioc = current->io_context;
> > +
> > +	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
> > +		return __submit_bio_noacct_poll(bio, ioc);
> > +
> > +	return __submit_bio_noacct_int(bio, NULL);
> > +}
> > +
> >  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> >  {
> >  	struct bio_list bio_list[2] = { };
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 03f59915fe2c..f26950a51f4a 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -3865,14 +3865,185 @@ static inline int blk_mq_poll_hctx(struct request_queue *q,
> >  	return ret;
> >  }
> >  
> > +static blk_qc_t bio_get_poll_cookie(struct bio *bio)
> > +{
> > +	return bio->bi_iter.bi_private_data;
> > +}
> > +
> > +static int blk_mq_poll_io(struct bio *bio)
> > +{
> > +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> > +	blk_qc_t cookie = bio_get_poll_cookie(bio);
> > +	int ret = 0;
> > +
> > +	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
> > +		struct blk_mq_hw_ctx *hctx =
> > +			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
> > +
> > +		ret += blk_mq_poll_hctx(q, hctx);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static int blk_bio_poll_and_end_io(struct request_queue *q,
> > +		struct blk_bio_poll_ctx *poll_ctx)
> > +{
> > +	int ret = 0;
> > +	int i;
> > +
> > +	/*
> > +	 * Poll hw queue first.
> > +	 *
> > +	 * TODO: limit max poll times and make sure to not poll same
> > +	 * hw queue one more time.
> > +	 */
> > +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> > +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> > +		struct bio *bio;
> > +
> > +		if (bio_grp_list_grp_empty(grp))
> > +			continue;
> > +
> > +		for (bio = grp->list.head; bio; bio = bio->bi_poll)
> > +			ret += blk_mq_poll_io(bio);
> > +	}
> > +
> > +	/* reap bios */
> > +	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
> > +		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
> > +		struct bio *bio;
> > +		struct bio_list bl;
> > +
> > +		if (bio_grp_list_grp_empty(grp))
> > +			continue;
> > +
> > +		bio_list_init(&bl);
> > +
> > +		while ((bio = __bio_grp_list_pop(&grp->list))) {
> > +			if (bio_flagged(bio, BIO_DONE)) {
> > +
> 
> Remove empty newline? ^
> 
> > +				/* now recover original data */
> > +				bio->bi_poll = grp->grp_data;
> > +
> > +				/* clear BIO_END_BY_POLL and end me really */
> > +				bio_clear_flag(bio, BIO_END_BY_POLL);
> > +				bio_endio(bio);
> > +			} else {
> > +				__bio_grp_list_add(&bl, bio);
> > +			}
> > +		}
> > +		__bio_grp_list_merge(&grp->list, &bl);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static int __blk_bio_poll_io(struct request_queue *q,
> > +		struct blk_bio_poll_ctx *submit_ctx,
> > +		struct blk_bio_poll_ctx *poll_ctx)
> > +{
> > +	/*
> > +	 * Move IO submission result from submission queue in submission
> > +	 * context to poll queue of poll context.
> > +	 */
> > +	spin_lock(&submit_ctx->sq_lock);
> > +	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
> > +	spin_unlock(&submit_ctx->sq_lock);
> > +
> > +	return blk_bio_poll_and_end_io(q, poll_ctx);
> > +}
> > +
> > +static int blk_bio_poll_io(struct request_queue *q,
> > +		struct io_context *submit_ioc,
> > +		struct io_context *poll_ioc)
> > +{
> > +	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
> > +	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
> > +	int ret;
> > +
> > +	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
> > +		mutex_lock(&poll_ctx->pq_lock);
> > +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> > +		mutex_unlock(&poll_ctx->pq_lock);
> > +	} else {
> > +		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static bool blk_bio_ioc_valid(struct task_struct *t)
> > +{
> > +	if (!t)
> > +		return false;
> > +
> > +	if (!t->io_context)
> > +		return false;
> > +
> > +	if (!t->io_context->data)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
> > +{
> > +	struct io_context *poll_ioc = current->io_context;
> > +	pid_t pid;
> > +	struct task_struct *submit_task;
> > +	int ret;
> > +
> > +	pid = (pid_t)cookie;
> > +
> > +	/* io poll often share io submission context */
> > +	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
> > +		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
> > +
> > +	submit_task = find_get_task_by_vpid(pid);
> > +	if (likely(blk_bio_ioc_valid(submit_task)))
> > +		ret = blk_bio_poll_io(q, submit_task->io_context,
> > +				poll_ioc);
> 
> Style nit, but think it fine to put "poll_ioc);" on previous line.
> Otherwise, best to add braces.
> 
> > +	else
> > +		ret = 0;
> > +
> > +	put_task_struct(submit_task);
> > +
> > +	return ret;
> > +}
> > +
> >  static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
> >  {
> > +	long state;
> > +
> > +	/* no need to poll */
> > +	if (cookie == 0)
> > +		return 0;
> > +
> >  	/*
> >  	 * Create poll queue for storing poll bio and its cookie from
> >  	 * submission queue
> >  	 */
> >  	blk_create_io_context(q, true);
> >  
> > +	state = current->state;
> > +	do {
> > +		int ret;
> > +
> > +		ret = __blk_bio_poll(q, cookie);
> > +		if (ret > 0) {
> > +			__set_current_state(TASK_RUNNING);
> > +			return ret;
> > +		}
> > +
> > +		if (signal_pending_state(state, current))
> > +			__set_current_state(TASK_RUNNING);
> > +
> > +		if (current->state == TASK_RUNNING)
> > +			return 1;
> > +		if (ret < 0 || !spin)
> > +			break;
> > +		cpu_relax();
> > +	} while (!need_resched());
> > +
> > +	__set_current_state(TASK_RUNNING);
> >  	return 0;
> >  }
> >  
> > @@ -3893,7 +4064,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
> >  	struct blk_mq_hw_ctx *hctx;
> >  	long state;
> >  
> > -	if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q))
> > +	if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie)))
> >  		return 0;
> >  
> >  	if (current->plug)
> > diff --git a/block/blk.h b/block/blk.h
> > index ae58a706327e..05b9f5eafdd1 100644
> > --- a/block/blk.h
> > +++ b/block/blk.h
> > @@ -403,4 +403,13 @@ static inline void blk_create_io_context(struct request_queue *q,
> >  		bio_poll_ctx_alloc(ioc);
> >  }
> >  
> > +BIO_LIST_HELPERS(__bio_grp_list, poll);
> 
> Why the leading double underscore?
> Especially given the following 2 helpers don't have leading double underscore?
> 
> Whichever you decide, just looking for consistency.

The double underscore helpers are the base helpers used to build the
helpers without the double underscore.
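
For example, BIO_LIST_HELPERS(__bio_grp_list, poll) is expected to expand
to the usual bio_list operations, just chained through ->bi_poll instead
of ->bi_next, something like this (sketch of the assumed expansion; the
macro itself is added earlier in this series):

static inline void __bio_grp_list_add(struct bio_list *bl, struct bio *bio)
{
	/* link through the bi_poll member rather than bi_next */
	bio->bi_poll = NULL;
	if (bl->tail)
		bl->tail->bi_poll = bio;
	else
		bl->head = bio;
	bl->tail = bio;
}

bio_grp_list_add() and bio_grp_list_move() in blk-core.c are then built
on top of these base helpers and add the group lookup and bookkeeping.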


Thanks, 
Ming
Ming Lei March 23, 2021, 12:01 p.m. UTC | #11
On Mon, Mar 22, 2021 at 08:46:04PM -0700, Sagi Grimberg wrote:
> 
> > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> > +{
> > +	bio->bi_iter.bi_private_data = cookie;
> > +}
> > +
> 
> Hey Ming, thinking about nvme-mpath, I'm thinking that this should be
> an exported function for failover. nvme-mpath updates bio.bi_dev
> when re-submitting I/Os to an alternate path, so I'm thinking
> that if this function is exported then nvme-mpath could do as little
> as the below to allow polling?
> 
> --
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 92adebfaf86f..e562e296153b 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -345,6 +345,7 @@ static void nvme_requeue_work(struct work_struct *work)
>         struct nvme_ns_head *head =
>                 container_of(work, struct nvme_ns_head, requeue_work);
>         struct bio *bio, *next;
> +       blk_qc_t cookie;
> 
>         spin_lock_irq(&head->requeue_lock);
>         next = bio_list_get(&head->requeue_list);
> @@ -359,7 +360,8 @@ static void nvme_requeue_work(struct work_struct *work)
>                  * path.
>                  */
>                 bio_set_dev(bio, head->disk->part0);
> -               submit_bio_noacct(bio);
> +               cookie = submit_bio_noacct(bio);
> +               blk_bio_poll_post_submit(bio, cookie);
>         }
>  }
> --
> 
> I/O failover will create misalignment from the polling context cpu and
> the submission cpu (running requeue_work), but I don't see if there is
> something that would break here...

I understand requeue shouldn't be a common event, so I guess it is just
fine to fall back to IRQ based mode?

This patchset actually doesn't cover such bio submission from kernel context.
Sagi Grimberg March 23, 2021, 4:54 p.m. UTC | #12
>>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
>>> +{
>>> +	bio->bi_iter.bi_private_data = cookie;
>>> +}
>>> +
>>
>> Hey Ming, thinking about nvme-mpath, I'm thinking that this should be
>> an exported function for failover. nvme-mpath updates bio.bi_dev
>> when re-submitting I/Os to an alternate path, so I'm thinking
>> that if this function is exported then nvme-mpath could do as little
>> as the below to allow polling?
>>
>> --
>> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>> index 92adebfaf86f..e562e296153b 100644
>> --- a/drivers/nvme/host/multipath.c
>> +++ b/drivers/nvme/host/multipath.c
>> @@ -345,6 +345,7 @@ static void nvme_requeue_work(struct work_struct *work)
>>          struct nvme_ns_head *head =
>>                  container_of(work, struct nvme_ns_head, requeue_work);
>>          struct bio *bio, *next;
>> +       blk_qc_t cookie;
>>
>>          spin_lock_irq(&head->requeue_lock);
>>          next = bio_list_get(&head->requeue_list);
>> @@ -359,7 +360,8 @@ static void nvme_requeue_work(struct work_struct *work)
>>                   * path.
>>                   */
>>                  bio_set_dev(bio, head->disk->part0);
>> -               submit_bio_noacct(bio);
>> +               cookie = submit_bio_noacct(bio);
>> +               blk_bio_poll_post_submit(bio, cookie);
>>          }
>>   }
>> --
>>
>> I/O failover will create misalignment from the polling context cpu and
>> the submission cpu (running requeue_work), but I don't see if there is
>> something that would break here...
> 
> I understand requeue shouldn't be a common event, so I guess it is just
> fine to fall back to IRQ based mode?

Well, when it fails over, the I/O will probably be directed to the poll
queues. Maybe I'm missing something...

> This patchset actually doesn't cover such bio submission from kernel context.

What is the difference?
Ming Lei March 24, 2021, 12:10 a.m. UTC | #13
On Tue, Mar 23, 2021 at 09:54:36AM -0700, Sagi Grimberg wrote:
> 
> > > > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
> > > > +{
> > > > +	bio->bi_iter.bi_private_data = cookie;
> > > > +}
> > > > +
> > > 
> > > Hey Ming, thinking about nvme-mpath, I'm thinking that this should be
> > > an exported function for failover. nvme-mpath updates bio.bi_dev
> > > when re-submitting I/Os to an alternate path, so I'm thinking
> > > that if this function is exported then nvme-mpath could do as little
> > > as the below to allow polling?
> > > 
> > > --
> > > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> > > index 92adebfaf86f..e562e296153b 100644
> > > --- a/drivers/nvme/host/multipath.c
> > > +++ b/drivers/nvme/host/multipath.c
> > > @@ -345,6 +345,7 @@ static void nvme_requeue_work(struct work_struct *work)
> > >          struct nvme_ns_head *head =
> > >                  container_of(work, struct nvme_ns_head, requeue_work);
> > >          struct bio *bio, *next;
> > > +       blk_qc_t cookie;
> > > 
> > >          spin_lock_irq(&head->requeue_lock);
> > >          next = bio_list_get(&head->requeue_list);
> > > @@ -359,7 +360,8 @@ static void nvme_requeue_work(struct work_struct *work)
> > >                   * path.
> > >                   */
> > >                  bio_set_dev(bio, head->disk->part0);
> > > -               submit_bio_noacct(bio);
> > > +               cookie = submit_bio_noacct(bio);
> > > +               blk_bio_poll_post_submit(bio, cookie);
> > >          }
> > >   }
> > > --
> > > 
> > > I/O failover will create misalignment from the polling context cpu and
> > > the submission cpu (running requeue_work), but I don't see if there is
> > > something that would break here...
> > 
> > I understand requeue shouldn't be a common event, so I guess it is just
> > fine to fall back to IRQ based mode?
> 
> Well, when it fails over, the I/O will probably be directed to the poll
> queues. Maybe I'm missing something...

In this patchset, because the bio isn't submitted directly from the FS,
there is no polling context associated with it, so its HIPRI flag will
be cleared and it falls back to IRQ mode.
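
Concretely, with this patch the dispatch looks like the following; a
kernel-context submitter such as the nvme-mpath requeue worker normally
has no per-task poll data, so its bios take the plain path (comments
added here for illustration):

static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
{
	struct io_context *ioc = current->io_context;

	/*
	 * Only tasks that have set up a bio poll context (ioc->data) get
	 * their HIPRI bios tracked for bio based polling; kernel threads
	 * usually don't, so their bios are completed via IRQ.
	 */
	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
		return __submit_bio_noacct_poll(bio, ioc);

	return __submit_bio_noacct_int(bio, NULL);
}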

> 
> > This patchset actually doesn't cover such bio submission from kernel context.
> 
> What is the difference?

So far the upper layer (io_uring, dio, ...) needs to get the returned
cookie and then pass it to blk_poll().

For the requeue case, the cookie can't be passed back to the FS caller
of submit_bio(), so the bio can't be polled by the in-tree code.



Thanks,
Ming
Sagi Grimberg March 24, 2021, 3:43 p.m. UTC | #14
>> Well, when it fails over, the I/O will probably be directed to the poll
>> queues. Maybe I'm missing something...
> 
> In this patchset, because the bio isn't submitted directly from the FS,
> there is no polling context associated with it, so its HIPRI flag will
> be cleared and it falls back to IRQ mode.

I think that's fine for failover I/O...
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index 26b7f721cda8..04c043dc60fc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1402,6 +1402,11 @@  static inline bool bio_remaining_done(struct bio *bio)
  **/
 void bio_endio(struct bio *bio)
 {
+	/* BIO_END_BY_POLL has to be set before calling submit_bio */
+	if (bio_flagged(bio, BIO_END_BY_POLL)) {
+		bio_set_flag(bio, BIO_DONE);
+		return;
+	}
 again:
 	if (!bio_remaining_done(bio))
 		return;
diff --git a/block/blk-core.c b/block/blk-core.c
index efc7a61a84b4..778d25a7e76c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -805,6 +805,77 @@  static inline unsigned int bio_grp_list_size(unsigned int nr_grps)
 		sizeof(struct bio_grp_list_data);
 }
 
+static inline void *bio_grp_data(struct bio *bio)
+{
+	return bio->bi_poll;
+}
+
+/* add bio into bio group list, return true if it is added */
+static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio)
+{
+	int i;
+	struct bio_grp_list_data *grp;
+
+	for (i = 0; i < list->nr_grps; i++) {
+		grp = &list->head[i];
+		if (grp->grp_data == bio_grp_data(bio)) {
+			__bio_grp_list_add(&grp->list, bio);
+			return true;
+		}
+	}
+
+	if (i == list->max_nr_grps)
+		return false;
+
+	/* create a new group */
+	grp = &list->head[i];
+	bio_list_init(&grp->list);
+	grp->grp_data = bio_grp_data(bio);
+	__bio_grp_list_add(&grp->list, bio);
+	list->nr_grps++;
+
+	return true;
+}
+
+static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data)
+{
+	int i;
+	struct bio_grp_list_data *grp;
+
+	for (i = 0; i < list->max_nr_grps; i++) {
+		grp = &list->head[i];
+		if (grp->grp_data == grp_data)
+			return i;
+	}
+	for (i = 0; i < list->max_nr_grps; i++) {
+		grp = &list->head[i];
+		if (bio_grp_list_grp_empty(grp))
+			return i;
+	}
+	return -1;
+}
+
+/* Move as many as possible groups from 'src' to 'dst' */
+void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src)
+{
+	int i, j, cnt = 0;
+	struct bio_grp_list_data *grp;
+
+	for (i = src->nr_grps - 1; i >= 0; i--) {
+		grp = &src->head[i];
+		j = bio_grp_list_find_grp(dst, grp->grp_data);
+		if (j < 0)
+			break;
+		if (bio_grp_list_grp_empty(&dst->head[j]))
+			dst->head[j].grp_data = grp->grp_data;
+		__bio_grp_list_merge(&dst->head[j].list, &grp->list);
+		bio_list_init(&grp->list);
+		cnt++;
+	}
+
+	src->nr_grps -= cnt;
+}
+
 static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc)
 {
 	pc->sq = (void *)pc + sizeof(*pc);
@@ -866,6 +937,46 @@  static inline void blk_bio_poll_preprocess(struct request_queue *q,
 		bio->bi_opf |= REQ_TAG;
 }
 
+static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio)
+{
+	struct blk_bio_poll_ctx *pc = ioc->data;
+	unsigned int queued;
+
+	/*
+	 * We rely on immutable .bi_end_io between blk-mq bio submission
+	 * and completion. However, bio crypt may update .bi_end_io during
+	 * submitting, so simply not support bio based polling for this
+	 * setting.
+	 */
+	if (likely(!bio_has_crypt_ctx(bio))) {
+		/* track this bio via bio group list */
+		spin_lock(&pc->sq_lock);
+		queued = bio_grp_list_add(pc->sq, bio);
+		spin_unlock(&pc->sq_lock);
+	} else {
+		queued = false;
+	}
+
+	/*
+	 * Now the bio is added per-task fifo, mark it as END_BY_POLL,
+	 * and the bio is always completed from the pair poll context.
+	 *
+	 * One invariant is that if bio isn't completed, blk_poll() will
+	 * be called by passing cookie returned from submitting this bio.
+	 */
+	if (!queued)
+		bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG);
+	else
+		bio_set_flag(bio, BIO_END_BY_POLL);
+
+	return queued;
+}
+
+static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie)
+{
+	bio->bi_iter.bi_private_data = cookie;
+}
+
 static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
@@ -1020,7 +1131,7 @@  static blk_qc_t __submit_bio(struct bio *bio)
  * bio_list_on_stack[1] contains bios that were submitted before the current
  *	->submit_bio_bio, but that haven't been processed yet.
  */
-static blk_qc_t __submit_bio_noacct(struct bio *bio)
+static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc)
 {
 	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
@@ -1043,7 +1154,16 @@  static blk_qc_t __submit_bio_noacct(struct bio *bio)
 		bio_list_on_stack[1] = bio_list_on_stack[0];
 		bio_list_init(&bio_list_on_stack[0]);
 
-		ret = __submit_bio(bio);
+		if (ioc && queue_is_mq(q) &&
+				(bio->bi_opf & (REQ_HIPRI | REQ_TAG))) {
+			bool queued = blk_bio_poll_prep_submit(ioc, bio);
+
+			ret = __submit_bio(bio);
+			if (queued)
+				blk_bio_poll_post_submit(bio, ret);
+		} else {
+			ret = __submit_bio(bio);
+		}
 
 		/*
 		 * Sort new bios into those for a lower level and those for the
@@ -1069,6 +1189,31 @@  static blk_qc_t __submit_bio_noacct(struct bio *bio)
 	return ret;
 }
 
+static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio,
+		struct io_context *ioc)
+{
+	struct blk_bio_poll_ctx *pc = ioc->data;
+
+	__submit_bio_noacct_int(bio, ioc);
+
+	/* bio submissions queued to per-task poll context */
+	if (READ_ONCE(pc->sq->nr_grps))
+		return current->pid;
+
+	/* swapper's pid is 0, but it can't submit poll IO for us */
+	return 0;
+}
+
+static inline blk_qc_t __submit_bio_noacct(struct bio *bio)
+{
+	struct io_context *ioc = current->io_context;
+
+	if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI))
+		return __submit_bio_noacct_poll(bio, ioc);
+
+	return __submit_bio_noacct_int(bio, NULL);
+}
+
 static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
 {
 	struct bio_list bio_list[2] = { };
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 03f59915fe2c..f26950a51f4a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3865,14 +3865,185 @@  static inline int blk_mq_poll_hctx(struct request_queue *q,
 	return ret;
 }
 
+static blk_qc_t bio_get_poll_cookie(struct bio *bio)
+{
+	return bio->bi_iter.bi_private_data;
+}
+
+static int blk_mq_poll_io(struct bio *bio)
+{
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+	blk_qc_t cookie = bio_get_poll_cookie(bio);
+	int ret = 0;
+
+	if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) {
+		struct blk_mq_hw_ctx *hctx =
+			q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+
+		ret += blk_mq_poll_hctx(q, hctx);
+	}
+	return ret;
+}
+
+static int blk_bio_poll_and_end_io(struct request_queue *q,
+		struct blk_bio_poll_ctx *poll_ctx)
+{
+	int ret = 0;
+	int i;
+
+	/*
+	 * Poll hw queue first.
+	 *
+	 * TODO: limit max poll times and make sure to not poll same
+	 * hw queue one more time.
+	 */
+	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
+		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
+		struct bio *bio;
+
+		if (bio_grp_list_grp_empty(grp))
+			continue;
+
+		for (bio = grp->list.head; bio; bio = bio->bi_poll)
+			ret += blk_mq_poll_io(bio);
+	}
+
+	/* reap bios */
+	for (i = 0; i < poll_ctx->pq->max_nr_grps; i++) {
+		struct bio_grp_list_data *grp = &poll_ctx->pq->head[i];
+		struct bio *bio;
+		struct bio_list bl;
+
+		if (bio_grp_list_grp_empty(grp))
+			continue;
+
+		bio_list_init(&bl);
+
+		while ((bio = __bio_grp_list_pop(&grp->list))) {
+			if (bio_flagged(bio, BIO_DONE)) {
+
+				/* now recover original data */
+				bio->bi_poll = grp->grp_data;
+
+				/* clear BIO_END_BY_POLL and end me really */
+				bio_clear_flag(bio, BIO_END_BY_POLL);
+				bio_endio(bio);
+			} else {
+				__bio_grp_list_add(&bl, bio);
+			}
+		}
+		__bio_grp_list_merge(&grp->list, &bl);
+	}
+	return ret;
+}
+
+static int __blk_bio_poll_io(struct request_queue *q,
+		struct blk_bio_poll_ctx *submit_ctx,
+		struct blk_bio_poll_ctx *poll_ctx)
+{
+	/*
+	 * Move IO submission result from submission queue in submission
+	 * context to poll queue of poll context.
+	 */
+	spin_lock(&submit_ctx->sq_lock);
+	bio_grp_list_move(poll_ctx->pq, submit_ctx->sq);
+	spin_unlock(&submit_ctx->sq_lock);
+
+	return blk_bio_poll_and_end_io(q, poll_ctx);
+}
+
+static int blk_bio_poll_io(struct request_queue *q,
+		struct io_context *submit_ioc,
+		struct io_context *poll_ioc)
+{
+	struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data;
+	struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data;
+	int ret;
+
+	if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) {
+		mutex_lock(&poll_ctx->pq_lock);
+		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
+		mutex_unlock(&poll_ctx->pq_lock);
+	} else {
+		ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx);
+	}
+	return ret;
+}
+
+static bool blk_bio_ioc_valid(struct task_struct *t)
+{
+	if (!t)
+		return false;
+
+	if (!t->io_context)
+		return false;
+
+	if (!t->io_context->data)
+		return false;
+
+	return true;
+}
+
+static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie)
+{
+	struct io_context *poll_ioc = current->io_context;
+	pid_t pid;
+	struct task_struct *submit_task;
+	int ret;
+
+	pid = (pid_t)cookie;
+
+	/* io poll often share io submission context */
+	if (likely(current->pid == pid && blk_bio_ioc_valid(current)))
+		return blk_bio_poll_io(q, poll_ioc, poll_ioc);
+
+	submit_task = find_get_task_by_vpid(pid);
+	if (likely(blk_bio_ioc_valid(submit_task)))
+		ret = blk_bio_poll_io(q, submit_task->io_context,
+				poll_ioc);
+	else
+		ret = 0;
+
+	put_task_struct(submit_task);
+
+	return ret;
+}
+
 static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
+	long state;
+
+	/* no need to poll */
+	if (cookie == 0)
+		return 0;
+
 	/*
 	 * Create poll queue for storing poll bio and its cookie from
 	 * submission queue
 	 */
 	blk_create_io_context(q, true);
 
+	state = current->state;
+	do {
+		int ret;
+
+		ret = __blk_bio_poll(q, cookie);
+		if (ret > 0) {
+			__set_current_state(TASK_RUNNING);
+			return ret;
+		}
+
+		if (signal_pending_state(state, current))
+			__set_current_state(TASK_RUNNING);
+
+		if (current->state == TASK_RUNNING)
+			return 1;
+		if (ret < 0 || !spin)
+			break;
+		cpu_relax();
+	} while (!need_resched());
+
+	__set_current_state(TASK_RUNNING);
 	return 0;
 }
 
@@ -3893,7 +4064,7 @@  int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 	struct blk_mq_hw_ctx *hctx;
 	long state;
 
-	if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q))
+	if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie)))
 		return 0;
 
 	if (current->plug)
diff --git a/block/blk.h b/block/blk.h
index ae58a706327e..05b9f5eafdd1 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -403,4 +403,13 @@  static inline void blk_create_io_context(struct request_queue *q,
 		bio_poll_ctx_alloc(ioc);
 }
 
+BIO_LIST_HELPERS(__bio_grp_list, poll);
+
+static inline bool bio_grp_list_grp_empty(struct bio_grp_list_data *grp)
+{
+	return bio_list_empty(&grp->list);
+}
+
+void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src);
+
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1bcade4bcc3..2d47679bac71 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -235,7 +235,18 @@  struct bio {
 
 	struct bvec_iter	bi_iter;
 
-	bio_end_io_t		*bi_end_io;
+	union {
+		bio_end_io_t		*bi_end_io;
+		/*
+		 * bio based io poll need to track bio via bio group list
+		 * which groups bio by same .bi_end_io, and original
+		 * .bi_end_io is save into the group head. Will recover
+		 * .bi_end_io before end this bio really. BIO_END_BY_POLL
+		 * will make sure that this bio won't be really ended
+		 * before recovering .bi_end_io.
+		 */
+		struct bio		*bi_poll;
+	};
 
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
@@ -304,6 +315,9 @@  enum {
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
 	BIO_REMAPPED,
+	BIO_END_BY_POLL,	/* end by blk_bio_poll() explicitly */
+	/* set when bio can be ended, used for bio with BIO_END_BY_POLL */
+	BIO_DONE,
 	BIO_FLAG_LAST
 };