
[05/12] blk-mq: Introduce blk_mq_quiesce_queue()

Message ID 5143c240-39af-9fe2-d3e6-ed69f9c20531@sandisk.com (mailing list archive)
State Not Applicable

Commit Message

Bart Van Assche Oct. 26, 2016, 10:53 p.m. UTC
blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
have finished. This function does *not* wait until all outstanding
requests have finished (this means invocation of request.end_io()).
The algorithm used by blk_mq_quiesce_queue() is as follows:
* Hold either an RCU read lock or an SRCU read lock around
  .queue_rq() calls. The former is used if .queue_rq() does not
  block and the latter if .queue_rq() may block.
* blk_mq_quiesce_queue() calls synchronize_srcu() or
  synchronize_rcu() to wait for .queue_rq() invocations that
  started before blk_mq_quiesce_queue() was called.
* The blk_mq_hctx_stopped() calls that control whether or not
  .queue_rq() will be called are called with the (S)RCU read lock
  held. This is necessary to avoid race conditions against
  the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
  sequence from another thread.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
---
 block/Kconfig          |  1 +
 block/blk-mq.c         | 69 +++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/blk-mq.h |  3 +++
 include/linux/blkdev.h |  1 +
 4 files changed, 67 insertions(+), 7 deletions(-)

Comments

Ming Lei Oct. 27, 2016, 1:30 a.m. UTC | #1
On Thu, Oct 27, 2016 at 6:53 AM, Bart Van Assche
<bart.vanassche@sandisk.com> wrote:
> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
> have finished. This function does *not* wait until all outstanding
> requests have finished (this means invocation of request.end_io()).
> The algorithm used by blk_mq_quiesce_queue() is as follows:
> * Hold either an RCU read lock or an SRCU read lock around
>   .queue_rq() calls. The former is used if .queue_rq() does not
>   block and the latter if .queue_rq() may block.
> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>   synchronize_rcu() to wait for .queue_rq() invocations that
>   started before blk_mq_quiesce_queue() was called.
> * The blk_mq_hctx_stopped() calls that control whether or not
>   .queue_rq() will be called are called with the (S)RCU read lock
>   held. This is necessary to avoid race conditions against
>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>   sequence from another thread.
>
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <tom.leiming@gmail.com>
> Cc: Hannes Reinecke <hare@suse.com>
> Cc: Johannes Thumshirn <jthumshirn@suse.de>
> ---
>  block/Kconfig          |  1 +
>  block/blk-mq.c         | 69 +++++++++++++++++++++++++++++++++++++++++++++-----
>  include/linux/blk-mq.h |  3 +++
>  include/linux/blkdev.h |  1 +
>  4 files changed, 67 insertions(+), 7 deletions(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 1d4d624..0562ef9 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@ menuconfig BLOCK
>         bool "Enable the block layer" if EXPERT
>         default y
>         select SBITMAP
> +       select SRCU
>         help
>          Provide block layer support for the kernel.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0cf21c2..4945437 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -115,6 +115,31 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>
> +/**
> + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
> + * @q: request queue.
> + *
> + * Note: this function does not prevent that the struct request end_io()
> + * callback function is invoked. Additionally, it is not prevented that
> + * new queue_rq() calls occur unless the queue has been stopped first.
> + */
> +void blk_mq_quiesce_queue(struct request_queue *q)
> +{
> +       struct blk_mq_hw_ctx *hctx;
> +       unsigned int i;
> +       bool rcu = false;

Before synchronizing SRCU/RCU, we have to set a per-hctx flag
or per-queue flag to block incoming .queue_rq() calls; I think I
mentioned that before:

   https://www.spinics.net/lists/linux-rdma/msg41389.html
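
Something along these lines, maybe (rough, untested sketch on top of
your patch; it reuses the existing BLK_MQ_S_STOPPED bit, a new
per-queue flag would work as well):

void blk_mq_quiesce_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	bool rcu = false;

	/* stop all hw queues first so that no new .queue_rq() call can start */
	queue_for_each_hw_ctx(q, hctx, i)
		set_bit(BLK_MQ_S_STOPPED, &hctx->state);

	/* then wait for the .queue_rq() calls that are already in progress */
	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(&hctx->queue_rq_srcu);
		else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();
}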

> +
> +       queue_for_each_hw_ctx(q, hctx, i) {
> +               if (hctx->flags & BLK_MQ_F_BLOCKING)
> +                       synchronize_srcu(&hctx->queue_rq_srcu);
> +               else
> +                       rcu = true;
> +       }
> +       if (rcu)
> +               synchronize_rcu();
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
> +
>  void blk_mq_wake_waiters(struct request_queue *q)
>  {
>         struct blk_mq_hw_ctx *hctx;
> @@ -778,7 +803,7 @@ static inline unsigned int queued_to_index(unsigned int queued)
>   * of IO. In particular, we'd like FIFO behaviour on handling existing
>   * items on the hctx->dispatch list. Ignore that for now.
>   */
> -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> +static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
>  {
>         struct request_queue *q = hctx->queue;
>         struct request *rq;
> @@ -790,9 +815,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>         if (unlikely(blk_mq_hctx_stopped(hctx)))
>                 return;
>
> -       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
> -               cpu_online(hctx->next_cpu));
> -
>         hctx->run++;
>
>         /*
> @@ -883,6 +905,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>         }
>  }
>
> +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> +{
> +       int srcu_idx;
> +
> +       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
> +               cpu_online(hctx->next_cpu));
> +
> +       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> +               rcu_read_lock();
> +               blk_mq_process_rq_list(hctx);
> +               rcu_read_unlock();
> +       } else {
> +               srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
> +               blk_mq_process_rq_list(hctx);
> +               srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
> +       }
> +}
> +
>  /*
>   * It'd be great if the workqueue API had a way to pass
>   * in a mask and had some smarts for more clever placement.
> @@ -1293,7 +1333,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>         const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
>         struct blk_map_ctx data;
>         struct request *rq;
> -       unsigned int request_count = 0;
> +       unsigned int request_count = 0, srcu_idx;
>         struct blk_plug *plug;
>         struct request *same_queue_rq = NULL;
>         blk_qc_t cookie;
> @@ -1336,7 +1376,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>                 blk_mq_bio_to_request(rq, bio);
>
>                 /*
> -                * We do limited pluging. If the bio can be merged, do that.
> +                * We do limited plugging. If the bio can be merged, do that.
>                  * Otherwise the existing request in the plug list will be
>                  * issued. So the plug list will have one request at most
>                  */
> @@ -1356,7 +1396,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>                 blk_mq_put_ctx(data.ctx);
>                 if (!old_rq)
>                         goto done;
> -               blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
> +
> +               if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
> +                       rcu_read_lock();
> +                       blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
> +                       rcu_read_unlock();
> +               } else {
> +                       srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
> +                       blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
> +                       srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
> +               }
>                 goto done;
>         }
>
> @@ -1635,6 +1684,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
>         if (set->ops->exit_hctx)
>                 set->ops->exit_hctx(hctx, hctx_idx);
>
> +       if (hctx->flags & BLK_MQ_F_BLOCKING)
> +               cleanup_srcu_struct(&hctx->queue_rq_srcu);
> +
>         blk_mq_remove_cpuhp(hctx);
>         blk_free_flush_queue(hctx->fq);
>         sbitmap_free(&hctx->ctx_map);
> @@ -1715,6 +1767,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
>                                    flush_start_tag + hctx_idx, node))
>                 goto free_fq;
>
> +       if (hctx->flags & BLK_MQ_F_BLOCKING)
> +               init_srcu_struct(&hctx->queue_rq_srcu);
> +
>         return 0;
>
>   free_fq:
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index aa93000..61be48b 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -3,6 +3,7 @@
>
>  #include <linux/blkdev.h>
>  #include <linux/sbitmap.h>
> +#include <linux/srcu.h>
>
>  struct blk_mq_tags;
>  struct blk_flush_queue;
> @@ -35,6 +36,8 @@ struct blk_mq_hw_ctx {
>
>         struct blk_mq_tags      *tags;
>
> +       struct srcu_struct      queue_rq_srcu;
> +
>         unsigned long           queued;
>         unsigned long           run;
>  #define BLK_MQ_MAX_DISPATCH_ORDER      7
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c47c358..8259d87 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -824,6 +824,7 @@ extern void __blk_run_queue(struct request_queue *q);
>  extern void __blk_run_queue_uncond(struct request_queue *q);
>  extern void blk_run_queue(struct request_queue *);
>  extern void blk_run_queue_async(struct request_queue *q);
> +extern void blk_mq_quiesce_queue(struct request_queue *q);
>  extern int blk_rq_map_user(struct request_queue *, struct request *,
>                            struct rq_map_data *, void __user *, unsigned long,
>                            gfp_t);
> --
> 2.10.1
>
Bart Van Assche Oct. 27, 2016, 2:04 a.m. UTC | #2
On 10/26/16 18:30, Ming Lei wrote:
> On Thu, Oct 27, 2016 at 6:53 AM, Bart Van Assche
> <bart.vanassche@sandisk.com> wrote:
>> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
>> have finished. This function does *not* wait until all outstanding
>> requests have finished (this means invocation of request.end_io()).
>> The algorithm used by blk_mq_quiesce_queue() is as follows:
>> * Hold either an RCU read lock or an SRCU read lock around
>>   .queue_rq() calls. The former is used if .queue_rq() does not
>>   block and the latter if .queue_rq() may block.
>> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>>   synchronize_rcu() to wait for .queue_rq() invocations that
>>   started before blk_mq_quiesce_queue() was called.
>> * The blk_mq_hctx_stopped() calls that control whether or not
>>   .queue_rq() will be called are called with the (S)RCU read lock
>>   held. This is necessary to avoid race conditions against
>>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>>   sequence from another thread.
>>
>> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Ming Lei <tom.leiming@gmail.com>
>> Cc: Hannes Reinecke <hare@suse.com>
>> Cc: Johannes Thumshirn <jthumshirn@suse.de>
>> ---
>>  block/Kconfig          |  1 +
>>  block/blk-mq.c         | 69 +++++++++++++++++++++++++++++++++++++++++++++-----
>>  include/linux/blk-mq.h |  3 +++
>>  include/linux/blkdev.h |  1 +
>>  4 files changed, 67 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/Kconfig b/block/Kconfig
>> index 1d4d624..0562ef9 100644
>> --- a/block/Kconfig
>> +++ b/block/Kconfig
>> @@ -5,6 +5,7 @@ menuconfig BLOCK
>>         bool "Enable the block layer" if EXPERT
>>         default y
>>         select SBITMAP
>> +       select SRCU
>>         help
>>          Provide block layer support for the kernel.
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 0cf21c2..4945437 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -115,6 +115,31 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
>>  }
>>  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>>
>> +/**
>> + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
>> + * @q: request queue.
>> + *
>> + * Note: this function does not prevent that the struct request end_io()
>> + * callback function is invoked. Additionally, it is not prevented that
>> + * new queue_rq() calls occur unless the queue has been stopped first.
>> + */
>> +void blk_mq_quiesce_queue(struct request_queue *q)
>> +{
>> +       struct blk_mq_hw_ctx *hctx;
>> +       unsigned int i;
>> +       bool rcu = false;
>
> Before synchronizing SRCU/RCU, we have to set a per-hctx flag
> or per-queue flag to block incoming .queue_rq() calls; I think I
> mentioned that before:
>
>    https://www.spinics.net/lists/linux-rdma/msg41389.html

Hello Ming,

Thanks for having included an URL to an archived version of that 
discussion. What I remember about that discussion is that I proposed to 
use the existing flag BLK_MQ_S_STOPPED instead of introducing a
new QUEUE_FLAG_QUIESCING flag and that you agreed with that proposal. 
See also https://www.spinics.net/lists/linux-rdma/msg41430.html.

Bart.
Ming Lei Oct. 27, 2016, 2:31 a.m. UTC | #3
On Thu, Oct 27, 2016 at 10:04 AM, Bart Van Assche <bvanassche@acm.org> wrote:
> On 10/26/16 18:30, Ming Lei wrote:
>>
>> On Thu, Oct 27, 2016 at 6:53 AM, Bart Van Assche
>> <bart.vanassche@sandisk.com> wrote:
>>>
>>> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
>>> have finished. This function does *not* wait until all outstanding
>>> requests have finished (this means invocation of request.end_io()).
>>> The algorithm used by blk_mq_quiesce_queue() is as follows:
>>> * Hold either an RCU read lock or an SRCU read lock around
>>>   .queue_rq() calls. The former is used if .queue_rq() does not
>>>   block and the latter if .queue_rq() may block.
>>> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>>>   synchronize_rcu() to wait for .queue_rq() invocations that
>>>   started before blk_mq_quiesce_queue() was called.
>>> * The blk_mq_hctx_stopped() calls that control whether or not
>>>   .queue_rq() will be called are called with the (S)RCU read lock
>>>   held. This is necessary to avoid race conditions against
>>>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>>>   sequence from another thread.
>>>
>>> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
>>> Cc: Christoph Hellwig <hch@lst.de>
>>> Cc: Ming Lei <tom.leiming@gmail.com>
>>> Cc: Hannes Reinecke <hare@suse.com>
>>> Cc: Johannes Thumshirn <jthumshirn@suse.de>
>>> ---
>>>  block/Kconfig          |  1 +
>>>  block/blk-mq.c         | 69
>>> +++++++++++++++++++++++++++++++++++++++++++++-----
>>>  include/linux/blk-mq.h |  3 +++
>>>  include/linux/blkdev.h |  1 +
>>>  4 files changed, 67 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/block/Kconfig b/block/Kconfig
>>> index 1d4d624..0562ef9 100644
>>> --- a/block/Kconfig
>>> +++ b/block/Kconfig
>>> @@ -5,6 +5,7 @@ menuconfig BLOCK
>>>         bool "Enable the block layer" if EXPERT
>>>         default y
>>>         select SBITMAP
>>> +       select SRCU
>>>         help
>>>          Provide block layer support for the kernel.
>>>
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 0cf21c2..4945437 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -115,6 +115,31 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
>>>  }
>>>  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>>>
>>> +/**
>>> + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have
>>> finished
>>> + * @q: request queue.
>>> + *
>>> + * Note: this function does not prevent that the struct request end_io()
>>> + * callback function is invoked. Additionally, it is not prevented that
>>> + * new queue_rq() calls occur unless the queue has been stopped first.
>>> + */
>>> +void blk_mq_quiesce_queue(struct request_queue *q)
>>> +{
>>> +       struct blk_mq_hw_ctx *hctx;
>>> +       unsigned int i;
>>> +       bool rcu = false;
>>
>>
>> Before synchronizing SRCU/RCU, we have to set a per-hctx flag
>> or per-queue flag to block incoming .queue_rq() calls; I think I
>> mentioned that before:
>>
>>    https://www.spinics.net/lists/linux-rdma/msg41389.html
>
>
> Hello Ming,
>
> Thanks for having included an URL to an archived version of that discussion.
> What I remember about that discussion is that I proposed to use the existing
> flag BLK_MQ_S_STOPPED instead of introducing a
> new QUEUE_FLAG_QUIESCING flag and that you agreed with that proposal. See
> also https://www.spinics.net/lists/linux-rdma/msg41430.html.

Yes, I am fine with either one, but the flag needs to be set in
blk_mq_quiesce_queue(), doesn't it?


Thanks,
Ming Lei
Bart Van Assche Oct. 27, 2016, 2:40 a.m. UTC | #4
On 10/26/16 19:31, Ming Lei wrote:
> On Thu, Oct 27, 2016 at 10:04 AM, Bart Van Assche <bvanassche@acm.org> wrote:
>> On 10/26/16 18:30, Ming Lei wrote:
>>>
>>> On Thu, Oct 27, 2016 at 6:53 AM, Bart Van Assche
>>> <bart.vanassche@sandisk.com> wrote:
>>>>
>>>> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
>>>> have finished. This function does *not* wait until all outstanding
>>>> requests have finished (this means invocation of request.end_io()).
>>>> The algorithm used by blk_mq_quiesce_queue() is as follows:
>>>> * Hold either an RCU read lock or an SRCU read lock around
>>>>   .queue_rq() calls. The former is used if .queue_rq() does not
>>>>   block and the latter if .queue_rq() may block.
>>>> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>>>>   synchronize_rcu() to wait for .queue_rq() invocations that
>>>>   started before blk_mq_quiesce_queue() was called.
>>>> * The blk_mq_hctx_stopped() calls that control whether or not
>>>>   .queue_rq() will be called are called with the (S)RCU read lock
>>>>   held. This is necessary to avoid race conditions against
>>>>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>>>>   sequence from another thread.
>>>>
>>>> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
>>>> Cc: Christoph Hellwig <hch@lst.de>
>>>> Cc: Ming Lei <tom.leiming@gmail.com>
>>>> Cc: Hannes Reinecke <hare@suse.com>
>>>> Cc: Johannes Thumshirn <jthumshirn@suse.de>
>>>> ---
>>>>  block/Kconfig          |  1 +
>>>>  block/blk-mq.c         | 69
>>>> +++++++++++++++++++++++++++++++++++++++++++++-----
>>>>  include/linux/blk-mq.h |  3 +++
>>>>  include/linux/blkdev.h |  1 +
>>>>  4 files changed, 67 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/block/Kconfig b/block/Kconfig
>>>> index 1d4d624..0562ef9 100644
>>>> --- a/block/Kconfig
>>>> +++ b/block/Kconfig
>>>> @@ -5,6 +5,7 @@ menuconfig BLOCK
>>>>         bool "Enable the block layer" if EXPERT
>>>>         default y
>>>>         select SBITMAP
>>>> +       select SRCU
>>>>         help
>>>>          Provide block layer support for the kernel.
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 0cf21c2..4945437 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -115,6 +115,31 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>>>>
>>>> +/**
>>>> + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have
>>>> finished
>>>> + * @q: request queue.
>>>> + *
>>>> + * Note: this function does not prevent that the struct request end_io()
>>>> + * callback function is invoked. Additionally, it is not prevented that
>>>> + * new queue_rq() calls occur unless the queue has been stopped first.
>>>> + */
>>>> +void blk_mq_quiesce_queue(struct request_queue *q)
>>>> +{
>>>> +       struct blk_mq_hw_ctx *hctx;
>>>> +       unsigned int i;
>>>> +       bool rcu = false;
>>>
>>>
>>> Before synchronizing SRCU/RCU, we have to set a per-hctx flag
>>> or per-queue flag to block incoming .queue_rq() calls; I think I
>>> mentioned that before:
>>>
>>>    https://www.spinics.net/lists/linux-rdma/msg41389.html
>>
>>
>> Hello Ming,
>>
>> Thanks for having included an URL to an archived version of that discussion.
>> What I remember about that discussion is that I proposed to use the existing
>> flag BLK_MQ_S_STOPPED instead of introducing a
>> new QUEUE_FLAG_QUIESCING flag and that you agreed with that proposal. See
>> also https://www.spinics.net/lists/linux-rdma/msg41430.html.
>
> Yes, I am fine with either one, but the flag needs to be set in
> blk_mq_quiesce_queue(), doesn't it?

Hello Ming,

If you have a look at the later patches in this series then you will see 
that the dm core and the NVMe driver have been modified such that
blk_mq_stop_hw_queues(q) is called immediately before 
blk_mq_quiesce_queue(q) is called.
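
In other words, the intended calling pattern is roughly the following
(sketch only; the exact call sites are in the dm and NVMe patches later
in this series):

	blk_mq_stop_hw_queues(q);	/* set BLK_MQ_S_STOPPED on every hctx */
	blk_mq_quiesce_queue(q);	/* wait for ongoing .queue_rq() calls */

	/* ... no request is inside .queue_rq() anymore ... */

	blk_mq_start_stopped_hw_queues(q, true);	/* resume dispatching */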

Bart.
Ming Lei Oct. 27, 2016, 2:48 a.m. UTC | #5
On Thu, Oct 27, 2016 at 10:40 AM, Bart Van Assche <bvanassche@acm.org> wrote:
> On 10/26/16 19:31, Ming Lei wrote:
>>
>> On Thu, Oct 27, 2016 at 10:04 AM, Bart Van Assche <bvanassche@acm.org>
>> wrote:
>>>
>>> On 10/26/16 18:30, Ming Lei wrote:
>>>>
>>>>
>>>> On Thu, Oct 27, 2016 at 6:53 AM, Bart Van Assche
>>>> <bart.vanassche@sandisk.com> wrote:
>>>>>
>>>>>
>>>>> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
>>>>> have finished. This function does *not* wait until all outstanding
>>>>> requests have finished (this means invocation of request.end_io()).
>>>>> The algorithm used by blk_mq_quiesce_queue() is as follows:
>>>>> * Hold either an RCU read lock or an SRCU read lock around
>>>>>   .queue_rq() calls. The former is used if .queue_rq() does not
>>>>>   block and the latter if .queue_rq() may block.
>>>>> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>>>>>   synchronize_rcu() to wait for .queue_rq() invocations that
>>>>>   started before blk_mq_quiesce_queue() was called.
>>>>> * The blk_mq_hctx_stopped() calls that control whether or not
>>>>>   .queue_rq() will be called are called with the (S)RCU read lock
>>>>>   held. This is necessary to avoid race conditions against
>>>>>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>>>>>   sequence from another thread.
>>>>>
>>>>> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
>>>>> Cc: Christoph Hellwig <hch@lst.de>
>>>>> Cc: Ming Lei <tom.leiming@gmail.com>
>>>>> Cc: Hannes Reinecke <hare@suse.com>
>>>>> Cc: Johannes Thumshirn <jthumshirn@suse.de>
>>>>> ---
>>>>>  block/Kconfig          |  1 +
>>>>>  block/blk-mq.c         | 69
>>>>> +++++++++++++++++++++++++++++++++++++++++++++-----
>>>>>  include/linux/blk-mq.h |  3 +++
>>>>>  include/linux/blkdev.h |  1 +
>>>>>  4 files changed, 67 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/block/Kconfig b/block/Kconfig
>>>>> index 1d4d624..0562ef9 100644
>>>>> --- a/block/Kconfig
>>>>> +++ b/block/Kconfig
>>>>> @@ -5,6 +5,7 @@ menuconfig BLOCK
>>>>>         bool "Enable the block layer" if EXPERT
>>>>>         default y
>>>>>         select SBITMAP
>>>>> +       select SRCU
>>>>>         help
>>>>>          Provide block layer support for the kernel.
>>>>>
>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>> index 0cf21c2..4945437 100644
>>>>> --- a/block/blk-mq.c
>>>>> +++ b/block/blk-mq.c
>>>>> @@ -115,6 +115,31 @@ void blk_mq_unfreeze_queue(struct request_queue
>>>>> *q)
>>>>>  }
>>>>>  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>>>>>
>>>>> +/**
>>>>> + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have
>>>>> finished
>>>>> + * @q: request queue.
>>>>> + *
>>>>> + * Note: this function does not prevent that the struct request
>>>>> end_io()
>>>>> + * callback function is invoked. Additionally, it is not prevented
>>>>> that
>>>>> + * new queue_rq() calls occur unless the queue has been stopped first.
>>>>> + */
>>>>> +void blk_mq_quiesce_queue(struct request_queue *q)
>>>>> +{
>>>>> +       struct blk_mq_hw_ctx *hctx;
>>>>> +       unsigned int i;
>>>>> +       bool rcu = false;
>>>>
>>>>
>>>>
>>>> Before synchronizing SRCU/RCU, we have to set a per-hctx flag
>>>> or per-queue flag to block incoming .queue_rq() calls; I think I
>>>> mentioned that before:
>>>>
>>>>    https://www.spinics.net/lists/linux-rdma/msg41389.html
>>>
>>>
>>>
>>> Hello Ming,
>>>
>>> Thanks for having included an URL to an archived version of that
>>> discussion.
>>> What I remember about that discussion is that I proposed to use the
>>> existing
>>> flag BLK_MQ_S_STOPPED instead of introducing a
>>> new QUEUE_FLAG_QUIESCING flag and that you agreed with that proposal. See
>>> also https://www.spinics.net/lists/linux-rdma/msg41430.html.
>>
>>
>> Yes, I am fine with either one, but the flag needs to be set in
>> blk_mq_quiesce_queue(), doesn't it?
>
>
> Hello Ming,
>
> If you have a look at the later patches in this series then you will see
> that the dm core and the NVMe driver have been modified such that
> blk_mq_stop_hw_queues(q) is called immediately before
> blk_mq_quiesce_queue(q) is called.

Since any current and future users of blk_mq_quiesce_queue(q)
have to set the flag via blk_mq_stop_hw_queues(q), why not set
the flag explicitly in blk_mq_quiesce_queue(q)?


thanks,
Ming Lei
Bart Van Assche Oct. 27, 2016, 3:05 a.m. UTC | #6
On 10/26/16 19:48, Ming Lei wrote:
> On Thu, Oct 27, 2016 at 10:40 AM, Bart Van Assche <bvanassche@acm.org> wrote:
>> If you have a look at the later patches in this series then you will see
>> that the dm core and the NVMe driver have been modified such that
>> blk_mq_stop_hw_queues(q) is called immediately before
>> blk_mq_quiesce_queue(q) is called.
>
> Since any current and future users of blk_mq_quiesce_queue(q)
> have to set the flag via blk_mq_stop_hw_queues(q), why not set
> the flag explicitly in blk_mq_quiesce_queue(q)?

Hello Ming,

I'll leave it to Jens to decide whether I should repost the patch series 
with this change integrated or whether to realize this change with a 
follow-up patch.

Bart.

Hannes Reinecke Oct. 27, 2016, 5:52 a.m. UTC | #7
On 10/27/2016 12:53 AM, Bart Van Assche wrote:
> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
> have finished. This function does *not* wait until all outstanding
> requests have finished (this means invocation of request.end_io()).
> The algorithm used by blk_mq_quiesce_queue() is as follows:
> * Hold either an RCU read lock or an SRCU read lock around
>   .queue_rq() calls. The former is used if .queue_rq() does not
>   block and the latter if .queue_rq() may block.
> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>   synchronize_rcu() to wait for .queue_rq() invocations that
>   started before blk_mq_quiesce_queue() was called.
> * The blk_mq_hctx_stopped() calls that control whether or not
>   .queue_rq() will be called are called with the (S)RCU read lock
>   held. This is necessary to avoid race conditions against
>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>   sequence from another thread.
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <tom.leiming@gmail.com>
> Cc: Hannes Reinecke <hare@suse.com>
> Cc: Johannes Thumshirn <jthumshirn@suse.de>
> ---
>  block/Kconfig          |  1 +
>  block/blk-mq.c         | 69 +++++++++++++++++++++++++++++++++++++++++++++-----
>  include/linux/blk-mq.h |  3 +++
>  include/linux/blkdev.h |  1 +
>  4 files changed, 67 insertions(+), 7 deletions(-)
> 
Hmm. Can't say I like having two RCU structures for the same
purpose, but I guess that can't be helped.

Reviewed-by: Hannes Reinecke <hare@suse.com>

Cheers,

Hannes
Christoph Hellwig Oct. 27, 2016, 12:41 p.m. UTC | #8
Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>
Christoph Hellwig Oct. 27, 2016, 12:42 p.m. UTC | #9
On Wed, Oct 26, 2016 at 08:05:14PM -0700, Bart Van Assche wrote:
> I'll leave it to Jens to decide whether I should repost the patch series 
> with this change integrated or whether to realize this change with a 
> follow-up patch.

I would prefer to get the series in now.  I think we should eventually
combine the stop queues call, the quiesce call and even the cancellation
of the requeues into a single function, but it's not as urgent.  I think
this series actually is Linux 4.9 material.
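
Something like this, maybe (helper name made up, just to sketch the idea):

/* hypothetical follow-up helper, not part of this series */
void blk_mq_stop_and_quiesce_queue(struct request_queue *q)
{
	blk_mq_stop_hw_queues(q);
	blk_mq_cancel_requeue_work(q);
	blk_mq_quiesce_queue(q);
}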
Ming Lei Oct. 27, 2016, 1:16 p.m. UTC | #10
On Thu, Oct 27, 2016 at 8:42 PM, Christoph Hellwig <hch@lst.de> wrote:
> On Wed, Oct 26, 2016 at 08:05:14PM -0700, Bart Van Assche wrote:
>> I'll leave it to Jens to decide whether I should repost the patch series
>> with this change integrated or whether to realize this change with a
>> follow-up patch.
>
> I would prefer to get the series in now.  I think we should eventually
> combine the stop queues call, the quiesce call and even the cancellation
> of the requeues into a single function, but it's not as urgent.  I think
> this series actually is Linux 4.9 material.

Yeah, that is fine to clean up in a follow-up patch.

Reviewed-by: Ming Lei <tom.leiming@gmail.com>

Thanks
Ming Lei
Bart Van Assche Oct. 27, 2016, 3:56 p.m. UTC | #11
On 10/26/2016 10:52 PM, Hannes Reinecke wrote:
> Hmm. Can't say I like having two RCU structures for the same
> purpose, but I guess that can't be helped.

Hello Hannes,

There are two RCU structures because of BLK_MQ_F_BLOCKING. Today only
the nbd driver sets that flag. If the nbd driver were modified such
that it no longer sets that flag, then both the BLK_MQ_F_BLOCKING flag
and queue_rq_srcu could be eliminated from the blk-mq core.
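
For reference, a driver opts into the SRCU path simply by setting that
flag in its tag set, e.g. (sketch; my_mq_ops and the numbers are
placeholders):

static struct blk_mq_tag_set my_tag_set = {
	.ops		= &my_mq_ops,		/* .queue_rq() may sleep */
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.numa_node	= NUMA_NO_NODE,
	.flags		= BLK_MQ_F_BLOCKING,	/* use queue_rq_srcu instead of RCU */
};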

Bart.

Patch

diff --git a/block/Kconfig b/block/Kconfig
index 1d4d624..0562ef9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@  menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
        select SBITMAP
+       select SRCU
        help
 	 Provide block layer support for the kernel.
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0cf21c2..4945437 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -115,6 +115,31 @@  void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent that the struct request end_io()
+ * callback function is invoked. Additionally, it is not prevented that
+ * new queue_rq() calls occur unless the queue has been stopped first.
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	bool rcu = false;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (hctx->flags & BLK_MQ_F_BLOCKING)
+			synchronize_srcu(&hctx->queue_rq_srcu);
+		else
+			rcu = true;
+	}
+	if (rcu)
+		synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -778,7 +803,7 @@  static inline unsigned int queued_to_index(unsigned int queued)
  * of IO. In particular, we'd like FIFO behaviour on handling existing
  * items on the hctx->dispatch list. Ignore that for now.
  */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
@@ -790,9 +815,6 @@  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	if (unlikely(blk_mq_hctx_stopped(hctx)))
 		return;
 
-	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-		cpu_online(hctx->next_cpu));
-
 	hctx->run++;
 
 	/*
@@ -883,6 +905,24 @@  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+	int srcu_idx;
+
+	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+		cpu_online(hctx->next_cpu));
+
+	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+		rcu_read_lock();
+		blk_mq_process_rq_list(hctx);
+		rcu_read_unlock();
+	} else {
+		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+		blk_mq_process_rq_list(hctx);
+		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+	}
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -1293,7 +1333,7 @@  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
-	unsigned int request_count = 0;
+	unsigned int request_count = 0, srcu_idx;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
 	blk_qc_t cookie;
@@ -1336,7 +1376,7 @@  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
-		 * We do limited pluging. If the bio can be merged, do that.
+		 * We do limited plugging. If the bio can be merged, do that.
 		 * Otherwise the existing request in the plug list will be
 		 * issued. So the plug list will have one request at most
 		 */
@@ -1356,7 +1396,16 @@  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		if (!old_rq)
 			goto done;
-		blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+
+		if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
+			rcu_read_lock();
+			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+			rcu_read_unlock();
+		} else {
+			srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
+			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+			srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+		}
 		goto done;
 	}
 
@@ -1635,6 +1684,9 @@  static void blk_mq_exit_hctx(struct request_queue *q,
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
 
+	if (hctx->flags & BLK_MQ_F_BLOCKING)
+		cleanup_srcu_struct(&hctx->queue_rq_srcu);
+
 	blk_mq_remove_cpuhp(hctx);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
@@ -1715,6 +1767,9 @@  static int blk_mq_init_hctx(struct request_queue *q,
 				   flush_start_tag + hctx_idx, node))
 		goto free_fq;
 
+	if (hctx->flags & BLK_MQ_F_BLOCKING)
+		init_srcu_struct(&hctx->queue_rq_srcu);
+
 	return 0;
 
  free_fq:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index aa93000..61be48b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -3,6 +3,7 @@ 
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -35,6 +36,8 @@  struct blk_mq_hw_ctx {
 
 	struct blk_mq_tags	*tags;
 
+	struct srcu_struct	queue_rq_srcu;
+
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	7
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c47c358..8259d87 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -824,6 +824,7 @@  extern void __blk_run_queue(struct request_queue *q);
 extern void __blk_run_queue_uncond(struct request_queue *q);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_run_queue_async(struct request_queue *q);
+extern void blk_mq_quiesce_queue(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);