Message ID | 98418e6d-2981-0fb7-dcdd-79b635955fcf@kernel.dk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, 2017-11-08 at 15:48 -0700, Jens Axboe wrote: > + * We got a tag, remove outselves from the wait queue to ensure ^^^^^^^^^ ourselves? Anyway, since this patch fixes a SCSI queue stall I ran into recently (see also https://www.mail-archive.com/linux-scsi@vger.kernel.org/msg68190.html): Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com> Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: > This patch attempts to make the case of hctx re-running on driver tag > failure more robust. Without this patch, it's pretty easy to trigger a > stall condition with shared tags. An example is using null_blk like > this: > > modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > > which sets up 4 devices, sharing the same tag set with a depth of 1. > Running a fio job ala: > > [global] > bs=4k > rw=randread > norandommap > direct=1 > ioengine=libaio > iodepth=4 > > [nullb0] > filename=/dev/nullb0 > [nullb1] > filename=/dev/nullb1 > [nullb2] > filename=/dev/nullb2 > [nullb3] > filename=/dev/nullb3 > > will inevitably end with one or more threads being stuck waiting for a > scheduler tag. That IO is then stuck forever, until someone else > triggers a run of the queue. > > Ensure that we always re-run the hardware queue, if the driver tag we > were waiting for got freed before we added our leftover request entries > back on the dispatch list. 
> > Signed-off-by: Jens Axboe <axboe@kernel.dk> > > --- > > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c > index 7f4a1ba532af..bb7f08415203 100644 > --- a/block/blk-mq-debugfs.c > +++ b/block/blk-mq-debugfs.c > @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { > HCTX_STATE_NAME(STOPPED), > HCTX_STATE_NAME(TAG_ACTIVE), > HCTX_STATE_NAME(SCHED_RESTART), > - HCTX_STATE_NAME(TAG_WAITING), > HCTX_STATE_NAME(START_ON_RUN), > }; > #undef HCTX_STATE_NAME > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 3d759bb8a5bb..8dc5db40df9d 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, > return rq->tag != -1; > } > > -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, > - void *key) > +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, > + int flags, void *key) > { > struct blk_mq_hw_ctx *hctx; > > hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); > > - list_del(&wait->entry); > - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); > + list_del_init(&wait->entry); > blk_mq_run_hw_queue(hctx, true); > return 1; > } > > -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) > +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, > + struct request *rq) > { > + struct blk_mq_hw_ctx *this_hctx = *hctx; > + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; > struct sbq_wait_state *ws; > > + if (!list_empty_careful(&wait->entry)) > + return false; > + > + spin_lock(&this_hctx->lock); > + if (!list_empty(&wait->entry)) { > + spin_unlock(&this_hctx->lock); > + return false; > + } > + > + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); > + add_wait_queue(&ws->wait, wait); > + > /* > - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. 
> - * The thread which wins the race to grab this bit adds the hardware > - * queue to the wait queue. > + * It's possible that a tag was freed in the window between the > + * allocation failure and adding the hardware queue to the wait > + * queue. > */ > - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || > - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) > + if (!blk_mq_get_driver_tag(rq, hctx, false)) { > + spin_unlock(&this_hctx->lock); > return false; > - > - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); > - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); > + } > > /* > - * As soon as this returns, it's no longer safe to fiddle with > - * hctx->dispatch_wait, since a completion can wake up the wait queue > - * and unlock the bit. > + * We got a tag, remove outselves from the wait queue to ensure > + * someone else gets the wakeup. > */ > - add_wait_queue(&ws->wait, &hctx->dispatch_wait); > + spin_lock_irq(&ws->wait.lock); > + list_del_init(&wait->entry); > + spin_unlock_irq(&ws->wait.lock); > + spin_unlock(&this_hctx->lock); > return true; > } > > bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > - bool got_budget) > + bool got_budget) > { > struct blk_mq_hw_ctx *hctx; > struct request *rq, *nxt; > + bool no_tag = false; > int errors, queued; > > if (list_empty(list)) > @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > /* > * The initial allocation attempt failed, so we need to > - * rerun the hardware queue when a tag is freed. > + * rerun the hardware queue when a tag is freed. The > + * waitqueue takes care of that. If the queue is run > + * before we add this entry back on the dispatch list, > + * we'll re-run it below. 
> */ > - if (!blk_mq_dispatch_wait_add(hctx)) { > - if (got_budget) > - blk_mq_put_dispatch_budget(hctx); > - break; > - } > - > - /* > - * It's possible that a tag was freed in the window > - * between the allocation failure and adding the > - * hardware queue to the wait queue. > - */ > - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { > if (got_budget) > blk_mq_put_dispatch_budget(hctx); > + no_tag = true; > break; > } > } > @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > * it is no longer set that means that it was cleared by another > * thread and hence that a queue rerun is needed. > * > - * If TAG_WAITING is set that means that an I/O scheduler has > - * been configured and another thread is waiting for a driver > - * tag. To guarantee fairness, do not rerun this hardware queue > - * but let the other thread grab the driver tag. > + * If 'no_tag' is set, that means that we failed getting > + * a driver tag with an I/O scheduler attached. If our dispatch > + * waitqueue is no longer active, ensure that we run the queue > + * AFTER adding our entries back to the list. > * > * If no I/O scheduler has been configured it is possible that > * the hardware queue got stopped and restarted before requests > @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > * and dm-rq. > */ > if (!blk_mq_sched_needs_restart(hctx) && > - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) > + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) > blk_mq_run_hw_queue(hctx, true); If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry), the queue may not be run any more. May that be an issue? 
> } > > @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, > > hctx->nr_ctx = 0; > > + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); > + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); > + > if (set->ops->init_hctx && > set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) > goto free_bitmap; > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index 674641527da7..4ae987c2352c 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { > struct blk_mq_ctx **ctxs; > unsigned int nr_ctx; > > - wait_queue_entry_t dispatch_wait; > + wait_queue_entry_t dispatch_wait; > atomic_t wait_index; > > struct blk_mq_tags *tags; > @@ -181,8 +181,7 @@ enum { > BLK_MQ_S_STOPPED = 0, > BLK_MQ_S_TAG_ACTIVE = 1, > BLK_MQ_S_SCHED_RESTART = 2, > - BLK_MQ_S_TAG_WAITING = 3, > - BLK_MQ_S_START_ON_RUN = 4, > + BLK_MQ_S_START_ON_RUN = 3, > > BLK_MQ_MAX_DEPTH = 10240, Looks the approach is smart, and effective, since requests are often completed at batch. No regression on scsi test too. Reviewed-by: Ming Lei <ming.lei@redhat.com>
On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote: > On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: > > This patch attempts to make the case of hctx re-running on driver tag > > failure more robust. Without this patch, it's pretty easy to trigger a > > stall condition with shared tags. An example is using null_blk like > > this: > > > > modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > > > > which sets up 4 devices, sharing the same tag set with a depth of 1. > > Running a fio job ala: > > > > [global] > > bs=4k > > rw=randread > > norandommap > > direct=1 > > ioengine=libaio > > iodepth=4 > > > > [nullb0] > > filename=/dev/nullb0 > > [nullb1] > > filename=/dev/nullb1 > > [nullb2] > > filename=/dev/nullb2 > > [nullb3] > > filename=/dev/nullb3 > > > > will inevitably end with one or more threads being stuck waiting for a > > scheduler tag. That IO is then stuck forever, until someone else > > triggers a run of the queue. > > > > Ensure that we always re-run the hardware queue, if the driver tag we > > were waiting for got freed before we added our leftover request entries > > back on the dispatch list. 
> > > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > > > > --- > > > > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c > > index 7f4a1ba532af..bb7f08415203 100644 > > --- a/block/blk-mq-debugfs.c > > +++ b/block/blk-mq-debugfs.c > > @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { > > HCTX_STATE_NAME(STOPPED), > > HCTX_STATE_NAME(TAG_ACTIVE), > > HCTX_STATE_NAME(SCHED_RESTART), > > - HCTX_STATE_NAME(TAG_WAITING), > > HCTX_STATE_NAME(START_ON_RUN), > > }; > > #undef HCTX_STATE_NAME > > diff --git a/block/blk-mq.c b/block/blk-mq.c > > index 3d759bb8a5bb..8dc5db40df9d 100644 > > --- a/block/blk-mq.c > > +++ b/block/blk-mq.c > > @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, > > return rq->tag != -1; > > } > > > > -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, > > - void *key) > > +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, > > + int flags, void *key) > > { > > struct blk_mq_hw_ctx *hctx; > > > > hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); > > > > - list_del(&wait->entry); > > - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); > > + list_del_init(&wait->entry); > > blk_mq_run_hw_queue(hctx, true); > > return 1; > > } > > > > -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) > > +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, > > + struct request *rq) > > { > > + struct blk_mq_hw_ctx *this_hctx = *hctx; > > + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; > > struct sbq_wait_state *ws; > > > > + if (!list_empty_careful(&wait->entry)) > > + return false; > > + > > + spin_lock(&this_hctx->lock); > > + if (!list_empty(&wait->entry)) { > > + spin_unlock(&this_hctx->lock); > > + return false; > > + } > > + > > + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); > > + add_wait_queue(&ws->wait, wait); > > + > > /* > > - * The TAG_WAITING bit serves as a 
lock protecting hctx->dispatch_wait. > > - * The thread which wins the race to grab this bit adds the hardware > > - * queue to the wait queue. > > + * It's possible that a tag was freed in the window between the > > + * allocation failure and adding the hardware queue to the wait > > + * queue. > > */ > > - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || > > - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) > > + if (!blk_mq_get_driver_tag(rq, hctx, false)) { > > + spin_unlock(&this_hctx->lock); > > return false; > > - > > - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); > > - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); > > + } > > > > /* > > - * As soon as this returns, it's no longer safe to fiddle with > > - * hctx->dispatch_wait, since a completion can wake up the wait queue > > - * and unlock the bit. > > + * We got a tag, remove outselves from the wait queue to ensure > > + * someone else gets the wakeup. > > */ > > - add_wait_queue(&ws->wait, &hctx->dispatch_wait); > > + spin_lock_irq(&ws->wait.lock); > > + list_del_init(&wait->entry); > > + spin_unlock_irq(&ws->wait.lock); > > + spin_unlock(&this_hctx->lock); > > return true; > > } > > > > bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > - bool got_budget) > > + bool got_budget) > > { > > struct blk_mq_hw_ctx *hctx; > > struct request *rq, *nxt; > > + bool no_tag = false; > > int errors, queued; > > > > if (list_empty(list)) > > @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > > /* > > * The initial allocation attempt failed, so we need to > > - * rerun the hardware queue when a tag is freed. > > + * rerun the hardware queue when a tag is freed. The > > + * waitqueue takes care of that. If the queue is run > > + * before we add this entry back on the dispatch list, > > + * we'll re-run it below. 
> > */ > > - if (!blk_mq_dispatch_wait_add(hctx)) { > > - if (got_budget) > > - blk_mq_put_dispatch_budget(hctx); > > - break; > > - } > > - > > - /* > > - * It's possible that a tag was freed in the window > > - * between the allocation failure and adding the > > - * hardware queue to the wait queue. > > - */ > > - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > > + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { > > if (got_budget) > > blk_mq_put_dispatch_budget(hctx); > > + no_tag = true; > > break; > > } > > } > > @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > * it is no longer set that means that it was cleared by another > > * thread and hence that a queue rerun is needed. > > * > > - * If TAG_WAITING is set that means that an I/O scheduler has > > - * been configured and another thread is waiting for a driver > > - * tag. To guarantee fairness, do not rerun this hardware queue > > - * but let the other thread grab the driver tag. > > + * If 'no_tag' is set, that means that we failed getting > > + * a driver tag with an I/O scheduler attached. If our dispatch > > + * waitqueue is no longer active, ensure that we run the queue > > + * AFTER adding our entries back to the list. > > * > > * If no I/O scheduler has been configured it is possible that > > * the hardware queue got stopped and restarted before requests > > @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > * and dm-rq. > > */ > > if (!blk_mq_sched_needs_restart(hctx) && > > - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) > > + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) > > blk_mq_run_hw_queue(hctx, true); > > If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry), > the queue may not be run any more. May that be an issue? Looks that can be an issue. 
If I revert "Revert "blk-mq: don't handle TAG_SHARED in restart"", and apply 'blk-mq: put driver tag if dispatch budget can't be got' against for-4.15/block, I still can trigger IO hang in one or two minutes easily: 1) script #!/bin/sh modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 RUNTIME=10 while true; do fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 --name=nullb0 --filename=/dev/nullb0 --name=nullb1 --filename=/dev/nullb1 --name=nullb2 --filename=/dev/nullb2 --name=nullb3 --filename=/dev/nullb3 done 2) debugfs log(similar with your previous log) [ming@VM]$sudo ./debug/dump-blk-info /dev/nullb0 =============nullb0/hctx0================== active 0 busy /sys/kernel/debug/block/nullb0//hctx0/cpu0 completed 252195 0 dispatched 252197 0 merged 0 rq_list /sys/kernel/debug/block/nullb0//hctx0/cpu1 completed 135710 0 dispatched 135710 0 merged 0 rq_list /sys/kernel/debug/block/nullb0//hctx0/cpu2 completed 277611 0 dispatched 277611 0 merged 0 rq_list /sys/kernel/debug/block/nullb0//hctx0/cpu3 completed 145017 0 dispatched 145017 0 merged 0 rq_list ctx_map 00000000: 00 dispatch ffff8802605c8000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0} ffff8802605c8240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1} dispatched 0 733616 1 807709 2 1412 4 0 8 0 16 0 32+ 0 flags alloc_policy=FIFO SHOULD_MERGE|TAG_SHARED io_poll considered=0 invoked=0 success=0 queued 810535 run 1478298 sched_tags nr_tags=2 nr_reserved_tags=0 active_queues=0 bitmap_tags: depth=2 busy=2 bits_per_word=64 map_nr=1 alloc_hint={0, 0, 0, 0, 1, 1, 0, 0} wake_batch=1 wake_index=1 ws={ {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, 
{.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=active}, } round_robin=0 sched_tags_bitmap 00000000: 03 state SCHED_RESTART tags nr_tags=1 nr_reserved_tags=0 active_queues=0 bitmap_tags: depth=1 busy=0 bits_per_word=64 map_nr=1 alloc_hint={0, 0, 0, 0, 0, 0, 0, 0} wake_batch=1 wake_index=1 ws={ {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, {.wait_cnt=1, .wait=inactive}, } round_robin=0 tags_bitmap 00000000: 00
On 11/09/2017 03:00 AM, Ming Lei wrote: > On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote: >> On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: >>> This patch attempts to make the case of hctx re-running on driver tag >>> failure more robust. Without this patch, it's pretty easy to trigger a >>> stall condition with shared tags. An example is using null_blk like >>> this: >>> >>> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 >>> >>> which sets up 4 devices, sharing the same tag set with a depth of 1. >>> Running a fio job ala: >>> >>> [global] >>> bs=4k >>> rw=randread >>> norandommap >>> direct=1 >>> ioengine=libaio >>> iodepth=4 >>> >>> [nullb0] >>> filename=/dev/nullb0 >>> [nullb1] >>> filename=/dev/nullb1 >>> [nullb2] >>> filename=/dev/nullb2 >>> [nullb3] >>> filename=/dev/nullb3 >>> >>> will inevitably end with one or more threads being stuck waiting for a >>> scheduler tag. That IO is then stuck forever, until someone else >>> triggers a run of the queue. >>> >>> Ensure that we always re-run the hardware queue, if the driver tag we >>> were waiting for got freed before we added our leftover request entries >>> back on the dispatch list. 
>>> >>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>> >>> --- >>> >>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c >>> index 7f4a1ba532af..bb7f08415203 100644 >>> --- a/block/blk-mq-debugfs.c >>> +++ b/block/blk-mq-debugfs.c >>> @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { >>> HCTX_STATE_NAME(STOPPED), >>> HCTX_STATE_NAME(TAG_ACTIVE), >>> HCTX_STATE_NAME(SCHED_RESTART), >>> - HCTX_STATE_NAME(TAG_WAITING), >>> HCTX_STATE_NAME(START_ON_RUN), >>> }; >>> #undef HCTX_STATE_NAME >>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>> index 3d759bb8a5bb..8dc5db40df9d 100644 >>> --- a/block/blk-mq.c >>> +++ b/block/blk-mq.c >>> @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, >>> return rq->tag != -1; >>> } >>> >>> -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, >>> - void *key) >>> +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, >>> + int flags, void *key) >>> { >>> struct blk_mq_hw_ctx *hctx; >>> >>> hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); >>> >>> - list_del(&wait->entry); >>> - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); >>> + list_del_init(&wait->entry); >>> blk_mq_run_hw_queue(hctx, true); >>> return 1; >>> } >>> >>> -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) >>> +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, >>> + struct request *rq) >>> { >>> + struct blk_mq_hw_ctx *this_hctx = *hctx; >>> + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; >>> struct sbq_wait_state *ws; >>> >>> + if (!list_empty_careful(&wait->entry)) >>> + return false; >>> + >>> + spin_lock(&this_hctx->lock); >>> + if (!list_empty(&wait->entry)) { >>> + spin_unlock(&this_hctx->lock); >>> + return false; >>> + } >>> + >>> + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); >>> + add_wait_queue(&ws->wait, wait); >>> + >>> /* >>> - * The TAG_WAITING bit serves as a 
lock protecting hctx->dispatch_wait. >>> - * The thread which wins the race to grab this bit adds the hardware >>> - * queue to the wait queue. >>> + * It's possible that a tag was freed in the window between the >>> + * allocation failure and adding the hardware queue to the wait >>> + * queue. >>> */ >>> - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || >>> - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) >>> + if (!blk_mq_get_driver_tag(rq, hctx, false)) { >>> + spin_unlock(&this_hctx->lock); >>> return false; >>> - >>> - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); >>> - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); >>> + } >>> >>> /* >>> - * As soon as this returns, it's no longer safe to fiddle with >>> - * hctx->dispatch_wait, since a completion can wake up the wait queue >>> - * and unlock the bit. >>> + * We got a tag, remove outselves from the wait queue to ensure >>> + * someone else gets the wakeup. >>> */ >>> - add_wait_queue(&ws->wait, &hctx->dispatch_wait); >>> + spin_lock_irq(&ws->wait.lock); >>> + list_del_init(&wait->entry); >>> + spin_unlock_irq(&ws->wait.lock); >>> + spin_unlock(&this_hctx->lock); >>> return true; >>> } >>> >>> bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>> - bool got_budget) >>> + bool got_budget) >>> { >>> struct blk_mq_hw_ctx *hctx; >>> struct request *rq, *nxt; >>> + bool no_tag = false; >>> int errors, queued; >>> >>> if (list_empty(list)) >>> @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>> if (!blk_mq_get_driver_tag(rq, &hctx, false)) { >>> /* >>> * The initial allocation attempt failed, so we need to >>> - * rerun the hardware queue when a tag is freed. >>> + * rerun the hardware queue when a tag is freed. The >>> + * waitqueue takes care of that. If the queue is run >>> + * before we add this entry back on the dispatch list, >>> + * we'll re-run it below. 
>>> */ >>> - if (!blk_mq_dispatch_wait_add(hctx)) { >>> - if (got_budget) >>> - blk_mq_put_dispatch_budget(hctx); >>> - break; >>> - } >>> - >>> - /* >>> - * It's possible that a tag was freed in the window >>> - * between the allocation failure and adding the >>> - * hardware queue to the wait queue. >>> - */ >>> - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { >>> + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { >>> if (got_budget) >>> blk_mq_put_dispatch_budget(hctx); >>> + no_tag = true; >>> break; >>> } >>> } >>> @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>> * it is no longer set that means that it was cleared by another >>> * thread and hence that a queue rerun is needed. >>> * >>> - * If TAG_WAITING is set that means that an I/O scheduler has >>> - * been configured and another thread is waiting for a driver >>> - * tag. To guarantee fairness, do not rerun this hardware queue >>> - * but let the other thread grab the driver tag. >>> + * If 'no_tag' is set, that means that we failed getting >>> + * a driver tag with an I/O scheduler attached. If our dispatch >>> + * waitqueue is no longer active, ensure that we run the queue >>> + * AFTER adding our entries back to the list. >>> * >>> * If no I/O scheduler has been configured it is possible that >>> * the hardware queue got stopped and restarted before requests >>> @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>> * and dm-rq. >>> */ >>> if (!blk_mq_sched_needs_restart(hctx) && >>> - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) >>> + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) >>> blk_mq_run_hw_queue(hctx, true); >> >> If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry), >> the queue may not be run any more. May that be an issue? > > Looks that can be an issue. 
> > If I revert "Revert "blk-mq: don't handle TAG_SHARED in restart"", and > apply 'blk-mq: put driver tag if dispatch budget can't be got' against > for-4.15/block, I still can trigger IO hang in one or two minutes easily: > > 1) script > #!/bin/sh > modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > RUNTIME=10 > > while true; do > fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 --name=nullb0 --filename=/dev/nullb0 --name=nullb1 --filename=/dev/nullb1 --name=nullb2 --filename=/dev/nullb2 --name=nullb3 --filename=/dev/nullb3 Did you apply my patch too? I had my test case running overnight, and it completed just fine. That's current for-4.15/block + the patch I posted. Previously that would hang in minutes as well. I'm running your test case now here, but it looks identical to mine. It's been running for 5 min without issue so far, I'll leave it running for an hour or so.
On 11/09/2017 08:30 AM, Jens Axboe wrote: > On 11/09/2017 03:00 AM, Ming Lei wrote: >> On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote: >>> On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: >>>> This patch attempts to make the case of hctx re-running on driver tag >>>> failure more robust. Without this patch, it's pretty easy to trigger a >>>> stall condition with shared tags. An example is using null_blk like >>>> this: >>>> >>>> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 >>>> >>>> which sets up 4 devices, sharing the same tag set with a depth of 1. >>>> Running a fio job ala: >>>> >>>> [global] >>>> bs=4k >>>> rw=randread >>>> norandommap >>>> direct=1 >>>> ioengine=libaio >>>> iodepth=4 >>>> >>>> [nullb0] >>>> filename=/dev/nullb0 >>>> [nullb1] >>>> filename=/dev/nullb1 >>>> [nullb2] >>>> filename=/dev/nullb2 >>>> [nullb3] >>>> filename=/dev/nullb3 >>>> >>>> will inevitably end with one or more threads being stuck waiting for a >>>> scheduler tag. That IO is then stuck forever, until someone else >>>> triggers a run of the queue. >>>> >>>> Ensure that we always re-run the hardware queue, if the driver tag we >>>> were waiting for got freed before we added our leftover request entries >>>> back on the dispatch list. 
>>>> >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>> >>>> --- >>>> >>>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c >>>> index 7f4a1ba532af..bb7f08415203 100644 >>>> --- a/block/blk-mq-debugfs.c >>>> +++ b/block/blk-mq-debugfs.c >>>> @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { >>>> HCTX_STATE_NAME(STOPPED), >>>> HCTX_STATE_NAME(TAG_ACTIVE), >>>> HCTX_STATE_NAME(SCHED_RESTART), >>>> - HCTX_STATE_NAME(TAG_WAITING), >>>> HCTX_STATE_NAME(START_ON_RUN), >>>> }; >>>> #undef HCTX_STATE_NAME >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>>> index 3d759bb8a5bb..8dc5db40df9d 100644 >>>> --- a/block/blk-mq.c >>>> +++ b/block/blk-mq.c >>>> @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, >>>> return rq->tag != -1; >>>> } >>>> >>>> -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, >>>> - void *key) >>>> +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, >>>> + int flags, void *key) >>>> { >>>> struct blk_mq_hw_ctx *hctx; >>>> >>>> hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); >>>> >>>> - list_del(&wait->entry); >>>> - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); >>>> + list_del_init(&wait->entry); >>>> blk_mq_run_hw_queue(hctx, true); >>>> return 1; >>>> } >>>> >>>> -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) >>>> +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, >>>> + struct request *rq) >>>> { >>>> + struct blk_mq_hw_ctx *this_hctx = *hctx; >>>> + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; >>>> struct sbq_wait_state *ws; >>>> >>>> + if (!list_empty_careful(&wait->entry)) >>>> + return false; >>>> + >>>> + spin_lock(&this_hctx->lock); >>>> + if (!list_empty(&wait->entry)) { >>>> + spin_unlock(&this_hctx->lock); >>>> + return false; >>>> + } >>>> + >>>> + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); >>>> + add_wait_queue(&ws->wait, 
wait); >>>> + >>>> /* >>>> - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. >>>> - * The thread which wins the race to grab this bit adds the hardware >>>> - * queue to the wait queue. >>>> + * It's possible that a tag was freed in the window between the >>>> + * allocation failure and adding the hardware queue to the wait >>>> + * queue. >>>> */ >>>> - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || >>>> - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) >>>> + if (!blk_mq_get_driver_tag(rq, hctx, false)) { >>>> + spin_unlock(&this_hctx->lock); >>>> return false; >>>> - >>>> - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); >>>> - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); >>>> + } >>>> >>>> /* >>>> - * As soon as this returns, it's no longer safe to fiddle with >>>> - * hctx->dispatch_wait, since a completion can wake up the wait queue >>>> - * and unlock the bit. >>>> + * We got a tag, remove outselves from the wait queue to ensure >>>> + * someone else gets the wakeup. >>>> */ >>>> - add_wait_queue(&ws->wait, &hctx->dispatch_wait); >>>> + spin_lock_irq(&ws->wait.lock); >>>> + list_del_init(&wait->entry); >>>> + spin_unlock_irq(&ws->wait.lock); >>>> + spin_unlock(&this_hctx->lock); >>>> return true; >>>> } >>>> >>>> bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>>> - bool got_budget) >>>> + bool got_budget) >>>> { >>>> struct blk_mq_hw_ctx *hctx; >>>> struct request *rq, *nxt; >>>> + bool no_tag = false; >>>> int errors, queued; >>>> >>>> if (list_empty(list)) >>>> @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>>> if (!blk_mq_get_driver_tag(rq, &hctx, false)) { >>>> /* >>>> * The initial allocation attempt failed, so we need to >>>> - * rerun the hardware queue when a tag is freed. >>>> + * rerun the hardware queue when a tag is freed. The >>>> + * waitqueue takes care of that. 
If the queue is run >>>> + * before we add this entry back on the dispatch list, >>>> + * we'll re-run it below. >>>> */ >>>> - if (!blk_mq_dispatch_wait_add(hctx)) { >>>> - if (got_budget) >>>> - blk_mq_put_dispatch_budget(hctx); >>>> - break; >>>> - } >>>> - >>>> - /* >>>> - * It's possible that a tag was freed in the window >>>> - * between the allocation failure and adding the >>>> - * hardware queue to the wait queue. >>>> - */ >>>> - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { >>>> + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { >>>> if (got_budget) >>>> blk_mq_put_dispatch_budget(hctx); >>>> + no_tag = true; >>>> break; >>>> } >>>> } >>>> @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>>> * it is no longer set that means that it was cleared by another >>>> * thread and hence that a queue rerun is needed. >>>> * >>>> - * If TAG_WAITING is set that means that an I/O scheduler has >>>> - * been configured and another thread is waiting for a driver >>>> - * tag. To guarantee fairness, do not rerun this hardware queue >>>> - * but let the other thread grab the driver tag. >>>> + * If 'no_tag' is set, that means that we failed getting >>>> + * a driver tag with an I/O scheduler attached. If our dispatch >>>> + * waitqueue is no longer active, ensure that we run the queue >>>> + * AFTER adding our entries back to the list. >>>> * >>>> * If no I/O scheduler has been configured it is possible that >>>> * the hardware queue got stopped and restarted before requests >>>> @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, >>>> * and dm-rq. 
>>>> */ >>>> if (!blk_mq_sched_needs_restart(hctx) && >>>> - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) >>>> + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) >>>> blk_mq_run_hw_queue(hctx, true); >>> >>> If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry), >>> the queue may not be run any more. May that be an issue? >> >> Looks that can be an issue. >> >> If I revert "Revert "blk-mq: don't handle TAG_SHARED in restart"", and >> apply 'blk-mq: put driver tag if dispatch budget can't be got' against >> for-4.15/block, I still can trigger IO hang in one or two minutes easily: >> >> 1) script >> #!/bin/sh >> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 >> RUNTIME=10 >> >> while true; do >> fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 --name=nullb0 --filename=/dev/nullb0 --name=nullb1 --filename=/dev/nullb1 --name=nullb2 --filename=/dev/nullb2 --name=nullb3 --filename=/dev/nullb3 > > Did you apply my patch too? I had my test case running overnight, and it > completed just fine. That's current for-4.15/block + the patch I posted. > Previously that would hang in minutes as well. > > I'm running your test case now here, but it looks identical to mine. > It's been running for 5 min without issue so far, I'll leave it running > for an hour or so. It's been running happily for > 1 hour now, no issues observed.
On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: > This patch attempts to make the case of hctx re-running on driver tag > failure more robust. Without this patch, it's pretty easy to trigger a > stall condition with shared tags. An example is using null_blk like > this: > > modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > > which sets up 4 devices, sharing the same tag set with a depth of 1. > Running a fio job ala: > > [global] > bs=4k > rw=randread > norandommap > direct=1 > ioengine=libaio > iodepth=4 > > [nullb0] > filename=/dev/nullb0 > [nullb1] > filename=/dev/nullb1 > [nullb2] > filename=/dev/nullb2 > [nullb3] > filename=/dev/nullb3 > > will inevitably end with one or more threads being stuck waiting for a > scheduler tag. That IO is then stuck forever, until someone else > triggers a run of the queue. > > Ensure that we always re-run the hardware queue, if the driver tag we > were waiting for got freed before we added our leftover request entries > back on the dispatch list. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> Reviewed-by: Omar Sandoval <osandov@fb.com>
On Thu, 2017-11-09 at 09:32 -0700, Jens Axboe wrote:
> It's been running happily for > 1 hour now, no issues observed.
The same null_blk test runs fine on my setup. But what's weird is that if
I run the srp-test software I again see a lockup in sd_probe_async().
That happens not only with today's for-next branch but also with the same
kernel tree with which my tests passed yesterday, which is weird. I will
analyze this further.
Bart.
On Thu, Nov 09, 2017 at 09:32:58AM -0700, Jens Axboe wrote: > On 11/09/2017 08:30 AM, Jens Axboe wrote: > > On 11/09/2017 03:00 AM, Ming Lei wrote: > >> On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote: > >>> On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: > >>>> This patch attempts to make the case of hctx re-running on driver tag > >>>> failure more robust. Without this patch, it's pretty easy to trigger a > >>>> stall condition with shared tags. An example is using null_blk like > >>>> this: > >>>> > >>>> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > >>>> > >>>> which sets up 4 devices, sharing the same tag set with a depth of 1. > >>>> Running a fio job ala: > >>>> > >>>> [global] > >>>> bs=4k > >>>> rw=randread > >>>> norandommap > >>>> direct=1 > >>>> ioengine=libaio > >>>> iodepth=4 > >>>> > >>>> [nullb0] > >>>> filename=/dev/nullb0 > >>>> [nullb1] > >>>> filename=/dev/nullb1 > >>>> [nullb2] > >>>> filename=/dev/nullb2 > >>>> [nullb3] > >>>> filename=/dev/nullb3 > >>>> > >>>> will inevitably end with one or more threads being stuck waiting for a > >>>> scheduler tag. That IO is then stuck forever, until someone else > >>>> triggers a run of the queue. > >>>> > >>>> Ensure that we always re-run the hardware queue, if the driver tag we > >>>> were waiting for got freed before we added our leftover request entries > >>>> back on the dispatch list. 
> >>>> > >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> > >>>> > >>>> --- > >>>> > >>>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c > >>>> index 7f4a1ba532af..bb7f08415203 100644 > >>>> --- a/block/blk-mq-debugfs.c > >>>> +++ b/block/blk-mq-debugfs.c > >>>> @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { > >>>> HCTX_STATE_NAME(STOPPED), > >>>> HCTX_STATE_NAME(TAG_ACTIVE), > >>>> HCTX_STATE_NAME(SCHED_RESTART), > >>>> - HCTX_STATE_NAME(TAG_WAITING), > >>>> HCTX_STATE_NAME(START_ON_RUN), > >>>> }; > >>>> #undef HCTX_STATE_NAME > >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c > >>>> index 3d759bb8a5bb..8dc5db40df9d 100644 > >>>> --- a/block/blk-mq.c > >>>> +++ b/block/blk-mq.c > >>>> @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, > >>>> return rq->tag != -1; > >>>> } > >>>> > >>>> -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, > >>>> - void *key) > >>>> +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, > >>>> + int flags, void *key) > >>>> { > >>>> struct blk_mq_hw_ctx *hctx; > >>>> > >>>> hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); > >>>> > >>>> - list_del(&wait->entry); > >>>> - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); > >>>> + list_del_init(&wait->entry); > >>>> blk_mq_run_hw_queue(hctx, true); > >>>> return 1; > >>>> } > >>>> > >>>> -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) > >>>> +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, > >>>> + struct request *rq) > >>>> { > >>>> + struct blk_mq_hw_ctx *this_hctx = *hctx; > >>>> + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; > >>>> struct sbq_wait_state *ws; > >>>> > >>>> + if (!list_empty_careful(&wait->entry)) > >>>> + return false; > >>>> + > >>>> + spin_lock(&this_hctx->lock); > >>>> + if (!list_empty(&wait->entry)) { > >>>> + spin_unlock(&this_hctx->lock); > >>>> + return false; > 
>>>> + } > >>>> + > >>>> + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); > >>>> + add_wait_queue(&ws->wait, wait); > >>>> + > >>>> /* > >>>> - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. > >>>> - * The thread which wins the race to grab this bit adds the hardware > >>>> - * queue to the wait queue. > >>>> + * It's possible that a tag was freed in the window between the > >>>> + * allocation failure and adding the hardware queue to the wait > >>>> + * queue. > >>>> */ > >>>> - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || > >>>> - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) > >>>> + if (!blk_mq_get_driver_tag(rq, hctx, false)) { > >>>> + spin_unlock(&this_hctx->lock); > >>>> return false; > >>>> - > >>>> - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); > >>>> - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); > >>>> + } > >>>> > >>>> /* > >>>> - * As soon as this returns, it's no longer safe to fiddle with > >>>> - * hctx->dispatch_wait, since a completion can wake up the wait queue > >>>> - * and unlock the bit. > >>>> + * We got a tag, remove outselves from the wait queue to ensure > >>>> + * someone else gets the wakeup. 
> >>>> */ > >>>> - add_wait_queue(&ws->wait, &hctx->dispatch_wait); > >>>> + spin_lock_irq(&ws->wait.lock); > >>>> + list_del_init(&wait->entry); > >>>> + spin_unlock_irq(&ws->wait.lock); > >>>> + spin_unlock(&this_hctx->lock); > >>>> return true; > >>>> } > >>>> > >>>> bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > >>>> - bool got_budget) > >>>> + bool got_budget) > >>>> { > >>>> struct blk_mq_hw_ctx *hctx; > >>>> struct request *rq, *nxt; > >>>> + bool no_tag = false; > >>>> int errors, queued; > >>>> > >>>> if (list_empty(list)) > >>>> @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > >>>> if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > >>>> /* > >>>> * The initial allocation attempt failed, so we need to > >>>> - * rerun the hardware queue when a tag is freed. > >>>> + * rerun the hardware queue when a tag is freed. The > >>>> + * waitqueue takes care of that. If the queue is run > >>>> + * before we add this entry back on the dispatch list, > >>>> + * we'll re-run it below. > >>>> */ > >>>> - if (!blk_mq_dispatch_wait_add(hctx)) { > >>>> - if (got_budget) > >>>> - blk_mq_put_dispatch_budget(hctx); > >>>> - break; > >>>> - } > >>>> - > >>>> - /* > >>>> - * It's possible that a tag was freed in the window > >>>> - * between the allocation failure and adding the > >>>> - * hardware queue to the wait queue. > >>>> - */ > >>>> - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > >>>> + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { > >>>> if (got_budget) > >>>> blk_mq_put_dispatch_budget(hctx); > >>>> + no_tag = true; > >>>> break; > >>>> } > >>>> } > >>>> @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > >>>> * it is no longer set that means that it was cleared by another > >>>> * thread and hence that a queue rerun is needed. 
> >>>> * > >>>> - * If TAG_WAITING is set that means that an I/O scheduler has > >>>> - * been configured and another thread is waiting for a driver > >>>> - * tag. To guarantee fairness, do not rerun this hardware queue > >>>> - * but let the other thread grab the driver tag. > >>>> + * If 'no_tag' is set, that means that we failed getting > >>>> + * a driver tag with an I/O scheduler attached. If our dispatch > >>>> + * waitqueue is no longer active, ensure that we run the queue > >>>> + * AFTER adding our entries back to the list. > >>>> * > >>>> * If no I/O scheduler has been configured it is possible that > >>>> * the hardware queue got stopped and restarted before requests > >>>> @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > >>>> * and dm-rq. > >>>> */ > >>>> if (!blk_mq_sched_needs_restart(hctx) && > >>>> - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) > >>>> + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) > >>>> blk_mq_run_hw_queue(hctx, true); > >>> > >>> If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry), > >>> the queue may not be run any more. May that be an issue? > >> > >> Looks that can be an issue. > >> > >> If I revert "Revert "blk-mq: don't handle TAG_SHARED in restart"", and > >> apply 'blk-mq: put driver tag if dispatch budget can't be got' against > >> for-4.15/block, I still can trigger IO hang in one or two minutes easily: > >> > >> 1) script > >> #!/bin/sh > >> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > >> RUNTIME=10 > >> > >> while true; do > >> fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 --name=nullb0 --filename=/dev/nullb0 --name=nullb1 --filename=/dev/nullb1 --name=nullb2 --filename=/dev/nullb2 --name=nullb3 --filename=/dev/nullb3 > > > > Did you apply my patch too? 
I had my test case running overnight, and it > > completed just fine. That's current for-4.15/block + the patch I posted. > > Previously that would hang in minutes as well. > > > > I'm running your test case now here, but it looks identical to mine. > > It's been running for 5 min without issue so far, I'll leave it running > > for an hour or so. > > It's been running happily for > 1 hour now, no issues observed. Looks there was something wrong in yesterday's test, today I can't reproduce the issue by running latest for-4.15/block with revert a2820d1544c1, and observed IOPS is improved >20% compared yesterday's result meantime. Now looks the new dispatch wake works well, and it seems fine to cover RESTART for TAG_SHARED completely. In next dev cycle, we may consider to remove that workaround.
On Fri, Nov 10, 2017 at 01:53:18PM +0800, Ming Lei wrote: > On Thu, Nov 09, 2017 at 09:32:58AM -0700, Jens Axboe wrote: > > On 11/09/2017 08:30 AM, Jens Axboe wrote: > > > On 11/09/2017 03:00 AM, Ming Lei wrote: > > >> On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote: > > >>> On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote: > > >>>> This patch attempts to make the case of hctx re-running on driver tag > > >>>> failure more robust. Without this patch, it's pretty easy to trigger a > > >>>> stall condition with shared tags. An example is using null_blk like > > >>>> this: > > >>>> > > >>>> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > > >>>> > > >>>> which sets up 4 devices, sharing the same tag set with a depth of 1. > > >>>> Running a fio job ala: > > >>>> > > >>>> [global] > > >>>> bs=4k > > >>>> rw=randread > > >>>> norandommap > > >>>> direct=1 > > >>>> ioengine=libaio > > >>>> iodepth=4 > > >>>> > > >>>> [nullb0] > > >>>> filename=/dev/nullb0 > > >>>> [nullb1] > > >>>> filename=/dev/nullb1 > > >>>> [nullb2] > > >>>> filename=/dev/nullb2 > > >>>> [nullb3] > > >>>> filename=/dev/nullb3 > > >>>> > > >>>> will inevitably end with one or more threads being stuck waiting for a > > >>>> scheduler tag. That IO is then stuck forever, until someone else > > >>>> triggers a run of the queue. > > >>>> > > >>>> Ensure that we always re-run the hardware queue, if the driver tag we > > >>>> were waiting for got freed before we added our leftover request entries > > >>>> back on the dispatch list. 
> > >>>> > > >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> > > >>>> > > >>>> --- > > >>>> > > >>>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c > > >>>> index 7f4a1ba532af..bb7f08415203 100644 > > >>>> --- a/block/blk-mq-debugfs.c > > >>>> +++ b/block/blk-mq-debugfs.c > > >>>> @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { > > >>>> HCTX_STATE_NAME(STOPPED), > > >>>> HCTX_STATE_NAME(TAG_ACTIVE), > > >>>> HCTX_STATE_NAME(SCHED_RESTART), > > >>>> - HCTX_STATE_NAME(TAG_WAITING), > > >>>> HCTX_STATE_NAME(START_ON_RUN), > > >>>> }; > > >>>> #undef HCTX_STATE_NAME > > >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c > > >>>> index 3d759bb8a5bb..8dc5db40df9d 100644 > > >>>> --- a/block/blk-mq.c > > >>>> +++ b/block/blk-mq.c > > >>>> @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, > > >>>> return rq->tag != -1; > > >>>> } > > >>>> > > >>>> -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, > > >>>> - void *key) > > >>>> +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, > > >>>> + int flags, void *key) > > >>>> { > > >>>> struct blk_mq_hw_ctx *hctx; > > >>>> > > >>>> hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); > > >>>> > > >>>> - list_del(&wait->entry); > > >>>> - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); > > >>>> + list_del_init(&wait->entry); > > >>>> blk_mq_run_hw_queue(hctx, true); > > >>>> return 1; > > >>>> } > > >>>> > > >>>> -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) > > >>>> +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, > > >>>> + struct request *rq) > > >>>> { > > >>>> + struct blk_mq_hw_ctx *this_hctx = *hctx; > > >>>> + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; > > >>>> struct sbq_wait_state *ws; > > >>>> > > >>>> + if (!list_empty_careful(&wait->entry)) > > >>>> + return false; > > >>>> + > > >>>> + spin_lock(&this_hctx->lock); > 
> >>>> + if (!list_empty(&wait->entry)) { > > >>>> + spin_unlock(&this_hctx->lock); > > >>>> + return false; > > >>>> + } > > >>>> + > > >>>> + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); > > >>>> + add_wait_queue(&ws->wait, wait); > > >>>> + > > >>>> /* > > >>>> - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. > > >>>> - * The thread which wins the race to grab this bit adds the hardware > > >>>> - * queue to the wait queue. > > >>>> + * It's possible that a tag was freed in the window between the > > >>>> + * allocation failure and adding the hardware queue to the wait > > >>>> + * queue. > > >>>> */ > > >>>> - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || > > >>>> - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) > > >>>> + if (!blk_mq_get_driver_tag(rq, hctx, false)) { > > >>>> + spin_unlock(&this_hctx->lock); > > >>>> return false; > > >>>> - > > >>>> - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); > > >>>> - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); > > >>>> + } > > >>>> > > >>>> /* > > >>>> - * As soon as this returns, it's no longer safe to fiddle with > > >>>> - * hctx->dispatch_wait, since a completion can wake up the wait queue > > >>>> - * and unlock the bit. > > >>>> + * We got a tag, remove outselves from the wait queue to ensure > > >>>> + * someone else gets the wakeup. 
> > >>>> */ > > >>>> - add_wait_queue(&ws->wait, &hctx->dispatch_wait); > > >>>> + spin_lock_irq(&ws->wait.lock); > > >>>> + list_del_init(&wait->entry); > > >>>> + spin_unlock_irq(&ws->wait.lock); > > >>>> + spin_unlock(&this_hctx->lock); > > >>>> return true; > > >>>> } > > >>>> > > >>>> bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > >>>> - bool got_budget) > > >>>> + bool got_budget) > > >>>> { > > >>>> struct blk_mq_hw_ctx *hctx; > > >>>> struct request *rq, *nxt; > > >>>> + bool no_tag = false; > > >>>> int errors, queued; > > >>>> > > >>>> if (list_empty(list)) > > >>>> @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > >>>> if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > > >>>> /* > > >>>> * The initial allocation attempt failed, so we need to > > >>>> - * rerun the hardware queue when a tag is freed. > > >>>> + * rerun the hardware queue when a tag is freed. The > > >>>> + * waitqueue takes care of that. If the queue is run > > >>>> + * before we add this entry back on the dispatch list, > > >>>> + * we'll re-run it below. > > >>>> */ > > >>>> - if (!blk_mq_dispatch_wait_add(hctx)) { > > >>>> - if (got_budget) > > >>>> - blk_mq_put_dispatch_budget(hctx); > > >>>> - break; > > >>>> - } > > >>>> - > > >>>> - /* > > >>>> - * It's possible that a tag was freed in the window > > >>>> - * between the allocation failure and adding the > > >>>> - * hardware queue to the wait queue. > > >>>> - */ > > >>>> - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { > > >>>> + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { > > >>>> if (got_budget) > > >>>> blk_mq_put_dispatch_budget(hctx); > > >>>> + no_tag = true; > > >>>> break; > > >>>> } > > >>>> } > > >>>> @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > >>>> * it is no longer set that means that it was cleared by another > > >>>> * thread and hence that a queue rerun is needed. 
> > >>>> * > > >>>> - * If TAG_WAITING is set that means that an I/O scheduler has > > >>>> - * been configured and another thread is waiting for a driver > > >>>> - * tag. To guarantee fairness, do not rerun this hardware queue > > >>>> - * but let the other thread grab the driver tag. > > >>>> + * If 'no_tag' is set, that means that we failed getting > > >>>> + * a driver tag with an I/O scheduler attached. If our dispatch > > >>>> + * waitqueue is no longer active, ensure that we run the queue > > >>>> + * AFTER adding our entries back to the list. > > >>>> * > > >>>> * If no I/O scheduler has been configured it is possible that > > >>>> * the hardware queue got stopped and restarted before requests > > >>>> @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > > >>>> * and dm-rq. > > >>>> */ > > >>>> if (!blk_mq_sched_needs_restart(hctx) && > > >>>> - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) > > >>>> + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) > > >>>> blk_mq_run_hw_queue(hctx, true); > > >>> > > >>> If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry), > > >>> the queue may not be run any more. May that be an issue? > > >> > > >> Looks that can be an issue. 
> > >> > > >> If I revert "Revert "blk-mq: don't handle TAG_SHARED in restart"", and > > >> apply 'blk-mq: put driver tag if dispatch budget can't be got' against > > >> for-4.15/block, I still can trigger IO hang in one or two minutes easily: > > >> > > >> 1) script > > >> #!/bin/sh > > >> modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 > > >> RUNTIME=10 > > >> > > >> while true; do > > >> fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 --name=nullb0 --filename=/dev/nullb0 --name=nullb1 --filename=/dev/nullb1 --name=nullb2 --filename=/dev/nullb2 --name=nullb3 --filename=/dev/nullb3 > > > > > > Did you apply my patch too? I had my test case running overnight, and it > > > completed just fine. That's current for-4.15/block + the patch I posted. > > > Previously that would hang in minutes as well. > > > > > > I'm running your test case now here, but it looks identical to mine. > > > It's been running for 5 min without issue so far, I'll leave it running > > > for an hour or so. > > > > It's been running happily for > 1 hour now, no issues observed. > > Looks there was something wrong in yesterday's test, today I can't reproduce > the issue by running latest for-4.15/block with revert a2820d1544c1, and observed > IOPS is improved >20% compared yesterday's result meantime. 
Actually my yesterday's test wasn't wrong, the in-tree patch is different with your post: 1) posted patch if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); 2) in-tree patch - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); I'd suggest you mention that or post a V2 for this case next time, :-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532af..bb7f08415203 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb..8dc5db40df9d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, return rq->tag != -1; } -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, + struct request *rq) { + struct blk_mq_hw_ctx *this_hctx = *hctx; + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; struct sbq_wait_state *ws; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } + + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. 
*/ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_get_driver_tag(rq, hctx, false)) { + spin_unlock(&this_hctx->lock); return false; - - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + } /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. + * We got a tag, remove outselves from the wait queue to ensure + * someone else gets the wakeup. */ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!blk_mq_get_driver_tag(rq, &hctx, false)) { /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); - break; - } - - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. 
- */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); + no_tag = true; break; } } @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * and dm-rq. 
*/ if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240,
This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. That IO is then stuck forever, until someone else triggers a run of the queue. Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. Signed-off-by: Jens Axboe <axboe@kernel.dk> ---