Message ID | 20220113025536.1479653-1-qiulaibin@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [-next,V5] blk-mq: fix tag_get wait task can't be awakened | expand |
On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote: > In case of shared tags, there might be more than one hctx which > allocates from the same tags, and each hctx is limited to allocate at > most: > hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); > > tag idle detection is lazy, and may be delayed for 30sec, so there > could be just one real active hctx(queue) but all others are actually > idle and still accounted as active because of the lazy idle detection. > Then if wake_batch is > hctx_max_depth, driver tag allocation may wait > forever on this real active hctx. > > Fix this by recalculating wake_batch when inc or dec active_queues. FWIW, Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Thanks! > Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") > Suggested-by: Ming Lei <ming.lei@redhat.com> > Suggested-by: John Garry <john.garry@huawei.com> > Signed-off-by: Laibin Qiu <qiulaibin@huawei.com> > --- > block/blk-mq-tag.c | 40 +++++++++++++++++++++++++++++++++------- > include/linux/sbitmap.h | 11 +++++++++++ > lib/sbitmap.c | 25 ++++++++++++++++++++++--- > 3 files changed, 66 insertions(+), 10 deletions(-) > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > index e55a6834c9a6..845f74e8dd7b 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -16,6 +16,21 @@ > #include "blk-mq-sched.h" > #include "blk-mq-tag.h" > > +/* > + * Recalculate wakeup batch when tag is shared by hctx. > + */ > +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, > + unsigned int users) > +{ > + if (!users) > + return; > + > + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, > + users); > + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, > + users); > +} > + > /* > * If a previously inactive queue goes active, bump the active user count. > * We need to do this before try to allocate driver tag, then even if fail > @@ -24,18 +39,26 @@ > */ > bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) > { > + unsigned int users; > + > if (blk_mq_is_shared_tags(hctx->flags)) { > struct request_queue *q = hctx->queue; > > - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && > - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) > - atomic_inc(&hctx->tags->active_queues); > + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || > + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { > + return true; > + } > } else { > - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && > - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > - atomic_inc(&hctx->tags->active_queues); > + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || > + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { > + return true; > + } > } > > + users = atomic_inc_return(&hctx->tags->active_queues); > + > + blk_mq_update_wake_batch(hctx->tags, users); > + > return true; > } > > @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) > void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) > { > struct blk_mq_tags *tags = hctx->tags; > + unsigned int users; > > if (blk_mq_is_shared_tags(hctx->flags)) { > struct request_queue *q = hctx->queue; > @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) > return; > } > > - atomic_dec(&tags->active_queues); > + users = atomic_dec_return(&tags->active_queues); > + > + blk_mq_update_wake_batch(tags, users); > > blk_mq_tag_wakeup_all(tags, false); > } > diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h > index fc0357a6e19b..95df357ec009 100644 > --- a/include/linux/sbitmap.h > +++ b/include/linux/sbitmap.h > @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) > sbitmap_free(&sbq->sb); > } > > +/** > + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch > + * @sbq: Bitmap queue to recalculate wake batch. > + * @users: Number of shares. > + * > + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch > + * by depth. This interface is for HCTX shared tags or queue shared tags. > + */ > +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, > + unsigned int users); > + > /** > * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. > * @sbq: Bitmap queue to resize. > diff --git a/lib/sbitmap.c b/lib/sbitmap.c > index 2709ab825499..6220fa67fb7e 100644 > --- a/lib/sbitmap.c > +++ b/lib/sbitmap.c > @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, > } > EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); > > -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, > - unsigned int depth) > +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, > + unsigned int wake_batch) > { > - unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); > int i; > > if (sbq->wake_batch != wake_batch) { > @@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, > } > } > > +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, > + unsigned int depth) > +{ > + unsigned int wake_batch; > + > + wake_batch = sbq_calc_wake_batch(sbq, depth); > + __sbitmap_queue_update_wake_batch(sbq, wake_batch); > +} > + > +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, > + unsigned int users) > +{ > + unsigned int wake_batch; > + > + wake_batch = clamp_val((sbq->sb.depth + users - 1) / > + users, 4, SBQ_WAKE_BATCH); > + __sbitmap_queue_update_wake_batch(sbq, wake_batch); > +} > +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); > + > void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) > { > sbitmap_queue_update_wake_batch(sbq, depth); > -- > 2.22.0 >
On Thu, 13 Jan 2022 10:55:36 +0800, Laibin Qiu wrote: > In case of shared tags, there might be more than one hctx which > allocates from the same tags, and each hctx is limited to allocate at > most: > hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); > > tag idle detection is lazy, and may be delayed for 30sec, so there > could be just one real active hctx(queue) but all others are actually > idle and still accounted as active because of the lazy idle detection. > Then if wake_batch is > hctx_max_depth, driver tag allocation may wait > forever on this real active hctx. > > [...] Applied, thanks! [1/1] blk-mq: fix tag_get wait task can't be awakened commit: 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Best regards,
Hi, On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote: > In case of shared tags, there might be more than one hctx which > allocates from the same tags, and each hctx is limited to allocate at > most: > hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); > > tag idle detection is lazy, and may be delayed for 30sec, so there > could be just one real active hctx(queue) but all others are actually > idle and still accounted as active because of the lazy idle detection. > Then if wake_batch is > hctx_max_depth, driver tag allocation may wait > forever on this real active hctx. > > Fix this by recalculating wake_batch when inc or dec active_queues. > > Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") > Suggested-by: Ming Lei <ming.lei@redhat.com> > Suggested-by: John Garry <john.garry@huawei.com> > Signed-off-by: Laibin Qiu <qiulaibin@huawei.com> I understand this problem has been reported already, but still: This patch causes a hang in several of my qemu emulations when trying to boot from usb. Reverting it fixes the problem. Bisect log is attached. Boot logs are available at https://kerneltests.org/builders/qemu-arm-aspeed-master/builds/230/steps/qemubuildcommand/logs/stdio but don't really show much: the affected tests simply hang until they are aborted. Guenter --- bisect log: # bad: [0280e3c58f92b2fe0e8fbbdf8d386449168de4a8] Merge tag 'nfs-for-5.17-1' of git://git.linux-nfs.org/projects/anna/linux-nfs # good: [64f29d8856a9e0d1fcdc5344f76e70c364b941cb] Merge tag 'ceph-for-5.17-rc1' of git://github.com/ceph/ceph-client git bisect start 'HEAD' '64f29d8856a9' # bad: [b087788c20aa959f83df989b31fdcc4182b2d067] Merge tag 'ata-5.17-rc1-part2' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/libata git bisect bad b087788c20aa959f83df989b31fdcc4182b2d067 # bad: [0854dc81e108c90cccda6d1fc54bc270f16a3cc9] Merge tag 'docs-5.17-2' of git://git.lwn.net/linux git bisect bad 0854dc81e108c90cccda6d1fc54bc270f16a3cc9 # good: [75242f31db6cabf602a5eb84c13b579099d72a65] Merge tag 'rtc-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux git bisect good 75242f31db6cabf602a5eb84c13b579099d72a65 # good: [f3a78227eef20c0ba13bbf9401f0a340bca3ad16] Merge tag 'io_uring-5.17-2022-01-21' of git://git.kernel.dk/linux-block git bisect good f3a78227eef20c0ba13bbf9401f0a340bca3ad16 # bad: [3c7c25038b6c7d66a6816028219914379be6a5cc] Merge tag 'block-5.17-2022-01-21' of git://git.kernel.dk/linux-block git bisect bad 3c7c25038b6c7d66a6816028219914379be6a5cc # bad: [e6a2e5116e07ce5acc8698785c29e9e47f010fd5] block: Remove unnecessary variable assignment git bisect bad e6a2e5116e07ce5acc8698785c29e9e47f010fd5 # bad: [413ec8057bc3d368574abd05dd27e747063b2f59] loop: remove redundant initialization of pointer node git bisect bad 413ec8057bc3d368574abd05dd27e747063b2f59 # bad: [180dccb0dba4f5e84a4a70c1be1d34cbb6528b32] blk-mq: fix tag_get wait task can't be awakened git bisect bad 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 # first bad commit: [180dccb0dba4f5e84a4a70c1be1d34cbb6528b32] blk-mq: fix tag_get wait task can't be awakened
On 1/26/22 6:32 PM, Guenter Roeck wrote: > Hi, > > On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote: >> In case of shared tags, there might be more than one hctx which >> allocates from the same tags, and each hctx is limited to allocate at >> most: >> hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); >> >> tag idle detection is lazy, and may be delayed for 30sec, so there >> could be just one real active hctx(queue) but all others are actually >> idle and still accounted as active because of the lazy idle detection. >> Then if wake_batch is > hctx_max_depth, driver tag allocation may wait >> forever on this real active hctx. >> >> Fix this by recalculating wake_batch when inc or dec active_queues. >> >> Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") >> Suggested-by: Ming Lei <ming.lei@redhat.com> >> Suggested-by: John Garry <john.garry@huawei.com> >> Signed-off-by: Laibin Qiu <qiulaibin@huawei.com> > > I understand this problem has been reported already, but still: > > This patch causes a hang in several of my qemu emulations when > trying to boot from usb. Reverting it fixes the problem. Bisect log > is attached. > > Boot logs are available at > https://kerneltests.org/builders/qemu-arm-aspeed-master/builds/230/steps/qemubuildcommand/logs/stdio > but don't really show much: the affected tests simply hang until they > are aborted. This one got reported a few days ago, can you check if applying: https://git.kernel.dk/cgit/linux-block/commit/?h=block-5.17&id=10825410b956dc1ed8c5fbc8bbedaffdadde7f20 fixes it for you?
On 1/27/22 09:28, Jens Axboe wrote: > On 1/26/22 6:32 PM, Guenter Roeck wrote: >> Hi, >> >> On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote: >>> In case of shared tags, there might be more than one hctx which >>> allocates from the same tags, and each hctx is limited to allocate at >>> most: >>> hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); >>> >>> tag idle detection is lazy, and may be delayed for 30sec, so there >>> could be just one real active hctx(queue) but all others are actually >>> idle and still accounted as active because of the lazy idle detection. >>> Then if wake_batch is > hctx_max_depth, driver tag allocation may wait >>> forever on this real active hctx. >>> >>> Fix this by recalculating wake_batch when inc or dec active_queues. >>> >>> Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") >>> Suggested-by: Ming Lei <ming.lei@redhat.com> >>> Suggested-by: John Garry <john.garry@huawei.com> >>> Signed-off-by: Laibin Qiu <qiulaibin@huawei.com> >> >> I understand this problem has been reported already, but still: >> >> This patch causes a hang in several of my qemu emulations when >> trying to boot from usb. Reverting it fixes the problem. Bisect log >> is attached. >> >> Boot logs are available at >> https://kerneltests.org/builders/qemu-arm-aspeed-master/builds/230/steps/qemubuildcommand/logs/stdio >> but don't really show much: the affected tests simply hang until they >> are aborted. > > This one got reported a few days ago, can you check if applying: > > https://git.kernel.dk/cgit/linux-block/commit/?h=block-5.17&id=10825410b956dc1ed8c5fbc8bbedaffdadde7f20 > > fixes it for you? > Yes, it does. Thanks, Guenter
On 1/27/22 11:04 AM, Guenter Roeck wrote: > On 1/27/22 09:28, Jens Axboe wrote: >> On 1/26/22 6:32 PM, Guenter Roeck wrote: >>> Hi, >>> >>> On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote: >>>> In case of shared tags, there might be more than one hctx which >>>> allocates from the same tags, and each hctx is limited to allocate at >>>> most: >>>> hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); >>>> >>>> tag idle detection is lazy, and may be delayed for 30sec, so there >>>> could be just one real active hctx(queue) but all others are actually >>>> idle and still accounted as active because of the lazy idle detection. >>>> Then if wake_batch is > hctx_max_depth, driver tag allocation may wait >>>> forever on this real active hctx. >>>> >>>> Fix this by recalculating wake_batch when inc or dec active_queues. >>>> >>>> Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") >>>> Suggested-by: Ming Lei <ming.lei@redhat.com> >>>> Suggested-by: John Garry <john.garry@huawei.com> >>>> Signed-off-by: Laibin Qiu <qiulaibin@huawei.com> >>> >>> I understand this problem has been reported already, but still: >>> >>> This patch causes a hang in several of my qemu emulations when >>> trying to boot from usb. Reverting it fixes the problem. Bisect log >>> is attached. >>> >>> Boot logs are available at >>> https://kerneltests.org/builders/qemu-arm-aspeed-master/builds/230/steps/qemubuildcommand/logs/stdio >>> but don't really show much: the affected tests simply hang until they >>> are aborted. >> >> This one got reported a few days ago, can you check if applying: >> >> https://git.kernel.dk/cgit/linux-block/commit/?h=block-5.17&id=10825410b956dc1ed8c5fbc8bbedaffdadde7f20 >> >> fixes it for you? >> > Yes, it does. Great, thanks for reporting/testing.
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e55a6834c9a6..845f74e8dd7b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -16,6 +16,21 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" +/* + * Recalculate wakeup batch when tag is shared by hctx. + */ +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, + unsigned int users) +{ + if (!users) + return; + + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, + users); + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, + users); +} + /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -24,18 +39,26 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { + unsigned int users; + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { + return true; + } } else { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { + return true; + } } + users = atomic_inc_return(&hctx->tags->active_queues); + + blk_mq_update_wake_batch(hctx->tags, users); + return true; } @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; + unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) return; } - atomic_dec(&tags->active_queues); + users = atomic_dec_return(&tags->active_queues); + + blk_mq_update_wake_batch(tags, users); blk_mq_tag_wakeup_all(tags, false); } diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index fc0357a6e19b..95df357ec009 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) sbitmap_free(&sbq->sb); } +/** + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch + * @sbq: Bitmap queue to recalculate wake batch. + * @users: Number of shares. + * + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch + * by depth. This interface is for HCTX shared tags or queue shared tags. + */ +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users); + /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 2709ab825499..6220fa67fb7e 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, - unsigned int depth) +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int wake_batch) { - unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); int i; if (sbq->wake_batch != wake_batch) { @@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, } } +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int depth) +{ + unsigned int wake_batch; + + wake_batch = sbq_calc_wake_batch(sbq, depth); + __sbitmap_queue_update_wake_batch(sbq, wake_batch); +} + +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users) +{ + unsigned int wake_batch; + + wake_batch = clamp_val((sbq->sb.depth + users - 1) / + users, 4, SBQ_WAKE_BATCH); + __sbitmap_queue_update_wake_batch(sbq, wake_batch); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); + void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth);
In case of shared tags, there might be more than one hctx which allocates from the same tags, and each hctx is limited to allocate at most: hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); tag idle detection is lazy, and may be delayed for 30sec, so there could be just one real active hctx(queue) but all others are actually idle and still accounted as active because of the lazy idle detection. Then if wake_batch is > hctx_max_depth, driver tag allocation may wait forever on this real active hctx. Fix this by recalculating wake_batch when inc or dec active_queues. Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") Suggested-by: Ming Lei <ming.lei@redhat.com> Suggested-by: John Garry <john.garry@huawei.com> Signed-off-by: Laibin Qiu <qiulaibin@huawei.com> --- block/blk-mq-tag.c | 40 +++++++++++++++++++++++++++++++++------- include/linux/sbitmap.h | 11 +++++++++++ lib/sbitmap.c | 25 ++++++++++++++++++++++--- 3 files changed, 66 insertions(+), 10 deletions(-)