diff mbox series

[RESEND] blk-mq: fix hang caused by freeze/unfreeze sequence

Message ID 20190409090828.16282-1-bob.liu@oracle.com (mailing list archive)
State New, archived
Headers show
Series [RESEND] blk-mq: fix hang caused by freeze/unfreeze sequence | expand

Commit Message

Bob Liu April 9, 2019, 9:08 a.m. UTC
This patch was proposed by Roman Pen[3] years ago.
Recently we hit a bug which is likely caused by the same reason, so we rebased his
fix to v5.1 and resend.
Below is almost copied from that patch[3].

------
Long time ago there was a similar fix proposed by Akinobu Mita[1],
but it seems that at that time everyone decided to fix this subtle race in
percpu-refcount, and Tejun Heo[2] made an attempt (as far as I can see, that
patchset was not applied).

The following is a description of a hang in blk_mq_freeze_queue_wait() -
same fix but a bug from another angle.

The hang happens on attempt to freeze a queue while another task does
queue unfreeze.

The root cause is an incorrect sequence of percpu_ref_reinit() and
percpu_ref_kill() and as a result those two can be swapped:

 CPU#0               CPU#1
 ----------------    -----------------
 percpu_ref_kill()

                     percpu_ref_kill() << atomic reference does
 percpu_ref_reinit()                   << not guarantee the order

                     blk_mq_freeze_queue_wait() << HANG HERE

                     percpu_ref_reinit()

Firstly this wrong sequence raises two kernel warnings:

  1st. WARNING at lib/percpu-refcount.c:309
       percpu_ref_kill_and_confirm called more than once

  2nd. WARNING at lib/percpu-refcount.c:331

But the most unpleasant effect is a hang of a blk_mq_freeze_queue_wait(),
which waits for a zero of a q_usage_counter, which never happens
because percpu-ref was reinited (instead of being killed) and stays in
PERCPU state forever.

The simplified sequence above can be reproduced on shared tags, when
queue A is going to die meanwhile another queue B is in init state and
is trying to freeze the queue A, which shares the same tags set:

 CPU#0                           CPU#1
 ------------------------------- ------------------------------------
 q1 = blk_mq_init_queue(shared_tags)

                                q2 = blk_mq_init_queue(shared_tags):
                                  blk_mq_add_queue_tag_set(shared_tags):
                                    blk_mq_update_tag_set_depth(shared_tags):
                                      blk_mq_freeze_queue(q1)
 blk_cleanup_queue(q1)                 ...
   blk_mq_freeze_queue(q1)   <<<->>>   blk_mq_unfreeze_queue(q1)

[1] Message id: 1443287365-4244-7-git-send-email-akinobu.mita@gmail.com
[2] Message id: 1443563240-29306-6-git-send-email-tj@kernel.org
[3] https://patchwork.kernel.org/patch/9268199/

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org

---
 v3:
   - rebase to v5.1
 v2:
   - forgotten hunk from local repo
   - minor tweaks in the commit message
---
 block/blk-core.c       |  3 ++-
 block/blk-mq.c         | 19 ++++++++++---------
 include/linux/blkdev.h |  7 ++++++-
 3 files changed, 18 insertions(+), 11 deletions(-)

--
2.9.5

Comments

Jinpu Wang April 9, 2019, 9:29 a.m. UTC | #1
Bob Liu <bob.liu@oracle.com> 于2019年4月9日周二 上午11:11写道:
>
> This patch was proposed by Roman Pen[3] years ago.
> Recently we hit a bug which is likely caused by the same reason,so rebased his
> fix to v5.1 and resend.
> Below is almost copied from that patch[3].
>
> ------
> Long time ago there was a similar fix proposed by Akinobu Mita[1],
> but it seems that time everyone decided to fix this subtle race in
> percpu-refcount and Tejun Heo[2] did an attempt (as I can see that
> patchset was not applied).
>
> The following is a description of a hang in blk_mq_freeze_queue_wait() -
> same fix but a bug from another angle.
>
> The hang happens on attempt to freeze a queue while another task does
> queue unfreeze.
>
> The root cause is an incorrect sequence of percpu_ref_reinit() and
> percpu_ref_kill() and as a result those two can be swapped:
>
>  CPU#0               CPU#1
>  ----------------    -----------------
>  percpu_ref_kill()
>
>                      percpu_ref_kill() << atomic reference does
>  percpu_ref_reinit()                   << not guarantee the order
>
>                      blk_mq_freeze_queue_wait() << HANG HERE
>
>                      percpu_ref_reinit()
>
> Firstly this wrong sequence raises two kernel warnings:
>
>   1st. WARNING at lib/percpu-recount.c:309
>        percpu_ref_kill_and_confirm called more than once
>
>   2nd. WARNING at lib/percpu-refcount.c:331
>
> But the most unpleasant effect is a hang of a blk_mq_freeze_queue_wait(),
> which waits for a zero of a q_usage_counter, which never happens
> because percpu-ref was reinited (instead of being killed) and stays in
> PERCPU state forever.
>
> The simplified sequence above can be reproduced on shared tags, when
> queue A is going to die meanwhile another queue B is in init state and
> is trying to freeze the queue A, which shares the same tags set:
>
>  CPU#0                           CPU#1
>  ------------------------------- ------------------------------------
>  q1 = blk_mq_init_queue(shared_tags)
>
>                                 q2 = blk_mq_init_queue(shared_tags):
>                                   blk_mq_add_queue_tag_set(shared_tags):
>                                     blk_mq_update_tag_set_depth(shared_tags):
>                                       blk_mq_freeze_queue(q1)
>  blk_cleanup_queue(q1)                 ...
>    blk_mq_freeze_queue(q1)   <<<->>>   blk_mq_unfreeze_queue(q1)
>
> [1] Message id: 1443287365-4244-7-git-send-email-akinobu.mita@gmail.com
> [2] Message id: 1443563240-29306-6-git-send-email-tj@kernel.org
> [3] https://patchwork.kernel.org/patch/9268199/
>
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> Signed-off-by: Bob Liu <bob.liu@oracle.com>
> Cc: Akinobu Mita <akinobu.mita@gmail.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: linux-block@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
>

Replaced Roman's email address.

We at 1 & 1 IONOS (formerly ProfitBricks) have been carrying this patch
for some years;
it has been running in production for some years too, would be good to
see it in upstream :)

Thanks,

Jack Wang
Linux Kernel Developer @ 1 & 1 IONOS
Dongli Zhang April 9, 2019, 11:27 a.m. UTC | #2
On 04/09/2019 05:08 PM, Bob Liu wrote:
> This patch was proposed by Roman Pen[3] years ago.
> Recently we hit a bug which is likely caused by the same reason,so rebased his
> fix to v5.1 and resend.
> Below is almost copied from that patch[3].
> 

Hi Bob,

I think the commit message is not applicable to the most recent linux. Perhaps
re-write the message based on the most recent version?

Thank you very much!

Dongli Zhang
Bob Liu April 13, 2019, 12:36 a.m. UTC | #3
On 4/9/19 5:29 PM, Jinpu Wang wrote:
> Bob Liu <bob.liu@oracle.com> 于2019年4月9日周二 上午11:11写道:
>>
>> This patch was proposed by Roman Pen[3] years ago.
>> Recently we hit a bug which is likely caused by the same reason,so rebased his
>> fix to v5.1 and resend.
>> Below is almost copied from that patch[3].
>>
>> ------
>> Long time ago there was a similar fix proposed by Akinobu Mita[1],
>> but it seems that time everyone decided to fix this subtle race in
>> percpu-refcount and Tejun Heo[2] did an attempt (as I can see that
>> patchset was not applied).
>>
>> The following is a description of a hang in blk_mq_freeze_queue_wait() -
>> same fix but a bug from another angle.
>>
>> The hang happens on attempt to freeze a queue while another task does
>> queue unfreeze.
>>
>> The root cause is an incorrect sequence of percpu_ref_reinit() and
>> percpu_ref_kill() and as a result those two can be swapped:
>>
>>  CPU#0               CPU#1
>>  ----------------    -----------------
>>  percpu_ref_kill()
>>
>>                      percpu_ref_kill() << atomic reference does
>>  percpu_ref_reinit()                   << not guarantee the order
>>
>>                      blk_mq_freeze_queue_wait() << HANG HERE
>>
>>                      percpu_ref_reinit()
>>
>> Firstly this wrong sequence raises two kernel warnings:
>>
>>   1st. WARNING at lib/percpu-recount.c:309
>>        percpu_ref_kill_and_confirm called more than once
>>
>>   2nd. WARNING at lib/percpu-refcount.c:331
>>
>> But the most unpleasant effect is a hang of a blk_mq_freeze_queue_wait(),
>> which waits for a zero of a q_usage_counter, which never happens
>> because percpu-ref was reinited (instead of being killed) and stays in
>> PERCPU state forever.
>>
>> The simplified sequence above can be reproduced on shared tags, when
>> queue A is going to die meanwhile another queue B is in init state and
>> is trying to freeze the queue A, which shares the same tags set:
>>
>>  CPU#0                           CPU#1
>>  ------------------------------- ------------------------------------
>>  q1 = blk_mq_init_queue(shared_tags)
>>
>>                                 q2 = blk_mq_init_queue(shared_tags):
>>                                   blk_mq_add_queue_tag_set(shared_tags):
>>                                     blk_mq_update_tag_set_depth(shared_tags):
>>                                       blk_mq_freeze_queue(q1)
>>  blk_cleanup_queue(q1)                 ...
>>    blk_mq_freeze_queue(q1)   <<<->>>   blk_mq_unfreeze_queue(q1)
>>
>> [1] Message id: 1443287365-4244-7-git-send-email-akinobu.mita@gmail.com
>> [2] Message id: 1443563240-29306-6-git-send-email-tj@kernel.org
>> [3] https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.kernel.org_patch_9268199_&d=DwIFaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=OcA07QqFechuCug2pqm_-JpGP_mOt0YouTXApdePMGw&s=VM_-8S5gkFo8zUjT5RoY0CkbxN6hQmTwVmslulwsFJM&e=
>>
>> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>> Cc: Akinobu Mita <akinobu.mita@gmail.com>
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: Jens Axboe <axboe@kernel.dk>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: linux-block@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>>
> 
> Replaced Roman's email address.
> 
> We at 1 & 1 IONOS (former ProfitBricks) have been carried this patch
> for some years,
> it has been running in production for some years too,

Nice to hear that!

> would be good to see it in upstream :)

Yes.
Could anyone have a review? Thanks!

> 
> Thanks,
> 
> Jack Wang
> Linux Kernel Developer @ 1 & 1 IONOS
>
Bart Van Assche April 13, 2019, 3:42 a.m. UTC | #4
On 4/9/19 2:08 AM, Bob Liu wrote:
>  void blk_freeze_queue_start(struct request_queue *q)
>  {
> -	int freeze_depth;
> -
> -	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
> -	if (freeze_depth == 1) {
> +	mutex_lock(&q->mq_freeze_lock);
> +	if (++q->mq_freeze_depth == 1) {
>  		percpu_ref_kill(&q->q_usage_counter);
> +		mutex_unlock(&q->mq_freeze_lock);
>  		if (queue_is_mq(q))
>  			blk_mq_run_hw_queues(q, false);
> +	} else {
> +		mutex_unlock(&q->mq_freeze_lock);
>  	}
>  }
Have you considered to move the mutex_unlock() call to the end of the function
such that there is only one mutex_unlock() call instead of two? In case you
would be worried about holding the mutex around the code that runs the queue,
how about changing the blk_mq_run_hw_queues() call such that the queues are
run async?

Bart.
Bob Liu April 14, 2019, 1:09 p.m. UTC | #5
On 4/13/19 11:42 AM, Bart Van Assche wrote:
> On 4/9/19 2:08 AM, Bob Liu wrote:
>>  void blk_freeze_queue_start(struct request_queue *q)
>>  {
>> -	int freeze_depth;
>> -
>> -	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
>> -	if (freeze_depth == 1) {
>> +	mutex_lock(&q->mq_freeze_lock);
>> +	if (++q->mq_freeze_depth == 1) {
>>  		percpu_ref_kill(&q->q_usage_counter);
>> +		mutex_unlock(&q->mq_freeze_lock);
>>  		if (queue_is_mq(q))
>>  			blk_mq_run_hw_queues(q, false);
>> +	} else {
>> +		mutex_unlock(&q->mq_freeze_lock);
>>  	}
>>  }
> Have you considered to move the mutex_unlock() call to the end of the function
> such that there is only one mutex_unlock() call instead of two? In case you
> would be worried about holding the mutex around the code that runs the queue,
> how about changing the blk_mq_run_hw_queues() call such that the queues are
> run async?
>
 
Yes, but this bug is difficult to reproduce. I'd prefer to keep the change minimal.
Roman Penyaev April 15, 2019, 9:46 a.m. UTC | #6
On 2019-04-13 05:42, Bart Van Assche wrote:
> On 4/9/19 2:08 AM, Bob Liu wrote:
>>  void blk_freeze_queue_start(struct request_queue *q)
>>  {
>> -	int freeze_depth;
>> -
>> -	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
>> -	if (freeze_depth == 1) {
>> +	mutex_lock(&q->mq_freeze_lock);
>> +	if (++q->mq_freeze_depth == 1) {
>>  		percpu_ref_kill(&q->q_usage_counter);
>> +		mutex_unlock(&q->mq_freeze_lock);
>>  		if (queue_is_mq(q))
>>  			blk_mq_run_hw_queues(q, false);
>> +	} else {
>> +		mutex_unlock(&q->mq_freeze_lock);
>>  	}
>>  }
> Have you considered to move the mutex_unlock() call to the end of the 
> function
> such that there is only one mutex_unlock() call instead of two? In case 
> you
> would be worried about holding the mutex around the code that runs the 
> queue,
> how about changing the blk_mq_run_hw_queues() call such that the queues 
> are
> run async?

Hi Bart,

The only purpose of 'mq_freeze_lock' is to avoid race between 
mq_freeze_depth
variable and the following usage of q_usage_counter percpu ref.  I admit 
that
my original comment is quite unclear, but locked section should be as 
short
as possible, so returning to your question: better to have two unlock 
calls
instead of expanding locked critical section.

Unfortunately I do not have hardware to play again with the issue, but I 
see
there is a nice candidate for a quick reproduction:  null_blk queues 
with
shared tags.  Having several queues with shared tags and a script, which
powers on/off (I mean 'power' entry of configfs of the null_blk) 
different
null devices from different cpus it is quite possible to trigger the 
issue.
Random short msdelay() in correct places can help to increase 
probability to
hit the issue quite fast.


But Bob, what is the backtrace of the issue you hit?  What is the 
device?
Conditions to reproduce the issue are quite specific and frankly I did 
not
find any "naked" (without any locks) calls of blk_mq_freeze/unfreeze 
sequence,
the only candidate which I found, seems, null_blk (not 100% sure, but 
worth to
try).


--
Roman
Bob Liu April 17, 2019, 4:06 a.m. UTC | #7
On 4/15/19 5:46 PM, Roman Penyaev wrote:
> On 2019-04-13 05:42, Bart Van Assche wrote:
>> On 4/9/19 2:08 AM, Bob Liu wrote:
>>>  void blk_freeze_queue_start(struct request_queue *q)
>>>  {
>>> -    int freeze_depth;
>>> -
>>> -    freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
>>> -    if (freeze_depth == 1) {
>>> +    mutex_lock(&q->mq_freeze_lock);
>>> +    if (++q->mq_freeze_depth == 1) {
>>>          percpu_ref_kill(&q->q_usage_counter);
>>> +        mutex_unlock(&q->mq_freeze_lock);
>>>          if (queue_is_mq(q))
>>>              blk_mq_run_hw_queues(q, false);
>>> +    } else {
>>> +        mutex_unlock(&q->mq_freeze_lock);
>>>      }
>>>  }
>> Have you considered to move the mutex_unlock() call to the end of the function
>> such that there is only one mutex_unlock() call instead of two? In case you
>> would be worried about holding the mutex around the code that runs the queue,
>> how about changing the blk_mq_run_hw_queues() call such that the queues are
>> run async?
> 
> Hi Bart,
> 
> The only purpose of 'mq_freeze_lock' is to avoid race between mq_freeze_depth
> variable and the following usage of q_usage_counter percpu ref.  I admit that
> my original comment is quite unclear, but locked section should be as short
> as possible, so returning to your question: better to have two unlock calls
> instead of expanding locked critical section.
> 
> Unfortunately I do not have hardware to play again with the issue, but I see
> there is a nice candidate for a quick reproduction:  null_blk queues with
> shared tags.  Having several queues with shared tags and a script, which
> powers on/off (I mean 'power' entry of configfs of the null_blk) different
> null devices from different cpus it is quite possible to trigger the issue.
> Random short msdelay() in correct places can help to increase probability to
> hit the issue quite fast.
> 
> 
> But Bob, what is the backtrace of the issue you hit?  What is the device?
> Conditions to reproduce the issue are quite specific and frankly I did not
> find any "naked" (without any locks) calls of blk_mq_freeze/unfreeze sequence,
> the only candidate which I found, seems, null_blk (not 100% sure, but worth to
> try).
> 

Yes, it can be reproduced with null_blk.
But I added a msleep to save time.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9437a5e..875967f1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -202,6 +202,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
        freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
        WARN_ON_ONCE(freeze_depth < 0);
        if (!freeze_depth) {
+               msleep(1000);
                percpu_ref_resurrect(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }



Below is the backtrace:
---
[  234.604280] ------------[ cut here ]------------
[  234.604288] percpu_ref_kill_and_confirm called more than once on blk_queue_usage_counter_release!
[  234.604305] WARNING: CPU: 0 PID: 11854 at lib/percpu-refcount.c:336 percpu_ref_kill_and_confirm+0x99/0xb0
[  234.604306] Modules linked in: null_blk thunderbolt ccm xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc devlink ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter bnep arc4 intel_rapl iwlmvm x86_pkg_temp_thermal intel_powerclamp mac80211 coretemp snd_soc_skl snd_soc_skl_ipc snd_soc_sst_ipc snd_soc_sst_dsp kvm_intel snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core nls_iso8859_1 snd_hda_codec_hdmi snd_hda_codec_conexant snd_compress ac97_bus snd_hda_codec_generic snd_pcm_dmaengine crct10dif_pclmul snd_hda_intel crc32_pclmul snd_hda_codec ghash_clmulni_intel iwlwifi snd_hda_core aesni_intel snd_hwdep snd_pcm thinkpad_acpi uvcvideo nvram ledtrig_audio aes_x86_64 crypto_simd btusb cryptd btrtl glue_helper snd_seq_midi btbcm snd_seq_midi_event intel_cstate videobuf2_vmalloc btintel snd_rawmidi cfg80211 bluetooth intel_rapl_perf
[  234.604361]  hid_sensor_accel_3d snd_seq hid_sensor_magn_3d videobuf2_memops videobuf2_v4l2 videobuf2_common hid_sensor_rotation rtsx_pci_ms hid_sensor_als wmi_bmof videodev input_leds joydev serio_raw hid_sensor_gyro_3d intel_wmi_thunderbolt hid_sensor_trigger snd_seq_device industrialio_triggered_buffer kfifo_buf snd_timer hid_sensor_iio_common media industrialio mei_me snd memstick ecdh_generic mei ucsi_acpi processor_thermal_device typec_ucsi intel_soc_dts_iosf intel_pch_thermal typec soundcore int3403_thermal int340x_thermal_zone int3400_thermal acpi_thermal_rel mac_hid acpi_pad sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 btrfs zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq raid0 multipath linear hid_sensor_custom hid_sensor_hub intel_ishtp_hid hid_logitech_hidpp i915 kvmgt vfio_mdev mdev vfio_iommu_type1 vfio kvm irqbypass i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm wacom
[  234.604417]  hid_logitech_dj rtsx_pci_sdmmc hid_generic e1000e usbhid psmouse hid intel_ish_ipc rtsx_pci intel_ishtp wmi video
[  234.604430] CPU: 0 PID: 11854 Comm: sh Not tainted 5.0.0+ #33
[  234.604431] Hardware name: LENOVO 20LJS2EV08/20LJS2EV08, BIOS R0SET33W (1.17 ) 07/18/2018
[  234.604436] RIP: 0010:percpu_ref_kill_and_confirm+0x99/0xb0
[  234.604439] Code: 00 eb d3 80 3d 76 a4 2a 01 00 75 ab 48 8b 53 10 48 c7 c6 00 fd a6 8f 48 c7 c7 b0 56 d5 8f c6 05 5b a4 2a 01 01 e8 f7 a2 b1 ff <0f> 0b 48 8b 43 08 eb 85 90 90 90 90 90 90 90 90 90 90 90 90 90 90
[  234.604441] RSP: 0018:ffffa2880ae9fd00 EFLAGS: 00010086
[  234.604443] RAX: 0000000000000000 RBX: ffff9768b3b815d8 RCX: 0000000000000000
[  234.604445] RDX: 0000000000000055 RSI: ffffffff905758f5 RDI: 0000000000000046
[  234.604447] RBP: ffffa2880ae9fd18 R08: fffffffa314c837d R09: ffffffffffffffff
[  234.604449] R10: ffffe3c84fc0a200 R11: 000000000001abb4 R12: 0000000000000246
[  234.604451] R13: 0000000000000000 R14: ffff9768b0448300 R15: ffff9768b2703600
[  234.604454] FS:  00007fae2f3e6540(0000) GS:ffff9768d1400000(0000) knlGS:0000000000000000
[  234.604456] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  234.604458] CR2: 000056078102ee08 CR3: 0000000431384002 CR4: 00000000003606f0
[  234.604460] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  234.604462] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  234.604463] Call Trace:
[  234.604471]  blk_freeze_queue_start+0x2d/0x50
[  234.604475]  blk_set_queue_dying+0x17/0x40
[  234.604479]  blk_cleanup_queue+0x26/0xd0
[  234.604486]  null_del_dev+0x59/0x110 [null_blk]
[  234.604491]  nullb_device_power_store+0xca/0x100 [null_blk]
[  234.604495]  configfs_write_file+0xb9/0x120
[  234.604500]  __vfs_write+0x3a/0x1b0
[  234.604505]  ? apparmor_file_permission+0x1a/0x20
[  234.604509]  ? security_file_permission+0x3b/0xf0
[  234.604513]  ? _cond_resched+0x1a/0x50
[  234.604516]  vfs_write+0xb8/0x1b0
[  234.604519]  ksys_write+0x55/0xc0
[  234.604523]  __x64_sys_write+0x1a/0x20
[  234.604528]  do_syscall_64+0x5a/0x110
[  234.604532]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  234.604535] RIP: 0033:0x7fae2ef04154
[  234.604537] Code: 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 8d 05 b1 07 2e 00 8b 00 85 c0 75 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 f3 c3 66 90 41 54 55 49 89 d4 53 48 89 f5
[  234.604538] RSP: 002b:00007ffddfe68e88 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  234.604540] RAX: ffffffffffffffda RBX: 00005649349e68a0 RCX: 00007fae2ef04154
[  234.604542] RDX: 0000000000000002 RSI: 00005649349e68a0 RDI: 0000000000000001
[  234.604543] RBP: 0000000000000002 R08: 0000000000000077 R09: 0000000000000000
[  234.604544] R10: 00005649349e6010 R11: 0000000000000246 R12: 0000000000000001
[  234.604545] R13: 0000000000000002 R14: 7fffffffffffffff R15: 00007ffddfe6aed9
[  234.604548] ---[ end trace d99129291464ebae ]---
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index a55389b..fb97497 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -433,7 +433,7 @@  int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 		smp_rmb();

 		wait_event(q->mq_freeze_wq,
-			   (atomic_read(&q->mq_freeze_depth) == 0 &&
+			   (!q->mq_freeze_depth &&
 			    (pm || (blk_pm_request_resume(q),
 				    !blk_queue_pm_only(q)))) ||
 			   blk_queue_dying(q));
@@ -523,6 +523,7 @@  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	spin_lock_init(&q->queue_lock);

 	init_waitqueue_head(&q->mq_freeze_wq);
+	mutex_init(&q->mq_freeze_lock);

 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a935483..373af60 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -143,13 +143,14 @@  void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,

 void blk_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
+		mutex_unlock(&q->mq_freeze_lock);
 		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+	} else {
+		mutex_unlock(&q->mq_freeze_lock);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -198,14 +199,14 @@  EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-	WARN_ON_ONCE(freeze_depth < 0);
-	if (!freeze_depth) {
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
 		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c58a3b..64f7683 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -535,7 +535,7 @@  struct request_queue {

 	struct mutex		sysfs_lock;

-	atomic_t		mq_freeze_depth;
+	int			mq_freeze_depth;

 #if defined(CONFIG_BLK_DEV_BSG)
 	struct bsg_class_device bsg_dev;
@@ -547,6 +547,11 @@  struct request_queue {
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
+	/*
+	 * Protect concurrent access to q_usage_counter by
+	 * percpu_ref_kill() and percpu_ref_reinit().
+	 */
+	struct mutex		mq_freeze_lock;
 	struct percpu_ref	q_usage_counter;
 	struct list_head	all_q_node;