[v4,1/3] btrfs: scrub: fix circular locking dependency warning

Message ID 1548830702-14676-2-git-send-email-anand.jain@oracle.com (mailing list archive)
State New, archived
Series btrfs: scrub: fix scrub_lock

Commit Message

Anand Jain Jan. 30, 2019, 6:45 a.m. UTC
The circular locking dependency check reports the warning [1]; this is
because btrfs_scrub_dev() calls into the stack #0 below while holding
fs_info::scrub_lock. The test case leading to this warning:

  mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs
  btrfs scrub start -B /btrfs

In fact we have fs_info::scrub_workers_refcnt to track whether the scrub
workers need to be initialized or destroyed. So once the thread has
incremented or decremented fs_info::scrub_workers_refcnt, it is OK to drop
the scrub_lock and only then actually do the btrfs_destroy_workqueue()
part. So this patch drops the scrub_lock before calling
btrfs_destroy_workqueue().
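
The pattern, as a minimal userspace sketch (pthreads and free() stand in
for the btrfs workqueue API; the names struct workers and
scrub_workers_release() are made up for the sketch, and unlike the patch
below it also resets the detached pointer, something only suggested later
in this thread):

  #include <pthread.h>
  #include <stdlib.h>

  struct workers { int dummy; };  /* stand-in for struct btrfs_workqueue */

  static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
  static int scrub_workers_refcnt;
  static struct workers *scrub_workers;

  /* loosely mirrors the tail of btrfs_scrub_dev() after this patch */
  static void scrub_workers_release(void)
  {
          struct workers *to_destroy = NULL;

          pthread_mutex_lock(&scrub_lock);
          if (--scrub_workers_refcnt == 0) {
                  /* detach under the lock; a later scrub allocates fresh ones */
                  to_destroy = scrub_workers;
                  scrub_workers = NULL;
          }
          pthread_mutex_unlock(&scrub_lock);

          /* the potentially blocking teardown runs outside scrub_lock */
          free(to_destroy);  /* stands in for btrfs_destroy_workqueue() */
  }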

[1]
[   76.146826] ======================================================
[   76.147086] WARNING: possible circular locking dependency detected
[   76.147316] 4.20.0-rc3+ #41 Not tainted
[   76.147489] ------------------------------------------------------
[   76.147722] btrfs/4065 is trying to acquire lock:
[   76.147984] 0000000038593bc0 ((wq_completion)"%s-%s""btrfs", name){+.+.}, at: flush_workqueue+0x70/0x4d0
[   76.148337]
but task is already holding lock:
[   76.148594] 0000000062392ab7 (&fs_info->scrub_lock){+.+.}, at: btrfs_scrub_dev+0x316/0x5d0 [btrfs]
[   76.148909]
which lock already depends on the new lock.

[   76.149191]
the existing dependency chain (in reverse order) is:
[   76.149446]
-> #3 (&fs_info->scrub_lock){+.+.}:
[   76.149707]        btrfs_scrub_dev+0x11f/0x5d0 [btrfs]
[   76.149924]        btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.150216]        do_vfs_ioctl+0xa9/0x6d0
[   76.150468]        ksys_ioctl+0x60/0x90
[   76.150716]        __x64_sys_ioctl+0x16/0x20
[   76.150911]        do_syscall_64+0x50/0x180
[   76.151182]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.151469]
-> #2 (&fs_devs->device_list_mutex){+.+.}:
[   76.151851]        reada_start_machine_worker+0xca/0x3f0 [btrfs]
[   76.152195]        normal_work_helper+0xf0/0x4c0 [btrfs]
[   76.152489]        process_one_work+0x1f4/0x520
[   76.152751]        worker_thread+0x46/0x3d0
[   76.153715]        kthread+0xf8/0x130
[   76.153912]        ret_from_fork+0x3a/0x50
[   76.154178]
-> #1 ((work_completion)(&work->normal_work)){+.+.}:
[   76.154575]        worker_thread+0x46/0x3d0
[   76.154828]        kthread+0xf8/0x130
[   76.155108]        ret_from_fork+0x3a/0x50
[   76.155357]
-> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}:
[   76.155751]        flush_workqueue+0x9a/0x4d0
[   76.155911]        drain_workqueue+0xca/0x1a0
[   76.156182]        destroy_workqueue+0x17/0x230
[   76.156455]        btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
[   76.156756]        scrub_workers_put+0x2e/0x60 [btrfs]
[   76.156931]        btrfs_scrub_dev+0x329/0x5d0 [btrfs]
[   76.157219]        btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.157491]        do_vfs_ioctl+0xa9/0x6d0
[   76.157742]        ksys_ioctl+0x60/0x90
[   76.157910]        __x64_sys_ioctl+0x16/0x20
[   76.158177]        do_syscall_64+0x50/0x180
[   76.158429]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.158716]
other info that might help us debug this:

[   76.158908] Chain exists of:
  (wq_completion)"%s-%s""btrfs", name --> &fs_devs->device_list_mutex --> &fs_info->scrub_lock

[   76.159629]  Possible unsafe locking scenario:

[   76.160607]        CPU0                    CPU1
[   76.160934]        ----                    ----
[   76.161210]   lock(&fs_info->scrub_lock);
[   76.161458]                                lock(&fs_devs->device_list_mutex);
[   76.161805]                                lock(&fs_info->scrub_lock);
[   76.161909]   lock((wq_completion)"%s-%s""btrfs", name);
[   76.162201]  *** DEADLOCK ***

[   76.162627] 2 locks held by btrfs/4065:
[   76.162897]  #0: 00000000bef2775b (sb_writers#12){.+.+}, at: mnt_want_write_file+0x24/0x50
[   76.163335]  #1: 0000000062392ab7 (&fs_info->scrub_lock){+.+.}, at: btrfs_scrub_dev+0x316/0x5d0 [btrfs]
[   76.163796]
stack backtrace:
[   76.163911] CPU: 1 PID: 4065 Comm: btrfs Not tainted 4.20.0-rc3+ #41
[   76.164228] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS
VirtualBox 12/01/2006
[   76.164646] Call Trace:
[   76.164872]  dump_stack+0x5e/0x8b
[   76.165128]  print_circular_bug.isra.37+0x1f1/0x1fe
[   76.165398]  __lock_acquire+0x14aa/0x1620
[   76.165652]  lock_acquire+0xb0/0x190
[   76.165910]  ? flush_workqueue+0x70/0x4d0
[   76.166175]  flush_workqueue+0x9a/0x4d0
[   76.166420]  ? flush_workqueue+0x70/0x4d0
[   76.166671]  ? drain_workqueue+0x52/0x1a0
[   76.166911]  drain_workqueue+0xca/0x1a0
[   76.167167]  destroy_workqueue+0x17/0x230
[   76.167428]  btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
[   76.167720]  scrub_workers_put+0x2e/0x60 [btrfs]
[   76.168233]  btrfs_scrub_dev+0x329/0x5d0 [btrfs]
[   76.168504]  ? __sb_start_write+0x121/0x1b0
[   76.168759]  ? mnt_want_write_file+0x24/0x50
[   76.169654]  btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.169934]  ? find_held_lock+0x2d/0x90
[   76.170204]  ? find_held_lock+0x2d/0x90
[   76.170450]  do_vfs_ioctl+0xa9/0x6d0
[   76.170690]  ? __fget+0x101/0x1f0
[   76.170910]  ? __fget+0x5/0x1f0
[   76.171157]  ksys_ioctl+0x60/0x90
[   76.171391]  __x64_sys_ioctl+0x16/0x20
[   76.171634]  do_syscall_64+0x50/0x180
[   76.171892]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.172186] RIP: 0033:0x7f61d422e567
[   76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00
48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48
[   76.172911] RSP: 002b:00007f61d3936d68 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[   76.173328] RAX: ffffffffffffffda RBX: 00000000019026b0 RCX:
00007f61d422e567
[   76.173649] RDX: 00000000019026b0 RSI: 00000000c400941b RDI:
0000000000000003
[   76.173909] RBP: 0000000000000000 R08: 00007f61d3937700 R09:
0000000000000000
[   76.174244] R10: 00007f61d3937700 R11: 0000000000000246 R12:
0000000000000000
[   76.174566] R13: 0000000000801000 R14: 0000000000000000 R15:
00007f61d3937700
[   76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
v3->v4: Fix the list corruption in btrfs/073 reported by David [1], which I
        was able to reproduce with an instrumented kernel but not with
        btrfs/073 itself.
          [1] https://patchwork.kernel.org/patch/10705741/
        The v3 patch released fs_info::scrub_lock to destroy the work queues,
        which raced with new scrub requests and overwrote the scrub worker
        pointers. So v4 removes the function scrub_workers_put() and performs
        the destroy_workqueue in two stages, with the worker pointers copied
        locally under the lock.
v2->v3: none
v1->v2: none

 fs/btrfs/scrub.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

Comments

David Sterba Jan. 30, 2019, 2:07 p.m. UTC | #1
On Wed, Jan 30, 2019 at 02:45:00PM +0800, Anand Jain wrote:
> v3->v4: Fix list corruption as reported by btrfs/073 by David.
>            [1]
>            https://patchwork.kernel.org/patch/10705741/
> 	Which I was able to reproduce with an instrumented kernel but not with
> 	btrfs/073.
>         In v3 patch, it releases the fs_info::scrub_lock to destroy the work queue
> 	which raced with new scrub requests, overwriting the scrub workers
> 	pointers. So in v4, it kills the function scrub_workers_put(), and
> 	performs the destroy_workqueue in two stages, with worker pointers
> 	copied locally.

> @@ -3932,9 +3925,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
>  
>  	mutex_lock(&fs_info->scrub_lock);
>  	dev->scrub_ctx = NULL;
> -	scrub_workers_put(fs_info);
> +	if (--fs_info->scrub_workers_refcnt == 0) {
> +		scrub_workers = fs_info->scrub_workers;
> +		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
> +		scrub_parity = fs_info->scrub_parity_workers;
> +	}
>  	mutex_unlock(&fs_info->scrub_lock);
>  
> +	btrfs_destroy_workqueue(scrub_workers);
> +	btrfs_destroy_workqueue(scrub_wr_comp);
> +	btrfs_destroy_workqueue(scrub_parity);

https://lore.kernel.org/linux-btrfs/1543554924-17397-2-git-send-email-anand.jain@oracle.com/

Compared to the previous version, it's almost the same I think. If
scrub_workers_get() races between the unlock and the workqueue destruction,
anything that uses fs_info->scrub_workers will soon use freed memory.

The difference is that the worker pointers are read from fs_info under the
lock but are still used outside of it. I haven't tested this version, but
from the analysis of the previous crash I don't see how v4 is supposed to
be better.
Anand Jain Jan. 31, 2019, 6:34 a.m. UTC | #2
On 1/30/19 10:07 PM, David Sterba wrote:
> On Wed, Jan 30, 2019 at 02:45:00PM +0800, Anand Jain wrote:
>> v3->v4: Fix list corruption as reported by btrfs/073 by David.
>>             [1]
>>             https://patchwork.kernel.org/patch/10705741/
>> 	Which I was able to reproduce with an instrumented kernel but not with
>> 	btrfs/073.
>>          In v3 patch, it releases the fs_info::scrub_lock to destroy the work queue
>> 	which raced with new scrub requests, overwriting the scrub workers
>> 	pointers. So in v4, it kills the function scrub_workers_put(), and
>> 	performs the destroy_workqueue in two stages, with worker pointers
>> 	copied locally.
> 
>> @@ -3932,9 +3925,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
>>   
>>   	mutex_lock(&fs_info->scrub_lock);
>>   	dev->scrub_ctx = NULL;
>> -	scrub_workers_put(fs_info);
>> +	if (--fs_info->scrub_workers_refcnt == 0) {
>> +		scrub_workers = fs_info->scrub_workers;
>> +		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
>> +		scrub_parity = fs_info->scrub_parity_workers;
>> +	}
>>   	mutex_unlock(&fs_info->scrub_lock);
>>   
>> +	btrfs_destroy_workqueue(scrub_workers);
>> +	btrfs_destroy_workqueue(scrub_wr_comp);
>> +	btrfs_destroy_workqueue(scrub_parity);
> 
> https://lore.kernel.org/linux-btrfs/1543554924-17397-2-git-send-email-anand.jain@oracle.com/
> 
> Comparing to the previous version, it's almost the same I think. If
> scrub_workers_get races between the unlock and destroy_workers, anything
> that uses fs_info->scrub_wokers will soon use freed memory.
> 
> The difference is that the worker pointers are read from fs_info under a
> lock but are still used outside. I haven't tested this version but from
> the analysis of previous crash, I don't see how v4 is supposed to be
> better.
>

Consider the v3 code below.

When process-A is at [1] (below), another 'btrfs scrub start' is issued;
let's call it process-B. At [1] process-A has unlocked fs_info::scrub_lock,
so process-B can overwrite fs_info::scrub_workers,
fs_info::scrub_wr_completion_workers and fs_info::scrub_parity_workers,
which process-A at [1] has not yet destroyed.

Process-A
---------

btrfs scrub start /mnt

::
         mutex_lock(&fs_info->scrub_lock);
::
         if (dev->scrub_ctx ||
             (!is_dev_replace &&
              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
                 up_read(&fs_info->dev_replace.rwsem);
                 mutex_unlock(&fs_info->scrub_lock);
                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                 ret = -EINPROGRESS;
                 goto out_free_ctx;
         }
::
         ret = scrub_workers_get(fs_info, is_dev_replace);  <-- [2]
::
         dev->scrub_ctx = sctx;
         mutex_unlock(&fs_info->scrub_lock);

::
                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
::
         atomic_dec(&fs_info->scrubs_running);
::

         mutex_lock(&fs_info->scrub_lock);
         dev->scrub_ctx = NULL;
         scrub_workers_put(fs_info);
         mutex_unlock(&fs_info->scrub_lock);



static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
         lockdep_assert_held(&fs_info->scrub_lock);
         if (--fs_info->scrub_workers_refcnt == 0) {
                 mutex_unlock(&fs_info->scrub_lock);

                 <wait for process-B>  [1]

                 btrfs_destroy_workqueue(fs_info->scrub_workers);
                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
                 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
                 mutex_lock(&fs_info->scrub_lock);
         }
         WARN_ON(fs_info->scrub_workers_refcnt < 0);
}




Process-B
---------
Start when process-A is at [1] (above)
btrfs scrub start /mnt

::
  At [2] (above) the fs_info::scrub_workers,
fs_info::scrub_wr_completion_workers and fs_info::scrub_parity_workers
set up by process-A are overwritten.


So in v4.
--------

Similar to dev::scrub_ctx, the fs_info::scrub_workers,
fs_info::scrub_wr_completion_workers and fs_info::scrub_parity_workers
pointers are stored locally before fs_info::scrub_lock is released, so the
list pointers aren't corrupted.
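
Roughly, the interleaving then looks like this (illustrative only; the
names are as in the diff below):

  Process-A (refcnt drops to 0)             Process-B (new scrub)
  -----------------------------             ---------------------
  mutex_lock(&fs_info->scrub_lock);
  scrub_workers = fs_info->scrub_workers;
  ... (copy the other two pointers)
  mutex_unlock(&fs_info->scrub_lock);
                                            mutex_lock(&fs_info->scrub_lock);
                                            scrub_workers_get(fs_info, ...);
                                            (allocates new workqueues, so
                                             fs_info now points at B's queues)
                                            mutex_unlock(&fs_info->scrub_lock);
  btrfs_destroy_workqueue(scrub_workers);
  (destroys only A's local copies; B's
   queues in fs_info are untouched)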

Hope this clarifies.

Thanks, Anand
David Sterba Feb. 12, 2019, 3:49 p.m. UTC | #3
On Thu, Jan 31, 2019 at 02:34:54PM +0800, Anand Jain wrote:
> 
> 
> On 1/30/19 10:07 PM, David Sterba wrote:
> > On Wed, Jan 30, 2019 at 02:45:00PM +0800, Anand Jain wrote:
> >> v3->v4: Fix list corruption as reported by btrfs/073 by David.
> >>             [1]
> >>             https://patchwork.kernel.org/patch/10705741/
> >> 	Which I was able to reproduce with an instrumented kernel but not with
> >> 	btrfs/073.
> >>          In v3 patch, it releases the fs_info::scrub_lock to destroy the work queue
> >> 	which raced with new scrub requests, overwriting the scrub workers
> >> 	pointers. So in v4, it kills the function scrub_workers_put(), and
> >> 	performs the destroy_workqueue in two stages, with worker pointers
> >> 	copied locally.
> > 
> >> @@ -3932,9 +3925,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
> >>   
> >>   	mutex_lock(&fs_info->scrub_lock);
> >>   	dev->scrub_ctx = NULL;
> >> -	scrub_workers_put(fs_info);
> >> +	if (--fs_info->scrub_workers_refcnt == 0) {
> >> +		scrub_workers = fs_info->scrub_workers;
> >> +		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
> >> +		scrub_parity = fs_info->scrub_parity_workers;
> >> +	}
> >>   	mutex_unlock(&fs_info->scrub_lock);
> >>   
> >> +	btrfs_destroy_workqueue(scrub_workers);
> >> +	btrfs_destroy_workqueue(scrub_wr_comp);
> >> +	btrfs_destroy_workqueue(scrub_parity);
> > 
> > https://lore.kernel.org/linux-btrfs/1543554924-17397-2-git-send-email-anand.jain@oracle.com/
> > 
> > Comparing to the previous version, it's almost the same I think. If
> > scrub_workers_get races between the unlock and destroy_workers, anything
> > that uses fs_info->scrub_wokers will soon use freed memory.
> > 
> > The difference is that the worker pointers are read from fs_info under a
> > lock but are still used outside. I haven't tested this version but from
> > the analysis of previous crash, I don't see how v4 is supposed to be
> > better.
> >
> 
> Consider v3 code as below:
> 
> When process-A is at [1] (below) start another
> btrfs scrub start, lets call it process-B.
> When process-A is at [1] it unlocks the fs_info::scrub_lock so the
> process-B can overwrite fs_info::scrub_workers,
> fs_info::scrub_wr_completion_workers, fs_info::scrub_parity_workers
> which the process-A at [1] has not yet called destroyed.
> 
> Process-A
> ---------
> 
> btrfs scrub start /mnt
> 
> ::
>          mutex_lock(&fs_info->scrub_lock);
> ::
>          if (dev->scrub_ctx ||
>              (!is_dev_replace &&
>               btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
>                  up_read(&fs_info->dev_replace.rwsem);
>                  mutex_unlock(&fs_info->scrub_lock);
>                  mutex_unlock(&fs_info->fs_devices->device_list_mutex);
>                  ret = -EINPROGRESS;
>                  goto out_free_ctx;
>          }
> ::
>          ret = scrub_workers_get(fs_info, is_dev_replace);  <-- [2]
> ::
>          dev->scrub_ctx = sctx;
>          mutex_unlock(&fs_info->scrub_lock);
> 
> ::
>                  ret = scrub_enumerate_chunks(sctx, dev, start, end);
> ::
>          atomic_dec(&fs_info->scrubs_running);
> ::
> 
>          mutex_lock(&fs_info->scrub_lock);
>          dev->scrub_ctx = NULL;
>          scrub_workers_put(fs_info);
>          mutex_unlock(&fs_info->scrub_lock);
> 
> 
> 
> static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info 
> *fs_info)
> {
>          lockdep_assert_held(&fs_info->scrub_lock);
>          if (--fs_info->scrub_workers_refcnt == 0) {
>                  mutex_unlock(&fs_info->scrub_lock);
> 
> 		<wait for process-B>  [1]
> 
>                  btrfs_destroy_workqueue(fs_info->scrub_workers);
>  
> btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
>                  btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
>                  mutex_lock(&fs_info->scrub_lock);
>          }
>          WARN_ON(fs_info->scrub_workers_refcnt < 0);
> }
> 
> 
> 
> 
> Process-B
> ---------
> Start when process-A is at [1] (above)
> btrfs scrub start /mnt
> 
> ::
>   at [2] (above) the fs_info::scrub_workers,
> fs_info::scrub_wr_completion_workers, fs_info::scrub_parity_workers
> of process-A are overwritten.
> 
> 
> So in v4.
> --------
> 
> Similar to dev::scrub_ctx the fs_info::scrub_workers,
> fs_info::scrub_wr_completion_workers, fs_info::scrub_parity_workers
> are stored locally before fs_info::scrub_lock is released, so the
> list pointers aren't corrupted.
> 
> Hope this clarifies.

Yes, thanks. I'd like to add some assertions to make sure the worker
pointers are always NULL if the refcount is 0. I.e. initially they're NULL
due to the kzalloc of fs_info, so the additional part would go into the
block below.

--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3744,17 +3744,20 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
        lockdep_assert_held(&fs_info->scrub_lock);
 
        if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+               ASSERT(fs_info->scrub_workers == NULL);
                fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
                                flags, is_dev_replace ? 1 : max_active, 4);
                if (!fs_info->scrub_workers)
                        goto fail_scrub_workers;
 
+               ASSERT(fs_info->scrub_wr_completion_workers == NULL);
                fs_info->scrub_wr_completion_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
                                              max_active, 2);
                if (!fs_info->scrub_wr_completion_workers)
                        goto fail_scrub_wr_completion_workers;
 
+               ASSERT(fs_info->scrub_parity_workers == NULL);
                fs_info->scrub_parity_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
                                              max_active, 2);
@@ -3934,6 +3937,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                scrub_workers = fs_info->scrub_workers;
                scrub_wr_comp = fs_info->scrub_wr_completion_workers;
                scrub_parity = fs_info->scrub_parity_workers;
+
+               fs_info->scrub_workers = NULL;
+               fs_info->scrub_wr_completion_workers = NULL;
+               fs_info->scrub_parity_workers = NULL;
        }
        mutex_unlock(&fs_info->scrub_lock);
David Sterba Feb. 12, 2019, 4:45 p.m. UTC | #4
On Wed, Jan 30, 2019 at 02:45:00PM +0800, Anand Jain wrote:
> Circular locking dependency check reports warning[1], that's because
> the btrfs_scrub_dev() calls the stack #0 below with, the
> fs_info::scrub_lock held. The test case leading to this warning..
> 
>   mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs
>   btrfs scrub start -B /btrfs
> 
> In fact we have fs_info::scrub_workers_refcnt to tack if the init and
> destroy of the scrub workers are needed. So once we have incremented
> and decremented the fs_info::scrub_workers_refcnt value in the thread,
> its ok to drop the scrub_lock, and then actually do the
> btrfs_destroy_workqueue() part. So this patch drops the scrub_lock
> before calling btrfs_destroy_workqueue().
> 
> [1]
> [   76.146826] ======================================================
> [   76.147086] WARNING: possible circular locking dependency detected
> [   76.147316] 4.20.0-rc3+ #41 Not tainted
> [   76.147489] ------------------------------------------------------
> [   76.147722] btrfs/4065 is trying to acquire lock:
> [   76.147984] 0000000038593bc0 ((wq_completion)"%s-%s""btrfs",
> name){+.+.}, at: flush_workqueue+0x70/0x4d0
> [   76.148337]
> but task is already holding lock:
> [   76.148594] 0000000062392ab7 (&fs_info->scrub_lock){+.+.}, at:
> btrfs_scrub_dev+0x316/0x5d0 [btrfs]
> [   76.148909]
> which lock already depends on the new lock.
> 
> [   76.149191]
> the existing dependency chain (in reverse order) is:
> [   76.149446]
> -> #3 (&fs_info->scrub_lock){+.+.}:
> [   76.149707]        btrfs_scrub_dev+0x11f/0x5d0 [btrfs]
> [   76.149924]        btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
> [   76.150216]        do_vfs_ioctl+0xa9/0x6d0
> [   76.150468]        ksys_ioctl+0x60/0x90
> [   76.150716]        __x64_sys_ioctl+0x16/0x20
> [   76.150911]        do_syscall_64+0x50/0x180
> [   76.151182]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   76.151469]
> -> #2 (&fs_devs->device_list_mutex){+.+.}:
> [   76.151851]        reada_start_machine_worker+0xca/0x3f0 [btrfs]
> [   76.152195]        normal_work_helper+0xf0/0x4c0 [btrfs]
> [   76.152489]        process_one_work+0x1f4/0x520
> [   76.152751]        worker_thread+0x46/0x3d0
> [   76.153715]        kthread+0xf8/0x130
> [   76.153912]        ret_from_fork+0x3a/0x50
> [   76.154178]
> -> #1 ((work_completion)(&work->normal_work)){+.+.}:
> [   76.154575]        worker_thread+0x46/0x3d0
> [   76.154828]        kthread+0xf8/0x130
> [   76.155108]        ret_from_fork+0x3a/0x50
> [   76.155357]
> -> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}:
> [   76.155751]        flush_workqueue+0x9a/0x4d0
> [   76.155911]        drain_workqueue+0xca/0x1a0
> [   76.156182]        destroy_workqueue+0x17/0x230
> [   76.156455]        btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
> [   76.156756]        scrub_workers_put+0x2e/0x60 [btrfs]
> [   76.156931]        btrfs_scrub_dev+0x329/0x5d0 [btrfs]
> [   76.157219]        btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
> [   76.157491]        do_vfs_ioctl+0xa9/0x6d0
> [   76.157742]        ksys_ioctl+0x60/0x90
> [   76.157910]        __x64_sys_ioctl+0x16/0x20
> [   76.158177]        do_syscall_64+0x50/0x180
> [   76.158429]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   76.158716]
> other info that might help us debug this:
> 
> [   76.158908] Chain exists of:
>   (wq_completion)"%s-%s""btrfs", name --> &fs_devs->device_list_mutex
> --> &fs_info->scrub_lock
> 
> [   76.159629]  Possible unsafe locking scenario:
> 
> [   76.160607]        CPU0                    CPU1
> [   76.160934]        ----                    ----
> [   76.161210]   lock(&fs_info->scrub_lock);
> [   76.161458]
> lock(&fs_devs->device_list_mutex);
> [   76.161805]
> lock(&fs_info->scrub_lock);
> [   76.161909]   lock((wq_completion)"%s-%s""btrfs", name);
> [   76.162201]
>  *** DEADLOCK ***

Please next time make sure the log messages are copied without
whitespace damage. In many cases it's easy to glue the lines together,
but especially the above locking scenario needs to be exact as the
indentation matters.

I'll replace the whole report with the one in my logs as it does not
have the device_list_mutex in the locking chain. This would be confusing
as the problem with device_list_mutex was fixed already.

> 
> [   76.162627] 2 locks held by btrfs/4065:
> [   76.162897]  #0: 00000000bef2775b (sb_writers#12){.+.+}, at:
> mnt_want_write_file+0x24/0x50
> [   76.163335]  #1: 0000000062392ab7 (&fs_info->scrub_lock){+.+.}, at:
> btrfs_scrub_dev+0x316/0x5d0 [btrfs]
> [   76.163796]
> stack backtrace:
> [   76.163911] CPU: 1 PID: 4065 Comm: btrfs Not tainted 4.20.0-rc3+ #41
> [   76.164228] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS
> VirtualBox 12/01/2006
> [   76.164646] Call Trace:
> [   76.164872]  dump_stack+0x5e/0x8b
> [   76.165128]  print_circular_bug.isra.37+0x1f1/0x1fe
> [   76.165398]  __lock_acquire+0x14aa/0x1620
> [   76.165652]  lock_acquire+0xb0/0x190
> [   76.165910]  ? flush_workqueue+0x70/0x4d0
> [   76.166175]  flush_workqueue+0x9a/0x4d0
> [   76.166420]  ? flush_workqueue+0x70/0x4d0
> [   76.166671]  ? drain_workqueue+0x52/0x1a0
> [   76.166911]  drain_workqueue+0xca/0x1a0
> [   76.167167]  destroy_workqueue+0x17/0x230
> [   76.167428]  btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
> [   76.167720]  scrub_workers_put+0x2e/0x60 [btrfs]
> [   76.168233]  btrfs_scrub_dev+0x329/0x5d0 [btrfs]
> [   76.168504]  ? __sb_start_write+0x121/0x1b0
> [   76.168759]  ? mnt_want_write_file+0x24/0x50
> [   76.169654]  btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
> [   76.169934]  ? find_held_lock+0x2d/0x90
> [   76.170204]  ? find_held_lock+0x2d/0x90
> [   76.170450]  do_vfs_ioctl+0xa9/0x6d0
> [   76.170690]  ? __fget+0x101/0x1f0
> [   76.170910]  ? __fget+0x5/0x1f0
> [   76.171157]  ksys_ioctl+0x60/0x90
> [   76.171391]  __x64_sys_ioctl+0x16/0x20
> [   76.171634]  do_syscall_64+0x50/0x180
> [   76.171892]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   76.172186] RIP: 0033:0x7f61d422e567
> [   76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00
> 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48

This line can be omitted from the log unless we really want to decode
the instructions.
Anand Jain Feb. 13, 2019, 4:02 p.m. UTC | #5
On 2/13/19 12:45 AM, David Sterba wrote:
> On Wed, Jan 30, 2019 at 02:45:00PM +0800, Anand Jain wrote:
>> Circular locking dependency check reports warning[1], that's because
>> the btrfs_scrub_dev() calls the stack #0 below with, the
>> fs_info::scrub_lock held. The test case leading to this warning..
>>
>>    mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs
>>    btrfs scrub start -B /btrfs
>>
>> In fact we have fs_info::scrub_workers_refcnt to tack if the init and
>> destroy of the scrub workers are needed. So once we have incremented
>> and decremented the fs_info::scrub_workers_refcnt value in the thread,
>> its ok to drop the scrub_lock, and then actually do the
>> btrfs_destroy_workqueue() part. So this patch drops the scrub_lock
>> before calling btrfs_destroy_workqueue().
>>
>> [1]
>> [   76.146826] ======================================================
>> [   76.147086] WARNING: possible circular locking dependency detected
>> [   76.147316] 4.20.0-rc3+ #41 Not tainted
>> [   76.147489] ------------------------------------------------------
>> [   76.147722] btrfs/4065 is trying to acquire lock:
>> [   76.147984] 0000000038593bc0 ((wq_completion)"%s-%s""btrfs",
>> name){+.+.}, at: flush_workqueue+0x70/0x4d0
>> [   76.148337]
>> but task is already holding lock:
>> [   76.148594] 0000000062392ab7 (&fs_info->scrub_lock){+.+.}, at:
>> btrfs_scrub_dev+0x316/0x5d0 [btrfs]
>> [   76.148909]
>> which lock already depends on the new lock.
>>
>> [   76.149191]
>> the existing dependency chain (in reverse order) is:
>> [   76.149446]
>> -> #3 (&fs_info->scrub_lock){+.+.}:
>> [   76.149707]        btrfs_scrub_dev+0x11f/0x5d0 [btrfs]
>> [   76.149924]        btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
>> [   76.150216]        do_vfs_ioctl+0xa9/0x6d0
>> [   76.150468]        ksys_ioctl+0x60/0x90
>> [   76.150716]        __x64_sys_ioctl+0x16/0x20
>> [   76.150911]        do_syscall_64+0x50/0x180
>> [   76.151182]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [   76.151469]
>> -> #2 (&fs_devs->device_list_mutex){+.+.}:
>> [   76.151851]        reada_start_machine_worker+0xca/0x3f0 [btrfs]
>> [   76.152195]        normal_work_helper+0xf0/0x4c0 [btrfs]
>> [   76.152489]        process_one_work+0x1f4/0x520
>> [   76.152751]        worker_thread+0x46/0x3d0
>> [   76.153715]        kthread+0xf8/0x130
>> [   76.153912]        ret_from_fork+0x3a/0x50
>> [   76.154178]
>> -> #1 ((work_completion)(&work->normal_work)){+.+.}:
>> [   76.154575]        worker_thread+0x46/0x3d0
>> [   76.154828]        kthread+0xf8/0x130
>> [   76.155108]        ret_from_fork+0x3a/0x50
>> [   76.155357]
>> -> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}:
>> [   76.155751]        flush_workqueue+0x9a/0x4d0
>> [   76.155911]        drain_workqueue+0xca/0x1a0
>> [   76.156182]        destroy_workqueue+0x17/0x230
>> [   76.156455]        btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
>> [   76.156756]        scrub_workers_put+0x2e/0x60 [btrfs]
>> [   76.156931]        btrfs_scrub_dev+0x329/0x5d0 [btrfs]
>> [   76.157219]        btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
>> [   76.157491]        do_vfs_ioctl+0xa9/0x6d0
>> [   76.157742]        ksys_ioctl+0x60/0x90
>> [   76.157910]        __x64_sys_ioctl+0x16/0x20
>> [   76.158177]        do_syscall_64+0x50/0x180
>> [   76.158429]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [   76.158716]
>> other info that might help us debug this:
>>
>> [   76.158908] Chain exists of:
>>    (wq_completion)"%s-%s""btrfs", name --> &fs_devs->device_list_mutex
>> --> &fs_info->scrub_lock
>>
>> [   76.159629]  Possible unsafe locking scenario:
>>
>> [   76.160607]        CPU0                    CPU1
>> [   76.160934]        ----                    ----
>> [   76.161210]   lock(&fs_info->scrub_lock);
>> [   76.161458]
>> lock(&fs_devs->device_list_mutex);
>> [   76.161805]
>> lock(&fs_info->scrub_lock);
>> [   76.161909]   lock((wq_completion)"%s-%s""btrfs", name);
>> [   76.162201]
>>   *** DEADLOCK ***
> 
> Please next time make sure the log messages are copied without
> whitespace damage. In many cases it's easy to glue the lines together,
> but specially the above locking scenario needs to be exact as the
> indentaion matters.
> 
> I'll replace the whole report with the one in my logs as it does not
> have the device_list_mutex in the locking chain. This would be confusing
> as the problem with device_list_mutex was fixed already.

  ok. Thanks.

-Anand

>>
>> [   76.162627] 2 locks held by btrfs/4065:
>> [   76.162897]  #0: 00000000bef2775b (sb_writers#12){.+.+}, at:
>> mnt_want_write_file+0x24/0x50
>> [   76.163335]  #1: 0000000062392ab7 (&fs_info->scrub_lock){+.+.}, at:
>> btrfs_scrub_dev+0x316/0x5d0 [btrfs]
>> [   76.163796]
>> stack backtrace:
>> [   76.163911] CPU: 1 PID: 4065 Comm: btrfs Not tainted 4.20.0-rc3+ #41
>> [   76.164228] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS
>> VirtualBox 12/01/2006
>> [   76.164646] Call Trace:
>> [   76.164872]  dump_stack+0x5e/0x8b
>> [   76.165128]  print_circular_bug.isra.37+0x1f1/0x1fe
>> [   76.165398]  __lock_acquire+0x14aa/0x1620
>> [   76.165652]  lock_acquire+0xb0/0x190
>> [   76.165910]  ? flush_workqueue+0x70/0x4d0
>> [   76.166175]  flush_workqueue+0x9a/0x4d0
>> [   76.166420]  ? flush_workqueue+0x70/0x4d0
>> [   76.166671]  ? drain_workqueue+0x52/0x1a0
>> [   76.166911]  drain_workqueue+0xca/0x1a0
>> [   76.167167]  destroy_workqueue+0x17/0x230
>> [   76.167428]  btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
>> [   76.167720]  scrub_workers_put+0x2e/0x60 [btrfs]
>> [   76.168233]  btrfs_scrub_dev+0x329/0x5d0 [btrfs]
>> [   76.168504]  ? __sb_start_write+0x121/0x1b0
>> [   76.168759]  ? mnt_want_write_file+0x24/0x50
>> [   76.169654]  btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
>> [   76.169934]  ? find_held_lock+0x2d/0x90
>> [   76.170204]  ? find_held_lock+0x2d/0x90
>> [   76.170450]  do_vfs_ioctl+0xa9/0x6d0
>> [   76.170690]  ? __fget+0x101/0x1f0
>> [   76.170910]  ? __fget+0x5/0x1f0
>> [   76.171157]  ksys_ioctl+0x60/0x90
>> [   76.171391]  __x64_sys_ioctl+0x16/0x20
>> [   76.171634]  do_syscall_64+0x50/0x180
>> [   76.171892]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [   76.172186] RIP: 0033:0x7f61d422e567
>> [   76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00
>> 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48
> 
> This like can be ommitted from the log unless we really want to decode
> the instructions.
>

Patch

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 33f2793bdee0..f2f0be7864b8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3770,16 +3770,6 @@  static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	return -ENOMEM;
 }
 
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
-	if (--fs_info->scrub_workers_refcnt == 0) {
-		btrfs_destroy_workqueue(fs_info->scrub_workers);
-		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
-		btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
-	}
-	WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace)
@@ -3788,6 +3778,9 @@  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	int ret;
 	struct btrfs_device *dev;
 	unsigned int nofs_flag;
+	struct btrfs_workqueue *scrub_workers = NULL;
+	struct btrfs_workqueue *scrub_wr_comp = NULL;
+	struct btrfs_workqueue *scrub_parity = NULL;
 
 	if (btrfs_fs_closing(fs_info))
 		return -EINVAL;
@@ -3932,9 +3925,16 @@  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_ctx = NULL;
-	scrub_workers_put(fs_info);
+	if (--fs_info->scrub_workers_refcnt == 0) {
+		scrub_workers = fs_info->scrub_workers;
+		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+		scrub_parity = fs_info->scrub_parity_workers;
+	}
 	mutex_unlock(&fs_info->scrub_lock);
 
+	btrfs_destroy_workqueue(scrub_workers);
+	btrfs_destroy_workqueue(scrub_wr_comp);
+	btrfs_destroy_workqueue(scrub_parity);
 	scrub_put_ctx(sctx);
 
 	return ret;