btrfs: use nofs allocations for running delayed items
diff mbox series

Message ID 20200319141132.3127-1-josef@toxicpanda.com
State New
Headers show
Series
  • btrfs: use nofs allocations for running delayed items
Related show

Commit Message

Josef Bacik March 19, 2020, 2:11 p.m. UTC
Zygo reported the following lockdep splat while testing the balance
patches

======================================================
WARNING: possible circular locking dependency detected
5.6.0-c6f0579d496a+ #53 Not tainted
------------------------------------------------------
kswapd0/1133 is trying to acquire lock:
ffff888092f622c0 (&delayed_node->mutex){+.+.}, at: __btrfs_release_delayed_node+0x7c/0x5b0

but task is already holding lock:
ffffffff8fc5f860 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}:
       fs_reclaim_acquire.part.91+0x29/0x30
       fs_reclaim_acquire+0x19/0x20
       kmem_cache_alloc_trace+0x32/0x740
       add_block_entry+0x45/0x260
       btrfs_ref_tree_mod+0x6e2/0x8b0
       btrfs_alloc_tree_block+0x789/0x880
       alloc_tree_block_no_bg_flush+0xc6/0xf0
       __btrfs_cow_block+0x270/0x940
       btrfs_cow_block+0x1ba/0x3a0
       btrfs_search_slot+0x999/0x1030
       btrfs_insert_empty_items+0x81/0xe0
       btrfs_insert_delayed_items+0x128/0x7d0
       __btrfs_run_delayed_items+0xf4/0x2a0
       btrfs_run_delayed_items+0x13/0x20
       btrfs_commit_transaction+0x5cc/0x1390
       insert_balance_item.isra.39+0x6b2/0x6e0
       btrfs_balance+0x72d/0x18d0
       btrfs_ioctl_balance+0x3de/0x4c0
       btrfs_ioctl+0x30ab/0x44a0
       ksys_ioctl+0xa1/0xe0
       __x64_sys_ioctl+0x43/0x50
       do_syscall_64+0x77/0x2c0
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #0 (&delayed_node->mutex){+.+.}:
       __lock_acquire+0x197e/0x2550
       lock_acquire+0x103/0x220
       __mutex_lock+0x13d/0xce0
       mutex_lock_nested+0x1b/0x20
       __btrfs_release_delayed_node+0x7c/0x5b0
       btrfs_remove_delayed_node+0x49/0x50
       btrfs_evict_inode+0x6fc/0x900
       evict+0x19a/0x2c0
       dispose_list+0xa0/0xe0
       prune_icache_sb+0xbd/0xf0
       super_cache_scan+0x1b5/0x250
       do_shrink_slab+0x1f6/0x530
       shrink_slab+0x32e/0x410
       shrink_node+0x2a5/0xba0
       balance_pgdat+0x4bd/0x8a0
       kswapd+0x35a/0x800
       kthread+0x1e9/0x210
       ret_from_fork+0x3a/0x50

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(&delayed_node->mutex);
                               lock(fs_reclaim);
  lock(&delayed_node->mutex);

 *** DEADLOCK ***

3 locks held by kswapd0/1133:
 #0: ffffffff8fc5f860 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
 #1: ffffffff8fc380d8 (shrinker_rwsem){++++}, at: shrink_slab+0x1e8/0x410
 #2: ffff8881e0e6c0e8 (&type->s_umount_key#42){++++}, at: trylock_super+0x1b/0x70

stack backtrace:
CPU: 2 PID: 1133 Comm: kswapd0 Not tainted 5.6.0-c6f0579d496a+ #53
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
Call Trace:
 dump_stack+0xc1/0x11a
 print_circular_bug.isra.38.cold.57+0x145/0x14a
 check_noncircular+0x2a9/0x2f0
 ? print_circular_bug.isra.38+0x130/0x130
 ? stack_trace_consume_entry+0x90/0x90
 ? save_trace+0x3cc/0x420
 __lock_acquire+0x197e/0x2550
 ? btrfs_inode_clear_file_extent_range+0x9b/0xb0
 ? register_lock_class+0x960/0x960
 lock_acquire+0x103/0x220
 ? __btrfs_release_delayed_node+0x7c/0x5b0
 __mutex_lock+0x13d/0xce0
 ? __btrfs_release_delayed_node+0x7c/0x5b0
 ? __asan_loadN+0xf/0x20
 ? pvclock_clocksource_read+0xeb/0x190
 ? __btrfs_release_delayed_node+0x7c/0x5b0
 ? mutex_lock_io_nested+0xc20/0xc20
 ? __kasan_check_read+0x11/0x20
 ? check_chain_key+0x1e6/0x2e0
 mutex_lock_nested+0x1b/0x20
 ? mutex_lock_nested+0x1b/0x20
 __btrfs_release_delayed_node+0x7c/0x5b0
 btrfs_remove_delayed_node+0x49/0x50
 btrfs_evict_inode+0x6fc/0x900
 ? btrfs_setattr+0x840/0x840
 ? do_raw_spin_unlock+0xa8/0x140
 evict+0x19a/0x2c0
 dispose_list+0xa0/0xe0
 prune_icache_sb+0xbd/0xf0
 ? invalidate_inodes+0x310/0x310
 super_cache_scan+0x1b5/0x250
 do_shrink_slab+0x1f6/0x530
 shrink_slab+0x32e/0x410
 ? do_shrink_slab+0x530/0x530
 ? do_shrink_slab+0x530/0x530
 ? __kasan_check_read+0x11/0x20
 ? mem_cgroup_protected+0x13d/0x260
 shrink_node+0x2a5/0xba0
 balance_pgdat+0x4bd/0x8a0
 ? mem_cgroup_shrink_node+0x490/0x490
 ? _raw_spin_unlock_irq+0x27/0x40
 ? finish_task_switch+0xce/0x390
 ? rcu_read_lock_bh_held+0xb0/0xb0
 kswapd+0x35a/0x800
 ? _raw_spin_unlock_irqrestore+0x4c/0x60
 ? balance_pgdat+0x8a0/0x8a0
 ? finish_wait+0x110/0x110
 ? __kasan_check_read+0x11/0x20
 ? __kthread_parkme+0xc6/0xe0
 ? balance_pgdat+0x8a0/0x8a0
 kthread+0x1e9/0x210
 ? kthread_create_worker_on_cpu+0xc0/0xc0
 ret_from_fork+0x3a/0x50

This is because we hold that delayed node's mutex while doing tree
operations.  Fix this by just wrapping the searches in nofs.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/delayed-inode.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

Comments

David Sterba March 19, 2020, 2:34 p.m. UTC | #1
On Thu, Mar 19, 2020 at 10:11:32AM -0400, Josef Bacik wrote:
> This is because we hold that delayed node's mutex while doing tree
> operations.  Fix this by just wrapping the searches in nofs.

For the time being we have to do the explicit NOFS so in the code it's
a bit awkward. The hint is a function that takes transaction as
argument.

I'm working on the scope NOFS (marked by the transaction start/end), it's
intrusive, all over the code and there are some cases when I want to add
assertions that turns out to be tricky for some functions.
Josef Bacik March 19, 2020, 2:51 p.m. UTC | #2
On 3/19/20 10:34 AM, David Sterba wrote:
> On Thu, Mar 19, 2020 at 10:11:32AM -0400, Josef Bacik wrote:
>> This is because we hold that delayed node's mutex while doing tree
>> operations.  Fix this by just wrapping the searches in nofs.
> 
> For the time being we have to do the explicit NOFS so in the code it's
> a bit awkward. The hint is a function that takes transaction as
> argument.
> 
> I'm working on the scope NOFS (marked by the transaction start/end), it's
> intrusive, all over the code and there are some cases when I want to add
> assertions that turns out to be tricky for some functions.
> 

That could be cleaner, do you want me to drop this and just add a

if (trans)
	memalloc_nofs_save()

memalloc_nofs_restore()

in btrfs_search_slot?  That'll catch all of these usages.  I'm not married to 
any particular approach.  Thanks,

Josef
David Sterba March 19, 2020, 4:25 p.m. UTC | #3
On Thu, Mar 19, 2020 at 10:51:15AM -0400, Josef Bacik wrote:
> On 3/19/20 10:34 AM, David Sterba wrote:
> > On Thu, Mar 19, 2020 at 10:11:32AM -0400, Josef Bacik wrote:
> >> This is because we hold that delayed node's mutex while doing tree
> >> operations.  Fix this by just wrapping the searches in nofs.
> > 
> > For the time being we have to do the explicit NOFS so in the code it's
> > a bit awkward. The hint is a function that takes transaction as
> > argument.
> > 
> > I'm working on the scope NOFS (marked by the transaction start/end), it's
> > intrusive, all over the code and there are some cases when I want to add
> > assertions that turns out to be tricky for some functions.
> > 
> 
> That could be cleaner, do you want me to drop this and just add a
> 
> if (trans)
> 	memalloc_nofs_save()
> 
> memalloc_nofs_restore()
> 
> in btrfs_search_slot?  That'll catch all of these usages.  I'm not married to 
> any particular approach.  Thanks,

Not all I think but I haven't looked closer. Maybe the search slot is
used in majority of cases anyway. The outer scope between
memalloc_nofs_save/memalloc_nofs_restored has been used so far and it's
obvious where it starts and ends so even if it's not pretty let's use it
for now.
David Sterba March 25, 2020, 1:58 p.m. UTC | #4
On Thu, Mar 19, 2020 at 10:11:32AM -0400, Josef Bacik wrote:
[...]
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Added to misc-next, thanks.

Patch
diff mbox series

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ee70584ddc45..14de5b72e57a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,6 +6,7 @@ 
 
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include <linux/sched/mm.h>
 #include "misc.h"
 #include "delayed-inode.h"
 #include "disk-io.h"
@@ -805,11 +806,14 @@  static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
+	unsigned int nofs_flag;
 	char *ptr;
 	int ret;
 
+	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
 				      delayed_item->data_len);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0 && ret != -EEXIST)
 		return ret;
 
@@ -937,6 +941,7 @@  static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
 				      struct btrfs_delayed_node *node)
 {
 	struct btrfs_delayed_item *curr, *prev;
+	unsigned int nofs_flag;
 	int ret = 0;
 
 do_again:
@@ -945,7 +950,9 @@  static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
 	if (!curr)
 		goto delete_fail;
 
+	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0)
 		goto delete_fail;
 	else if (ret > 0) {
@@ -1012,6 +1019,7 @@  static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
+	unsigned int nofs_flag;
 	int mod;
 	int ret;
 
@@ -1024,7 +1032,9 @@  static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 	else
 		mod = 1;
 
+	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret > 0) {
 		btrfs_release_path(path);
 		return -ENOENT;
@@ -1075,7 +1085,10 @@  static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 
 	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = -1;
+
+	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0)
 		goto err_out;
 	ASSERT(ret);