diff mbox series

[fuse] alloc_page nofs avoid deadlock

Message ID 20210603125242.31699-1-chenguanyou@xiaomi.com (mailing list archive)
State New
Headers show
Series [fuse] alloc_page nofs avoid deadlock | expand

Commit Message

chenguanyou June 3, 2021, 12:52 p.m. UTC
ABA deadlock

PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
1 [ffffff802d16b470] __schedule at ffffff80091ffe58
2 [ffffff802d16b4d0] schedule at ffffff8009200348
3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
5 [ffffff802d16b5b0] inode_wait_for_writeback at ffffff800830e1e8
6 [ffffff802d16b5e0] evict at ffffff80082fb15c
7 [ffffff802d16b620] iput at ffffff80082f9270
8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
13 [ffffff802d16b860] shrink_slab at ffffff8008266170
14 [ffffff802d16b900] shrink_node at ffffff800826b420
15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
17 [ffffff802d16bbe0] __alloc_pages_nodemask at ffffff8008256514
18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
21 [ffffff802d16be60] sys_splice at ffffff8008315d18
22 [ffffff802d16bff0] __sys_trace at ffffff8008084014

PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND: "kworker/u16:8"
0 [ffffff802e793650] __switch_to at ffffff8008086a4c
1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
2 [ffffff802e793720] schedule at ffffff8009200348
3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
7 [ffffff802e793980] __writeback_single_inode at ffffff8008312740
8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
13 [ffffff802e793e00] worker_thread at ffffff80080e5670
14 [ffffff802e793e60] kthread at ffffff80080eb650

Signed-off-by: chenguanyou <chenguanyou@xiaomi.com>
---
 fs/fuse/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Comments

chenguanyou June 8, 2021, 2:54 p.m. UTC | #1
ABCA deadlock

kswapd0 D 0 159 2 0x00000000
Call trace:
__switch_to+0x134/0x150
__schedule+0x12ac/0x172c
schedule+0x70/0x90
bit_wait+0x14/0x54
__wait_on_bit+0x74/0xe0
inode_wait_for_writeback+0xa0/0xe4
evict+0xa4/0x298
iput+0x33c/0x38c
dentry_unlink_inode+0xd8/0xe4
__dentry_kill+0xe8/0x22c
shrink_dentry_list+0x1e8/0x4f0
prune_dcache_sb+0x54/0x80
super_cache_scan+0x114/0x164
shrink_slab+0x5f8/0x708
shrink_node+0x144/0x318
kswapd+0xa10/0xc24
kthread+0x124/0x134
ret_from_fork+0x10/0x18

 
Thread-5 D 0 3396 698 0x01000808
Call trace:
__switch_to+0x134/0x150
__schedule+0x12ac/0x172c
schedule+0x70/0x90
try_to_free_pages+0x280/0x67c
__alloc_pages_nodemask+0x918/0x145c
fuse_copy_fill+0x15c/0x210
fuse_dev_do_read+0x4e8/0xcd4
fuse_dev_splice_read+0x84/0x1d8
SyS_splice+0x6ac/0x8fc
__sys_trace_return+0x0/0x4


u:r:kernel:s0 root 159 159 2 0 0 inode_wait_for_writeback 0 D 19 0 - 0 fg 6 [kswapd0] kswapd0
u:r:kernel:s0 root 25798 25798 2 0 0 __fuse_request_send 0 S 19 0 - 0 fg 2 [kworker/u16:0] kworker/u16:0
u:r:mediaprovider_app:s0:c203,c256,c512,c768 u0_a203 3254 3396 698 5736296 62012 try_to_free_pages 0 D 19 0 - 0 ta 4 com.android.providers.media.module Thread-5
Miklos Szeredi June 8, 2021, 3:30 p.m. UTC | #2
On Thu, 3 Jun 2021 at 14:52, chenguanyou <chenguanyou9338@gmail.com> wrote:
>
> ABA deadlock
>
> PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
> 0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
> 1 [ffffff802d16b470] __schedule at ffffff80091ffe58
> 2 [ffffff802d16b4d0] schedule at ffffff8009200348
> 3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
> 4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
> 5 [ffffff802d16b5b0] inode_wait_for_writeback at ffffff800830e1e8
> 6 [ffffff802d16b5e0] evict at ffffff80082fb15c
> 7 [ffffff802d16b620] iput at ffffff80082f9270
> 8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
> 9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
> 10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
> 11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
> 12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
> 13 [ffffff802d16b860] shrink_slab at ffffff8008266170
> 14 [ffffff802d16b900] shrink_node at ffffff800826b420
> 15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
> 16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
> 17 [ffffff802d16bbe0] __alloc_pages_nodemask at ffffff8008256514
> 18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
> 19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
> 20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
> 21 [ffffff802d16be60] sys_splice at ffffff8008315d18
> 22 [ffffff802d16bff0] __sys_trace at ffffff8008084014
>
> PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND: "kworker/u16:8"
> 0 [ffffff802e793650] __switch_to at ffffff8008086a4c
> 1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
> 2 [ffffff802e793720] schedule at ffffff8009200348
> 3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
> 4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
> 5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
> 6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
> 7 [ffffff802e793980] __writeback_single_inode at ffffff8008312740
> 8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
> 9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
> 10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
> 11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
> 12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
> 13 [ffffff802e793e00] worker_thread at ffffff80080e5670
> 14 [ffffff802e793e60] kthread at ffffff80080eb650

The issue is real.

The fix, however, is not the right one.  The fundamental problem is
that fuse_write_inode() blocks on a request to userspace.

This is the same issue that fuse_writepage/fuse_writepages face.  In
that case the solution was to copy the page contents to a temporary
buffer and return immediately as if the writeback already completed.

Something similar needs to be done here: send the FUSE_SETATTR request
asynchronously and return immediately from fuse_write_inode().  The
tricky part is to make sure that multiple time updates for the same
inode aren't mixed up...

Thanks,
Miklos
chenguanyou June 11, 2021, 12:14 p.m. UTC | #3
PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND: "kworker/u16:8"
#0 [ffffff802e793650] __switch_to at ffffff8008086a4c
#1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
#2 [ffffff802e793720] schedule at ffffff8009200348
#3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
#4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
#5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
#6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
#7 [ffffff802e793980] __writeback_single_inode at ffffff8008312740
#8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
#9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
#10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
#11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
#12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
#13 [ffffff802e793e00] worker_thread at ffffff80080e5670
#14 [ffffff802e793e60] kthread at ffffff80080eb650

crash> p __writeback_single_inode
__writeback_single_inode = $119 =
{int (struct inode *, struct writeback_control *)} 0xffffff8008312040 <__writeback_single_inode>

0xffffff8008312040 <__writeback_single_inode>: stp x29, x30, [sp,#-96]!
0xffffff8008312044 <__writeback_single_inode+4>: stp x28, x27, [sp,#16]
0xffffff8008312048 <__writeback_single_inode+8>: stp x26, x25, [sp,#32]
0xffffff800831204c <__writeback_single_inode+12>: stp x24, x23, [sp,#48]
0xffffff8008312050 <__writeback_single_inode+16>: stp x22, x21, [sp,#64]
0xffffff8008312054 <__writeback_single_inode+20>: stp x20, x19, [sp,#80]
0xffffff8008312058 <__writeback_single_inode+24>: mov x29, sp

0xffffff8008311374 <writeback_sb_inodes>: sub sp, sp, #0x120
0xffffff8008311378 <writeback_sb_inodes+4>: stp x29, x30, [sp,#192]
0xffffff800831137c <writeback_sb_inodes+8>: stp x28, x27, [sp,#208]
0xffffff8008311380 <writeback_sb_inodes+12>: stp x26, x25, [sp,#224]
0xffffff8008311384 <writeback_sb_inodes+16>: stp x24, x23, [sp,#240]
0xffffff8008311388 <writeback_sb_inodes+20>: stp x22, x21, [sp,#256]
0xffffff800831138c <writeback_sb_inodes+24>: stp x20, x19, [sp,#272]
0xffffff8008311390 <writeback_sb_inodes+28>: add x29, sp, #0xc0

0xffffff80083117d8 <writeback_sb_inodes+1124>: add x1, sp, #0x60
0xffffff80083117dc <writeback_sb_inodes+1128>: mov x0, x27
0xffffff80083117e0 <writeback_sb_inodes+1132>: stp x21, xzr, [sp,#96]
0xffffff80083117e4 <writeback_sb_inodes+1136>: bl 0xffffff8008312040 <__writeback_single_inode>

x27 = inode,sp + 0x60 = writeback_control

#7 [ffffff802e793980] __writeback_single_inode at ffffff8008312740
ffffff802e793980: ffffff802e793aa0 ffffff80083117e8
ffffff802e793990: ffffffc0791dd1c8 ffffffc0791dd140

so : #7 [ffffff802e793980] __writeback_single_inode(0xffffffc0791dd140, 0xffffff802e793a40) at ffffff8008312740

crash> p __fuse_request_send
__fuse_request_send = $124 =
{void (struct fuse_conn *, struct fuse_req *)} 0xffffff8008435604 <__fuse_request_send>

0xffffff8008435604 <__fuse_request_send>: sub sp, sp, #0x70
0xffffff8008435608 <__fuse_request_send+4>: stp x29, x30, [sp,#48]
0xffffff800843560c <__fuse_request_send+8>: stp x24, x23, [sp,#64]
0xffffff8008435610 <__fuse_request_send+12>: stp x22, x21, [sp,#80]
0xffffff8008435614 <__fuse_request_send+16>: stp x20, x19, [sp,#96]
0xffffff8008435618 <__fuse_request_send+20>: add x29, sp, #0x3

0xffffff8008435b0c <fuse_simple_request+368>: mov x0, x19
0xffffff8008435b10 <fuse_simple_request+372>: mov x1, x20
0xffffff8008435b14 <fuse_simple_request+376>: bl 0xffffff8008435604 <__fuse_request_send>

x19 = fuse_conn,x20 = fuse_req

#3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
ffffff802e793770: ffffff802e7937b0 ffffff8008435b18
ffffff802e793780: ffffffc0791dd1c8 ffffffc0c9ce0000
ffffff802e793790: ffffffc0c6feb1f8 ffffff802e7938b8
ffffff802e7937a0: ffffffc0c6feb148 ffffffc0901f2f80

#3 [ffffff802e793770] __fuse_request_send(0xffffffc0901f2f80, 0xffffffc0c6feb148) at ffffff8008435760

crash> struct -x fuse_req.flags ffffffc0c6feb148
flags = 0x9  = 0000 0000 1001  ===>> FR_WAITING | FR_ISREPLY

--------------------------------------------------------------------------------------------------------------------

PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
#0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
#1 [ffffff802d16b470] __schedule at ffffff80091ffe58
#2 [ffffff802d16b4d0] schedule at ffffff8009200348
#3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
#4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
#5 [ffffff802d16b5b0] inode_wait_for_writeback at ffffff800830e1e8
#6 [ffffff802d16b5e0] evict at ffffff80082fb15c
#7 [ffffff802d16b620] iput at ffffff80082f9270
#8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
#9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
#10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
#11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
#12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
#13 [ffffff802d16b860] shrink_slab at ffffff8008266170
#14 [ffffff802d16b900] shrink_node at ffffff800826b420
#15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
#16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
#17 [ffffff802d16bbe0] __alloc_pages_nodemask at ffffff8008256514
#18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
#19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
#20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
#21 [ffffff802d16be60] sys_splice at ffffff8008315d18
#22 [ffffff802d16bff0] __sys_trace at ffffff8008084014

crash> p fuse_dev_do_read
fuse_dev_do_read = $259 = 
 {ssize_t (struct fuse_dev *, struct file *, struct fuse_copy_state *, size_t)} 0xffffff8008437170 <fuse_dev_do_read>

0xffffff8008437650 <fuse_dev_do_read+1248>: mov x0, x19
0xffffff8008437654 <fuse_dev_do_read+1252>: bl 0xffffff8008438110 <fuse_copy_fill>

0xffffff8008438110 <fuse_copy_fill>: sub sp, sp, #0x50
0xffffff8008438114 <fuse_copy_fill+4>: stp x29, x30, [sp,#32]
0xffffff8008438118 <fuse_copy_fill+8>: str x21, [sp,#48]
0xffffff800843811c <fuse_copy_fill+12>: stp x20, x19, [sp,#64]
0xffffff8008438120 <fuse_copy_fill+16>: add x29, sp, #0x20

#18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
ffffff802d16bc60: ffffff802d16bd00 ffffff8008437658
ffffff802d16bc70: 0000000000000028 ffffff8008437620
ffffff802d16bc80: ffffffc0901f2f80 ffffff802d16bd68   // fuse_copy_state

#4 [ffffff802d16bd00] fuse_dev_do_read(ffffffc0af775e00, ffffffc07ab65340, ffffff802d16bd68 ,0000000000021000) at ffffff8008437654

crash> struct -x fuse_copy_state ffffff802d16bd68
struct fuse_copy_state {
write = 0x1,
req = 0xffffffc0c6feb148,   // #3 [ffffff802e793770] __fuse_request_send(ffffffc0901f2f80, ffffffc0c6feb148) at ffffff8008435760
iter = 0x0, 
pipebufs = 0xffffffc0d1dd2000,
currbuf = 0x0,
pipe = 0xffffffc015bd7440,
nr_segs = 0x0,
pg = 0x0,
len = 0x0,
offset = 0x0,
move_pages = 0x0
}

0xffffff80082fb158 <evict+156>: mov x0, x19
0xffffff80082fb15c <evict+160>: bl 0xffffff800830e14c <inode_wait_for_writeback>

crash> p inode_wait_for_writeback
inode_wait_for_writeback = $107 =
{void (struct inode *)} 0xffffff800830e14c <inode_wait_for_writeback>

x19 = inode

crash> dis inode_wait_for_writeback
0xffffff800830e14c <inode_wait_for_writeback>: sub sp, sp, #0x80
0xffffff800830e150 <inode_wait_for_writeback+4>: stp x29, x30, [sp,#80]
0xffffff800830e154 <inode_wait_for_writeback+8>: stp x22, x21, [sp,#96]
0xffffff800830e158 <inode_wait_for_writeback+12>: stp x20, x19, [sp,#112]
0xffffff800830e15c <inode_wait_for_writeback+16>: add x29, sp, #0x50

#5 [ffffff802d16b5b0] inode_wait_for_writeback at ffffff800830e1e8
x19 = sp + 0x78 ===>>  x29 + 0x28 = ffffff802d16b5b0 + 0x28 = ffffff802d16b5d8

crash> rd ffffff802d16b5d8
ffffff802d16b5d8: ffffffc0791dd140 @..y.

crash> struct -x inode.i_state ffffffc0791dd140
i_state = 0xa0  = 1010 0000 = I_SYNC | I_FREEING

--------------------------------------------------------------------------------------------------------------------

                 9652                                                   17172
__fuse_request_send(ffffffc0901f2f80, ffffffc0c6feb148)
               ...                                              fuse_dev_do_read
	   	request_wait_answer   <<=========================				...
														|	|=> inode_wait_for_writeback
-----------------------------------------------------------------------------------------
 inode_sync_complete(0xffffffc0791dd140) ===============|==>| wake up 17172		
                                                        |
                                                        |<<== request_end(xxx, 0xffffffc0c6feb148) wake up 9652


static unsigned long super_cache_scan(struct shrinker *shrink,
                      struct shrink_control *sc)
{
    ...
    /*
     * Deadlock avoidance.  We may hold various FS locks, and we don't want
     * to recurse into the FS that called us in clear_inode() and friends..
     */
    if (!(sc->gfp_mask & __GFP_FS))   // page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
        return SHRINK_STOP;
	...
}

static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                    nodemask_t *nodemask)
{
	...
    /*
     * If the caller cannot enter the filesystem, it's possible that it
     * is due to the caller holding an FS lock or performing a journal
     * transaction in the case of a filesystem like ext[3|4]. In this case,
     * it is not safe to block on pfmemalloc_wait as kswapd could be
     * blocked waiting on the same lock. Instead, throttle for up to a
     * second before continuing.
     */
    if (!(gfp_mask & __GFP_FS)) {
        wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
            allow_direct_reclaim(pgdat), HZ);

        goto check_pending;
    }

    /* Throttle until kswapd wakes the process */
    wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
        allow_direct_reclaim(pgdat));
	...
}

The fix will cause other problems?

Thanks,
Guanyou.Chen
Ed Tsai July 13, 2021, 2:42 a.m. UTC | #4
On Tue, 2021-06-08 at 17:30 +0200, Miklos Szeredi wrote:
> On Thu, 3 Jun 2021 at 14:52, chenguanyou <chenguanyou9338@gmail.com>
> wrote:
> > 
> > ABA deadlock
> > 
> > PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
> > 0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
> > 1 [ffffff802d16b470] __schedule at ffffff80091ffe58
> > 2 [ffffff802d16b4d0] schedule at ffffff8009200348
> > 3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
> > 4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
> > 5 [ffffff802d16b5b0] inode_wait_for_writeback at ffffff800830e1e8
> > 6 [ffffff802d16b5e0] evict at ffffff80082fb15c
> > 7 [ffffff802d16b620] iput at ffffff80082f9270
> > 8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
> > 9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
> > 10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
> > 11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
> > 12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
> > 13 [ffffff802d16b860] shrink_slab at ffffff8008266170
> > 14 [ffffff802d16b900] shrink_node at ffffff800826b420
> > 15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
> > 16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
> > 17 [ffffff802d16bbe0] __alloc_pages_nodemask at ffffff8008256514
> > 18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
> > 19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
> > 20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
> > 21 [ffffff802d16be60] sys_splice at ffffff8008315d18
> > 22 [ffffff802d16bff0] __sys_trace at ffffff8008084014
> > 
> > PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND: "kworker/u16:8"
> > 0 [ffffff802e793650] __switch_to at ffffff8008086a4c
> > 1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
> > 2 [ffffff802e793720] schedule at ffffff8009200348
> > 3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
> > 4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
> > 5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
> > 6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
> > 7 [ffffff802e793980] __writeback_single_inode at ffffff8008312740
> > 8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
> > 9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
> > 10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
> > 11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
> > 12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
> > 13 [ffffff802e793e00] worker_thread at ffffff80080e5670
> > 14 [ffffff802e793e60] kthread at ffffff80080eb650
> 
> The issue is real.
> 
> The fix, however, is not the right one.  The fundamental problem is
> that fuse_write_inode() blocks on a request to userspace.
> 
> This is the same issue that fuse_writepage/fuse_writepages face.  In
> that case the solution was to copy the page contents to a temporary
> buffer and return immediately as if the writeback already completed.
> 
> Something similar needs to be done here: send the FUSE_SETATTR
> request
> asynchronously and return immediately from fuse_write_inode().  The
> tricky part is to make sure that multiple time updates for the same
> inode aren't mixed up...
> 
> Thanks,
> Miklos

Dear Szeredi,

Writeback thread calls fuse_write_inode() and wait for user Daemon to
complete this write inode request. The user daemon will alloc_page()
after taking this request, and a deadlock could happen when we try to
shrink dentry list under memory pressure.

We (Mediatek) are glad to work on this issue for mainline and also LTS.
Another constraint is that we should not change the protocol or
features for a stable kernel.

Using GFP_NOFS | __GFP_HIGHMEM can really avoid this by skipping the
dentry shrinker. It works but degrades the alloc_page success rate. In
a more fundamental way, we could cache the contents and return
immediately. But how do we ensure the request will be done
successfully, e.g., always retry if it fails from the daemon?
Miklos Szeredi Aug. 18, 2021, 9:24 a.m. UTC | #5
On Tue, 13 Jul 2021 at 04:42, Ed Tsai <ed.tsai@mediatek.com> wrote:
>
> On Tue, 2021-06-08 at 17:30 +0200, Miklos Szeredi wrote:
> > On Thu, 3 Jun 2021 at 14:52, chenguanyou <chenguanyou9338@gmail.com>
> > wrote:
> > >
> > > ABA deadlock
> > >
> > > PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
> > > 0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
> > > 1 [ffffff802d16b470] __schedule at ffffff80091ffe58
> > > 2 [ffffff802d16b4d0] schedule at ffffff8009200348
> > > 3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
> > > 4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
> > > 5 [ffffff802d16b5b0] inode_wait_for_writeback at ffffff800830e1e8
> > > 6 [ffffff802d16b5e0] evict at ffffff80082fb15c
> > > 7 [ffffff802d16b620] iput at ffffff80082f9270
> > > 8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
> > > 9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
> > > 10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
> > > 11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
> > > 12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
> > > 13 [ffffff802d16b860] shrink_slab at ffffff8008266170
> > > 14 [ffffff802d16b900] shrink_node at ffffff800826b420
> > > 15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
> > > 16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
> > > 17 [ffffff802d16bbe0] __alloc_pages_nodemask at ffffff8008256514
> > > 18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
> > > 19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
> > > 20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
> > > 21 [ffffff802d16be60] sys_splice at ffffff8008315d18
> > > 22 [ffffff802d16bff0] __sys_trace at ffffff8008084014
> > >
> > > PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND: "kworker/u16:8"
> > > 0 [ffffff802e793650] __switch_to at ffffff8008086a4c
> > > 1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
> > > 2 [ffffff802e793720] schedule at ffffff8009200348
> > > 3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
> > > 4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
> > > 5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
> > > 6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
> > > 7 [ffffff802e793980] __writeback_single_inode at ffffff8008312740
> > > 8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
> > > 9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
> > > 10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
> > > 11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
> > > 12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
> > > 13 [ffffff802e793e00] worker_thread at ffffff80080e5670
> > > 14 [ffffff802e793e60] kthread at ffffff80080eb650
> >
> > The issue is real.
> >
> > The fix, however, is not the right one.  The fundamental problem is
> > that fuse_write_inode() blocks on a request to userspace.
> >
> > This is the same issue that fuse_writepage/fuse_writepages face.  In
> > that case the solution was to copy the page contents to a temporary
> > buffer and return immediately as if the writeback already completed.
> >
> > Something similar needs to be done here: send the FUSE_SETATTR
> > request
> > asynchronously and return immediately from fuse_write_inode().  The
> > tricky part is to make sure that multiple time updates for the same
> > inode aren't mixed up...
> >
> > Thanks,
> > Miklos
>
> Dear Szeredi,
>
> Writeback thread calls fuse_write_inode() and wait for user Daemon to
> complete this write inode request. The user daemon will alloc_page()
> after taking this request, and a deadlock could happen when we try to
> shrink dentry list under memory pressure.
>
> We (Mediatek) are glad to work on this issue for mainline and also
> LTS. Another constraint is that we should not change the protocol or
> features for a stable kernel.
>
> Using GFP_NOFS | __GFP_HIGHMEM can really avoid this by skipping the
> dentry shrinker. It works but degrades the alloc_page success rate. In
> a more fundamental way, we could cache the contents and return
> immediately. But how do we ensure the request will be done
> successfully, e.g., always retry if it fails from the daemon?

The key is where the dirty metadata is flushed.  To prevent deadlock
it must not be flushed from memory reclaim, so we must make sure that
it is flushed on close(2) and munmap(2) and not dirtied after that.

I'm working on this currently and hope to get it ready for the next
merge window.

Thanks,
Miklos
Ed Tsai Sept. 24, 2021, 3:52 a.m. UTC | #6
On Wed, 2021-08-18 at 17:24 +0800, Miklos Szeredi wrote:
> On Tue, 13 Jul 2021 at 04:42, Ed Tsai <ed.tsai@mediatek.com> wrote:
> > 
> > On Tue, 2021-06-08 at 17:30 +0200, Miklos Szeredi wrote:
> > > On Thu, 3 Jun 2021 at 14:52, chenguanyou <
> > > chenguanyou9338@gmail.com>
> > > wrote:
> > > > 
> > > > ABA deadlock
> > > > 
> > > > PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
> > > > 0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
> > > > 1 [ffffff802d16b470] __schedule at ffffff80091ffe58
> > > > 2 [ffffff802d16b4d0] schedule at ffffff8009200348
> > > > 3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
> > > > 4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
> > > > 5 [ffffff802d16b5b0] inode_wait_for_writeback at
> > > > ffffff800830e1e8
> > > > 6 [ffffff802d16b5e0] evict at ffffff80082fb15c
> > > > 7 [ffffff802d16b620] iput at ffffff80082f9270
> > > > 8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
> > > > 9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
> > > > 10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
> > > > 11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
> > > > 12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
> > > > 13 [ffffff802d16b860] shrink_slab at ffffff8008266170
> > > > 14 [ffffff802d16b900] shrink_node at ffffff800826b420
> > > > 15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
> > > > 16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
> > > > 17 [ffffff802d16bbe0] __alloc_pages_nodemask at
> > > > ffffff8008256514
> > > > 18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
> > > > 19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
> > > > 20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
> > > > 21 [ffffff802d16be60] sys_splice at ffffff8008315d18
> > > > 22 [ffffff802d16bff0] __sys_trace at ffffff8008084014
> > > > 
> > > > PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND:
> > > > "kworker/u16:8"
> > > > 0 [ffffff802e793650] __switch_to at ffffff8008086a4c
> > > > 1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
> > > > 2 [ffffff802e793720] schedule at ffffff8009200348
> > > > 3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
> > > > 4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
> > > > 5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
> > > > 6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
> > > > 7 [ffffff802e793980] __writeback_single_inode at
> > > > ffffff8008312740
> > > > 8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
> > > > 9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
> > > > 10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
> > > > 11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
> > > > 12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
> > > > 13 [ffffff802e793e00] worker_thread at ffffff80080e5670
> > > > 14 [ffffff802e793e60] kthread at ffffff80080eb650
> > > 
> > > The issue is real.
> > > 
> > > The fix, however, is not the right one.  The fundamental problem
> > > is
> > > that fuse_write_inode() blocks on a request to userspace.
> > > 
> > > This is the same issue that fuse_writepage/fuse_writepages
> > > face.  In
> > > that case the solution was to copy the page contents to a
> > > temporary
> > > buffer and return immediately as if the writeback already
> > > completed.
> > > 
> > > Something similar needs to be done here: send the FUSE_SETATTR
> > > request
> > > asynchronously and return immediately from
> > > fuse_write_inode().  The
> > > tricky part is to make sure that multiple time updates for the
> > > same
> > > inode aren't mixed up...
> > > 
> > > Thanks,
> > > Miklos
> > 
> > Dear Szeredi,
> > 
> > Writeback thread calls fuse_write_inode() and wait for user Daemon
> > to
> > complete this write inode request. The user daemon will
> > alloc_page()
> > after taking this request, and a deadlock could happen when we try
> > to
> > shrink dentry list under memory pressure.
> > 
> > We (Mediatek) are glad to work on this issue for mainline and also
> > LTS. Another constraint is that we should not change the protocol
> > or features for a stable kernel.
> > 
> > Using GFP_NOFS | __GFP_HIGHMEM can really avoid this by skipping
> > the dentry shrinker. It works but degrades the alloc_page success
> > rate. In a more fundamental way, we could cache the contents and
> > return immediately. But how do we ensure the request will be done
> > successfully, e.g., always retry if it fails from the daemon?
> 
> The key is where the dirty metadata is flushed.  To prevent deadlock
> it must not be flushed from memory reclaim, so we must make sure that
> it is flushed on close(2) and munmap(2) and not dirtied after that.
> 
> I'm working on this currently and hope to get it ready for the next
> merge window.
> 
> Thanks,
> Miklos

Hi Miklos,

I'm not sure whether it has already been resolved in mainline.
If it still WIP, please cc me on future emails.

Best regards,
Ed Tsai
Miklos Szeredi Sept. 24, 2021, 7:52 a.m. UTC | #7
On Fri, 24 Sept 2021 at 05:52, Ed Tsai <ed.tsai@mediatek.com> wrote:
>
> On Wed, 2021-08-18 at 17:24 +0800, Miklos Szeredi wrote:
> > On Tue, 13 Jul 2021 at 04:42, Ed Tsai <ed.tsai@mediatek.com> wrote:
> > >
> > > On Tue, 2021-06-08 at 17:30 +0200, Miklos Szeredi wrote:
> > > > On Thu, 3 Jun 2021 at 14:52, chenguanyou <
> > > > chenguanyou9338@gmail.com>
> > > > wrote:
> > > > >
> > > > > ABA deadlock
> > > > >
> > > > > PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
> > > > > 0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
> > > > > 1 [ffffff802d16b470] __schedule at ffffff80091ffe58
> > > > > 2 [ffffff802d16b4d0] schedule at ffffff8009200348
> > > > > 3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
> > > > > 4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
> > > > > 5 [ffffff802d16b5b0] inode_wait_for_writeback at
> > > > > ffffff800830e1e8
> > > > > 6 [ffffff802d16b5e0] evict at ffffff80082fb15c
> > > > > 7 [ffffff802d16b620] iput at ffffff80082f9270
> > > > > 8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
> > > > > 9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
> > > > > 10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
> > > > > 11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
> > > > > 12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
> > > > > 13 [ffffff802d16b860] shrink_slab at ffffff8008266170
> > > > > 14 [ffffff802d16b900] shrink_node at ffffff800826b420
> > > > > 15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
> > > > > 16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
> > > > > 17 [ffffff802d16bbe0] __alloc_pages_nodemask at
> > > > > ffffff8008256514
> > > > > 18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
> > > > > 19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
> > > > > 20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
> > > > > 21 [ffffff802d16be60] sys_splice at ffffff8008315d18
> > > > > 22 [ffffff802d16bff0] __sys_trace at ffffff8008084014
> > > > >
> > > > > PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND:
> > > > > "kworker/u16:8"
> > > > > 0 [ffffff802e793650] __switch_to at ffffff8008086a4c
> > > > > 1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
> > > > > 2 [ffffff802e793720] schedule at ffffff8009200348
> > > > > 3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
> > > > > 4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
> > > > > 5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
> > > > > 6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
> > > > > 7 [ffffff802e793980] __writeback_single_inode at
> > > > > ffffff8008312740
> > > > > 8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
> > > > > 9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
> > > > > 10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
> > > > > 11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
> > > > > 12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
> > > > > 13 [ffffff802e793e00] worker_thread at ffffff80080e5670
> > > > > 14 [ffffff802e793e60] kthread at ffffff80080eb650
> > > >
> > > > The issue is real.
> > > >
> > > > The fix, however, is not the right one.  The fundamental problem
> > > > is
> > > > that fuse_write_inode() blocks on a request to userspace.
> > > >
> > > > This is the same issue that fuse_writepage/fuse_writepages
> > > > face.  In
> > > > that case the solution was to copy the page contents to a
> > > > temporary
> > > > buffer and return immediately as if the writeback already
> > > > completed.
> > > >
> > > > Something similar needs to be done here: send the FUSE_SETATTR
> > > > request
> > > > asynchronously and return immediately from
> > > > fuse_write_inode().  The
> > > > tricky part is to make sure that multiple time updates for the
> > > > same
> > > > inode aren't mixed up...
> > > >
> > > > Thanks,
> > > > Miklos
> > >
> > > Dear Szeredi,
> > >
> > > Writeback thread calls fuse_write_inode() and wait for user Daemon
> > > to
> > > complete this write inode request. The user daemon will
> > > alloc_page()
> > > after taking this request, and a deadlock could happen when we try
> > > to
> > > shrink dentry list under memory pressure.
> > >
> > > We (Mediatek) are glad to work on this issue for mainline and also LTS.
> > > So
> > > another problem is that we should not change the protocol or
> > > feature
> > > for stable kernel.
> > >
> > > Use GFP_NOFS | __GFP_HIGHMEM can really avoid this by skip the
> > > dentry
> > > shrinker. It works but degrades the alloc_page success rate. In a
> > > more
> > > fundamental way, we could cache the contents and return
> > > immediately.
> > > But how to ensure the request will be done successfully, e.g.,
> > > always
> > > retry if it fails from daemon.
> >
> > Key is where the dirty metadata is flushed.  To prevent deadlock
> > it must not be flushed from memory reclaim, so must make sure that it
> > is flushed on close(2) and munmap(2) and not dirtied after that.
> >
> > I'm working on this currently and hope to get it ready for the next
> > merge window.
> >
> > Thanks,
> > Miklos
>
> Hi Miklos,
>
> I'm not sure whether it has already been resolved in mainline.
> If it is still WIP, please cc me on future emails.

Hi,

This is taking a bit longer, unfortunately, but I already have
something in testing and currently cleaning it up for review.  Hope to
post a series today or early next week.

Thanks,
Miklos
Miklos Szeredi Sept. 28, 2021, 3:25 p.m. UTC | #8
On Fri, Sep 24, 2021 at 09:52:35AM +0200, Miklos Szeredi wrote:
> On Fri, 24 Sept 2021 at 05:52, Ed Tsai <ed.tsai@mediatek.com> wrote:
> >
> > On Wed, 2021-08-18 at 17:24 +0800, Miklos Szeredi wrote:
> > > On Tue, 13 Jul 2021 at 04:42, Ed Tsai <ed.tsai@mediatek.com> wrote:
> > > >
> > > > On Tue, 2021-06-08 at 17:30 +0200, Miklos Szeredi wrote:
> > > > > On Thu, 3 Jun 2021 at 14:52, chenguanyou <
> > > > > chenguanyou9338@gmail.com>
> > > > > wrote:
> > > > > >
> > > > > > ABA deadlock
> > > > > >
> > > > > > PID: 17172 TASK: ffffffc0c162c000 CPU: 6 COMMAND: "Thread-21"
> > > > > > 0 [ffffff802d16b400] __switch_to at ffffff8008086a4c
> > > > > > 1 [ffffff802d16b470] __schedule at ffffff80091ffe58
> > > > > > 2 [ffffff802d16b4d0] schedule at ffffff8009200348
> > > > > > 3 [ffffff802d16b4f0] bit_wait at ffffff8009201098
> > > > > > 4 [ffffff802d16b510] __wait_on_bit at ffffff8009200a34
> > > > > > 5 [ffffff802d16b5b0] inode_wait_for_writeback at
> > > > > > ffffff800830e1e8
> > > > > > 6 [ffffff802d16b5e0] evict at ffffff80082fb15c
> > > > > > 7 [ffffff802d16b620] iput at ffffff80082f9270
> > > > > > 8 [ffffff802d16b680] dentry_unlink_inode at ffffff80082f4c90
> > > > > > 9 [ffffff802d16b6a0] __dentry_kill at ffffff80082f1710
> > > > > > 10 [ffffff802d16b6d0] shrink_dentry_list at ffffff80082f1c34
> > > > > > 11 [ffffff802d16b750] prune_dcache_sb at ffffff80082f18a8
> > > > > > 12 [ffffff802d16b770] super_cache_scan at ffffff80082d55ac
> > > > > > 13 [ffffff802d16b860] shrink_slab at ffffff8008266170
> > > > > > 14 [ffffff802d16b900] shrink_node at ffffff800826b420
> > > > > > 15 [ffffff802d16b980] do_try_to_free_pages at ffffff8008268460
> > > > > > 16 [ffffff802d16ba60] try_to_free_pages at ffffff80082680d0
> > > > > > 17 [ffffff802d16bbe0] __alloc_pages_nodemask at
> > > > > > ffffff8008256514
> > > > > > 18 [ffffff802d16bc60] fuse_copy_fill at ffffff8008438268
> > > > > > 19 [ffffff802d16bd00] fuse_dev_do_read at ffffff8008437654
> > > > > > 20 [ffffff802d16bdc0] fuse_dev_splice_read at ffffff8008436f40
> > > > > > 21 [ffffff802d16be60] sys_splice at ffffff8008315d18
> > > > > > 22 [ffffff802d16bff0] __sys_trace at ffffff8008084014
> > > > > >
> > > > > > PID: 9652 TASK: ffffffc0c9ce0000 CPU: 4 COMMAND:
> > > > > > "kworker/u16:8"
> > > > > > 0 [ffffff802e793650] __switch_to at ffffff8008086a4c
> > > > > > 1 [ffffff802e7936c0] __schedule at ffffff80091ffe58
> > > > > > 2 [ffffff802e793720] schedule at ffffff8009200348
> > > > > > 3 [ffffff802e793770] __fuse_request_send at ffffff8008435760
> > > > > > 4 [ffffff802e7937b0] fuse_simple_request at ffffff8008435b14
> > > > > > 5 [ffffff802e793930] fuse_flush_times at ffffff800843a7a0
> > > > > > 6 [ffffff802e793950] fuse_write_inode at ffffff800843e4dc
> > > > > > 7 [ffffff802e793980] __writeback_single_inode at
> > > > > > ffffff8008312740
> > > > > > 8 [ffffff802e793aa0] writeback_sb_inodes at ffffff80083117e4
> > > > > > 9 [ffffff802e793b00] __writeback_inodes_wb at ffffff8008311d98
> > > > > > 10 [ffffff802e793c00] wb_writeback at ffffff8008310cfc
> > > > > > 11 [ffffff802e793d00] wb_workfn at ffffff800830e4a8
> > > > > > 12 [ffffff802e793d90] process_one_work at ffffff80080e4fac
> > > > > > 13 [ffffff802e793e00] worker_thread at ffffff80080e5670
> > > > > > 14 [ffffff802e793e60] kthread at ffffff80080eb650
> > > > >
> > > > > The issue is real.
> > > > >
> > > > > The fix, however, is not the right one.  The fundamental problem
> > > > > is
> > > > > that fuse_write_inode() blocks on a request to userspace.
> > > > >
> > > > > This is the same issue that fuse_writepage/fuse_writepages
> > > > > face.  In
> > > > > that case the solution was to copy the page contents to a
> > > > > temporary
> > > > > buffer and return immediately as if the writeback already
> > > > > completed.
> > > > >
> > > > > Something similar needs to be done here: send the FUSE_SETATTR
> > > > > request
> > > > > asynchronously and return immediately from
> > > > > fuse_write_inode().  The
> > > > > tricky part is to make sure that multiple time updates for the
> > > > > same
> > > > > inode aren't mixed up...
> > > > >
> > > > > Thanks,
> > > > > Miklos
> > > >
> > > > Dear Szeredi,
> > > >
> > > > Writeback thread calls fuse_write_inode() and wait for user Daemon
> > > > to
> > > > complete this write inode request. The user daemon will
> > > > alloc_page()
> > > > after taking this request, and a deadlock could happen when we try
> > > > to
> > > > shrink dentry list under memory pressure.
> > > >
> > > > We (Mediatek) are glad to work on this issue for mainline and also LTS.
> > > > So
> > > > another problem is that we should not change the protocol or
> > > > feature
> > > > for stable kernel.
> > > >
> > > > Use GFP_NOFS | __GFP_HIGHMEM can really avoid this by skip the
> > > > dentry
> > > > shrinker. It works but degrades the alloc_page success rate. In a
> > > > more
> > > > fundamental way, we could cache the contents and return
> > > > immediately.
> > > > But how to ensure the request will be done successfully, e.g.,
> > > > always
> > > > retry if it fails from daemon.
> > >
> > > Key is where the dirty metadata is flushed.  To prevent deadlock
> > > it must not be flushed from memory reclaim, so must make sure that it
> > > is flushed on close(2) and munmap(2) and not dirtied after that.
> > >
> > > I'm working on this currently and hope to get it ready for the next
> > > merge window.
> > >
> > > Thanks,
> > > Miklos
> >
> > Hi Miklos,
> >
> > I'm not sure whether it has already been resolved in mainline.
> > If it is still WIP, please cc me on future emails.
> 
> Hi,
> 
> This is taking a bit longer, unfortunately, but I already have
> something in testing and currently cleaning it up for review.  Hope to
> post a series today or early next week.


Here's a minimal patch.  It's been through some iterations and some testing, but
more review and testing is definitely welcome.

Chenguanyou, can you please verify that it fixes the deadlock?

Thanks,
Miklos

---
From: Miklos Szeredi <mszeredi@redhat.com>
Subject: fuse: make sure reclaim doesn't write the inode

In writeback cache mode mtime/ctime updates are cached, and flushed to the
server using the ->write_inode() callback.

Closing the file will result in a dirty inode being immediately written,
but in other cases the inode can remain dirty after all references are
dropped.  This result in the inode being written back from reclaim, which
can deadlock on a regular allocation while the request is being served.

The usual mechanisms (GFP_NOFS/PF_MEMALLOC*) don't work for FUSE, because
serving a request involves unrelated userspace process(es).

Instead do the same as for dirty pages: make sure the inode is written
before the last reference is gone.

 - fuse_vma_close(): flush times in addition to the dirty pages

 - fallocate(2)/copy_file_range(2): these call file_update_time() or
   file_modified(), so flush the inode before returning from the call

 - unlink(2), link(2) and rename(2): these call fuse_update_ctime(), so
   flush the ctime directly from this helper

Reported-by: chenguanyou <chenguanyou@xiaomi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/dir.c    |    8 ++++++++
 fs/fuse/file.c   |   24 +++++++++++++++++++++---
 fs/fuse/fuse_i.h |    1 +
 3 files changed, 30 insertions(+), 3 deletions(-)

--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -738,12 +738,20 @@ static int fuse_symlink(struct user_name
 	return create_new_entry(fm, &args, dir, entry, S_IFLNK);
 }
 
+void fuse_flush_time_update(struct inode *inode)
+{
+	int err = sync_inode_metadata(inode, 1);
+
+	mapping_set_error(inode->i_mapping, err);
+}
+
 void fuse_update_ctime(struct inode *inode)
 {
 	fuse_invalidate_attr(inode);
 	if (!IS_NOCMTIME(inode)) {
 		inode->i_ctime = current_time(inode);
 		mark_inode_dirty_sync(inode);
+		fuse_flush_time_update(inode);
 	}
 }
 
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1847,6 +1847,17 @@ int fuse_write_inode(struct inode *inode
 	struct fuse_file *ff;
 	int err;
 
+	/*
+	 * Inode is always written before the last reference is dropped and
+	 * hence this should not be reached from reclaim.
+	 *
+	 * Writing back the inode from reclaim can deadlock if the request
+	 * processing itself needs an allocation.  Allocations triggering
+	 * reclaim while serving a request can't be prevented, because it can
+	 * involve any number of unrelated userspace processes.
+	 */
+	WARN_ON(wbc->for_reclaim);
+
 	ff = __fuse_write_file_get(fi);
 	err = fuse_flush_times(inode, ff);
 	if (ff)
@@ -2339,12 +2350,15 @@ static int fuse_launder_page(struct page
 }
 
 /*
- * Write back dirty pages now, because there may not be any suitable
- * open files later
+ * Write back dirty data/metadata now (there may not be any suitable
+ * open files later for data)
  */
 static void fuse_vma_close(struct vm_area_struct *vma)
 {
-	filemap_write_and_wait(vma->vm_file->f_mapping);
+	int err;
+
+	err = write_inode_now(vma->vm_file->f_mapping->host, 1);
+	mapping_set_error(vma->vm_file->f_mapping, err);
 }
 
 /*
@@ -3001,6 +3015,8 @@ static long fuse_file_fallocate(struct f
 	if (lock_inode)
 		inode_unlock(inode);
 
+	fuse_flush_time_update(inode);
+
 	return err;
 }
 
@@ -3110,6 +3126,8 @@ static ssize_t __fuse_copy_file_range(st
 	inode_unlock(inode_out);
 	file_accessed(file_in);
 
+	fuse_flush_time_update(inode_out);
+
 	return err;
 }
 
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1145,6 +1145,7 @@ int fuse_allow_current_process(struct fu
 
 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
 
+void fuse_flush_time_update(struct inode *inode);
 void fuse_update_ctime(struct inode *inode);
 
 int fuse_update_attributes(struct inode *inode, struct file *file);
chenguanyou Sept. 29, 2021, 3:31 a.m. UTC | #9
Hi Miklos,

With pleasure.

Thanks,
Guanyou.Chen
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c0fee830a34e..d36125ff0405 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -721,7 +721,7 @@  static int fuse_copy_fill(struct fuse_copy_state *cs)
 			if (cs->nr_segs >= cs->pipe->max_usage)
 				return -EIO;
 
-			page = alloc_page(GFP_HIGHUSER);
+			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 			if (!page)
 				return -ENOMEM;