vfs: Fix potential circular locking through setxattr() and removexattr()

Message ID 2136178.1721725194@warthog.procyon.org.uk (mailing list archive)
State New
Series vfs: Fix potential circular locking through setxattr() and removexattr()

Commit Message

David Howells July 23, 2024, 8:59 a.m. UTC
When using cachefiles, lockdep may emit something similar to the circular
locking dependency notice below.  The problem appears to stem from the
following:

 (1) Cachefiles manipulates xattrs on the files in its cache when called
     from ->writepages().

 (2) The setxattr() and removexattr() system call handlers get the name
     (and value) from userspace after taking the sb_writers lock, putting
     accesses of the vma->vm_lock and mm->mmap_lock inside of that.

 (3) The afs filesystem uses a per-inode lock to prevent multiple
     revalidation RPCs and in writeback vs truncate to prevent parallel
     operations from deadlocking against the server on one side and local
     page locks on the other.

Fix this by moving the getting of the name and value in {set,remove}xattr()
outside of the sb_writers lock.  This also has the minor benefits that we
don't need to reget these in the event of a retry and we never try to take
the sb_writers lock in the event we can't pull the name and value into the
kernel.
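
Condensed, the removexattr() side of the reordering looks like this (the
setxattr() side is analogous; see the full patch below):

	char kname[XATTR_NAME_MAX + 1];
	int error;

	/* Copying the name may fault and take mm->mmap_lock, but
	 * sb_writers is not held yet at this point. */
	error = strncpy_from_user(kname, name, sizeof(kname));
	if (error == 0 || error == sizeof(kname))
		error = -ERANGE;
	if (error < 0)
		return error;

	error = mnt_want_write(path.mnt);	/* sb_writers taken here */
	if (!error) {
		error = removexattr(mnt_idmap(path.mnt), path.dentry, kname);
		mnt_drop_write(path.mnt);
	}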

Alternative approaches that might fix this include moving the dispatch of a
write to the cache off to a workqueue or trying to do without the
validation lock in afs.  Note that this might also affect other filesystems
that use netfslib and/or cachefiles.

 ======================================================
 WARNING: possible circular locking dependency detected
 6.10.0-build2+ #956 Not tainted
 ------------------------------------------------------
 fsstress/6050 is trying to acquire lock:
 ffff888138fd82f0 (mapping.invalidate_lock#3){++++}-{3:3}, at: filemap_fault+0x26e/0x8b0

 but task is already holding lock:
 ffff888113f26d18 (&vma->vm_lock->lock){++++}-{3:3}, at: lock_vma_under_rcu+0x165/0x250

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #4 (&vma->vm_lock->lock){++++}-{3:3}:
        __lock_acquire+0xaf0/0xd80
        lock_acquire.part.0+0x103/0x280
        down_write+0x3b/0x50
        vma_start_write+0x6b/0xa0
        vma_link+0xcc/0x140
        insert_vm_struct+0xb7/0xf0
        alloc_bprm+0x2c1/0x390
        kernel_execve+0x65/0x1a0
        call_usermodehelper_exec_async+0x14d/0x190
        ret_from_fork+0x24/0x40
        ret_from_fork_asm+0x1a/0x30

 -> #3 (&mm->mmap_lock){++++}-{3:3}:
        __lock_acquire+0xaf0/0xd80
        lock_acquire.part.0+0x103/0x280
        __might_fault+0x7c/0xb0
        strncpy_from_user+0x25/0x160
        removexattr+0x7f/0x100
        __do_sys_fremovexattr+0x7e/0xb0
        do_syscall_64+0x9f/0x100
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 -> #2 (sb_writers#14){.+.+}-{0:0}:
        __lock_acquire+0xaf0/0xd80
        lock_acquire.part.0+0x103/0x280
        percpu_down_read+0x3c/0x90
        vfs_iocb_iter_write+0xe9/0x1d0
        __cachefiles_write+0x367/0x430
        cachefiles_issue_write+0x299/0x2f0
        netfs_advance_write+0x117/0x140
        netfs_write_folio.isra.0+0x5ca/0x6e0
        netfs_writepages+0x230/0x2f0
        afs_writepages+0x4d/0x70
        do_writepages+0x1e8/0x3e0
        filemap_fdatawrite_wbc+0x84/0xa0
        __filemap_fdatawrite_range+0xa8/0xf0
        file_write_and_wait_range+0x59/0x90
        afs_release+0x10f/0x270
        __fput+0x25f/0x3d0
        __do_sys_close+0x43/0x70
        do_syscall_64+0x9f/0x100
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 -> #1 (&vnode->validate_lock){++++}-{3:3}:
        __lock_acquire+0xaf0/0xd80
        lock_acquire.part.0+0x103/0x280
        down_read+0x95/0x200
        afs_writepages+0x37/0x70
        do_writepages+0x1e8/0x3e0
        filemap_fdatawrite_wbc+0x84/0xa0
        filemap_invalidate_inode+0x167/0x1e0
        netfs_unbuffered_write_iter+0x1bd/0x2d0
        vfs_write+0x22e/0x320
        ksys_write+0xbc/0x130
        do_syscall_64+0x9f/0x100
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 -> #0 (mapping.invalidate_lock#3){++++}-{3:3}:
        check_noncircular+0x119/0x160
        check_prev_add+0x195/0x430
        __lock_acquire+0xaf0/0xd80
        lock_acquire.part.0+0x103/0x280
        down_read+0x95/0x200
        filemap_fault+0x26e/0x8b0
        __do_fault+0x57/0xd0
        do_pte_missing+0x23b/0x320
        __handle_mm_fault+0x2d4/0x320
        handle_mm_fault+0x14f/0x260
        do_user_addr_fault+0x2a2/0x500
        exc_page_fault+0x71/0x90
        asm_exc_page_fault+0x22/0x30

 other info that might help us debug this:

 Chain exists of:
   mapping.invalidate_lock#3 --> &mm->mmap_lock --> &vma->vm_lock->lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   rlock(&vma->vm_lock->lock);
                                lock(&mm->mmap_lock);
                                lock(&vma->vm_lock->lock);
   rlock(mapping.invalidate_lock#3);

  *** DEADLOCK ***

 1 lock held by fsstress/6050:
  #0: ffff888113f26d18 (&vma->vm_lock->lock){++++}-{3:3}, at: lock_vma_under_rcu+0x165/0x250

 stack backtrace:
 CPU: 0 PID: 6050 Comm: fsstress Not tainted 6.10.0-build2+ #956
 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x57/0x80
  check_noncircular+0x119/0x160
  ? queued_spin_lock_slowpath+0x4be/0x510
  ? __pfx_check_noncircular+0x10/0x10
  ? __pfx_queued_spin_lock_slowpath+0x10/0x10
  ? mark_lock+0x47/0x160
  ? init_chain_block+0x9c/0xc0
  ? add_chain_block+0x84/0xf0
  check_prev_add+0x195/0x430
  __lock_acquire+0xaf0/0xd80
  ? __pfx___lock_acquire+0x10/0x10
  ? __lock_release.isra.0+0x13b/0x230
  lock_acquire.part.0+0x103/0x280
  ? filemap_fault+0x26e/0x8b0
  ? __pfx_lock_acquire.part.0+0x10/0x10
  ? rcu_is_watching+0x34/0x60
  ? lock_acquire+0xd7/0x120
  down_read+0x95/0x200
  ? filemap_fault+0x26e/0x8b0
  ? __pfx_down_read+0x10/0x10
  ? __filemap_get_folio+0x25/0x1a0
  filemap_fault+0x26e/0x8b0
  ? __pfx_filemap_fault+0x10/0x10
  ? find_held_lock+0x7c/0x90
  ? __pfx___lock_release.isra.0+0x10/0x10
  ? __pte_offset_map+0x99/0x110
  __do_fault+0x57/0xd0
  do_pte_missing+0x23b/0x320
  __handle_mm_fault+0x2d4/0x320
  ? __pfx___handle_mm_fault+0x10/0x10
  handle_mm_fault+0x14f/0x260
  do_user_addr_fault+0x2a2/0x500
  exc_page_fault+0x71/0x90
  asm_exc_page_fault+0x22/0x30

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christian Brauner <brauner@kernel.org>
cc: Jan Kara <jack@suse.cz>
cc: Jeff Layton <jlayton@kernel.org>
cc: Gao Xiang <xiang@kernel.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: netfs@lists.linux.dev
cc: linux-erofs@lists.ozlabs.org
cc: linux-fsdevel@vger.kernel.org
---
 fs/xattr.c |   88 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 47 insertions(+), 41 deletions(-)

Comments

Jan Kara July 23, 2024, 10:45 a.m. UTC | #1
On Tue 23-07-24 09:59:54, David Howells wrote:
> When using cachefiles, lockdep may emit something similar to the circular
> locking dependency notice below.  The problem appears to stem from the
> following:
> 
>  (1) Cachefiles manipulates xattrs on the files in its cache when called
>      from ->writepages().
> 
>  (2) The setxattr() and removexattr() system call handlers get the name
>      (and value) from userspace after taking the sb_writers lock, putting
>      accesses of the vma->vm_lock and mm->mmap_lock inside of that.
> 
>  (3) The afs filesystem uses a per-inode lock to prevent multiple
>      revalidation RPCs and in writeback vs truncate to prevent parallel
>      operations from deadlocking against the server on one side and local
>      page locks on the other.
> 
> Fix this by moving the getting of the name and value in {set,remove}xattr()
> outside of the sb_writers lock.  This also has the minor benefits that we
> don't need to reget these in the event of a retry and we never try to take
> the sb_writers lock in the event we can't pull the name and value into the
> kernel.

Well, it seems like you are trying to get rid of the dependency
sb_writers->mmap_sem. But there are other places where this dependency is
created, in particular write(2) path is a place where it would be very
difficult to get rid of it (you take sb_writers, then do all the work
preparing the write and then you copy user data into page cache which
may require mmap_sem).
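
Sketched as a call chain, the ordering being referred to is roughly:

	vfs_write()
	  file_start_write()               <- takes sb_writers
	  generic_file_write_iter()
	    generic_perform_write()
	      fault_in_iov_iter_readable() <- may fault, taking mm->mmap_lock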

But looking at the lockdep splat below:

>  ======================================================
>  WARNING: possible circular locking dependency detected
>  6.10.0-build2+ #956 Not tainted
>  ------------------------------------------------------
>  fsstress/6050 is trying to acquire lock:
>  ffff888138fd82f0 (mapping.invalidate_lock#3){++++}-{3:3}, at: filemap_fault+0x26e/0x8b0
> 
>  but task is already holding lock:
>  ffff888113f26d18 (&vma->vm_lock->lock){++++}-{3:3}, at: lock_vma_under_rcu+0x165/0x250
> 
>  which lock already depends on the new lock.
> 
>  the existing dependency chain (in reverse order) is:
> 
>  -> #4 (&vma->vm_lock->lock){++++}-{3:3}:
>         __lock_acquire+0xaf0/0xd80
>         lock_acquire.part.0+0x103/0x280
>         down_write+0x3b/0x50
>         vma_start_write+0x6b/0xa0
>         vma_link+0xcc/0x140
>         insert_vm_struct+0xb7/0xf0
>         alloc_bprm+0x2c1/0x390
>         kernel_execve+0x65/0x1a0
>         call_usermodehelper_exec_async+0x14d/0x190
>         ret_from_fork+0x24/0x40
>         ret_from_fork_asm+0x1a/0x30
> 
>  -> #3 (&mm->mmap_lock){++++}-{3:3}:
>         __lock_acquire+0xaf0/0xd80
>         lock_acquire.part.0+0x103/0x280
>         __might_fault+0x7c/0xb0
>         strncpy_from_user+0x25/0x160
>         removexattr+0x7f/0x100
>         __do_sys_fremovexattr+0x7e/0xb0
>         do_syscall_64+0x9f/0x100
>         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
>  -> #2 (sb_writers#14){.+.+}-{0:0}:
>         __lock_acquire+0xaf0/0xd80
>         lock_acquire.part.0+0x103/0x280
>         percpu_down_read+0x3c/0x90
>         vfs_iocb_iter_write+0xe9/0x1d0
>         __cachefiles_write+0x367/0x430
>         cachefiles_issue_write+0x299/0x2f0
>         netfs_advance_write+0x117/0x140
>         netfs_write_folio.isra.0+0x5ca/0x6e0
>         netfs_writepages+0x230/0x2f0
>         afs_writepages+0x4d/0x70
>         do_writepages+0x1e8/0x3e0
>         filemap_fdatawrite_wbc+0x84/0xa0
>         __filemap_fdatawrite_range+0xa8/0xf0
>         file_write_and_wait_range+0x59/0x90
>         afs_release+0x10f/0x270
>         __fput+0x25f/0x3d0
>         __do_sys_close+0x43/0x70
>         do_syscall_64+0x9f/0x100
>         entry_SYSCALL_64_after_hwframe+0x76/0x7e

This is the problematic step - from quite deep in the locking chain holding
invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
problem because the locks are actually on different filesystems, just
lockdep isn't able to see this. So I don't think you will get rid of these
lockdep splats unless you somehow manage to convey to lockdep that there's
the "upper" fs (AFS in this case) and the "lower" fs (the one behind
cachefiles) and their locks are different.

>  -> #1 (&vnode->validate_lock){++++}-{3:3}:
>         __lock_acquire+0xaf0/0xd80
>         lock_acquire.part.0+0x103/0x280
>         down_read+0x95/0x200
>         afs_writepages+0x37/0x70
>         do_writepages+0x1e8/0x3e0
>         filemap_fdatawrite_wbc+0x84/0xa0
>         filemap_invalidate_inode+0x167/0x1e0
>         netfs_unbuffered_write_iter+0x1bd/0x2d0
>         vfs_write+0x22e/0x320
>         ksys_write+0xbc/0x130
>         do_syscall_64+0x9f/0x100
>         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
>  -> #0 (mapping.invalidate_lock#3){++++}-{3:3}:
>         check_noncircular+0x119/0x160
>         check_prev_add+0x195/0x430
>         __lock_acquire+0xaf0/0xd80
>         lock_acquire.part.0+0x103/0x280
>         down_read+0x95/0x200
>         filemap_fault+0x26e/0x8b0
>         __do_fault+0x57/0xd0
>         do_pte_missing+0x23b/0x320
>         __handle_mm_fault+0x2d4/0x320
>         handle_mm_fault+0x14f/0x260
>         do_user_addr_fault+0x2a2/0x500
>         exc_page_fault+0x71/0x90
>         asm_exc_page_fault+0x22/0x30

								Honza
Christian Brauner July 23, 2024, 11:11 a.m. UTC | #2
On Tue, Jul 23, 2024 at 12:45:33PM GMT, Jan Kara wrote:
> On Tue 23-07-24 09:59:54, David Howells wrote:
> > When using cachefiles, lockdep may emit something similar to the circular
> > locking dependency notice below.  The problem appears to stem from the
> > following:
> > 
> >  (1) Cachefiles manipulates xattrs on the files in its cache when called
> >      from ->writepages().
> > 
> >  (2) The setxattr() and removexattr() system call handlers get the name
> >      (and value) from userspace after taking the sb_writers lock, putting
> >      accesses of the vma->vm_lock and mm->mmap_lock inside of that.
> > 
> >  (3) The afs filesystem uses a per-inode lock to prevent multiple
> >      revalidation RPCs and in writeback vs truncate to prevent parallel
> >      operations from deadlocking against the server on one side and local
> >      page locks on the other.
> > 
> > Fix this by moving the getting of the name and value in {set,remove}xattr()
> > outside of the sb_writers lock.  This also has the minor benefits that we
> > don't need to reget these in the event of a retry and we never try to take
> > the sb_writers lock in the event we can't pull the name and value into the
> > kernel.
> 
> Well, it seems like you are trying to get rid of the dependency
> sb_writers->mmap_sem. But there are other places where this dependency is

Independent of this issue, I think that moving the retrieval of name and
value out of the lock is a good thing. The commit message might need to
get reworded of course.

> created, in particular write(2) path is a place where it would be very
> difficult to get rid of it (you take sb_writers, then do all the work
> preparing the write and then you copy user data into page cache which
> may require mmap_sem).
> 
> But looking at the lockdep splat below:
> 
> >  ======================================================
> >  WARNING: possible circular locking dependency detected
> >  6.10.0-build2+ #956 Not tainted
> >  ------------------------------------------------------
> >  fsstress/6050 is trying to acquire lock:
> >  ffff888138fd82f0 (mapping.invalidate_lock#3){++++}-{3:3}, at: filemap_fault+0x26e/0x8b0
> > 
> >  but task is already holding lock:
> >  ffff888113f26d18 (&vma->vm_lock->lock){++++}-{3:3}, at: lock_vma_under_rcu+0x165/0x250
> > 
> >  which lock already depends on the new lock.
> > 
> >  the existing dependency chain (in reverse order) is:
> > 
> >  -> #4 (&vma->vm_lock->lock){++++}-{3:3}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         down_write+0x3b/0x50
> >         vma_start_write+0x6b/0xa0
> >         vma_link+0xcc/0x140
> >         insert_vm_struct+0xb7/0xf0
> >         alloc_bprm+0x2c1/0x390
> >         kernel_execve+0x65/0x1a0
> >         call_usermodehelper_exec_async+0x14d/0x190
> >         ret_from_fork+0x24/0x40
> >         ret_from_fork_asm+0x1a/0x30
> > 
> >  -> #3 (&mm->mmap_lock){++++}-{3:3}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         __might_fault+0x7c/0xb0
> >         strncpy_from_user+0x25/0x160
> >         removexattr+0x7f/0x100
> >         __do_sys_fremovexattr+0x7e/0xb0
> >         do_syscall_64+0x9f/0x100
> >         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > 
> >  -> #2 (sb_writers#14){.+.+}-{0:0}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         percpu_down_read+0x3c/0x90
> >         vfs_iocb_iter_write+0xe9/0x1d0
> >         __cachefiles_write+0x367/0x430
> >         cachefiles_issue_write+0x299/0x2f0
> >         netfs_advance_write+0x117/0x140
> >         netfs_write_folio.isra.0+0x5ca/0x6e0
> >         netfs_writepages+0x230/0x2f0
> >         afs_writepages+0x4d/0x70
> >         do_writepages+0x1e8/0x3e0
> >         filemap_fdatawrite_wbc+0x84/0xa0
> >         __filemap_fdatawrite_range+0xa8/0xf0
> >         file_write_and_wait_range+0x59/0x90
> >         afs_release+0x10f/0x270
> >         __fput+0x25f/0x3d0
> >         __do_sys_close+0x43/0x70
> >         do_syscall_64+0x9f/0x100
> >         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> This is the problematic step - from quite deep in the locking chain holding
> invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
> locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
> problem because the locks are actually on different filesystems, just
> lockdep isn't able to see this. So I don't think you will get rid of these
> lockdep splats unless you somehow manage to convey to lockdep that there's
> the "upper" fs (AFS in this case) and the "lower" fs (the one behind
> cachefiles) and their locks are different.
> 
> >  -> #1 (&vnode->validate_lock){++++}-{3:3}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         down_read+0x95/0x200
> >         afs_writepages+0x37/0x70
> >         do_writepages+0x1e8/0x3e0
> >         filemap_fdatawrite_wbc+0x84/0xa0
> >         filemap_invalidate_inode+0x167/0x1e0
> >         netfs_unbuffered_write_iter+0x1bd/0x2d0
> >         vfs_write+0x22e/0x320
> >         ksys_write+0xbc/0x130
> >         do_syscall_64+0x9f/0x100
> >         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > 
> >  -> #0 (mapping.invalidate_lock#3){++++}-{3:3}:
> >         check_noncircular+0x119/0x160
> >         check_prev_add+0x195/0x430
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         down_read+0x95/0x200
> >         filemap_fault+0x26e/0x8b0
> >         __do_fault+0x57/0xd0
> >         do_pte_missing+0x23b/0x320
> >         __handle_mm_fault+0x2d4/0x320
> >         handle_mm_fault+0x14f/0x260
> >         do_user_addr_fault+0x2a2/0x500
> >         exc_page_fault+0x71/0x90
> >         asm_exc_page_fault+0x22/0x30
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jan Kara July 23, 2024, 12:32 p.m. UTC | #3
On Tue 23-07-24 13:11:51, Christian Brauner wrote:
> On Tue, Jul 23, 2024 at 12:45:33PM GMT, Jan Kara wrote:
> > On Tue 23-07-24 09:59:54, David Howells wrote:
> > > When using cachefiles, lockdep may emit something similar to the circular
> > > locking dependency notice below.  The problem appears to stem from the
> > > following:
> > > 
> > >  (1) Cachefiles manipulates xattrs on the files in its cache when called
> > >      from ->writepages().
> > > 
> > >  (2) The setxattr() and removexattr() system call handlers get the name
> > >      (and value) from userspace after taking the sb_writers lock, putting
> > >      accesses of the vma->vm_lock and mm->mmap_lock inside of that.
> > > 
> > >  (3) The afs filesystem uses a per-inode lock to prevent multiple
> > >      revalidation RPCs and in writeback vs truncate to prevent parallel
> > >      operations from deadlocking against the server on one side and local
> > >      page locks on the other.
> > > 
> > > Fix this by moving the getting of the name and value in {set,remove}xattr()
> > > outside of the sb_writers lock.  This also has the minor benefits that we
> > > don't need to reget these in the event of a retry and we never try to take
> > > the sb_writers lock in the event we can't pull the name and value into the
> > > kernel.
> > 
> > Well, it seems like you are trying to get rid of the dependency
> > sb_writers->mmap_sem. But there are other places where this dependency is
> 
> Independent of this issue, I think that moving the retrieval of name and
> value out of the lock is a good thing. The commit message might need to
> get reworded of course.

Oh, absolutely. I think the change itself makes sense, just it will not fix
what David hopes to fix :)

								Honza
Jan Kara July 23, 2024, 12:57 p.m. UTC | #4
Regarding the contents of the change itself:

On Tue 23-07-24 09:59:54, David Howells wrote:
> @@ -954,15 +952,23 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
>  SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
>  {
>  	struct fd f = fdget(fd);
> +	char kname[XATTR_NAME_MAX + 1];
>  	int error = -EBADF;
>  
>  	if (!f.file)
>  		return error;
>  	audit_file(f.file);
> +
> +	error = strncpy_from_user(kname, name, sizeof(kname));
> +	if (error == 0 || error == sizeof(kname))
> +		error = -ERANGE;
> +	if (error < 0)
> +		return error;

Missing fdput() here.
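
I.e., presumably something like this (sketch):

	error = strncpy_from_user(kname, name, sizeof(kname));
	if (error == 0 || error == sizeof(kname))
		error = -ERANGE;
	if (error < 0) {
		fdput(f);
		return error;
	}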

> +
>  	error = mnt_want_write_file(f.file);
>  	if (!error) {
>  		error = removexattr(file_mnt_idmap(f.file),
> -				    f.file->f_path.dentry, name);
> +				    f.file->f_path.dentry, kname);
>  		mnt_drop_write_file(f.file);
>  	}
>  	fdput(f);

Otherwise the patch looks good to me.

								Honza
David Howells July 23, 2024, 1:57 p.m. UTC | #5
Jan Kara <jack@suse.cz> wrote:

> Well, it seems like you are trying to get rid of the dependency
> sb_writers->mmap_sem. But there are other places where this dependency is
> created, in particular write(2) path is a place where it would be very
> difficult to get rid of it (you take sb_writers, then do all the work
> preparing the write and then you copy user data into page cache which
> may require mmap_sem).
>
> ...
> 
> This is the problematic step - from quite deep in the locking chain holding
> invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
> locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
> problem because the locks are actually on different filesystems, just
> lockdep isn't able to see this. So I don't think you will get rid of these
> lockdep splats unless you somehow manage to convey to lockdep that there's
> the "upper" fs (AFS in this case) and the "lower" fs (the one behind
> cachefiles) and their locks are different.

I'm not sure you're correct about that.  If you look at the lockdep splat:

>  -> #2 (sb_writers#14){.+.+}-{0:0}:

The sb_writers lock is "personalised" to the filesystem type (the "#14"
annotation) which is set here:

	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
					sb_writers_name[i],
					&type->s_writers_key[i]))  <----
			goto fail;
	}

in fs/super.c.

I think the problem is (1) that on one side, you've got, say, sys_setxattr()
taking an sb_writers lock and then accessing a userspace buffer, which (a) may
take mm->mmap_lock and vma->vm_lock and (b) may cause reading or writeback
from the netfs-based filesystem via an mmapped xattr name buffer.

Then (2) on the other side, you have a read or a write to the network
filesystem through netfslib which may invoke the cache, which may require
cachefiles to check the xattr on the cache file and maybe set/remove it -
which requires the sb_writers lock on the cache filesystem.

So if ->read_folio(), ->readahead() or ->writepages() can ever be called with
mm->mmap_lock or vma->vm_lock held, netfslib may call down to cachefiles and
ultimately, it should[*] then take the sb_writers lock on the backing
filesystem to perform xattr manipulation.

[*] I say "should" because at the moment cachefiles calls vfs_set/removexattr
    functions which *don't* take this lock (which is a bug).  Is this an error
    on the part of vfs_set/removexattr()?  Should they take this lock
    analogously with vfs_truncate() and vfs_iocb_iter_write()?
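
    Taking it analogously would presumably amount to something like this
    sketch (argument names as in vfs_setxattr()):

	sb_start_write(dentry->d_sb);	/* as vfs_iocb_iter_write() does */
	error = vfs_setxattr(idmap, dentry, name, value, size, flags);
	sb_end_write(dentry->d_sb);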

However, as it doesn't, it manages to construct a locking chain via the
mapping.invalidate_lock, the afs vnode->validate_lock and something in execve
that I don't exactly follow.


I wonder if this might be deadlockable by a multithreaded process (ie. so
they share the mm locks) where one thread is writing to a cached file whilst
another thread is trying to set/remove the xattr on that file.

David
Christian Brauner July 23, 2024, 3:40 p.m. UTC | #6
On Tue, Jul 23, 2024 at 02:57:46PM GMT, David Howells wrote:
> Jan Kara <jack@suse.cz> wrote:
> 
> > Well, it seems like you are trying to get rid of the dependency
> > sb_writers->mmap_sem. But there are other places where this dependency is
> > created, in particular write(2) path is a place where it would be very
> > difficult to get rid of it (you take sb_writers, then do all the work
> > preparing the write and then you copy user data into page cache which
> > may require mmap_sem).
> >
> > ...
> > 
> > This is the problematic step - from quite deep in the locking chain holding
> > invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
> > locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
> > problem because the locks are actually on different filesystems, just
> > lockdep isn't able to see this. So I don't think you will get rid of these
> > lockdep splats unless you somehow manage to convey to lockdep that there's
> > the "upper" fs (AFS in this case) and the "lower" fs (the one behind
> > cachefiles) and their locks are different.
> 
> I'm not sure you're correct about that.  If you look at the lockdep splat:
> 
> >  -> #2 (sb_writers#14){.+.+}-{0:0}:
> 
> The sb_writers lock is "personalised" to the filesystem type (the "#14"
> annotation) which is set here:
> 
> 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
> 		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
> 					sb_writers_name[i],
> 					&type->s_writers_key[i]))  <----
> 			goto fail;
> 	}
> 
> in fs/super.c.
> 
> I think the problem is (1) that on one side, you've got, say, sys_setxattr()
> taking an sb_writers lock and then accessing a userspace buffer, which (a) may
> take mm->mmap_lock and vma->vm_lock and (b) may cause reading or writeback
> from the netfs-based filesystem via an mmapped xattr name buffer.
> 
> Then (2) on the other side, you have a read or a write to the network
> filesystem through netfslib which may invoke the cache, which may require
> cachefiles to check the xattr on the cache file and maybe set/remove it -
> which requires the sb_writers lock on the cache filesystem.
> 
> So if ->read_folio(), ->readahead() or ->writepages() can ever be called with
> mm->mmap_lock or vma->vm_lock held, netfslib may call down to cachefiles and
> ultimately, it should[*] then take the sb_writers lock on the backing
> filesystem to perform xattr manipulation.
> 
> [*] I say "should" because at the moment cachefiles calls vfs_set/removexattr
>     functions which *don't* take this lock (which is a bug).  Is this an error
>     on the part of vfs_set/removexattr()?  Should they take this lock
>     analogously with vfs_truncate() and vfs_iocb_iter_write()?

It's not a bug per se. We have a few vfs_*() functions that don't take
write access iirc. The reason often is that e.g. callers like overlayfs
call vfs_*() helpers in various locations where write access to the
underlying and overlayfs layer is taken some time before calling into the
vfs_*() helper (see ovl_do_setxattr() during various copy up operations,
iirc).

Fwiw, I suspect that e.g., ecryptfs doesn't claim write access at all to
the underlying filesystem - not just xattrs.

(What would probably be good is if we could add lockdep asserts so that
we get warnings when a required lock wasn't taken.)
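
E.g. something along these lines (a sketch; the placement is illustrative):

	/* At the top of vfs_setxattr()/vfs_removexattr(): complain if the
	 * caller didn't take sb_writers.  sb_write_started() tests the
	 * SB_FREEZE_WRITE level; percpu_rwsem_assert_held() would be the
	 * lockdep-proper variant. */
	WARN_ON_ONCE(!sb_write_started(dentry->d_sb));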

> 
> However, as it doesn't, it manages to construct a locking chain via the
> mapping.invalidate_lock, the afs vnode->validate_lock and something in execve
> that I don't exactly follow.
> 
> 
> I wonder if this might be deadlockable by a multithreaded process (ie. so
> they share the mm locks) where one thread is writing to a cached file whilst
> another thread is trying to set/remove the xattr on that file.
> 
> David
>
Dave Chinner July 24, 2024, 1:34 a.m. UTC | #7
On Tue, Jul 23, 2024 at 12:45:33PM +0200, Jan Kara wrote:
> On Tue 23-07-24 09:59:54, David Howells wrote:
> >  ======================================================
> >  WARNING: possible circular locking dependency detected
> >  6.10.0-build2+ #956 Not tainted
> >  ------------------------------------------------------
> >  fsstress/6050 is trying to acquire lock:
> >  ffff888138fd82f0 (mapping.invalidate_lock#3){++++}-{3:3}, at: filemap_fault+0x26e/0x8b0
> > 
> >  but task is already holding lock:
> >  ffff888113f26d18 (&vma->vm_lock->lock){++++}-{3:3}, at: lock_vma_under_rcu+0x165/0x250
> > 
> >  which lock already depends on the new lock.
> > 
> >  the existing dependency chain (in reverse order) is:
> > 
> >  -> #4 (&vma->vm_lock->lock){++++}-{3:3}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         down_write+0x3b/0x50
> >         vma_start_write+0x6b/0xa0
> >         vma_link+0xcc/0x140
> >         insert_vm_struct+0xb7/0xf0
> >         alloc_bprm+0x2c1/0x390
> >         kernel_execve+0x65/0x1a0
> >         call_usermodehelper_exec_async+0x14d/0x190
> >         ret_from_fork+0x24/0x40
> >         ret_from_fork_asm+0x1a/0x30
> > 
> >  -> #3 (&mm->mmap_lock){++++}-{3:3}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         __might_fault+0x7c/0xb0
> >         strncpy_from_user+0x25/0x160
> >         removexattr+0x7f/0x100
> >         __do_sys_fremovexattr+0x7e/0xb0
> >         do_syscall_64+0x9f/0x100
> >         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > 
> >  -> #2 (sb_writers#14){.+.+}-{0:0}:
> >         __lock_acquire+0xaf0/0xd80
> >         lock_acquire.part.0+0x103/0x280
> >         percpu_down_read+0x3c/0x90
> >         vfs_iocb_iter_write+0xe9/0x1d0
> >         __cachefiles_write+0x367/0x430
> >         cachefiles_issue_write+0x299/0x2f0
> >         netfs_advance_write+0x117/0x140
> >         netfs_write_folio.isra.0+0x5ca/0x6e0
> >         netfs_writepages+0x230/0x2f0
> >         afs_writepages+0x4d/0x70
> >         do_writepages+0x1e8/0x3e0
> >         filemap_fdatawrite_wbc+0x84/0xa0
> >         __filemap_fdatawrite_range+0xa8/0xf0
> >         file_write_and_wait_range+0x59/0x90
> >         afs_release+0x10f/0x270
> >         __fput+0x25f/0x3d0
> >         __do_sys_close+0x43/0x70
> >         do_syscall_64+0x9f/0x100
> >         entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> This is the problematic step - from quite deep in the locking chain holding
> invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
> locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
> problem because the locks are actually on different filesystems, just
> lockdep isn't able to see this. So I don't think you will get rid of these
> lockdep splats unless you somehow manage to convey to lockdep that there's
> the "upper" fs (AFS in this case) and the "lower" fs (the one behind
> cachefiles) and their locks are different.

Actually, that can deadlock. We've just been over this with the NFS
localio proposal.  Network filesystem writeback that can recurse
into a local filesystem needs to be using (PF_LOCAL_THROTTLE |
PF_MEMALLOC_NOFS) for the writeback context.

This is to prevent the deadlocks on upper->lower->upper and
lower->upper->lower filesystem recursion via GFP_KERNEL memory
allocation and reclaim recursing between the two filesystems. This
is especially relevant for filesystems with ->writepage methods that
can be called from direct reclaim. Hence allocations in this path
need to be at least NOFS to prevent recursion back into the upper
filesystem from writeback into the lower filesystem.

Further, anywhere that dirty page writeback recurses into the front
end write path of a filesystem can deadlock in
balance_dirty_pages(). The upper filesystem can consume all the
dirty threshold and then the lower filesystem blocks trying to clean
dirty upper filesystem pages. Hence PF_LOCAL_THROTTLE is needed for
the lower filesystem IO to prevent it from being throttled when the
upper filesystem is throttled.
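
Roughly, marking the writeback context would look like this sketch
(memalloc_nofs_save()/restore() being the usual interface for the NOFS
part; saving/restoring the task flag properly is glossed over):

	unsigned int nofs_flags;

	current->flags |= PF_LOCAL_THROTTLE;	/* don't throttle against the upper fs */
	nofs_flags = memalloc_nofs_save();	/* no GFP_FS reclaim recursion */

	/* ... issue the writes to the lower (cache) filesystem ... */

	memalloc_nofs_restore(nofs_flags);
	current->flags &= ~PF_LOCAL_THROTTLE;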

-Dave.
Matthew Wilcox July 24, 2024, 2:42 a.m. UTC | #8
On Wed, Jul 24, 2024 at 11:34:57AM +1000, Dave Chinner wrote:
> This is to prevent the deadlocks on upper->lower->upper and
> lower->upper->lower filesystem recursion via GFP_KERNEL memory
> allocation and reclaim recursing between the two filesystems. This
> is especially relevant for filesystems with ->writepage methods that
> can be called from direct reclaim. Hence allocations in this path
> need to be at least NOFS to prevent recursion back into the upper
> filesystem from writeback into the lower filesystem.

FYI, we're making good progress on removing ->writepage.

$ git grep '\.writepage\>.*='
fs/ceph/addr.c: .writepage = ceph_writepage,
fs/ecryptfs/mmap.c:     .writepage = ecryptfs_writepage,
fs/f2fs/checkpoint.c:   .writepage      = f2fs_write_meta_page,
fs/f2fs/data.c: .writepage      = f2fs_write_data_page,
fs/f2fs/node.c: .writepage      = f2fs_write_node_page,
fs/gfs2/aops.c: .writepage = gfs2_jdata_writepage,
fs/gfs2/meta_io.c:      .writepage = gfs2_aspace_writepage,
fs/gfs2/meta_io.c:      .writepage = gfs2_aspace_writepage,
fs/hostfs/hostfs_kern.c:        .writepage      = hostfs_writepage,
fs/nilfs2/inode.c:      .writepage              = nilfs_writepage,
fs/nilfs2/mdt.c:        .writepage              = nilfs_mdt_write_page,
fs/orangefs/inode.c:    .writepage = orangefs_writepage,
fs/vboxsf/file.c:       .writepage = vboxsf_writepage,
mm/shmem.c:     .writepage      = shmem_writepage,
mm/swap_state.c:        .writepage      = swap_writepage,

so mostly just the usual stragglers.  I sent a series to fix up gfs2
earlier this week:
https://lore.kernel.org/linux-fsdevel/20240719175105.788253-1-willy@infradead.org/
David Howells July 24, 2024, 8:11 a.m. UTC | #9
Jan Kara <jack@suse.cz> wrote:

> > +	error = strncpy_from_user(kname, name, sizeof(kname));
> > +	if (error == 0 || error == sizeof(kname))
> > +		error = -ERANGE;
> > +	if (error < 0)
> > +		return error;
> 
> Missing fdput() here.

Thanks.  I think Christian is altering the patch to use auto-destruction
constructs (CLASS()).

David
Jan Kara July 24, 2024, 1:30 p.m. UTC | #10
On Tue 23-07-24 14:57:46, David Howells wrote:
> Jan Kara <jack@suse.cz> wrote:
> 
> > Well, it seems like you are trying to get rid of the dependency
> > sb_writers->mmap_sem. But there are other places where this dependency is
> > created, in particular write(2) path is a place where it would be very
> > difficult to get rid of it (you take sb_writers, then do all the work
> > preparing the write and then you copy user data into page cache which
> > may require mmap_sem).
> >
> > ...
> > 
> > This is the problematic step - from quite deep in the locking chain holding
> > invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
> > locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
> > problem because the locks are actually on different filesystems, just
> > lockdep isn't able to see this. So I don't think you will get rid of these
> > lockdep splats unless you somehow manage to convey to lockdep that there's
> > the "upper" fs (AFS in this case) and the "lower" fs (the one behind
> > cachefiles) and their locks are different.
> 
> I'm not sure you're correct about that.  If you look at the lockdep splat:
> 
> >  -> #2 (sb_writers#14){.+.+}-{0:0}:
> 
> The sb_writers lock is "personalised" to the filesystem type (the "#14"
> annotation) which is set here:
> 
> 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
> 		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
> 					sb_writers_name[i],
> 					&type->s_writers_key[i]))  <----
> 			goto fail;
> 	}
> 
> in fs/super.c.

Right, forgot about that. Thanks for the correction! So after pondering
this some more, this is actually a real problem and deadlockable. See
below.

> I think the problem is (1) that on one side, you've got, say, sys_setxattr()
> taking an sb_writers lock and then accessing a userspace buffer, which (a) may
> take mm->mmap_lock and vma->vm_lock and (b) may cause reading or writeback
> from the netfs-based filesystem via an mmapped xattr name buffer.

Yes, we agree on that. I was just pointing out that:

vfs_write()
  file_start_write() -> grabs sb_writers
  generic_file_write_iter()
    generic_perform_write()
      fault_in_iov_iter_readable()

is another path which enforces exactly the same lock ordering.

> Then (2) on the other side, you have a read or a write to the network
> filesystem through netfslib which may invoke the cache, which may require
> cachefiles to check the xattr on the cache file and maybe set/remove it -
> which requires the sb_writers lock on the cache filesystem.
> 
> So if ->read_folio(), ->readahead() or ->writepages() can ever be called with
> mm->mmap_lock or vma->vm_lock held, netfslib may call down to cachefiles and
> ultimately, it should[*] then take the sb_writers lock on the backing
> filesystem to perform xattr manipulation.

Well, ->read_folio() under mm->mmap_lock is a standard thing to happen in a
page fault. Now grabbing sb_writers (of any filesystem) in that path is
problematic and can deadlock though:

page fault
  grab mm->mmap_lock
  filemap_fault()
    if (unlikely(!folio_test_uptodate(folio))) {
      filemap_read_folio() on fs A
        now if you grab sb_writers on fs B:
			freeze_super() on fs B		write(2) on fs B
							  sb_start_write(fs B)
			  sb->s_writers.frozen = SB_FREEZE_WRITE;
			  sb_wait_write(sb, SB_FREEZE_WRITE);
			    - waits for write
	sb_start_write(fs B) - blocked behind freeze_super()
							  generic_perform_write()
							    fault_in_iov_iter_readable()
							      page fault
							        grab mm->mmap_lock
								  => deadlock

Now this is not the deadlock your lockdep trace is showing but it is a
similar one. Like:

filemap_invalidate() on fs A	freeze_super() on fs B	page fault on fs A	write(2) on fs B
  filemap_invalidate_lock()				  lock mm->mmap_lock	  sb_start_write(fs B)
  filemap_fdatawrite_wbc()				  filemap_fault()
    afs_writepages()					    filemap_invalidate_lock_shared()
      cachefiles_issue_write()				      => blocks behind filemap_invalidate()
				  sb->s_writers.frozen = SB_FREEZE_WRITE;
				  sb_wait_write(sb, SB_FREEZE_WRITE);
				    => blocks behind write(2)
        sb_start_write() on fs B
	  => blocks behind freeze_super()
							  			  generic_perform_write()
										    fault_in_iov_iter_readable()
										      page fault
										        grab mm->mmap_lock
											  => deadlock

So I still maintain that grabbing sb_start_write() from quite deep within
the locking hierarchy (like from writepages while holding page locks, but
even holding invalidate_lock is enough for problems) is problematic and
prone to deadlocks.

> [*] I say "should" because at the moment cachefiles calls vfs_set/removexattr
>     functions which *don't* take this lock (which is a bug).  Is this an error
>     on the part of vfs_set/removexattr()?  Should they take this lock
>     analogously with vfs_truncate() and vfs_iocb_iter_write()?
> 
> However, as it doesn't, it manages to construct a locking chain via the
> mapping.invalidate_lock, the afs vnode->validate_lock and something in execve
> that I don't exactly follow.
> 
> 
> I wonder if this might be deadlockable by a multithreaded process (ie. so
> they share the mm locks) where one thread is writing to a cached file whilst
> another thread is trying to set/remove the xattr on that file.

Yep, see above. Now the hard question is how to fix this because what you
are doing seems to be inherent in how cachefiles fs is designed to work.

								Honza
Christian Brauner July 29, 2024, 3:28 p.m. UTC | #11
On Wed, Jul 24, 2024 at 03:30:09PM GMT, Jan Kara wrote:
> On Tue 23-07-24 14:57:46, David Howells wrote:
> > Jan Kara <jack@suse.cz> wrote:
> > 
> > > Well, it seems like you are trying to get rid of the dependency
> > > sb_writers->mmap_sem. But there are other places where this dependency is
> > > created, in particular write(2) path is a place where it would be very
> > > difficult to get rid of it (you take sb_writers, then do all the work
> > > preparing the write and then you copy user data into page cache which
> > > may require mmap_sem).
> > >
> > > ...
> > > 
> > > This is the problematic step - from quite deep in the locking chain holding
> > > invalidate_lock and having PG_Writeback set you suddenly jump to a very outer
> > > locking context grabbing sb_writers. Now AFAICT this is not a real deadlock
> > > problem because the locks are actually on different filesystems, just
> > > lockdep isn't able to see this. So I don't think you will get rid of these
> > > lockdep splats unless you somehow manage to convey to lockdep that there's
> > > the "upper" fs (AFS in this case) and the "lower" fs (the one behind
> > > cachefiles) and their locks are different.
> > 
> > I'm not sure you're correct about that.  If you look at the lockdep splat:
> > 
> > >  -> #2 (sb_writers#14){.+.+}-{0:0}:
> > 
> > The sb_writers lock is "personalised" to the filesystem type (the "#14"
> > annotation) which is set here:
> > 
> > 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
> > 		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
> > 					sb_writers_name[i],
> > 					&type->s_writers_key[i]))  <----
> > 			goto fail;
> > 	}
> > 
> > in fs/super.c.
> 
> Right, forgot about that. Thanks for correction! So after pondering about
> this some more, this is actually a real problem and deadlockable. See
> below.
> 
> > I think the problem is (1) that on one side, you've got, say, sys_setxattr()
> > taking an sb_writers lock and then accessing a userspace buffer, which (a) may
> > take mm->mmap_lock and vma->vm_lock and (b) may cause reading or writeback
> > from the netfs-based filesystem via an mmapped xattr name buffer.
> 
> Yes, we agree on that. I was just pointing out that:
> 
> vfs_write()
>   file_start_write() -> grabs sb_writers
>   generic_file_write_iter()
>     generic_perform_write()
>       fault_in_iov_iter_readable()
> 
> is another path which enforces exactly the same lock ordering.
> 
> > Then (2) on the other side, you have a read or a write to the network
> > filesystem through netfslib which may invoke the cache, which may require
> > cachefiles to check the xattr on the cache file and maybe set/remove it -
> > which requires the sb_writers lock on the cache filesystem.
> > 
> > So if ->read_folio(), ->readahead() or ->writepages() can ever be called with
> > mm->mmap_lock or vma->vm_lock held, netfslib may call down to cachefiles and
> > ultimately, it should[*] then take the sb_writers lock on the backing
> > filesystem to perform xattr manipulation.
> 
> Well, ->read_folio() under mm->mmap_lock is a standard thing to happen in a
> page fault. Now grabbing sb_writers (of any filesystem) in that path is
> problematic and can deadlock though:
> 
> page fault
>   grab mm->mmap_lock
>   filemap_fault()
>     if (unlikely(!folio_test_uptodate(folio))) {
>       filemap_read_folio() on fs A
>         now if you grab sb_writers on fs B:
> 			freeze_super() on fs B		write(2) on fs B
> 							  sb_start_write(fs B)
> 			  sb->s_writers.frozen = SB_FREEZE_WRITE;
> 			  sb_wait_write(sb, SB_FREEZE_WRITE);
> 			    - waits for write
> 	sb_start_write(fs B) - blocked behind freeze_super()
> 							  generic_perform_write()
> 							    fault_in_iov_iter_readable()
> 							      page fault
> 							        grab mm->mmap_lock
> 								  => deadlock
> 
> Now this is not the deadlock your lockdep trace is showing but it is a
> similar one. Like:
> 
> filemap_invalidate() on fs A	freeze_super() on fs B	page fault on fs A	write(2) on fs B
>   filemap_invalidate_lock()				  lock mm->mmap_lock	  sb_start_write(fs B)
>   filemap_fdatawrite_wbc()				  filemap_fault()
>     afs_writepages()					    filemap_invalidate_lock_shared()
>       cachefiles_issue_write()				      => blocks behind filemap_invalidate()
> 				  sb->s_writers.frozen = SB_FREEZE_WRITE;
> 				  sb_wait_write(sb, SB_FREEZE_WRITE);
> 				    => blocks behind write(2)
>         sb_start_write() on fs B
> 	  => blocks behind freeze_super()
> 							  			  generic_perform_write()
> 										    fault_in_iov_iter_readable()
> 										      page fault
> 										        grab mm->mmap_lock
> 											  => deadlock
> 
> So I still maintain that grabbing sb_start_write() from quite deep within
> locking hierarchy (like from writepages when having pages locked, but even
> holding invalidate_lock is enough for the problems) is problematic and
> prone to deadlocks.
> 
> > [*] I say "should" because at the moment cachefiles calls vfs_set/removexattr
> >     functions which *don't* take this lock (which is a bug).  Is this an error
> >     on the part of vfs_set/removexattr()?  Should they take this lock
> >     analogously with vfs_truncate() and vfs_iocb_iter_write()?
> > 
> > However, as it doesn't, it manages to construct a locking chain via the
> > mapping.invalidate_lock, the afs vnode->validate_lock and something in execve
> > that I don't exactly follow.
> > 
> > 
> > I wonder if this might be deadlockable by a multithreaded process (ie. so
> > they share the mm locks) where one thread is writing to a cached file whilst
> > another thread is trying to set/remove the xattr on that file.
> 
> Yep, see above. Now the hard question is how to fix this because what you
> are doing seems to be inherent in how cachefiles fs is designed to work.

So one idea may be to create a private mount for cachefiles, claim write
access when that private mount is created, and retain that write access for
the duration of cachefiles being run. See my draft patch below.

Right now things are as follows (roughly). Say cachefiles uses a directory
on the rootfs, e.g. /mnt/cache-dir/; then cache->mnt points to /.

With my patch this becomes:

mnt["/mnt/cache-dir"] = clone_private_mount("/mnt/cache-dir");

so cache->mnt points to "/mnt/cache-dir". That shouldn't be an issue though.

The more interesting changes in behavior come from mount properties. For
example, when

mount --bind /mnt/cache-dir /opt/

is created and /opt is used for cachefiles. When the bind-mount of
/mnt/cache-dir at /opt is made read-only it becomes read-only for cachefiles as
well.

But with an internal private mount that's no longer true. The internal mount
cannot be mounted read-only from userspace. Arguably, that would have been the
correct semantics right from the start similar to what overlayfs is doing.

However, the bigger semantic change would come from claiming write access when
the private mount is created. Claiming write access for as long as cachefiles
is running means that the filesystem that is used cannot be remounted
read-only because mnt_hold_writers() would tell you to get lost. That
might be ok though. It is just something that should only be done with
capable(CAP_SYS_ADMIN), which cachefiles requires anyway.

I think the bigger issue might be freezing. Not because of the type of
operation but because freeze_super() would wait in sb_wait_write(sb,
SB_FREEZE_WRITE) until all writes finish and obviously they won't if
cachefiles just keeps running. That means you might hang forever.

One way to get around this last issue might be to introduce
SB_FREEZE_WRITE_LONGTERM which cachefiles can use (might even be able to
do that without a private mount then) and have freeze return with an
error in case long-term write access is claimed.
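
Purely hypothetical sketch of that idea (neither the level nor the helper
exists today):

	/* In freeze_super(), before waiting for writers to drain: */
	if (sb_longterm_writers(sb))	/* hypothetical: cachefiles holds one */
		return -EBUSY;		/* refuse to freeze rather than hang */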

So anyway, trade-offs to be considered here. I just thought I'd throw this
out as a potential solution.

/* draft, sketch */

diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 9fb06dc16520..acbb5b95a9d1 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -20,6 +20,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache)
        struct path path;
        struct kstatfs stats;
        struct dentry *graveyard, *cachedir, *root;
+       struct vfsmount *mnt;
        const struct cred *saved_cred;
        int ret;

@@ -41,6 +42,20 @@ int cachefiles_add_cache(struct cachefiles_cache *cache)
        if (ret < 0)
                goto error_open_root;

+       mnt = clone_private_mount(&path);
+       if (IS_ERR(mnt)) {
+               path_put(&path);
+               goto error_unsupported;
+       }
+
+       ret = mnt_want_write(mnt);
+       if (ret) {
+               mntput(mnt);
+               path_put(&path);
+               goto error_unsupported;
+       }
+
+       mntput(path.mnt);
        cache->mnt = path.mnt;
        root = path.dentry;

diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 89b11336a836..c3f0a5e6bdb5 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)

        cachefiles_put_directory(cache->graveyard);
        cachefiles_put_directory(cache->store);
+       mnt_drop_write(cache->mnt);
        mntput(cache->mnt);
        put_cred(cache->cache_cred);
Jan Kara July 31, 2024, 6:16 p.m. UTC | #12
On Mon 29-07-24 17:28:22, Christian Brauner wrote:
> On Wed, Jul 24, 2024 at 03:30:09PM GMT, Jan Kara wrote:
> > On Tue 23-07-24 14:57:46, David Howells wrote:
> > > Then (2) on the other side, you have a read or a write to the network
> > > filesystem through netfslib which may invoke the cache, which may require
> > > cachefiles to check the xattr on the cache file and maybe set/remove it -
> > > which requires the sb_writers lock on the cache filesystem.
> > > 
> > > So if ->read_folio(), ->readahead() or ->writepages() can ever be called with
> > > mm->mmap_lock or vma->vm_lock held, netfslib may call down to cachefiles and
> > > ultimately, it should[*] then take the sb_writers lock on the backing
> > > filesystem to perform xattr manipulation.
> > 
> > Well, ->read_folio() under mm->mmap_lock is a standard thing to happen in a
> > page fault. Now grabbing sb_writers (of any filesystem) in that path is
> > problematic and can deadlock though:
> > 
> > page fault
> >   grab mm->mmap_lock
> >   filemap_fault()
> >     if (unlikely(!folio_test_uptodate(folio))) {
> >       filemap_read_folio() on fs A
> >         now if you grab sb_writers on fs B:
> > 			freeze_super() on fs B		write(2) on fs B
> > 							  sb_start_write(fs B)
> > 			  sb->s_writers.frozen = SB_FREEZE_WRITE;
> > 			  sb_wait_write(sb, SB_FREEZE_WRITE);
> > 			    - waits for write
> > 	sb_start_write(fs B) - blocked behind freeze_super()
> > 							  generic_perform_write()
> > 							    fault_in_iov_iter_readable()
> > 							      page fault
> > 							        grab mm->mmap_lock
> > 								  => deadlock
> > 
> > Now this is not the deadlock your lockdep trace is showing but it is a
> > similar one. Like:
> > 
> > filemap_invalidate() on fs A	freeze_super() on fs B	page fault on fs A	write(2) on fs B
> >   filemap_invalidate_lock()				  lock mm->mmap_lock	  sb_start_write(fs B)
> >   filemap_fdatawrite_wbc()				  filemap_fault()
> >     afs_writepages()					    filemap_invalidate_lock_shared()
> >       cachefiles_issue_write()				      => blocks behind filemap_invalidate()
> > 				  sb->s_writers.frozen = SB_FREEZE_WRITE;
> > 				  sb_wait_write(sb, SB_FREEZE_WRITE);
> > 				    => blocks behind write(2)
> >         sb_start_write() on fs B
> > 	  => blocks behind freeze_super()
> > 							  			  generic_perform_write()
> > 										    fault_in_iov_iter_readable()
> > 										      page fault
> > 										        grab mm->mmap_lock
> > 											  => deadlock
> > 
> > So I still maintain that grabbing sb_start_write() from quite deep within
> > locking hierarchy (like from writepages when having pages locked, but even
> > holding invalidate_lock is enough for the problems) is problematic and
> > prone to deadlocks.
> > 
> > > [*] I say "should" because at the moment cachefiles calls vfs_set/removexattr
> > >     functions which *don't* take this lock (which is a bug).  Is this an error
> > >     on the part of vfs_set/removexattr()?  Should they take this lock
> > >     analogously with vfs_truncate() and vfs_iocb_iter_write()?
> > > 
> > > However, as it doesn't, it manages to construct a locking chain via the
> > > mapping.invalidate_lock, the afs vnode->validate_lock and something in execve
> > > that I don't exactly follow.
> > > 
> > > 
> > > I wonder if this might be deadlockable by a multithreaded process (ie. so
> > > they share the mm locks) where one thread is writing to a cached file whilst
> > > another thread is trying to set/remove the xattr on that file.
> > 
> > Yep, see above. Now the hard question is how to fix this because what you
> > are doing seems to be inherent in how cachefiles fs is designed to work.
> 
> So one idea may be to create a private mount for cachefiles and then claim
> write access when that private mount is created and retaining that write access
> for the duration of cachefiles being run. See my draft patch below.

OK, that would deal with one problem but the fundamental knot in this coil
is mmap_lock. I don't see the exact call stack that gets us to xattr code
but I can see e.g.:

filemap_invalidate() - grabs invalidate_lock on AFS inode
  afs_writepages
    netfs_writepages
      netfs_write_folio
        cachefiles_issue_write
          vfs_fallocate()
	    - grabs i_rwsem on backing fs inode

Which again is taking locks out of order. That would be no problem because
these are on different filesystems (AFS vs the backing fs), but if you have
a process with two threads, one doing a page fault on AFS and another doing
a write(2) to the backing fs, their mmap_lock will tie the locking
hierarchies on these two filesystems together. Like:

filemap_invalidate() on fs A	page fault on fs A	write(2) on fs B
  filemap_invalidate_lock()	  lock mm->mmap_lock
  filemap_fdatawrite_wbc()	  filemap_fault()
    afs_writepages()		    filemap_invalidate_lock_shared()
      cachefiles_issue_write()	      => blocks behind filemap_invalidate()
        vfs_fallocate() on fs B
							  inode_lock(inode on fs B)
				  			  generic_perform_write()
							    fault_in_iov_iter_readable()
							      page fault
							        grab mm->mmap_lock
								  => blocks behind page fault
	  inode_lock(inode on fs B)
	    => deadlock

So I don't think there's an easy way to completely avoid these deadlocks.
Realistically, I don't think a process taking a page fault on the upper fs
while doing a write on the lower fs is that common, but it's certainly a DoS
vector. It's also annoying because of the lockdep splats in testing and the
resulting inability to find other locking issues (as lockdep gets
disabled).

To fix this, either we'd have to keep the lower cache filesystem private to
cachefiles (but I don't think that works with the usecases) or we have to
somehow untangle this mmap_lock knot. This "page fault does quite some fs
locking under mmap_lock" problem has caused filesystems headaches before.
I would *love* to be able to always drop mmap_lock in the page fault
handler, fill the data into the page cache and then retry the fault (so
that filemap_map_pages() would then handle the fault without filesystem
involvement). It would make many things in filesystem locking simpler. As
far as I can tell, there are now not that many places that could not handle
dropping of mmap_lock during a fault (traditionally the problem is with
get_user_pages() / pin_user_pages() users). So maybe this dream would be
feasible after all.
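
The per-fault machinery for this already exists, along the lines of
maybe_unlock_mmap_for_io() in mm/filemap.c (condensed sketch):

	struct file *fpin = NULL;

	if (fault_flag_allow_retry_first(vmf->flags) &&
	    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
		fpin = get_file(vmf->vma->vm_file);	/* keep the file pinned */
		release_fault_lock(vmf);		/* drop the mmap/per-VMA lock */
		/* ... start the I/O against fpin, then ... */
		return VM_FAULT_RETRY;			/* the caller retries the fault */
	}

The dream would be making that the rule rather than the exception.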

								Honza
Matthew Wilcox July 31, 2024, 6:27 p.m. UTC | #13
On Wed, Jul 31, 2024 at 08:16:57PM +0200, Jan Kara wrote:
> To fix this, either we'd have to keep the lower cache filesystem private to
> cachefiles (but I don't think that works with the usecases) or we have to
> somehow untangle this mmap_lock knot. This "page fault does quite some fs
> locking under mmap_lock" problem has caused filesystems headaches before.
> I would *love* to be able to always drop mmap_lock in the page fault
> handler, fill the data into the page cache and then retry the fault (so
> that filemap_map_pages() would then handle the fault without filesystem
> involvement). It would make many things in filesystem locking simpler. As
> far as I can tell, there are now not that many places that could not handle
> dropping of mmap_lock during a fault (traditionally the problem is with
> get_user_pages() / pin_user_pages() users). So maybe this dream would be
> feasible after all.

The traditional problem was the array of VMAs which was removed in
commit b2cac248191b -- if we dropped the mmap_lock, any previous
entries in that array would become invalid.  Now that array is gone,
do we have any remaining dependencies on the VMAs remaining valid?
Jan Kara July 31, 2024, 6:55 p.m. UTC | #14
On Wed 31-07-24 19:27:43, Matthew Wilcox wrote:
> On Wed, Jul 31, 2024 at 08:16:57PM +0200, Jan Kara wrote:
> > To fix this, either we'd have to keep the lower cache filesystem private to
> > cachefiles (but I don't think that works with the usecases) or we have to
> > somehow untangle this mmap_lock knot. This "page fault does quite some fs
> > locking under mmap_lock" problem has caused filesystems headaches before.
> > I would *love* to be able to always drop mmap_lock in the page fault
> > handler, fill the data into the page cache and then retry the fault (so
> > that filemap_map_pages() would then handle the fault without filesystem
> > involvement). It would make many things in filesystem locking simpler. As
> > far as I can tell, there are now not that many places that could not handle
> > dropping of mmap_lock during a fault (traditionally the problem is with
> > get_user_pages() / pin_user_pages() users). So maybe this dream would be
> > feasible after all.
> 
> The traditional problem was the array of VMAs which was removed in
> commit b2cac248191b -- if we dropped the mmap_lock, any previous
> entries in that array would become invalid.  Now that array is gone,
> do we have any remaining dependencies on the VMAs remaining valid?

So as far as I've checked the callers of get_user_pages() /
pin_user_pages(), I didn't find any that fundamentally could not handle
dropping of mmap_lock. So at least for the callers I've seen, it was mostly
about teaching them to handle a dropped mmap_lock: reacquire it and
possibly re-establish some state which could get invalidated while the
mmap_lock was dropped.

								Honza

Patch

diff --git a/fs/xattr.c b/fs/xattr.c
index f8b643f91a98..ab46c5bd168f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -630,10 +630,9 @@  int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
 			ctx->kvalue, ctx->size, ctx->flags);
 }
 
-static long
-setxattr(struct mnt_idmap *idmap, struct dentry *d,
-	const char __user *name, const void __user *value, size_t size,
-	int flags)
+static int path_setxattr(const char __user *pathname,
+			 const char __user *name, const void __user *value,
+			 size_t size, int flags, unsigned int lookup_flags)
 {
 	struct xattr_name kname;
 	struct xattr_ctx ctx = {
@@ -643,33 +642,20 @@  setxattr(struct mnt_idmap *idmap, struct dentry *d,
 		.kname    = &kname,
 		.flags    = flags,
 	};
+	struct path path;
 	int error;
 
 	error = setxattr_copy(name, &ctx);
 	if (error)
 		return error;
 
-	error = do_setxattr(idmap, d, &ctx);
-
-	kvfree(ctx.kvalue);
-	return error;
-}
-
-static int path_setxattr(const char __user *pathname,
-			 const char __user *name, const void __user *value,
-			 size_t size, int flags, unsigned int lookup_flags)
-{
-	struct path path;
-	int error;
-
 retry:
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
-		return error;
+		goto out;
 	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = setxattr(mnt_idmap(path.mnt), path.dentry, name,
-				 value, size, flags);
+		error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx);
 		mnt_drop_write(path.mnt);
 	}
 	path_put(&path);
@@ -677,6 +663,9 @@  static int path_setxattr(const char __user *pathname,
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
 	}
+
+out:
+	kvfree(ctx.kvalue);
 	return error;
 }
 
@@ -697,19 +686,32 @@  SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 		const void __user *,value, size_t, size, int, flags)
 {
+	struct xattr_name kname;
+	struct xattr_ctx ctx = {
+		.cvalue   = value,
+		.kvalue   = NULL,
+		.size     = size,
+		.kname    = &kname,
+		.flags    = flags,
+	};
 	struct fd f = fdget(fd);
-	int error = -EBADF;
+	int error;
 
 	if (!f.file)
-		return error;
+		return -EBADF;
 	audit_file(f.file);
+	error = setxattr_copy(name, &ctx);
+	if (error)
+		goto out_f;
+
 	error = mnt_want_write_file(f.file);
 	if (!error) {
-		error = setxattr(file_mnt_idmap(f.file),
-				 f.file->f_path.dentry, name,
-				 value, size, flags);
+		error = do_setxattr(file_mnt_idmap(f.file),
+				    f.file->f_path.dentry, &ctx);
 		mnt_drop_write_file(f.file);
 	}
+	kvfree(ctx.kvalue);
+out_f:
 	fdput(f);
 	return error;
 }
@@ -899,9 +901,17 @@  SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
  * Extended attribute REMOVE operations
  */
 static long
-removexattr(struct mnt_idmap *idmap, struct dentry *d,
-	    const char __user *name)
+removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name)
 {
+	if (is_posix_acl_xattr(name))
+		return vfs_remove_acl(idmap, d, name);
+	return vfs_removexattr(idmap, d, name);
+}
+
+static int path_removexattr(const char __user *pathname,
+			    const char __user *name, unsigned int lookup_flags)
+{
+	struct path path;
 	int error;
 	char kname[XATTR_NAME_MAX + 1];
 
@@ -910,25 +920,13 @@  removexattr(struct mnt_idmap *idmap, struct dentry *d,
 		error = -ERANGE;
 	if (error < 0)
 		return error;
-
-	if (is_posix_acl_xattr(kname))
-		return vfs_remove_acl(idmap, d, kname);
-
-	return vfs_removexattr(idmap, d, kname);
-}
-
-static int path_removexattr(const char __user *pathname,
-			    const char __user *name, unsigned int lookup_flags)
-{
-	struct path path;
-	int error;
 retry:
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
 		return error;
 	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = removexattr(mnt_idmap(path.mnt), path.dentry, name);
+		error = removexattr(mnt_idmap(path.mnt), path.dentry, kname);
 		mnt_drop_write(path.mnt);
 	}
 	path_put(&path);
@@ -954,15 +952,23 @@  SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
 	struct fd f = fdget(fd);
+	char kname[XATTR_NAME_MAX + 1];
 	int error = -EBADF;
 
 	if (!f.file)
 		return error;
 	audit_file(f.file);
+
+	error = strncpy_from_user(kname, name, sizeof(kname));
+	if (error == 0 || error == sizeof(kname))
+		error = -ERANGE;
+	if (error < 0)
+		return error;
+
 	error = mnt_want_write_file(f.file);
 	if (!error) {
 		error = removexattr(file_mnt_idmap(f.file),
-				    f.file->f_path.dentry, name);
+				    f.file->f_path.dentry, kname);
 		mnt_drop_write_file(f.file);
 	}
 	fdput(f);