Message ID | 20210525135100.11221-7-jack@suse.cz (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | fs: Hole punch vs page cache filling races | expand |
On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > purpose of invalidate_lock is exactly the same. Note that the locking in > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > invalidate_lock. > > Reviewed-by: Christoph Hellwig <hch@lst.de> > CC: <linux-xfs@vger.kernel.org> > CC: "Darrick J. Wong" <darrick.wong@oracle.com> It's djwong@kernel.org now. > Signed-off-by: Jan Kara <jack@suse.cz> > --- > fs/xfs/xfs_file.c | 12 ++++++----- > fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++-------------------- > fs/xfs/xfs_inode.h | 1 - > fs/xfs/xfs_super.c | 2 -- > 4 files changed, 36 insertions(+), 31 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 396ef36dcd0a..dc9cb5c20549 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1282,7 +1282,7 @@ xfs_file_llseek( > * > * mmap_lock (MM) > * sb_start_pagefault(vfs, freeze) > - * i_mmaplock (XFS - truncate serialisation) > + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) > * page_lock (MM) > * i_lock (XFS - extent map serialisation) > */ > @@ -1303,24 +1303,26 @@ __xfs_filemap_fault( > file_update_time(vmf->vma->vm_file); > } > > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > if (IS_DAX(inode)) { > pfn_t pfn; > > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, > (write_fault && !vmf->cow_page) ? 
> &xfs_direct_write_iomap_ops : > &xfs_read_iomap_ops); > if (ret & VM_FAULT_NEEDDSYNC) > ret = dax_finish_sync_fault(vmf, pe_size, pfn); > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > } else { > - if (write_fault) > + if (write_fault) { > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > ret = iomap_page_mkwrite(vmf, > &xfs_buffered_write_iomap_ops); > - else > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > + } else > ret = filemap_fault(vmf); > } > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > if (write_fault) > sb_end_pagefault(inode->i_sb); > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 0369eb22c1bb..53bb5fc33621 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -131,7 +131,7 @@ xfs_ilock_attr_map_shared( > > /* > * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 > - * multi-reader locks: i_mmap_lock and the i_lock. This routine allows > + * multi-reader locks: invalidate_lock and the i_lock. This routine allows > * various combinations of the locks to be obtained. > * > * The 3 locks should always be ordered so that the IO lock is obtained first, > @@ -139,23 +139,23 @@ xfs_ilock_attr_map_shared( > * > * Basic locking order: > * > - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock > + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock > * > * mmap_lock locking order: > * > * i_rwsem -> page lock -> mmap_lock > - * mmap_lock -> i_mmap_lock -> page_lock > + * mmap_lock -> invalidate_lock -> page_lock > * > * The difference in mmap_lock locking order mean that we cannot hold the > - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can > - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock > - * in get_user_pages() to map the user pages into the kernel address space for > - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because > - * page faults already hold the mmap_lock. 
> + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths > + * can fault in pages during copy in/out (for buffered IO) or require the > + * mmap_lock in get_user_pages() to map the user pages into the kernel address > + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page > + * fault because page faults already hold the mmap_lock. > * > * Hence to serialise fully against both syscall and mmap based IO, we need to > - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both > - * taken in places where we need to invalidate the page cache in a race > + * take both the i_rwsem and the invalidate_lock. These locks should *only* be > + * both taken in places where we need to invalidate the page cache in a race > * free manner (e.g. truncate, hole punch and other extent manipulation > * functions). > */ > @@ -187,10 +187,13 @@ xfs_ilock( > XFS_IOLOCK_DEP(lock_flags)); > } > > - if (lock_flags & XFS_MMAPLOCK_EXCL) > - mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); > - else if (lock_flags & XFS_MMAPLOCK_SHARED) > - mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); > + if (lock_flags & XFS_MMAPLOCK_EXCL) { > + down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock, > + XFS_MMAPLOCK_DEP(lock_flags)); > + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { > + down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock, > + XFS_MMAPLOCK_DEP(lock_flags)); > + } > > if (lock_flags & XFS_ILOCK_EXCL) > mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); > @@ -239,10 +242,10 @@ xfs_ilock_nowait( > } > > if (lock_flags & XFS_MMAPLOCK_EXCL) { > - if (!mrtryupdate(&ip->i_mmaplock)) > + if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) > goto out_undo_iolock; > } else if (lock_flags & XFS_MMAPLOCK_SHARED) { > - if (!mrtryaccess(&ip->i_mmaplock)) > + if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) > goto out_undo_iolock; > } > > @@ -257,9 +260,9 @@ 
xfs_ilock_nowait( > > out_undo_mmaplock: > if (lock_flags & XFS_MMAPLOCK_EXCL) > - mrunlock_excl(&ip->i_mmaplock); > + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); > else if (lock_flags & XFS_MMAPLOCK_SHARED) > - mrunlock_shared(&ip->i_mmaplock); > + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); > out_undo_iolock: > if (lock_flags & XFS_IOLOCK_EXCL) > up_write(&VFS_I(ip)->i_rwsem); > @@ -306,9 +309,9 @@ xfs_iunlock( > up_read(&VFS_I(ip)->i_rwsem); > > if (lock_flags & XFS_MMAPLOCK_EXCL) > - mrunlock_excl(&ip->i_mmaplock); > + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); > else if (lock_flags & XFS_MMAPLOCK_SHARED) > - mrunlock_shared(&ip->i_mmaplock); > + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); > > if (lock_flags & XFS_ILOCK_EXCL) > mrunlock_excl(&ip->i_lock); > @@ -334,7 +337,7 @@ xfs_ilock_demote( > if (lock_flags & XFS_ILOCK_EXCL) > mrdemote(&ip->i_lock); > if (lock_flags & XFS_MMAPLOCK_EXCL) > - mrdemote(&ip->i_mmaplock); > + downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); > if (lock_flags & XFS_IOLOCK_EXCL) > downgrade_write(&VFS_I(ip)->i_rwsem); > > @@ -355,8 +358,11 @@ xfs_isilocked( > > if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { > if (!(lock_flags & XFS_MMAPLOCK_SHARED)) > - return !!ip->i_mmaplock.mr_writer; > - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); > + return !debug_locks || > + lockdep_is_held_type( > + &VFS_I(ip)->i_mapping->invalidate_lock, > + 0); > + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); This doesn't look right... If lockdep is disabled, we always return true for xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock? Granted, you probably just copy-pasted from the IOLOCK_SHARED clause beneath it. Er... oh right, preichl was messing with all that... https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/ I guess I'll go have a look at that again. 
--D > } > > if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index ca826cfba91c..a0e4153efbbe 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -40,7 +40,6 @@ typedef struct xfs_inode { > /* Transaction and locking information. */ > struct xfs_inode_log_item *i_itemp; /* logging information */ > mrlock_t i_lock; /* inode lock */ > - mrlock_t i_mmaplock; /* inode mmap IO lock */ > atomic_t i_pincount; /* inode pin count */ > > /* > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index a2dab05332ac..eeaf44910b5f 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -715,8 +715,6 @@ xfs_fs_inode_init_once( > atomic_set(&ip->i_pincount, 0); > spin_lock_init(&ip->i_flags_lock); > > - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, > - "xfsino", ip->i_ino); > mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, > "xfsino", ip->i_ino); > } > -- > 2.26.2 >
On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > purpose of invalidate_lock is exactly the same. Note that the locking in > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > invalidate_lock. > > Reviewed-by: Christoph Hellwig <hch@lst.de> > CC: <linux-xfs@vger.kernel.org> > CC: "Darrick J. Wong" <darrick.wong@oracle.com> > Signed-off-by: Jan Kara <jack@suse.cz> > --- > fs/xfs/xfs_file.c | 12 ++++++----- > fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++-------------------- > fs/xfs/xfs_inode.h | 1 - > fs/xfs/xfs_super.c | 2 -- > 4 files changed, 36 insertions(+), 31 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 396ef36dcd0a..dc9cb5c20549 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1282,7 +1282,7 @@ xfs_file_llseek( > * > * mmap_lock (MM) > * sb_start_pagefault(vfs, freeze) > - * i_mmaplock (XFS - truncate serialisation) > + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) > * page_lock (MM) > * i_lock (XFS - extent map serialisation) > */ > @@ -1303,24 +1303,26 @@ __xfs_filemap_fault( > file_update_time(vmf->vma->vm_file); > } > > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > if (IS_DAX(inode)) { > pfn_t pfn; > > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, > (write_fault && !vmf->cow_page) ? > &xfs_direct_write_iomap_ops : > &xfs_read_iomap_ops); > if (ret & VM_FAULT_NEEDDSYNC) > ret = dax_finish_sync_fault(vmf, pe_size, pfn); > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > } else { > - if (write_fault) > + if (write_fault) { > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > ret = iomap_page_mkwrite(vmf, > &xfs_buffered_write_iomap_ops); > - else > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > + } else > ret = filemap_fault(vmf); > } > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); This seems kinda messy. 
filemap_fault() basically takes the invalidate lock around the entire operation it runs, so maybe it would be cleaner to implement it as: filemap_fault_locked(vmf) { /* does the filemap fault work */ } filemap_fault(vmf) { filemap_invalidate_down_read(...) ret = filemap_fault_locked(vmf) filemap_invalidate_up_read(...) return ret; } And that means XFS could just call filemap_fault_locked() and not have to do all this messy locking just to avoid holding the lock that filemap_fault has now internalised. > @@ -355,8 +358,11 @@ xfs_isilocked( > > if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { > if (!(lock_flags & XFS_MMAPLOCK_SHARED)) > - return !!ip->i_mmaplock.mr_writer; > - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); > + return !debug_locks || > + lockdep_is_held_type( > + &VFS_I(ip)->i_mapping->invalidate_lock, > + 0); > + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); > } <sigh> And so here we are again, losing more of our read vs write debug checks on debug kernels when lockdep is not enabled.... Can we please add rwsem_is_locked_read() and rwsem_is_locked_write() wrappers that just look at the rwsem counter value to determine how the lock is held? Then the mrlock_t can go away entirely.... Cheers, Dave.
On Tue 25-05-21 14:37:29, Darrick J. Wong wrote: > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > > purpose of invalidate_lock is exactly the same. Note that the locking in > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > > invalidate_lock. > > > > Reviewed-by: Christoph Hellwig <hch@lst.de> > > CC: <linux-xfs@vger.kernel.org> > > CC: "Darrick J. Wong" <darrick.wong@oracle.com> > > It's djwong@kernel.org now. OK, updated. > > @@ -355,8 +358,11 @@ xfs_isilocked( > > > > if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { > > if (!(lock_flags & XFS_MMAPLOCK_SHARED)) > > - return !!ip->i_mmaplock.mr_writer; > > - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); > > + return !debug_locks || > > + lockdep_is_held_type( > > + &VFS_I(ip)->i_mapping->invalidate_lock, > > + 0); > > + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); > > This doesn't look right... > > If lockdep is disabled, we always return true for > xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock? > > Granted, you probably just copy-pasted from the IOLOCK_SHARED clause > beneath it. Er... oh right, preichl was messing with all that... > > https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/ Indeed copy-paste programming ;) It certainly makes the assertions happy but useless. Should I pull the patch you reference into the series? It seems to have been uncontroversial and reviewed. Or will you pull the series to xfs tree so I can just rebase on top? Honza
On Wed 26-05-21 07:40:41, Dave Chinner wrote: > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > > purpose of invalidate_lock is exactly the same. Note that the locking in > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > > invalidate_lock. > > > > Reviewed-by: Christoph Hellwig <hch@lst.de> > > CC: <linux-xfs@vger.kernel.org> > > CC: "Darrick J. Wong" <darrick.wong@oracle.com> > > Signed-off-by: Jan Kara <jack@suse.cz> > > --- > > fs/xfs/xfs_file.c | 12 ++++++----- > > fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++-------------------- > > fs/xfs/xfs_inode.h | 1 - > > fs/xfs/xfs_super.c | 2 -- > > 4 files changed, 36 insertions(+), 31 deletions(-) > > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > index 396ef36dcd0a..dc9cb5c20549 100644 > > --- a/fs/xfs/xfs_file.c > > +++ b/fs/xfs/xfs_file.c > > @@ -1282,7 +1282,7 @@ xfs_file_llseek( > > * > > * mmap_lock (MM) > > * sb_start_pagefault(vfs, freeze) > > - * i_mmaplock (XFS - truncate serialisation) > > + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) > > * page_lock (MM) > > * i_lock (XFS - extent map serialisation) > > */ > > @@ -1303,24 +1303,26 @@ __xfs_filemap_fault( > > file_update_time(vmf->vma->vm_file); > > } > > > > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > if (IS_DAX(inode)) { > > pfn_t pfn; > > > > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, > > (write_fault && !vmf->cow_page) ? 
> > &xfs_direct_write_iomap_ops : > > &xfs_read_iomap_ops); > > if (ret & VM_FAULT_NEEDDSYNC) > > ret = dax_finish_sync_fault(vmf, pe_size, pfn); > > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > } else { > > - if (write_fault) > > + if (write_fault) { > > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > ret = iomap_page_mkwrite(vmf, > > &xfs_buffered_write_iomap_ops); > > - else > > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > + } else > > ret = filemap_fault(vmf); > > } > > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > This seems kinda messy. filemap_fault() basically takes the > invalidate lock around the entire operation, it runs, so maybe it > would be cleaner to implement it as: > > filemap_fault_locked(vmf) > { > /* does the filemap fault work */ > } > > filemap_fault(vmf) > { > filemap_invalidate_down_read(...) > ret = filemap_fault_locked(vmf) > filemap_invalidate_up_read(...) > return ret; > } > > And that means XFS could just call filemap_fault_locked() and not > have to do all this messy locking just to avoid holding the lock > that filemap_fault has now internalised. Sure, I can do that. > > @@ -355,8 +358,11 @@ xfs_isilocked( > > > > if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { > > if (!(lock_flags & XFS_MMAPLOCK_SHARED)) > > - return !!ip->i_mmaplock.mr_writer; > > - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); > > + return !debug_locks || > > + lockdep_is_held_type( > > + &VFS_I(ip)->i_mapping->invalidate_lock, > > + 0); > > + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); > > } > > <sigh> > > And so here we are again, losing more of our read vs write debug > checks on debug kernels when lockdep is not enabled.... > > Can we please add rwsem_is_locked_read() and rwsem_is_locked_write() > wrappers that just look at the rwsem counter value to determine how > the lock is held? Then the mrlock_t can go away entirely.... Apparently someone already did that for XFS as Darrick pointed out. 
So we just have to sort out how to merge it. Honza
On Wed 26-05-21 12:20:59, Jan Kara wrote: > On Wed 26-05-21 07:40:41, Dave Chinner wrote: > > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > > > purpose of invalidate_lock is exactly the same. Note that the locking in > > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > > > invalidate_lock. > > > > > > Reviewed-by: Christoph Hellwig <hch@lst.de> > > > CC: <linux-xfs@vger.kernel.org> > > > CC: "Darrick J. Wong" <darrick.wong@oracle.com> > > > Signed-off-by: Jan Kara <jack@suse.cz> > > > --- > > > fs/xfs/xfs_file.c | 12 ++++++----- > > > fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++-------------------- > > > fs/xfs/xfs_inode.h | 1 - > > > fs/xfs/xfs_super.c | 2 -- > > > 4 files changed, 36 insertions(+), 31 deletions(-) > > > > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > > index 396ef36dcd0a..dc9cb5c20549 100644 > > > --- a/fs/xfs/xfs_file.c > > > +++ b/fs/xfs/xfs_file.c > > > @@ -1282,7 +1282,7 @@ xfs_file_llseek( > > > * > > > * mmap_lock (MM) > > > * sb_start_pagefault(vfs, freeze) > > > - * i_mmaplock (XFS - truncate serialisation) > > > + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) > > > * page_lock (MM) > > > * i_lock (XFS - extent map serialisation) > > > */ > > > @@ -1303,24 +1303,26 @@ __xfs_filemap_fault( > > > file_update_time(vmf->vma->vm_file); > > > } > > > > > > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > > if (IS_DAX(inode)) { > > > pfn_t pfn; > > > > > > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > > ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, > > > (write_fault && !vmf->cow_page) ? 
> > > &xfs_direct_write_iomap_ops : > > > &xfs_read_iomap_ops); > > > if (ret & VM_FAULT_NEEDDSYNC) > > > ret = dax_finish_sync_fault(vmf, pe_size, pfn); > > > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > > } else { > > > - if (write_fault) > > > + if (write_fault) { > > > + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > > ret = iomap_page_mkwrite(vmf, > > > &xfs_buffered_write_iomap_ops); > > > - else > > > + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > > + } else > > > ret = filemap_fault(vmf); > > > } > > > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > > > This seems kinda messy. filemap_fault() basically takes the > > invalidate lock around the entire operation, it runs, so maybe it > > would be cleaner to implement it as: > > > > filemap_fault_locked(vmf) > > { > > /* does the filemap fault work */ > > } > > > > filemap_fault(vmf) > > { > > filemap_invalidate_down_read(...) > > ret = filemap_fault_locked(vmf) > > filemap_invalidate_up_read(...) > > return ret; > > } > > > > And that means XFS could just call filemap_fault_locked() and not > > have to do all this messy locking just to avoid holding the lock > > that filemap_fault has now internalised. > > Sure, I can do that. Hum, looking into this in more detail it isn't as easy. There are some operations inside filemap_fault() that need to be done outside of invalidate_lock. In particular we call into readahead code which will grab invalidate_lock for itself. So we'd need to pass in struct readahead_control whether invalidate_lock is held or not which is IMHO uglier than what we currently do in __xfs_filemap_fault(). Honza
On Wed, May 26, 2021 at 12:18:40PM +0200, Jan Kara wrote: > On Tue 25-05-21 14:37:29, Darrick J. Wong wrote: > > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > > > purpose of invalidate_lock is exactly the same. Note that the locking in > > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > > > invalidate_lock. > > > > > > Reviewed-by: Christoph Hellwig <hch@lst.de> > > > CC: <linux-xfs@vger.kernel.org> > > > CC: "Darrick J. Wong" <darrick.wong@oracle.com> > > > > It's djwong@kernel.org now. > > OK, updated. > > > > @@ -355,8 +358,11 @@ xfs_isilocked( > > > > > > if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { > > > if (!(lock_flags & XFS_MMAPLOCK_SHARED)) > > > - return !!ip->i_mmaplock.mr_writer; > > > - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); > > > + return !debug_locks || > > > + lockdep_is_held_type( > > > + &VFS_I(ip)->i_mapping->invalidate_lock, > > > + 0); > > > + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); > > > > This doesn't look right... > > > > If lockdep is disabled, we always return true for > > xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock? > > > > Granted, you probably just copy-pasted from the IOLOCK_SHARED clause > > beneath it. Er... oh right, preichl was messing with all that... > > > > https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/ > > Indeed copy-paste programming ;) It certainly makes the assertions happy > but useless. Should I pull the patch you reference into the series? It > seems to have been uncontroversial and reviewed. Or will you pull the > series to xfs tree so I can just rebase on top? 
The full conversion series introduced assertion failures because lockdep can't handle some of the ILOCK usage patterns, specifically the fact that a thread sometimes takes the ILOCK but then hands the inode to a workqueue to avoid overflowing the first thread's stack. That's why it never got merged into the xfs tree. However, that kind of switcheroo isn't done with the MMAPLOCK/invalidate_lock, so you could simply pull the patch I linked above into your series. --D > > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR
On Wed 26-05-21 08:32:51, Darrick J. Wong wrote: > On Wed, May 26, 2021 at 12:18:40PM +0200, Jan Kara wrote: > > On Tue 25-05-21 14:37:29, Darrick J. Wong wrote: > > > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote: > > > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended > > > > purpose of invalidate_lock is exactly the same. Note that the locking in > > > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes > > > > invalidate_lock. > > > > > > > > Reviewed-by: Christoph Hellwig <hch@lst.de> > > > > CC: <linux-xfs@vger.kernel.org> > > > > CC: "Darrick J. Wong" <darrick.wong@oracle.com> > > > > > > It's djwong@kernel.org now. > > > > OK, updated. > > > > > > @@ -355,8 +358,11 @@ xfs_isilocked( > > > > > > > > if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { > > > > if (!(lock_flags & XFS_MMAPLOCK_SHARED)) > > > > - return !!ip->i_mmaplock.mr_writer; > > > > - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); > > > > + return !debug_locks || > > > > + lockdep_is_held_type( > > > > + &VFS_I(ip)->i_mapping->invalidate_lock, > > > > + 0); > > > > + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); > > > > > > This doesn't look right... > > > > > > If lockdep is disabled, we always return true for > > > xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock? > > > > > > Granted, you probably just copy-pasted from the IOLOCK_SHARED clause > > > beneath it. Er... oh right, preichl was messing with all that... > > > > > > https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/ > > > > Indeed copy-paste programming ;) It certainly makes the assertions happy > > but useless. Should I pull the patch you reference into the series? It > > seems to have been uncontroversial and reviewed. Or will you pull the > > series to xfs tree so I can just rebase on top? 
> > The full conversion series introduced assertion failures because lockdep > can't handle some of the ILOCK usage patterns, specifically the fact > that a thread sometimes takes the ILOCK but then hands the inode to a > workqueue to avoid overflowing the first thread's stack. That's why it > never got merged into the xfs tree. I see. Yeah, we do "interesting" dances around lockdep fs-freezing annotations for AIO as well where the freeze protection is inherited from submission to completion context (we effectively generate false release event for lockdep when exiting submit context and false acquire event in the completion context). It can be done but it's ugly and error prone. > However, that kind of switcheroo isn't done with the > MMAPLOCK/invalidate_lock, so you could simply pull the patch I linked > above into your series. OK, will do! Honza
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 396ef36dcd0a..dc9cb5c20549 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1282,7 +1282,7 @@ xfs_file_llseek( * * mmap_lock (MM) * sb_start_pagefault(vfs, freeze) - * i_mmaplock (XFS - truncate serialisation) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1303,24 +1303,26 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { pfn_t pfn; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_direct_write_iomap_ops : &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { - if (write_fault) + if (write_fault) { + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); - else + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else ret = filemap_fault(vmf); } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0369eb22c1bb..53bb5fc33621 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -131,7 +131,7 @@ xfs_ilock_attr_map_shared( /* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 - * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. 
* * The 3 locks should always be ordered so that the IO lock is obtained first, @@ -139,23 +139,23 @@ xfs_ilock_attr_map_shared( * * Basic locking order: * - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock * * mmap_lock locking order: * * i_rwsem -> page lock -> mmap_lock - * mmap_lock -> i_mmap_lock -> page_lock + * mmap_lock -> invalidate_lock -> page_lock * * The difference in mmap_lock locking order mean that we cannot hold the - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock - * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_lock. + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths + * can fault in pages during copy in/out (for buffered IO) or require the + * mmap_lock in get_user_pages() to map the user pages into the kernel address + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page + * fault because page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both - * taken in places where we need to invalidate the page cache in a race + * take both the i_rwsem and the invalidate_lock. These locks should *only* be + * both taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). 
*/ @@ -187,10 +187,13 @@ xfs_ilock( XFS_IOLOCK_DEP(lock_flags)); } - if (lock_flags & XFS_MMAPLOCK_EXCL) - mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_MMAPLOCK_SHARED) - mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); @@ -239,10 +242,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_MMAPLOCK_EXCL) { - if (!mrtryupdate(&ip->i_mmaplock)) + if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } else if (lock_flags & XFS_MMAPLOCK_SHARED) { - if (!mrtryaccess(&ip->i_mmaplock)) + if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } @@ -257,9 +260,9 @@ xfs_ilock_nowait( out_undo_mmaplock: if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -306,9 +309,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); @@ -334,7 +337,7 @@ xfs_ilock_demote( if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrdemote(&ip->i_mmaplock); + 
downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_rwsem); @@ -355,8 +358,11 @@ xfs_isilocked( if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { if (!(lock_flags & XFS_MMAPLOCK_SHARED)) - return !!ip->i_mmaplock.mr_writer; - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + return !debug_locks || + lockdep_is_held_type( + &VFS_I(ip)->i_mapping->invalidate_lock, + 0); + return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock); } if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca826cfba91c..a0e4153efbbe 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -40,7 +40,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a2dab05332ac..eeaf44910b5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -715,8 +715,6 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); }